Example 1
def spacy_stats(caption):
    doc = nlp(caption)
    tokens = [token for token in doc]
    POS = ["POS_" + token.pos_ for token in tokens]
    tags = ["TAG_" + token.tag_ for token in tokens]
    ents = ["ENT_" + ent.label_ for ent in doc.ents]

    is_blank = {
        k: sum(getattr(token, k) for token in tokens)
        for k in [
            "is_digit",
            "is_lower",
            "is_upper",
            "is_title",
            "is_punct",
            "is_currency",
            "like_num",
            "is_oov",
            "is_stop",
        ]
    }

    return {
        "num_stop": sum(t.is_stop for t in tokens),
        "num_alpha": sum(t.is_alpha for t in tokens),
        "num_tokens": len(tokens),
        "num_noun_chunks": len(list(doc.noun_chunks)),
        "num_words": len(doc),
        **toolz.frequencies(POS),
        **toolz.frequencies(tags),
        **toolz.frequencies(ents),
        **is_blank,
    }
Example 2
def get_citation_histograms(identifiers, data=None):
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
    if len(data) == 0:
        data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]
    # First gather all necessary data
    # refereed -> refereed
    rr_data = [([int(c[:4]) for c in p.refereed_citations],
                1.0 / float(p.author_num)) for p in data if p.refereed]
    # refereed -> non-refereed
    rn_data = [([int(c[:4]) for c in p.citations if c in p.refereed_citations],
                1.0 / float(p.author_num)) for p in data if not p.refereed]
    # non-refereed -> refereed
    nr_data = [([int(c[:4]) for c in list(set(p.citations).difference(
        set(p.refereed_citations)))], 1.0 / float(p.author_num)) for
        p in data if p.refereed]
    # non-refereed -> non-refereed
    nn_data = [([int(c[:4]) for c in p.citations if
                 c not in p.refereed_citations],
                1.0 / float(p.author_num)) for p in data if not p.refereed]
    # First construct the regular histograms
    rr_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in rr_data])))
    rn_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in rn_data])))
    nr_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in nr_data])))
    nn_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in nn_data])))
    # Get the earliest citation
    try:
        min_year = min(itertools.chain(rr_hist.keys(), rn_hist.keys(),
                                       nr_hist.keys(), nn_hist.keys()))
        nullhist = [(y, 0) for y in range(min_year, current_year + 1)]
    except ValueError:
        # No citations at all: fall back to the publication years
        nullhist = [(y, 0) for y in range(min(years), current_year + 1)]
    # Now create the histograms with zeroes for years without values
    ch['refereed to refereed'] = merge_dictionaries(dict(nullhist), rr_hist)
    ch['refereed to nonrefereed'] = merge_dictionaries(dict(nullhist), rn_hist)
    ch['nonrefereed to refereed'] = merge_dictionaries(dict(nullhist), nr_hist)
    ch['nonrefereed to nonrefereed'] = merge_dictionaries(
        dict(nullhist), nn_hist)
    min_year = min(itertools.chain(ch['refereed to refereed'].keys(),
                                   ch['refereed to nonrefereed'].keys(),
                                   ch['nonrefereed to refereed'].keys(),
                                   ch['nonrefereed to nonrefereed'].keys()))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]
    # Normalized histograms need a different approach
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in rr_data]))
    ch['refereed to refereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in rn_data]))
    ch['refereed to nonrefereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in nr_data]))
    ch['nonrefereed to refereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in nn_data]))
    ch['nonrefereed to nonrefereed normalized'] = get_norm_histo(
        nullhist + tmp)
    return ch
Example 3
	def prep_work(self, test_files, develop_files, train_files, threshold):
	
		#Reduce DID to language-specific samples
		if self.type == "DID": 
			test_files = [x for x in test_files if x[0] == self.language]
			develop_files = [x for x in develop_files if x[0] == self.language]
			train_files = [x for x in train_files if x[0] == self.language]
			
			#Filter by number of samples
			country_list = [x[-1] for x in train_files]
			starting = len(set(country_list))
			country_dict = ct.frequencies(country_list)
			country_threshold = lambda x: x  >= threshold
			country_dict = ct.valfilter(country_threshold, country_dict)
			country_list = list(country_dict.keys())
			print("\t\tReducing initial set of " + str(starting) + " countries to " + str(len(country_list)) + " after frequency threshold.")
			
			#Prune and shuffle file lists
			test_files = [x for x in test_files if x[-1] in country_list]
			shuffle(test_files)
			
			train_files = [x for x in train_files if x[-1] in country_list]
			shuffle(train_files)
			
			develop_files = [x for x in develop_files if x[-1] in country_list]
			shuffle(develop_files)
			
			return test_files, develop_files, train_files, country_list
		
		elif self.type == "LID":
			#Filter by number of samples
			lang_list = [x[0] for x in train_files]
			starting = len(set(lang_list))
			lang_dict = ct.frequencies(lang_list)
			lang_threshold = lambda x: x  >= threshold
			lang_dict = ct.valfilter(lang_threshold, lang_dict)
			lang_list = list(lang_dict.keys())
			print("\t\tReducing initial set of " + str(starting) + " languages to " + str(len(lang_list)) + " after frequency threshold.")
			
			#Prune and shuffle file lists
			test_files = [x for x in test_files if x[0] in lang_list]
			shuffle(test_files)
			
			train_files = [x for x in train_files if x[0] in lang_list]
			shuffle(train_files)
			
			develop_files = [x for x in develop_files if x[0] in lang_list]
			shuffle(develop_files)
		
			return test_files, develop_files, train_files, lang_list
Example 4
	def data_description(self, y_dev):
	
		freqs = ct.frequencies(y_dev)
		
		for i in range(len(self.y_encoder.classes_)):
			print("\t", end = "")
			print(self.y_encoder.classes_[i], freqs.get(i, 0))
Example 5
	def process_file(self, filename, delta_threshold = 0.05, freq_threshold = 1, save = True):
		
		candidates = []
		starting = time.time()
		
		#Initialize Beam Search class
		BS = BeamSearch(delta_threshold, self.association_dict)
		
		for line in self.Encoder.load_stream(filename):

			if len(line) > 2:
				
				#Beam Search extraction
				candidates += BS.beam_search(line)
			
		#Count each candidate, get dictionary with candidate frequencies
		candidates = ct.frequencies(candidates)
		print("\t" + str(len(candidates)) + " candidates before pruning.")
		
		#Reduce nonce candidates
		above_threshold = lambda x: x > freq_threshold
		candidates = ct.valfilter(above_threshold, candidates)
			
		#Print time and number of remaining candidates
		print("\t" + str(len(candidates)) + " candidates in " + str(time.time() - starting) + " seconds.")
	
		if save == True:
			self.Loader.save_file(candidates, filename + ".candidates.p")
			return os.path.join(self.Loader.output_dir, filename + ".candidates.p")
				
		else:
			return candidates
Example 6
def calc_stats(dp: dataset_pb2.DataPoint) -> pd.DataFrame:
    items = []
    signal = np.array(dp.signal)
    items.append(("Signal length", len(signal)))
    items.append(("Signal min value", np.min(signal)))
    items.append(("Signal median value", np.median(signal)))
    items.append(("Signal max value", np.max(signal)))
    items.append(("Signal value std", np.std(signal)))
    items.append(("Basecalled length", len(dp.basecalled)))
    items.append(("Reference length", len(dp.aligned_ref)))
    occ = toolz.frequencies(dp.cigar)
    items.append((
        "Match Rate",
        occ.get(dataset_pb2.MATCH, 0) / len(dp.aligned_ref),
    ))
    items.append((
        "Mismatch Rate",
        occ.get(dataset_pb2.MISMATCH, 0) / len(dp.aligned_ref),
    ))
    items.append((
        "Insertion Rate",
        occ.get(dataset_pb2.INSERTION, 0) / len(dp.aligned_ref),
    ))
    items.append((
        "Deletion Rate",
        occ.get(dataset_pb2.DELETION, 0) / len(dp.aligned_ref),
    ))
    items.append(("Signal sample/bases", len(signal) / len(dp.basecalled)))
    return pd.DataFrame(items, columns=("Attribute", "Value"))
Example 7
    def print_labels(df, labels):
        """
        Return an inventory of label counts as a dictionary

        :param df: DataFrame holding the labels
        :param labels: column selection whose values are counted
        :return: dict mapping each label value to its frequency
        """
        return ct.frequencies(df.loc[:, labels])
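For reference, a small illustration of the inventory this returns, assuming ct is cytoolz and labels selects a single column (toy data, not from the original project):

import pandas as pd
import cytoolz as ct

df = pd.DataFrame({"label": ["cat", "dog", "cat", "bird"]})
# Same expression as the method body above
ct.frequencies(df.loc[:, "label"])   # {'cat': 2, 'dog': 1, 'bird': 1}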
Example 8
def build_vocab(tokenized_texts, min_occur_count):
    word_counts = cytoolz.frequencies(w for doc in tokenized_texts
                                      for w in doc.lower().split())
    word_counts = cytoolz.valfilter(lambda v: v >= min_occur_count,
                                    word_counts)
    vocab, counts = zip(
        *sorted(word_counts.items(), key=operator.itemgetter(1), reverse=True))
    vocab = list(vocab)
    counts = np.array(counts)
    return vocab, counts
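A brief usage sketch with assumed toy inputs (word counts worked out by hand; cytoolz, operator and numpy imported as in the snippet):

texts = ["the cat sat", "the dog sat on the mat"]
vocab, counts = build_vocab(texts, min_occur_count=2)
# vocab  -> ['the', 'sat']   (sorted by descending count)
# counts -> array([3, 2])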
Example 9
def get_publication_histograms(identifiers):
    ph = {}
    current_year = datetime.now().year
    # Get necessary data
    data = get_publication_data(identifiers)
    # Get the publication histogram
    years = [int(p.bibcode[:4]) for p in data]
    nullhist = [(y, 0) for y in range(min(years), current_year + 1)]
    yearhist = cy.frequencies(years)
    ph['all publications'] = merge_dictionaries(dict(nullhist), yearhist)
    years_ref = [int(p.bibcode[:4]) for p in data if p.refereed]
    yearhist = cy.frequencies(years_ref)
    ph['refereed publications'] = merge_dictionaries(dict(nullhist), yearhist)
    # Get the normalized publication histogram
    tmp = [(int(p.bibcode[:4]), 1.0 / float(p.author_num)) for p in data]
    ph['all publications normalized'] = get_norm_histo(nullhist + tmp)
    tmp = [(int(p.bibcode[:4]), 1.0 / float(p.author_num)) for p in data
           if p.refereed]
    ph['refereed publications normalized'] = get_norm_histo(nullhist + tmp)
    return ph
Example 10
def get_publication_histograms(identifiers):
    ph = {}
    current_year = datetime.now().year
    # Get necessary data
    data = get_publication_data(identifiers)
    # Get the publication histogram
    years = [int(p.bibcode[:4]) for p in data]
    nullhist = [(y, 0) for y in range(min(years), current_year + 1)]
    yearhist = cy.frequencies(years)
    ph['all publications'] = merge_dictionaries(dict(nullhist), yearhist)
    years_ref = [int(p.bibcode[:4]) for p in data if p.refereed]
    yearhist = cy.frequencies(years_ref)
    ph['refereed publications'] = merge_dictionaries(dict(nullhist), yearhist)
    # Get the normalized publication histogram
    tmp = [(int(p.bibcode[:4]), 1.0 / float(p.author_num)) for p in data]
    ph['all publications normalized'] = get_norm_histo(nullhist + tmp)
    tmp = [(int(p.bibcode[:4]), 1.0 / float(p.author_num))
           for p in data if p.refereed]
    ph['refereed publications normalized'] = get_norm_histo(nullhist + tmp)
    return ph
Example 11
def simulate_counts(p, C, phys_dim=2, seed=None):
    """Simulate measuring each qubit of ``p`` in the computational basis,
    producing output like that of ``qiskit``.

    Parameters
    ----------
    p : vector or operator
        The quantum state, assumed to be normalized, as either a ket or density
        operator.
    C : int
        The number of counts to perform.
    phys_dim : int, optional
        The assumed size of the subsystems of ``p``, defaults to 2 for qubits.
    seed : None or int, optional
        If not None, seed the random number generator with this value.

    Returns
    -------
    results : dict[str, int]
        The counts for each bit string measured.

    Examples
    --------

    Simulate measuring the state of each qubit in a GHZ-state:

    .. code:: python3

        >>> import quimb as qu
        >>> psi = qu.ghz_state(3)
        >>> qu.simulate_counts(psi, 1024)
        {'000': 514, '111': 510}

    """
    if seed is not None:
        np.random.seed(seed)

    n = infer_size(p, phys_dim)
    d = phys_dim**n

    if isop(p):
        pi = np.diag(p).real
    else:
        pi = np.multiply(np.conj(p), p).real

    # probability of each basis state
    pi = pi.reshape(-1)

    # raw counts in terms of integers
    raw_counts = np.random.choice(np.arange(d), size=C, p=pi)

    # convert to frequencies of binary
    bin_str = '{:0>' + str(n) + 'b}'
    results = keymap(bin_str.format, frequencies(raw_counts))

    return results
Example 12
def doc_features(doc):
    doc_words = cytoolz.frequencies(cm.filter_sw(doc))

    # initialize to 0
    features = zero_features.copy()

    word_matches = match(doc_words, word_features)

    for word in word_matches:
        features[word] = doc_words[word]

    return features
Example 13
def doc_features(doc):
    doc_words = cytoolz.frequencies(cm.filter_sw(doc))

    # initialize to 0
    features = zero_features.copy()

    word_matches = match(doc_words, word_features)

    for word in word_matches:
        features[word] = doc_words[word]

    return features
Example 14
def _mk_fork_configuration_params(fork_config):
    all_block_numbers = tuple(fork_config.values())
    if len(all_block_numbers) != len(set(all_block_numbers)):
        duplicates = tuple(
            sorted(blk_num
                   for blk_num, freq in frequencies(all_block_numbers).items()
                   if freq > 1))
        raise ValueError("Duplicate block numbers: {0}".format(duplicates))

    args = {(block_number, FORK_NAME_MAPPING[fork_name])
            for fork_name, block_number in fork_config.items()
            if (block_number is not None and fork_name != FORK_DAO)}

    if FORK_DAO in fork_config:
        kwargs = {'dao_start_block': fork_config[FORK_DAO]}
    else:
        kwargs = {}

    return args, kwargs
Example 15
    def process_file(self,
                     filename,
                     delta_threshold=0.05,
                     freq_threshold=1,
                     save=True):

        candidates = []
        starting = time.time()

        #Initialize Beam Search class
        BS = BeamSearch(delta_threshold, self.association_dict)

        for line in self.Encoder.load_stream(filename):

            if len(line) > 2:

                #Beam Search extraction
                candidates += BS.beam_search(line)

        #Count each candidate, get dictionary with candidate frequencies
        candidates = ct.frequencies(candidates)
        print("\t" + str(len(candidates)) + " candidates before pruning.")

        #Reduce nonce candidates
        above_threshold = lambda x: x > freq_threshold
        candidates = ct.valfilter(above_threshold, candidates)

        #Print time and number of remaining candidates
        print("\t" + str(len(candidates)) + " candidates in " +
              str(time.time() - starting) + " seconds.")

        if save == True:
            self.Loader.save_file(candidates, filename + ".candidates.p")
            return os.path.join(self.Loader.output_dir,
                                filename + ".candidates.p")

        else:
            return candidates
Example 16
def get_citation_histograms(identifiers, data=None):
    ch = {}
    current_year = datetime.now().year
    # Get necessary data if nothing was provided
    if not data:
        data = get_citations(identifiers)
    if len(data) == 0:
        data = get_citations(identifiers, no_zero=False)
    years = [int(p.bibcode[:4]) for p in data]
    # First gather all necessary data
    # refereed -> refereed
    rr_data = [([int(c[:4])
                 for c in p.refereed_citations], 1.0 / float(p.author_num))
               for p in data if p.refereed]
    # refereed -> non-refereed
    rn_data = [([int(c[:4]) for c in p.citations
                 if c in p.refereed_citations], 1.0 / float(p.author_num))
               for p in data if not p.refereed]
    # non-refereed -> refereed
    nr_data = [([
        int(c[:4])
        for c in list(set(p.citations).difference(set(p.refereed_citations)))
    ], 1.0 / float(p.author_num)) for p in data if p.refereed]
    # non-refereed -> non-refereed
    nn_data = [
        ([int(c[:4]) for c in p.citations
          if c not in p.refereed_citations], 1.0 / float(p.author_num))
        for p in data if not p.refereed
    ]
    # First construct the regular histograms
    rr_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in rr_data])))
    rn_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in rn_data])))
    nr_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in nr_data])))
    nn_hist = cy.frequencies(list(itertools.chain(*[d[0] for d in nn_data])))
    # Get the earliest citation
    try:
        min_year = min(itertools.chain(rr_hist.keys(), rn_hist.keys(),
                                       nr_hist.keys(), nn_hist.keys()))
        nullhist = [(y, 0) for y in range(min_year, current_year + 1)]
    except ValueError:
        # No citations at all: fall back to the publication years
        nullhist = [(y, 0) for y in range(min(years), current_year + 1)]
    # Now create the histograms with zeroes for years without values
    ch['refereed to refereed'] = merge_dictionaries(dict(nullhist), rr_hist)
    ch['refereed to nonrefereed'] = merge_dictionaries(dict(nullhist), rn_hist)
    ch['nonrefereed to refereed'] = merge_dictionaries(dict(nullhist), nr_hist)
    ch['nonrefereed to nonrefereed'] = merge_dictionaries(
        dict(nullhist), nn_hist)
    min_year = min(itertools.chain(ch['refereed to refereed'].keys(),
                                   ch['refereed to nonrefereed'].keys(),
                                   ch['nonrefereed to refereed'].keys(),
                                   ch['nonrefereed to nonrefereed'].keys()))
    nullhist = [(y, 0) for y in range(min_year, current_year + 1)]
    # Normalized histograms need a different approach
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in rr_data]))
    ch['refereed to refereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in rn_data]))
    ch['refereed to nonrefereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in nr_data]))
    ch['nonrefereed to refereed normalized'] = get_norm_histo(nullhist + tmp)
    tmp = list(itertools.chain(*[[(d, x[1]) for d in x[0]] for x in nn_data]))
    ch['nonrefereed to nonrefereed normalized'] = get_norm_histo(nullhist +
                                                                 tmp)
    return ch
Example 17
def ngram_counts(words, n, pad='<eos>'):
    """
    Generate a dictionary of n-gram counts from a list of words.
    """
    return frequencies(ngrams(words, n, pad))
Example 18
def freq_dict(file_words):
    filtered = cm.filter_sw(file_words[1].split())

    fd = cytoolz.frequencies(filtered)

    return fd
Example 19
def build_ngram_model(sentences, n, pad='<eos>'):
    """
    Generate a dictionary of word n-gram counts from a list of sentences.
    """
    return frequencies(concat(ngrams(sent, n, pad) for sent in sentences))
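Both n-gram helpers above delegate the counting to frequencies; below is a minimal sketch of how they could fit together, assuming toolz for concat/frequencies and a simple padding ngrams helper (the real helper is not shown in these examples):

from toolz import concat, frequencies

def ngrams(words, n, pad='<eos>'):
    # Hypothetical helper: append the pad token and emit consecutive n-tuples
    padded = list(words) + [pad]
    return zip(*(padded[i:] for i in range(n)))

sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]
bigram_model = frequencies(concat(ngrams(s, 2) for s in sentences))
# {('the', 'cat'): 1, ('cat', 'sat'): 1, ('sat', '<eos>'): 2,
#  ('the', 'dog'): 1, ('dog', 'sat'): 1}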
Example 20
def select_word_features(corpus):
    words = cytoolz.frequencies(corpus)
    sorted_words = sorted(words, key=words.get)
    # Keep the most frequent 2% of the vocabulary
    N = int(.02 * len(sorted_words))

    return sorted_words[-N:] if N else []
Example 21
    def frequencies(self):
        return fdict(cytoolz.frequencies(self))
Example 22
                    noun1, preposition, noun2 = binary.split('-')
                    l1 += list(df[df['object1'] == noun1].image)
                    l2 += list(df[df['object2'] == noun2].image)
                    l3 += list(df[df['preposition'] == preposition].image)
                    l4 += list(df[(
                        (df['object1'] == unary) | (df['object2'] == unary))
                                  & (df['rcc'] == 'DC')].image)

                    # l1 += list(df[(df['object1'] == noun1) & (df['rcc'].notnull())].image)
                    # l2 += list(df[(df['object2'] == noun2) & (df['rcc'].notnull())].image)
                    # l3 += list(df[(df['preposition'] == preposition) & (df['rcc'].notnull())].image)
                    # l4 += list(df[((df['object1'] == unary) | (df['object2'] == unary)) & (df['rcc'] == 'DC')].images)

                retrieved = {
                    k: v / weights[k]
                    for k, v in cytoolz.frequencies(l1 + l2 + l3 + l4).items()
                }
                valids = [
                    (k, retrieved[k])
                    for k in sorted(retrieved, key=retrieved.get, reverse=True)
                    if retrieved[k] >= 2.5
                ]
                retrieved = []
                relevance = []
                if valids:
                    retrieved, relevance = zip(*valids)

                gs = [
                    imagenames[idx]
                    for idx, is_valid in enumerate(query['rank']) if is_valid
                ]
Example 23
def freq_dict(file_words):
    filtered = cm.filter_sw(file_words[1].split())

    fd = cytoolz.frequencies(filtered)

    return fd
Example 24
def alignment_stats(lable_ind,
                    label_val,
                    pred_ind,
                    pred_val,
                    batch_size,
                    debug=False):
    """Returns a list of numpy array representing alignemnt stats. First N elements are
    in aligment_stats_ordering and the last one in identity.

    The return is like this due to tf.py_func requirements --> this function is made for
    embedding as tf operation via tf.py_func

    :param lable_ind:
    :param label_val:
    :param pred_ind:
    :param pred_val:
    :param batch_size:
    :param debug:
    :return:
    """

    prefix = os.environ.get("MINCALL_LOG_DATA", None)
    if prefix:
        fname = os.path.abspath(os.path.join(prefix,
                                             f"{uuid.uuid4().hex}.npz"))
        with open(fname, "wb") as f:
            np.savez(
                f, **{
                    "label_val": label_val,
                    "lable_ind": lable_ind,
                    "pred_val": pred_val,
                    "pred_ind": pred_ind,
                    "batch_size": batch_size,
                })
        logger.debug(f"Saves alignment stats input data to {fname}")

    yt = defaultdict(list)
    for ind, val in zip(lable_ind, label_val):
        yt[ind[0]].append(val)

    yp = defaultdict(list)
    for ind, val in zip(pred_ind, pred_val):
        yp[ind[0]].append(val)

    sol = defaultdict(list)
    identities = []
    for x in range(batch_size):
        query = decode(np.array(yp[x], dtype=int))
        target = decode(np.array(yt[x], dtype=int))
        if len(target) == 0:
            raise ValueError("Empty target sequence")
        if len(query) == 0:
            logger.warning(f"Empty query sequence\n" f"Target: {target}")
            sol[dataset_pb2.MATCH].append(0.0)
            sol[dataset_pb2.MISMATCH].append(0.0)
            sol[dataset_pb2.DELETION].append(1.0)
            sol[dataset_pb2.INSERTION].append(0.0)
            identities.append(0)
            continue
        edlib_res = edlib.align(query, target, task='path')
        stats = ext_cigar_stats(edlib_res['cigar'])

        read_len = stats[dataset_pb2.MISMATCH] + stats[
            dataset_pb2.MATCH] + stats[dataset_pb2.INSERTION]

        #  https://github.com/isovic/samscripts/blob/master/src/errorrates.py
        identities.append(stats[dataset_pb2.MATCH] / sum(stats.values()))

        for op in aligment_stats_ordering:
            sol[op].append(stats[op] / read_len)
        if debug:
            msg = "edlib results\n"
            s_query, s_target, _ = squggle(query, target)
            exp_cigar = expand_cigar(edlib_res['cigar'])

            for i in range(0, len(s_query), 80):
                msg += "query:  " + s_query[i:i + 80] + "\n"
                msg += "target: " + s_target[i:i + 80] + "\n"
                msg += "cigar : " + exp_cigar[i:i + 80] + "\n"
                msg += "--------" + 80 * "-" + "\n"

            msg += "query:  " + query + "\n"
            msg += "target: " + target + "\n"
            msg += "full cigar:  " + edlib_res['cigar'] + "\n"
            msg += pformat(
                {dataset_pb2.Cigar.Name(k): v
                 for k, v in stats.items()}) + "\n"
            msg += "readl:  " + str(read_len) + "\n"
            df = pd.DataFrame({
                "query":
                toolz.merge(
                    toolz.frequencies(query),
                    toolz.keymap(
                        "".join,
                        toolz.frequencies(toolz.sliding_window(2, query))),
                ),
                "target":
                toolz.merge(
                    toolz.frequencies(target),
                    toolz.keymap(
                        "".join,
                        toolz.frequencies(toolz.sliding_window(2, target))),
                ),
            })
            df["delta"] = 100 * (df['target'] / df['query'] - 1)
            df = df[['query', 'target', 'delta']]
            msg += "Stats\n" + str(df) + "\n"
            msg += "==================\n"
            logger.info(msg)
    sol = [
        np.array(sol[op], dtype=np.float32) for op in aligment_stats_ordering
    ]
    sol_data = {
        dataset_pb2.Cigar.Name(k): v
        for k, v in zip(aligment_stats_ordering, sol)
    }
    sol_data["IDENTITY"] = identities
    logger.info(f"sol: \n{pd.DataFrame(sol_data)}")
    return sol + [np.array(identities, dtype=np.float32)]
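The docstring notes that the list-of-arrays return shape is dictated by tf.py_func; a hedged sketch of such a wrapping, assuming TF1-style sparse label/prediction tensors (the tensor names here are illustrative, not taken from the original pipeline):

import tensorflow as tf

# labels and predictions are assumed to be tf.SparseTensor instances
n_outputs = len(aligment_stats_ordering) + 1  # per-operation rates plus identity
stats = tf.py_func(
    alignment_stats,
    [labels.indices, labels.values,
     predictions.indices, predictions.values, batch_size],
    [tf.float32] * n_outputs,
    stateful=False,
)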