def get_distr(strlist, n_len):
    """Return the relative frequency of each n-gram found in *strlist*.

    Each non-empty string is split into n-grams of length *n_len* with an
    unpadded ``NGram`` splitter. Only n-grams that are already keys of the
    dictionary built by ``init_grams_dict(n_len, alphabet)`` are counted;
    anything else is ignored and does not contribute to the total.

    Args:
        strlist: Iterable of strings to analyse. Empty strings are skipped.
        n_len: Length of the n-grams to count.

    Returns:
        The dictionary from ``init_grams_dict`` with every value divided by
        the number of counted n-grams. If nothing was counted, the
        dictionary is returned as initialised instead of raising
        ``ZeroDivisionError``.
    """
    alphabet = ['A', 'C', 'G', 'T', 'N']
    splitter = NGram(N=n_len, pad_len=0)
    # presumably keyed by every n_len-gram over `alphabet` -- verify against
    # init_grams_dict, which is defined elsewhere.
    grams = init_grams_dict(n_len, alphabet)

    total = 0
    for item in strlist:
        if item == '':
            continue
        for ng in splitter._split(item):
            if ng in grams:
                grams[ng] += 1.0
                total += 1

    # Nothing recognised (empty input, or only unknown symbols): there is no
    # distribution to normalise, so avoid dividing by zero.
    if total == 0:
        return grams

    # float() keeps true division on Python 2 even when an untouched entry
    # still holds an integer count.
    denominator = float(total)
    for key in grams:
        grams[key] /= denominator
    return grams
                and oases_item['top'] == trinity_item['top'] \
                and oases_item['n'] == trinity_item['n']\
                and oases_item['kernel'].values()[0] == trinity_item['kernel'].values()[0]:
            print (oases_item, trinity_item)

#intersect = set(good_list_oases) & set(good_list_trinity)

from tr_parser import get_assemblies
# Hand the reference, Oases and Trinity FASTA paths to get_assemblies and
# unpack the five values it returns, in order.
ref, oases_reads, oases_name_index, trinity_reads, trinity_name_index = \
    get_assemblies("data/ref_for_reads.fasta",
                   "data/Oases.fasta",
                   "data/Trinity.fasta")
from ngram import NGram

# Count how often each 4-gram occurs across the non-empty entries of `ref`.
n = NGram(N=4, pad_len=0)
grams = dict()
for transcript in ref:
    if transcript == '':
        continue
    # Every n-gram is tallied as-is, including ones that hold symbols other
    # than A/C/G/T (e.g. 'TTSG'); no filtering and no interactive stop.
    for ng in n._split(transcript):
        grams[ng] = grams.get(ng, 0) + 1

# Dump the full count table, then the number of distinct n-grams seen.
pp = pprint.PrettyPrinter()
pp.pprint(grams)
# The parenthesised single-argument form prints the same on Python 2 and 3.
print(len(grams))
            ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], "n")
        masked_domain_name = multi_replace(aux3_domain_name, ['-'], "s")
        #print domain_name + " -> " + masked_domain_name
        #continue

        feature.append(domain_name)
        feature.append(masked_domain_name)
        feature.append(tag)

        if len(domain_name) < 5:
            continue

        ### ID Designation
        ### V4-6   1-gram (mean, variance and standard deviation)
        n = NGram(N=1)
        v = list(n._split(domain_name))
        [f1, f2, f3] = ngram_stats(v)
        feature.extend([f1, f2, f3])

        ### V7-9   2-gram (mean, variance and standard deviation)
        n = NGram(N=2)
        v = list(n._split(domain_name))
        [f1, f2, f3] = ngram_stats(v)
        feature.extend([f1, f2, f3])

        ### V10-12   3-gram (mean, variance and standard deviation)
        n = NGram(N=3)
        v = list(n._split(domain_name))
        [f1, f2, f3] = ngram_stats(v)
        feature.extend([f1, f2, f3])