def get_distr(strlist, n_len):
    """Return the relative frequency of each n-gram of length ``n_len``.

    Every non-empty string in ``strlist`` is split into n-grams with
    ``NGram(N=n_len, pad_len=0)``.  Only n-grams that are already keys of
    the table built by ``init_grams_dict(n_len, alphabet)`` are counted;
    any other n-gram is ignored.

    Args:
        strlist: iterable of strings; empty strings are skipped.
        n_len: n-gram length, forwarded to ``NGram`` and
            ``init_grams_dict``.

    Returns:
        The table from ``init_grams_dict`` with every value divided by the
        number of counted n-grams.  If nothing was counted, the table is
        returned exactly as accumulated (no normalisation) instead of
        raising ``ZeroDivisionError``.
    """
    alphabet = ['A', 'C', 'G', 'T', 'N']
    splitter = NGram(N=n_len, pad_len=0)
    grams = init_grams_dict(n_len, alphabet)
    all_ngrams = 0

    for item in strlist:
        if item == '':
            continue
        # Iterate the splitter output directly; no intermediate list needed.
        for ng in splitter._split(item):
            if ng in grams:
                grams[ng] += float(1)
                # NOTE(review): the total is advanced only for n-grams that
                # are present in the table, so the result sums to 1.  The
                # original layout was ambiguous about this - confirm.
                all_ngrams += 1

    # Nothing counted (empty input, or no n-gram over the alphabet):
    # dividing would raise ZeroDivisionError, so skip normalisation.
    if all_ngrams == 0:
        return grams

    # Only existing keys are reassigned, so iterating the dict is safe.
    for key in grams:
        grams[key] /= all_ngrams
    return grams
and oases_item['top'] == trinity_item['top'] \ and oases_item['n'] == trinity_item['n']\ and oases_item['kernel'].values()[0] == trinity_item['kernel'].values()[0]: print (oases_item, trinity_item) #intersect = set(good_list_oases) & set(good_list_trinity) from tr_parser import get_assemblies (ref, oases_reads, oases_name_index, trinity_reads, trinity_name_index) = get_assemblies("data/ref_for_reads.fasta", "data/Oases.fasta", "data/Trinity.fasta") from ngram import NGram n = NGram(N=4, pad_len=0) grams = dict() for transcript in ref: if transcript == '': continue ngram_list = list(n._split(transcript)) for ng in ngram_list: if ng == 'TTSG': pdb.set_trace() if ng in grams: grams[ng] += 1 else: grams[ng] = 1 pp = pprint.PrettyPrinter() pp.pprint(grams) print len(grams)
['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], "n") masked_domain_name = multi_replace(aux3_domain_name, ['-'], "s") #print domain_name + " -> " + masked_domain_name #continue feature.append(domain_name) feature.append(masked_domain_name) feature.append(tag) if len(domain_name) < 5: continue ### ID Designation ### V4-6 1-gram (mean, variance and standard deviation) n = NGram(N=1) v = list(n._split(domain_name)) [f1, f2, f3] = ngram_stats(v) feature.extend([f1, f2, f3]) ### V6-9 2-gram (mean, variance and standard deviation) n = NGram(N=2) v = list(n._split(domain_name)) [f1, f2, f3] = ngram_stats(v) feature.extend([f1, f2, f3]) ### V10-12 3-gram (mean, variance and standard deviation) n = NGram(N=3) v = list(n._split(domain_name)) [f1, f2, f3] = ngram_stats(v) feature.extend([f1, f2, f3])