def export_sparse_features(sigmers, sample, indir, outfile):
    """Write per-sample k-mer counts to ``outfile`` in a sparse text format.

    For every ``(name, label)`` pair in ``sample`` the Jellyfish database
    ``indir + name + "_count.jf"`` is queried and one line is written:
    the label, then ``index:count`` for each k-mer of ``sigmers`` whose count
    is greater than zero. ``index`` is the 1-based position of the k-mer in
    the iteration order of ``sigmers``.

    :param sigmers: iterable of k-mer strings; each is canonicalized before
        the lookup.
    :param sample: sequence of ``(name, label)`` pairs.
    :param indir: prefix prepended verbatim to ``name`` (no path separator is
        added here).
    :param outfile: path of the text file to create or overwrite.
    """
    # The context manager closes the output even if a Jellyfish lookup raises.
    with open(outfile, 'w') as outfh:
        for done, (name, label) in enumerate(sample, 1):
            if done % 50 == 0:
                echo("\t\t ... Completed %f" %
                     (float(done) / float(len(sample))))
            qf = jellyfish.QueryMerFile(indir + name + "_count.jf")
            outfh.write("%s " % (label))
            for index, mer in enumerate(sigmers, 1):
                jmer = jellyfish.MerDNA(mer)
                jmer.canonicalize()
                count = qf[jmer]  # single lookup, reused for test and output
                if count > 0:
                    outfh.write("%d:%d " % (index, count))
            outfh.write("\n")
def pickle_profiles(file_lists, resource_path, kmer_size=31):
    """Build the MLST k-mer profile dictionary and pickle it to disk.

    For each species the first file whose name contains ``.txt`` is read as a
    tab-separated profile table: the first line supplies the gene names
    (columns ``1 .. number_of_genes``) and every later line maps the
    ``:``-joined allele numbers to the sequence type in column 0. Every
    ``.tfa`` file is then parsed as FASTA and the canonical k-mers of each
    allele are stored under ``["GENES"][gene_name][allele_number]``.

    The result is written to ``<resource_path>/mlst_profiles.pkl``.

    :param file_lists: mapping of species name to a list of file names found
        in ``<resource_path>/<species>/``. ``number_of_genes`` is taken as
        ``len(file_list) - 1`` -- presumably one ``.tfa`` per gene plus the
        single profile table; verify against the caller.
    :param resource_path: directory holding the per-species sub-directories;
        also receives the pickle.
    :param kmer_size: k-mer length, set globally via ``jellyfish.MerDNA_k``.
    :return: None
    """
    jellyfish.MerDNA_k(kmer_size)

    mlst_profiles_dict = OrderedDict()
    for species, file_list in file_lists.items():
        if species not in mlst_profiles_dict:
            mlst_profiles_dict[species] = {
                "ST": OrderedDict(),
                "GENES": OrderedDict(),
                "GENE_ORDER": None,
            }
        entry = mlst_profiles_dict[species]
        number_of_genes = len(file_list) - 1

        table_path = [
            os.path.join(resource_path, species, f)
            for f in file_list if '.txt' in f
        ][0]
        # Reset per species so an empty table cannot inherit the previous
        # species' gene order (or raise NameError on the first species).
        gene_list = None
        with open(table_path) as table:
            for i, raw in enumerate(table):
                line = raw.strip().split("\t")
                if i == 0:
                    gene_list = line[1:number_of_genes + 1]
                else:
                    profile = ":".join(line[1:number_of_genes + 1])
                    entry["ST"][profile] = line[0]
        entry["GENE_ORDER"] = gene_list

        genes = entry["GENES"]
        for _file in [f for f in file_list if f[-4:] == '.tfa']:
            gene_name = None
            fasta_path = os.path.join(resource_path, species, _file)
            for seq_record in SeqIO.parse(fasta_path, 'fasta'):
                # Record names look like "<gene>_<allele>" or
                # "<gene>-<allele>"; the last token is the allele number.
                seq_num = seq_record.name.replace("-", "_").split("_")[-1]
                gene_name = "_".join(
                    seq_record.name.replace("__", "_").replace(
                        "-", "_").split("_")[:-1])
                # A repeated allele number starts from an empty set again,
                # matching the original update() behaviour.
                kmers = set()
                genes.setdefault(gene_name, {})[seq_num] = kmers
                for j in range(0, len(seq_record.seq) - kmer_size + 1):
                    mer = jellyfish.MerDNA(
                        str(seq_record.seq[j:j + kmer_size]))
                    mer.canonicalize()
                    kmers.add(str(mer))
            # Only log when the file yielded at least one record; otherwise
            # gene_name would be unbound.
            if gene_name is not None:
                sys.stderr.write("\tparsing: {0} : {1}\n".format(
                    species, gene_name))

    pickle_path = os.path.join(resource_path, "mlst_profiles.pkl")
    with open(pickle_path, "wb") as handle:
        pickle.dump(mlst_profiles_dict, handle)
def test_all_mers(self):
    """Every mer from ``string_mers`` equals the matching slice of ``self.str``.

    Also checks that exactly ``len(self.str) - self.k + 1`` mers are yielded.
    """
    seen = 0
    matches = True
    for m in jellyfish.string_mers(self.str):
        expected = jellyfish.MerDNA(self.str[seen:seen + self.k])
        # Once a mismatch has been recorded, later mers are only counted.
        if matches:
            matches = m == expected
        seen += 1
    self.assertTrue(matches)
    self.assertEqual(len(self.str) - self.k + 1, seen)
def test_canonical_mers(self):
    """Every mer from ``string_canonicals`` is the canonical form of its slice.

    Each yielded mer must equal either the forward mer or its reverse
    complement, and must not compare greater than either of them.
    """
    ok = True
    for count, m in enumerate(jellyfish.string_canonicals(self.str)):
        forward = jellyfish.MerDNA(self.str[count:count + self.k])
        reverse = forward.get_reverse_complement()
        if ok:
            ok = (m == forward or m == reverse)
        if ok:
            ok = (not (m > forward)) and (not (m > reverse))
    self.assertTrue(ok)
    # ``count`` is the last 0-based index from enumerate, i.e. one less than
    # the number of mers, hence no "+ 1" on the expected value.
    self.assertEqual(len(self.str) - self.k, count)
def compare_ard(so, kmer_size=31,
                ard_path="/Users/ksimmon/Box Sync/ARUP/strainTypeMer_resources/ard/"):
    """Record k-mer counts for every antibiotic-resistance gene into ``so.ard``.

    Reads ``categories.txt``, ``AROtags.txt`` and ``ARmeta-genes.fa`` from
    ``ard_path``. For each FASTA record every canonical k-mer is looked up in
    ``so.qf``; the counts are collected in ``so.ard[gene_id]`` as the tuple
    ``(counts, species, category, aro_names)``. Progress is written to stderr.

    :param so: object exposing ``ard`` (a dict, updated in place) and ``qf``
        (indexable by a ``jellyfish.MerDNA``).
    :param kmer_size: k-mer length used to slide over each gene sequence.
    :param ard_path: directory prefix of the resource files, joined by plain
        string concatenation so it must end with a path separator. Defaults
        to the location the function previously hard-coded.
    :return: None
    """
    from Bio import SeqIO
    import jellyfish

    sys.stderr.write("Retrieving antibiotic resistance genes\n")

    # categories.txt: column 0 is "<name>.<ext>"; key on the name without ext.
    descriptions = {}
    with open(ard_path + "categories.txt") as handle:
        for row in handle:
            v = row.strip().split("\t")
            descriptions[".".join(v[0].split(".")[:-1])] = v

    # AROtags.txt: column 2 is used as the key, column 1 as the value.
    aro_tags = {}
    with open(ard_path + "AROtags.txt") as handle:
        for row in handle:
            v = row.strip().split("\t")
            aro_tags[v[2]] = v[1]

    fasta_path = ard_path + "ARmeta-genes.fa"
    # First pass only counts records so the progress line can show a total.
    num_of_sequences = sum(1 for _ in SeqIO.parse(fasta_path, "fasta"))

    for count, record in enumerate(SeqIO.parse(fasta_path, "fasta"), 1):
        sys.stderr.write(
            "\rAnalyzed {0} of {1} antibiotic resistant genes".format(
                count, num_of_sequences))
        if count != num_of_sequences:
            sys.stderr.flush()
        else:
            sys.stderr.write("\n")

        description = record.description
        gene_id = description.split(" ")[0]
        species = description[description.rfind("[") +
                              1:description.rfind("]")]
        aro_tag = [
            part.split(" ")[0] for part in description.split(". ")
            if "ARO:" in part and "ARO:1000001" not in part
        ]

        for j in range(0, len(record.seq) - kmer_size + 1):
            mer = jellyfish.MerDNA(str(record.seq[j:j + kmer_size]))
            mer.canonicalize()
            if gene_id in so.ard:
                so.ard[gene_id][0].append(so.qf[mer])
            else:
                # Metadata is resolved lazily, on the first k-mer, so genes
                # shorter than kmer_size never trigger a lookup.
                so.ard[gene_id] = ([so.qf[mer]], species,
                                   descriptions[gene_id][1],
                                   [aro_tags[tag] for tag in aro_tag])
def kmercount(k, fname):
    """Return a pseudo-counted, doubled k-mer count vector for ``fname``.

    The vector has ``4 ** k`` entries of dtype ``uint16``. Each starts at 1
    and has twice the Jellyfish count added, so values are stored at double
    scale (a 0.5 pseudo count) while staying in an integer vector.

    :param k: k-mer length.
    :param fname: path to the Jellyfish database; a ``RuntimeError`` raised
        while opening it propagates unchanged.
    :return: 1-D ``numpy`` array of length ``1 << (2 * k)``.
    """
    qf = jellyfish.QueryMerFile(fname)
    counts = np.ones(1 << (2 * k), dtype=np.uint16)
    # presumably allkmers(k) yields each k-mer as a sequence of letters, in
    # the order the vector is indexed -- verify against its definition
    for idx, letters in enumerate(allkmers(k)):
        counts[idx] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return counts
def get_kmer_freq_v(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf', k=5):
    """Return the normalized k-mer frequency row vector of a Jellyfish database.

    All ``4 ** k`` k-mers over ``A, C, G, T`` are looked up in the order
    produced by ``itertools.product`` and the counts are divided by their sum.

    :param jfdb: path to the Jellyfish database; a ``RuntimeError`` raised
        while opening it propagates unchanged.
    :param k: k-mer length.
    :return: ``numpy`` float64 array of shape ``(1, 4 ** k)``.
    """
    qf = jellyfish.QueryMerFile(jfdb)
    counts = [
        qf[jellyfish.MerDNA(''.join(letters))]
        for letters in itertools.product(('A', 'C', 'G', 'T'), repeat=k)
    ]
    freqs = np.array([counts], dtype=np.float64)
    freqs /= np.sum(freqs)
    return freqs
def test07(jfdb='../data/GRCh38.p2.ch21/GRCh38.p2.ch21.5010000.jf', k=5):
    """Print the normalized k-mer frequency row vector of a Jellyfish database.

    All ``4 ** k`` k-mers over ``A, C, G, T`` are looked up in
    ``itertools.product`` order, the counts are divided by their sum, and the
    resulting ``(1, 4 ** k)`` float64 array is printed.

    :param jfdb: path to the Jellyfish database. If opening it raises
        ``RuntimeError`` a message is printed and the error is re-raised.
    :param k: k-mer length.
    :return: None
    """
    try:
        qf = jellyfish.QueryMerFile(jfdb)
    except RuntimeError:
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('jellyfish runtime error')
        raise

    alph = ('A', 'C', 'G', 'T')
    freq_l = [
        qf[jellyfish.MerDNA(''.join(km))]
        for km in itertools.product(alph, repeat=k)
    ]
    a = np.array([freq_l], dtype=np.float64)
    a /= np.sum(a)
    print(a)
def test_add(self):
    """Exercise ``add`` / ``update_add`` / ``get`` on 1000 random mers.

    Each mer is added with a random value. On iterations where ``i % 3`` is 1
    a second ``add`` must report failure, and where it is 2 ``update_add``
    must succeed. Afterwards both ``get`` and indexing must return the
    accumulated value. The loop stops at the first failed expectation.
    """
    mer = jellyfish.MerDNA()
    ok = True
    for step in range(1000):
        mer.randomize()
        expected = random.randrange(1000)
        ok = self.hash.add(mer, expected)
        if not ok:
            break
        phase = step % 3
        if phase > 0:
            extra = random.randrange(1000)
            expected = expected + extra
            if phase == 1:
                ok = not self.hash.add(mer, extra)
            else:
                ok = self.hash.update_add(mer, extra)
            if not ok:
                break
        ok = (expected == self.hash.get(mer)) and (expected == self.hash[mer])
        if not ok:
            break
    self.assertTrue(ok)
def ttest_kmer(positive_qfs, negative_qfs, positive_factor, negative_factor,
               kmer_candidates, outfile):
    """Run Welch's t-test per candidate k-mer between two groups of samples.

    Each line of ``kmer_candidates`` is a k-mer. Its canonical form is looked
    up in every positive and negative database, each count is divided by the
    matching normalization factor, and -- when both group means are non-zero --
    a line ``kmer, positive mean, negative mean, t statistic, p value``
    (tab-separated) is appended to ``outfile``.

    :param positive_qfs: sequence of objects indexable by ``jellyfish.MerDNA``
        (one per positive sample).
    :param negative_qfs: same, for the negative samples.
    :param positive_factor: per-sample divisors, parallel to ``positive_qfs``.
    :param negative_factor: per-sample divisors, parallel to ``negative_qfs``.
    :param kmer_candidates: path of the text file with one k-mer per line.
    :param outfile: path of the result file to create or overwrite.
    """
    # Both handles are closed even if a lookup or the t-test raises.
    with open(kmer_candidates, 'r') as kmer_fh, open(outfile, 'w') as outfh:
        for processed, line in enumerate(kmer_fh, 1):
            mer = jellyfish.MerDNA(line.rstrip())
            mer.canonicalize()
            positive = [
                float(qf[mer]) / float(positive_factor[x])
                for x, qf in enumerate(positive_qfs)
            ]
            negative = [
                float(qf[mer]) / float(negative_factor[x])
                for x, qf in enumerate(negative_qfs)
            ]
            p_mean = numpy.mean(positive)
            n_mean = numpy.mean(negative)
            if not p_mean == 0 and not n_mean == 0:
                t_stat, p_val = stats.ttest_ind(positive, negative,
                                                equal_var=False)
                outfh.write("%s\t%E\t%E\t%f\t%E\n" %
                            (mer, Decimal(p_mean), Decimal(n_mean), t_stat,
                             Decimal(p_val)))
            # The original test (i % 1 == 1000) could never be true, so
            # progress was never reported; report every 1000 k-mers instead.
            if processed % 1000 == 0:
                echo("------ completed %d" % (processed))
def kmercount(k, pos, chr = 21, fname_head = '/data/yt/GRCh38.p2.ch21/GRCh38.p2'):
    """Return a pseudo-counted, doubled k-mer count column for one window.

    The Jellyfish database is ``<fname_head>.ch<chr>.<pos>.fasta.<k>.jf``.
    The result has ``4 ** k`` rows and one column, dtype ``uint16``. Each
    entry starts at 1 and has twice the Jellyfish count added, so values are
    stored at double scale (a 0.5 pseudo count) in an integer array.

    :param k: k-mer length.
    :param pos: value inserted into the ``{pos}`` slot of the file name.
    :param chr: value inserted into the ``{chr}`` slot of the file name.
    :param fname_head: leading part of the database path.
    :return: ``numpy`` array of shape ``(1 << (2 * k), 1)``. A
        ``RuntimeError`` raised while opening the database propagates.
    """
    fname = '{head}.ch{chr}.{pos}.fasta.{k}.jf'.format(
        head=fname_head, chr=chr, pos=pos, k=k)
    qf = jellyfish.QueryMerFile(fname)
    counts = np.ones((1 << (2 * k), 1), dtype=np.uint16)
    # presumably allkmers(k) yields each k-mer as a sequence of letters, in
    # the order the rows are indexed -- verify against its definition
    for row, letters in enumerate(allkmers(k)):
        counts[row][0] += 2 * qf[jellyfish.MerDNA(''.join(letters))]
    return counts
def mlst_profiles(self, mlst_profiles):
    """Report the MLST sequence types consistent with this sample's k-mers.

    For every gene of every species, an allele matches when all of its k-mers
    have a non-zero count in ``self.qf``. Every combination of one matching
    allele per gene (in ``GENE_ORDER``) is looked up in the species' ``ST``
    table and reported as one line.

    :param mlst_profiles: mapping of species to a dict with the keys
        ``"GENE_ORDER"``, ``"GENES"`` and ``"ST"``, or ``None``.
    :return: list of report strings, or a single-element list with a
        placeholder message when no profiles were given or none matched.
    """
    if mlst_profiles is None:
        return ["no profiles loaded"]

    def is_present(kmer):
        # True when the canonical form of ``kmer`` has a non-zero count.
        mer = jellyfish.MerDNA(kmer)
        mer.canonicalize()
        return self.qf[mer] != 0

    report = []
    for species, species_data in mlst_profiles.items():
        gene_order = species_data["GENE_ORDER"]
        alleles_by_gene = OrderedDict()
        for gene in gene_order:
            alleles_by_gene[gene] = [
                allele
                for allele, kmers in species_data["GENES"][gene].items()
                if all(is_present(kmer) for kmer in kmers)
            ]
        known_types = species_data["ST"]
        joined_order = ":".join(gene_order)
        for combo in itertools.product(*alleles_by_gene.values()):
            key = ":".join(combo)
            report.append("{0}\tST: {2}\tprofile: {1} [{3}]".format(
                species, key, known_types.get(key, 'NONE'), joined_order))

    if not report:
        return ["no matching profiles found"]
    return report
def get_count_jf(self, jf):
    """Return the value ``jf`` holds for the canonical form of ``self._seq``.

    :param jf: object indexable by a ``jellyfish.MerDNA``.
    """
    canonical = jellyfish.MerDNA(self._seq)
    canonical.canonicalize()
    count = jf[canonical]
    return count
def query(self, seq):
    """Look ``seq`` up in ``self.jf``.

    The k-mer is canonicalized first when ``self.canonical`` is truthy.

    :param seq: k-mer sequence passed to ``jellyfish.MerDNA``.
    :return: the value ``self.jf`` holds for the (optionally canonical) k-mer.
    """
    mer = jellyfish.MerDNA(seq)
    if self.canonical:
        mer.canonicalize()
    return self.jf[mer]
def compare_to_and_filter(self,
                          strain,
                          complexity_cutoff=12,
                          coverage_cutoff=3,
                          reference_set=None,
                          inverse=False,
                          filtering_cutoff=85,
                          verbose=False):
    """
    Compare this strain with ``strain`` and discount unreliable differences.

    The k-mer sets of both strains (the archive sets when ``self.rapid_mode``
    is set) are optionally restricted by ``reference_set``. Two similarity
    percentages are computed: ``total`` (shared k-mers over the union) and
    ``rescue`` (shared k-mers over the smaller set). If ``total`` is below
    ``filtering_cutoff``, or either strain has ``do_not_filter`` set, these
    are returned as is. Otherwise every k-mer found in only one strain is
    inspected and counted as filtered when any of the following holds:

    * its filtered count is within 1 to 3 of the owning strain's k-mer cutoff
    * a single base makes up more than half of the k-mer
    * its three longest homopolymer runs sum to more than
      ``complexity_cutoff``
    * it is present in the other strain's unfiltered counts

    Filtered k-mers are removed from both denominators before the
    percentages are recomputed.

    :param strain: the other strain; must expose the same attributes as
        ``self`` (``name``, ``kmer_set`` / ``kmer_archive``, ``qf``,
        ``qf_filtered``, ``kmer_cutoff``, ``coverage``, ``do_not_filter``)
    :param complexity_cutoff: largest allowed sum of the three longest
        homopolymer runs
    :param coverage_cutoff: only reported in the summary text; the matching
        filter is disabled
    :param reference_set: optional set of k-mers used to restrict both strains
    :param inverse: when True remove ``reference_set`` from both strains
        instead of intersecting with it
    :param filtering_cutoff: minimum ``total`` (percent) needed to run the
        filter
    :param verbose: print every retained strain-specific k-mer
    :return: ``(self.name, strain.name, total, rescue, denom,
        smallest_count)``. Percentages whose denominator is zero are
        reported as 0.
    """
    # USE THE ARCHIVE SET IF RAPID_MODE=True
    if self.rapid_mode:
        strain_1_kmer_set = self.kmer_archive
        strain_2_kmer_set = strain.kmer_archive
    else:
        strain_1_kmer_set = self.kmer_set
        strain_2_kmer_set = strain.kmer_set

    # FILTER IN OR OUT THE REFERENCE SET
    if reference_set is None:
        strain_1 = strain_1_kmer_set
        strain_2 = strain_2_kmer_set
    elif inverse:
        strain_1 = strain_1_kmer_set.difference(reference_set)
        strain_2 = strain_2_kmer_set.difference(reference_set)
    else:
        strain_1 = strain_1_kmer_set.intersection(reference_set)
        strain_2 = strain_2_kmer_set.intersection(reference_set)

    intersection = float(len(strain_1.intersection(strain_2)))
    # size of the union: unique to 1 + unique to 2 + shared
    denom = ((len(strain_1) - intersection) +
             (len(strain_2) - intersection)) + intersection
    # Two empty sets give a zero union; report 0% instead of raising.
    total = intersection / denom * 100.0 if denom else 0.0

    smallest_count = float(len(strain_1))
    strain_1_smallest = True
    if len(strain_2) < smallest_count:
        smallest_count = len(strain_2)
        strain_1_smallest = False

    # catching a divide by zero error and returning 0
    rescue_numerator = intersection
    try:
        rescue = rescue_numerator / smallest_count * 100.0
    except ZeroDivisionError:
        print(
            "###############\nWARNING:\tSample {} or {} does not have sufficient coverage."
            "\n###############".format(self.name, strain.name))
        rescue = 0

    if total < filtering_cutoff or self.do_not_filter or strain.do_not_filter:
        return self.name, strain.name, total, rescue, denom, smallest_count

    # get the difference kmers
    differences_1 = strain_1.difference(strain_2)
    differences_2 = strain_2.difference(strain_1)
    differences = strain_1.symmetric_difference(strain_2)

    complexity_count = 0
    # index 0/1/2 -> filtered count is 1/2/3 above the strain's kmer cutoff
    within_strain_1 = [0, 0, 0]
    within_strain_2 = [0, 0, 0]
    counter_not_filtered = 0
    counter_filtered = 0
    coverage_100 = 0  # excessive-coverage filter is disabled; always 0
    kept_1 = 0
    kept_2 = 0
    nucleotide_skew = 0
    filtered_1 = 0
    filtered_2 = 0
    below_cutoff_1 = 0
    below_cutoff_2 = 0

    for kmer in differences:
        is_filtered_kmer = False
        mer = jellyfish.MerDNA(kmer)
        mer.canonicalize()
        s1_count = int(self.qf_filtered[mer])
        s2_count = int(strain.qf_filtered[mer])

        # k-mers that only just cleared the count cutoff sit near the
        # histogram tail and are treated as unreliable
        if s2_count == 0:
            margin = s1_count - int(self.kmer_cutoff)
            if 1 <= margin <= 3:
                within_strain_1[margin - 1] += 1
                is_filtered_kmer = True
        else:
            margin = s2_count - int(strain.kmer_cutoff)
            if 1 <= margin <= 3:
                within_strain_2[margin - 1] += 1
                is_filtered_kmer = True

        # sum of the three longest homopolymer runs
        run_lengths = sorted((len(list(g)) for _, g in groupby(kmer)),
                             reverse=True)
        complexity = sum(run_lengths[:3])

        # more than half of the k-mer is one base (the original hard-coded
        # 31.0 / 2, which is the same threshold for 31-mers)
        if max(Counter(kmer).values()) > (len(kmer) / 2.0):
            nucleotide_skew += 1
            is_filtered_kmer = True

        if complexity > complexity_cutoff:
            is_filtered_kmer = True
            complexity_count += 1

        if is_filtered_kmer is False:
            # A k-mer absent from the filtered counts of one strain may
            # still be present below that strain's cutoff in the raw counts.
            if s1_count == 0:
                if self.qf[mer] == 0:
                    kept_2 += 1
                    if verbose:
                        print("strain: 2\tcount: {0}\t{1}|{2}\t{3}".format(
                            s2_count, kmer, rc(kmer), self.qf[mer]))
                else:
                    below_cutoff_1 += 1
                    is_filtered_kmer = True
            else:
                if strain.qf[mer] == 0:
                    kept_1 += 1
                    if verbose:
                        print("strain: 1\tcount: {0}\t{1}|{2}\t{3}".format(
                            s1_count, kmer, rc(kmer), strain.qf[mer]))
                else:
                    below_cutoff_2 += 1
                    is_filtered_kmer = True

        # count updating
        if is_filtered_kmer:
            counter_filtered += 1
            if s1_count == 0:
                filtered_2 += 1
            else:
                filtered_1 += 1
        else:
            counter_not_filtered += 1

    # NOTE(review): this summary is assembled but never printed or returned
    # in the code visible here -- confirm whether it should be emitted when
    # ``verbose`` is set.
    s = "\n"
    s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    s += "{0:.1f} X\tcoverage '{1}' [kmer cutoff = {2}]\n".format(
        self.coverage, self.name, self.kmer_cutoff)
    s += "{0:.1f} X\tcoverage '{1}' [kmer cutoff = {2}]\n".format(
        strain.coverage, strain.name, strain.kmer_cutoff)
    s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    s += "{0}\tkmers found in '{1}' but not '{2}'\n".format(
        len(differences_1), self.name, strain.name)
    s += "{0}\tkmers found in '{1}' but not '{2}' [AFTER FILTERING]\n".format(
        kept_1, self.name, strain.name)
    s += "{0}\tkmers found in '{1}' but not '{2}'\n".format(
        len(differences_2), strain.name, self.name)
    s += "{0}\tkmers found in '{1}' but not '{2}' [AFTER FILTERING]\n".format(
        kept_2, strain.name, self.name)
    s += "~~~~~~~~~ FILTER ATTRS ~~~~~~~~~~~\n"
    s += "{1}\tHomopolymer runs summing >= {0} [sum of 3 homopolymer runs]\n".format(
        complexity_cutoff, complexity_count)
    s += "{0}\tHalf of the kmer contains a single base\n".format(
        nucleotide_skew)
    s += "{0} : {1} : {2}\twithin 1:2:3 count of cutoff [strain_1] " \
         "(e.g the kmer is near the histogram tail)\n".format(
             within_strain_1[0], within_strain_1[1], within_strain_1[2])
    s += "{0} : {1} : {2}\tWithin 1:2:3 count of cutoff [strain_2] " \
         "(e.g the kmer is near the histogram tail)\n".format(
             within_strain_2[0], within_strain_2[1], within_strain_2[2])
    s += "{1}\tkmers with excessive coverage [{0}X]\n".format(
        coverage_cutoff, coverage_100)
    s += "{0}\tkmer found below initial cutoff [strain_1]\n".format(
        below_cutoff_1)
    s += "{0}\tkmer found below initial cutoff [strain_2]\n".format(
        below_cutoff_2)
    s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    s += "{0}\tkmers filtered\n{1}\tkmers retained\n".format(
        counter_filtered, counter_not_filtered)
    s += "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    s += "\n"

    # Remove the filtered k-mers from both denominators and recompute.
    denom -= counter_filtered
    if strain_1_smallest:
        smallest_count -= filtered_1
    else:
        smallest_count -= filtered_2

    # Filtering can empty a denominator; report 0 instead of raising.
    total = intersection / denom * 100.0 if denom else 0.0
    rescue = (rescue_numerator / smallest_count * 100.0
              if smallest_count else 0)
    return self.name, strain.name, total, rescue, denom, smallest_count
#! /usr/bin/env python
"""Print the Jellyfish count of each k-mer given on the command line.

Usage: <script> <jellyfish_db> [<kmer> ...]
"""
import jellyfish
import sys


def main(argv):
    """Query ``argv[1]`` for every k-mer in ``argv[2:]``.

    One ``"<kmer> <count>"`` line is printed per k-mer. K-mers are looked up
    exactly as given (no canonicalization).

    :param argv: full argument vector, including the program name.
    :return: process exit status (0 on success, 1 on a usage error).
    """
    if len(argv) < 2:
        sys.stderr.write("usage: %s <jellyfish_db> [<kmer> ...]\n" % argv[0])
        return 1
    qf = jellyfish.QueryMerFile(argv[1])
    for kmer in argv[2:]:
        print("%s %d" % (kmer, qf[jellyfish.MerDNA(kmer)]))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
def get_count(kmer, jf):
    """Return the value ``jf`` holds for the canonical form of ``kmer.seq``.

    :param kmer: object whose ``seq`` attribute is the k-mer sequence.
    :param jf: object indexable by a ``jellyfish.MerDNA``.
    """
    canonical = jellyfish.MerDNA(kmer.seq)
    canonical.canonicalize()
    count = jf[canonical]
    return count
def genquery(genomeFile, jellyFile, totedits, medindel, insprob, delprob,
             queryfreq, querycount, outputFile):
    """Write a random stream of genome edits interleaved with k-mer queries.

    Edits are written to ``outputFile`` as ``I <pos> <seq>`` (insertion),
    ``D <start> <end>`` (deletion) or ``S <pos> <base>`` (SNP). After every
    ``queryfreq`` edits, ``querycount`` query lines ``Q <kmer> <flag> <count>``
    are written. ``flag`` is ``I`` when the k-mer was drawn from the k-mers
    touched by earlier edits and ``N`` when it was drawn from a random genome
    position. ``count`` is the k-mer's count in the Jellyfish database.

    NOTE(review): ``genome`` is never modified by the edits while
    ``numbases`` is, so positions drawn later may fall outside the original
    string -- confirm this is intended.

    :param genomeFile: open file whose first line is the genome; it is closed
        here.
    :param jellyFile: path to the Jellyfish database used for the counts.
    :param totedits: total number of edits to make.
    :param medindel: median (mean) indel size, forwarded to
        ``random_insertion`` / ``random_deletion``.
    :param insprob: probability that an edit is an insertion.
    :param delprob: probability that an edit is a deletion.
    :param queryfreq: number of edits between query batches.
    :param querycount: number of queries per batch.
    :param outputFile: open writable file; it is closed here.
    :raises ValueError: if ``insprob + delprob`` exceeds 1.0.
    """
    if delprob + insprob > 1.0:
        # The original raised a bare string, which is itself a TypeError.
        raise ValueError("Error, delprob = {} and insprob = {}. "
                         "The sum is {} > 1.0".format(
                             delprob, insprob, delprob + insprob))

    genome = genomeFile.readline()
    genomeFile.close()
    qf = jellyfish.QueryMerFile(jellyFile)

    # drop the last character of the line (the newline, if present)
    numbases = len(genome) - 1
    genome = genome[0:numbases]

    snpProb = 1.0 - (insprob + delprob)
    editTypes = (['S'] * int(snpProb * totedits)) + \
                (['D'] * int(delprob * totedits)) + \
                (['I'] * int(insprob * totedits))
    random.shuffle(editTypes)

    qcount = 0
    effectedkmers = set()
    for val in editTypes:
        qcount += 1
        if val == 'I':
            p, s, seq = random_insertion(numbases, medindel)
            numbases += s
            outputFile.write('I %d %s\n' % (p, seq))
            add_kmers_in_seq(effectedkmers, seq)
            add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K])
        elif val == 'D':
            p, s = random_deletion(numbases, medindel)
            numbases -= s
            outputFile.write('D %d %d\n' % (p, p + s - 1))
        else:
            p, seq = random_snp(numbases)
            outputFile.write('S %d %s\n' % (p, seq))
            add_kmers_in_seq(effectedkmers, genome[p - K + 1:p + K - 1])

        # if it's time to output some queries
        if qcount == queryfreq:
            qcount = 0
            for _ in range(querycount):
                dart = random.random()
                # Fall back to a genome k-mer while no edited k-mers exist
                # yet (e.g. only deletions so far) instead of crashing.
                if dart <= EDIT_QUERY_PROB and effectedkmers:
                    # random.sample() no longer accepts a set on
                    # Python 3.11+, so sample from a tuple snapshot.
                    kmer = random.sample(tuple(effectedkmers), 1)[0]
                    editflag = 'I'
                else:
                    p = random.randrange(K * 2, numbases - K * 2)
                    kmer = genome[p:p + K].upper()
                    editflag = 'N'
                kcount = int(qf[jellyfish.MerDNA(kmer)])
                outputFile.write('Q %s %s %d\n' % (kmer, editflag, kcount))
    outputFile.close()
def get_kmer_count(self, kmer):
    """Return the value ``self.qf`` holds for the canonical form of ``kmer``.

    :param kmer: k-mer; converted with ``str()`` before building the
        ``jellyfish.MerDNA``.
    """
    mer = jellyfish.MerDNA(str(kmer))
    mer.canonicalize()
    count = self.qf[mer]
    return count