def test_all(tmpdir):
    """Round-trip test: build an MPHF, query it, save it, reload, re-query."""
    # Build a minimal perfect hash over the keys 0..9.
    mphf = bbhash.PyMPHF(list(range(10)), 10, 1, 1.0)
    assert mphf.lookup(9) == 8

    # Persist to disk and reload; the lookup must survive the round trip.
    saved = tmpdir.join('xxx')
    mphf.save(str(saved))
    reloaded = bbhash.load_mphf(str(saved))
    assert reloaded.lookup(9) == 8
def from_catlas_directory(cls, catlas_prefix):
    """Load kmer index created by search.contigs_by_kmer.

    Reads the BBHash MPHF and the companion numpy arrays from
    `catlas_prefix` and builds an instance from them.
    """
    mphf_path = os.path.join(catlas_prefix, 'contigs.fa.gz.mphf')
    indices_path = os.path.join(catlas_prefix, 'contigs.fa.gz.indices')

    mphf = bbhash.load_mphf(mphf_path)

    # Pull all arrays out while the file is still open (numpy.load on an
    # archive is lazy).
    with open(indices_path, 'rb') as arrays_fp:
        saved = numpy.load(arrays_fp)
        kmer_table = saved['mphf_to_kmer']
        cdbg_table = saved['kmer_to_cdbg']
        sizes = saved['sizes']

    return cls(mphf, kmer_table, cdbg_table, sizes)
def main():
    """Scan FAST[AQ] reads against a k-mer family database and report
    candidate fusion reads.

    Expects `<database>.mphf` (BBHash minimal perfect hash) and
    `<database>.arr` (pickled lookup arrays) to exist.  Emits several
    tab-separated report files and FASTA files of candidate fusion reads,
    all named after the database prefix.

    NOTE(review): the counters n_unmatched, n_same, n_amb_same,
    n_clear_fusion, n_ambig_fusion and n_mutli_fusion are used via
    `global` and are assumed to be initialized at module level — not
    visible in this chunk; confirm.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames', metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p', '--paired', action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single', dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u', '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f', '--force', dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    args = parser.parse_args()

    force_single = args.force_single

    #if args.reads == '-':
    #    args.reads = sys.stdin

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    filenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # create object of Nodetable in Khmer to use its k-mer hashing
    # (table sizes of 1 — only get_kmer_hashes() is used, not counting).
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))
    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)
    print('done!')

    def get_kmer_to_family_ids(hashval):
        """Map a k-mer hash to its set of family IDs.

        Returns an empty set when the hash is not in the MPHF or is a
        false positive (MPHF may map unknown keys to a slot, so the
        stored k-mer hash is checked against the query).
        """
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            # MPHF false positive: slot holds a different k-mer.
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    def readFusion(read):
        """Classify a single read against the family database.

        Returns (flag, families, shared_kmers, gaps) where flag is one of
        "unmatched", "unique", "ambiguous", "clear_fusion", "ambig_fusion"
        or "multi_fusion"; families is a list of family-ID sets seen along
        the read; shared_kmers counts matching k-mers per family segment;
        gaps counts unmatched k-mers between segments.

        Side effect: increments the module-level tally counters below.
        """
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, n_ambig_fusion, n_mutli_fusion

        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf = hashvals[0]
        lf_ids = get_kmer_to_family_ids(lf)
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf = hashvals[idx]
            lf_ids = get_kmer_to_family_ids(lf)
            idx += 1

        if len(lf_ids) == 0:
            # no k-mer anywhere in the read matched the database
            #print('no single match')
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            # only the very last k-mer matched: single-family read
            #print('same, only last kmer matched')
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 & idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt = hashvals[-1]
            rt_ids = get_kmer_to_family_ids(rt)
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt = hashvals[idy]
                rt_ids = get_kmer_to_family_ids(rt)
                idy -= 1

            if len(rt_ids) == 0:
                # only one (non-last) k-mer matched: single-family read
                #print('same, only one non-last kmer matched ')
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                # both ends matched; shared families mean no fusion
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:
                    # fusion to be resolved: walk the interior k-mers,
                    # splitting the read into segments each supported by a
                    # consistent family-ID set.
                    shared_kmer = 1
                    gap_size = 0
                    gap = False  # NOTE(review): assigned but never used
                    while idx <= idy + 1:
                        temp = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(temp)
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                # still consistent with the current segment
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                # family switch: close the current segment
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            # unmatched k-mer inside a segment boundary
                            gap_size += 1
                        idx += 1
                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_mutli_fusion += 1
                        flag = "multi_fusion"

        #if len(families) == 0:
        #    families = "-"
        #if len(shared_kmers) == 0:
        #    shared_kmers = "-"

        return flag, families, shared_kmers, gaps

    # --- open all report/output files and write their headers ---
    fusion_filename = args.database + '_fusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    fusionInfo_filename = args.database + '_fusion.info'
    fusionInfo_fp = open(fusionInfo_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "gene_families", "shared_kmers", "gaps",
          file=fusionInfo_fp, sep='\t')
    fusionCalc_filename = args.database + '_fusion.calc'
    fusionCalc_fp = open(fusionCalc_filename, 'w')
    print("fileName", "recordIndex", "whichInPair", "align_class",
          "familiy_A", "familiy_B", "no_families", "len_families",
          "shared_kmers", "gaps", "sorted_keys",
          file=fusionCalc_fp, sep='\t')

    fusionPairs_filename = args.database + '_fusionPairs.fa'
    fusPair_fp = open(fusionPairs_filename, 'w')
    fusionPairsInfo_filename = args.database + '_fusionPairs.info'
    fusPairInfo_fp = open(fusionPairsInfo_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "R1_family",
          "R2_family", file=fusPairInfo_fp, sep='\t')
    fusionPairsCalc_filename = args.database + '_fusionPairs.calc'
    fusPairCalc_fp = open(fusionPairsCalc_filename, 'w')
    print("fileName", "recordIndex", "fusion_class", "familiy_A",
          "familiy_B", "len_families", "sorted_keys",
          file=fusPairCalc_fp, sep='\t')

    corrupt_files = []
    # invert family_ids so we can print "id:name" pairs below
    family_names = dict(zip(family_ids.values(), family_ids.keys()))
    n = 0
    n_paired_fusion = 0
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")
    for filename, require_paired in files:
        # catch_io_errors: skip/record corrupt input files when --force
        with catch_io_errors(filename, fusion_fp, fusionInfo_fp,
                             fusionCalc_fp, fusPair_fp, fusPairInfo_fp,
                             fusPairCalc_fp, args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter, min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)
            for r_index, is_paired, read0, read1 in reader:
                n += 1
                if n % 10000 == 0:
                    print('...', n)
                #if n > 5000:
                #    break

                flag0, families0, shared_kmers0, gaps0 = readFusion(read0)

                if not is_paired and flag0 in fusion:
                    # single-end read classified as some kind of fusion
                    #families_names0 = []
                    #for gp in families0:
                    #    gp_names = []
                    #    for family_id in gp:
                    #        family_name = family_names[family_id]
                    #        gp_names.append(family_name)
                    #    families_names0.append(gp_names)
                    print(filename, r_index, "single", flag0, families0,
                          shared_kmers0, gaps0, file=fusionInfo_fp, sep='\t')
                    write_record(read0, fusion_fp)

                    #i = 1
                    #while i < len(families0):
                    #    for g1 in families0[i-1]:
                    #        for g2 in families0[i]:
                    #            print(filename, r_index, "single", flag0, sorted([g1,g2]), len(families0), len(families0[i-1]), len(families0[i]),
                    #                  shared_kmers0, gaps0, file=fusionCalc_fp, sep='\t')
                    #    i += 1

                    # cross-product of first and last family segments
                    i = len(families0) - 1
                    for g1 in families0[0]:
                        g1_name = family_names[g1]
                        for g2 in families0[i]:
                            g2_name = family_names[g2]
                            print(filename, r_index, "single", flag0,
                                  '{}:{}'.format(g1, g1_name),
                                  '{}:{}'.format(g2, g2_name),
                                  len(families0),
                                  [len(f) for f in families0],
                                  shared_kmers0, gaps0,
                                  sorted([g1, g2]),
                                  file=fusionCalc_fp, sep='\t')

                if is_paired:
                    flag1, families1, shared_kmers1, gaps1 = readFusion(read1)

                    if flag0 in fusion or flag1 in fusion:
                        # at least one mate is itself a fusion read
                        print(filename, r_index, "Read_1", flag0, families0,
                              shared_kmers0, gaps0, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read0, fusion_fp)
                        print(filename, r_index, "Read_2", flag1, families1,
                              shared_kmers1, gaps1, file=fusionInfo_fp,
                              sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            i = len(families0) - 1
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families0[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_1", flag0,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families0),
                                          [len(f) for f in families0],
                                          shared_kmers0, gaps0,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                        if flag1 in fusion:
                            i = len(families1) - 1
                            for g1 in families1[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[i]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, "Read_2", flag1,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families1),
                                          [len(f) for f in families1],
                                          shared_kmers1, gaps1,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp, sep='\t')

                    elif flag0 in sameRef and flag1 in sameRef:
                        # each mate maps to one family; disjoint families
                        # across the pair is paired-end fusion evidence
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1
                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"
                            print(filename, r_index, fusion_class, families0,
                                  families1, file=fusPairInfo_fp, sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename, r_index, fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          [
                                              len(f) for f in
                                              (families0[0], families1[0])
                                          ],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp, sep='\t')

    # --- final tallies (counters updated inside readFusion) ---
    print('No of input fragments: ', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_mutli_fusion)
    print('paired read fusion:', n_paired_fusion)
def main():
    """Check reads for consistent family assignment at both ends.

    For each read, find an unambiguously assigned k-mer near the start and
    one near the end of the read; count the read as 'same' when both
    resolve to the identical single family, 'different' otherwise.

    Expects `<database>.mphf` and `<database>.arr` on disk; `reads` may be
    '-' for stdin.  Processes at most ~5000 reads (early break).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('reads')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    args = parser.parse_args()

    if args.reads == '-':
        args.reads = sys.stdin

    # khmer Nodetable used only for its k-mer hashing.
    kh = khmer.Nodetable(args.ksize, 1, 1)

    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))
    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)
    print('done!')

    def get_kmer_to_family_ids(hashval):
        """Map a k-mer hash to its family-ID set; empty set if absent
        (the stored k-mer hash is checked to reject MPHF false positives)."""
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    n_same = 0
    n_different = 0

    n = 0
    for record in screed.open(args.reads):
        n += 1
        if n % 1000 == 0:
            print('...', n)
        if n > 5000:
            break

        hashvals = kh.get_kmer_hashes(record.sequence)
        if len(hashvals) <= 1:
            continue

        first = hashvals[0]
        last = hashvals[-1]

        # find the first unambiguously assigned k-mer
        first_ids = get_kmer_to_family_ids(first)
        idx = 1
        while idx < len(hashvals) / 2 and len(first_ids) != 1:
            first = hashvals[idx]
            # BUG FIX: the original never recomputed first_ids inside the
            # loop, so the exit condition could never change and the scan
            # was a no-op.
            first_ids = get_kmer_to_family_ids(first)
            idx += 1

        # find the last unambiguously assigned k-mer
        last_ids = get_kmer_to_family_ids(last)
        idx = len(hashvals) - 2
        while idx > len(hashvals) / 2 and len(last_ids) != 1:
            last = hashvals[idx]
            # BUG FIX: same missing recomputation as above.
            last_ids = get_kmer_to_family_ids(last)
            idx -= 1

        if len(first_ids) == 1 and len(last_ids) == 1 and \
           first_ids == last_ids:
            n_same += 1
        else:
            print('different {} {}'.format(first_ids, last_ids))
            n_different += 1

    print('same:', n_same)
    print('different:', n_different)
def main():
    """Classify reads as same-family, fusion (class A/B), or ambiguous.

    A read is a fusion candidate when k-mers at its two ends resolve to
    disjoint family sets.  'Class A' fusions are resolved directly from the
    end k-mers; 'class B' fusions require walking inward to disambiguate.
    Candidate reads are written to `<database>_fusion.fa` and ambiguous
    candidates to `<database>_ambFusion.fa`.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('reads')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    args = parser.parse_args()

    #if args.reads == '-':
    #    args.reads = sys.stdin

    # khmer Nodetable used only for its k-mer hashing.
    kh = khmer.Nodetable(args.ksize, 1, 1)

    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))
    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)
    print('done!')

    def get_kmer_to_family_ids(hashval):
        """Map a k-mer hash to its family-ID set; empty set if absent
        (the stored k-mer hash is checked to reject MPHF false positives)."""
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()
        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()
        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    n_same = 0
    n_fusion = 0
    n_unmatched = 0
    n_amb_same = 0
    n_amb_fusion = 0
    n_others = 0

    fusion_filename = args.database + '_fusion.fa'
    amb_fusion_filename = args.database + '_ambFusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    ambfusion_fp = open(amb_fusion_filename, 'w')

    n = 0
    for record in screed.open(args.reads):
        n += 1
        if n % 1000 == 0:
            print('...', n)
        #if n > 5000:
        #    break

        hashvals = kh.get_kmer_hashes(record.sequence)
        if len(hashvals) <= 1:
            continue

        # find a matching k-mer at the beginning of the read
        first = hashvals[0]
        first_ids = get_kmer_to_family_ids(first)
        idx = 1
        while idx < len(hashvals) and len(first_ids) == 0:
            first = hashvals[idx]
            first_ids = get_kmer_to_family_ids(first)
            idx += 1

        if len(first_ids) == 0:
            print('no single match')
            n_unmatched += 1
        elif idx == len(hashvals):
            print('same, only last kmer matched ')
            n_same += 1
        else:
            # find a matching k-mer at the end of the read
            last = hashvals[-1]
            last_ids = get_kmer_to_family_ids(last)
            idy = len(hashvals) - 2
            while idy > idx and len(last_ids) == 0:
                last = hashvals[idy]
                last_ids = get_kmer_to_family_ids(last)
                idy -= 1

            if len(last_ids) == 0:
                print('same, only first kmer matched ')
                n_same += 1
            elif len(first_ids) == 1 and len(last_ids) == 1:
                # both ends resolved to a single family each
                if first_ids == last_ids:
                    n_same += 1
                else:
                    print('fusion class A {} {} at kmers {} {}'.format(
                        first_ids, last_ids, idx, idy + 2))
                    n_fusion += 1
                    write_record(record, fusion_fp)
            else:
                intersect_ids = first_ids.intersection(last_ids)
                if len(intersect_ids) > 0:
                    # ambiguous but overlapping: same reference
                    n_amb_same += 1
                else:
                    ## try to resolve ambiguity by walking inward
                    # narrow the family set from the start of the read
                    while idx <= idy:  # and len(first_ids) > 1:
                        first = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(first)
                        if len(temp_ids) == 0:
                            idx += 1
                        else:
                            intersect_ids = first_ids.intersection(temp_ids)
                            if len(intersect_ids) == 0:
                                # family switch found: stop narrowing
                                break
                            else:
                                first_ids = intersect_ids
                                idx += 1

                    # narrow the family set from the end of the read
                    other = 0
                    while idy >= idx:  # and len(last_ids) > 1:
                        last = hashvals[idy]
                        temp_ids = get_kmer_to_family_ids(last)
                        if len(temp_ids) == 0:
                            idy -= 1
                        else:
                            intersect_ids = last_ids.intersection(temp_ids)
                            if len(intersect_ids) == 0:
                                # a third, inconsistent family appeared
                                other = 1
                                break
                            else:
                                last_ids = intersect_ids
                                idy -= 1

                    if other:
                        print('other {} {} {}'.format(first_ids, temp_ids,
                                                      last_ids))
                        n_others += 1
                    elif len(first_ids) == 1 and len(last_ids) == 1:
                        print('fusion class B {} {} at kmers {} {}'.format(
                            first_ids, last_ids, idx, idy + 2))
                        n_fusion += 1
                        write_record(record, fusion_fp)
                    else:
                        print('ambiguous fusion {} {} at kmers {} {}'.format(
                            first_ids, last_ids, idx, idy + 2))
                        n_amb_fusion += 1
                        # BUG FIX: removed a stray no-op expression
                        # statement `ambfusion_fp` that preceded this call.
                        write_record(record, ambfusion_fp)

    # BUG FIX: close output files (original leaked both handles).
    fusion_fp.close()
    ambfusion_fp.close()

    print('same:', n_same)
    print('fusion:', n_fusion)
    print('unmatched:', n_unmatched)
    print('ambiguous same:', n_amb_same)
    print('ambiguous fusion:', n_amb_fusion)
    print('others:', n_others)