def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))
    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
        args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim
    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, \
        args.trim_size, args.min_cov_aln, args.max_cov_aln, \
        args.allow_external_mapping
    # TODO: pass config object, not tuple, so we can add fields
    inputs = []
    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
        inputs.append((get_consensus, datum))
    try:
        LOG.info('running {!r}'.format(get_consensus))
        for res in exe_pool.imap(io.run_func, inputs):
            process_get_consensus_result(res, args)
        LOG.info('finished {!r}'.format(get_consensus))
    except:
        LOG.exception('failed gen_consensus')
        exe_pool.terminate()
        raise
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))
    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
        args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim
    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    inputs = []
    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
        inputs.append((get_consensus, datum))
    try:
        LOG.info('running {!r}'.format(get_consensus))
        for res in exe_pool.imap(io.run_func, inputs):
            process_get_consensus_result(res, args)
        LOG.info('finished {!r}'.format(get_consensus))
    except:
        LOG.exception('failed gen_consensus')
        exe_pool.terminate()
        raise
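# --- hedged sketch (not original code) ---------------------------------------
# The TODO above asks for a config object instead of a bare tuple, so that new
# fields can be added without breaking positional unpacking. A minimal sketch,
# assuming the field order of the 8-element tuple built directly above; the
# names ConsensusConfig and make_config are invented for illustration:
import collections

ConsensusConfig = collections.namedtuple('ConsensusConfig', [
    'min_cov', 'K', 'max_n_read', 'min_idt', 'edge_tolerance',
    'trim_size', 'min_cov_aln', 'max_cov_aln'])

def make_config(args, K=8):
    # Consumers could then read fields by name (config.min_idt) rather than by
    # tuple position, provided get_seq_data() and friends are updated to match.
    return ConsensusConfig(
        min_cov=args.min_cov, K=K, max_n_read=args.max_n_read,
        min_idt=args.min_idt, edge_tolerance=args.edge_tolerance,
        trim_size=args.trim_size, min_cov_aln=args.min_cov_aln,
        max_cov_aln=args.max_cov_aln)
# ------------------------------------------------------------------------------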
def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    io.LOG("starting ovlp_filter")
    file_list = io.validated_fns(fofn)
    io.LOG("fofn %r: %r" % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
        io.LOG("finished ovlp_filter")
    except KeyboardInterrupt:
        io.LOG("terminating ovlp_filter workers...")
        exe_pool.terminate()
def try_run_ovlp_stats(n_core, fofn, min_len):
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
def run(args):
    logging.basicConfig(level=int(round(10 * args.verbose_level)))
    good_region = re.compile("[ACGT]+")
    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
        args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim
    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus,
                             get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue
        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print(">" + seed_id)
                print(cns[-1])
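# --- hedged sketch (not original code) ---------------------------------------
# format_seq() is called above but defined elsewhere in the package; a minimal
# stand-in with the assumed behavior (wrap a sequence into fixed-width lines)
# could look like this. The name format_seq_sketch is invented to avoid
# shadowing the real helper:
def format_seq_sketch(seq, col):
    # Join fixed-width slices of the sequence with newlines.
    return "\n".join(seq[i:i + col] for i in range(0, len(seq), col))

# e.g. format_seq_sketch("ACGTACGTACGT", 5) == "ACGTA\nCGTAC\nGT"
# ------------------------------------------------------------------------------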
def try_run_ovlp_stats(n_core, db_fn, fofn, min_len):
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn {!r}: {}'.format(fofn, file_list))
    io.LOG('db {!r}; n_core={}'.format(db_fn, n_core))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, db_fn, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    io.LOG('starting ovlp_filter')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
        io.LOG('finished ovlp_filter')
    except:
        io.LOG('terminating ovlp_filter workers...')
        exe_pool.terminate()
        raise
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    io.LOG('starting track_reads')
    pread_dir = os.path.abspath(os.path.join(base_dir, "1-preads_ovl"))
    file_list = glob.glob(os.path.join(pread_dir, "m*/preads.*.las"))
    io.LOG('file list: %r' % file_list)
    db_fn = os.path.join(pread_dir, "preads.db")
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
        io.LOG('finished track_reads')
    except:
        io.LOG('terminating track_reads workers...')
        exe_pool.terminate()
        raise
def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov, min_len, min_idt, ignore_indels, bestn, db_fn):
    io.LOG('starting ovlp_filter')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    tmp_out_fn = out_fn + '.tmp'
    try:
        with open(tmp_out_fn, 'w') as outs:
            run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov,
                            min_cov, min_len, min_idt, ignore_indels, bestn, db_fn)
        os.rename(tmp_out_fn, out_fn)
        io.LOG('finished ovlp_filter')
    except:
        io.LOG('terminating ovlp_filter workers...')
        exe_pool.terminate()
        raise
def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    io.LOG('starting ovlp_filter')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    tmp_out_fn = out_fn + '.tmp'
    try:
        with open(tmp_out_fn, 'w') as outs:
            run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
            outs.write('---\n')
        os.rename(tmp_out_fn, out_fn)
        io.LOG('finished ovlp_filter')
    except:
        io.LOG('terminating ovlp_filter workers...')
        exe_pool.terminate()
        raise
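# --- hedged usage sketch (not original code) ----------------------------------
# Illustrative call of the variant above; every path and parameter value here
# is invented for the example. Writing to out_fn + '.tmp' and renaming only
# after run_ovlp_filter() returns means a partially written file never appears
# under the final name.
def example_ovlp_filter_call():
    try_run_ovlp_filter(
        out_fn='preads.ovl',   # hypothetical output path
        n_core=4,
        fofn='las.fofn',       # hypothetical file-of-filenames listing .las files
        max_diff=100, max_cov=100, min_cov=2,
        min_len=4000, bestn=10,
        db_fn='preads.db')     # hypothetical DAZZ_DB path
# ------------------------------------------------------------------------------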
def try_run_track_reads(n_core, phased_read_file, read_to_contig_map, rawread_ids, min_len, bestn, output):
    io.LOG('starting track_reads')
    rawread_dir = os.path.abspath('0-rawreads')
    # better logic for finding the las files path or move the logic to extern (taking the --fofn option?)
    file_list = glob.glob(os.path.join(rawread_dir, 'm*/raw_reads.*.las'))  # TODO: More inputs
    io.LOG('file list: %r' % file_list)
    db_fn = os.path.join(rawread_dir, 'raw_reads.db')  # TODO: Another input
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_track_reads(exe_pool, phased_read_file, read_to_contig_map,
                        rawread_ids, file_list, min_len, bestn, db_fn, output)
        io.LOG('finished track_reads')
    except:
        io.LOG('terminating track_reads workers...')
        exe_pool.terminate()
        raise
def main(*argv):
    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
    parser.add_argument('--n_core', type=int, default=4,
                        help='number of processes used for generating consensus; '
                             '0 for main process only (default=%(default)s)')
    parser.add_argument('--fofn', type=str,
                        help='file containing the paths of all LAS files to be processed in parallel')
    parser.add_argument('--min_len', type=int, default=2500,
                        help="min length of the reads")
    args = parser.parse_args(argv)
    exe_pool = Pool(args.n_core)
    file_list = open(args.fofn).read().split("\n")
    #print "all", len(contained)
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((fn, args.min_len))
    for res in exe_pool.imap(run_filter_stats, inputs):
        for l in res[1]:
            print " ".join([str(c) for c in l])
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    io.LOG('starting track_reads')
    rawread_dir = os.path.abspath(os.path.join(base_dir, "0-rawreads"))
    # better logic for finding the las files path or move the logic to extern (taking the --fofn option?)
    file_list = glob.glob(os.path.join(rawread_dir, "m*/raw_reads.*.las"))
    io.LOG('file list: %r' % file_list)
    # same, should we decide this as a parameter
    db_fn = os.path.join(rawread_dir, "raw_reads.db")
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
        io.LOG('finished track_reads')
    except:
        io.LOG('terminating track_reads workers...')
        exe_pool.terminate()
        raise
def fc_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn, debug, silent):
    global LOG
    if silent:
        LOG = write_nothing
    exe_pool = Pool(n_core)
    file_list = open(fofn).read().split("\n")

    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len))
    ignore_all = []
    for res in exe_pool.imap(filter_stage1, inputs):
        ignore_all.extend(res[1])

    inputs = []
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all))
    contained = set()
    for res in exe_pool.imap(filter_stage2, inputs):
        contained.update(res[1])
        #print res[0], len(res[1]), len(contained)
    #print "all", len(contained)

    inputs = []
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all, contained, bestn))
    for res in exe_pool.imap(filter_stage3, inputs):
        for l in res[1]:
            print " ".join(l)
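# --- hedged usage sketch (not original code) ----------------------------------
# fc_ovlp_filter() above makes three imap passes over the same LAS file list:
# stage 1 collects read ids to ignore, stage 2 collects contained reads, and
# stage 3 prints the surviving overlap lines. An illustrative call; every value
# below is invented for the example:
def example_fc_ovlp_filter_call():
    fc_ovlp_filter(
        n_core=4,
        fofn='las.fofn',        # hypothetical list of .las paths, one per line
        max_diff=100, max_cov=100, min_cov=2, min_len=4000,
        bestn=10,
        db_fn='raw_reads.db',   # hypothetical DAZZ_DB path
        debug=False, silent=False)
# ------------------------------------------------------------------------------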
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                             '0 for main process only')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0,  # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument('--min_len_aln', type=int, default=0,  # 0 to emulate previous behavior
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='1 + minimum number of reads used in generating the consensus; a seed read with fewer alignments will be completely ignored')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='1 + maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming, if the unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])

    def Start():
        print >>sys.stderr, 'Started a worker in %d from parent %d' % (os.getpid(), os.getppid())
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim
    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue
        if args.output_full:
            print ">" + seed_id + "_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
                    print format_seq(cns_seq, 80)
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print ">" + seed_id
                print cns[-1]
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(
        description='a simple multi-processor consensus sequence generator')
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                             '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window', type=int, default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument('--local_match_count_threshold', type=int, default=6,
                        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0,  # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument('--min_len_aln', type=int, default=0,  # 0 to emulate previous behavior
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='minimum number of reads used in generating the consensus; a seed read with fewer alignments will be completely ignored (obsoleted, formerly called `--min_cov_aln\')')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions; implies --output_dformat, unless --output_simple_fasta_header')
    parser.add_argument('--output_dformat', action="store_true", default=True,
                        help='output daligner-compatible headers; only works with --output_multi; DEPRECATED and ignored, as this is the default now')
    parser.add_argument('--output_simple_fasta_header', action='store_true', default=False,
                        help='Turn off --output_dformat. This was for older (pre spring 2015) DALIGNER. Never needed now.')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming, if the unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])

    def Start():
        print >>sys.stderr, 'Started a worker in %d from parent %d' % (os.getpid(), os.getppid())
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim
    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue
        if args.output_full:
            print ">" + seed_id + "_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if not args.output_simple_fasta_header:
                        if seq_i >= 10:
                            break
                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
                        print format_seq(cns_seq, 80)
                    else:
                        print ">" + seed_id + "_%d" % seq_i
                        print cns_seq
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print ">" + seed_id
                print cns[-1]
def main(*argv):
    parser = argparse.ArgumentParser(
        description='a simple multi-processor consensus sequence generator')
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                             '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window', type=int, default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument('--local_match_count_threshold', type=int, default=6,
                        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; an alignment with fewer reads will be completely ignored')
    parser.add_argument('--min_len_aln', type=int, default=100,
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions')
    parser.add_argument('--output_dformat', action="store_true", default=False,
                        help='output daligner-compatible headers; only works with --output_multi')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming, if the unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    exe_pool = Pool(args.n_core)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim
    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_cov_aln, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue
        if args.output_full == True:
            print ">" + seed_id + "_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi == True:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if args.output_dformat:
                        if seq_i >= 10:
                            break
                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
                        print format_seq(cns_seq, 80)
                    else:
                        print ">" + seed_id + "_%d" % seq_i
                        print cns_seq
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print ">" + seed_id
                print cns[-1]
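# --- hedged entry-point sketch (not original code) -----------------------------
# The main(*argv) variant directly above slices argv[1:] itself, so a console
# entry point would forward sys.argv unpacked; the main(argv=sys.argv) variants
# earlier take the list as a single (default) argument instead.
if __name__ == '__main__':
    import sys
    main(*sys.argv)
# -------------------------------------------------------------------------------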