Example #1
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, \
        args.trim_size, args.min_cov_aln, args.max_cov_aln, \
        args.allow_external_mapping
    # TODO: pass config object, not tuple, so we can add fields
    inputs = []
    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
        inputs.append((get_consensus, datum))
    try:
        LOG.info('running {!r}'.format(get_consensus))
        for res in exe_pool.imap(io.run_func, inputs):
            process_get_consensus_result(res, args)
        LOG.info('finished {!r}'.format(get_consensus))
    except:
        LOG.exception('failed gen_consensus')
        exe_pool.terminate()
        raise
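
Examples #1 and #2 submit (function, datum) tuples and dispatch them through io.run_func, whose implementation lives in the io module and is not shown here. A minimal sketch of the assumed contract (the first element of the tuple is applied to the second) would be:

# Sketch only; the real io.run_func may differ. Assumed contract: a pool
# worker receives a (func, datum) tuple and applies func to datum, so one
# picklable top-level function can stand in for any of the consensus functions.
def run_func(args):
    func, datum = args
    return func(datum)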
Example #2
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    inputs = []
    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
        inputs.append((get_consensus, datum))
    try:
        LOG.info('running {!r}'.format(get_consensus))
        for res in exe_pool.imap(io.run_func, inputs):
            process_get_consensus_result(res, args)
        LOG.info('finished {!r}'.format(get_consensus))
    except:
        LOG.exception('failed gen_consensus')
        exe_pool.terminate()
        raise
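
The TODO above asks for a config object instead of a bare tuple. A hedged sketch using a namedtuple keeps positional unpacking working while allowing named fields; the field names mirror the tuple order above and are otherwise an assumption:

import collections

# Sketch: a namedtuple unpacks positionally like the existing tuple, so code
# doing `min_cov, K, ... = config` keeps working, while new fields can later
# be appended with defaults.
ConsensusConfig = collections.namedtuple('ConsensusConfig', [
    'min_cov', 'K', 'max_n_read', 'min_idt', 'edge_tolerance',
    'trim_size', 'min_cov_aln', 'max_cov_aln'])

config = ConsensusConfig(
    min_cov=args.min_cov, K=8, max_n_read=args.max_n_read,
    min_idt=args.min_idt, edge_tolerance=args.edge_tolerance,
    trim_size=args.trim_size, min_cov_aln=args.min_cov_aln,
    max_cov_aln=args.max_cov_aln)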
Example #3
def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    io.LOG("starting ovlp_filter")
    file_list = io.validated_fns(fofn)
    io.LOG("fofn %r: %r" % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
        io.LOG("finished ovlp_filter")
    except KeyboardInterrupt:
        io.LOG("terminating ovlp_filter workers...")
        exe_pool.terminate()
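
Note that Example #3 catches only KeyboardInterrupt and does not re-raise, so Ctrl-C is swallowed and any other exception leaves the pool running. The later examples use a broader terminate-and-reraise pattern; a generic sketch (try_run and job are hypothetical names):

from multiprocessing import Pool

# Sketch of the terminate-and-reraise cleanup used by the later examples:
# any exception (including KeyboardInterrupt) tears the workers down, and
# the re-raise keeps the failure visible to the caller.
def try_run(job, n_core, *args):
    exe_pool = Pool(n_core)
    try:
        job(exe_pool, *args)
        exe_pool.close()      # normal shutdown: no more tasks
    except:
        exe_pool.terminate()  # kill workers immediately on any failure
        raise
    finally:
        exe_pool.join()       # reap worker processes either way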
Example #4
def try_run_ovlp_stats(n_core, fofn, min_len):
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
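
io.validated_fns is not shown in these excerpts; presumably it reads the file-of-filenames (FOFN) and sanity-checks each path. A hypothetical stand-in under that assumption:

import os

# Hypothetical stand-in for io.validated_fns: read a FOFN, skip blank lines,
# and fail fast on missing files. The real helper may differ.
def validated_fns(fofn):
    fns = []
    with open(fofn) as f:
        for line in f:
            fn = line.strip()
            if not fn:
                continue
            assert os.path.isfile(fn), 'missing file: {!r}'.format(fn)
            fns.append(fn)
    return fns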
Example #5
def try_run_ovlp_stats(n_core, fofn, min_len):
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' %(fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
Example #6
def run(args):
    logging.basicConfig(level=int(round(10 * args.verbose_level)))

    good_region = re.compile("[ACGT]+")

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))

    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(
            get_consensus,
            get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" %
                          (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print(">" + seed_id)
                print(cns[-1])
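
In the loop above, good_region keeps only uninterrupted runs of called bases (A/C/G/T), and format_seq (defined elsewhere) presumably wraps a sequence into fixed-width FASTA lines. A sketch of that wrapping, assuming simple fixed-width behavior:

# Hypothetical format_seq: wrap a sequence into fixed-width lines for FASTA
# output, e.g. format_seq(cns_seq, 80) as used above. The real helper may differ.
def format_seq(seq, width):
    return '\n'.join(seq[i:i + width] for i in range(0, len(seq), width))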
Example #7
def try_run_ovlp_stats(n_core, db_fn, fofn, min_len):
    io.LOG('starting ovlp_stats')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn {!r}: {}'.format(fofn, file_list))
    io.LOG('db {!r}; n_core={}'.format(db_fn, n_core))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_stats(exe_pool, db_fn, file_list, min_len)
        io.LOG('finished ovlp_stats')
    except KeyboardInterrupt:
        io.LOG('terminating ovlp_stats workers...')
        exe_pool.terminate()
Example #8
def try_run_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    io.LOG('starting ovlp_filter')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_ovlp_filter(exe_pool, file_list, max_diff, max_cov, min_cov, min_len, bestn, db_fn)
        io.LOG('finished ovlp_filter')
    except:
        io.LOG('terminating ovlp_filter workers...')
        exe_pool.terminate()
        raise
Example #9
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    io.LOG('starting track_reads')
    pread_dir = os.path.abspath(os.path.join(base_dir, "1-preads_ovl"))
    file_list = glob.glob(os.path.join(pread_dir, "m*/preads.*.las"))
    io.LOG('file list: %r' % file_list)
    db_fn = os.path.join(pread_dir, "preads.db")
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
        io.LOG('finished track_reads')
    except:
        io.LOG('terminating track_reads workers...')
        exe_pool.terminate()
        raise
Example #10
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))

    good_region = re.compile("[ACGT]+")

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print(">" + seed_id)
                print(cns[-1])
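
In the single-output branch above, sorting the whole list just to print the last element can be replaced by max with a key; for a non-empty list the output is identical and the list is not mutated:

# Equivalent to `cns.sort(key=lambda x: len(x)); print(cns[-1])` for a
# non-empty list, without the O(n log n) sort.
print(">" + seed_id)
print(max(cns, key=len))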
Example #11
def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov, min_len, min_idt, ignore_indels, bestn, db_fn):
    io.LOG('starting ovlp_filter')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    tmp_out_fn = out_fn + '.tmp'
    try:
        with open(tmp_out_fn, 'w') as outs:
            run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov,
                            min_cov, min_len, min_idt, ignore_indels, bestn, db_fn)
        os.rename(tmp_out_fn, out_fn)
        io.LOG('finished ovlp_filter')
    except:
        io.LOG('terminating ovlp_filter workers...')
        exe_pool.terminate()
        raise
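
This example (and the next) writes to out_fn + '.tmp' and renames only on success, so an interrupted run never leaves a truncated out_fn behind. The pattern in isolation, with a hypothetical atomic_write helper:

import os

# Sketch of the write-to-temp-then-rename pattern: os.rename is atomic on
# POSIX within one filesystem, so readers never observe a half-written file.
def atomic_write(out_fn, write_body):
    tmp_out_fn = out_fn + '.tmp'
    with open(tmp_out_fn, 'w') as outs:
        write_body(outs)
    os.rename(tmp_out_fn, out_fn)  # only reached if write_body succeeded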
Example #12
def try_run_ovlp_filter(out_fn, n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn, db_fn):
    io.LOG('starting ovlp_filter')
    file_list = io.validated_fns(fofn)
    io.LOG('fofn %r: %r' % (fofn, file_list))
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    tmp_out_fn = out_fn + '.tmp'
    try:
        with open(tmp_out_fn, 'w') as outs:
            run_ovlp_filter(outs, exe_pool, file_list, max_diff, max_cov,
                            min_cov, min_len, bestn, db_fn)
            outs.write('---\n')
        os.rename(tmp_out_fn, out_fn)
        io.LOG('finished ovlp_filter')
    except:
        io.LOG('terminating ovlp_filter workers...')
        exe_pool.terminate()
        raise
Example #13
def try_run_track_reads(n_core, phased_read_file, read_to_contig_map, rawread_ids, min_len, bestn, output):
    io.LOG('starting track_reads')

    rawread_dir = os.path.abspath('0-rawreads')

    # TODO: better logic for finding the las files path, or move the logic to extern (taking the --fofn option?)
    file_list = glob.glob(os.path.join(rawread_dir, 'm*/raw_reads.*.las'))  # TODO: More inputs
    io.LOG('file list: %r' % file_list)

    db_fn = os.path.join(rawread_dir, 'raw_reads.db') # TODO: Another input
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_track_reads(exe_pool, phased_read_file, read_to_contig_map, rawread_ids, file_list, min_len, bestn, db_fn, output)
        io.LOG('finished track_reads')
    except:
        io.LOG('terminating track_reads workers...')
        exe_pool.terminate()
        raise
Example #14
def main(*argv):
    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
    parser.add_argument('--n_core', type=int, default=4,
                        help='number of processes used for generating consensus; '
                        '0 for main process only (default=%(default)s)')
    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
    parser.add_argument('--min_len', type=int, default=2500, help="minimum length of the reads")
    args = parser.parse_args(argv)
    exe_pool = Pool(args.n_core)

    file_list = open(args.fofn).read().split("\n")
    #print "all", len(contained)
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((fn, args.min_len))
    for res in exe_pool.imap(run_filter_stats, inputs):
        for l in res[1]:
            print " ".join([str(c) for c in l])
Example #15
def try_run_track_reads(n_core, base_dir, min_len, bestn):
    io.LOG('starting track_reads')

    rawread_dir = os.path.abspath(os.path.join(base_dir, "0-rawreads"))

    # TODO: better logic for finding the las files path, or move the logic to extern (taking the --fofn option?)
    file_list = glob.glob(os.path.join(rawread_dir, "m*/raw_reads.*.las"))
    io.LOG('file list: %r' % file_list)

    # same; should this be a parameter?
    db_fn = os.path.join(rawread_dir, "raw_reads.db")
    n_core = min(n_core, len(file_list))
    exe_pool = Pool(n_core)
    try:
        run_track_reads(exe_pool, base_dir, file_list, min_len, bestn, db_fn)
        io.LOG('finished track_reads')
    except:
        io.LOG('terminating track_reads workers...')
        exe_pool.terminate()
        raise
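
The comment above asks whether the .las discovery should take a --fofn option instead of the hard-coded glob. A hypothetical helper along those lines, preferring an explicit FOFN and falling back to the glob:

import glob
import os

# Hypothetical helper per the TODO above: use an explicit FOFN when given,
# otherwise fall back to the hard-coded glob.
def find_las_files(rawread_dir, fofn=None):
    if fofn:
        with open(fofn) as f:
            return [line.strip() for line in f if line.strip()]
    return glob.glob(os.path.join(rawread_dir, "m*/raw_reads.*.las"))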
Example #16
def fc_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn,
                   db_fn, debug, silent):
    global LOG
    if silent:
        LOG = write_nothing
    exe_pool = Pool(n_core)

    file_list = open(fofn).read().split("\n")
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len))

    ignore_all = []
    for res in exe_pool.imap(filter_stage1, inputs):
        ignore_all.extend(res[1])

    inputs = []
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append(
                (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all))
    contained = set()
    for res in exe_pool.imap(filter_stage2, inputs):
        contained.update(res[1])
        #print res[0], len(res[1]), len(contained)

    #print "all", len(contained)
    inputs = []
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len,
                           ignore_all, contained, bestn))
    for res in exe_pool.imap(filter_stage3, inputs):
        for l in res[1]:
            print " ".join(l)
Example #17
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                        '0 for main process only')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' + \
                        ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument('--min_len_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='1 + minimum number of reads used in generating the consensus; a seed read with fewer alignments will '+ \
                        'be completely ignored')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='1 + maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming, if there is an unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    def Start():
        print>>sys.stderr, 'Started a worker in %d from parent %d' %(os.getpid(), os.getppid())
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue


        if args.output_full:
            print ">"+seed_id+"_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
                    print format_seq(cns_seq, 80)
                    seq_i += 1
            else:
                cns.sort(key = lambda x: len(x))
                print ">"+seed_id
                print cns[-1]
Example #18
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(
        description='a simple multi-processor consensus sequence generator',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--n_core',
        type=int,
        default=24,
        help='number of processes used for generating consensus; '
        '0 for main process only')
    parser.add_argument('--min_cov',
                        type=int,
                        default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument(
        '--min_cov_aln',
        type=int,
        default=10,
        help=
        'minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth'
        + ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0,  # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument(
        '--min_len_aln',
        type=int,
        default=0,  # 0 to emulate previous behavior
        help=
        'minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored'
    )
    parser.add_argument(
        '--min_n_read',
        type=int,
        default=10,
        help=
        '1 + minimum number of reads used in generating the consensus; a seed read with fewer alignments will '
        + 'be completely ignored')
    parser.add_argument(
        '--max_n_read',
        type=int,
        default=500,
        help='1 + maximum number of reads used in generating the consensus')
    parser.add_argument(
        '--trim',
        action="store_true",
        default=False,
        help=
        'trim the input sequence with k-mer sparse dynamic programming to find the mapped range'
    )
    parser.add_argument('--output_full',
                        action="store_true",
                        default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi',
                        action="store_true",
                        default=False,
                        help='output multiple corrected regions')
    parser.add_argument(
        '--min_idt',
        type=float,
        default=0.70,
        help='minimum identity of the alignments used for correction')
    parser.add_argument(
        '--edge_tolerance',
        type=int,
        default=1000,
        help=
        'for trimming, if there is an unaligned edge length > edge_tolerance, ignore the read'
    )
    parser.add_argument(
        '--trim_size',
        type=int,
        default=50,
        help='the size for trimming both ends from the initial sparse aligned region'
    )
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])

    def Start():
        print >> sys.stderr, 'Started a worker in %d from parent %d' % (
            os.getpid(), os.getppid())

    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(
            get_consensus,
            get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print ">" + seed_id + "_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0,
                                                    len(cns_seq))
                    print format_seq(cns_seq, 80)
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print ">" + seed_id
                print cns[-1]
Example #19
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(
        description='a simple multi-processor consensus sequence generator')
    parser.add_argument(
        '--n_core',
        type=int,
        default=24,
        help='number of processes used for generating consensus; '
        '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window',
                        type=int,
                        default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument(
        '--local_match_count_threshold',
        type=int,
        default=6,
        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov',
                        type=int,
                        default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' + \
                        ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument(
        '--min_len_aln',
        type=int,
        default=0,  # 0 to emulate previous behavior
        help=
        'minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored'
    )
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='minimum number of reads used in generating the consensus; a seed read with fewer alignments will '+ \
                        'be completely ignored (obsoleted, formerly called `--min_cov_aln\')')
    parser.add_argument(
        '--max_n_read',
        type=int,
        default=500,
        help='maximum number of reads used in generating the consensus')
    parser.add_argument(
        '--trim',
        action="store_true",
        default=False,
        help=
        'trim the input sequence with k-mer sparse dynamic programming to find the mapped range'
    )
    parser.add_argument('--output_full',
                        action="store_true",
                        default=False,
                        help='output uncorrected regions too')
    parser.add_argument(
        '--output_multi',
        action="store_true",
        default=False,
        help=
        'output multiple corrected regions; implies --output_dformat, unless --output_simple_fasta_header'
    )
    parser.add_argument(
        '--output_dformat',
        action="store_true",
        default=True,
        help=
        'output daligner-compatible header; only works with --output_multi; DEPRECATED and ignored, as this is the default now'
    )
    parser.add_argument(
        '--output_simple_fasta_header',
        action='store_true',
        default=False,
        help=
        'Turn off --output_dformat. This was for older (pre spring 2015) DALIGNER. Never needed now.'
    )
    parser.add_argument(
        '--min_idt',
        type=float,
        default=0.70,
        help='minimum identity of the alignments used for correction')
    parser.add_argument(
        '--edge_tolerance',
        type=int,
        default=1000,
        help=
        'for trimming, if there is an unaligned edge length > edge_tolerance, ignore the read'
    )
    parser.add_argument(
        '--trim_size',
        type=int,
        default=50,
        help='the size for trimming both ends from the initial sparse aligned region'
    )
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])

    def Start():
        print >> sys.stderr, 'Started a worker in %d from parent %d' % (
            os.getpid(), os.getppid())

    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(
            get_consensus,
            get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print ">" + seed_id + "_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if not args.output_simple_fasta_header:
                        if seq_i >= 10:
                            break
                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0,
                                                        len(cns_seq))
                        print format_seq(cns_seq, 80)
                    else:
                        print ">" + seed_id + "_%d" % seq_i
                        print cns_seq
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print ">" + seed_id
                print cns[-1]
Example #20
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                        '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window', type=int, default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument('--local_match_count_threshold', type=int, default=6,
                        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' + \
                        ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument('--min_len_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='minimum number of reads used in generating the consensus; a seed read with fewer alignments will '+ \
                        'be completely ignored (obsoleted, formerly called `--min_cov_aln\')')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions; implies --output_dformat, unless --output_simple_fasta_header')
    parser.add_argument('--output_dformat', action="store_true", default=True,
                        help='output daligner-compatible header; only works with --output_multi; DEPRECATED and ignored, as this is the default now')
    parser.add_argument('--output_simple_fasta_header', action='store_true', default=False,
                        help='Turn off --output_dformat. This was for older (pre spring 2015) DALIGNER. Never needed now.')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming, if there is an unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    def Start():
        print>>sys.stderr, 'Started a worker in %d from parent %d' %(os.getpid(), os.getppid())
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue


        if args.output_full:
            print ">"+seed_id+"_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if not args.output_simple_fasta_header:
                        if seq_i >= 10:
                            break
                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
                        print format_seq(cns_seq, 80)
                    else:
                        print ">"+seed_id+"_%d" % seq_i
                        print cns_seq
                    seq_i += 1
            else:
                cns.sort(key = lambda x: len(x))
                print ">"+seed_id
                print cns[-1]
Example #21
def main(*argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                        '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window', type=int, default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument('--local_match_count_threshold', type=int, default=6,
                        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; an alignment with fewer reads will be completely ignored')
    parser.add_argument('--min_len_aln', type=int, default=100,
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions')
    parser.add_argument('--output_dformat', action="store_true", default=False,
                        help='output daligner-compatible header; only works with --output_multi')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming, if there is an unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    exe_pool = Pool(args.n_core)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_cov_aln, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue


        if args.output_full:
            print ">"+seed_id+"_f"
            print cns
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if args.output_dformat:
                        if seq_i >= 10:
                            break
                        print ">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq))
                        print format_seq(cns_seq, 80)
                    else:
                        print ">"+seed_id+"_%d" % seq_i
                        print cns_seq
                    seq_i += 1
            else:
                cns.sort(key = lambda x: len(x))
                print ">"+seed_id
                print cns[-1]