Example #1
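These examples appear to be successive revisions of the same consensus driver from PacBio's falcon_kit; each excerpt omits its module-level context. A minimal sketch of the imports and names they assume (the run_func body is a guess based on the (function, datum) tuples built in Examples #1 and #2; the real io.run_func may differ):

import argparse
import logging
import multiprocessing
import os
import re
import sys
from multiprocessing import Pool

LOG = logging.getLogger(__name__)

def run_func(work_item):
    # Hypothetical stand-in for io.run_func: unpack a (function, datum)
    # work item, as built in the inputs list below, and apply it.
    func, datum = work_item
    return func(datum)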
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, \
        args.trim_size, args.min_cov_aln, args.max_cov_aln, \
        args.allow_external_mapping
    # TODO: pass config object, not tuple, so we can add fields
    inputs = []
    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
        inputs.append((get_consensus, datum))
    try:
        LOG.info('running {!r}'.format(get_consensus))
        for res in exe_pool.imap(io.run_func, inputs):
            process_get_consensus_result(res, args)
        LOG.info('finished {!r}'.format(get_consensus))
    except:
        LOG.exception('failed gen_consensus')
        exe_pool.terminate()
        raise
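The TODO above asks for a config object in place of the bare positional tuple. A minimal sketch of one way to do that with a namedtuple; the Config name and the literal values (taken from the argparse defaults in the later examples) are illustrative, not part of the source:

from collections import namedtuple

# Hypothetical named version of the positional config tuple from Example #1.
Config = namedtuple('Config', [
    'min_cov', 'K', 'max_n_read', 'min_idt', 'edge_tolerance',
    'trim_size', 'min_cov_aln', 'max_cov_aln', 'allow_external_mapping'])

config = Config(min_cov=6, K=8, max_n_read=500, min_idt=0.70,
                edge_tolerance=1000, trim_size=50, min_cov_aln=10,
                max_cov_aln=0, allow_external_mapping=False)

Because a namedtuple is still a tuple, existing positional consumers such as get_seq_data keep working unchanged, while new code can read fields by name instead of by index.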
Example #2
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    inputs = []
    for datum in get_seq_data(config, args.min_n_read, args.min_len_aln):
        inputs.append((get_consensus, datum))
    try:
        LOG.info('running {!r}'.format(get_consensus))
        for res in exe_pool.imap(io.run_func, inputs):
            process_get_consensus_result(res, args)
        LOG.info('finished {!r}'.format(get_consensus))
    except:
        LOG.exception('failed gen_consensus')
        exe_pool.terminate()
        raise
Example #3
def run(args):
    logging.basicConfig(level=int(round(10 * args.verbose_level)))

    good_region = re.compile("[ACGT]+")

    assert args.n_core <= multiprocessing.cpu_count(
    ), 'Requested n_core={} > cpu_count={}'.format(args.n_core,
                                                   multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))

    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(
            get_consensus,
            get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" %
                          (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print(">" + seed_id)
                print(cns[-1])
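The [ACGT]+ pattern above is what turns one consensus string into zero or more "good" regions: any character outside uppercase ACGT (presumably lowercase bases marking low-coverage stretches, consistent with the min_cov help text in the later examples) acts as a break point. A small self-contained illustration with a made-up sequence:

import re

good_region = re.compile("[ACGT]+")
# Lowercase runs split the string; only the uppercase islands survive.
print(good_region.findall("ACGTacgtTTGGaaCCGA"))  # ['ACGT', 'TTGG', 'CCGA']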
Example #4
def fc_ovlp_filter(n_core, fofn, max_diff, max_cov, min_cov, min_len, bestn,
                   db_fn, debug, silent):
    global LOG
    if silent:
        LOG = write_nothing
    exe_pool = Pool(n_core)

    file_list = open(fofn).read().split("\n")
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len))

    ignore_all = []
    for res in exe_pool.imap(filter_stage1, inputs):
        ignore_all.extend(res[1])

    inputs = []
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append(
                (db_fn, fn, max_diff, max_cov, min_cov, min_len, ignore_all))
    contained = set()
    for res in exe_pool.imap(filter_stage2, inputs):
        contained.update(res[1])
        #print res[0], len(res[1]), len(contained)

    #print "all", len(contained)
    inputs = []
    ignore_all = set(ignore_all)
    for fn in file_list:
        if len(fn) != 0:
            inputs.append((db_fn, fn, max_diff, max_cov, min_cov, min_len,
                           ignore_all, contained, bestn))
    for res in exe_pool.imap(filter_stage3, inputs):
        for l in res[1]:
            print " ".join(l)
def run(args):
    logging.basicConfig(level=int(round(10*args.verbose_level)))

    good_region = re.compile("[ACGT]+")

    assert args.n_core <= multiprocessing.cpu_count(), 'Requested n_core={} > cpu_count={}'.format(
            args.n_core, multiprocessing.cpu_count())

    def Start():
        LOG.info('Started a worker in {} from parent {}'.format(
            os.getpid(), os.getppid()))
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=lambda x: len(x))
                print(">" + seed_id)
                print(cns[-1])
Example #6
def main(*argv):
    parser = argparse.ArgumentParser(description='a simple multi-process LAS overlap data filter')
    parser.add_argument('--n_core', type=int, default=4,
                        help='number of processes used for filtering; '
                        '0 for main process only (default=%(default)s)')
    parser.add_argument('--fofn', type=str, help='file containing the paths of all LAS files to be processed in parallel')
    parser.add_argument('--min_len', type=int, default=2500, help="min length of the reads")
    args = parser.parse_args(argv)
    exe_pool = Pool(args.n_core)

    file_list = open(args.fofn).read().split("\n")
    #print "all", len(contained)
    inputs = []
    for fn in file_list:
        if len(fn) != 0:
            inputs.append( (fn, args.min_len ) )
    for res in exe_pool.imap(run_filter_stats, inputs):
        for l in res[1]:
            print " ".join([str(c) for c in l])
Example #7
def main(*argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                        '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window', type=int, default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument('--local_match_count_threshold', type=int, default=6,
                        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; an alignment with fewer reads will be completely ignored')
    parser.add_argument('--min_len_aln', type=int, default=100,
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions')
    parser.add_argument('--output_dformat', action="store_true", default=False,
                        help='output daligner-compatible headers; only works with --output_multi')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming: if there is an unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    exe_pool = Pool(args.n_core)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_cov_aln, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if args.output_dformat:
                        if seq_i >= 10:
                            break
                        print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                        print(format_seq(cns_seq, 80))
                    else:
                        print(">" + seed_id + "_%d" % seq_i)
                        print(cns_seq)
                    seq_i += 1
            else:
                cns.sort(key=len)
                print(">" + seed_id)
                print(cns[-1])
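The daligner-compatible header built by ">prolog/%s%01d/%d_%d" concatenates the seed id and the split index, then appends the 0-based start and the length. A quick worked example with made-up values:

seed_id, seq_i, cns_seq = "000123", 2, "ACGT" * 200
print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
# -> >prolog/0001232/0_800

Note that the %01d format and the seq_i >= 10 break work together: split indices stay single digits, which keeps the id-plus-index concatenation unambiguous.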
Example #8
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator')
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                        '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window', type=int, default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument('--local_match_count_threshold', type=int, default=6,
                        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' + \
                        ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument('--min_len_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='minimum number of reads used in generating the consensus; a seed read with fewer alignments will '+ \
                        'be completely ignored (obsoleted, formerly called `--min_cov_aln\')')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multiple corrected regions; implies --output_dformat, unless --output_simple_fasta_header')
    parser.add_argument('--output_dformat', action="store_true", default=True,
                        help='output daligner-compatible headers; only works with --output_multi; DEPRECATED and ignored, as this is the default now')
    parser.add_argument('--output_simple_fasta_header', action='store_true', default=False,
                        help='Turn off --output_dformat. This was for older (pre spring 2015) DALIGNER. Never needed now.')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming: if there is an unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    def Start():
        print('Started a worker in %d from parent %d' % (os.getpid(), os.getppid()),
              file=sys.stderr)
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if not args.output_simple_fasta_header:
                        if seq_i >= 10:
                            break
                        print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                        print(format_seq(cns_seq, 80))
                    else:
                        print(">" + seed_id + "_%d" % seq_i)
                        print(cns_seq)
                    seq_i += 1
            else:
                cns.sort(key=len)
                print(">" + seed_id)
                print(cns[-1])
Example #9
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(description='a simple multi-processor consensus sequence generator',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--n_core', type=int, default=24,
                        help='number of processes used for generating consensus; '
                        '0 for main process only')
    parser.add_argument('--min_cov', type=int, default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' + \
                        ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument('--min_len_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored')
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='1 + minimum number of reads used in generating the consensus; a seed read with fewer alignments will '+ \
                        'be completely ignored')
    parser.add_argument('--max_n_read', type=int, default=500,
                        help='1 + maximum number of reads used in generating the consensus')
    parser.add_argument('--trim', action="store_true", default=False,
                        help='trim the input sequence with k-mer sparse dynamic programming to find the mapped range')
    parser.add_argument('--output_full', action="store_true", default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi', action="store_true", default=False,
                        help='output multi correct regions')
    parser.add_argument('--min_idt', type=float, default=0.70,
                        help='minimum identity of the alignments used for correction')
    parser.add_argument('--edge_tolerance', type=int, default=1000,
                        help='for trimming: if there is an unaligned edge length > edge_tolerance, ignore the read')
    parser.add_argument('--trim_size', type=int, default=50,
                        help='the size for trimming both ends from the initial sparse aligned region')
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])
    def Start():
        print('Started a worker in %d from parent %d' % (os.getpid(), os.getppid()),
              file=sys.stderr)
    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(get_consensus, get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=len)
                print(">" + seed_id)
                print(cns[-1])
Example #10
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(
        description='a simple multi-processor consensus sequence generator')
    parser.add_argument(
        '--n_core',
        type=int,
        default=24,
        help='number of processes used for generating consensus; '
        '0 for main process only (default=%(default)s)')
    parser.add_argument('--local_match_count_window',
                        type=int,
                        default=12,
                        help='local match window size (obsoleted, no effect)')
    parser.add_argument(
        '--local_match_count_threshold',
        type=int,
        default=6,
        help='local match count threshold (obsoleted, no effect)')
    parser.add_argument('--min_cov',
                        type=int,
                        default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument('--min_cov_aln', type=int, default=10,
                        help='minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth' + \
                        ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0, # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument(
        '--min_len_aln',
        type=int,
        default=0,  # 0 to emulate previous behavior
        help=
        'minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored'
    )
    parser.add_argument('--min_n_read', type=int, default=10,
                        help='minimum number of reads used in generating the consensus; a seed read with fewer alignments will '+ \
                        'be completely ignored (obsoleted, formerly called `--min_cov_aln\')')
    parser.add_argument(
        '--max_n_read',
        type=int,
        default=500,
        help='maximum number of reads used in generating the consensus')
    parser.add_argument(
        '--trim',
        action="store_true",
        default=False,
        help=
        'trim the input sequence with k-mer sparse dynamic programming to find the mapped range'
    )
    parser.add_argument('--output_full',
                        action="store_true",
                        default=False,
                        help='output uncorrected regions too')
    parser.add_argument(
        '--output_multi',
        action="store_true",
        default=False,
        help=
        'output multiple corrected regions; implies --output_dformat, unless --output_simple_fasta_header'
    )
    parser.add_argument(
        '--output_dformat',
        action="store_true",
        default=True,
        help=
        'output daligner-compatible headers; only works with --output_multi; DEPRECATED and ignored, as this is the default now'
    )
    parser.add_argument(
        '--output_simple_fasta_header',
        action='store_true',
        default=False,
        help=
        'Turn off --output_dformat. This was for older (pre spring 2015) DALIGNER. Never needed now.'
    )
    parser.add_argument(
        '--min_idt',
        type=float,
        default=0.70,
        help='minimum identity of the alignments used for correction')
    parser.add_argument(
        '--edge_tolerance',
        type=int,
        default=1000,
        help=
        'for trimming: if there is an unaligned edge length > edge_tolerance, ignore the read'
    )
    parser.add_argument(
        '--trim_size',
        type=int,
        default=50,
        help='the size for trimming both ends from the initial sparse aligned region'
    )
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])

    def Start():
        print('Started a worker in %d from parent %d' % (os.getpid(), os.getppid()),
              file=sys.stderr)

    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, args.local_match_count_window, args.local_match_count_threshold,\
             args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(
            get_consensus,
            get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if not args.output_simple_fasta_header:
                        if seq_i >= 10:
                            break
                        print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                        print(format_seq(cns_seq, 80))
                    else:
                        print(">" + seed_id + "_%d" % seq_i)
                        print(cns_seq)
                    seq_i += 1
            else:
                cns.sort(key=len)
                print(">" + seed_id)
                print(cns[-1])
Example #11
def main(argv=sys.argv):
    parser = argparse.ArgumentParser(
        description='a simple multi-processor consensus sequence generator',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--n_core',
        type=int,
        default=24,
        help='number of processes used for generating consensus; '
        '0 for main process only')
    parser.add_argument('--min_cov',
                        type=int,
                        default=6,
                        help='minimum coverage to break the consensus')
    parser.add_argument(
        '--min_cov_aln',
        type=int,
        default=10,
        help=
        'minimum coverage of alignment data; a seed read with less than MIN_COV_ALN average depth'
        + ' of coverage will be completely ignored')
    parser.add_argument('--max_cov_aln', type=int, default=0,  # 0 to emulate previous behavior
                        help='maximum coverage of alignment data; a seed read with more than MAX_COV_ALN average depth' + \
                        ' of coverage of the longest alignments will be capped, excess shorter alignments will be ignored')
    parser.add_argument(
        '--min_len_aln',
        type=int,
        default=0,  # 0 to emulate previous behavior
        help=
        'minimum length of a sequence in an alignment to be used in consensus; any shorter sequence will be completely ignored'
    )
    parser.add_argument(
        '--min_n_read',
        type=int,
        default=10,
        help=
        '1 + minimum number of reads used in generating the consensus; a seed read with fewer alignments will '
        + 'be completely ignored')
    parser.add_argument(
        '--max_n_read',
        type=int,
        default=500,
        help='1 + maximum number of reads used in generating the consensus')
    parser.add_argument(
        '--trim',
        action="store_true",
        default=False,
        help=
        'trim the input sequence with k-mer sparse dynamic programming to find the mapped range'
    )
    parser.add_argument('--output_full',
                        action="store_true",
                        default=False,
                        help='output uncorrected regions too')
    parser.add_argument('--output_multi',
                        action="store_true",
                        default=False,
                        help='output multi correct regions')
    parser.add_argument(
        '--min_idt',
        type=float,
        default=0.70,
        help='minimum identity of the alignments used for correction')
    parser.add_argument(
        '--edge_tolerance',
        type=int,
        default=1000,
        help=
        'for trimming: if there is an unaligned edge length > edge_tolerance, ignore the read'
    )
    parser.add_argument(
        '--trim_size',
        type=int,
        default=50,
        help='the size for trimming both ends from the initial sparse aligned region'
    )
    good_region = re.compile("[ACGT]+")
    args = parser.parse_args(argv[1:])

    def Start():
        print('Started a worker in %d from parent %d' % (os.getpid(), os.getppid()),
              file=sys.stderr)

    exe_pool = Pool(args.n_core, initializer=Start)
    if args.trim:
        get_consensus = get_consensus_with_trim
    else:
        get_consensus = get_consensus_without_trim

    K = 8
    config = args.min_cov, K, \
        args.max_n_read, args.min_idt, args.edge_tolerance, args.trim_size, args.min_cov_aln, args.max_cov_aln
    # TODO: pass config object, not tuple, so we can add fields
    for res in exe_pool.imap(
            get_consensus,
            get_seq_data(config, args.min_n_read, args.min_len_aln)):
        cns, seed_id = res
        if len(cns) < 500:
            continue

        if args.output_full:
            print(">" + seed_id + "_f")
            print(cns)
        else:
            cns = good_region.findall(cns)
            if len(cns) == 0:
                continue
            if args.output_multi:
                seq_i = 0
                for cns_seq in cns:
                    if len(cns_seq) < 500:
                        continue
                    if seq_i >= 10:
                        break
                    print(">prolog/%s%01d/%d_%d" % (seed_id, seq_i, 0, len(cns_seq)))
                    print(format_seq(cns_seq, 80))
                    seq_i += 1
            else:
                cns.sort(key=len)
                print(">" + seed_id)
                print(cns[-1])