Exemple #1
0
def add_batch(batch_index, pCS, orphans, fasta_d, cpus, dun_use_partial):
    """
    Fold batch <i> into the current pre-cluster set and orphan set.

    1. align batch<i>.fasta against seed<i>.S.fasta, process -> write remains to batch<i>.remains.fasta
    2. align batch<i>.remains.fasta against seed<i>.orphans.fasta -> write remains to batch<i>.remains2.fasta
    3. self align batch<i>.remains2.fasta -> combine remains+orphans to new orphans
    4. write out seed<i+1>.S.fasta and seed<i+1>.orphans.fasta

    All file names are relative to the current working directory.

    :param batch_index: the <i> used to build the batch/seed file names
    :param pCS: current pre-cluster set; must expose ``.S`` and ``.seq_stat``
                (both are read for the progress message)
    :param orphans: current orphan collection; must support ``len()`` and
                    ``.union()`` (presumably a set of sequence ids -- confirm)
    :param fasta_d: handed unchanged to the ``FileIO`` writers (presumably
                    the sequence lookup they write from -- confirm)
    :param cpus: forwarded to ``ar.run_minimap``
    :param dun_use_partial: forwarded to every ``sp.process_*`` call
    :return: tuple ``(pCS, orphans)`` after this batch has been processed
    """
    def report(cur_pCS, cur_orphans, cur_remains):
        # One progress line per stage; "tucked" counts the seq_stat entries
        # whose value is 'T'.
        tucked = sum(v == 'T' for v in cur_pCS.seq_stat.values())
        print("pCS: {0}, tucked: {1}, orphans: {2}, remains: {3}".format(
            len(cur_pCS.S), tucked, len(cur_orphans), len(cur_remains)),
            file=sys.stderr)

    # --- step 1: batch vs. current seed S ---
    cur_file = "batch{0}.fasta".format(batch_index)
    # Use a context manager so the FASTA handle is closed as soon as the ids
    # have been collected (the bare open() used before was never closed).
    with open(cur_file) as handle:
        seqids = {r.id for r in SeqIO.parse(handle, 'fasta')}
    o = ar.run_minimap(cur_file,
                       "seed{0}.S.fasta".format(batch_index),
                       cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, remains = sp.process_align_to_pCS(o,
                                           seqids,
                                           pCS,
                                           MiniReader,
                                           dun_use_partial=dun_use_partial)
    report(pCS, orphans, remains)

    # --- step 2: write batch<i>.remains.fasta, align vs. current orphans ---
    cur_file = "batch{0}.remains.fasta".format(batch_index)
    FileIO.write_seqids_to_fasta(remains, cur_file, fasta_d)
    o = ar.run_minimap(cur_file,
                       "seed{0}.orphans.fasta".format(batch_index),
                       cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, orphans, remains = sp.process_align_to_orphan(
        o, remains, orphans, pCS, MiniReader, dun_use_partial=dun_use_partial)
    report(pCS, orphans, remains)

    # --- step 3: write batch<i>.remains2.fasta and self align ---
    cur_file = "batch{0}.remains2.fasta".format(batch_index)
    FileIO.write_seqids_to_fasta(remains, cur_file, fasta_d)
    o = ar.run_minimap(cur_file, cur_file, cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, remains = sp.process_self_align_into_seed(
        o, remains, MiniReader, pCS, dun_use_partial=dun_use_partial)
    report(pCS, orphans, remains)

    # --- step 4: combine remains+orphans to new orphans, write next seeds ---
    orphans = orphans.union(remains)
    FileIO.write_preClusterSet_to_fasta(
        pCS, "seed{0}.S.fasta".format(batch_index + 1), fasta_d)
    FileIO.write_seqids_to_fasta(
        orphans, "seed{0}.orphans.fasta".format(batch_index + 1), fasta_d)

    return pCS, orphans
def add_batch(batch_index, pCS, orphans, fasta_d, cpus, dun_use_partial):
    """
    Process batch <i> against the current seeds and write the seeds for <i+1>.

    1. align batch<i>.fasta against seed<i>.S.fasta, process -> write remains to batch<i>.remains.fasta
    2. align batch<i>.remains.fasta against seed<i>.orphans.fasta -> write remains to batch<i>.remains2.fasta
    3. self align batch<i>.remains2.fasta -> combine remains+orphans to new orphans
    4. write out seed<i+1>.S.fasta and seed<i+1>.orphans.fasta

    Progress is reported on stderr after each alignment stage.

    :param batch_index: integer <i> used in the batch/seed file names
    :param pCS: pre-cluster set so far; ``.S`` and ``.seq_stat`` are read here
    :param orphans: orphan collection so far; needs ``len()`` and ``.union()``
                    (presumably a set of sequence ids -- confirm with caller)
    :param fasta_d: passed through to the ``FileIO`` FASTA writers
    :param cpus: passed through to ``ar.run_minimap``
    :param dun_use_partial: passed through to the ``sp.process_*`` functions
    :return: ``(pCS, orphans)`` updated with this batch
    """
    status_fmt = "pCS: {0}, tucked: {1}, orphans: {2}, remains: {3}"
    seed_S_file = "seed{0}.S.fasta".format(batch_index)
    seed_orphans_file = "seed{0}.orphans.fasta".format(batch_index)

    # 1. batch<i>.fasta vs. seed<i>.S.fasta
    cur_file = "batch{0}.fasta".format(batch_index)
    # Close the FASTA handle deterministically instead of leaking it.
    with open(cur_file) as handle:
        seqids = {r.id for r in SeqIO.parse(handle, 'fasta')}
    o = ar.run_minimap(cur_file, seed_S_file, cpus=cpus)
    # print() with file= (not the Python 2 "print >>" statement, which is a
    # runtime TypeError under the print function).
    print("processing", o, file=sys.stderr)
    pCS, remains = sp.process_align_to_pCS(
        o, seqids, pCS, MiniReader, dun_use_partial=dun_use_partial)
    # .values() rather than .itervalues(): the latter does not exist on
    # Python 3 dicts.
    tucked = sum(v == 'T' for v in pCS.seq_stat.values())
    print(status_fmt.format(len(pCS.S), tucked, len(orphans), len(remains)),
          file=sys.stderr)

    # 2. write batch<i>.remains.fasta, align vs. seed<i>.orphans.fasta
    cur_file = "batch{0}.remains.fasta".format(batch_index)
    FileIO.write_seqids_to_fasta(remains, cur_file, fasta_d)
    o = ar.run_minimap(cur_file, seed_orphans_file, cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, orphans, remains = sp.process_align_to_orphan(
        o, remains, orphans, pCS, MiniReader, dun_use_partial=dun_use_partial)
    tucked = sum(v == 'T' for v in pCS.seq_stat.values())
    print(status_fmt.format(len(pCS.S), tucked, len(orphans), len(remains)),
          file=sys.stderr)

    # 3. write batch<i>.remains2.fasta and self align
    cur_file = "batch{0}.remains2.fasta".format(batch_index)
    FileIO.write_seqids_to_fasta(remains, cur_file, fasta_d)
    o = ar.run_minimap(cur_file, cur_file, cpus=cpus)
    print("processing", o, file=sys.stderr)
    pCS, remains = sp.process_self_align_into_seed(
        o, remains, MiniReader, pCS, dun_use_partial=dun_use_partial)
    tucked = sum(v == 'T' for v in pCS.seq_stat.values())
    print(status_fmt.format(len(pCS.S), tucked, len(orphans), len(remains)),
          file=sys.stderr)

    # 4. combine remains+orphans to new orphans and write seed<i+1> files
    orphans = orphans.union(remains)
    next_index = batch_index + 1
    FileIO.write_preClusterSet_to_fasta(
        pCS, "seed{0}.S.fasta".format(next_index), fasta_d)
    FileIO.write_seqids_to_fasta(
        orphans, "seed{0}.orphans.fasta".format(next_index), fasta_d)

    return pCS, orphans