Ejemplo n.º 1
0
def post_minimap2_processing(ref='cogent.fa',
                             sam='in.trimmed.fa.sam',
                             output_prefix='cogent2',
                             seqrecs=[]):
    """Select reference paths from a minimap2 alignment and write them out.

    Each SAM record in *sam* whose query coverage and identity are both at
    least 0.98 marks its subject path as "good for" that query.  The
    resulting (query id, path indices) pairs are handed to
    ``make_into_lp_problem``; every solved variable whose value is 1 names
    a path index to keep.  The kept ``path<N>`` records are copied from
    *ref* into ``<output_prefix>.fa``.  Every record in *seqrecs* that has
    no qualifying alignment is then appended to the same file under a
    fresh ``path<N>`` id, numbered after the largest kept index.

    Args:
        ref: FASTA file whose record ids look like ``path0``, ``path1``, ...
        sam: SAM file read through ``BioReaders.GMAPSAMReader``.
        output_prefix: the output is written to ``output_prefix + '.fa'``.
        seqrecs: iterable of records exposing ``.id`` and ``.seq``.  It is
            copied on entry, so a one-shot iterator is acceptable and the
            shared default list is never mutated.

    Raises:
        AssertionError: if a mapped record's subject id does not start with
            ``path`` or its coverage/identity is outside ``(0, 1]`` (these
            sanity checks vanish under ``python -O``).
        ValueError: if the text after the first four characters of a path
            id is not an integer.
    """
    # The records are walked twice (query lengths now, unmapped fallback at
    # the end); without a snapshot an iterator would be exhausted by the
    # first pass and the fallback would silently write nothing.
    seqrecs = list(seqrecs)

    good_for = defaultdict(list)  # query id -> indices of well-matching paths
    reader = BioReaders.GMAPSAMReader(
        sam,
        True,  # NOTE(review): presumably the "file has a header" flag -- confirm
        query_len_dict={r.id: len(r.seq) for r in seqrecs})
    for r in reader:
        if r.sID == '*':
            continue  # not mapped
        assert r.sID.startswith('path')  # chr should be path0, path1, etc
        assert 0 < r.qCoverage <= 1
        assert 0 < r.identity <= 1
        if r.qCoverage >= 0.98 and r.identity >= 0.98:
            good_for[r.qID].append(int(r.sID[4:]))

    # Only membership tests and max() are needed, so a set is sufficient.
    touse = set()
    if not good_for:
        log.warning(
            "[BUG] good_for in post_minimap2_processing is empty. Probably from cycles. CHECK!"
        )
    else:
        # .values() instead of .itervalues(): the latter exists on Python 2 only.
        N = max(max(v) for v in good_for.values()) + 1
        try:
            prob = make_into_lp_problem(list(good_for.items()),
                                        N,
                                        add_noise=False)
            prob.solve()
        except Exception as err:
            # Retry with noise, but do not swallow KeyboardInterrupt/SystemExit
            # and leave a trace of why the first attempt failed.
            log.warning(
                "LP solve without noise failed ({0!r}); retrying with add_noise=True".
                format(err))
            prob = make_into_lp_problem(list(good_for.items()),
                                        N,
                                        add_noise=True)
            prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.add(int(v.name))

    with open(output_prefix + '.fa', 'w') as f:
        # Open the reference explicitly so its handle is closed once parsed.
        with open(ref) as ref_handle:
            for r in SeqIO.parse(ref_handle, 'fasta'):
                if int(r.id[4:]) in touse:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))
        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                # The backslash continuation keeps the next line's leading
                # spaces inside the message; left untouched so the logged
                # text stays exactly as before.
                log.warning(
                    "[BUG] {0} is not fully mapped to cogent in minimap2. \
                Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1
Ejemplo n.º 2
0
def post_gmap_processing(db_name='cogent', gff_filename='in.trimmed.fa.gff', output_prefix='cogent2', seqrecs=[]):
    """Select reference paths from a GMAP GFF alignment and write them out.

    Each GFF record in *gff_filename* with ``coverage >= 98.`` marks its
    ``path<N>`` chromosome as "good for" that record's ``seqid``.  The
    resulting (seqid, path indices) pairs are handed to
    ``make_into_lp_problem``; every solved variable whose value is 1 names
    a path index to keep.  The kept records are copied from
    ``db_name + '.fa'`` into ``output_prefix + '.fa'``.  Every record in
    *seqrecs* whose id has no qualifying alignment is then appended to the
    same file under a fresh ``path<N>`` id, numbered after the largest kept
    index.

    Args:
        db_name: prefix of the reference FASTA (``db_name + '.fa'``), whose
            record ids look like ``path0``, ``path1``, ...
        gff_filename: GFF file read through ``GFF.gmapGFFReader``.
        output_prefix: the output is written to ``output_prefix + '.fa'``.
        seqrecs: iterable of records exposing ``.id`` and ``.seq``; it is
            only read, so the shared default list is never mutated.

    Raises:
        AssertionError: if a record's ``chr`` does not start with ``path``
            (this sanity check vanishes under ``python -O``).
        ValueError: if the text after the first four characters of a path
            id is not an integer.
    """
    good_for = defaultdict(list)  # seqid -> indices of well-covering paths
    reader = GFF.gmapGFFReader(gff_filename)
    for r in reader:
        assert r.chr.startswith('path')  # chr should be path0, path1, etc
        # Compared against 98., so coverage is presumably a percentage here.
        if r.coverage >= 98.:
            good_for[r.seqid].append(int(r.chr[4:]))

    # Only membership tests and max() are needed, so a set is sufficient.
    touse = set()
    if not good_for:
        log.warning("[BUG] good_for in post_gmap_processing is empty. Probably from cycles. CHECK!")
    else:
        # .values() instead of .itervalues(): the latter exists on Python 2 only.
        N = max(max(v) for v in good_for.values()) + 1
        # list(...) keeps the Python 2 behaviour of passing a real list.
        prob = make_into_lp_problem(list(good_for.items()), N)
        prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.add(int(v.name))

    with open(output_prefix + '.fa', 'w') as f:
        # Open the reference explicitly so its handle is closed once parsed.
        with open(db_name + '.fa') as ref_handle:
            for r in SeqIO.parse(ref_handle, 'fasta'):
                if int(r.id[4:]) in touse:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))
        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                # The backslash continuation keeps the next line's leading
                # spaces inside the message; left untouched so the logged
                # text stays exactly as before.
                log.warning("[BUG] {0} is not fully mapped to cogent in GMAP. \
                Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1
Ejemplo n.º 3
0
def post_gmap_processing(db_name='cogent', gff_filename='in.trimmed.fa.gff', output_prefix='cogent2', seqrecs=[]):
    """Select reference paths from a GMAP GFF alignment and write them out.

    Each GFF record in *gff_filename* with ``coverage >= 98.`` marks its
    ``path<N>`` chromosome as "good for" that record's ``seqid``.  The
    resulting (seqid, path indices) pairs are handed to
    ``make_into_lp_problem``; every solved variable whose value is 1 names
    a path index to keep.  The kept records are copied from
    ``db_name + '.fa'`` into ``output_prefix + '.fa'``.  Every record in
    *seqrecs* whose id has no qualifying alignment is then appended to the
    same file under a fresh ``path<N>`` id, numbered after the largest kept
    index.

    Args:
        db_name: prefix of the reference FASTA (``db_name + '.fa'``), whose
            record ids look like ``path0``, ``path1``, ...
        gff_filename: GFF file read through ``GFF.gmapGFFReader``.
        output_prefix: the output is written to ``output_prefix + '.fa'``.
        seqrecs: iterable of records exposing ``.id`` and ``.seq``; it is
            only read, so the shared default list is never mutated.

    Raises:
        AssertionError: if a record's ``chr`` does not start with ``path``
            (this sanity check vanishes under ``python -O``).
        ValueError: if the text after the first four characters of a path
            id is not an integer.
    """
    good_for = defaultdict(list)  # seqid -> indices of well-covering paths
    reader = GFF.gmapGFFReader(gff_filename)
    for r in reader:
        assert r.chr.startswith('path')  # chr should be path0, path1, etc
        # Compared against 98., so coverage is presumably a percentage here.
        if r.coverage >= 98.:
            good_for[r.seqid].append(int(r.chr[4:]))

    # Only membership tests and max() are needed, so a set is sufficient.
    touse = set()
    if not good_for:
        log.warning("[BUG] good_for in post_gmap_processing is empty. Probably from cycles. CHECK!")
    else:
        # .values() instead of .itervalues(): the latter exists on Python 2 only.
        N = max(max(v) for v in good_for.values()) + 1
        # list(...) keeps the Python 2 behaviour of passing a real list.
        prob = make_into_lp_problem(list(good_for.items()), N)
        prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.add(int(v.name))

    with open(output_prefix + '.fa', 'w') as f:
        # Open the reference explicitly so its handle is closed once parsed.
        with open(db_name + '.fa') as ref_handle:
            for r in SeqIO.parse(ref_handle, 'fasta'):
                if int(r.id[4:]) in touse:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))
        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                # The backslash continuation keeps the next line's leading
                # spaces inside the message; left untouched so the logged
                # text stays exactly as before.
                log.warning("[BUG] {0} is not fully mapped to cogent in GMAP. \
                Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1