def post_minimap2_processing(ref='cogent.fa', sam='in.trimmed.fa.sam', output_prefix='cogent2', seqrecs=[]):
    """
    Select reference paths from a minimap2 SAM alignment and write them
    to ``<output_prefix>.fa``.

    A SAM record counts as a good hit when both its query coverage and
    its identity are >= 0.98. The good hits (query id -> list of path
    indices) are handed to ``make_into_lp_problem``; every LP variable
    whose solved value is 1 marks a path index to keep. Records of `ref`
    with a kept index are copied to the output. Any record of `seqrecs`
    that has no good hit at all is then appended under a fresh
    ``path<N>`` id, numbered from one past the largest kept index
    (or from 0 if nothing was kept).

    :param ref: FASTA file whose record ids have the form ``path<N>``
    :param sam: SAM file of `seqrecs` aligned against `ref`
    :param output_prefix: output is written to ``output_prefix + '.fa'``
    :param seqrecs: sequence records (need ``.id`` and ``.seq``) that were
                    aligned. The default list is shared between calls but
                    is only ever read here, never mutated.

    :raises AssertionError: if a mapped record's subject id does not start
                            with ``path`` or its coverage/identity falls
                            outside (0, 1].
    """
    query_lengths = dict((rec.id, len(rec.seq)) for rec in seqrecs)

    # query id --> indices of the paths it maps to at high coverage/identity
    good_for = defaultdict(list)
    for r in BioReaders.GMAPSAMReader(sam, True, query_len_dict=query_lengths):
        if r.sID == '*':  # not mapped
            continue
        assert r.sID.startswith('path')  # chr should be path0, path1, etc
        assert 0 < r.qCoverage <= 1
        assert 0 < r.identity <= 1
        if r.qCoverage >= 0.98 and r.identity >= 0.98:
            good_for[r.qID].append(int(r.sID[4:]))

    # Only used for membership tests and max(), so a set is enough.
    touse = set()
    if not good_for:
        log.warning(
            "[BUG] good_for in post_minimap2_processing is empty. Probably from cycles. CHECK!"
        )
    else:
        # .values() (not .itervalues()) so this runs on Python 2 and 3 alike.
        N = max(max(v) for v in good_for.values()) + 1
        try:
            prob = make_into_lp_problem(list(good_for.items()), N, add_noise=False)
            prob.solve()
        except Exception as err:
            # Still best-effort: retry once with noise added. Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            log.warning("LP solve without noise failed ({0}); "
                        "retrying with add_noise=True".format(err))
            prob = make_into_lp_problem(list(good_for.items()), N, add_noise=True)
            prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.add(int(v.name))

    with open(output_prefix + '.fa', 'w') as f:
        # Open the reference in its own context so the handle is closed
        # as soon as parsing finishes.
        with open(ref) as ref_handle:
            for r in SeqIO.parse(ref_handle, 'fasta'):
                if int(r.id[4:]) in touse:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))

        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                log.warning("[BUG] {0} is not fully mapped to cogent in minimap2. "
                            "Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1
def post_gmap_processing(db_name='cogent', gff_filename='in.trimmed.fa.gff', output_prefix='cogent2', seqrecs=[]):
    """
    Select reference paths from a GMAP GFF alignment and write them
    to ``<output_prefix>.fa``.

    A GFF record counts as a good hit when its coverage is >= 98. The
    good hits (sequence id -> list of path indices) are handed to
    ``make_into_lp_problem``; every LP variable whose solved value is 1
    marks a path index to keep. Records of ``<db_name>.fa`` with a kept
    index are copied to the output. Any record of `seqrecs` that has no
    good hit at all is then appended under a fresh ``path<N>`` id,
    numbered from one past the largest kept index (or from 0 if nothing
    was kept).

    :param db_name: reference FASTA is read from ``db_name + '.fa'``; its
                    record ids have the form ``path<N>``
    :param gff_filename: GMAP GFF output of `seqrecs` aligned to the reference
    :param output_prefix: output is written to ``output_prefix + '.fa'``
    :param seqrecs: sequence records (need ``.id`` and ``.seq``) that were
                    aligned. The default list is shared between calls but
                    is only ever read here, never mutated.

    :raises AssertionError: if a record's chromosome name does not start
                            with ``path``.
    """
    # sequence id --> indices of the paths it maps to at high coverage
    good_for = defaultdict(list)
    for r in GFF.gmapGFFReader(gff_filename):
        assert r.chr.startswith('path')  # chr should be path0, path1, etc
        if r.coverage >= 98.:
            good_for[r.seqid].append(int(r.chr[4:]))

    # Only used for membership tests and max(), so a set is enough.
    touse = set()
    if not good_for:
        log.warning("[BUG] good_for in post_gmap_processing is empty. Probably from cycles. CHECK!")
    else:
        # .values() (not .itervalues()) so this runs on Python 2 and 3 alike.
        N = max(max(v) for v in good_for.values()) + 1
        prob = make_into_lp_problem(list(good_for.items()), N)
        prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.add(int(v.name))

    with open(output_prefix + '.fa', 'w') as f:
        # Open the reference in its own context so the handle is closed
        # as soon as parsing finishes.
        with open(db_name + '.fa') as ref_handle:
            for r in SeqIO.parse(ref_handle, 'fasta'):
                if int(r.id[4:]) in touse:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))

        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                log.warning("[BUG] {0} is not fully mapped to cogent in GMAP. "
                            "Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1