def post_gmap_processing(db_name='cogent', gff_filename='in.trimmed.fa.gff', output_prefix='cogent2', seqrecs=[]):
    """
    Pick which reconstructed paths to keep after GMAP alignment and write
    them to `<output_prefix>.fa`.

    Steps:
      1. Read alignments from `gff_filename` with GFF.gmapGFFReader. For each
         alignment with coverage >= 98, record the path index (the integer
         following 'path' in r.chr) under the aligned sequence id r.seqid.
      2. If anything was recorded, build a problem with
         make_into_lp_problem(items, N), solve it, and keep every variable
         whose value is 1; int(variable name) is taken as a path index.
         NOTE(review): presumably the LP selects a minimal set of paths that
         covers every input sequence -- confirm in make_into_lp_problem.
      3. Write the records of `<db_name>.fa` whose path index was kept.
      4. Any record in `seqrecs` whose id had no good alignment is written
         as-is under a fresh 'path<N>' id, numbered after the largest kept
         path index (or from 0 if no path was kept).

    :param db_name: prefix of the fasta file holding the candidate paths
    :param gff_filename: GMAP GFF output to read alignments from
    :param output_prefix: prefix of the fasta file to write
    :param seqrecs: input sequence records (need .id and .seq); only
                    iterated here, never mutated, so the shared default list
                    is harmless
    """
    good_for = defaultdict(list)  # seqid --> indices of paths that cover it well
    for r in GFF.gmapGFFReader(gff_filename):
        assert r.chr.startswith('path')  # chr should be path0, path1, etc
        if r.coverage >= 98.:
            good_for[r.seqid].append(int(r.chr[4:]))

    touse = []
    if not good_for:
        log.warning("[BUG] good_for in post_gmap_processing is empty. Probably from cycles. CHECK!")
    else:
        # .values() / list(.items()) behave the same on Python 2 and also
        # work on Python 3, unlike the former .itervalues()
        N = max(max(v) for v in good_for.values()) + 1
        prob = make_into_lp_problem(list(good_for.items()), N)
        prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.append(int(v.name))

    # set for O(1) membership tests while scanning the database fasta
    touse_set = set(touse)

    with open(output_prefix + '.fa', 'w') as f:
        # open the database in its own context manager so the handle is
        # closed deterministically instead of being leaked
        with open(db_name + '.fa') as db_handle:
            for r in SeqIO.parse(db_handle, 'fasta'):
                if int(r.id[4:]) in touse_set:
                    f.write(">{0}\n{1}\n".format(r.id, r.seq))

        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for r in seqrecs:
            if r.id not in good_for:
                # adjacent literals instead of a backslash continuation, so
                # source indentation does not end up inside the log message
                log.warning("[BUG] {0} is not fully mapped to cogent in GMAP. "
                            "Likely cycle issues. Use itself in output.".format(r.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, r.seq))
                fake_path_i += 1
#!/usr/bin/env python
# Convert a GMAP GFF file into collapsed-GFF format.
#
# Usage: <this script> <input.gff>
#
# Every record that GFF.gmapGFFReader yields from the input file is written
# with GFF.write_collapseGFF_format to a file named after the input, with its
# extension replaced by '.collapsed.gff'.
import os, sys
from Cogent import GFF

if len(sys.argv) < 2:
    # fail with a readable message rather than a bare IndexError
    sys.exit("Usage: {0} <input.gff>".format(sys.argv[0]))

gff_path = sys.argv[1]

# os.path.splitext strips only a real extension on the last path component.
# Slicing at rfind('.') chopped the final character when the name had no dot
# (rfind returns -1) and cut at a directory's dot for paths like 'run.1/aln'.
collapsed_path = os.path.splitext(gff_path)[0] + '.collapsed.gff'

# the context manager closes the output even if reading or writing raises
with open(collapsed_path, 'w') as out_handle:
    for rec in GFF.gmapGFFReader(gff_path):
        GFF.write_collapseGFF_format(out_handle, rec)