def read_cogent2_aligned_to_genome_sam(input, filename):
    """
    Read cogent2 mapped to a genome.

    input    --- path to the FASTA file of the aligned (query) sequences;
                 only used to build the {seq id --> seq length} lookup that
                 is handed to the SAM reader
    filename --- path to the SAM file of those sequences aligned to a genome

    Return: dict of {cogent path} --> list of SAM Record;
            set of mapped genome contigs
    If `filename` does not exist, returns ({}, set()) without touching `input`.

    NOTE: (minimap2 was run with --secondary=no so if multiple must be chimeric)
    """
    if not os.path.exists(filename):
        return {}, set()

    d = defaultdict(list)
    contigs_seen = set()
    try:
        # Read the query lengths inside a `with` so the FASTA handle is closed
        # as soon as the lookup is built (it used to be left open).
        with open(input) as fasta_handle:
            query_len_dict = dict(
                (rec.id, len(rec.seq)) for rec in SeqIO.parse(fasta_handle, 'fasta'))
        for r in BioReaders.GMAPSAMReader(filename, True, query_len_dict=query_len_dict):
            if r.sID == '*':
                continue  # unmapped
            d[r.qID].append(r)
            contigs_seen.add(r.sID)
    except IndexError:
        # Deliberate best-effort: keep whatever was parsed before the error.
        # NOTE(review): presumably raised by the reader on an empty or
        # truncated SAM record -- confirm, and consider logging it.
        pass
    return dict(d), contigs_seen
def post_minimap2_processing(ref='cogent.fa', sam='in.trimmed.fa.sam', output_prefix='cogent2', seqrecs=None):
    """
    Select which Cogent paths to keep and write them to <output_prefix>.fa.

    ref           --- FASTA of candidate paths; ids must look like path0, path1, ...
    sam           --- SAM file of the input sequences aligned to `ref`
    output_prefix --- output is written to <output_prefix>.fa
    seqrecs       --- input sequence records (each with .id and .seq);
                      defaults to no records

    Steps:
    1. For every input sequence, collect the path indices it aligns to with
       both query coverage and identity >= 0.98.
    2. Hand those {sequence --> path indices} to make_into_lp_problem and
       solve; paths whose variable comes back as 1 are kept. If the first
       attempt fails it is retried once with add_noise=True.
    3. Write the kept paths, then append every input sequence that had no
       qualifying alignment as its own new path (numbered after the kept ones).

    Returns nothing; the result is the file written.
    """
    # Materialise once: seqrecs is walked twice below (query lengths, then the
    # fallback output), which would silently skip the second pass for a generator.
    seqrecs = [] if seqrecs is None else list(seqrecs)

    good_for = defaultdict(list)
    reader = BioReaders.GMAPSAMReader(
        sam, True, query_len_dict=dict((rec.id, len(rec.seq)) for rec in seqrecs))
    for r in reader:
        if r.sID == '*':
            continue  # not mapped
        assert r.sID.startswith('path')  # chr should be path0, path1, etc
        assert 0 < r.qCoverage <= 1
        assert 0 < r.identity <= 1
        if r.qCoverage >= 0.98 and r.identity >= 0.98:
            good_for[r.qID].append(int(r.sID[4:]))

    touse = []
    if len(good_for) == 0:
        log.warning(
            "[BUG] good_for in post_minimap2_processing is empty. Probably from cycles. CHECK!"
        )
    else:
        # .values()/.items() work on both Python 2 and 3 (itervalues() is 2-only);
        # list() keeps handing the helper a real list, as Python 2's items() did.
        N = max(max(v) for v in good_for.values()) + 1
        try:
            prob = make_into_lp_problem(list(good_for.items()), N, add_noise=False)
            prob.solve()
        except Exception:
            # Retry with noise added; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            prob = make_into_lp_problem(list(good_for.items()), N, add_noise=True)
            prob.solve()
        for v in prob.variables():
            log.debug("{0} = {1}".format(v.name, v.varValue))
            if v.varValue == 1:
                touse.append(int(v.name))

    touse_set = set(touse)  # O(1) membership while scanning the reference
    with open(output_prefix + '.fa', 'w') as f:
        with open(ref) as ref_handle:
            for path_rec in SeqIO.parse(ref_handle, 'fasta'):
                if int(path_rec.id[4:]) in touse_set:
                    f.write(">{0}\n{1}\n".format(path_rec.id, path_rec.seq))
        # if there are some sequences that didn't map (possibly from cycles)
        # then just use THEMSELVES
        fake_path_i = max(touse) + 1 if touse else 0
        for rec in seqrecs:
            if rec.id not in good_for:
                log.warning(
                    "[BUG] {0} is not fully mapped to cogent in minimap2. "
                    "Likely cycle issues. Use itself in output.".format(rec.id))
                f.write(">path{0}\n{1}\n".format(fake_path_i, rec.seq))
                fake_path_i += 1
def tally_for_a_Cogent_dir(dirname, writer1, writer2, genome1, genome2=None, blastn_filename=None):
    """
    Tally one Cogent output directory into a per-family row and per-sequence rows.

    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.sam)
    2. read cogent2 mapped to genome1 (cogent2.fa.<genome1>.sam)
    3. read cogent2 mapped to genome2 (cogent2.fa.<genome2>.sam);
       if genome2 is None the genome2_* fields are simply left out
    4. optionally read blastn results (blastn_filename, relative to dirname)

    dirname  --- Cogent output directory; also used as the 'gene_family' value
    writer1  --- receives one dict per directory via writer1.writerow()
    writer2  --- receives one dict per input sequence via writer2.writerow()
                 (NOTE(review): presumably csv.DictWriter objects -- confirm)

    Does nothing (returns None) if <dirname>/COGENT.DONE does not exist.
    Raises AssertionError if a sequence in in.fa has no alignment to cogent2.
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    seq_info = defaultdict(list)
    contigs_seen = set()

    # Parse in.trimmed.fa once, with the handle closed afterwards; the lengths
    # are needed by the SAM reader and again by read_blastn.
    trimmed_fasta = os.path.join(dirname, 'in.trimmed.fa')
    with open(trimmed_fasta) as handle:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(handle, 'fasta'))

    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True, query_len_dict=qlen_dict)
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)

    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    with open(os.path.join(dirname, 'in.fa')) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            assert r.id in seq_info, \
                "{0} in {1}/in.fa is not mapped to cogent2".format(r.id, dirname)

    cogent_fasta = os.path.join(dirname, 'cogent2.fa')
    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(
        cogent_fasta, os.path.join(dirname, 'cogent2.fa.' + genome1 + '.sam'))

    if genome2 is not None:
        d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(
            cogent_fasta, os.path.join(dirname, 'cogent2.fa.' + genome2 + '.sam'))

    if blastn_filename is not None:
        # hand over a copy so read_blastn gets its own dict, as it did before
        best_of = read_blastn(os.path.join(dirname, blastn_filename), dict(qlen_dict))

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    rec1 = {
        'gene_family': dirname,
        'input_size': len(seq_info),
        'num_Cogent_contigs': len(contigs_seen),
        'num_genome_contig': len(contig_genome1),
        'genome_cov': "{0:.2f}".format(cov1),
        'genome_acc': "{0:.2f}".format(acc1),
        'genome_chimeric': has_chimeric1,
        'genome_contigs': ",".join(contig_genome1),
    }

    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    if genome2 is not None:
        cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
        rec1['num_genome2_contig'] = len(contig_genome2)
        rec1['genome2_cov'] = "{0:.2f}".format(cov2)
        rec1['genome2_acc'] = "{0:.2f}".format(acc2)
        rec1['genome2_chimeric'] = has_chimeric2
        rec1['genome2_contigs'] = ",".join(contig_genome2)

    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            rec1['num_blastn'] = 0
            rec1['blastn_best'] = 'NA'
        else:
            # values are (e-value, name); the smallest tuple is the best hit
            best_hit = min(best_of.values())
            rec1['num_blastn'] = sum(_n != 'NA' for _e, _n in best_of.values())
            rec1['blastn_best'] = '"' + best_hit[1] + '"'
    writer1.writerow(rec1)

    # per-sequence rows: where each input sequence itself lands on genome1
    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.' + genome1 + '.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, _ = read_cogent2_aligned_to_genome_sam(trimmed_fasta, in_aligned_to_genome1)
    else:
        d3 = {}

    for seqid, v in seq_info.items():
        contigs = [x.sID for x in v]
        # identity averaged over this sequence's alignments, weighted by query coverage
        acc = sum(x.identity * x.qCoverage for x in v) / sum(x.qCoverage for x in v)
        rec2 = {
            'seqid': seqid,
            'gene_family': dirname,
            'Cogent_contig': ",".join(contigs),
            'Cogent_contig_acc': acc,
        }

        if seqid not in d3:
            rec2['scaffold'] = 'NA'
            rec2['num_scaffold'] = 0
            rec2['scaffold_coverage'] = 'NA'
            rec2['scaffold_acc'] = 'NA'
            if blastn_filename is not None:
                rec2['blastn_best'] = 'NA'
        else:
            hits = d3[seqid]
            scaffolds = [x.sID for x in hits]
            # calculate cov and acc: merge the aligned query intervals so that
            # overlapping alignments are not counted twice
            c = ClusterTree(0, 0)
            for x in hits:
                c.insert(x.qStart, x.qEnd, -1)
            qlen = hits[-1].qLen  # same value the original loop ended up with
            cov = sum(_e - _s for _s, _e, _junk in c.getregions()) * 100. / qlen
            acc = sum(x.identity * x.qCoverage for x in hits) * 1. / sum(x.qCoverage for x in hits)
            rec2['scaffold'] = ",".join(scaffolds)
            rec2['num_scaffold'] = len(scaffolds)
            rec2['scaffold_coverage'] = cov
            rec2['scaffold_acc'] = acc
            if blastn_filename is not None:
                rec2['blastn_best'] = best_of[seqid][1]
        writer2.writerow(rec2)
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2, blastn_filename=None):
    """
    Tally one Cogent output directory into tab-delimited text.

    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.sam)
    2. read cogent2 mapped to genome1 (cogent2.fa.<genome1>.sam)
    3. read cogent2 mapped to genome2 (cogent2.fa.<genome2>.sam)
       (if genome2 does not exist, just repeat genome1)
    4. optionally read blastn results (blastn_filename, relative to dirname)

    dirname --- Cogent output directory; also written out as the family name
    f1      --- writable handle; gets one line for the whole directory
    f2      --- writable handle; gets one line per input sequence

    Does nothing (returns None) if <dirname>/COGENT.DONE does not exist.
    Raises AssertionError if a sequence in in.fa has no alignment to cogent2.
    """
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return

    seq_info = defaultdict(list)
    contigs_seen = set()

    # Parse in.trimmed.fa once, with the handle closed afterwards; the lengths
    # are needed by the SAM reader and again by read_blastn.
    trimmed_fasta = os.path.join(dirname, 'in.trimmed.fa')
    with open(trimmed_fasta) as handle:
        qlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(handle, 'fasta'))

    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.sam')
    reader = BioReaders.GMAPSAMReader(filename, True, query_len_dict=qlen_dict)
    for r in reader:
        seq_info[r.qID].append(r)
        contigs_seen.add(r.sID)

    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    with open(os.path.join(dirname, 'in.fa')) as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            assert r.id in seq_info, \
                "{0} in {1}/in.fa is not mapped to cogent2".format(r.id, dirname)

    cogent_fasta = os.path.join(dirname, 'cogent2.fa')
    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_sam(
        cogent_fasta, os.path.join(dirname, 'cogent2.fa.' + genome1 + '.sam'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_sam(
        cogent_fasta, os.path.join(dirname, 'cogent2.fa.' + genome2 + '.sam'))

    if blastn_filename is not None:
        # hand over a copy so read_blastn gets its own dict, as it did before
        best_of = read_blastn(os.path.join(dirname, blastn_filename), dict(qlen_dict))

    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}".format(
        len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)))

    # (for blastn, optional) best name with best e-value
    if blastn_filename is not None:
        if len(best_of) == 0:
            f1.write("\t0\tNA\n")
        else:
            # values are (e-value, name); the smallest tuple is the best hit.
            # min() replaces values().sort(), which only works on Python 2 lists.
            best_hit = min(best_of.values())
            f1.write("\t{0}\t\"{1}\"\n".format(
                sum(_n != 'NA' for _e, _n in best_of.values()), best_hit[1]))
    else:
        f1.write("\n")

    # per-sequence lines: where each input sequence itself lands on genome1
    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.' + genome1 + '.sam')
    if os.path.exists(in_aligned_to_genome1):
        d3, _ = read_cogent2_aligned_to_genome_sam(trimmed_fasta, in_aligned_to_genome1)
    else:
        d3 = {}

    # the line terminator also carries the blastn column when one is expected
    for seqid, v in seq_info.items():  # items() works on Python 2 and 3
        contigs = [x.sID for x in v]
        # identity averaged over this sequence's alignments, weighted by query coverage
        acc = sum(x.identity * x.qCoverage for x in v) / sum(x.qCoverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))

        if seqid not in d3:
            f2.write("NA\t0\tNA\tNA")
            if blastn_filename is not None:
                f2.write("\tNA\n")
            else:
                f2.write("\n")
        else:
            hits = d3[seqid]
            scaffolds = [x.sID for x in hits]
            # calculate cov and acc: merge the aligned query intervals so that
            # overlapping alignments are not counted twice
            c = ClusterTree(0, 0)
            for x in hits:
                c.insert(x.qStart, x.qEnd, -1)
            qlen = hits[-1].qLen  # same value the original loop ended up with
            cov = sum(_e - _s for _s, _e, _junk in c.getregions()) * 100. / qlen
            acc = sum(x.identity * x.qCoverage for x in hits) * 1. / sum(x.qCoverage for x in hits)
            f2.write("{0}\t{1}\t{2}\t{3}".format(",".join(scaffolds), len(scaffolds), cov, acc))
            if blastn_filename is not None:
                f2.write("\t{0}\n".format(best_of[seqid][1]))
            else:
                f2.write("\n")