def main(cons_filename, in_filename): run_gmap(cons_filename) run_gmap(in_filename) cons = GFF.gmapGFFReader(cons_filename+'.gff').next() good, bad = 0, [] for r in GFF.gmapGFFReader(in_filename+'.gff'): if compare_gff(cons, r): good += 1 else: bad.append(r.seqid) print "{0}/{1} agree with consensus".format(good, good+len(bad)) print "Disagreement:" for x in bad: print x return good*1./(good+len(bad))
def main(gmap_filename):
    """
    Filter GMAP records (coverage >= 95% and every exon score >= 80),
    cluster the survivors by genomic overlap per chromosome, merge
    compatible records within each cluster, and write the result in
    UCSC-style GFF to <gmap_filename>.cov95score80_consolidated_UCSC.gff.
    """
    from collections import defaultdict
    output = []  # all records that pass the coverage/score filter
    # chromosome --> ClusterTree of (start, end, index into `output`)
    CT = defaultdict(lambda: ClusterTree(0,0))
    for r in GFF.gmapGFFReader(gmap_filename):
        if r.coverage >= 95. and all(x>=80 for x in r.scores):
            output.append(r)
            i = len(output)-1
            CT[r.chr].insert(r.start, r.end, i)
    f = open(gmap_filename+'.cov95score80_consolidated_UCSC.gff', 'w')
    for ct in CT.itervalues():
        for a,b,record_indices in ct.getregions():
            records = [output[ind] for ind in record_indices]
            # continuously merge potentially compatible records:
            # anchor on record i and try to fold each overlapping j into it;
            # consolidate() returning None means i and j cannot be merged
            # (presumably consolidate mutates record i on success -- confirm)
            i = 0
            while i < len(records)-1:
                j = i + 1
                # NOTE(review): the overlap test records[j].start < records[i].end
                # assumes records within a region are ordered by start -- confirm
                while j < len(records) and records[j].start < records[i].end:
                    if consolidate(records[i], records[j]) is None:
                        j += 1
                    else:
                        records.pop(j) # j is merged into i, delete
                i += 1
            for r in records:
                GFF.write_GFF_UCSCformat(f, r)
    f.close()
def test_random_subset_gcon(records, iterations=10, size=10, nproc=12):
    """
    Repeatedly pick a random subset of records, run ice_pbdagcon on it,
    map the resulting consensus with GMAP, and yield the parsed record.

    Parameters:
      records -- sequence records exposing .id and .seq
      iterations -- number of random subsets to try
      size -- number of records per subset
      nproc -- processes for ice_pbdagcon (new parameter; default keeps the
               previously hard-coded value of 12)

    Yields the first gmapRecord parsed from tmp_cons.fa.gff per iteration.
    Uses fixed temp filenames (tmp.fa / tmp_cons*) in the cwd, so it is not
    safe to run concurrently.
    """
    for _ in xrange(iterations):  # loop var was `iter`, shadowing the builtin
        picked = random.sample(records, size)
        f = open("tmp.fa", "w")
        for r in picked:
            f.write(">{0}\n{1}\n".format(r.id, r.seq))
        f.close()
        # remove stale outputs so we never read a previous iteration's result
        if os.path.exists("tmp_cons.fa"):
            os.remove("tmp_cons.fa")
        if os.path.exists("tmp_cons.fa.gff"):
            os.remove("tmp_cons.fa.gff")
        os.system("ice_pbdagcon.py tmp.fa tmp_cons tmp_cons --nproc {0}".format(nproc))
        run_gmap("tmp_cons.fa")
        yield GFF.gmapGFFReader("tmp_cons.fa.gff").next()
def process_contig(dirname, f):
    """
    Summarize GMAP alignments from <dirname>/aloha2.fa.cuttlefish.gff.

    For every path, writes one line "dirname<TAB>path<TAB>scaffold,scaffold,..."
    followed by one '#'-prefixed detail line per alignment segment.
    """
    # path_number --> list of (start, end, scaffold, scaffold_start, scaffold_end)
    per_path = defaultdict(lambda: [])
    gff_path = os.path.join(dirname, 'aloha2.fa.cuttlefish.gff')
    for rec in GFF.gmapGFFReader(gff_path):
        exons = rec.seq_exons
        # orient the query span according to strand
        if rec.strand == '+':
            q_start, q_end = exons[0].start, exons[-1].end
        else:
            q_start, q_end = exons[-1].start, exons[0].end
        per_path[rec.seqid].append((q_start, q_end, rec.chr, rec.start, rec.end))
    if not per_path:
        return
    for path_i, segments in per_path.iteritems():
        segments.sort(key=lambda seg: seg[0])
        scaffold_names = [seg[2] for seg in segments]
        f.write("{0}\t{1}\t{2}\n".format(dirname, path_i, ",".join(scaffold_names)))
        for q_start, q_end, scaf, scaf_s, scaf_e in segments:
            f.write("#{0}:{1}-{2}\t{3}:{4}-{5}\n".format(path_i, q_start, q_end, scaf, scaf_s, scaf_e))
def process_contig(dirname, f):
    """
    Summarize GMAP alignments of cuttlefish contigs.

    Reads <dirname>/aloha2.fa.cuttlefish.gff and writes, for every path:
      dirname<TAB>path_id<TAB>comma-joined scaffold names
    followed by one detail line per alignment segment:
      #path:start-end<TAB>scaffold:scaffold_start-scaffold_end

    Parameters:
      dirname -- directory containing the GFF file
      f -- open writable file handle receiving the report
    """
    d = defaultdict(list)  # path_number --> (start, end, scaffold, chr_start, chr_end)
    reader = GFF.gmapGFFReader(
        os.path.join(dirname, 'aloha2.fa.cuttlefish.gff'))
    for r in reader:
        # orient the query coordinates according to strand
        if r.strand == '+':
            s, e = r.seq_exons[0].start, r.seq_exons[-1].end
        else:
            s, e = r.seq_exons[-1].start, r.seq_exons[0].end
        d[r.seqid].append((s, e, r.chr, r.start, r.end))
    if not d:
        return
    for path_i, x in d.iteritems():
        # key was `lambda x: x[0]`, which shadowed the loop variable x
        x.sort(key=lambda seg: seg[0])
        xx = [_chr for s, e, _chr, _chr_s, _chr_e in x]
        f.write("{0}\t{1}\t{2}\n".format(dirname, path_i, ",".join(xx)))
        for s, e, _chr, _chr_s, _chr_e in x:
            f.write("#{0}:{1}-{2}\t{3}:{4}-{5}\n".format(
                path_i, s, e, _chr, _chr_s, _chr_e))
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Read cogent2 mapped to a genome.

    Return: dict of {cogent path} --> list of gmapRecord;
            set of mapped genome contigs
    Both are empty when the file does not exist.
    NOTE: (gmap was run with -n 0 so if multiple must be chimeric)
    """
    if not os.path.exists(filename):
        return {}, set()
    path_to_records = defaultdict(lambda: [])
    contigs_seen = set()
    try:
        for rec in GFF.gmapGFFReader(filename):
            path_to_records[rec.seqid].append(rec)
            contigs_seen.add(rec.chr)
    except IndexError:
        # best-effort: keep whatever was parsed before the bad record
        pass
    return dict(path_to_records), contigs_seen
def read_cogent2_aligned_to_genome_gff(filename):
    """
    Read cogent2 mapped to a genome (GMAP .gff output).

    Parameters:
      filename -- path to the GMAP GFF file; may not exist.

    Returns:
      (d, contigs_seen) where d is {cogent path id --> list of gmapRecord}
      and contigs_seen is the set of genome contigs with at least one
      alignment. Both are empty when the file is missing.

    NOTE: (gmap was run with -n 0 so if multiple must be chimeric)
    """
    d = defaultdict(list)  # idiom: list factory instead of lambda: []
    contigs_seen = set()
    if not os.path.exists(filename):
        return {}, set()
    try:
        for r in GFF.gmapGFFReader(filename):
            d[r.seqid].append(r)
            contigs_seen.add(r.chr)
    except IndexError:
        # best-effort: a malformed trailing record aborts parsing, but we
        # still return everything successfully read so far
        pass
    return dict(d), contigs_seen
def main(gmap_filename, fasta_filename):
    """
    Given a GMAP output (.gff) compare the aligned start/end to Gencode
    annotations (transcript & polyA).

    Need the original fasta to get sequence length.

    Writes <gmap_filename>.summary, one tab-separated line per record that
    passes the module-level min_coverage filter.
    """
    # seq id --> sequence length, used for the end-coordinate correction below
    seqlen_dict = dict((r.id, len(r.seq)) for r in SeqIO.parse(open(fasta_filename), 'fasta'))
    gtf_f = '/home/UNIXHOME/etseng/share/gencode/gencode.v15.annotation.gtf'
    gtfA_f = '/home/UNIXHOME/etseng/share/gencode/gencode.v15.polyAs.gtf'
    gtf = GFF.GTF(gtf_f)
    gtfA = GFF.polyAGFF(gtfA_f)
    f = open(gmap_filename + '.summary', 'w')
    f.write("ID\thit5_exon\thit5_dist\thit5_id\thit3_exon\thit3_dist\thit3_id\thitA_dist\n")
    reader = GFF.gmapGFFReader(gmap_filename)
    while True:
        try:
            r = reader.next()
        except AssertionError:  # ignore bad gmap output
            continue
        except StopIteration:
            break
        except Exception:  # was a bare except:, which also swallowed SystemExit/KeyboardInterrupt
            continue
        if r.coverage < min_coverage:
            continue
        # IMPORTANT! if r.start/r.end is not complete, extend it!
        # NOTE(review): these corrected coordinates are computed but never used
        # below -- validate_* still receives r.start/r.end. Confirm intent.
        r_start_corrected = r.start - r.seq_exons[0].start
        r_end_corrected = r.end + (seqlen_dict[r.seqid] - r.seq_exons[-1].end)
        hit5, hit3 = validate_53seen(gtf, r.chr, r.start, r.end, r.strand)
        hitA = validate_polyA(gtfA, r.chr, r.start, r.end, r.strand)
        f.write("{id}\t{e5}\t{d5}\t{i5}\t{e3}\t{d3}\t{i3}\t{dA}\n".format(
            id=r.seqid, e5=hit5[0], d5=hit5[1], i5=hit5[2],
            e3=hit3[0], d3=hit3[1], i3=hit3[2], dA=hitA))
    f.close()
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    Summarize one finished Cogent output directory into two report streams.

    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2
    (if genome2 does not exist, just repeat genome1)

    Parameters:
      dirname -- Cogent output directory; skipped unless COGENT.DONE exists
      f1 -- open handle for the one per-directory summary line
      f2 -- open handle for the per-input-sequence summary lines
      genome1, genome2 -- genome name suffixes used to build the .gff filenames
    """
    # only process directories where Cogent finished successfully
    if not os.path.exists(os.path.join(dirname, 'COGENT.DONE')):
        return
    seq_info = defaultdict(lambda: [])  # input seq id --> gmap records onto cogent2 contigs
    contigs_seen = set()  # cogent2 contigs that received at least one alignment
    # input mapped to Cogent contigs
    filename = os.path.join(dirname, 'in.trimmed.fa.cogent2.gff')
    reader = GFF.gmapGFFReader(filename)
    for r in reader:
        seq_info[r.seqid].append(r)
        contigs_seen.add(r.chr)
    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for r in SeqIO.parse(open(os.path.join(dirname, 'in.fa')), 'fasta'):
        assert r.id in seq_info
    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome1 + '.gff'))
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(
        os.path.join(dirname, 'cogent2.fa.' + genome2 + '.gff'))
    # write:
    # dirname, # of input, # of cogent contig, # of pacbio_contig, total pacbio cov, pacbio iden
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)))
    # (for genome2), # of contig, total worst cov, iden, is_chimeric, comma-separated list of contigs
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
        len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)))
    # per-sequence report: how each input sequence maps to the genome directly
    in_aligned_to_genome1 = os.path.join(dirname, 'in.trimmed.fa.'
                                         + genome1 + '.gff')
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_gff(in_aligned_to_genome1)
    else:
        d3 = {}
    for seqid, v in seq_info.iteritems():
        contigs = [x.chr for x in v]
        # coverage-weighted identity across all hits for this sequence
        acc = sum(x.identity * x.coverage for x in v) / sum(x.coverage for x in v)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(contigs), acc))
        if not seqid in d3:
            # sequence has no direct genome alignment
            f2.write("NA\t0\tNA\tNA\n")
        else:
            scaffolds = [x.chr for x in d3[seqid]]
            cov = sum(x.coverage for x in d3[seqid])
            acc = sum(x.identity * x.coverage for x in d3[seqid]) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds),
                                                   len(scaffolds), cov, acc))
def tally_for_a_Cogent_dir(dirname, f1, f2, genome1, genome2):
    """
    1. read input mapped to cogent2 (in.trimmed.fa.cogent2.gff)
    2. read cogent2 mapped to genome1
    3. read cogent2 mapped to genome2
    (if genome2 does not exist, just repeat genome1)

    Writes one per-directory line to f1 and one line per input sequence to f2.
    Does nothing unless <dirname>/COGENT.DONE exists.
    """
    if not os.path.exists(os.path.join(dirname, "COGENT.DONE")):
        return

    # input mapped to Cogent contigs
    seq_info = defaultdict(lambda: [])
    contigs_seen = set()
    for rec in GFF.gmapGFFReader(os.path.join(dirname, "in.trimmed.fa.cogent2.gff")):
        seq_info[rec.seqid].append(rec)
        contigs_seen.add(rec.chr)

    # sanity check that all sequences in in.fa are mapped to cogent2.fa
    for rec in SeqIO.parse(open(os.path.join(dirname, "in.fa")), "fasta"):
        assert rec.id in seq_info

    gff1 = os.path.join(dirname, "cogent2.fa." + genome1 + ".gff")
    gff2 = os.path.join(dirname, "cogent2.fa." + genome2 + ".gff")
    d_genome1, contig_genome1 = read_cogent2_aligned_to_genome_gff(gff1)
    d_genome2, contig_genome2 = read_cogent2_aligned_to_genome_gff(gff2)

    # per-directory line:
    # dirname, # of input, # of cogent contig, then per-genome:
    # # of contig, total cov, iden, is_chimeric, comma-separated contig list
    f1.write("{0}\t{1}\t{2}\t".format(dirname, len(seq_info), len(contigs_seen)))
    cov1, acc1, has_chimeric1 = calculate_cov_acc(d_genome1)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\t".format(
        len(contig_genome1), cov1, acc1, has_chimeric1, ",".join(contig_genome1)))
    cov2, acc2, has_chimeric2 = calculate_cov_acc(d_genome2)
    f1.write("{0}\t{1:.2f}\t{2:.2f}\t{3}\t{4}\n".format(
        len(contig_genome2), cov2, acc2, has_chimeric2, ",".join(contig_genome2)))

    # per-sequence lines: direct genome alignments of the trimmed input (if any)
    in_aligned_to_genome1 = os.path.join(dirname, "in.trimmed.fa." + genome1 + ".gff")
    d3 = {}
    if os.path.exists(in_aligned_to_genome1):
        d3, junk = read_cogent2_aligned_to_genome_gff(in_aligned_to_genome1)
    for seqid, hits in seq_info.iteritems():
        cogent_contigs = [h.chr for h in hits]
        # coverage-weighted identity over this sequence's cogent hits
        weighted_acc = sum(h.identity * h.coverage for h in hits) / sum(h.coverage for h in hits)
        f2.write("{0}\t{1}\t{2}\t{3}\t".format(seqid, dirname, ",".join(cogent_contigs), weighted_acc))
        if seqid in d3:
            genome_hits = d3[seqid]
            scaffolds = [h.chr for h in genome_hits]
            cov = sum(h.coverage for h in genome_hits)
            weighted_acc = sum(h.identity * h.coverage for h in genome_hits) / cov
            f2.write("{0}\t{1}\t{2}\t{3}\n".format(",".join(scaffolds), len(scaffolds), cov, weighted_acc))
        else:
            f2.write("NA\t0\tNA\tNA\n")
#!/usr/bin/env python
"""
Convert a GMAP GFF file (argv[1]) into collapsed GFF format, writing
<input-without-extension>.collapsed.gff alongside it.
"""
import os, sys
import GFF

input_filename = sys.argv[1]  # was `input`, which shadowed the builtin
# splitext is safer than rfind('.'): a name with no extension used to have
# its last character silently chopped, and a dot in a directory name matched
output_filename = os.path.splitext(input_filename)[0] + '.collapsed.gff'
f = open(output_filename, 'w')
for r in GFF.gmapGFFReader(input_filename):
    GFF.write_collapseGFF_format(f, r)
f.close()