def sanitize_gff_file(gff_fname, in_memory=True, in_place=False):
    """ Sanitize a GFF file. """
    # Resolve the input to a gffutils FeatureDB: the argument may already be
    # a database file, otherwise a database is built (in memory or on disk).
    if is_gff_db(gff_fname):
        db = gffutils.FeatureDB(gff_fname)
    elif in_memory:
        db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    else:
        db = get_gff_db(gff_fname)
    # Destination: rewrite the source file itself, or stream to stdout.
    if in_place:
        gff_out = gffwriter.GFFWriter(gff_fname, in_place=in_place)
    else:
        gff_out = gffwriter.GFFWriter(sys.stdout)
    sanitized_db = sanitize_gff_db(db)
    # Emit every gene (with its child records) from the sanitized database.
    for gene_rec in sanitized_db.all_features(featuretype="gene"):
        gff_out.write_gene_recs(sanitized_db, gene_rec.id)
    gff_out.close()
def add_EVM(final_update, wd, consensus_mapped_gff3):
    """Combine EVM models missing from the GMAP consensus with all GMAP genes.

    Writes the mRNAs from ``final_update`` whose IDs do not appear (as the
    ``_``-stripped stem of a gene ID) in ``consensus_mapped_gff3``, plus every
    gene from ``consensus_mapped_gff3``, into a single temporary GFF3 file.

    :param final_update: path to the EVM/update GFF3 file
    :param wd: working directory where the temporary output is created
    :param consensus_mapped_gff3: path to the GMAP consensus GFF3 file
    :return: path of the combined temporary GFF3 file
    """
    db_evm = gffutils.create_db(final_update, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_evm = [mrna.attributes["ID"][0] for mrna in db_evm.features_of_type("mRNA")]
    db_gmap = gffutils.create_db(consensus_mapped_gff3, ':memory:', merge_strategy='create_unique', keep_order=True)
    ids_gmap_full = [gene.attributes["ID"][0] for gene in db_gmap.features_of_type("gene")]
    # GMAP gene IDs carry an underscore suffix; strip it so they compare
    # against EVM mRNA IDs.  A set gives O(1) membership tests below.
    ids_gmap = {gene.attributes["ID"][0].split("_")[0] for gene in db_gmap.features_of_type("gene")}
    uniq_evm = [evm for evm in ids_evm if evm not in ids_gmap]
    # Collect the parent (gene) IDs of the EVM-only mRNAs, de-duplicated.
    mRNA = []
    for evm in uniq_evm:
        for line in db_evm.parents(evm, order_by='start'):
            mRNA.append(line.attributes["ID"][0])
    mRNA_uniq = list(set(mRNA))
    outfile = tempfile.NamedTemporaryFile(delete=False, prefix="additional.1.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile.name)
    # Children first, then the parent record itself, for each source database.
    for name in mRNA_uniq:
        for i in db_evm.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_evm[name])
    for name in ids_gmap_full:
        for i in db_gmap.children(name, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gmap[name])
    gff_out_s.close()
    return outfile.name
def add_removed_evm(pasa, exon, wd):
    """ here the clusters of sequence from the same locus are prepared """
    # In-memory databases for the PASA models and the exon-evidence models.
    pasa_db = gffutils.create_db(pasa, ':memory:', merge_strategy='create_unique', keep_order=True)
    exon_db = gffutils.create_db(exon, ':memory:', merge_strategy='create_unique', keep_order=True)
    pasa_mrna_ids = [m.attributes["ID"][0] for m in pasa_db.features_of_type("mRNA")]
    exon_gene_ids = [g.attributes["ID"][0] for g in exon_db.features_of_type("gene")]
    # Exon-side mRNA IDs reduced to their stem before the first underscore.
    exon_mrna_stems = [m.attributes["ID"][0].split("_")[0] for m in exon_db.features_of_type("mRNA")]
    # PASA mRNAs not represented in the exon set.
    novel_mrnas = []
    for mrna_id in pasa_mrna_ids:
        if mrna_id not in exon_mrna_stems:
            novel_mrnas.append(mrna_id)
    # De-duplicated parent gene IDs of those novel mRNAs.
    parent_genes = set()
    for mrna_id in novel_mrnas:
        for parent in pasa_db.parents(mrna_id):
            parent_genes.add(parent.attributes["ID"][0])
    combined = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    writer = gffwriter.GFFWriter(combined.name)
    # Children first, then the gene record, for each source database.
    for gene_id in list(parent_genes):
        for child in pasa_db.children(gene_id, order_by='start'):
            writer.write_rec(child)
        writer.write_rec(pasa_db[gene_id])
    for gene_id in exon_gene_ids:
        for child in exon_db.children(gene_id, order_by='start'):
            writer.write_rec(child)
        writer.write_rec(exon_db[gene_id])
    writer.close()
    return combined.name
def longest_cds(gff_file, gff_filerc, verbose, wd, filename):
    """Choose, per mRNA, the strand-version (forward vs reverse-complement GFF)
    whose CDS lengths compare larger, and write the chosen records to ``filename``.

    :param gff_file: GFF3 of forward-orientation models
    :param gff_filerc: GFF3 of reverse-complement-orientation models
    :param verbose: if truthy, print the output filename
    :param wd: working directory (unused here; kept for call-site compatibility)
    :param filename: output GFF3 path
    :return: ``filename``
    """
    db = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique',
                            keep_order=True, transform=transform)
    dbrc = gffutils.create_db(gff_filerc, ':memory:', merge_strategy='create_unique',
                              keep_order=True, transform=transform)
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db.features_of_type('mRNA')]
    list_mrna_rc = [mRNA.attributes["ID"][0] for mRNA in dbrc.features_of_type('mRNA')]
    list_all = list(set(list_mrna + list_mrna_rc))
    list_db = []
    list_db_rc = []
    for mrna_id in list_all:
        cds_len = [int(i.end) - int(i.start)
                   for i in db.children(mrna_id, featuretype='CDS', order_by='start')]
        cds_len_rc = [int(i.end) - int(i.start)
                      for i in dbrc.children(mrna_id, featuretype='CDS', order_by='start')]
        # NOTE(review): this compares the two CDS-length *lists*
        # lexicographically, not their summed lengths — presumably total
        # length was intended; confirm before changing.  The original
        # ``==`` and ``>`` branches performed the same action, so they are
        # merged into a single ``>=`` test (behavior unchanged).
        if cds_len >= cds_len_rc:
            list_db.append(mrna_id)
        else:
            list_db_rc.append(mrna_id)
    gff_out = gffwriter.GFFWriter(filename)
    # For each winner, emit CDS children, the mRNA itself, its gene parent,
    # then exon children — from whichever database it was chosen from.
    for evm in list_db:
        if evm in list_mrna:
            for i in db.children(evm, featuretype='CDS', order_by='start'):
                gff_out.write_rec(i)
            gff_out.write_rec(db[evm])
            for i in db.parents(evm, featuretype='gene', order_by='start'):
                gff_out.write_rec(i)
            for i in db.children(evm, featuretype='exon', order_by='start'):
                gff_out.write_rec(i)
    for evm in list_db_rc:
        if evm in list_mrna_rc:
            for i in dbrc.children(evm, featuretype='CDS', order_by='start'):
                gff_out.write_rec(i)
            gff_out.write_rec(dbrc[evm])
            for i in dbrc.parents(evm, featuretype='gene', order_by='start'):
                gff_out.write_rec(i)
            for i in dbrc.children(evm, featuretype='exon', order_by='start'):
                gff_out.write_rec(i)
    gff_out.close()
    if verbose:
        print(filename)
    return filename
def genename_last(gff_filename, prefix, verbose, wd, dict_ref_name, step):
    """Rename chromosomes back to reference names and produce the final GFF3.

    Runs ``gt gff3`` (GT_GFF3) over ``gff_filename``, rewrites every record's
    chromosome via ``dict_ref_name``, then runs ``gt`` again (GT_RETAINID)
    into ``Final.evm.update.gff3`` or ``Final.LoReAn.update.gff3`` depending
    on ``step``.

    :param gff_filename: input GFF3 path
    :param prefix: gene-name prefix, stored in the module-global ``prefix_name``
                   (read by the ``transform_name`` callback)
    :param verbose: if truthy, echo the commands executed and the result path
    :param wd: working directory for temporary and final files
    :param dict_ref_name: mapping of internal chromosome names -> reference names
    :param step: string inspected for the substrings "pasa" / "lorean"
    :return: path of the final GFF3 file
    """
    # transform_name (passed to create_db below) reads this module global.
    global prefix_name
    prefix_name = prefix
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_GFF3 % gff_filename
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()
    db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique',
                             keep_order=True, transform=transform_name)
    list_mrna = [mRNA.attributes["ID"][0] for mRNA in db1.features_of_type('mRNA')]
    out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread", suffix=".gff3", dir=wd)
    gff_out = gffwriter.GFFWriter(out_gff.name)
    # Set for O(1) membership; genes are still written in first-seen order.
    gene_name = set()
    for evm in list_mrna:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
        i = db1[evm]
        if i.chrom in dict_ref_name:
            i.chrom = dict_ref_name[i.chrom]
        gff_out.write_rec(i)
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            id_gene = i.attributes['ID'][0]
            # A gene shared by several mRNAs is written only once.
            if id_gene not in gene_name:
                gff_out.write_rec(i)
                gene_name.add(id_gene)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            if i.chrom in dict_ref_name:
                i.chrom = dict_ref_name[i.chrom]
            gff_out.write_rec(i)
    gff_out.close()
    # NOTE(review): if `step` contains neither "pasa" nor "lorean",
    # `out_name` is never bound and the return below raises NameError —
    # presumably callers always pass one of the two; confirm.
    if "pasa" in step:
        out_name = os.path.join(wd, "Final.evm.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if "lorean" in step:
        out_name = os.path.join(wd, "Final.LoReAn.update.gff3")
        with open(out_name, "w") as fh:
            gt_com = GT_RETAINID % out_gff.name
            if verbose:
                sys.stderr.write('Executing: %s\n\n' % gt_com)
            gt_call = subprocess.Popen(gt_com, stdout=fh, stderr=err, shell=True)
            gt_call.communicate()
    if verbose:
        print(out_name)
    return out_name
def test_gffwriter(): """ Test GFFWriter. """ print("Testing GFF writer..") fn = gffutils.example_filename("unsanitized.gff") # Make a copy of it as temporary named file temp_f = tempfile.NamedTemporaryFile(delete=False) temp_fname_source = temp_f.name shutil.copy(fn, temp_fname_source) # Now write file in place source_first_line = open(temp_fname_source, "r").readline().strip() assert (not source_first_line.startswith("#GFF3")), \ "unsanitized.gff should not have a gffutils-style header." db_in = gffutils.create_db(fn, ":memory:", keep_order=True) # Fetch first record rec = six.next(db_in.all_features()) ## ## Write GFF file in-place test ## print("Testing in-place writing") gff_out = gffwriter.GFFWriter(temp_fname_source, in_place=True, with_header=True) gff_out.write_rec(rec) gff_out.close() # Ensure that the file was written with header rewritten = open(temp_fname_source, "r") new_header = rewritten.readline().strip() assert new_header.startswith("#GFF3"), \ "GFFWriter serialized files should have a #GFF3 header." print(" - Wrote GFF file in-place successfully.") ## ## Write GFF file to new file test ## print("Testing writing to new file") new_file = tempfile.NamedTemporaryFile(delete=False) gff_out = gffwriter.GFFWriter(new_file.name) gff_out.write_rec(rec) gff_out.close() new_line = open(new_file.name, "r").readline().strip() assert new_line.startswith("#GFF3"), \ "GFFWriter could not write to a new GFF file." print(" - Wrote to new file successfully.")
def output_combined_gff_events(sg_gff_fname, sg_events, new_gff_fname, output_gff_fname,
                               genome, sg_label="sg2008", source_attr="source"):
    """
    Output the given events from sg_gff_fname and all of the entries
    from new_gff_fname into a single file.

    Mark SG events with sg_label.
    """
    # FIX: the original body used Python 2 `print` statements and the
    # `raise Exception, msg` form, which are syntax errors under Python 3
    # (the rest of this module uses print() calls).
    gff_out_file = open(output_gff_fname, "w")
    gff_out = gffwriter.GFFWriter(gff_out_file)
    # SG records to output
    sg_records = []
    # New records to output
    new_records = []
    # Load up gffutils databases for SG and new events
    new_db = gffutils.create_db(new_gff_fname, ":memory:", verbose=False)
    sg_db = gffutils.create_db(sg_gff_fname, ":memory:", verbose=False)
    #sg_gff_genes = sg_db.features_of_type("gene")
    new_gff_genes = new_db.features_of_type("gene")
    # Output new events first
    new_ids = {}
    for gene_rec in new_gff_genes:
        gene_id = gene_rec.id
        gff_out.write_gene_recs(new_db, gene_id)
        new_ids[gene_id] = True
    # Output SG events
    for sg_gene_id in sg_events:
        if sg_gene_id in new_ids:
            print("Skipping %s" % (sg_gene_id))
            # If this has an identical ID to one of the new annotation
            # events, skip it
            continue
        # Get all SG event records
        sg_recs = get_event_gff_recs(sg_gene_id, sg_db)
        # Add source attribute to each record
        for rec in sg_recs:
            rec.attributes[source_attr] = [sg_label]
            gff_out.write_rec(rec)
    gff_out.close()
    # Sanitize the file
    sanitize_cmd = \
        "gffutils-cli sanitize %s --in-memory --in-place" % (output_gff_fname)
    print("Sanitizing merged GFF...")
    ret_val = os.system(sanitize_cmd)
    if ret_val != 0:
        raise Exception("Sanitization failed on %s" % (output_gff_fname))
    # Annotate the file
    print("Annotating merged GFF...")
    gffutils_helpers.annotate_gff(output_gff_fname, genome)
def pep_seq(myFasta, final_evm):
    """Translate each mRNA's concatenated CDS and write a de-duplicated GFF3.

    For every mRNA in ``final_evm``, the CDS sequences are concatenated,
    translated (honoring the phase of the first CDS), and keyed by protein
    string in ``fasta`` — so mRNAs yielding identical proteins collapse to
    one entry.  The surviving records are written to ``final_evm + ".gff3"``
    with CDS records duplicated as renamed "exon" records.

    :param myFasta: genome FASTA path, handed to gffutils ``Feature.sequence``
    :param final_evm: input GFF3 path
    :return: path of the written GFF3 file
    """
    fasta = {}
    db = gffutils.create_db(final_evm, ':memory:', merge_strategy="create_unique", keep_order=True)
    gff_file = final_evm + ".gff3"
    gff_out = gffwriter.GFFWriter(gff_file)
    for t in db.features_of_type('mRNA', order_by='start'):
        position = []
        seq_combined = ''
        j = 0
        for i in db.children(t, featuretype='CDS', order_by='start'):
            j += 1
            if j == 1:
                # Phase (GFF column 8) of the first (leftmost) CDS.
                pphase = i[7]
            seq = i.sequence(myFasta, use_strand=False)
            seq_combined += seq
            position = position + [i.start, i.stop]
        # NOTE(review): if an mRNA has no CDS children, `pphase` (and `i`)
        # from the previous iteration leak into the code below — presumably
        # every mRNA here has at least one CDS; confirm.
        seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
        #print(t.attributes["ID"][0])
        #print(seq_combined.seq)
        if t.strand == '-':
            # On minus strand, use the phase of the *last* CDS (the 5' one
            # after reverse-complementing).
            pphase = i[7]
            seq_combined = seq_combined.reverse_complement()
        # Trim 0/1/2 leading bases according to phase before translating.
        if pphase == "0" or pphase == ".":
            seq_transl = seq_combined.translate()
        elif pphase == "1":
            seq_transl = seq_combined[1:].translate()
        elif pphase == "2":
            seq_transl = seq_combined[2:].translate()
        # NOTE(review): `description` is set to a list, not a string —
        # Biopython normally expects str here; verify downstream use.
        seq_transl.description = position
        seq_transl.id = t.id
        position = sorted(position)
        # Tighten the mRNA span to the CDS extent.
        t.start = position[0]
        t.stop = position[-1]
        # Keyed by protein sequence: identical proteins overwrite earlier ones.
        fasta[str(seq_transl.seq)] = [seq_transl, t]
    count = 0
    for key in fasta:
        for i in db.parents(fasta[key][1], featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
            # NOTE(review): these assignments happen *after* the record was
            # written, so they do not affect the output — possibly the write
            # was meant to come after; confirm intent.
            i.start = fasta[key][1].start
            i.stop = fasta[key][1].stop
        gff_out.write_rec(fasta[key][1])
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        # Second pass over the same CDS records: re-emit them as "exon"
        # features with uniquified IDs.
        for i in db.children(fasta[key][1], featuretype='CDS', order_by='start'):
            count += 1
            i.featuretype = "exon"
            if "ID" in i.attributes:
                i.attributes["ID"][0] = i.attributes["ID"][0] + "-" + str(count)
            else:
                i.attributes["ID"] = ["exon" + "-" + str(count)]
            gff_out.write_rec(i)
    return (gff_file)
def genename_lorean(gff_filename, verbose, wd):
    """Clean up a GFF3 via gffread and two `gt gff3` passes.

    Pipeline: GFFREAD_M on ``gff_filename`` -> `gt gff3 -retainids` tidy ->
    rebuild via gffutils (transform_cds) and rewrite gene-by-gene ->
    final `gt gff3 -retainids` tidy.  Returns the final file's path.
    """
    # Step 1: gffread pass.
    gffread_out = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".gff3", dir=wd)
    log_handle = tempfile.NamedTemporaryFile(delete=False, prefix="uniq.ID.pasa.", suffix=".log", dir=wd)
    err_handle = tempfile.NamedTemporaryFile(delete=False, prefix="uniq.ID.pasa.", suffix=".err", dir=wd)
    cmd = GFFREAD_M % (gffread_out.name, gff_filename)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    proc_gffread = subprocess.Popen(cmd, cwd=wd, shell=True, stdout=log_handle, stderr=err_handle)
    proc_gffread.communicate()
    # Step 2: first genometools tidy pass.
    tidy_out = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="gt_gff3.", suffix=".gff3", dir=wd)
    log_handle = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".log", dir=wd)
    err_handle = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd, suffix=".last.gt_gff3.err")
    gt_cmd = 'gt gff3 -retainids -sort -force -tidy -o %s %s' % (tidy_out.name, gffread_out.name)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_cmd)
    proc_gt = subprocess.Popen(gt_cmd, stdout=log_handle, stderr=err_handle, shell=True)
    proc_gt.communicate()
    # Step 3: rebuild through gffutils and write genes with their children.
    tidy_db = gffutils.create_db(tidy_out.name, ':memory:', merge_strategy='create_unique',
                                 keep_order=True, transform=transform_cds)
    rewritten = tempfile.NamedTemporaryFile(delete=False, prefix="uniq.ID.pasa.final.", suffix=".gff3", dir=wd)
    writer = gffwriter.GFFWriter(rewritten.name)
    for gene in tidy_db.features_of_type("gene"):
        writer.write_rec(tidy_db[gene])
        for child in tidy_db.children(gene, order_by='start'):
            writer.write_rec(child)
    # NOTE(review): the writer is not closed before gt reads the file below —
    # works if GFFWriter flushes on write, but worth confirming.
    if verbose:
        print(rewritten.name)
    # Step 4: final genometools tidy pass.
    final_out = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="gt_gff3.", suffix=".gff3", dir=wd)
    log_handle = tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".log", dir=wd)
    err_handle = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd, suffix=".last.gt_gff3.err")
    gt_cmd = 'gt gff3 -retainids -sort -force -tidy -o %s %s' % (final_out.name, rewritten.name)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_cmd)
    proc_gt = subprocess.Popen(gt_cmd, stdout=log_handle, stderr=err_handle, shell=True)
    proc_gt.communicate()
    return final_out.name
def exonerate(ref, gff_file, proc, wd, verbose):
    """Orient incomplete gene models with exonerate; pass complete ones through.

    Extracts exon/protein FASTA via gffread, finds the longest ATG-started ORF
    (both strands) for proteins that do not start with "M", runs ``runExonerate``
    on those in a process pool, and writes complete/untouched models straight
    from ``gff_file``.  All partial outputs are concatenated into
    ``wd + '/oriented.oldname.gff3'``, which is returned.

    NOTE(review): a second ``def exonerate`` appears later in this module and
    shadows this one at import time — confirm which definition is intended.

    :param ref: genome FASTA path
    :param gff_file: input GFF3 of gene models
    :param proc: number of worker processes
    :param wd: working directory (string, used via concatenation)
    :param verbose: if truthy, echo the shell commands executed
    :return: path of the concatenated oriented GFF3
    """
    ##THIS removes the warning. the check of the longest protein was giving a warining. if Biopython change, this could be a problem
    warnings.filterwarnings("ignore")
    exon_file_out = gff_file + ".exons.fasta"
    prot_file_out = gff_file + ".prot.fasta"
    errorFile = gff_file + ".gffread_err.log"
    logFile = exon_file_out + "gffread_log.log"
    com = GFFREAD_W % (ref, exon_file_out, prot_file_out, gff_file)
    fasta_file_outfile = open(logFile, "w")
    errorFilefile = open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=fasta_file_outfile, cwd=wd, stderr=errorFilefile, shell=True)
    call.communicate()
    fasta_file_outfile.close()
    errorFilefile.close()
    listComplete = []
    dictIncomplete = {}
    dictFastaProt = {}
    longestProt = []
    listTotal = []
    listSingleExons = []
    testDict = {}
    # Split proteins into complete (start with M) and incomplete.
    for record in SeqIO.parse(prot_file_out, "fasta"):
        listTotal.append(record.id)
        if record.seq.startswith("M"):  # and record.seq.endswith("."):
            listComplete.append(record.id)
        else:
            dictIncomplete[record.id] = record.id
    # For incomplete models, scan the transcript (both strands) for the
    # longest ATG-initiated ORF and keep the better protein.
    for record in SeqIO.parse(exon_file_out, "fasta"):
        listFields = record.description.split(' ')
        for elem in listFields:
            if elem.startswith('exons'):
                exonNumber = elem.split(",")
                ## changed here for all genes
                if (len(exonNumber)) > 0:
                    listSingleExons.append(record.id)
                    if record.id in dictIncomplete:
                        newrecord = record.reverse_complement()
                        # Forward strand: longest ORF from any ATG.
                        input_seq = str(record.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0, )
                        for m in startP.finditer(nuc):
                            if len(Seq.Seq(nuc)[m.start():].translate(to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(to_stop=True)
                                longest = [len(pro), m.start(), str(pro),
                                           nuc[m.start():m.start() + len(pro) * 3 + 3]]
                        # len == 4 means at least one ORF was found (list
                        # replaced the initial 1-tuple sentinel).
                        if len(longest) == 4:
                            record.seq = Seq.Seq(longest[2])
                            dictFastaProt[record.id] = record
                        else:
                            dictFastaProt[record.id] = record
                        # Reverse strand: same scan; keep it only if longer
                        # than what the forward pass stored.
                        input_seq = str(newrecord.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0, )
                        for m in startP.finditer(nuc):
                            if len(Seq.Seq(nuc)[m.start():].translate(to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(to_stop=True)
                                longest = [len(pro), m.start(), str(pro),
                                           nuc[m.start():m.start() + len(pro) * 3 + 3]]
                        if len(longest) == 4:
                            if record.id in dictFastaProt:
                                if (len((dictFastaProt[record.id]).seq)) < (len(longest[2])):
                                    record.seq = Seq.Seq(longest[2])
                                    dictFastaProt[record.id] = record
                            elif len(longest) == 4:
                                record.seq = Seq.Seq(longest[2])
                                dictFastaProt[record.id] = record
                        else:
                            dictFastaProt[record.id] = record
    for mod in dictFastaProt:
        longestProt.append(dictFastaProt[mod])
    prot_file_out_mod = prot_file_out + ".mod.fasta"
    SeqIO.write(longestProt, prot_file_out_mod, "fasta")
    # Per-model protein + genome-slice FASTAs, then exonerate in a pool.
    manage = Manager()
    queue = manage.Queue()
    pool = Pool(processes=int(proc), maxtasksperchild=10000)
    commandList = []
    listShort = []
    record_dict = SeqIO.to_dict(SeqIO.parse(exon_file_out, "fasta"))
    for key in dictFastaProt:
        if key in record_dict:
            listShort.append(key)
            outputFilenameProt = wd + key + '.prot.fasta'
            SeqIO.write(dictFastaProt[key], outputFilenameProt, "fasta")
            listFields = record_dict[key].description.split(' ')
            for elem in listFields:
                outputFilename = wd + key + '.genome.fasta'
                bedFile = wd + key + '.genome.bed'
                # The locus field looks like "loc:<chr>|<start>-<end>|<strand>".
                if (elem.startswith('loc') and elem.endswith('+')) or (elem.startswith('loc') and elem.endswith('-')):
                    coordsList = elem.split('|', -2)
                    chrN = coordsList[0].split(':')
                    coord = coordsList[1].split('-')
                    locus = '\t'.join([chrN[1], coord[0], coord[1]])
                    locus = locus + '\n'
                    bedhandler = open(bedFile, 'w')
                    bedhandler.write(locus)
                    bedhandler.close()
                    com = BEDTOOLS_GET_FASTA % (ref, bedFile, outputFilename)
                    if verbose:
                        sys.stderr.write('Executing: %s\n\n' % com)
                    call = subprocess.Popen(
                        com, cwd=wd, shell=True)  # , stdout= fasta_file_outfile , stderr=errorFilefile)
                    call.communicate()
                    combList = [outputFilenameProt, outputFilename, verbose, queue]
                    commandList.append(combList)
    results = pool.map_async(runExonerate, commandList)
    # Progress bar driven by how many workers have reported to the queue.
    with progressbar.ProgressBar(max_value=len(commandList)) as bar:
        while not results.ready():
            size = queue.qsize()
            bar.update(size)
            time.sleep(1)
    outputFilenameGff = wd + 'mRNA_complete_gene_Annotation.gff3'
    # NOTE(review): reads the pool's private `_value` instead of
    # `results.get()` — relies on multiprocessing internals.
    exonerate_files = results._value + [outputFilenameGff]
    # Models neither complete nor reoriented (set symmetric difference)
    # are passed through unchanged together with the complete ones.
    listInGff = listComplete + listShort
    listAsbent = sorted(set(list(set(listTotal) ^ set(listInGff))))
    listCompleteAll = listAsbent + listComplete
    gff_out = gffwriter.GFFWriter(outputFilenameGff)
    db1 = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True)
    for evm in listCompleteAll:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    gff_out.close()
    # Concatenate every partial GFF into the final oriented file.
    orintedFIleN = wd + '/oriented.oldname.gff3'
    with open(orintedFIleN, 'wb') as wfd:
        for f in exonerate_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    return orintedFIleN
def longest(gff_file, fasta, proc, wd, verbose):
    """Recompute ORFs with TransDecoder and rescue models it warns about.

    Pipeline: `gt gff3` intron-tidy -> GFF3-to-GTF -> cDNA FASTA + alignment
    GFF3 (cufflinks_* PASA utilities) -> TransDecoder.LongOrfs / .Predict ->
    cdna_alignment_orf_to_genome_orf.pl.  Models that triggered "Warning"
    lines are re-emitted from the original annotation, and both partial files
    are concatenated into ``wd + 'finalAnnotation.Transdecoder.gff3'``.

    :param gff_file: input GFF3 path
    :param fasta: genome FASTA path
    :param proc: CPU count handed to TransDecoder.Predict
    :param wd: working directory (string, used via concatenation)
    :param verbose: if truthy, echo the shell commands executed
    :return: path of the concatenated final GFF3
    """
    outputFilenameLeft = tempfile.NamedTemporaryFile(delete=False, dir=wd, prefix="longest.")
    # Receives the rescued (warned-about) models near the end.
    gff_out = gffwriter.GFFWriter(outputFilenameLeft.name)
    # Step 1: genometools intron-tidy pass.
    gt_com = GT_GFF3_INTRON % gff_file
    gff_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                   prefix="longest.", suffix=".out")  #open(gff_file_out, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                prefix="longest.", suffix=".err")  #open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=gff_file_outfile, stderr=errorFilefile, shell=True)
    gt_call.communicate()
    # Step 2: GFF3 -> GTF conversion.
    gt_com = GT_GFF3TOGTF % gff_file_outfile.name
    gtf_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                   prefix="longest.", suffix=".out")  #open(gtf_file_out, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                prefix="longest.", suffix=".err")  #open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=gtf_file_outfile, stderr=errorFilefile, shell=True)
    gt_call.communicate()
    # Database of the tidied annotation; used for the rescue pass below.
    db1 = gffutils.create_db(gff_file_outfile.name, ':memory:',
                             merge_strategy='create_unique', keep_order=True)
    # Step 3: transcript (cDNA) FASTA from the GTF.
    fasta_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                     prefix="longest.", suffix=".out")  #open(fasta_file_out, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                prefix="longest.", suffix=".err")  #open(errorFile, "w")
    com = 'cufflinks_gtf_genome_to_cdna_fasta.pl %s %s' % (gtf_file_outfile.name, fasta)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=fasta_file_outfile, stderr=errorFilefile, shell=True)
    call.communicate()
    # Step 4: alignment GFF3 from the GTF.  NOTE: `gff_file_outfile` is
    # rebound here — later references point to this alignment file, and the
    # "Warning" scan below reads *this* step's `errorFilefile`.
    gff_file_outfile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                   prefix="longest.", suffix=".out")  #open(gff_file_out_u, "w")
    errorFilefile = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                prefix="longest.", suffix=".err")  #open(errorFile, "w")
    com = 'cufflinks_gtf_to_alignment_gff3.pl %s' % gtf_file_outfile.name
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile, stderr=errorFilefile, shell=True)
    call.communicate()
    # Step 5: TransDecoder ORF search on the cDNA FASTA.
    gff_file_outfile_1 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                     prefix="longest.", suffix=".out")  #open(gff_file_out, "w")
    errorFilefile_1 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                  prefix="longest.", suffix=".err")  #open(errorFile, "w")
    com = 'TransDecoder.LongOrfs -m 10 -t %s' % fasta_file_outfile.name
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile_1, stderr=errorFilefile_1, cwd=wd, shell=True)
    call.communicate()
    # Step 6: TransDecoder prediction (single best ORF per transcript).
    gff_file_outfile_2 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                     prefix="longest.", suffix=".out")  #open(gff_file_out, "w")
    errorFilefile_2 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                  prefix="longest.", suffix=".err")  #open(errorFile, "w")
    wd_fasta = fasta_file_outfile.name
    com = 'TransDecoder.Predict --single_best_orf --cpu %s --retain_long_orfs 10 -t %s' % (proc, wd_fasta)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile_2, stderr=errorFilefile_2, cwd=wd, shell=True)
    call.communicate()
    # Step 7: map the transcript ORFs back to genome coordinates.
    gff_file_outfile_3 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                     prefix="longest.")  #open(outputFilename, "w")
    errorFilefile_3 = tempfile.NamedTemporaryFile(delete=False, mode='w', dir=wd,
                                                  prefix="longest.")  #open(errorFile, "w")
    transdecoder = tempfile.NamedTemporaryFile(delete=False)
    com = 'cdna_alignment_orf_to_genome_orf.pl %s %s %s' % (wd_fasta + '.transdecoder.gff3',
                                                            gff_file_outfile.name, wd_fasta)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=gff_file_outfile_3, stderr=errorFilefile_3, cwd=wd, shell=True)
    call.communicate()
    # Rescue pass: models mentioned in "Warning" lines are copied from the
    # original (tidied) annotation into the "left" file.
    listErr = []
    err_file = open(errorFilefile.name, "r")
    for line in err_file:
        if line.startswith("Warning"):
            listErr.append(("mRNA" + line.split("::")[1]).split(".")[0])
    listErrUniq = list(set(listErr))
    for evm in listErrUniq:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    # NOTE(review): `gff_out` is never closed before the concatenation —
    # works only if GFFWriter flushes on write; confirm.
    gff_files = [outputFilenameLeft.name, gff_file_outfile.name]
    outputFilenameFinal = wd + 'finalAnnotation.Transdecoder.gff3'
    with open(outputFilenameFinal, 'wb') as wfd:
        for f in gff_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    return outputFilenameFinal
def genename_evm(gff_filename, verbose, wd):
    """Renumber gene/mRNA/exon/CDS IDs in EVM style, per chromosome.

    Genes become ``evm.TU.<chrom>.<n>`` and mRNAs ``evm.model.<chrom>.<n>``,
    with ``n`` counted per chromosome; exon/CDS IDs are derived from their
    new mRNA ID.  The rewritten file is passed through `gt` (GT_GFF3_R) and
    that output's path is returned.

    :param gff_filename: input GFF3 path
    :param verbose: if truthy, echo the shell commands and the result path
    :param wd: working directory for temporary files
    :return: path of the gt-processed, renamed GFF3
    """
    gene_evm = "evm.TU."
    mRNA_evm = "evm.model."
    # Tidy the input through genometools first.
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_GFF3 % gff_filename
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()
    db1 = gffutils.create_db(out.name, ':memory:', merge_strategy='create_unique', keep_order=True)
    list_gene = [gene.attributes["ID"][0] for gene in db1.features_of_type('gene')]
    list_chr = [chr.chrom for chr in db1.features_of_type('gene')]
    # Independent per-chromosome counters for gene and mRNA numbering.
    chr_count_gene = {}
    chr_count_mRNA = {}
    chrs = list(set(list_chr))
    for elm in chrs:
        chr_count_gene[elm] = 0
        chr_count_mRNA[elm] = 0
    out_gff = tempfile.NamedTemporaryFile(delete=False, prefix="gffread_parse", suffix=".gff3", dir=wd)
    gff_out = gffwriter.GFFWriter(out_gff.name)
    for evm in list_gene:
        exon_count = 0
        cds_count = 0
        gene_chr = db1[evm].chrom
        gene = db1[evm]
        count_gene = chr_count_gene[gene_chr] + 1
        chr_count_gene[gene_chr] = count_gene
        id_new_gene = gene_evm + gene_chr + "." + str(count_gene)
        gene.attributes["ID"][0] = id_new_gene
        gff_out.write_rec(gene)
        for i in db1.children(evm, featuretype='mRNA', order_by='start'):
            mRNA = i
            # Children are still indexed in the db under the *old* mRNA ID.
            mRNA_old = i.attributes["ID"][0]
            count_mRNA = chr_count_mRNA[gene_chr] + 1
            chr_count_mRNA[gene_chr] = count_mRNA
            id_new_mRNA = mRNA_evm + gene_chr + "." + str(count_mRNA)
            mRNA.attributes["Parent"][0] = id_new_gene
            mRNA.attributes["ID"][0] = id_new_mRNA
            gff_out.write_rec(mRNA)
            for e in db1.children(mRNA_old, featuretype='exon', order_by='start'):
                exon_count += 1
                exon = e
                exon.attributes["Parent"][0] = mRNA.attributes["ID"][0]
                # NOTE(review): assigns a plain str where other attribute
                # values in this file are lists — gffutils may normalize
                # this, but confirm the serialized output is as expected.
                exon.attributes["ID"] = id_new_mRNA + ".exon" + str(exon_count)
                gff_out.write_rec(exon)
            for c in db1.children(mRNA_old, featuretype='CDS', order_by='start'):
                cds_count += 1
                cds = c
                cds.attributes["Parent"][0] = mRNA.attributes["ID"][0]
                # Same str-vs-list note as for exons above.
                cds.attributes["ID"] = "cds." + str(cds_count) + "." + id_new_mRNA
                gff_out.write_rec(cds)
    gff_out.close()
    # Final genometools pass over the renamed file.
    out = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="new_name_update.",
                                      suffix=".gff3", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, mode="w", dir=wd)
    gt_com = GT_GFF3_R % out_gff.name
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=out, stderr=err, shell=True)
    gt_call.communicate()
    if verbose:
        print(out.name)
    return out.name
def exonerate(ref, gff_file, proc, wd, verbose):
    """Orient incomplete gene models with exonerate (refactored variant).

    Same overall flow as the earlier ``exonerate`` in this module — which
    this definition SHADOWS at import time (duplicate name; confirm which is
    intended) — but using ``get_fasta``/``pool.map`` instead of inline
    bedtools calls and ``map_async``, and finishing with a gffread +
    gffutils rewrite of the concatenated result.

    :param ref: genome FASTA path
    :param gff_file: input GFF3 of gene models
    :param proc: number of worker processes
    :param wd: working directory (string, used via concatenation)
    :param verbose: if truthy, echo the shell commands executed
    :return: path of the final rewritten GFF3 temp file
    """
    ##THIS removes the warning. the check of the longest protein was giving a warining. if Biopython change, this could be a problem
    warnings.filterwarnings("ignore")
    exon_file_out = gff_file + ".exons.fasta"
    prot_file_out = gff_file + ".prot.fasta"
    errorFile = gff_file + ".gffread_err.log"
    logFile = exon_file_out + "gffread_log.log"
    com = GFFREAD_W % (ref, exon_file_out, prot_file_out, gff_file)
    fasta_file_outfile = open(logFile, "w")
    errorFilefile = open(errorFile, "w")
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % com)
    call = subprocess.Popen(com, stdout=fasta_file_outfile, cwd=wd, stderr=errorFilefile, shell=True)
    call.communicate()
    fasta_file_outfile.close()
    errorFilefile.close()
    list_complete = []
    dict_incomplete = {}
    dict_fasta_prot = {}
    longest_prot = []
    list_total = []
    list_single_exons = []
    list_incomplete = []
    # Split proteins into complete (start with M) and incomplete.
    for record in SeqIO.parse(prot_file_out, "fasta"):
        list_total.append(record.id)
        if record.seq.startswith("M"):
            list_complete.append(record.id)
        else:
            list_incomplete.append(record.id)
            dict_incomplete[record.id] = record.id
    # For incomplete models, scan the transcript (both strands) for the
    # longest ATG-initiated ORF and keep the better protein.
    for record in SeqIO.parse(exon_file_out, "fasta"):
        list_fields = record.description.split(' ')
        for elem in list_fields:
            if elem.startswith('exons'):
                exon_number = elem.split(",")
                if (len(exon_number)) > 0:
                    list_single_exons.append(record.id)
                    if record.id in dict_incomplete:
                        newrecord = record.reverse_complement()
                        # Forward strand.
                        input_seq = str(record.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0,)
                        for m in startP.finditer(nuc):
                            if len(Seq.Seq(nuc)[m.start():].translate(to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(to_stop=True)
                                longest = [len(pro), nuc[m.start():m.start() + len(pro) * 3 + 3]]
                        # len == 2 means an ORF was found (list replaced the
                        # initial 1-tuple sentinel).
                        if len(longest) == 2:
                            record.seq = Seq.Seq(longest[1])
                            dict_fasta_prot[record.id] = record
                        else:
                            dict_fasta_prot[record.id] = record
                        # Reverse strand; keep only if longer than stored.
                        input_seq = str(newrecord.seq)
                        startP = re.compile('ATG')
                        nuc = input_seq.replace('\n', '')
                        longest = (0,)
                        for m in startP.finditer(nuc):
                            if len(Seq.Seq(nuc)[m.start():].translate(to_stop=True)) > longest[0]:
                                pro = Seq.Seq(nuc)[m.start():].translate(to_stop=True)
                                longest = [len(pro), nuc[m.start():m.start() + len(pro) * 3 + 3]]
                        if len(longest) == 2:
                            if record.id in dict_fasta_prot:
                                if (len((dict_fasta_prot[record.id]).seq)) < (len(longest[1])):
                                    record.seq = Seq.Seq(longest[1])
                                    dict_fasta_prot[record.id] = record
                            elif len(longest) == 2:
                                record.seq = Seq.Seq(longest[1])
                                dict_fasta_prot[record.id] = record
                        else:
                            dict_fasta_prot[record.id] = record
    for mod in dict_fasta_prot:
        longest_prot.append(dict_fasta_prot[mod])
    prot_file_out_mod = prot_file_out + ".mod.fasta"
    SeqIO.write(longest_prot, prot_file_out_mod, "fasta")
    # Per-model protein FASTA + BED; genome slices and exonerate run in a pool.
    pool = Pool(processes=int(proc))
    list_get_seq = []
    list_short = []
    record_dict = SeqIO.to_dict(SeqIO.parse(exon_file_out, "fasta"))
    for key in dict_fasta_prot:
        if key in record_dict:
            list_short.append(key)
            output_filename_prot = os.path.join(wd, key + '.prot.fasta')
            SeqIO.write(dict_fasta_prot[key], output_filename_prot, "fasta")
            list_fields = record_dict[key].description.split(' ')
            for elem in list_fields:
                output_filename = wd + key + '.genome.fasta'
                bedFile = wd + key + '.genome.bed'
                # The locus field looks like "loc:<chr>|<start>-<end>|<strand>".
                if (elem.startswith('loc') and elem.endswith('+')) or (elem.startswith('loc') and elem.endswith('-')):
                    coordsList = elem.split('|', -2)
                    chrN = coordsList[0].split(':')
                    coord = coordsList[1].split('-')
                    locus = '\t'.join([chrN[1], coord[0], coord[1]])
                    locus = locus + '\n'
                    bedhandler = open(bedFile, 'w')
                    bedhandler.write(locus)
                    bedhandler.close()
                    data = [ref, bedFile, output_filename, output_filename_prot, verbose, wd]
                    list_get_seq.append(data)
    results_get = pool.map(get_fasta, list_get_seq, chunksize=1)
    results = pool.map(runExonerate, results_get, chunksize=1)
    output_filename_gff = wd + 'mRNA_complete_gene_Annotation.gff3'
    gff_out = gffwriter.GFFWriter(output_filename_gff)
    db1 = gffutils.create_db(gff_file, ':memory:', merge_strategy='create_unique', keep_order=True)
    # Genes whose every mRNA is complete (complete minus incomplete parents)
    # are copied through unchanged.
    list_gene_complete = []
    list_gene_incomplete = []
    for mRNA in list_complete:
        for mRNA_ok in db1.parents(mRNA, featuretype='gene', order_by='start'):
            list_gene_complete.append(mRNA_ok.attributes["ID"][0])
    for mRNA in list_incomplete:
        for mRNA_ok in db1.parents(mRNA, featuretype='gene', order_by='start'):
            list_gene_incomplete.append(mRNA_ok.attributes["ID"][0])
    list_gene_complete = sorted(list(set(list_gene_complete)))
    list_gene_incomplete = sorted(list(set(list_gene_incomplete)))
    list_gene_ok_uniq = sorted(list(set(list_gene_complete) - set(list_gene_incomplete)))
    for evm in list_gene_ok_uniq:
        gff_out.write_rec(db1[evm])
        for i in db1.children(evm):
            gff_out.write_rec(i)
    exonerate_files = results + [output_filename_gff]
    gff_out.close()
    # Concatenate the exonerate outputs plus the pass-through file.
    orintedFIleN = wd + '/oriented.oldname.gff3'
    with open(orintedFIleN, 'wb') as wfd:
        for f in exonerate_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    # Final cleanup: gffread pass, then rewrite ordered by gene via gffutils.
    outfile_gff = tempfile.NamedTemporaryFile(delete=False, prefix="additional.2.", suffix=".gff3", dir=wd)
    log = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".log", dir=wd)
    err = tempfile.NamedTemporaryFile(delete=False, prefix="additional.", suffix=".err", dir=wd)
    cmd = GFFREAD_M_S % (outfile_gff.name, orintedFIleN)
    gffread = subprocess.Popen(cmd, cwd=wd, shell=True, stdout=log, stderr=err)
    gffread.communicate()
    db_gffread = gffutils.create_db(outfile_gff.name, ':memory:', merge_strategy='create_unique',
                                    keep_order=True, transform=transform_func)
    outfile_out = tempfile.NamedTemporaryFile(delete=False, prefix="additional.final.", suffix=".gff3", dir=wd)
    gff_out_s = gffwriter.GFFWriter(outfile_out.name)
    for gene in db_gffread.features_of_type("gene"):
        for i in db_gffread.children(gene, order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db_gffread[gene])
    # NOTE(review): `gff_out_s` is not closed before returning — confirm
    # GFFWriter flushes on write.
    return outfile_out.name
def gff_filter(final_evm, myFasta):
    """Rewrite an EVM GFF3 so that each gene keeps a non-redundant mRNA set.

    Steps:
      1. Copy ``final_evm`` to ``<final_evm>.mod.gff3``, renaming every
         "locus" token ("locus" -> "Parent" in column 9, "locus" -> "gene"
         in column 3); lines starting with "#" are dropped.
      2. Load the rewritten file into an in-memory gffutils database.
      3. Genes with a single mRNA are kept as-is.  For multi-mRNA genes the
         CDSs of each isoform are concatenated, translated (honouring the
         phase of the first CDS), and prefix-redundant translations are
         collapsed.
      4. Write gene/mRNA/CDS/exon records of the retained mRNAs to
         ``<final_evm>.final.gff3`` and return that path.

    :param final_evm: path to the input GFF3 file
    :param myFasta: path to the genome FASTA used to extract CDS sequences
    :return: path of the filtered GFF3 file
    """
    file_out = final_evm + ".mod.gff3"
    # Pass 1: textual rename so gffutils sees a conventional gene/Parent
    # hierarchy.
    with open(final_evm, "r") as fh, open(file_out, "w") as fhd:
        for line in fh:
            if not line.startswith("#"):
                elm = line.split("\t")
                elm[8] = elm[8].replace("locus", "Parent")
                elm[2] = elm[2].replace("locus", "gene")
                fhd.write("\t".join(elm))
    db = gffutils.create_db(file_out, ':memory:', merge_strategy="create_unique", keep_order=True)
    b = []            # genes with more than one mRNA child
    mrna_retain = []  # mRNA IDs of single-isoform genes, kept unconditionally
    for t in db.features_of_type('gene', order_by='start'):
        c = 0
        for i in db.children(t, featuretype='mRNA', order_by='start'):
            c += 1
        if c > 1:
            b.append(t)
        else:
            # NOTE(review): relies on the loop variable ``i`` leaking out of
            # the for-loop above; a gene with zero mRNA children would reuse
            # the previous gene's mRNA (or NameError on the first gene) —
            # TODO confirm inputs always have at least one mRNA per gene.
            mrna_retain.append(i.attributes["ID"][0])
    mrna_select = []
    # For each multi-isoform gene, translate every isoform and keep a
    # non-redundant set of protein products.
    for t in b:
        seq_multiple = []
        for a in db.children(t, featuretype='mRNA', order_by='start'):
            seq_combined = ''
            j = 0
            for i in db.children(a, featuretype='CDS', order_by='start'):
                j += 1
                if j == 1:
                    pphase = i[7]  # phase column of the first CDS
                seq = i.sequence(myFasta, use_strand=False)
                seq_combined += seq
            # NOTE(review): ``pphase`` (and ``i``) stay unbound if the mRNA
            # has no CDS children — assumes CDSs are always present.
            seq_combined = SeqRecord(Seq(seq_combined, generic_dna))
            if t.strand == '-':
                # Minus strand: the last CDS iterated carries the relevant
                # phase; re-read it before reverse-complementing.
                pphase = i[7]
                seq_combined = seq_combined.reverse_complement()
            # Trim 0/1/2 leading bases according to the phase, then translate.
            if pphase == "0" or pphase == ".":
                seq_transl = seq_combined.translate()
            elif pphase == "1":
                seq_transl = seq_combined[1:].translate()
            elif pphase == "2":
                seq_transl = seq_combined[2:].translate()
            seq_transl.id = a.attributes["ID"][0]
            if seq_multiple:
                # Compare against the proteins collected so far: replace a
                # prefix-redundant entry, otherwise append as a new product.
                seq_multiple_len = len(seq_multiple)
                c = 0
                while seq_multiple_len > c:
                    a1 = str(seq_multiple[c].seq)
                    a2 = str(seq_transl.seq)
                    # NOTE(review): both branches perform the same
                    # replacement, and the first condition ("a1 starts with
                    # a2" yet "a1 shorter than a2") can essentially never
                    # hold — looks like a latent bug; behaviour preserved
                    # as-is.
                    if a1.rstrip("*").startswith(a2.rstrip("*")) and len(a1) < len(a2):
                        seq_multiple[c] = seq_transl
                    elif a2.rstrip("*").startswith(a1.rstrip("*")) and len(a1) < len(a2):
                        seq_multiple[c] = seq_transl
                    else:
                        seq_multiple = seq_multiple + [seq_transl]
                    c+=1
            else:
                seq_multiple = [seq_transl]
        mrna_select = mrna_select + seq_multiple
    mrna_select_name = []
    for seq in mrna_select:
        mrna_select_name.append(seq.id)
    mrna_total = sorted(mrna_retain + mrna_select_name)
    gff_file = final_evm + ".final.gff3"
    gff_out = gffwriter.GFFWriter(gff_file)
    # Emit gene, mRNA, CDS and exon records for every retained mRNA.
    for key in mrna_total:
        for i in db.parents(key, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db[key])
        for i in db.children(key, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        for i in db.children(key, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    return(gff_file)
def strand(gff_file1, gff_file2, fasta, proc, gmap_wd, verbose):
    """Reconcile two GFF annotations (EVM side vs. GMAP side) per locus.

    Both inputs are first run through the ``GT_RETAINID`` command (a
    module-level genometools command template).  From each resulting
    database the mRNAs that have CDS records but no intron records are
    collected (i.e. intron-less models, assuming intron-bearing models also
    carry CDSs).  GMAP model names of the form ``<prefix>_<evmname>.<n>``
    are mapped back to their EVM name; for loci present in both sets a
    single GMAP model keeps the EVM record, while multiple GMAP models (or
    GMAP-only loci) keep the GMAP records.  The GMAP-side output is
    post-processed by ``longest`` and concatenated with the EVM-side output
    into ``<gmap_wd>finalAnnotation.Final.Comb.gff3``.

    :param gff_file1: first GFF3 (EVM side)
    :param gff_file2: second GFF3 (GMAP side)
    :param fasta: genome FASTA, forwarded to ``longest``
    :param proc: process count, forwarded to ``longest``
    :param gmap_wd: working directory for temp and output files
    :param verbose: echo the external commands to stderr
    :return: path of the combined GFF3 file
    """
    # gff_out receives the EVM-side selections, gff_out_s the GMAP-side ones.
    outputFilename = tempfile.NamedTemporaryFile(delete=False, prefix="grs", dir=gmap_wd)
    gff_out = gffwriter.GFFWriter(outputFilename.name)
    outputFilenameGmap = tempfile.NamedTemporaryFile(delete=False, prefix="grs", dir=gmap_wd)
    gff_out_s = gffwriter.GFFWriter(outputFilenameGmap.name)
    gt_com = GT_RETAINID % gff_file1
    file1 = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="grs", dir=gmap_wd)
    err1 = tempfile.NamedTemporaryFile(delete=False, mode="w", prefix="grs", dir=gmap_wd)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
        sys.stderr.write('Log file is: %s %s\n\n' % (file1.name, err1.name))
    gt_call = subprocess.Popen(gt_com, stdout=file1, stderr=err1, shell=True)
    gt_call.communicate()
    # Same treatment for the second file.  NOTE(review): err1 is rebound and
    # these two temp files are created without dir=gmap_wd — verify intended.
    file2 = tempfile.NamedTemporaryFile(delete=False, mode="w")
    err1 = tempfile.NamedTemporaryFile(delete=False, mode="w")
    gt_com = GT_RETAINID % gff_file2
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % gt_com)
    gt_call = subprocess.Popen(gt_com, stdout=file2, stderr=err1, shell=True)
    gt_call.communicate()
    db1 = gffutils.create_db(file1.name, ':memory:', merge_strategy='create_unique', keep_order=True)
    db2 = gffutils.create_db(file2.name, ':memory:', merge_strategy='create_unique', keep_order=True)
    listgeneintrons = []
    listgenetotal = []
    # db1: collect parents of intron features and parents of CDS features.
    for i in db1.features_of_type("intron"):
        g = ' '.join(i.attributes['Parent'])
        listgeneintrons.append(g)
    for i in db1.features_of_type("CDS"):
        g = ' '.join(i.attributes['Parent'])
        listgenetotal.append(g)
    # Symmetric difference: mRNAs with CDS records but no intron records.
    listgene1 = sorted(set(list(set(listgenetotal) ^ set(listgeneintrons))))
    listgeneintrons = []
    listgenetotal = []
    # Same extraction for db2; note listgeneintrons holds db2's intron
    # parents after this block (used again further down).
    for i in db2.features_of_type("intron"):
        g = ' '.join(i.attributes['Parent'])
        listgeneintrons.append(g)
    for i in db2.features_of_type("CDS"):
        g = ' '.join(i.attributes['Parent'])
        listgenetotal.append(g)
    listgene2 = sorted(set(list(set(listgenetotal) ^ set(listgeneintrons))))
    newlist = []
    gene_dict = {}
    # Map each db2 model name back to its EVM name: drop everything up to
    # the first '_' and the trailing '.<n>' suffix.
    for a in listgene2:
        b = a.split('_', 1)[1]
        bb = b.split('.')
        del bb[-1]
        evm = '.'.join(bb)
        newlist.append(evm)
        if evm in gene_dict:
            z = gene_dict[evm]
            gene_dict[evm] = z + [a]
        else:
            gene_dict[evm] = [a]
    # Loci present on both sides vs. GMAP-only loci.
    commonlist = list(set(listgene1).intersection(newlist))
    uniqGmap = sorted(set(list(set(newlist) ^ set(commonlist))))
    evm_list = []
    gmap_list = []
    for a in commonlist:
        if gene_dict[a] and len(gene_dict[a]) < 2:
            evm_list.append(a)  # exactly one GMAP model -> keep the EVM record
        elif gene_dict[a] and len(gene_dict[a]) > 1:
            gmap_list = gmap_list + gene_dict[a]  # several -> keep GMAP records
    for a in uniqGmap:
        if gene_dict[a]:
            gmap_list = gmap_list + gene_dict[a]
    listgeneintrons_u = (set(listgeneintrons))  # db2 models that DO have introns
    # EVM-side records.
    for evm in evm_list:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    # GMAP-side records (post-processed by ``longest`` below).
    for evm in gmap_list:
        for i in db2.children(evm, featuretype='CDS', order_by='start'):
            gff_out_s.write_rec(i)
        gff_out_s.write_rec(db2[evm])
        for i in db2.parents(evm, featuretype='gene', order_by='start'):
            gff_out_s.write_rec(i)
        for i in db2.children(evm, featuretype='exon', order_by='start'):
            gff_out_s.write_rec(i)
    # Intron-containing db2 models are written to the EVM-side writer
    # (gff_out, not gff_out_s) — NOTE(review): verify this asymmetry is
    # intentional.
    for evm in listgeneintrons_u:
        for i in db2.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db2[evm])
        for i in db2.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db2.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    gff_out.close()
    gff_out_s.close()
    # ORF-adjust the GMAP-side selections, then concatenate both files.
    gffOrf = longest(outputFilenameGmap.name, fasta, proc, gmap_wd, verbose)
    output_filename_final = gmap_wd + 'finalAnnotation.Final.Comb.gff3'
    gff_files = [gffOrf, outputFilename.name]
    with open(output_filename_final, 'wb') as wfd:
        for f in gff_files:
            with open(f, 'rb') as fd:
                shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    return output_filename_final
def removeDiscrepancy(gff, evmFile, verbose):
    """Drop models flagged by the PASA validator, then add intersecting EVM models.

    Runs the ``PASA_VAL`` command (module-level shell template) on ``gff``
    and collects mRNA names from lines reporting "ERROR"; every other mRNA
    of ``gff`` is written to ``<gff>.noProblem.gff3``.  A
    ``BEDTOOLS_INTERSECT`` between ``evmFile`` and ``gff`` then yields the
    EVM mRNAs overlapping the input, whose records are appended to the same
    output file.

    :param gff: GFF3 file to validate
    :param evmFile: EVM GFF3 used in the intersect step
    :param verbose: echo the external commands to stderr
    :return: path of ``<gff>.noProblem.gff3``
    """
    badName = []
    comm = PASA_VAL % gff
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % comm)
    gffVal_call = subprocess.Popen(comm, stdout=subprocess.PIPE, shell=True)
    # Parse validator output: split on spaces / ".CDS"; the third field of
    # an "ERROR" line is taken as the offending mRNA name — depends on the
    # exact PASA validator line format.
    for ln in gffVal_call.stdout.readlines():
        name = re.split(' |\.CDS', (ln.decode("utf-8")))
        if len(name) > 3 and "ERROR" in name[0]:
            badName.append(name[2])
    badNameUniq = list(set(badName))
    i = open(gff, 'r')
    listAllName = []
    # Collect every mRNA ID present in the input GFF (column 9 "ID=" field).
    for line in i:
        fields = line.strip().split('\t')
        if len(fields) > 3:
            if "mRNA" in fields[2]:
                attribute = fields[8].split(';')
                for el in attribute:
                    if "ID" in el:
                        listAllName.append(el.split("=")[1])
    # Symmetric difference — effectively "all minus bad", assuming the bad
    # names are a subset of all names.
    listgene = sorted(set(list(set(listAllName) ^ set(badNameUniq))))
    outputFilename = gff + '.noProblem.gff3'
    gff_out = gffwriter.GFFWriter(outputFilename)
    db1 = gffutils.create_db(gff, ':memory:', merge_strategy='create_unique', keep_order=True)
    # Write CDS, mRNA, gene and exon records for every non-flagged mRNA.
    for evm in listgene:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    cmd = BEDTOOLS_INTERSECT % (evmFile, gff)
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % cmd)
    bedtools_call = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    evm_mRNA = []
    # Collect the IDs of EVM mRNAs that intersect the input GFF.
    for ln in bedtools_call.stdout.readlines():
        lne = ln.decode("utf-8")
        ln = lne
        if "mRNA" in ln.split('\t')[2]:
            attribute = ln.split('\t')[8].split(';')
            for el in attribute:
                if "ID" in el:
                    mRNA = el.split('=')[1]
                    evm_mRNA.append(mRNA)
    # Reload db1 from the EVM file and append its matching records.
    db1 = gffutils.create_db(evmFile, ':memory:', merge_strategy='create_unique', keep_order=True)
    for evm in evm_mRNA:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    return outputFilename
def removeOverlap(gff, verbose):
    """Resolve overlapping CDS loci, keeping one mRNA per merged interval.

    The CDS lines of ``gff`` are piped through ``BEDTOOLS_SORT`` |
    ``BEDTOOLS_MERGE`` (module-level shell templates).  Merged intervals
    covering a single mRNA keep it; for intervals covering several mRNAs
    only the one owning the longest single CDS line is retained.  The
    surviving records are written to ``<gff>.uniq.gff3``.

    :param gff: input GFF3 path
    :param verbose: echo the external commands to stderr
    :return: path of ``<gff>.uniq.gff3``
    """
    i = open(gff, 'r')
    #outFile = gff + '.RNA.gff'
    o = tempfile.NamedTemporaryFile(delete=False, mode='w') #open(outFile, 'w')
    # Keep only the CDS lines for the sort/merge step.
    for line in i:
        listLine = line.split('\t')
        if len(listLine) == 9:
            if "CDS" in listLine[2]:
                o.write(line)
    i.close()
    bedouffile = tempfile.NamedTemporaryFile()
    #errorFile = outFile + ".bedtools_err.log"
    errorFilefile = tempfile.NamedTemporaryFile() #open(errorFile, "w")
    bedsort = BEDTOOLS_SORT % o.name
    bedmerge = BEDTOOLS_MERGE
    o.close()
    if verbose:
        sys.stderr.write('Executing: %s\n\n' % bedsort)
        sys.stderr.write('Executing: %s\n\n' % bedmerge)
    # bedtools sort | bedtools merge; merged output collected in bedouffile.
    bedsort_call = subprocess.Popen(bedsort, stdout=subprocess.PIPE, stderr=errorFilefile, shell=True)
    bedmerge_call = subprocess.Popen(bedmerge, stdin=bedsort_call.stdout, stdout=bedouffile, stderr=errorFilefile, shell=True)
    bedmerge_call.communicate()
    errorFilefile.close()
    listMultiple = []  # mRNAs in merged intervals covering more than one record
    listUniq = []      # mRNAs alone in their merged interval
    count = 0
    dictRNA = {}       # locus label -> mRNA names sharing that interval
    i = open(bedouffile.name, 'r')
    for a in i:
        listLine = a.split('\t')
        # Column 6 carries the collapsed attribute strings, column 5 the
        # merge count — assumes the BEDTOOLS_MERGE template emits these
        # columns; TODO confirm against its -c/-o options.
        nameRNA = re.split(',|;', listLine[5])
        count += 1
        locus = "locus" + str(count)
        for elm in nameRNA:
            if "Parent" in elm and int(listLine[4]) > 1:
                mRNAname = elm.split('=')[1]
                listMultiple.append(mRNAname)
                if locus in dictRNA:
                    dictRNA[locus].append(mRNAname)
                else:
                    dictRNA[locus] = [
                        mRNAname, ]
            elif "Parent" in elm:
                mRNAname = elm.split('=')[1]
                listUniq.append(mRNAname)
    bedouffile.close()
    listMultipleUniq = []
    listMultipleUniq = list(set(listMultiple))  # NOTE(review): never used afterwards
    dictLength = {}  # locus -> [mRNA name, longest CDS-line length seen]
    mRNA = open(gff, 'r')
    # Re-scan the CDS lines and, per contested locus, remember the mRNA
    # owning the single longest CDS line (end - start).
    for line in mRNA:
        listLine = line.split('\t')
        if len(listLine) == 9:
            if "CDS" in listLine[2]:
                for key in dictRNA:
                    for el in dictRNA[key]:
                        nameID = "Parent=" + el + ';'
                        if nameID in line:
                            length = (int(line.split('\t')[4]) - int(line.split('\t')[3]))
                            if key in dictLength:
                                oldLenght = dictLength[key]
                                if int(oldLenght[1]) < int(length):
                                    dictLength[key] = [el, str(length)]
                            else:
                                dictLength[key] = [el, str(length)]
    # Winners of the contested loci join the uncontested mRNAs.
    for key in dictLength:
        listUniq.append(dictLength[key][0])
    # De-duplicate while preserving order; strip trailing newlines.
    listUniqNew = []
    for mRNA in listUniq:
        mRNAnew = mRNA.strip('\n')
        if mRNAnew not in listUniqNew:
            listUniqNew.append(mRNAnew)
    outputFilename = gff + '.uniq.gff3'
    gff_out = gffwriter.GFFWriter(outputFilename)
    db1 = gffutils.create_db(gff, ':memory:', merge_strategy='create_unique', keep_order=True)
    # Emit CDS, mRNA, gene and exon records for every retained mRNA.
    for evm in listUniqNew:
        for i in db1.children(evm, featuretype='CDS', order_by='start'):
            gff_out.write_rec(i)
        gff_out.write_rec(db1[evm])
        for i in db1.parents(evm, featuretype='gene', order_by='start'):
            gff_out.write_rec(i)
        for i in db1.children(evm, featuretype='exon', order_by='start'):
            gff_out.write_rec(i)
    return outputFilename
def annotate_gff_with_genes(args):
    """Annotate GFF with genes table.

    Intersects the gene entries of ``args.gff_filename`` with the gene table
    in ``args.table_filename`` (via pybedtools; same strand, full overlap),
    collects Ensembl IDs, RefSeq IDs and gene symbols for each event gene,
    and rewrites the GFF in place with ``ensg_id`` / ``refseq_id`` /
    ``gsymbol`` attributes set on gene records.

    :param args: namespace with ``gff_filename`` and ``table_filename``
    :raises Exception: if either input file does not exist
    """
    gff_fname = utils.pathify(args.gff_filename)
    if not os.path.isfile(gff_fname):
        # Version-agnostic raise/print syntax (was py2-only ``raise E, msg``
        # and ``print`` statements, which are SyntaxErrors on Python 3).
        raise Exception("Cannot find %s" % (gff_fname))
    table_fname = utils.pathify(args.table_filename)
    if not os.path.isfile(table_fname):
        raise Exception("Cannot find %s" % (table_fname))
    table_bed = get_table_as_bedtool(table_fname)
    # Get BedTool for events, containing only the gene entries
    all_events_bed = pybedtools.BedTool(gff_fname)
    event_genes = \
        all_events_bed.filter(lambda entry: entry.fields[2] == "gene")
    print("Determining overlap between events and genes...")
    # Intersect event genes with gene txStart/txEnd
    intersected_bed = \
        event_genes.intersect(table_bed, wb=True, s=True, f=1)
    # Map event genes to their IDs
    #
    # event_gene1 -> refseq -> value
    #             -> ensgene -> value
    # event_gene2 -> refseq ->
    # ...
    event_genes_to_info = \
        defaultdict(lambda: defaultdict(list))
    for entry in intersected_bed:
        event_gene_attrs = utils.parse_attributes(entry.fields[8])
        event_gene_str = event_gene_attrs["ID"]
        gene_info_field = entry.fields[-1]
        # Strip semicolon of ID attributes
        if gene_info_field.endswith(";"):
            gene_info_field = gene_info_field[0:-1]
        # Convert attributes into dictionary
        gene_info = utils.parse_attributes(gene_info_field)
        ensgene_id = gene_info["ensg_id"]
        refseq_id = gene_info["refseq_id"]
        gene_symbol = gene_info["gsymbol"]
        # Skip null entries
        if not is_null_id(ensgene_id):
            event_genes_to_info[event_gene_str]["ensg_id"].append(ensgene_id)
        if not is_null_id(refseq_id):
            event_genes_to_info[event_gene_str]["refseq_id"].append(refseq_id)
        if not is_null_id(gene_symbol):
            event_genes_to_info[event_gene_str]["gsymbol"].append(gene_symbol)
    # Incorporate the gene information into the GFF and output it
    # using gffutils
    print("Loading events into GFF database...")
    events_db = gffutils.create_db(gff_fname, ":memory:", verbose=False)
    output_fname = gff_fname
    events_out = gffwriter.GFFWriter(output_fname, in_place=True)
    print(" - Outputting annotated GFF to: %s" % (output_fname))

    def new_recs():
        # Yield every record of every gene, with the ID attributes filled in
        # on the gene record itself.
        for gene_recs in list(events_db.iter_by_parent_childs()):
            gene_rec = gene_recs[0]
            event_id = gene_rec.id
            # Use existing IDs if present.  BUGFIX: test the same keys that
            # are read and written below ("ensg_id" / "refseq_id" /
            # "gsymbol"); the original tested "ensgene_id"/"gene_symbol"
            # while reading "ensg_id"/"gsymbol", so an existing attribute
            # either raised KeyError or was silently replaced by "NA".
            if "ensg_id" in gene_rec.attributes:
                ensgene_id = gene_rec.attributes["ensg_id"][0]
            else:
                ensgene_id = "NA"
            if "refseq_id" in gene_rec.attributes:
                refseq_id = gene_rec.attributes["refseq_id"][0]
            else:
                refseq_id = "NA"
            if "gsymbol" in gene_rec.attributes:
                gene_symbol = gene_rec.attributes["gsymbol"][0]
            else:
                gene_symbol = "NA"
            if event_id in event_genes_to_info:
                # Collapse the collected IDs (comma-joined, duplicates removed).
                event_info = event_genes_to_info[event_id]
                ensgene_ids = \
                    utils.unique_list(event_info["ensg_id"])
                if len(ensgene_ids) > 0 and ensgene_ids[0] != "NA":
                    ensgene_id = ",".join(ensgene_ids)
                refseq_ids = \
                    utils.unique_list(event_info["refseq_id"])
                if len(refseq_ids) > 0 and refseq_ids[0] != "NA":
                    refseq_id = ",".join(refseq_ids)
                gene_symbols = \
                    utils.unique_list(event_info["gsymbol"])
                if len(gene_symbols) > 0 and gene_symbols[0] != "NA":
                    gene_symbol = ",".join(gene_symbols)
            gene_rec.attributes["ensg_id"] = [ensgene_id]
            gene_rec.attributes["refseq_id"] = [refseq_id]
            gene_rec.attributes["gsymbol"] = [gene_symbol]
            # Yield all the gene's records
            for g in gene_recs:
                yield g
    t1 = time.time()
    print("Creating annotated GFF database...")
    annotated_db = gffutils.create_db(new_recs(), ":memory:", verbose=False)
    t2 = time.time()
    print("Creation took %.2f secs" % (t2 - t1))
    # Write to file
    print("Writing annotated GFF to file...")
    for gene_rec in annotated_db.all_features(featuretype="gene"):
        events_out.write_gene_recs(annotated_db, gene_rec.id)
    events_out.close()