def sanitize_gff(gff_fname, output_dir, include_introns=True):
    """
    Write a sanitized copy of a GFF file into output_dir.

    The output file keeps the basename of the input. For every gene
    loaded from the input, the sanitized gene record is written first,
    followed by each isoform's sanitized mRNA record and the records of
    that isoform's parts. For genes on the "-" strand the part records
    are passed through fix_up_down_ids before being written.

    Arguments:
    - gff_fname: path of the input GFF file
    - output_dir: directory to write the sanitized file to
    - include_introns: forwarded to gene_utils.load_genes_from_gff

    Returns the filename of the sanitized GFF.
    """
    sanitized_fname = os.path.join(output_dir,
                                   os.path.basename(gff_fname))
    loaded_genes = gene_utils.load_genes_from_gff(
        gff_fname,
        include_introns=include_introns)
    start_time = time.time()
    with open(sanitized_fname, "w") as out_handle:
        writer = miso_gff_utils.Writer(out_handle)
        for gene_id in loaded_genes:
            gene_entry = loaded_genes[gene_id]
            gene_model = gene_entry["gene_object"]
            gene_node = gene_entry["hierarchy"][gene_id]
            # Gene record goes first
            write_rec_to_gff(writer, sanitize_record(gene_node["gene"]))
            for isoform in gene_model.isoforms:
                transcript_node = gene_node["mRNAs"][isoform.label]
                # Then the mRNA record of this isoform
                write_rec_to_gff(writer,
                                 sanitize_record(transcript_node["record"]))
                # Collect the records of the isoform's parts, in order
                exon_recs = []
                for part in isoform.parts:
                    exon_recs.append(
                        transcript_node["exons"][part.label]["record"])
                if gene_model.strand == "-":
                    exon_recs = fix_up_down_ids(exon_recs)
                for exon_rec in exon_recs:
                    write_rec_to_gff(writer, exon_rec)
    elapsed = time.time() - start_time
    print("Sanitizing took %.2f seconds" % (elapsed))
    return sanitized_fname
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Copy a GFF file and add 'intron' records between consecutive parts
    of each isoform.

    For every gene the gene record is written first, then each mRNA
    record followed by its exon records, and finally one intron record
    per pair of consecutive isoform parts that have at least two bases
    between them.

    Arguments:
    - gff_filename: path of the input GFF file
    - output_gff3_filename: prefix of the output file; the suffix
      "_introns.gff3" is appended to it

    Returns the name of the file that was written.
    """
    output_filename = "%s_introns.gff3" % (output_gff3_filename)
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    # The context manager guarantees the output is flushed and closed,
    # even if writing a record fails part-way through.
    with open(output_filename, "w") as gff_out_file:
        gff_out = gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the gene record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            mRNAs = gene_tree[gene_id]["mRNAs"]
            for mRNA_id in mRNAs:
                curr_mRNA = mRNAs[mRNA_id]
                gff_out.write(curr_mRNA["record"])
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # Skip pairs with fewer than two bases between them
                    if intron_start >= intron_end:
                        continue
                    intron_id = "%s:%s:%d-%d:%s" % (isoform.label,
                                                    gene_obj.chrom,
                                                    intron_start,
                                                    intron_end,
                                                    gene_obj.strand)
                    intron_rec = \
                        gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                      "intron", intron_start, intron_end,
                                      ".", gene_obj.strand, ".",
                                      attributes={"ID": [intron_id],
                                                  "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    return output_filename
def shorten_gff(input_gff, output_gff, max_id_len=75):
    """
    Rewrite a GFF file with its records passed through shorten_rec.

    Every record of the input GFF is handed to shorten_rec together
    with a mapping from old to new IDs that is shared across all
    records of the file, and max_id_len. The resulting records are
    written to output_gff.

    Arguments:
    - input_gff: path of the GFF file to read
    - output_gff: path of the GFF file to write
    - max_id_len: ID length limit forwarded to shorten_rec
    """
    # Load input GFF
    t1 = time.time()
    gff_in = GFF.GFFDatabase(from_filename=input_gff,
                             reverse_recs=True)
    # Mapping from old to new IDs, shared by all records
    old_to_new_ids = {}
    new_recs = [shorten_rec(rec, old_to_new_ids, max_id_len)
                for rec in gff_in]
    t2 = time.time()
    print("Loading of input GFF took %.2f seconds" % (t2 - t1))
    print("Writing revised gff to: %s" % (output_gff))
    # Write new GFF file; the context manager closes the file even if
    # writing fails.
    with open(output_gff, 'w') as output_file:
        gff_writer = GFF.Writer(output_file)
        gff_writer.write_recs(new_recs)
def output_exons_to_file(recs, output_filename,
                         output_format='gff'):
    """
    Output exons to file.

    Arguments:
    - recs: list of records in GFF format; they are written in reverse
      order, and the caller's list is left unmodified
    - output_filename: filename to output results to
    - output_format: "gff" writes a GFF file; "bed" is not supported.
      Any other value writes nothing.

    Raises Exception if output_format is "bed".
    """
    print("Outputting exons to file: %s" % (output_filename))
    if output_format == "gff":
        # Write file in GFF format. Reverse a copy rather than calling
        # recs.reverse(), which would flip the caller's list on every call.
        reversed_recs = list(reversed(recs))
        with open(output_filename, 'w') as output_file:
            gff_writer = gff_utils.Writer(output_file)
            gff_writer.write_recs(reversed_recs)
    elif output_format == "bed":
        # Write file in BED format
        raise Exception("BED format unsupported.")
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    Writes "<basename>.with_introns.<ext>" into output_dir, where
    <basename> is the input basename passed through utils.trim_gff_ext
    and <ext> is the text after the last "." of the input basename
    ("gff" if the name has no "."). If the output file already exists
    nothing is written.

    For every gene the gene record is written first, then each
    isoform's mRNA record followed by its exon records, and finally one
    intron record per pair of consecutive isoform parts that have at
    least two bases between them.

    Arguments:
    - gff_filename: path of the input GFF file
    - output_dir: directory to write the output file to

    Returns the output filename, whether it was just written or was
    found to exist already.
    """
    input_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(input_basename)
    # Reuse the input's extension; fall back to "gff" instead of failing
    # with an IndexError when the name has no extension at all.
    name_parts = input_basename.rsplit(".", 1)
    ext_to_use = name_parts[1] if len(name_parts) == 2 else "gff"
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" % (output_basename,
                                             ext_to_use))
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." % (output_filename))
        return output_filename
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    # The context manager guarantees the output is flushed and closed,
    # even if writing a record fails part-way through.
    with open(output_filename, "w") as gff_out_file:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the gene record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            for mRNA in gene_obj.isoforms:
                curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA.label]
                gff_out.write(curr_mRNA["record"])
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # Skip pairs with fewer than two bases between them
                    if intron_start >= intron_end:
                        continue
                    # NOTE(review): the ID does not include the isoform
                    # label, so an intron shared by several isoforms is
                    # written with the same ID once per isoform -- confirm
                    # that downstream consumers expect this.
                    intron_id = "%s:%d-%d:%s.intron" \
                        % (gene_obj.chrom,
                           intron_start,
                           intron_end,
                           gene_obj.strand)
                    intron_rec = \
                        miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                           "intron",
                                           intron_start, intron_end,
                                           strand=gene_obj.strand,
                                           attributes={"ID": [intron_id],
                                                       "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    return output_filename
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene", "mRNA", "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences, written to
        the "gff_coords" subdirectory of output_dir.
    (2) FASTA file with the actual sequences, produced by
        output_fasta_seqs_from_gff from the GFF file in (1).

    If asked, fetch the flanking intronic sequences. Flanking regions
    are marked below:

      U: region of upstream intron
      D: region of downstream intron

                 U           D
      [ U P ]-----[ S E ]-----[ D N ]
                a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b (added to the start of the SE exon record)

       c, d: positive ints, position relative to 3' splice site of SE
             c < d (added to the end of the SE exon record)

    Arguments:
    - gff_fname: path of the input GFF file
    - fasta_fname: forwarded to output_fasta_seqs_from_gff
    - output_dir: directory for the FASTA output and the "gff_coords"
      subdirectory
    - with_flanking_introns: if true, also write up/down intron records
    - flanking_introns_coords: optional sequence (a, b, c, d); when
      given it is also encoded in the output file names
    - overwrite: if false and the FASTA output already exists, nothing
      is written
    - entries_to_include: not used by this function; kept so existing
      callers keep working

    Returns the name of the FASTA output file.

    Raises Exception if flanking introns are requested but
    get_flanking_introns_coords returns None for a gene.
    """
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" % (file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            % (output_basename,
               flanking_introns_coords[0],
               flanking_introns_coords[1],
               flanking_introns_coords[2],
               flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir, "%s.gff" % (output_basename))
    fasta_output_fname = os.path.join(output_dir, "%s.fa" % (output_basename))
    if not overwrite and os.path.isfile(fasta_output_fname):
        print("Output file %s exists. Skipping..." % (fasta_output_fname))
        return fasta_output_fname
    print("Outputting GFF coordinates to: %s" % (gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print(" - Overwriting existing file")
    print("Outputting sequences to: %s" % (fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print(" - Overwriting existing file")
    genes = gene_utils.load_genes_from_gff(gff_fname)
    # The context manager closes the coordinates file even if a gene
    # fails, so it is never left open.
    with open(gff_output_fname, "w") as gff_out_file:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # Check for a missing event BEFORE touching event_recs;
            # subscripting None would raise a TypeError instead of
            # skipping the gene.
            if event_recs is None:
                continue
            long_mRNA_id = event_recs["long_mRNA"].get_id()
            # GFF records to write for the current gene: up, se, and
            # dn exons first
            recs_to_write = [event_recs["up_exon"]["record"],
                             event_recs["se_exon"]["record"],
                             event_recs["dn_exon"]["record"]]
            if with_flanking_introns:
                introns_coords = get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception("Cannot find flanking introns coordinates.")
                # Coordinates of the entire upstream intron
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Coordinates of the entire downstream intron
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                se_exon_rec = event_recs["se_exon"]["record"]
                # If given custom coordinates, use them instead of entire
                # up/down flanking intronic coordinates.
                if flanking_introns_coords is not None:
                    a, b = \
                        int(flanking_introns_coords[0]), \
                        int(flanking_introns_coords[1])
                    c, d = \
                        int(flanking_introns_coords[2]), \
                        int(flanking_introns_coords[3])
                    a, b, c, d = \
                        error_check_intronic_coords(a, b, c, d,
                                                    up_intron_len,
                                                    dn_intron_len)
                    # a, b are offsets from the start of the SE exon
                    # record; c, d are offsets from its end
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                strand = se_exon_rec.strand
                intron_regions = (
                    ("up_intron", up_intron_start, up_intron_end),
                    ("dn_intron", dn_intron_start, dn_intron_end),
                )
                for suffix, region_start, region_end in intron_regions:
                    intron_str = "%s.%s" % (long_mRNA_id, suffix)
                    intron_rec = \
                        miso_gff_utils.GFF(chrom, source, "intron",
                                           region_start, region_end,
                                           strand=strand,
                                           attributes={"ID": [intron_str],
                                                       "Parent": [gene_obj.label]})
                    recs_to_write.append(intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname