def output_parts_as_gff(gff_out, parts, chrom, strand,
                        source=".",
                        rec_type="exon",
                        gene_id="NA",
                        na_val="NA"):
    """
    Write one GFF record per part to the given GFF writer.

    Every record is placed on 'chrom'/'strand' with the given 'source'
    and 'rec_type', spans part.start..part.end, and carries three
    attributes:

      ID:      "<rec_type>.<part.label>"
      Parent:  part.parent
      gene_id: the 'gene_id' argument

    'na_val' is accepted but not used by this function.
    """
    for curr_part in parts:
        # Attribute values are single-element lists
        part_attrs = {}
        part_attrs['ID'] = ["%s.%s" % (rec_type, curr_part.label)]
        part_attrs['Parent'] = [curr_part.parent]
        part_attrs['gene_id'] = [gene_id]
        part_rec = gff_utils.GFF(chrom, source, rec_type,
                                 curr_part.start, curr_part.end,
                                 strand=strand,
                                 attributes=part_attrs)
        gff_out.write(part_rec)
def shorten_rec(gff_rec, old_to_new_ids, max_id_len):
    """
    Build a copy of a GFF record whose ID/Name/Parent use shortened IDs.

    'old_to_new_ids' maps original IDs to their shortened forms. It is
    consulted first and updated in place (via shorten_id) for any ID
    not seen before, so the same original ID always maps to the same
    shortened ID across calls.

    Multi-valued ID / Parent attributes are joined with "," before
    being looked up. A record without a Parent attribute is looked up
    under the empty string; the Parent attribute of the new record is
    only set when the resulting shortened parent is non-empty.

    Returns the new record; 'gff_rec' itself is not modified.
    """
    def short_form(long_id):
        # Look up the shortened ID, creating and caching it on first use
        if long_id not in old_to_new_ids:
            old_to_new_ids[long_id] = shorten_id(long_id, max_id_len)
        return old_to_new_ids[long_id]

    orig_attrs = gff_rec.attributes
    full_id = ",".join(orig_attrs["ID"])
    if "Parent" not in orig_attrs:
        full_parent = ""
    else:
        full_parent = ",".join(orig_attrs["Parent"])
    shortened_attrs = orig_attrs.copy()
    # ID and Name both carry the shortened record ID
    short_id = short_form(full_id)
    shortened_attrs['ID'] = [short_id]
    shortened_attrs['Name'] = [short_id]
    # Only set parent record if there is a parent
    short_parent = short_form(full_parent)
    if short_parent != "":
        shortened_attrs['Parent'] = [short_parent]
    return GFF.GFF(seqid=gff_rec.seqid,
                   source=gff_rec.source,
                   type=gff_rec.type,
                   start=gff_rec.start,
                   end=gff_rec.end,
                   score=gff_rec.score,
                   strand=gff_rec.strand,
                   phase=gff_rec.phase,
                   attributes=shortened_attrs)
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Copy the genes of a GFF file to a new GFF3 file, adding 'intron' entries.

    For every gene loaded from 'gff_filename' this writes, in order:
    the gene record, each mRNA record followed by its exon records, and
    then one 'intron' record for each gap between consecutive parts of
    each isoform. Intron records get the ID
    "<isoform label>:<chrom>:<start>-<end>:<strand>" and the isoform
    label as Parent.

    The output is written to "<output_gff3_filename>_introns.gff3".
    The output file is closed before returning, even if writing fails.

    Returns the path of the output file.
    """
    output_filename = "%s_introns.gff3" % (output_gff3_filename)
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    gff_out_file = open(output_filename, "w")
    try:
        gff_out = gff_utils.Writer(gff_out_file)
        # NOTE(review): gff_db is never read below; the load is kept in
        # case constructing it has effects not visible here -- confirm
        # and drop if it is redundant.
        gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                       reverse_recs=True)
        t1 = time.time()
        genes = gene_utils.load_genes_from_gff(gff_filename)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            mRNAs = gene_tree[gene_id]["mRNAs"]
            for mRNA_id in mRNAs:
                curr_mRNA = mRNAs[mRNA_id]
                gff_out.write(curr_mRNA["record"])
                # Write out the exons
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # Skip pairs whose gap is at most one base wide
                    # (includes adjacent or overlapping parts)
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%s:%d-%d:%s" % (isoform.label,
                                                    gene_obj.chrom,
                                                    intron_start,
                                                    intron_end,
                                                    gene_obj.strand)
                    # Positional "." values after the coordinates and after
                    # the strand are passed through to GFF unchanged
                    intron_rec = \
                        gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                      "intron",
                                      intron_start, intron_end,
                                      ".", gene_obj.strand, ".",
                                      attributes={"ID": [intron_id],
                                                  "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    finally:
        # Make sure everything written so far reaches disk
        gff_out_file.close()
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    return output_filename
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    For every gene loaded from 'gff_filename' this writes, in order:
    the gene record, each isoform's mRNA record followed by its exon
    records, and then one 'intron' record for each gap between
    consecutive parts of each isoform. Intron records get the ID
    "<chrom>:<start>-<end>:<strand>.intron" and the isoform label as
    Parent.

    The output goes to
    "<output_dir>/<trimmed basename>.with_introns.<original extension>".
    If that file already exists nothing is written.

    Returns the path of the output file, both when it was just written
    and when an existing file was found.

    Raises IndexError if the basename of 'gff_filename' contains no ".".
    """
    gff_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(gff_basename)
    # Reuse whatever follows the last "." of the input as the extension
    ext_to_use = gff_basename.rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" % (output_basename,
                                             ext_to_use))
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." % (output_filename))
        return output_filename
    gff_out_file = open(output_filename, "w")
    try:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        # NOTE(review): gff_db is never read below; the load is kept in
        # case constructing it has effects not visible here -- confirm
        # and drop if it is redundant.
        gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_filename,
                                            reverse_recs=True)
        t1 = time.time()
        genes = gene_utils.load_genes_from_gff(gff_filename)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            for mRNA in gene_obj.isoforms:
                curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA.label]
                gff_out.write(curr_mRNA["record"])
                # Write out the exons
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # Skip pairs whose gap is at most one base wide
                    # (includes adjacent or overlapping parts)
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%d-%d:%s.intron" \
                        % (gene_obj.chrom,
                           intron_start,
                           intron_end,
                           gene_obj.strand)
                    intron_rec = \
                        miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                           "intron",
                                           intron_start, intron_end,
                                           strand=gene_obj.strand,
                                           attributes={"ID": [intron_id],
                                                       "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    finally:
        # Make sure everything written so far reaches disk
        gff_out_file.close()
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    # Return the path on the write path too, matching the early return above
    return output_filename
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene", "mRNA", "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

              U           D

    [ U P ]-----[ S E ]-----[ D N ]

            a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             c < d

    Parameters:

      gff_fname: input GFF file with the events.
      fasta_fname: passed through to output_fasta_seqs_from_gff together
        with the generated GFF of coordinates.
      output_dir: directory for the FASTA output; the GFF of coordinates
        goes into its 'gff_coords' subdirectory.
      with_flanking_introns: if true, also emit 'intron' records for the
        up/downstream flanking introns of each event.
      flanking_introns_coords: optional sequence (a, b, c, d) as described
        above; when None the entire flanking introns are used.
      overwrite: if false and the FASTA output already exists, return its
        path without doing any work.
      entries_to_include: not used by this function. The default is a
        list but it is never mutated here.

    Genes for which get_event_recs_from_gene returns None are skipped.

    Returns the path of the FASTA output file.

    Raises Exception if flanking introns were requested but their
    coordinates cannot be found for a gene.
    """
    # Load GFF genes
    # NOTE(review): gff_db is never read below; the load is kept in case
    # constructing it has effects not visible here -- confirm and drop if
    # it is redundant.
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_fname,
                                        reverse_recs=True)
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" % (file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            % (output_basename,
               flanking_introns_coords[0],
               flanking_introns_coords[1],
               flanking_introns_coords[2],
               flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir,
                                    "%s.gff" % (output_basename))
    fasta_output_fname = os.path.join(output_dir,
                                      "%s.fa" % (output_basename))
    if not overwrite:
        if os.path.isfile(fasta_output_fname):
            print("Output file %s exists. Skipping..."
                  % (fasta_output_fname))
            return fasta_output_fname
    print("Outputting GFF coordinates to: %s" % (gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print(" - Overwriting existing file")
    print("Outputting sequences to: %s" % (fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print(" - Overwriting existing file")
    genes = gene_utils.load_genes_from_gff(gff_fname)
    gff_out_file = open(gff_output_fname, "w")
    try:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            # GFF records to write for the current gene
            recs_to_write = []
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # The None check must come before any lookup into event_recs
            if event_recs is None:
                continue
            long_mRNA_id = event_recs["long_mRNA"].get_id()
            # Write out up, se, and dn exons
            recs_to_write.extend([event_recs["up_exon"]["record"],
                                  event_recs["se_exon"]["record"],
                                  event_recs["dn_exon"]["record"]])
            # Extract the flanking introns of the alternative exon
            # if asked
            if with_flanking_introns:
                introns_coords = \
                    get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception("Cannot find flanking introns coordinates.")
                # Fetch upstream intron sequence
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Fetch downstream intron sequence
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                # If given custom coordinates, use them instead of entire
                # up/down flanking intronic coordinates.
                se_exon_rec = event_recs["se_exon"]["record"]
                if flanking_introns_coords is not None:
                    # (start,end) of upstream intron sequence
                    a, b = \
                        int(flanking_introns_coords[0]), \
                        int(flanking_introns_coords[1])
                    # (start,end) of downstream intron sequence
                    c, d = \
                        int(flanking_introns_coords[2]), \
                        int(flanking_introns_coords[3])
                    a, b, c, d = error_check_intronic_coords(a, b, c, d,
                                                             up_intron_len,
                                                             dn_intron_len)
                    # Upstream offsets (a, b) are added to the SE record's
                    # start, downstream offsets (c, d) to its end
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                rec_type = "intron"
                strand = se_exon_rec.strand
                up_intron_str = "%s.up_intron" % (long_mRNA_id)
                up_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, rec_type,
                                       up_intron_start, up_intron_end,
                                       strand=strand,
                                       attributes={"ID": [up_intron_str],
                                                   "Parent": [gene_obj.label]})
                dn_intron_str = "%s.dn_intron" % (long_mRNA_id)
                dn_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, rec_type,
                                       dn_intron_start, dn_intron_end,
                                       strand=strand,
                                       attributes={"ID": [dn_intron_str],
                                                   "Parent": [gene_obj.label]})
                recs_to_write.append(up_intron_rec)
                recs_to_write.append(dn_intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    finally:
        # Close the coordinates file even if a gene fails, so the FASTA
        # step below never reads from a file that is still open
        gff_out_file.close()
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname