Ejemplo n.º 1
0
def index_gff(gff_filename, output_dir,
              compress_id=False):
    """
    Index the given GFF and place the indexed representation
    in the output directory.

    Parameters:
      gff_filename: path of the GFF file to index.
      output_dir: directory receiving the index. If it already holds
        any entry matching "chr*", the GFF is treated as indexed and
        nothing is done.
      compress_id: forwarded unchanged to serialize_genes().

    Returns None. Progress and timing messages go to stdout.
    """
    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("Indexing GFF...")
    if compress_id:
        print("  - Using compressed IDs to create indexed filenames.")
    # First check that the GFF is not already indexed
    if glob.glob(os.path.join(output_dir, "chr*")):
        print("%s appears to already be indexed. Aborting."
              % (gff_filename))
        return

    overall_t1 = time.time()
    gff_genes = gene_utils.load_genes_from_gff(gff_filename)

    t1 = time.time()
    serialize_genes(gff_genes,
                    gff_filename,
                    output_dir,
                    compress_id=compress_id)
    t2 = time.time()
    print(" Serialization of genes from GFF took %.2f seconds" % (t2 - t1))
    overall_t2 = time.time()
    print("Indexing of GFF took %.2f seconds." % (overall_t2 - overall_t1))
Ejemplo n.º 2
0
def sanitize_gff(gff_fname, output_dir, include_introns=True):
    """
    Write a sanitized copy of a GFF file and return the copy's path.

    The copy keeps the input's basename and is placed in output_dir.
    For every loaded gene, the gene record and each isoform's mRNA
    record go through sanitize_record() before being written. The
    records of an isoform's parts are written as loaded, except that
    for genes on the "-" strand they first pass through
    fix_up_down_ids().

    include_introns is forwarded to gene_utils.load_genes_from_gff().
    The time spent writing is printed to stdout.
    """
    sanitized_fname = os.path.join(output_dir,
                                   os.path.basename(gff_fname))
    loaded = gene_utils.load_genes_from_gff(
        gff_fname, include_introns=include_introns)
    started = time.time()
    with open(sanitized_fname, "w") as out_handle:
        writer = miso_gff_utils.Writer(out_handle)
        for gene_id in loaded:
            entry = loaded[gene_id]
            gene = entry["gene_object"]
            top_record = entry["hierarchy"][gene_id]["gene"]
            hierarchy = entry["hierarchy"][gene_id]
            # The gene record comes first
            write_rec_to_gff(writer, sanitize_record(top_record))
            for isoform in gene.isoforms:
                transcript = hierarchy["mRNAs"][isoform.label]
                # Then the mRNA record of this isoform
                write_rec_to_gff(writer,
                                 sanitize_record(transcript["record"]))
                # Collect the records of the isoform's parts, in order
                part_recs = []
                for part in isoform.parts:
                    part_recs.append(
                        transcript["exons"][part.label]["record"])
                if gene.strand == "-":
                    part_recs = fix_up_down_ids(part_recs)
                for rec in part_recs:
                    write_rec_to_gff(writer, rec)
    finished = time.time()
    print("Sanitizing took %.2f seconds" % (finished - started))
    return sanitized_fname
Ejemplo n.º 3
0
    def loaded_events_to_genes(self,
                               single_event_name=None,
                               read_len=None,
                               overhang_len=None):
        """
        Parse the loaded set of events into gene structures.

        Parameters:
          single_event_name: if given, only that event is converted;
            otherwise every key of self.events is.
          read_len, overhang_len: forwarded to
            Gene.afe_ale_event_to_gene() for 'AFE'/'ALE' events and
            ignored for the other event types.

        Returns a dict mapping event name -> gene built for that event.

        Raises Exception if no events are loaded or if self.event_type
        is not one of 'SE', 'RI', 'TandemUTR', 'AFE', 'ALE'.
        """
        if not self.events:
            # Call-style raise is valid on both Python 2 and Python 3.
            raise Exception("Must load events first before they can be converted to genes.")
        events_to_genes = {}

        t1 = time.time()
        if single_event_name:
            # If given an event name, only parse that event
            event_names = [single_event_name]
        else:
            event_names = self.events.keys()
        for event_name in event_names:
            event = self.events[event_name]

            if self.event_type in ('SE', 'RI'):
                gene = Gene.se_event_to_gene(event.up_part_len,
                                             event.len,
                                             event.dn_part_len,
                                             event.chrom,
                                             label=event.label)
            elif self.event_type == 'TandemUTR':
                gene = Gene.tandem_utr_event_to_gene(event.core_len,
                                                     event.ext_len,
                                                     event.chrom,
                                                     label=event.label)
            elif self.event_type in ('AFE', 'ALE'):
                gene = Gene.afe_ale_event_to_gene(event.proximal_exons,
                                                  event.distal_exons,
                                                  self.event_type,
                                                  event.chrom,
                                                  label=event.label,
                                                  read_len=read_len,
                                                  overhang_len=overhang_len)
            else:
                raise Exception("Unsupported event type: %s" % (
                    self.event_type))
            events_to_genes[event_name] = gene
        t2 = time.time()
        print("Parsing of events to genes took %.2f seconds." % (t2 - t1))
        return events_to_genes
Ejemplo n.º 4
0
def index_exons(gff_fname):
    """
    Build a lookup of one exon per gene from a GFF file.

    For every gene loaded from gff_fname, the second part
    (``parts[1]``) of the gene's first isoform is recorded under the
    key ``(chrom, start, end, strand)`` with value True. The original
    code names this part "se" -- presumably the skipped exon of an SE
    event; confirm against callers.

    Returns a ``defaultdict(bool)``, so looking up an unseen key
    yields False.
    """
    annotation = gene_utils.load_genes_from_gff(gff_fname)
    exon_index = defaultdict(bool)
    for gid in annotation:
        gene = annotation[gid]["gene_object"]
        chosen = gene.isoforms[0].parts[1]
        # Key the exon by chromosome, coordinates and strand
        key = (gene.chrom, chosen.start, chosen.end, gene.strand)
        exon_index[key] = True
    return exon_index
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Copy a GFF annotation to "<output_gff3_filename>_introns.gff3",
    adding an "intron" record for each gap between consecutive parts
    of every isoform.

    Per gene, the output holds the gene record, each mRNA record
    followed by its exon records, and then the intron records. Each
    intron is given the ID "<isoform>:<chrom>:<start>-<end>:<strand>"
    and has the isoform label as its Parent.

    Parameters:
      gff_filename: path of the input GFF file.
      output_gff3_filename: prefix of the output path.

    Returns None. Progress and timing are printed to stdout.
    """
    output_filename = "%s_introns.gff3" % (output_gff3_filename)
    print("Adding introns to GFF...")
    print("  - Input: %s" % (gff_filename))
    print("  - Output: %s" % (output_filename))
    # NOTE(review): the database object is not used below; the call is
    # kept in case constructing it has effects this function relies on.
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    t1 = time.time()
    # Parse the input before touching the output so that a failed parse
    # does not truncate an existing output file.
    genes = gene_utils.load_genes_from_gff(gff_filename)
    # The context manager makes sure the handle is flushed and closed.
    with open(output_filename, "w") as gff_out_file:
        gff_out = gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            mRNAs = gene_tree[gene_id]["mRNAs"]
            for mRNA_id in mRNAs:
                curr_mRNA = mRNAs[mRNA_id]
                gff_out.write(curr_mRNA["record"])
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # NOTE(review): ">=" also skips gaps where start == end;
                    # confirm that dropping those is intended.
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%s:%d-%d:%s" % (isoform.label,
                                                    gene_obj.chrom,
                                                    intron_start, intron_end,
                                                    gene_obj.strand)
                    intron_rec = \
                        gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                      "intron",
                                      intron_start, intron_end, ".",
                                      gene_obj.strand, ".",
                                      attributes={"ID": [intron_id],
                                                  "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
Ejemplo n.º 6
0
    def loaded_events_to_genes(self, single_event_name=None,
                               read_len=None, overhang_len=None):
        """
        Parse the loaded set of events into gene structures.

        Parameters:
          single_event_name: if given, only that event is converted;
            otherwise every key of self.events is.
          read_len, overhang_len: forwarded to
            Gene.afe_ale_event_to_gene() for 'AFE'/'ALE' events and
            ignored for the other event types.

        Returns a dict mapping event name -> gene built for that event.

        Raises Exception if no events are loaded or if self.event_type
        is not one of 'SE', 'RI', 'TandemUTR', 'AFE', 'ALE'.
        """
        # Indentation uses spaces only: mixing tabs and spaces in one
        # block is rejected by Python 3.
        if not self.events:
            raise Exception("Must load events first before they can be converted to genes.")
        events_to_genes = {}

        t1 = time.time()
        if single_event_name:
            # If given an event name, only parse that event
            event_names = [single_event_name]
        else:
            event_names = self.events.keys()
        for event_name in event_names:
            event = self.events[event_name]

            if self.event_type in ('SE', 'RI'):
                gene = Gene.se_event_to_gene(event.up_part_len, event.len,
                                             event.dn_part_len,
                                             event.chrom,
                                             label=event.label)
            elif self.event_type == 'TandemUTR':
                gene = Gene.tandem_utr_event_to_gene(event.core_len,
                                                     event.ext_len,
                                                     event.chrom,
                                                     label=event.label)
            elif self.event_type in ('AFE', 'ALE'):
                gene = Gene.afe_ale_event_to_gene(event.proximal_exons,
                                                  event.distal_exons,
                                                  self.event_type, event.chrom,
                                                  label=event.label,
                                                  read_len=read_len,
                                                  overhang_len=overhang_len)
            else:
                raise Exception("Unsupported event type: %s" % (self.event_type))
            events_to_genes[event_name] = gene
        t2 = time.time()
        print("Parsing of events to genes took %.2f seconds." % (t2 - t1))
        return events_to_genes
Ejemplo n.º 7
0
def extract_lens_from_gff(gff_fname, output_dir):
    """
    Write a tab-separated table of isoform and exon lengths for every
    gene in a GFF file.

    The table is written to "<output_dir>/<basename of gff_fname>.lens"
    (an existing file is overwritten) with one row per gene and these
    columns, in order:

      event_name    gene label
      mRNA_labels   comma-separated isoform labels
      mRNA_lens     comma-separated isoform lengths
      exon_lens     per-isoform comma-separated part lengths, isoforms
                    separated by ";"
      genomic_lens  comma-separated genomic spans
                    (genomic_end - genomic_start + 1) per isoform

    Returns None. Progress messages are printed to stdout.
    """
    column_order = ["event_name", "mRNA_labels",
                    "mRNA_lens", "exon_lens",
                    "genomic_lens"]
    output_basename = "%s.lens" % (os.path.basename(gff_fname))
    output_fname = os.path.join(output_dir, output_basename)
    print("Extracting lengths from GFF file...")
    print("  - Input GFF: %s" % (gff_fname))
    print("  - Output file: %s" % (output_fname))
    if os.path.isfile(output_fname):
        print("Overwriting %s" % (output_fname))
    gff_genes = gene_utils.load_genes_from_gff(gff_fname)
    entries = []
    for gene_id in gff_genes:
        gene = gff_genes[gene_id]["gene_object"]
        isoforms = gene.isoforms
        # Spans are computed per isoform rather than by slicing a 2-D
        # array, so a gene without isoforms yields empty fields instead
        # of an IndexError.
        genomic_lens = [str(iso.genomic_end - iso.genomic_start + 1)
                        for iso in isoforms]
        exon_lens = [",".join([str(exon.len) for exon in iso.parts])
                     for iso in isoforms]
        entries.append({
            "event_name": gene.label,
            "mRNA_lens": ",".join([str(iso.len) for iso in isoforms]),
            "mRNA_labels": ",".join([iso.label for iso in isoforms]),
            "exon_lens": ";".join(exon_lens),
            "genomic_lens": ",".join(genomic_lens),
        })
    # Column order is fixed on the frame itself: the `cols=` keyword of
    # to_csv() is not accepted by current pandas, while selecting
    # columns at construction works regardless of the pandas version.
    entries_df = pandas.DataFrame(entries, columns=column_order)
    entries_df.to_csv(output_fname,
                      sep="\t",
                      index=False)
Ejemplo n.º 8
0
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    Writes "<output_dir>/<trimmed basename>.with_introns.<ext>", where
    <ext> is the text after the last "." of the input's basename. Per
    gene, the output holds the gene record, each isoform's mRNA record
    followed by its exon records, and then one "intron" record for each
    gap between consecutive parts of every isoform.

    Parameters:
      gff_filename: path of the input GFF file.
      output_dir: directory receiving the output file.

    Returns the output path when that file already exists (nothing is
    rewritten in that case); otherwise returns None after writing it.

    Raises IndexError if the input's basename contains no ".".
    """
    input_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(input_basename)
    ext_to_use = input_basename.rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" % (output_basename,
                                             ext_to_use))
    print("Adding introns to GFF...")
    print("  - Input: %s" % (gff_filename))
    print("  - Output: %s" % (output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." % (output_filename))
        return output_filename
    # NOTE(review): the database object is not used below; the call is
    # kept in case constructing it has effects this function relies on.
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_filename,
                                        reverse_recs=True)
    t1 = time.time()
    # Parse before creating the output: a failed parse must not leave an
    # empty output file, because the existence check above would make
    # every later call skip the work.
    genes = gene_utils.load_genes_from_gff(gff_filename)
    # The context manager makes sure the handle is flushed and closed.
    with open(output_filename, "w") as gff_out_file:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            gene_rec = gene_tree[gene_id]["gene"]
            # Write the GFF record
            gff_out.write(gene_rec)
            # Write out the mRNAs and their exons
            for mRNA in gene_obj.isoforms:
                curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA.label]
                gff_out.write(curr_mRNA["record"])
                curr_exons = curr_mRNA["exons"]
                for exon in curr_exons:
                    gff_out.write(curr_exons[exon]["record"])
            # Now output the introns
            for isoform in gene_obj.isoforms:
                for first_exon, second_exon in zip(isoform.parts,
                                                   isoform.parts[1:]):
                    # Intron start coordinate is the coordinate right after
                    # the end of the first exon, intron end coordinate is the
                    # coordinate just before the beginning of the second exon
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1
                    # NOTE(review): ">=" also skips gaps where start == end;
                    # confirm that dropping those is intended.
                    if intron_start >= intron_end:
                        continue
                    # Create record for this intron
                    intron_id = "%s:%d-%d:%s.intron" \
                        % (gene_obj.chrom,
                           intron_start,
                           intron_end,
                           gene_obj.strand)
                    intron_rec = \
                        miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                           "intron",
                                           intron_start, intron_end,
                                           strand=gene_obj.strand,
                                           attributes={"ID": [intron_id],
                                                       "Parent": [isoform.label]})
                    gff_out.write(intron_rec)
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
Ejemplo n.º 9
0
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene",
                                           "mRNA",
                                           "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

             U           D

    [ U P ]-----[ S E ]-----[ D N ]

            a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             c < d

    Parameters:
      gff_fname: input GFF file with the events.
      fasta_fname: passed through to output_fasta_seqs_from_gff().
      output_dir: receives "<name>.fa"; the coordinates GFF goes into
        its "gff_coords" subdirectory.
      with_flanking_introns: also emit up/down intron records.
      flanking_introns_coords: optional sequence (a, b, c, d) as
        described above; only applied when with_flanking_introns is set.
      overwrite: when false and the FASTA output exists, return its
        path without doing any work.
      entries_to_include: not read by this function; the default is
        left untouched to keep the signature stable.

    Returns the path of the FASTA output file.

    Raises Exception when flanking introns are requested but their
    coordinates cannot be determined for a gene.
    """
    # Load GFF genes
    # NOTE(review): the database object is not used below; the call is
    # kept in case constructing it has effects this function relies on.
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_fname,
                                        reverse_recs=True)
    file_basename = re.sub(r"\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" % (file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            % (output_basename,
               flanking_introns_coords[0],
               flanking_introns_coords[1],
               flanking_introns_coords[2],
               flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir, "%s.gff" % (output_basename))
    fasta_output_fname = os.path.join(output_dir, "%s.fa" % (output_basename))
    if not overwrite and os.path.isfile(fasta_output_fname):
        print("Output file %s exists. Skipping..." % (fasta_output_fname))
        return fasta_output_fname
    print("Outputting GFF coordinates to: %s" % (gff_output_fname))
    if os.path.isfile(gff_output_fname):
        print("  - Overwriting existing file")
    print("Outputting sequences to: %s" % (fasta_output_fname))
    if os.path.isfile(fasta_output_fname):
        print("  - Overwriting existing file")
    genes = gene_utils.load_genes_from_gff(gff_fname)
    # The context manager closes the coordinates file even if a gene
    # fails, and guarantees it is flushed before it is read back below.
    with open(gff_output_fname, "w") as gff_out_file:
        gff_out = miso_gff_utils.Writer(gff_out_file)
        for gene_id in genes:
            gene_info = genes[gene_id]
            gene_tree = gene_info["hierarchy"]
            gene_obj = gene_info["gene_object"]
            event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
            # Skip genes without event records. This check has to come
            # before any lookup on event_recs.
            if event_recs is None:
                continue
            long_mRNA_id = event_recs["long_mRNA"].get_id()
            # GFF records to write for the current gene:
            # up, se, and dn exons
            recs_to_write = [event_recs["up_exon"]["record"],
                             event_recs["se_exon"]["record"],
                             event_recs["dn_exon"]["record"]]
            if with_flanking_introns:
                introns_coords = \
                    get_flanking_introns_coords(gene_obj)
                if introns_coords is None:
                    raise Exception("Cannot find flanking introns coordinates.")
                # Full upstream intron
                up_intron_start, up_intron_end = \
                    introns_coords["up_intron"]
                up_intron_len = up_intron_end - up_intron_start + 1
                # Full downstream intron
                dn_intron_start, dn_intron_end = \
                    introns_coords["dn_intron"]
                dn_intron_len = dn_intron_end - dn_intron_start + 1
                se_exon_rec = event_recs["se_exon"]["record"]
                # If given custom coordinates, use them instead of entire
                # up/down flanking intronic coordinates.
                if flanking_introns_coords is not None:
                    a, b, c, d = [int(flanking_introns_coords[i])
                                  for i in range(4)]
                    a, b, c, d = error_check_intronic_coords(a, b, c, d,
                                                             up_intron_len,
                                                             dn_intron_len)
                    # Upstream offsets are added to the SE start,
                    # downstream offsets to the SE end.
                    up_intron_start = se_exon_rec.start + a
                    up_intron_end = se_exon_rec.start + b
                    dn_intron_start = se_exon_rec.end + c
                    dn_intron_end = se_exon_rec.end + d
                # Make GFF records for up/dn intronic sequences
                chrom = se_exon_rec.seqid
                source = se_exon_rec.source
                strand = se_exon_rec.strand
                up_intron_str = "%s.up_intron" % (long_mRNA_id)
                up_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       up_intron_start, up_intron_end,
                                       strand=strand,
                                       attributes={"ID": [up_intron_str],
                                                   "Parent": [gene_obj.label]})
                dn_intron_str = "%s.dn_intron" % (long_mRNA_id)
                dn_intron_rec = \
                    miso_gff_utils.GFF(chrom, source, "intron",
                                       dn_intron_start, dn_intron_end,
                                       strand=strand,
                                       attributes={"ID": [dn_intron_str],
                                                   "Parent": [gene_obj.label]})
                recs_to_write.append(up_intron_rec)
                recs_to_write.append(dn_intron_rec)
            # Write out records to GFF
            for rec in recs_to_write:
                gff_out.write(rec)
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    """
    Copy the genes of a GFF file to "<output_gff3_filename>_introns.gff3",
    appending "intron" records for each isoform.

    Per gene, the output holds the gene record, every mRNA record
    followed by its exon records, and then the intron records produced
    by two passes over each isoform's parts (see the inline comments).

    Returns None. Progress and timing are printed to stdout.

    NOTE(review): every intron record is written with ID set to
    gene_obj.label, so IDs repeat within a gene; the intron_id string
    that is built is never used. Confirm whether that is intended.
    NOTE(review): the output file handle is never closed explicitly.
    """
    output_filename = os.path.join("%s_introns.gff3" %(output_gff3_filename))
    print "Adding introns to GFF..."
    print "  - Input: %s" %(gff_filename)
    print "  - Output: %s" %(output_filename)
    gff_out = gff_utils.Writer(open(output_filename, "w"))
    # NOTE(review): gff_db is not used anywhere below.
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename, reverse_recs=True)
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        gene_rec = gene_tree[gene_id]["gene"]
        # Gene bounds are parsed from the record's str() form by taking
        # comma-separated fields 3 and 4 -- presumably the start and end
        # columns; this depends on the record's string format (TODO confirm).
        gene_start = int(str(gene_tree[gene_id]['gene']).split(",")[3].strip(" "))
        gene_end = int(str(gene_tree[gene_id]['gene']).split(",")[4].strip(" "))
        # Write the GFF record
        gff_out.write(gene_rec)
        # Write out the mRNAs, their exons, and then
        # input the introns
        for mRNA_id in gene_tree[gene_id]["mRNAs"]:
            curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA_id]
            gff_out.write(curr_mRNA["record"])
            # Write out the exons
            curr_exons = gene_tree[gene_id]["mRNAs"][mRNA_id]["exons"]
            #curr_cds= gene_tree[gene_id]["mRNAs"][mRNA_id]["CDSs"]
            for exon in curr_exons:
                gff_out.write(curr_exons[exon]["record"])
                #gff_out.write(curr_cds[cds]["record"])
        # Now output the introns
        for isoform in gene_obj.isoforms:
            #print gene_obj.isoforms
            intron_coords = []
            # First pass: each part is paired with ITSELF (both zip
            # arguments are isoform.parts), so first_exon and second_exon
            # are the same object in every iteration. Presumably meant to
            # cover the regions between the gene bounds and the outermost
            # exons -- TODO confirm intent.
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts):
                # Both markers are reset on every iteration; 0 means "not set".
                final_exon_end=0
                first_exon_start=0
                # Set only when the number of introns recorded so far for
                # this isoform equals len(parts) - 1.
                if(len(intron_coords)==len(isoform.parts)-1):
                    final_exon_end=second_exon.end
                # Set only for isoforms made of a single part.
                if(len(isoform.parts)==1):
                    first_exon_start=first_exon.start
                #print first_exon

                # Case 1: gene starts at coordinate 1 and the (single) part
                # starts later -> interval from 1 up to just before the part.
                if(first_exon_start>1 and gene_start==1):
                    intron_start=1
                    intron_end = first_exon_start - 1

                # Case 2: the gene extends past the marked part end ->
                # interval from just after that end up to the gene end.
                elif(gene_end>final_exon_end and final_exon_end!=0):
                    #print str(isoform.parts[len(isoform.parts)-1]).split(",")
                    # NOTE(review): this prints a bare number to stdout for
                    # every hit; looks like leftover debugging output.
                    print first_exon_start
                    intron_start=final_exon_end+1
                    intron_end=gene_end
                # Otherwise: because both names refer to the same part, this
                # gives start = end + 1 and end = start - 1, which the check
                # below rejects whenever the part has end >= start - 2.
                else:   
                    intron_start = first_exon.end + 1
                    intron_end = second_exon.start - 1

                # Skip empty or single-coordinate intervals.
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                # (intron_id is built but not used; the record's ID is the
                # gene label.)
                intron_id = "%s:%d-%d:%s" %(gene_obj.chrom,
                                            intron_start,
                                            intron_end,gene_obj.strand)
                intron_rec = \
                    gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                  intron_start, intron_end,".",gene_obj.strand,
                                  attributes={"ID": [gene_obj.label],
                                              "Parent": [isoform.label]})
                gff_out.write(intron_rec)



            # Second pass: consecutive pairs of parts; one intron per gap.
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts[1::1]): 
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                #print "test"
                #print "s"
                #else:    
                #    intron_start = first_exon.end + 1
                #    intron_end = second_exon.start - 1

                intron_start = first_exon.end + 1
                intron_end = second_exon.start - 1
                # Skip empty or single-coordinate gaps.
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                # (intron_id is built but not used; the record's ID is the
                # gene label.)
                intron_id = "%s:%d-%d:%s" %(gene_obj.chrom,
                                            intron_start,
                                            intron_end,gene_obj.strand)
                intron_rec = \
                    gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                  intron_start, intron_end,".",gene_obj.strand,
                                  attributes={"ID": [gene_obj.label],
                                              "Parent": [isoform.label]})
                gff_out.write(intron_rec)
    t2 = time.time()
    print "Addition took %.2f minutes." %((t2 - t1)/60.)