def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    output_filename = os.path.join("%s_introns.gff3" % (output_gff3_filename))
    print "Adding introns to GFF..."
    print "  - Input: %s" % (gff_filename)
    print "  - Output: %s" % (output_filename)
    gff_out = gff_utils.Writer(open(output_filename, "w"))
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        gene_rec = gene_tree[gene_id]["gene"]
        # Write the GFF record
        gff_out.write(gene_rec)
        # Write out the mRNAs, their exons, and then
        # input the introns
        for mRNA_id in gene_tree[gene_id]["mRNAs"]:
            curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA_id]
            gff_out.write(curr_mRNA["record"])
            # Write out the exons
            curr_exons = gene_tree[gene_id]["mRNAs"][mRNA_id]["exons"]
            for exon in curr_exons:
                gff_out.write(curr_exons[exon]["record"])
        # Now output the introns
        for isoform in gene_obj.isoforms:
            intron_coords = []
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts[1::1]):
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                intron_start = first_exon.end + 1
                intron_end = second_exon.start - 1
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                intron_id = "%s:%s:%d-%d:%s" % (isoform.label, gene_obj.chrom,
                                                intron_start, intron_end,
                                                gene_obj.strand)
                intron_rec = \
                    gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                  intron_start, intron_end, ".", gene_obj.strand, ".",
                                  attributes={"ID": [intron_id], "Parent": [isoform.label]})
                gff_out.write(intron_rec)
    t2 = time.time()
    print "Addition took %.2f minutes." % ((t2 - t1) / 60.)
Example #2
0
def shorten_gff(input_gff, output_gff, max_id_len=75):
#     # List of search and replace with IDs
#     replace_ids = []

#     # pattern to identify ID= elements
#     pat = 'ID=(.+);'


#     input_file = open(input_gff, 'r')
#     for line in input_file:
#         # find ID= elements
#         match = re.search(pat, line)
#         if match != None:
#             assert(len(match.groups()) > 0)
#             id_to_replace = match.groups()[0]
#             if len(id_to_replace) >= max_id_len:
#                 new_id = shorten_id(id_to_replace)
#                 #replace_ids.append((id_to_replace, new_id))
#                 old_to_new_ids[id_to_replace] = new_id
    """
    Replace the ith ID in old_ids with the ith ID in new_ids.
    Output result to output gff.
    """
    new_recs = []
    # Load input GFF
    t1 = time.time()
    gff_in = GFF.GFFDatabase(from_filename=input_gff,
                             reverse_recs=True)

    # Mapping from old to new IDs
    old_to_new_ids = {}
    
    for rec in gff_in:
        new_record = shorten_rec(rec, old_to_new_ids, max_id_len)
        new_recs.append(new_record)
    t2 = time.time()

    print "Loading of input GFF took %.2f seconds" %(t2 - t1)

    print "Writing revised gff to: %s" %(output_gff)

    output_file = open(output_gff, 'w')
    gff_writer = GFF.Writer(output_file)

    # Write new GFF file
    gff_writer.write_recs(new_recs)

    output_file.close()
def get_events_in_region(gff_filename, region, record_types=["gene"]):
    """
    Output Return all 'gene' entries in a given GFF file that
    intersect the given region.

    record_types is a list of GFF records to collect (e.g. gene, mRNA, ...)
    """
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    # Parse the query region
    parsed_region = parse_query_region(region)
    query_chrom, query_start, query_end, \
        query_strand = parsed_region
    matched_records = []
    num_recs = 0
    for record in gff_db:
        chrom = record.seqid
        # Name
        name = record.type
        start, end = int(record.start), int(record.end)
        strand = record.strand
        # Skip GFF records that don't match our record types
        if name not in record_types:
            continue
        num_recs += 1
        # Check that there is intersection
        if (query_chrom != chrom) or \
            (not utils.intersect_coords(query_start, query_end,
                                        start, end)):
            # Skip if chromosomes don't match or if there's no intersection
            continue
        # If strand is supplied in query region, check that
        # the strand matches
        if (query_strand is not None) and \
            (strand != query_strand):
            continue
        # Must match
        record_id = record.get_id()
        print "%s" % (record_id)
        print "  - ", record
        matched_records.append(record)
    print "Looked through %d records." % (num_recs)
    return matched_records
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.
    """
    output_basename = \
        utils.trim_gff_ext(os.path.basename(gff_filename))
    ext_to_use = os.path.basename(gff_filename).rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" %(output_basename,
                                            ext_to_use))
    print "Adding introns to GFF..."
    print "  - Input: %s" %(gff_filename)
    print "  - Output: %s" %(output_filename)
    if os.path.isfile(output_filename):
        print "Found file %s, skipping.." %(output_filename)
        return output_filename
    gff_out = miso_gff_utils.Writer(open(output_filename, "w"))
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_filename,
                                        reverse_recs=True)
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        gene_rec = gene_tree[gene_id]["gene"]
        # Write the GFF record
        gff_out.write(gene_rec)
        # Write out the mRNAs, their exons, and then
        # input the introns
        for mRNA in gene_obj.isoforms:
            mRNA_id = mRNA.label
            curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA_id]
            gff_out.write(curr_mRNA["record"])
            # Write out the exons
            curr_exons = gene_tree[gene_id]["mRNAs"][mRNA_id]["exons"]
            for exon in curr_exons:
                gff_out.write(curr_exons[exon]["record"])
        # Now output the introns
        for isoform in gene_obj.isoforms:
            intron_coords = []
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts[1::1]):
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                intron_start = first_exon.end + 1
                intron_end = second_exon.start - 1
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                intron_id = "%s:%d-%d:%s.intron" \
                    %(gene_obj.chrom,
                      intron_start,
                      intron_end,
                      gene_obj.strand)
                intron_rec = \
                    miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                       intron_start, intron_end,
                                       strand=gene_obj.strand,
                                       attributes={"ID": [intron_id],
                                                   "Parent": [isoform.label]})
                gff_out.write(intron_rec)
    t2 = time.time()
    print "Addition took %.2f minutes." %((t2 - t1)/60.)
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene",
                                           "mRNA",
                                           "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

             U           D

    [ U P ]-----[ S E ]-----[ D N ]

            a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             c < d
    """
    # Load GFF genes
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_fname,
                                        reverse_recs=True)
    file_basename = re.sub("\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" %(file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            %(output_basename,
              flanking_introns_coords[0],
              flanking_introns_coords[1],
              flanking_introns_coords[2],
              flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir, "%s.gff" %(output_basename))
    fasta_output_fname = os.path.join(output_dir, "%s.fa" %(output_basename))
    if not overwrite:
        if os.path.isfile(fasta_output_fname):
            print "Output file %s exists. Skipping..." %(fasta_output_fname)
            return fasta_output_fname
    print "Outputting GFF coordinates to: %s" %(gff_output_fname)
    if os.path.isfile(gff_output_fname):
        print "  - Overwriting existing file"
    print "Outputting sequences to: %s" %(fasta_output_fname)
    if os.path.isfile(fasta_output_fname):
        print "  - Overwriting existing file"
    genes = gene_utils.load_genes_from_gff(gff_fname)
    gff_out_file = open(gff_output_fname, "w")
    gff_out = miso_gff_utils.Writer(gff_out_file)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        # GFF records to write for the current gene
        recs_to_write = []
        # For mRNA entries, extract the flanking introns of the
        # alternative exon if asked
        event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
        long_mRNA_id = event_recs["long_mRNA"].get_id()
        if event_recs is None:
            continue
        # Write out up, se, and dn exons
        recs_to_write.extend([event_recs["up_exon"]["record"],
                              event_recs["se_exon"]["record"],
                              event_recs["dn_exon"]["record"]])
        if with_flanking_introns:
            introns_coords = \
                get_flanking_introns_coords(gene_obj)
            if introns_coords == None:
                raise Exception, "Cannot find flanking introns coordinates."
                sys.exit(1)
            # Fetch upstream intron sequence
            up_intron_start, up_intron_end = \
                introns_coords["up_intron"]
            up_intron_len = up_intron_end - up_intron_start + 1
            # Fetch downstream intron sequence
            dn_intron_start, dn_intron_end = \
                introns_coords["dn_intron"]
            dn_intron_len = dn_intron_end - dn_intron_start + 1
            # If given custom coordinates, use them instead of entire up/down
            # flanking intronic coordinates.
            se_exon_rec = event_recs["se_exon"]["record"]
            if flanking_introns_coords is not None:
                # (start,end) of upstream intron sequence
                a, b = \
                    int(flanking_introns_coords[0]), int(flanking_introns_coords[1])
                c, d = \
                    int(flanking_introns_coords[2]), int(flanking_introns_coords[3])
                a, b, c, d = error_check_intronic_coords(a, b, c, d,
                                                         up_intron_len, dn_intron_len)
                # Coordinates relative to 5' splice site of sequence to be fetched
                # The start of upstream intron sequence is negative from the 5' ss
                up_intron_start = se_exon_rec.start + a
                up_intron_end = se_exon_rec.start + b
                dn_intron_start = se_exon_rec.end + c
                dn_intron_end = se_exon_rec.end + d
            # Make GFF records for up/dn intronic sequences
            chrom = se_exon_rec.seqid
            source = se_exon_rec.source
            rec_type = "intron"
            strand = se_exon_rec.strand
            up_intron_str = "%s.up_intron" %(long_mRNA_id)
            up_intron_rec = \
                miso_gff_utils.GFF(chrom, source, "intron",
                              up_intron_start, up_intron_end,
                              strand=strand,
                              attributes={"ID": [up_intron_str],
                                          "Parent": [gene_obj.label]})
            dn_intron_str = "%s.dn_intron" %(long_mRNA_id)
            dn_intron_rec = \
                miso_gff_utils.GFF(chrom, source, "intron",
                                   dn_intron_start, dn_intron_end,
                                   strand=strand,
                                   attributes={"ID": [dn_intron_str],
                                               "Parent": [gene_obj.label]})
            recs_to_write.append(up_intron_rec)
            recs_to_write.append(dn_intron_rec)
        # Write out records to GFF
        for rec in recs_to_write:
            gff_out.write(rec)
    gff_out_file.close()
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname
def get_const_exons_by_gene(gff_filename,
                            output_dir,
                            output_filename=None,
                            all_constitutive=False,
                            min_size=0,
                            output_format='gff'):
    """
    Get consitutive exons from GFF file.

    Arguments:
    - gff_filename: GFF input filename
    - output_dir: output directory

    Optional arguments:

    - min_size: minimum exon size
    - output_format: gff or BED
    - all_constitutive: treat all exons as constitutive
    """
    print "Getting constitutive exons..."
    print "  - Input GFF: %s" % (gff_filename)
    print "  - Output dir: %s" % (output_dir)
    print "  - Output format: %s" % (output_format)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    if min_size > 0:
        print "  - Including only exons greater than or " \
              "equal to %d-bp" \
              %(min_size)

    t1 = time.time()
    gff_in = gff_utils.GFFDatabase(from_filename=gff_filename)

    const_exons_by_gene = []

    num_exons = 0

    for gene, mRNAs in gff_in.mRNAs_by_gene.iteritems():
        # For each gene, look at all mRNAs and return constitutive exon
        curr_const_exons = \
            get_const_exons_from_mRNA(gff_in, mRNAs,
                                      all_constitutive=all_constitutive,
                                      min_size=min_size)
        const_exons_by_gene.extend(curr_const_exons)
        num_exons += len(curr_const_exons)

    t2 = time.time()

    basename = re.sub("[.]gff3?", "", os.path.basename(gff_filename))
    if output_filename is None:
        # Create default output filename if not
        # given one as argument
        output_filename = os.path.join(output_dir,
                                       "%s.min_%d.const_exons.gff" \
                                       %(basename,
                                         min_size))
    if not all_constitutive:
        print "Constitutive exon retrieval took %.2f seconds (%d exons)." \
              %((t2 - t1), num_exons)
        output_exons_to_file(const_exons_by_gene,
                             output_filename,
                             output_format=output_format)
    else:
        print "Constitutive exons GFF was given, so not outputting " \
              "another one."
    return const_exons_by_gene, output_filename