Beispiel #1
0
def output_parts_as_gff(gff_out,
                        parts,
                        chrom,
                        strand,
                        source=".",
                        rec_type="exon",
                        gene_id="NA",
                        na_val="NA"):
    """
    Output a set of parts to GFF.
    """
    for part in parts:
        attributes = {
            'ID': ["%s.%s" % (rec_type, part.label)],
            'Parent': [part.parent],
            'gene_id': [gene_id],
        }
        rec_start, rec_end = part.start, part.end
        gff_rec = gff_utils.GFF(chrom,
                                source,
                                rec_type,
                                rec_start,
                                rec_end,
                                attributes=attributes,
                                strand=strand)
        gff_out.write(gff_rec)
Beispiel #2
0
def shorten_rec(gff_rec, old_to_new_ids, max_id_len):
    """
    Shorten GFF record. Return new record.
    """
    rec_id = ",".join(gff_rec.attributes["ID"])
    if "Parent" in gff_rec.attributes:
        rec_parent = ",".join(gff_rec.attributes["Parent"])
    else:
        rec_parent = ""
    
    new_attributes = gff_rec.attributes.copy()

    if rec_id not in old_to_new_ids:
        # Set new attributes for ID field
        old_to_new_ids[rec_id] = shorten_id(rec_id, max_id_len)
    new_attributes['ID'] = [old_to_new_ids[rec_id]]
    new_attributes['Name'] = [old_to_new_ids[rec_id]]
    if rec_parent not in old_to_new_ids:
        # Set new attributes for Parent field
        old_to_new_ids[rec_parent] = shorten_id(rec_parent, max_id_len)
    if old_to_new_ids[rec_parent] != "":
        # Only set parent record if there is a parent 
        new_attributes['Parent'] = [old_to_new_ids[rec_parent]]
    new_gff_rec = GFF.GFF(seqid=gff_rec.seqid,
                          source=gff_rec.source,
                          type=gff_rec.type,
                          start=gff_rec.start,
                          end=gff_rec.end,
                          score=gff_rec.score,
                          strand=gff_rec.strand,
                          phase=gff_rec.phase,
                          attributes=new_attributes)

    return new_gff_rec
def instert_introns_to_gff3(gff_filename, output_gff3_filename):
    output_filename = os.path.join("%s_introns.gff3" % (output_gff3_filename))
    print "Adding introns to GFF..."
    print "  - Input: %s" % (gff_filename)
    print "  - Output: %s" % (output_filename)
    gff_out = gff_utils.Writer(open(output_filename, "w"))
    gff_db = gff_utils.GFFDatabase(from_filename=gff_filename,
                                   reverse_recs=True)
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        gene_rec = gene_tree[gene_id]["gene"]
        # Write the GFF record
        gff_out.write(gene_rec)
        # Write out the mRNAs, their exons, and then
        # input the introns
        for mRNA_id in gene_tree[gene_id]["mRNAs"]:
            curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA_id]
            gff_out.write(curr_mRNA["record"])
            # Write out the exons
            curr_exons = gene_tree[gene_id]["mRNAs"][mRNA_id]["exons"]
            for exon in curr_exons:
                gff_out.write(curr_exons[exon]["record"])
        # Now output the introns
        for isoform in gene_obj.isoforms:
            intron_coords = []
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts[1::1]):
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                intron_start = first_exon.end + 1
                intron_end = second_exon.start - 1
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                intron_id = "%s:%s:%d-%d:%s" % (isoform.label, gene_obj.chrom,
                                                intron_start, intron_end,
                                                gene_obj.strand)
                intron_rec = \
                    gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                  intron_start, intron_end, ".", gene_obj.strand, ".",
                                  attributes={"ID": [intron_id], "Parent": [isoform.label]})
                gff_out.write(intron_rec)
    t2 = time.time()
    print "Addition took %.2f minutes." % ((t2 - t1) / 60.)
def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.
    """
    output_basename = \
        utils.trim_gff_ext(os.path.basename(gff_filename))
    ext_to_use = os.path.basename(gff_filename).rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" %(output_basename,
                                            ext_to_use))
    print "Adding introns to GFF..."
    print "  - Input: %s" %(gff_filename)
    print "  - Output: %s" %(output_filename)
    if os.path.isfile(output_filename):
        print "Found file %s, skipping.." %(output_filename)
        return output_filename
    gff_out = miso_gff_utils.Writer(open(output_filename, "w"))
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_filename,
                                        reverse_recs=True)
    t1 = time.time()
    genes = gene_utils.load_genes_from_gff(gff_filename)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        gene_rec = gene_tree[gene_id]["gene"]
        # Write the GFF record
        gff_out.write(gene_rec)
        # Write out the mRNAs, their exons, and then
        # input the introns
        for mRNA in gene_obj.isoforms:
            mRNA_id = mRNA.label
            curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA_id]
            gff_out.write(curr_mRNA["record"])
            # Write out the exons
            curr_exons = gene_tree[gene_id]["mRNAs"][mRNA_id]["exons"]
            for exon in curr_exons:
                gff_out.write(curr_exons[exon]["record"])
        # Now output the introns
        for isoform in gene_obj.isoforms:
            intron_coords = []
            for first_exon, second_exon in zip(isoform.parts,
                                               isoform.parts[1::1]):
                # Intron start coordinate is the coordinate right after
                # the end of the first exon, intron end coordinate is the
                # coordinate just before the beginning of the second exon
                intron_start = first_exon.end + 1
                intron_end = second_exon.start - 1
                if intron_start >= intron_end:
                    continue
                intron_coords.append((intron_start, intron_end))
                # Create record for this intron
                intron_id = "%s:%d-%d:%s.intron" \
                    %(gene_obj.chrom,
                      intron_start,
                      intron_end,
                      gene_obj.strand)
                intron_rec = \
                    miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                       intron_start, intron_end,
                                       strand=gene_obj.strand,
                                       attributes={"ID": [intron_id],
                                                   "Parent": [isoform.label]})
                gff_out.write(intron_rec)
    t2 = time.time()
    print "Addition took %.2f minutes." %((t2 - t1)/60.)
def fetch_seq_from_gff(gff_fname, fasta_fname, output_dir,
                       with_flanking_introns=False,
                       flanking_introns_coords=None,
                       overwrite=True,
                       entries_to_include=["gene",
                                           "mRNA",
                                           "exon"]):
    """
    Fetch sequence from GFF file.

    Outputs:

    (1) GFF file containing an annotation of the sequences.

    (2) FASTA file with the actual sequences.

    If asked, fetch the flanking intronic sequences.

    Flanking regions are marked below:

      U: region of upstream intron
      D: region of downstream intron

             U           D

    [ U P ]-----[ S E ]-----[ D N ]

            a,b         c,d

    a,b,c,d correspond to optional flanking intron coordinates
    that determine the regions of the upstream/downstream
    introns that should be fetched:

       a, b: negative ints, position relative to 5' splice site of SE
             a < b

       c, d: positive ints, position relative to 3' splice site of SE
             c < d
    """
    # Load GFF genes
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_fname,
                                        reverse_recs=True)
    file_basename = re.sub("\.gff3?", "",
                           os.path.basename(gff_fname))
    output_basename = "%s.event_seqs" %(file_basename)
    if flanking_introns_coords is not None:
        output_basename = "%s.flank_intronic_%s_%s_%s_%s" \
            %(output_basename,
              flanking_introns_coords[0],
              flanking_introns_coords[1],
              flanking_introns_coords[2],
              flanking_introns_coords[3])
    gff_outdir = os.path.join(output_dir, "gff_coords")
    utils.make_dir(gff_outdir)
    gff_output_fname = os.path.join(gff_outdir, "%s.gff" %(output_basename))
    fasta_output_fname = os.path.join(output_dir, "%s.fa" %(output_basename))
    if not overwrite:
        if os.path.isfile(fasta_output_fname):
            print "Output file %s exists. Skipping..." %(fasta_output_fname)
            return fasta_output_fname
    print "Outputting GFF coordinates to: %s" %(gff_output_fname)
    if os.path.isfile(gff_output_fname):
        print "  - Overwriting existing file"
    print "Outputting sequences to: %s" %(fasta_output_fname)
    if os.path.isfile(fasta_output_fname):
        print "  - Overwriting existing file"
    genes = gene_utils.load_genes_from_gff(gff_fname)
    gff_out_file = open(gff_output_fname, "w")
    gff_out = miso_gff_utils.Writer(gff_out_file)
    for gene_id in genes:
        gene_info = genes[gene_id]
        gene_tree = gene_info["hierarchy"]
        gene_obj = gene_info["gene_object"]
        # GFF records to write for the current gene
        recs_to_write = []
        # For mRNA entries, extract the flanking introns of the
        # alternative exon if asked
        event_recs = get_event_recs_from_gene(gene_obj, gene_tree)
        long_mRNA_id = event_recs["long_mRNA"].get_id()
        if event_recs is None:
            continue
        # Write out up, se, and dn exons
        recs_to_write.extend([event_recs["up_exon"]["record"],
                              event_recs["se_exon"]["record"],
                              event_recs["dn_exon"]["record"]])
        if with_flanking_introns:
            introns_coords = \
                get_flanking_introns_coords(gene_obj)
            if introns_coords == None:
                raise Exception, "Cannot find flanking introns coordinates."
                sys.exit(1)
            # Fetch upstream intron sequence
            up_intron_start, up_intron_end = \
                introns_coords["up_intron"]
            up_intron_len = up_intron_end - up_intron_start + 1
            # Fetch downstream intron sequence
            dn_intron_start, dn_intron_end = \
                introns_coords["dn_intron"]
            dn_intron_len = dn_intron_end - dn_intron_start + 1
            # If given custom coordinates, use them instead of entire up/down
            # flanking intronic coordinates.
            se_exon_rec = event_recs["se_exon"]["record"]
            if flanking_introns_coords is not None:
                # (start,end) of upstream intron sequence
                a, b = \
                    int(flanking_introns_coords[0]), int(flanking_introns_coords[1])
                c, d = \
                    int(flanking_introns_coords[2]), int(flanking_introns_coords[3])
                a, b, c, d = error_check_intronic_coords(a, b, c, d,
                                                         up_intron_len, dn_intron_len)
                # Coordinates relative to 5' splice site of sequence to be fetched
                # The start of upstream intron sequence is negative from the 5' ss
                up_intron_start = se_exon_rec.start + a
                up_intron_end = se_exon_rec.start + b
                dn_intron_start = se_exon_rec.end + c
                dn_intron_end = se_exon_rec.end + d
            # Make GFF records for up/dn intronic sequences
            chrom = se_exon_rec.seqid
            source = se_exon_rec.source
            rec_type = "intron"
            strand = se_exon_rec.strand
            up_intron_str = "%s.up_intron" %(long_mRNA_id)
            up_intron_rec = \
                miso_gff_utils.GFF(chrom, source, "intron",
                              up_intron_start, up_intron_end,
                              strand=strand,
                              attributes={"ID": [up_intron_str],
                                          "Parent": [gene_obj.label]})
            dn_intron_str = "%s.dn_intron" %(long_mRNA_id)
            dn_intron_rec = \
                miso_gff_utils.GFF(chrom, source, "intron",
                                   dn_intron_start, dn_intron_end,
                                   strand=strand,
                                   attributes={"ID": [dn_intron_str],
                                               "Parent": [gene_obj.label]})
            recs_to_write.append(up_intron_rec)
            recs_to_write.append(dn_intron_rec)
        # Write out records to GFF
        for rec in recs_to_write:
            gff_out.write(rec)
    gff_out_file.close()
    # Output FASTA sequences
    output_fasta_seqs_from_gff(gff_output_fname,
                               fasta_fname,
                               fasta_output_fname)
    return fasta_output_fname