Exemple #1
0
def intersect_clusters_with_gff(logger,
                                clusters_fname,
                                gff_filenames,
                                output_dir):
    """
    Intersect CLIP clusters with a set of GFF filenames
    (e.g. GFFs containing events.)

    For every GFF file that exists on disk, intersectBed is run with
    the clusters file as -a and the GFF as -b (flags: -loj -f 1), and
    its standard output is written to
    <output_dir>/<gff_label>.clusters.bed.  An output file that is
    already present is reused without rerunning the command.  GFF
    files that are missing, or whose intersection fails, are reported
    through logger.critical and left out of the result.

    Parameters:
      - logger: object providing info() and critical()
      - clusters_fname: path of the clusters file (intersectBed -a)
      - gff_filenames: iterable of GFF paths (each used as -b)
      - output_dir: directory in which the BED outputs are written

    Returns the list of output BED filenames that are available,
    in the order of gff_filenames.
    """
    # Imported locally so the block does not rely on the module
    # header for these two standard-library modules.
    import shlex
    import subprocess

    event_clusters_fnames = []
    logger.info("Intersecting clusters %s with GFFs.." \
                %(clusters_fname))
    for gff_fname in gff_filenames:
        if not os.path.isfile(gff_fname):
            logger.critical("Cannot find events GFF file %s" \
                            %(gff_fname))
            continue
        gff_label = os.path.basename(utils.trim_gff_ext(gff_fname))
        event_clusters_fname = \
            os.path.join(output_dir,
                         "%s.clusters.bed" %(gff_label))
        logger.info("  - Processing GFF %s" %(gff_label))
        if os.path.isfile(event_clusters_fname):
            logger.info("Found %s, skipping.." %(event_clusters_fname))
            event_clusters_fnames.append(event_clusters_fname)
            continue
        # Human-readable form of the command, used for logging only
        intersectBed_cmd = \
            "%s -a %s -b %s -loj -f 1 > %s" \
            %(bedtools_utils.intersectBed_path,
              clusters_fname,
              gff_fname,
              event_clusters_fname)
        # Intersect the clusters for this sample with each
        # GFF file.  The file names are passed as separate arguments
        # (no shell involved) so paths containing spaces or shell
        # metacharacters are handled verbatim.  The executable setting
        # is still split into words, as the shell used to do.
        intersectBed_args = \
            shlex.split(str(bedtools_utils.intersectBed_path)) + \
            ["-a", clusters_fname,
             "-b", gff_fname,
             "-loj",
             "-f", "1"]
        logger.info("  - Executing: %s" %(intersectBed_cmd))
        try:
            with open(event_clusters_fname, "w") as bed_out:
                ret_val = subprocess.call(intersectBed_args,
                                          stdout=bed_out)
        except (IOError, OSError):
            # The output file could not be created or the executable
            # could not be started: treat it like a failed command.
            ret_val = -1
        if ret_val == 0:
            event_clusters_fnames.append(event_clusters_fname)
            continue
        logger.critical("Cannot intersect clusters with %s" \
                        %(gff_fname))
        # A failed run leaves an empty or truncated output file
        # behind; remove it so that a later call does not mistake
        # it for a finished result and skip the intersection.
        if os.path.isfile(event_clusters_fname):
            try:
                os.remove(event_clusters_fname)
            except OSError:
                logger.critical("Cannot remove incomplete output %s" \
                                %(event_clusters_fname))
    return event_clusters_fnames
Exemple #2
0
def intersect_clusters_with_gff(logger, clusters_fname, gff_filenames,
                                output_dir):
    """
    Intersect CLIP clusters with a set of GFF filenames
    (e.g. GFFs containing events.)

    For every GFF file that exists on disk, intersectBed is run with
    the clusters file as -a and the GFF as -b (flags: -loj -f 1), and
    its standard output is written to
    <output_dir>/<gff_label>.clusters.bed.  An output file that is
    already present is reused without rerunning the command.  GFF
    files that are missing, or whose intersection fails, are reported
    through logger.critical and left out of the result.

    Parameters:
      - logger: object providing info() and critical()
      - clusters_fname: path of the clusters file (intersectBed -a)
      - gff_filenames: iterable of GFF paths (each used as -b)
      - output_dir: directory in which the BED outputs are written

    Returns the list of output BED filenames that are available,
    in the order of gff_filenames.
    """
    # Imported locally so the block does not rely on the module
    # header for these two standard-library modules.
    import shlex
    import subprocess

    event_clusters_fnames = []
    logger.info("Intersecting clusters %s with GFFs.." \
                %(clusters_fname))
    for gff_fname in gff_filenames:
        if not os.path.isfile(gff_fname):
            logger.critical("Cannot find events GFF file %s" \
                            %(gff_fname))
            continue
        gff_label = os.path.basename(utils.trim_gff_ext(gff_fname))
        event_clusters_fname = \
            os.path.join(output_dir,
                         "%s.clusters.bed" %(gff_label))
        logger.info("  - Processing GFF %s" % (gff_label))
        if os.path.isfile(event_clusters_fname):
            logger.info("Found %s, skipping.." % (event_clusters_fname))
            event_clusters_fnames.append(event_clusters_fname)
            continue
        # Human-readable form of the command, used for logging only
        intersectBed_cmd = \
            "%s -a %s -b %s -loj -f 1 > %s" \
            %(bedtools_utils.intersectBed_path,
              clusters_fname,
              gff_fname,
              event_clusters_fname)
        # Intersect the clusters for this sample with each
        # GFF file.  The file names are passed as separate arguments
        # (no shell involved) so paths containing spaces or shell
        # metacharacters are handled verbatim.  The executable setting
        # is still split into words, as the shell used to do.
        intersectBed_args = \
            shlex.split(str(bedtools_utils.intersectBed_path)) + \
            ["-a", clusters_fname,
             "-b", gff_fname,
             "-loj",
             "-f", "1"]
        logger.info("  - Executing: %s" % (intersectBed_cmd))
        try:
            with open(event_clusters_fname, "w") as bed_out:
                ret_val = subprocess.call(intersectBed_args,
                                          stdout=bed_out)
        except (IOError, OSError):
            # The output file could not be created or the executable
            # could not be started: treat it like a failed command.
            ret_val = -1
        if ret_val == 0:
            event_clusters_fnames.append(event_clusters_fname)
            continue
        logger.critical("Cannot intersect clusters with %s" \
                        %(gff_fname))
        # A failed run leaves an empty or truncated output file
        # behind; remove it so that a later call does not mistake
        # it for a finished result and skip the intersection.
        if os.path.isfile(event_clusters_fname):
            try:
                os.remove(event_clusters_fname)
            except OSError:
                logger.critical("Cannot remove incomplete output %s" \
                                % (event_clusters_fname))
    return event_clusters_fnames
def _write_gene_with_introns(gff_out, gene_id, gene_info):
    """
    Write one gene to gff_out: the gene record, then each mRNA record
    followed by its exon records, then the intron records derived
    from every isoform of the gene.
    """
    gene_tree = gene_info["hierarchy"]
    gene_obj = gene_info["gene_object"]
    gene_rec = gene_tree[gene_id]["gene"]
    # Write the GFF record
    gff_out.write(gene_rec)
    # Write out the mRNAs and their exons
    for mRNA in gene_obj.isoforms:
        curr_mRNA = gene_tree[gene_id]["mRNAs"][mRNA.label]
        gff_out.write(curr_mRNA["record"])
        curr_exons = curr_mRNA["exons"]
        for exon in curr_exons:
            gff_out.write(curr_exons[exon]["record"])
    # Now output the introns
    for isoform in gene_obj.isoforms:
        # Pair every part of the isoform with the part that follows it
        for first_exon, second_exon in zip(isoform.parts,
                                           isoform.parts[1::1]):
            # Intron start coordinate is the coordinate right after
            # the end of the first exon, intron end coordinate is the
            # coordinate just before the beginning of the second exon
            intron_start = first_exon.end + 1
            intron_end = second_exon.start - 1
            # Gaps spanning a single coordinate or less are not
            # reported as introns
            if intron_start >= intron_end:
                continue
            # Create record for this intron
            intron_id = "%s:%d-%d:%s.intron" \
                %(gene_obj.chrom,
                  intron_start,
                  intron_end,
                  gene_obj.strand)
            intron_rec = \
                miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source, "intron",
                                   intron_start, intron_end,
                                   strand=gene_obj.strand,
                                   attributes={"ID": [intron_id],
                                               "Parent": [isoform.label]})
            gff_out.write(intron_rec)


def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    The result is written to
    <output_dir>/<trimmed basename>.with_introns.<ext>, where <ext> is
    the text after the last "." of the input's basename.  If that file
    already exists nothing is recomputed.  If loading or writing fails,
    any partially written output is removed and the error propagates.

    Parameters:
      - gff_filename: path of the input GFF file
      - output_dir: directory in which the new GFF is written

    Returns the output filename, whether it was just written or was
    already present.

    Raises IndexError if the basename of gff_filename contains no ".".
    """
    gff_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(gff_basename)
    ext_to_use = gff_basename.rsplit(".", 1)[1]
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" %(output_basename,
                                            ext_to_use))
    # Single-argument parenthesised prints produce the same output
    # whether print is a statement or a function.
    print("Adding introns to GFF...")
    print("  - Input: %s" %(gff_filename))
    print("  - Output: %s" %(output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." %(output_filename))
        return output_filename
    # NOTE(review): gff_db is never read below; it is kept because
    # building it may have effects (e.g. validating the input) that
    # are not visible from here -- confirm and drop if it has none.
    gff_db = miso_gff_utils.GFFDatabase(from_filename=gff_filename,
                                        reverse_recs=True)
    t1 = time.time()
    # Load the input before creating the output file, so that a failed
    # load does not leave an empty output that later calls would skip.
    genes = gene_utils.load_genes_from_gff(gff_filename)
    try:
        # The with-block guarantees the output is flushed and closed
        with open(output_filename, "w") as out_file:
            gff_out = miso_gff_utils.Writer(out_file)
            for gene_id in genes:
                _write_gene_with_introns(gff_out, gene_id, genes[gene_id])
    except Exception:
        # Do not leave a truncated file behind: the existence check
        # above would treat it as a finished result next time.
        if os.path.isfile(output_filename):
            os.remove(output_filename)
        raise
    t2 = time.time()
    print("Addition took %.2f minutes." %((t2 - t1)/60.))
    return output_filename