def intersect_clusters_with_gff(logger, clusters_fname, gff_filenames,
                                output_dir):
    """
    Intersect CLIP clusters with a set of GFF filenames
    (e.g. GFFs containing events.)

    For every GFF in 'gff_filenames' that exists on disk, runs the
    intersectBed executable named by bedtools_utils.intersectBed_path
    with the clusters file as -a and the GFF as -b, redirecting the
    result to '<output_dir>/<gff label>.clusters.bed'.

    Arguments:
      - logger: logging-style object (info() / critical() are used)
      - clusters_fname: path of the clusters file passed as -a
      - gff_filenames: iterable of GFF paths, each passed as -b
      - output_dir: directory receiving the per-GFF output files

    Returns the list of output filenames that are available, i.e.
    those that already existed (and were skipped) plus those that
    were produced successfully by this call. Missing GFFs and failed
    intersections are logged as critical and left out of the list.
    """
    event_clusters_fnames = []
    logger.info("Intersecting clusters %s with GFFs..", clusters_fname)
    for gff_fname in gff_filenames:
        if not os.path.isfile(gff_fname):
            logger.critical("Cannot find events GFF file %s", gff_fname)
            continue
        gff_label = os.path.basename(utils.trim_gff_ext(gff_fname))
        event_clusters_fname = \
            os.path.join(output_dir, "%s.clusters.bed" % (gff_label))
        logger.info(" - Processing GFF %s", gff_label)
        # Reuse output left by an earlier run
        if os.path.isfile(event_clusters_fname):
            logger.info("Found %s, skipping..", event_clusters_fname)
            event_clusters_fnames.append(event_clusters_fname)
            continue
        # Intersect the clusters for this sample with each
        # GFF file.
        # NOTE(review): per the bedtools docs, -loj reports every -a
        # feature (left outer join) and -f 1 requires the overlap to
        # cover the whole -a feature -- confirm against the installed
        # bedtools version.
        intersectBed_cmd = \
            "%s -a %s -b %s -loj -f 1 > %s" \
            % (bedtools_utils.intersectBed_path,
               clusters_fname,
               gff_fname,
               event_clusters_fname)
        logger.info(" - Executing: %s", intersectBed_cmd)
        ret_val = os.system(intersectBed_cmd)
        if ret_val != 0:
            logger.critical("Cannot intersect clusters with %s", gff_fname)
            # The shell creates the redirect target before the command
            # runs, so a failure leaves an empty/partial file behind.
            # Remove it, otherwise the next run would find it and
            # treat it as a finished result.
            if os.path.isfile(event_clusters_fname):
                try:
                    os.remove(event_clusters_fname)
                except OSError:
                    logger.critical("Cannot remove partial output %s",
                                    event_clusters_fname)
            continue
        event_clusters_fnames.append(event_clusters_fname)
    return event_clusters_fnames
def _write_gene_with_introns(gff_out, gene_id, gene_info):
    """
    Write one gene to 'gff_out': the gene record, then each mRNA
    followed by its exons, then one 'intron' record per gap between
    consecutive parts of each isoform.
    """
    gene_obj = gene_info["gene_object"]
    gene_entry = gene_info["hierarchy"][gene_id]
    gene_rec = gene_entry["gene"]
    # Write the GFF record
    gff_out.write(gene_rec)
    # Write out the mRNAs, their exons, and then
    # input the introns
    for mRNA in gene_obj.isoforms:
        curr_mRNA = gene_entry["mRNAs"][mRNA.label]
        gff_out.write(curr_mRNA["record"])
        # Write out the exons
        curr_exons = curr_mRNA["exons"]
        for exon in curr_exons:
            gff_out.write(curr_exons[exon]["record"])
    # Now output the introns
    for isoform in gene_obj.isoforms:
        # Pair each part with its successor.
        # NOTE(review): assumes isoform.parts is ordered by
        # coordinate -- confirm against gene_utils.
        for first_exon, second_exon in zip(isoform.parts,
                                           isoform.parts[1:]):
            # Intron start coordinate is the coordinate right after
            # the end of the first exon, intron end coordinate is the
            # coordinate just before the beginning of the second exon
            intron_start = first_exon.end + 1
            intron_end = second_exon.start - 1
            # Skip gaps with start >= end (adjacent/overlapping parts)
            if intron_start >= intron_end:
                continue
            # Create record for this intron
            intron_id = "%s:%d-%d:%s.intron" \
                % (gene_obj.chrom,
                   intron_start,
                   intron_end,
                   gene_obj.strand)
            intron_rec = \
                miso_gff_utils.GFF(gene_obj.chrom, gene_rec.source,
                                   "intron",
                                   intron_start, intron_end,
                                   strand=gene_obj.strand,
                                   attributes={"ID": [intron_id],
                                               "Parent": [isoform.label]})
            gff_out.write(intron_rec)


def add_introns_to_gff(gff_filename, output_dir):
    """
    Add 'intron' entries to GFF.

    Writes '<output_dir>/<trimmed basename>.with_introns.<ext>' where
    <ext> is the input's extension (falls back to 'gff' when the
    input filename has none). If that file already exists, nothing
    is recomputed.

    Arguments:
      - gff_filename: path of the input GFF
      - output_dir: directory receiving the output GFF

    Returns the output filename, both when the file was already
    present and when it was written by this call.
    """
    gff_basename = os.path.basename(gff_filename)
    output_basename = utils.trim_gff_ext(gff_basename)
    # Reuse the input's extension; fall back to "gff" if there is none
    # rather than failing with an IndexError.
    _, dot, ext = gff_basename.rpartition(".")
    ext_to_use = ext if dot else "gff"
    output_filename = \
        os.path.join(output_dir,
                     "%s.with_introns.%s" % (output_basename,
                                             ext_to_use))
    # Single-argument print(...) behaves the same as a Python 2
    # print statement and also works on Python 3.
    print("Adding introns to GFF...")
    print(" - Input: %s" % (gff_filename))
    print(" - Output: %s" % (output_filename))
    if os.path.isfile(output_filename):
        print("Found file %s, skipping.." % (output_filename))
        return output_filename
    t1 = time.time()
    # Load the genes before creating the output file, so that a
    # failure to parse the input does not leave an empty output
    # behind that a later run would skip.
    genes = gene_utils.load_genes_from_gff(gff_filename)
    try:
        # 'with' guarantees the output is flushed and closed
        with open(output_filename, "w") as out_file:
            gff_out = miso_gff_utils.Writer(out_file)
            for gene_id in genes:
                _write_gene_with_introns(gff_out, gene_id, genes[gene_id])
    except Exception:
        # Do not keep a partial file: its presence would make the
        # next call skip the work and return it as complete.
        if os.path.isfile(output_filename):
            os.remove(output_filename)
        raise
    t2 = time.time()
    print("Addition took %.2f minutes." % ((t2 - t1) / 60.))
    return output_filename