def get_acceptable_species():
    """
    Finds all species in the data directory.

    :return: acceptable_species (set): names of available genomes
    """
    acceptable_species = set()
    for fn in os.listdir(clipper.data_dir()):
        fn = fn.split(".")[0]
        if fn == "__init__":
            continue
        acceptable_species.add(fn)
    return acceptable_species
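# Usage sketch (illustrative, not part of the original module): assumes the
# clipper package is importable and its data_dir() contains files named after
# genome builds (e.g. "hg19.<ext>"). The helper name is hypothetical.
def example_check_species(species="hg19"):
    """Hypothetical helper: raise if a genome build isn't available."""
    available = get_acceptable_species()
    if species not in available:
        raise ValueError("%s not in available genomes: %s"
                         % (species, sorted(available)))
    return True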
def make_features(self):
    Region = collections.namedtuple("Region", ["region", "gene_id"])

    bedtracks = {}
    for region in self.assigned_regions:
        bedtracks[region] = pybedtools.BedTool(
            os.path.join(clipper.data_dir(), "regions",
                         "%s_%s.bed" % (self.species, region)))

    features = HTSeq.GenomicArrayOfSets("auto", stranded=True)
    for region, bedtrack in bedtracks.items():
        # izip comes from itertools on Python 2; use the builtin zip on Python 3
        for iv, interval in izip(
                CLIP_analysis.bed_to_genomic_interval(bedtrack), bedtrack):
            # Note: plain assignment overwrites any annotation already stored on
            # overlapping positions; HTSeq's `features[iv] += value` idiom would
            # union overlapping annotations instead, if that is the intent.
            features[iv] = set([Region(region, interval.name)])
    return features
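# Query sketch (illustrative, not from the original module): how a feature
# array built by make_features() can be interrogated at a single position.
# Assumes HTSeq is importable and `features` covers the same genome build.
def example_regions_at(features, chrom, pos, strand):
    """Hypothetical helper: return the Region tuples covering one position."""
    import HTSeq
    position = HTSeq.GenomicPosition(chrom, pos, strand)
    # Indexing a GenomicArrayOfSets with a position yields the stored set
    # (possibly empty) of Region(region, gene_id) tuples.
    return features[position]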
def count_genomic_region_sizes(regions, species="hg19"):
    """
    Counts the genomic region sizes for the specified regions dir
    (should be changed to work off a GTF)
    """
    genomic_region_sizes = {}
    # TODO: update this to work off a GFF file, because something isn't matching up...
    for region in regions:
        region_tool = pybedtools.BedTool(
            os.path.join(clipper.data_dir(), "regions",
                         species + "_" + region + ".bed"))
        genomic_region_sizes[region] = region_tool.total_coverage()
    return genomic_region_sizes
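# Usage sketch (illustrative): report region sizes for a region set. The region
# names below are an assumption about the BED files shipped with clipper; use
# whatever regions_generator() yields in this module for the real list.
def example_print_region_sizes(species="hg19"):
    regions = ["exons", "UTR3", "UTR5"]  # assumed names, not verified here
    sizes = count_genomic_region_sizes(regions, species=species)
    for region, size in sorted(sizes.items()):
        print("%s: %d bp" % (region, size))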
def __init__(self, species, db=None, regions_dir=None):
    """
    Creates genomic features object; chooses how to direct creation of
    features based on species.

    regions_dir : str location to create regions in
    species : str species (hg19, mm9, ce10)
    db : gffutils FeatureDB object
    """
    if regions_dir is None:
        regions_dir = os.path.join(data_dir(), "regions")
    self._regions_dir = regions_dir
    self._db = db
    self._species = species

    if species in ["hg19", "mm9"]:
        self._feature_names = {
            "five_prime_utr": "five_prime_utr",
            "three_prime_utr": "three_prime_utr",
            "exon": "exon",
            "CDS": "CDS",
            "gene_id": "gene_id",
            "transcript": "transcript",
        }
        self._fix_chrom = self._fix_chrom_null
    elif species in ["ce10"]:
        self._feature_names = {
            "five_prime_utr": "five_prime_UTR",
            "three_prime_utr": "three_prime_UTR",
            "exon": "exon",
            "CDS": "CDS",
            "gene_id": "ID",
            "transcript": "mRNA",
        }
        self._fix_chrom = self._fix_chrom_ce10

    if self._db is not None:
        self.featuretypes = list(self._db.featuretypes())
    else:
        self.featuretypes = None
def get_uORF_start_stop_gff(self):
    """
    Returns hg19 uORFs
    """
    db = gffutils.FeatureDB(
        "/nas3/yeolab/Genome/ensembl/gtf/gencode.v17.annotation.gtf.db.old")
    transcript_gene_dict = self._create_transcript_map(db)  # currently unused here

    # get all 5' UTRs; note this unpacking relies on get_genomic_regions
    # returning its regions in a fixed order
    (UTR3, UTR5, exons, genes, introns, CDS) = CLIP_analysis.get_genomic_regions(
        os.path.join(clipper.data_dir(), "regions"), "hg19", db).values()

    five_prime_utr_dict = self._get_five_prime_utr_sequences(
        UTR5, "/nas3/yeolab/Genome/ucsc/hg19/chromosomes/all.fa")

    return self._get_uorf_start_stop(five_prime_utr_dict)
def get_exon_bed(species):
    # annotation-versioned builds (e.g. "hg19_v19") map onto the base genome's BED
    short_species = species.split("_")[0]
    return os.path.join(clipper.data_dir(), "regions",
                        "%s_%s.bed" % (short_species, "exons"))
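# Usage sketch (illustrative): the split on "_" means annotation-versioned
# species names resolve to the base build's exon BED file.
def example_exon_bed_paths():
    """Hypothetical demo: versioned builds resolve to their base genome."""
    for name in ("hg19", "hg19_v19", "ce10"):
        print(name, "->", get_exon_bed(name))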
def __init__(self, species, db=None, regions_dir=None):
    """
    Creates genomic features object; chooses how to direct creation of
    features based on species.

    regions_dir : str location to create regions in
    species : str species (hg19, mm9, ce10)
    db : gffutils FeatureDB object
    """
    if regions_dir is None:
        regions_dir = os.path.join(data_dir(), "regions")
    self._regions_dir = regions_dir
    self._db = db
    self._species = species

    # I'm going to be lazy and leave this here; it's needed to make new genomic
    # features for human genomes. Engineering it so it doesn't take too much
    # time on load would be slightly annoying, so just uncomment when you need it.
    # result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    # for x, feature in enumerate(db.all_features()):
    #     gene_ids = feature.attributes['gene_id']
    #     transcript_ids = feature.attributes['transcript_id']
    #     feature_type = feature.featuretype
    #
    #     if feature_type == "gene":
    #         if len(gene_ids) != 1:
    #             print(gene_ids[0])
    #             break
    #         result[gene_ids[0]]['gene'] = feature
    #     else:
    #         for gene_id in gene_ids:
    #             for transcript_id in transcript_ids:
    #                 result[gene_id][transcript_id][feature_type].append(feature)
    #
    # self._feature_hash = result

    if species in ["hg19", "mm9", "hg19_v19", "GRCh38_v24"]:
        self._feature_names = {
            "five_prime_utr": "five_prime_utr",
            "three_prime_utr": "three_prime_utr",
            "exon": "exon",
            "CDS": "CDS",
            "gene_id": "gene_id",
            "transcript": "transcript",
        }
        self._fix_chrom = self._fix_chrom_null
    elif species in ["ce10"]:
        self._feature_names = {
            "five_prime_utr": "five_prime_UTR",
            "three_prime_utr": "three_prime_UTR",
            "exon": "exon",
            "CDS": "CDS",
            "gene_id": "ID",
            "transcript": "mRNA",
        }
        self._fix_chrom = self._fix_chrom_ce10

    if self._db is not None:
        self.featuretypes = list(self._db.featuretypes())
    else:
        self.featuretypes = None
def assign_to_regions(tool, clusters=None, assigned_dir=".", species="hg19", nrand=3):
    """
    Assigns each cluster to a genic region, then saves all generated bed and
    fasta files for future analysis.

    tool - a bed tool (each line representing a cluster)
    clusters - name of cluster file (optional)
    assigned_dir - location to save files in
    species - str species to segment
    nrand - int number of times to shuffle for the null hypothesis
    """
    if clusters is None:
        clusters, ext = os.path.splitext(os.path.basename(tool.fn))

    bedtracks = {}
    regions, assigned_regions = regions_generator()
    short_species = species.split("_")[0]
    if short_species == "GRCh38":
        short_species = "hg38"
    for region in regions:
        bedtracks[region] = pybedtools.BedTool(
            os.path.join(clipper.data_dir(), "regions",
                         "%s_%s.bed" % (species, region)))

    # creates the basics of the bed dict
    bed_dict = {'all': {'rand': {}}}

    genes = pybedtools.BedTool(
        os.path.join(clipper.data_dir(), "regions", "%s_genes.bed" % species))
    offsets = get_offsets_bed12(tool)

    if tool.field_count() <= 5:
        # note: the merged result is not assigned back to `tool` here,
        # unlike the branches below
        tool.sort().merge().saveas()
    elif 6 <= tool.field_count() < 8:
        # Hack to get around not having a gene name assigned by the peak caller;
        # due to overlapping genes this won't be perfect.
        # move_name_real = functools.partial(move_name, original_length=len(tool[0].fields))
        # tool = tool.intersect(genes, wo=True, s=True).each(move_name_real).saveas()
        # fix_strand_ok = functools.partial(fix_strand, warn=False)
        tool = tool.sort().merge(
            s=True, c="4,5,6",
            o="collapse,collapse,collapse").each(fix_strand_v26).saveas()
    # elif not tool[0][7].isdigit():
    #     tool = tool.sort().merge(s=True, c="4,5,6", o="collapse,collapse,collapse").each(fix_strand).each(fix_name).saveas()
    else:
        # Clipper output; this is the ideal case
        tool = tool.sort().merge(
            s=True, c="4,5,6,7,8",
            o="collapse,collapse,collapse,min,min").each(fix_strand_v26).saveas()

    remaining_clusters = adjust_offsets(tool, offsets)
    # print "There are a total %d clusters I'll examine" % (len(tool))

    for region in regions:
        remaining_clusters, overlapping = intersection(remaining_clusters,
                                                       b=bedtracks[region])

        # if for some reason there isn't a peak in the region, skip it
        if len(overlapping) == 0:
            # print "ignoring %s " % region
            continue

        # sets up the bed dict for this region
        bed_dict[region] = {'real': overlapping.sort(stream=True).saveas(),
                            'rand': {}}

        no_overlapping_count = len(remaining_clusters)
        overlapping_count = len(bed_dict[region]['real'])
        # print "For region: %s found %d that overlap and %d that don't" % (region,
        #                                                                   overlapping_count,
        #                                                                   no_overlapping_count)

        if 'real' not in bed_dict['all']:
            bed_dict['all']['real'] = bed_dict[region]['real']
        else:
            bed_dict['all']['real'] = bed_dict['all']['real'].cat(
                bed_dict[region]['real'], stream=True, postmerge=False).saveas()

        # saves offsets so after shuffling the offsets can be readjusted
        offset_dict = get_offsets_bed12(bed_dict[region]['real'])

        for i in range(nrand):
            random_intervals = bed_dict[region]['real'].shuffle(
                genome=short_species, incl=bedtracks[region].fn).sort()
            random_intervals = fix_shuffled_strand(random_intervals,
                                                   bedtracks[region].fn)
            random_intervals = adjust_offsets(random_intervals, offset_dict)
            bed_dict[region]['rand'][i] = random_intervals.saveas()

            if i not in bed_dict['all']['rand']:
                bed_dict['all']['rand'][i] = bed_dict[region]['rand'][i]
            else:
                bed_dict['all']['rand'][i] = bed_dict['all']['rand'][i].cat(
                    bed_dict[region]['rand'][i], stream=True, postmerge=False)

        # if there are no more clusters to assign, stop trying
        if no_overlapping_count == 0:
            break

    # print "After assigning %d un-categorized regions" % len(remaining_clusters)
    if len(remaining_clusters) > 0:
        # key spelling kept as-is; downstream code presumably expects it
        bed_dict['uncatagorized'] = {
            'real': remaining_clusters.sort(stream=True).saveas()}

    bed_dict = save_bedtools(bed_dict, clusters, assigned_dir)

    return bed_dict
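# Usage sketch (illustrative, not from the original module): assign peaks from
# a BED file of clusters. Assumes pybedtools and the clipper region files for
# the requested species are available; "peaks.bed" is a hypothetical input.
def example_assign_peaks():
    import pybedtools
    peaks = pybedtools.BedTool("peaks.bed")
    assigned = assign_to_regions(peaks, assigned_dir="assigned",
                                 species="hg19", nrand=3)
    # assigned['all']['real'] holds every categorized cluster; per-region
    # results live under assigned[<region>]['real'] and assigned[<region>]['rand'][i]
    return assigned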
def __init__(self, species, db=None, regions_dir=None, gencode=False):
    """
    Creates genomic features object; chooses how to direct creation of
    features based on species.

    regions_dir : str location to create regions in
    species : str species (hg19, mm9, ce10)
    db : gffutils FeatureDB object
    gencode : bool treat the annotation as GENCODE-style
    """
    if regions_dir is None:
        regions_dir = os.path.join(data_dir(), "regions")
    self._regions_dir = regions_dir
    self._db = db
    self._species = species

    # I'm going to be lazy and leave this here; it's needed to make new genomic
    # features for human genomes. Note this block assumes db is not None,
    # despite the default argument above.
    result = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
    for x, feature in enumerate(db.all_features()):
        gene_ids = feature.attributes['gene_id']
        transcript_ids = feature.attributes['transcript_id']
        feature_type = feature.featuretype

        if feature_type == "gene":
            if len(gene_ids) != 1:
                print(gene_ids[0])
                break
            result[gene_ids[0]]['gene'] = feature
        else:
            for gene_id in gene_ids:
                for transcript_id in transcript_ids:
                    result[gene_id][transcript_id][feature_type].append(feature)

    self._feature_hash = result

    if species in ["hg19", "mm9", "hg19_v19", "GRCh38_v24", "hb27"] or gencode:
        self._feature_names = {
            "five_prime_utr": "five_prime_utr",
            "three_prime_utr": "three_prime_utr",
            "exon": "exon",
            "CDS": "CDS",
            "gene_id": "gene_id",
            "transcript": "transcript",
        }
        self._fix_chrom = self._fix_chrom_null
    elif species in ["ce10"]:
        self._feature_names = {
            "five_prime_utr": "five_prime_UTR",
            "three_prime_utr": "three_prime_UTR",
            "exon": "exon",
            "CDS": "CDS",
            "gene_id": "ID",
            "transcript": "mRNA",
        }
        self._fix_chrom = self._fix_chrom_ce10

    if self._db is not None:
        self.featuretypes = list(self._db.featuretypes())
    else:
        self.featuretypes = None
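# Construction sketch (illustrative): building the features object from a
# gffutils database. The class name GenomicFeatures is an assumption (the
# enclosing class is not shown in this excerpt), and the .db path is hypothetical.
def example_build_features():
    import gffutils
    db = gffutils.FeatureDB("gencode.annotation.gtf.db")  # hypothetical path
    return GenomicFeatures("hg19", db=db)  # assumed class name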
def assign_to_regions(tool, clusters=None, assigned_dir=".", species="hg19", nrand=3):
    """
    Assigns each cluster to a genic region, then saves all generated bed and
    fasta files for future analysis.

    tool - a bed tool (each line representing a cluster)
    clusters - name of cluster file (optional)
    assigned_dir - location to save files in
    species - str species to segment
    nrand - int number of times to shuffle for the null hypothesis
    """
    if clusters is None:
        clusters, ext = os.path.splitext(os.path.basename(tool.fn))

    bedtracks = {}
    regions, assigned_regions = regions_generator()
    short_species = species.split("_")[0]
    for region in regions:
        bedtracks[region] = pybedtools.BedTool(
            os.path.join(clipper.data_dir(), "regions",
                         "%s_%s.bed" % (species, region)))

    # creates the basics of the bed dict
    bed_dict = {'all': {'rand': {}}}

    genes = pybedtools.BedTool(
        os.path.join(clipper.data_dir(), "regions", "%s_genes.bed" % species))
    offsets = get_offsets_bed12(tool)

    if tool.field_count() <= 5:
        # note: the merged result is not assigned back to `tool` here,
        # unlike the branches below
        tool.sort().merge().saveas()
    elif 6 <= tool.field_count() < 8:
        # Hack to get around not having a gene name assigned by the peak caller;
        # due to overlapping genes this won't be perfect.
        # move_name_real = functools.partial(move_name, original_length=len(tool[0].fields))
        # tool = tool.intersect(genes, wo=True, s=True).each(move_name_real).saveas()
        # fix_strand_ok = functools.partial(fix_strand, warn=False)
        tool = tool.sort().merge(
            s=True, c="4,5,6",
            o="collapse,collapse,collapse").each(fix_strand).saveas()
    elif not tool[0][7].isdigit():
        tool = tool.sort().merge(
            s=True, c="4,5,6",
            o="collapse,collapse,collapse").each(fix_strand).saveas()
    else:
        # Clipper output; this is the ideal case
        tool = tool.sort().merge(
            s=True, c="4,5,6,7,8",
            o="collapse,collapse,collapse,min,min").each(fix_strand).saveas()

    remaining_clusters = adjust_offsets(tool, offsets)
    # print "There are a total %d clusters I'll examine" % (len(tool))

    for region in regions:
        remaining_clusters, overlapping = intersection(remaining_clusters,
                                                       b=bedtracks[region])

        # if for some reason there isn't a peak in the region, skip it
        if len(overlapping) == 0:
            # print "ignoring %s " % region
            continue

        # sets up the bed dict for this region
        bed_dict[region] = {'real': overlapping.sort(stream=True).saveas(),
                            'rand': {}}

        no_overlapping_count = len(remaining_clusters)
        overlapping_count = len(bed_dict[region]['real'])
        # print "For region: %s found %d that overlap and %d that don't" % (region,
        #                                                                   overlapping_count,
        #                                                                   no_overlapping_count)

        if 'real' not in bed_dict['all']:
            bed_dict['all']['real'] = bed_dict[region]['real']
        else:
            bed_dict['all']['real'] = bed_dict['all']['real'].cat(
                bed_dict[region]['real'], stream=True, postmerge=False).saveas()

        # saves offsets so after shuffling the offsets can be readjusted
        offset_dict = get_offsets_bed12(bed_dict[region]['real'])

        for i in range(nrand):
            random_intervals = bed_dict[region]['real'].shuffle(
                genome=short_species, incl=bedtracks[region].fn).sort()
            random_intervals = fix_shuffled_strand(random_intervals,
                                                   bedtracks[region].fn)
            random_intervals = adjust_offsets(random_intervals, offset_dict)
            bed_dict[region]['rand'][i] = random_intervals.saveas()

            if i not in bed_dict['all']['rand']:
                bed_dict['all']['rand'][i] = bed_dict[region]['rand'][i]
            else:
                bed_dict['all']['rand'][i] = bed_dict['all']['rand'][i].cat(
                    bed_dict[region]['rand'][i], stream=True, postmerge=False)

        # if there are no more clusters to assign, stop trying
        if no_overlapping_count == 0:
            break

    # print "After assigning %d un-categorized regions" % len(remaining_clusters)
    if len(remaining_clusters) > 0:
        # key spelling kept as-is; downstream code presumably expects it
        bed_dict['uncatagorized'] = {
            'real': remaining_clusters.sort(stream=True).saveas()}

    bed_dict = save_bedtools(bed_dict, clusters, assigned_dir)

    return bed_dict