def test_auto_distancing_scrambled(self):
    """Compare region-to-region distances on the scrambled genome with the stored truth.

    Builds one RegionSet from ``regions1A`` and one from ``regions1B`` (both
    on ``scrambled_genome``), maps intersections with a slop distance of 75
    using ``get_genomic_distance`` as the pairwise function, and asserts that
    the densified integer matrix equals ``auto_distancing_truth`` everywhere.
    """
    query_set = genome_tools.RegionSet(self.regions1A, self.scrambled_genome)
    target_set = genome_tools.RegionSet(self.regions1B, self.scrambled_genome)

    distance_matrix = query_set.map_intersects(
        target_set,
        lambda x, y: x.get_genomic_distance(y),
        slop_distance=75,
    )

    # The result exposes todense(); densify and cast to int before comparing.
    observed = np.array(distance_matrix.todense()).astype(int)
    self.assertTrue(np.all(observed == self.auto_distancing_truth))
def test_genome_bin_mapping(self):
    """Check that mapping ``regions1A`` onto genomic windows matches ``m2m_map_truth``.

    The mapping is requested with no minimum window overlap
    (``min_window_overlap_proportion=0.0``) and ``regions_to_bins=False``.
    """
    region_set = genome_tools.RegionSet(self.regions1A, self.genome)
    m2m_map = region_set.map_genomic_windows(
        min_window_overlap_proportion=0.0,
        regions_to_bins=False,
    )

    matches_truth = np.all(m2m_map == self.m2m_map_truth)
    self.assertTrue(matches_truth)
def load_genes(self):
    """Load the gene set for this species and derive per-gene location data.

    Sets three attributes:
        genes: GeneSet read from the RefSeq path configured under
            ``[genome] genes`` (formatted with the package path and species).
        gene_loc_set: RegionSet built from each gene's ``get_tss_region()``.
        rp_map_locs: numpy array of ``annotation.get_location()`` for every
            region in ``gene_loc_set``.
    """
    self.log.append('Loading gene info ...')

    refseq_path = self._config.get('genome', 'genes').format(
        package_path=PACKAGE_PATH, species=self.species)
    self.genes = gene_selection.GeneSet.from_refseq(refseq_path, self.genome)

    tss_regions = []
    for gene in self.genes:
        tss_regions.append(gene.get_tss_region())
    self.gene_loc_set = genome_tools.RegionSet(tss_regions, self.genome)

    locations = [
        region.annotation.get_location()
        for region in self.gene_loc_set.regions
    ]
    self.rp_map_locs = np.array(locations)
def build_binned_rp_map(self, style, rp_decay):
    """Build an RP map between the loaded genes and every window of the genome.

    Parameters:
        style ({"basic", "enhanced"}): which RP map builder to dispatch to.
        rp_decay: decay parameter, forwarded unchanged to the selected builder.

    Returns:
        The value returned by ``_make_basic_rp_map`` or
        ``_make_enhanced_rp_map`` for the genome-wide window set.

    Raises:
        NotImplementedError: if ``style`` is not one of the supported styles.
    """
    # Validate first so an unsupported style fails fast, before the
    # genome-wide window RegionSet is materialized. The exception is now
    # actually raised; previously it was constructed and discarded, so the
    # method silently returned None.
    if style not in ('basic', 'enhanced'):
        raise NotImplementedError(
            'Unsupported RP map style: {!r}'.format(style))

    region_set = genome_tools.RegionSet(
        list(self.genome.list_windows()), self.genome)

    if style == 'basic':
        return self._make_basic_rp_map(self.gene_loc_set, region_set, rp_decay)
    return self._make_enhanced_rp_map(self.gene_loc_set, region_set, rp_decay)
def _make_enhanced_rp_map(self, gene_loc_set, region_set, decay):
    """Build the "enhanced" RP map, which overrides the basic map near exons.

    Steps:
        1. Collect the exon regions of every gene and build an
           exons x genes indicator matrix.
        2. Intersect ``region_set`` with the exons (overlap proportion of at
           least 0.4, no slop) to get a regions x exons map, then multiply
           through to a boolean regions x genes map.
        3. Take the basic RP map, zero out every region that overlaps any
           exon, and add the boolean exon-overlap map in its place.

    Parameters:
        gene_loc_set: RegionSet whose regions carry a gene ``annotation``
            providing ``get_exon_regions()``.
        region_set: RegionSet of the regions to score against the genes.
        decay: decay parameter forwarded to ``_make_basic_rp_map``.

    Returns:
        The transposed combined map (presumably genes x regions, matching
        the orientation of ``_make_basic_rp_map`` -- confirm against it).

    Raises:
        Any exception from the underlying steps now propagates. The previous
        debug handler printed the error and returned a tuple of intermediate
        matrices, which hid failures from callers.
    """
    # CSC layout for the exons x genes matrix: column j (gene j) owns the
    # row slice indptr[j]:indptr[j + 1], i.e. the exons appended for it.
    indptr, indices, exons = [0], [], []
    for locus in gene_loc_set.regions:
        new_exons = locus.annotation.get_exon_regions()
        first_row = indptr[-1]
        exons.extend(new_exons)
        indices.extend(range(first_row, first_row + len(new_exons)))
        indptr.append(first_row + len(new_exons))

    exon_gene_map = sparse.csc_matrix(
        (np.ones(len(exons)), indices, indptr),
        shape=(len(exons), len(gene_loc_set.regions)))

    exon_set = genome_tools.RegionSet(exons, self.genome)

    # REGIONS X EXONS
    region_exon_map = region_set.map_intersects(
        exon_set,
        distance_function=lambda x, y: x.overlaps(
            y, min_overlap_proportion=0.4),
        slop_distance=0)

    # REGIONS X GENES. Builtin ``bool`` replaces the ``np.bool`` alias,
    # which was removed in NumPy 1.24.
    region_gene_map = region_exon_map.dot(exon_gene_map).astype(bool)

    # Column of 1s for regions overlapping no exon at all, 0s otherwise.
    not_exon_promoter = 1 - region_gene_map.sum(axis=1).astype(bool)

    basic_rp_map = self._make_basic_rp_map(gene_loc_set, region_set, decay)

    enhanced_rp_map = basic_rp_map.transpose().multiply(
        not_exon_promoter) + region_gene_map

    return enhanced_rp_map.transpose()
def __init__(self, species, regions, rp_map='enhanced', rp_decay=10000,
             isd_method='chipseq', verbose=4, log=None):
    '''
    *class*
    **lisa.FromRegions** (species, regions, rp_map = 'enhanced', rp_decay = 10000, isd_method = 'chipseq', verbose = 4, log = None)

    Initialize the LISA test using user-defined regions.

    Parameters:
        species: {'hg38', 'mm10'}
        regions (list of lists/tuples with format [('chr', start, end), ... ]): User-defined regions.
        rp_map ({"basic", "enhanced"}, scipy.sparse_matrix): RP map type, currently supports "basic" and "enhanced". User may also pass their own RP map as scipy.sparse_matrix in the shape (genes x regions)
        rp_decay (float, int): Decay rate of region influence on gene based on distance from TSS. Increase to prioritize distal regions, decrease to prioritize promoters. Default of 10000 bp is balanced.
        isd_method {"chipseq", "motifs"}: Use ChIP-seq data or motifs to mark TF binding locations.
        verbose (int): Number of levels of log messages to print to stderr
        log: Forwarded unchanged to the parent constructor.

    Returns:
        lisa object

    Raises:
        AssertionError: if ``rp_map``, ``regions`` or ``rp_decay`` fail
            validation, or if the number of validated regions is outside
            [1000, 1000000).
    '''
    # NOTE(review): the positional 100 is presumably the genome window
    # size expected by the parent class -- confirm against its signature.
    super().__init__(species, _config, 100, isd_method=isd_method,
                     verbose=verbose, log=log)

    if isinstance(rp_map, str):
        rp_map_styles = self._config.get(
            'lisa_params', 'rp_map_styles').split(',')
        assert (
            rp_map in rp_map_styles
        ), 'RP map must be numpy/scipy.sparse array, or be one of provided maps: {}'.format(
            ','.join(rp_map_styles))
    else:
        # Fixed: ``np.ndarry`` was a typo (AttributeError), and
        # ``isinstance(x, scipy.sparse)`` passed a module as the class
        # argument (TypeError). ``issparse`` is the supported check for
        # scipy sparse containers.
        assert (
            isinstance(rp_map, np.ndarray) or scipy.sparse.issparse(rp_map)
        ), 'RP map must be either numpy ndarry or scipy.sparse matrix'

    self.rp_map = rp_map

    assert (
        isinstance(regions, (list, tuple))
    ), '"regions" parameter must be list of region tuples in format [ (chr,start,end [,score]), (chr,start,end [,score]) ... ] or name of bed file.'

    self.log.append('Validation user-provided regions ...')

    # Count what the user passed before validation replaces ``regions``.
    self.num_regions_supplied = len(regions)

    regions = self._check_region_specification(regions)

    self.region_set = genome_tools.RegionSet(
        regions, self.data_interface.genome)
    self.region_score_map = np.array(
        [r.annotation for r in self.region_set.regions])

    assert (isinstance(rp_decay, (int, float))
            and rp_decay > 0), 'RP decay parameter must be positive int/float'
    self.rp_decay = rp_decay

    # The size limits apply to the regions returned by
    # _check_region_specification, not to the raw user input.
    assert (
        len(regions) >= 1000 and len(regions) < 1000000
    ), 'User must provide atleast 1000 reigons, and less than 1 million.'