def lisa_multi(args):

    log = Log(target=sys.stderr, verbose=args.verbose)
    lisa = FromGenes(args.species, **extract_kwargs(args, INSTANTIATION_KWARGS), log=log)

    query_dict = {
        os.path.basename(query.name): query.readlines()
        for query in args.query_lists
    }

    results_summary = []
    all_passed = True
    for query_name, query_list in query_dict.items():

        with log.section('Modeling {}:'.format(str(query_name))):
            try:
                results, metadata = lisa.predict(
                    query_list, **extract_kwargs(args, PREDICTION_KWARGS))

                top_TFs_unique = save_and_get_top_TFs(args, query_name, results, metadata)

                results_summary.append((query_name, top_TFs_unique))

            except AssertionError as err:
                all_passed = False
                log.append('ERROR: ' + str(err))

    print_results_multi(results_summary)

    if not all_passed:
        raise MultiError('One or more genelists raised an error')
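
# Usage sketch (illustrative only): lisa_multi backs a CLI subcommand, so
# `args` is an argparse namespace produced by the module's parser. Besides the
# fields used directly above (species, verbose, query_lists), it must carry
# whatever options INSTANTIATION_KWARGS and PREDICTION_KWARGS name; the
# invocation below assumes a hypothetical parser and subcommand name.
#
#   args = parser.parse_args(['multi', 'hg38', 'genes1.txt', 'genes2.txt'])
#   lisa_multi(args)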
def lisa_deseq(args):

    log = Log(target=sys.stderr, verbose=args.verbose)
    lisa = FromGenes(args.species, **extract_kwargs(args, INSTANTIATION_KWARGS), log=log)

    up_genes, down_genes = parse_deseq_file(args.deseq_file, lfc_cutoff=args.lfc_cutoff,
                                            pval_cutoff=args.pval_cutoff, sep=args.sep)

    results_summary = []
    all_passed = True
    for prefix, query_list in zip(['up-regulated', 'down-regulated'], [up_genes, down_genes]):

        with log.section('Modeling {}:'.format(str(prefix))):
            try:
                results, metadata = lisa.predict(
                    query_list, **extract_kwargs(args, PREDICTION_KWARGS))

                top_TFs_unique = save_and_get_top_TFs(args, prefix, results, metadata)

                results_summary.append((prefix, top_TFs_unique))

            except AssertionError as err:
                all_passed = False
                log.append('ERROR: ' + str(err))

    print_results_multi(results_summary)

    if not all_passed:
        raise MultiError('One or more genelists raised an error')
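
# Input sketch (hypothetical table): parse_deseq_file is expected to split a
# DESeq2 results table into up- and down-regulated gene lists using the given
# log2 fold-change and adjusted p-value cutoffs. For example, with
# lfc_cutoff=1 and pval_cutoff=0.05:
#
#   gene     log2FoldChange   padj
#   ESR1      2.3             0.001   -> up_genes
#   GATA3    -1.8             0.001   -> down_genes
#   FOXA1     0.4             0.200   -> discarded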
class DataInterface:

    _config = h5_config
    data_path = os.path.join(PACKAGE_PATH, 'data')

    @classmethod
    def get_window_bedfile_str(cls, species, window_size):

        genome = cls.load_genome(species, window_size)

        window_strs = []
        for i, window in enumerate(genome.list_windows()):
            window_strs.append(str(window) + '\t' + str(i))

        return '\n'.join(window_strs)

    @classmethod
    def get_metadata_headers(cls, technology):
        return cls._config.get('metadata', technology + '_headers').split(',')

    @classmethod
    def get_dataset_url(cls, species, window_size):
        return cls._config.get('lisa_params', 'h5_path').format(
            path=cls._config.get('cistrome', 'data_url'),
            species=species,
            version=REQURED_DATASET_VERSION,
            window=str(window_size))

    @classmethod
    def get_dataset_path(cls, species, window_size):
        return cls._config.get('lisa_params', 'h5_path').format(
            path=cls.data_path,
            species=species,
            version=REQURED_DATASET_VERSION,
            window=str(window_size))

    #___ DATASET DOWNLOADING ____

    @classmethod
    def fetch_from_cistrome(cls, species, window_size):

        dataset_url = cls.get_dataset_url(species, window_size)

        if not os.path.isdir(cls.data_path):
            os.mkdir(cls.data_path)

        # download to a temp file, then move into the package data directory
        filename, _ = request.urlretrieve(dataset_url)
        os.rename(filename, cls.get_dataset_path(species, window_size))

    @classmethod
    def load_genome(cls, species, window_size):
        return genome_tools.Genome.from_file(
            cls._config.get('genome', 'genome').format(
                package_path=PACKAGE_PATH, species=species),
            window_size=window_size)

    def __init__(self, species, window_size=1000, download_if_not_exists=True,
                 make_new=False, log=None, path=None, load_genes=True):

        self.species = species
        self.window_size = int(window_size)

        if log is None:
            self.log = Log()
        else:
            self.log = log

        if path is None:
            self.path = self.get_dataset_path(self.species, self.window_size)
        else:
            self.path = path

        if make_new:
            h5.File(self.path, 'w').close()
        elif not os.path.isfile(self.path):
            if download_if_not_exists and path is None:
                self.download_data()
            else:
                h5.File(self.path, 'w').close()

        #___ LOAD GENE DATA FROM PACKAGE _____
        self.genome = self.load_genome(self.species, self.window_size)
        if load_genes:
            self.load_genes()

    def load_genes(self):

        self.log.append('Loading gene info ...')

        self.genes = gene_selection.GeneSet.from_refseq(
            self._config.get('genome', 'genes').format(
                package_path=PACKAGE_PATH, species=self.species),
            self.genome)

        self.gene_loc_set = genome_tools.RegionSet(
            [gene.get_tss_region() for gene in self.genes], self.genome)

        self.rp_map_locs = np.array(
            [r.annotation.get_location() for r in self.gene_loc_set.regions])

    def get_install_path(self):
        return self.data_path

    def get_windows(self):
        return '\n'.join(str(r) for r in self.genome.list_windows())

    # ____ RP MAP DATA _____

    @staticmethod
    def _make_basic_rp_map(gene_loc_set, region_set, decay):

        distance_matrix = gene_loc_set.map_intersects(
            region_set,
            lambda x, y: x.get_genomic_distance(y),
            slop_distance=5 * decay)

        # exponential decay: a region `decay` bp from a TSS contributes half
        # the weight of a region at the TSS itself
        distance_matrix.data = np.power(2, -distance_matrix.data / decay)

        return distance_matrix.tocsr()
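
    # Worked example of the decay weighting above (toy numbers, not package
    # defaults): with decay = 10000, a window 10 kb from a TSS gets half the
    # weight of a window at the TSS, and windows beyond the 5 * decay slop
    # distance are never scored at all.
    #
    #   >>> decay = 10000
    #   >>> distances = np.array([0, 10000, 20000, 50000])
    #   >>> np.power(2, -distances / decay)
    #   array([1.     , 0.5    , 0.25   , 0.03125])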
    def _make_enhanced_rp_map(self, gene_loc_set, region_set, decay):

        #make regions x exons map and exons x genes map
        indptr, indices, exons = [0], [], []
        for locus in gene_loc_set.regions:
            new_exons = locus.annotation.get_exon_regions()
            exons.extend(new_exons)
            indices.extend(range(indptr[-1], indptr[-1] + len(new_exons)))
            indptr.append(indptr[-1] + len(new_exons))

        exon_gene_map = sparse.csc_matrix(
            (np.ones(len(exons)), indices, indptr),
            shape=(len(exons), len(gene_loc_set.regions)))

        exons = genome_tools.RegionSet(exons, self.genome)

        region_exon_map = region_set.map_intersects(
            exons,
            distance_function=lambda x, y: x.overlaps(y, min_overlap_proportion=0.4),
            slop_distance=0)  #REGIONS X EXONS

        region_exon_map = region_exon_map.dot(exon_gene_map).astype(bool)

        not_exon_promoter = 1 - region_exon_map.sum(axis=1).astype(bool)

        basic_rp_map = self._make_basic_rp_map(gene_loc_set, region_set, decay)

        # exonic/promoter regions get full weight; all others keep the
        # distance-decayed weight from the basic RP map
        enhanced_rp_map = basic_rp_map.transpose().multiply(not_exon_promoter) \
            + region_exon_map

        return enhanced_rp_map.transpose()

    def build_binned_rp_map(self, style, rp_decay):

        region_set = genome_tools.RegionSet(
            list(self.genome.list_windows()), self.genome)

        if style == 'basic':
            return self._make_basic_rp_map(self.gene_loc_set, region_set, rp_decay)
        elif style == 'enhanced':
            return self._make_enhanced_rp_map(self.gene_loc_set, region_set, rp_decay)
        else:
            raise NotImplementedError()

    @staticmethod
    def set_attributes(dataset, attr_dict):
        for key, value in attr_dict.items():
            dataset.attrs[key] = value

    def get_rp_map_shape(self):
        return (len(self.genes), len(self.genome))

    def add_rp_map(self, style, rp_map):

        assert rp_map.shape == self.get_rp_map_shape(), \
            'RP map must be of shape (num genes, num bins): ' + str(self.get_rp_map_shape())

        rp_map_path = self._config.get('rp_map', 'rp_map').format(style=style)

        rp_map = rp_map.tocsr()

        with h5.File(self.path, 'a') as data:

            if rp_map_path in data:
                del data[rp_map_path]

            group = data.create_group(rp_map_path)

            group.create_dataset('indptr', data=rp_map.indptr, dtype=np.int32,
                                 compression=COMPRESSION)
            group.create_dataset('indices', data=rp_map.indices, dtype=np.int32,
                                 compression=COMPRESSION)
            group.create_dataset('data', data=rp_map.data, dtype=np.float32,
                                 compression=COMPRESSION)

            self.set_attributes(group, dict(shape=rp_map.shape))

    def get_rp_maps(self):

        try:
            with h5.File(self.path, 'a') as data:
                return list(data['rp_maps'].keys())
        except KeyError:
            return []

    def get_rp_map(self, style):

        rp_map_path = self._config.get('rp_map', 'rp_map').format(style=style)

        with h5.File(self.path, 'r') as data:

            try:
                group = data[rp_map_path]

                rp_map = sparse.csr_matrix(
                    (group['data'][...], group['indices'][...], group['indptr'][...]),
                    shape=group.attrs['shape'])

            except KeyError:
                raise DatasetNotFoundError(rp_map_path)

        return rp_map

    #___ BIN PROJECTION _____

    def check_bin_map_unique(self, bin_map):
        return len(np.unique(bin_map)) == len(bin_map)

    def project_indices(self, indices, bin_map):

        input_hits = sparse.csc_matrix(
            (np.ones_like(indices), indices, [0, len(indices)]))

        input_hits = self.project_sparse_matrix(input_hits, bin_map, None)

        return input_hits.tocoo().row

    @staticmethod
    def project_array(arr, bin_map, num_bins):
        #assert(check_bin_map_unique(bin_map[:,0]) and check_bin_map_unique(bin_map[:,1])), 'To project array, bin_map must have all one-to-one mappings'
        new_arr = np.zeros(num_bins)
        new_arr[bin_map[:, 1]] = arr[bin_map[:, 0]]

        return new_arr

    @staticmethod
    def project_sparse_matrix(input_hits, bin_map, num_bins, binarize=False):

        index_converted = input_hits.tocsc()[bin_map[:, 0], :].tocoo()

        input_hits = sparse.coo_matrix(
            (index_converted.data,
             (bin_map[index_converted.row, 1], index_converted.col)),
            shape=(num_bins, input_hits.shape[1]) if num_bins is not None else None
        ).tocsr()

        if binarize:
            input_hits.data = np.ones_like(input_hits.data)

        return input_hits
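
    # Worked example of project_sparse_matrix (toy values): bin_map is a
    # (k, 2) integer array of (source_bin, target_bin) pairs, so hit counts
    # are carried row-wise from one binning of the genome into another.
    #
    #   >>> hits = sparse.csc_matrix(np.array([[1], [0], [2], [0]]))
    #   >>> bin_map = np.array([[0, 1], [2, 0]])  # source bin 0 -> target 1, 2 -> 0
    #   >>> DataInterface.project_sparse_matrix(hits, bin_map, num_bins=2).toarray()
    #   array([[2],
    #          [1]])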
    #___ BINDING FACTOR DATA _____

    def get_factor_hit_path(self, technology, dataset_id):
        return self._config.get('factor_binding', 'hits').format(
            technology=technology, dataset_id=dataset_id)

    def get_factor_score_path(self, technology, dataset_id):
        return self._config.get('factor_binding', 'scores').format(
            technology=technology, dataset_id=dataset_id)

    def get_metadata(self, attributes, technology, dataset_id):
        return {
            dataset_id: {
                key: attributes[key]
                for key in self.get_metadata_headers(technology)
            }
        }

    def transpose_metadata(self, metadata, technology):

        headers = self.get_metadata_headers(technology)
        sample_ids = list(metadata.keys())

        return {
            'sample_id': sample_ids,
            **{
                key: [metadata[sample][key] for sample in sample_ids]
                for key in headers
            }
        }

    def add_binding_data(self, technology, dataset_id, hit_bins, hit_scores=None, **metadata):

        hits_path = self.get_factor_hit_path(technology, dataset_id)
        scores_path = self.get_factor_score_path(technology, dataset_id)

        with h5.File(self.path, 'a') as data:

            if hits_path in data:
                del data[hits_path]

            hits = data.create_dataset(hits_path, data=np.array(hit_bins),
                                       dtype=np.int32, compression=COMPRESSION)

            if hit_scores is not None:
                assert len(hit_bins) == len(hit_scores)

                if scores_path in data:
                    del data[scores_path]

                data.create_dataset(scores_path, data=np.array(hit_scores),
                                    dtype=np.float64, compression=COMPRESSION)

            self.set_attributes(hits, metadata)

    def get_binding_dataset(self, technology, dataset_id):

        with h5.File(self.path, 'r') as data:

            factor_dataset_path = self.get_factor_hit_path(technology, dataset_id)
            scores_path = self.get_factor_score_path(technology, dataset_id)

            try:
                hit_bins = np.array(data[factor_dataset_path][...])
                attributes = data[factor_dataset_path].attrs

                if scores_path in data:
                    scores = np.array(data[scores_path][...])
                else:
                    scores = np.ones_like(hit_bins)

            except KeyError:
                raise DatasetNotFoundError(factor_dataset_path)

            metadata = self.get_metadata(attributes, technology, dataset_id)

        return hit_bins, scores, metadata

    def get_binding_data(self, technology):

        with h5.File(self.path, 'r') as data:
            dataset_ids = list(data[self._config.get('factor_binding', 'root')
                                    .format(technology=technology)].keys())

        indices = []
        scores = []
        metadata = dict()
        for dataset_id in dataset_ids:

            hit_bins, hit_scores, sample_meta = self.get_binding_dataset(
                technology, dataset_id)

            metadata.update(sample_meta)
            indices.append(hit_bins)
            scores.append(hit_scores)

        hits_matrix = indices_list_to_sparse_array(indices, len(self.genome), scores)

        return hits_matrix.transpose(), np.array(dataset_ids), \
            self.transpose_metadata(metadata, technology)

    def remove_binding_dataset(self, technology, dataset_id):

        factor_dataset_path = self.get_factor_hit_path(technology, dataset_id)

        with h5.File(self.path, 'a') as data:
            del data[factor_dataset_path]

    def list_binding_datasets(self, technology):

        try:
            with h5.File(self.path, 'r') as data:
                dataset_ids = list(data[self._config.get('factor_binding', 'root')
                                        .format(technology=technology)].keys())
            return dataset_ids

        except KeyError:
            return []
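
    # Sketch of the metadata round-trip (toy values; the real header list
    # comes from the package config): get_metadata returns a per-sample dict
    # keyed by dataset id, and transpose_metadata flips that layout into
    # column-oriented lists for tabular output.
    #
    #   {'1': {'factor': 'ESR1'}, '2': {'factor': 'GATA3'}}
    #       --- transpose_metadata --->
    #   {'sample_id': ['1', '2'], 'factor': ['ESR1', 'GATA3']}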
    #____ PROFILE DATA _____

    def add_profile_data(self, technology, dataset_id, profile, rp_maps,
                         rp_map_styles, norm_depth=1e5, **metadata):

        assert len(rp_maps) == len(rp_map_styles)

        profile_path = self._config.get('profiles', 'profile').format(
            technology=technology, dataset_id=dataset_id)

        profile = np.array(profile)
        if len(profile.shape) == 1:
            profile = profile[:, np.newaxis]

        assert len(profile.shape) == 2
        assert profile.shape[0] == self.genome.num_windows_in_genome()

        if norm_depth is not None:
            # normalize read depth so profiles are comparable across samples
            profile = profile / profile.sum() * norm_depth

        with h5.File(self.path, 'a') as data:

            if profile_path in data:
                del data[profile_path]

            hits = data.create_dataset(profile_path, data=profile,
                                       dtype=np.float16, compression=COMPRESSION)
            self.set_attributes(hits, metadata)

            # precompute one RP matrix column per RP map style
            for rp_map, style in zip(rp_maps, rp_map_styles):

                rp_matrix_path = self._config.get('profiles', 'rp_matrix_col').format(
                    technology=technology, style=style, dataset_id=dataset_id)

                if rp_matrix_path in data:
                    del data[rp_matrix_path]

                rp_matrix_col = data.create_dataset(rp_matrix_path,
                                                    data=rp_map.dot(profile),
                                                    dtype=np.float32,
                                                    compression=COMPRESSION)
                self.set_attributes(rp_matrix_col, metadata)

    def remove_profile(self, technology, dataset_id):

        profile_path = self._config.get('profiles', 'profile').format(
            technology=technology, dataset_id=dataset_id)

        # enumerate RP map styles before re-opening the file for writing
        styles = self.get_rp_maps()

        with h5.File(self.path, 'a') as data:

            del data[profile_path]

            for style in styles:
                rp_matrix_col_path = self._config.get('profiles', 'rp_matrix_col').format(
                    technology=technology, style=style, dataset_id=dataset_id)
                del data[rp_matrix_col_path]

    def get_profile(self, technology, dataset_id):

        profile_path = self._config.get('profiles', 'profile').format(
            technology=technology, dataset_id=dataset_id)

        with h5.File(self.path, 'r') as data:

            try:
                profile = np.array(data[profile_path][...])
                attributes = data[profile_path].attrs
            except KeyError:
                raise DatasetNotFoundError(profile_path)

            metadata = self.get_metadata(attributes, technology, dataset_id)

        return profile, metadata

    def list_profiles(self, technology):

        profiles_dir = self._config.get('profiles', 'root').format(technology=technology)

        try:
            with h5.File(self.path, 'r') as data:
                dataset_ids = list(data[profiles_dir].keys())
            return dataset_ids

        except KeyError:
            return []

    def get_rp_matrix(self, technology, style):

        with h5.File(self.path, 'r') as data:

            rp_matrix_dir = self._config.get('profiles', 'rp_matrix').format(
                technology=technology, style=style)

            dataset_ids = list(data[rp_matrix_dir].keys())

            slices = []
            for _id in dataset_ids:
                slices.append(np.array(data[rp_matrix_dir][_id][...]))

        return np.concatenate(slices, axis=1), np.array(dataset_ids)

    def download_data(self):

        with self.log.section('Grabbing {} data (~15 minutes):'.format(self.species)):

            self.log.append('Downloading from database ...')

            try:
                self.fetch_from_cistrome(self.species, self.window_size)
            except error.URLError:
                raise AssertionError(
                    'ERROR: Cannot connect to cistrome.org for data '
                    '(usually due to security settings on some servers)!\n'
                    'View github pages for manual dataset install instructions.')

            self.log.append('Done')
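
# Minimal end-to-end sketch (illustrative; the species, window size, and
# technology string below are assumptions, not package constants). On first
# use, constructing a DataInterface downloads the prepackaged dataset from
# cistrome.org; afterwards, stored datasets can be listed and RP maps rebuilt.
if __name__ == '__main__':

    data = DataInterface('hg38', window_size=1000)

    # enumerate binding datasets stored in the local h5 file
    print(data.list_binding_datasets('ChIP-seq'))

    # build and register a basic RP map with a 10 kb decay distance
    basic_map = data.build_binned_rp_map('basic', 10000)
    data.add_rp_map('basic', basic_map)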