def lisa_multi(args):
    """Model every query gene list in ``args.query_lists`` with one FromGenes instance.

    Each open query file is read up front and keyed by the basename of its
    ``name``. Every list is then modeled inside its own log section; the top
    factors returned by ``save_and_get_top_TFs`` are collected and handed to
    ``print_results_multi`` once all lists have been attempted.

    An ``AssertionError`` raised while modeling or saving one list is logged
    and does not stop the remaining lists from being processed.

    Raises:
        MultiError: after the summary is printed, if at least one list
            raised an ``AssertionError``.
    """
    log = Log(target=sys.stderr, verbose=args.verbose)
    lisa = FromGenes(
        args.species,
        **extract_kwargs(args, INSTANTIATION_KWARGS),
        log=log
    )

    # Read every query file before any modeling starts.
    query_dict = {}
    for query in args.query_lists:
        list_name = os.path.basename(query.name)
        query_dict[list_name] = query.readlines()

    summary_rows = []
    any_failed = False
    for list_name, genes in query_dict.items():
        section_title = 'Modeling {}:'.format(str(list_name))
        with log.section(section_title):
            try:
                predict_kwargs = extract_kwargs(args, PREDICTION_KWARGS)
                results, metadata = lisa.predict(genes, **predict_kwargs)
                top_factors = save_and_get_top_TFs(args, list_name, results, metadata)
                summary_rows.append((list_name, top_factors))
            except AssertionError as err:
                # Record the failure, but keep going with the other lists.
                any_failed = True
                log.append('ERROR: ' + str(err))

    print_results_multi(summary_rows)

    if any_failed:
        raise MultiError('One or more genelists raised an error')
def lisa_deseq(args):
    """Model the up- and down-regulated gene lists parsed from a DESeq file.

    ``parse_deseq_file`` splits ``args.deseq_file`` into two gene lists using
    ``args.lfc_cutoff``, ``args.pval_cutoff`` and ``args.sep``. Each list is
    modeled inside its own log section ('up-regulated' first, then
    'down-regulated'); the top factors returned by ``save_and_get_top_TFs``
    are collected and passed to ``print_results_multi`` at the end.

    An ``AssertionError`` raised while modeling or saving one list is logged
    and the other list is still attempted.

    Raises:
        MultiError: after the summary is printed, if either list raised an
            ``AssertionError``.
    """
    log = Log(target=sys.stderr, verbose=args.verbose)
    lisa = FromGenes(
        args.species,
        **extract_kwargs(args, INSTANTIATION_KWARGS),
        log=log
    )

    up_genes, down_genes = parse_deseq_file(
        args.deseq_file,
        lfc_cutoff=args.lfc_cutoff,
        pval_cutoff=args.pval_cutoff,
        sep=args.sep,
    )
    labelled_lists = (
        ('up-regulated', up_genes),
        ('down-regulated', down_genes),
    )

    summary_rows = []
    any_failed = False
    for prefix, genes in labelled_lists:
        section_title = 'Modeling {}:'.format(str(prefix))
        with log.section(section_title):
            try:
                predict_kwargs = extract_kwargs(args, PREDICTION_KWARGS)
                results, metadata = lisa.predict(genes, **predict_kwargs)
                top_factors = save_and_get_top_TFs(args, prefix, results, metadata)
                summary_rows.append((prefix, top_factors))
            except AssertionError as err:
                # Record the failure, but still attempt the other direction.
                any_failed = True
                log.append('ERROR: ' + str(err))

    print_results_multi(summary_rows)

    if any_failed:
        raise MultiError('One or more genelists raised an error')
def convert_bigwig(cls, bigwig, species, bigwig_cmd_path):
    """Convert a BigWig file into a per-window coverage array via an external command.

    The genome bin file for ``species`` is written first if it does not exist
    yet. The executable at ``bigwig_cmd_path`` is then run with three
    arguments: the BigWig path, the genome bin path and a temporary output
    path. The output is read as tab-separated text; for each line, field 0 is
    used as the index into the coverage array and field 4 as the value.
    NOTE(review): this field layout looks like UCSC ``bigWigAverageOverBed``
    output -- confirm against the tool actually used.

    Parameters:
        bigwig: Path to the BigWig file, passed to the external command.
        species: Species key used to load the genome and locate the bin file.
        bigwig_cmd_path: Path of the executable to run.

    Returns:
        numpy.ndarray: Array of length ``len(genome)``; positions not present
        in the command output stay 0.

    Raises:
        AssertionError: if the command exits with a non-zero return code; the
            message is the command's decoded stderr.
    """
    log = Log()

    genome = DataInterface.load_genome(species, cls.window_size)
    coverage_array = np.zeros(len(genome))

    log.append('Converting BigWig file to coverage array ...')

    if not os.path.exists(cls._get_genome_bin_path(species)):
        log.append('Writing bins ...')
        cls._write_genome_bins(species)

    # Create the temp file *before* entering the try block: if creation fails
    # there is nothing to clean up, and the finally clause must not touch an
    # unbound name (which would mask the real error).
    temp = tempfile.NamedTemporaryFile('w', delete=False)
    try:
        # Only the path is needed; the external command writes the content.
        temp.close()

        process = subprocess.run(
            [bigwig_cmd_path, bigwig, cls._get_genome_bin_path(species), temp.name],
            capture_output=True,
        )
        if process.returncode != 0:
            raise AssertionError(process.stderr.decode('utf-8'))

        with open(temp.name, 'r') as cmd_output:
            for line in cmd_output:
                fields = line.strip().split('\t')
                # Explicit conversion instead of relying on NumPy coercing a
                # string on assignment.
                coverage_array[int(fields[0])] = float(fields[4])

        return coverage_array
    finally:
        os.remove(temp.name)
def __init__(self, species, window_size=1000, download_if_not_exists=True,
             make_new=False, log=None, path=None, load_genes=True):
    """Bind the instance to a dataset file, then load the genome (and genes).

    Parameters:
        species: Stored as ``self.species``; also used to resolve the default
            dataset path and to load the genome.
        window_size: Coerced to ``int`` and stored as ``self.window_size``.
        download_if_not_exists: When the dataset file is missing and no
            explicit ``path`` was given, call ``self.download_data()``.
        make_new: When true, always (re)create the file at ``self.path`` by
            opening it in ``'w'`` mode, without checking whether it exists.
        log: Logger to use; a fresh ``Log()`` is created when omitted.
        path: Explicit dataset path; when omitted the path comes from
            ``self.get_dataset_path(species, window_size)``.
        load_genes: When true, call ``self.load_genes()`` after the genome
            has been loaded.
    """
    self.species = species
    self.window_size = int(window_size)
    self.log = Log() if log is None else log
    self.path = (
        self.get_dataset_path(self.species, self.window_size)
        if path is None
        else path
    )

    # The existence check is skipped entirely when make_new is set.
    file_missing = (not make_new) and (not os.path.isfile(self.path))

    if file_missing and download_if_not_exists and path is None:
        # Only the default location is ever downloaded.
        self.download_data()
    elif make_new or file_missing:
        # Open in write mode and close immediately to leave a fresh file.
        h5.File(self.path, 'w').close()

    #___ LOAD GENE DATA FROM PACKAGE _____
    self.genome = self.load_genome(self.species, self.window_size)

    if load_genes:
        self.load_genes()
def convert_bigwig(cls, bigwig, species, log=None):
    """Convert a BigWig file into a per-window coverage array.

    Every window of the genome for ``species`` (at ``cls.window_size``) is
    looked up in the BigWig file. Windows on chromosomes that the file
    reports get the first value returned by ``stats`` for that window; all
    other windows stay 0. NaNs are replaced via ``np.nan_to_num`` before
    returning. Progress is reported to ``log`` once before the loop and
    then at every 1000th window.

    Parameters:
        bigwig: Path passed to ``bw.open`` (NOTE(review): ``bw`` is
            presumably pyBigWig -- confirm against the file's imports).
        species: Species key used to load the genome.
        log: Logger to report to; a fresh ``Log()`` is created when omitted.

    Returns:
        numpy.ndarray: Array of length ``len(genome)``.
    """
    if log is None:
        log = Log()

    genome = DataInterface.load_genome(species, cls.window_size)
    coverage_array = np.zeros(len(genome))

    log.append('Converting BigWig file to coverage array ...')
    bar = LoadingBar('Progress', len(genome) // 1000 + 1, cold_start=True)

    # Open *before* the try block: if opening fails there is nothing to
    # close, and the finally clause must not reference an unbound name
    # (which would replace the real error with UnboundLocalError).
    coverage_bw = bw.open(bigwig)
    try:
        log.append(bar, update_line=True)

        # The set of chromosomes in the file does not change while reading,
        # so query it once instead of once per genome window.
        available_chroms = coverage_bw.chroms()

        for i, window in enumerate(genome.list_windows()):
            if window.chromosome in available_chroms:
                mean_coverage = coverage_bw.stats(*window.to_tuple())[0]
                coverage_array[i] = mean_coverage

            if i % 1000 == 0:
                log.append(bar, update_line=True)

        return np.nan_to_num(coverage_array)
    finally:
        coverage_bw.close()
def main(species, motif_bed, window_size, gamma_threshold=0.95):
    """Bin the motif hits of a gzipped BED file into genome windows and threshold them.

    Each line of ``motif_bed`` must hold exactly seven tab-separated fields
    (chrom, start, end, factor, relscore, log_pval, strand). Every hit is
    mapped to the genome windows it covers and contributes ``log_pval / 100``
    to each of them; hits that raise ``BadRegionError`` are skipped. Scores
    landing in the same window are summed. A gamma distribution is fitted to
    10000 randomly drawn window scores, and windows whose score is positive
    and at least the ``gamma_threshold`` quantile of that fit are kept.

    Parameters:
        species: Species key used to load the genome.
        motif_bed: Path to the gzip-compressed BED file of motif hits.
        window_size: Window size used to load the genome.
        gamma_threshold: Quantile of the fitted gamma distribution used as
            the minimum window score.

    Returns:
        tuple: ``(hit_indices, factor_name)`` -- the indices of the retained
        windows and the factor field of the first line (``None`` if the
        file has no lines).
    """
    genome = DataInterface.load_genome(species, window_size)
    log = Log(target=stderr)

    with gzip.open(motif_bed, 'rb') as bed_file:
        bed = bed_file.readlines()

    num_hits = len(bed)
    progress = LoadingBar('Binning {} motif hits'.format(str(num_hits)), num_hits, cold_start=True)

    factor_name = None
    window_nums = []
    scores = []
    for line_num, raw_line in enumerate(bed):
        record = raw_line.decode('utf-8').strip().split('\t')
        chrom, start, end, factor, relscore, log_pval, strand = record

        if line_num == 0:
            factor_name = factor

        try:
            hit_windows = genome.get_region_windows(Region(chrom, start, end))
            window_nums.extend(hit_windows)
            scores.extend([float(log_pval) / 100] * len(hit_windows))
        except BadRegionError:
            # Regions the genome rejects are ignored.
            pass

        log.append(progress, update_line=True)

    log.append('')
    log.append('Done')

    # Single-column sparse matrix; the COO round trip merges repeated rows
    # so that each window ends up with one summed score.
    column = sparse.csc_matrix(
        (scores, window_nums, [0, len(window_nums)]),
        shape=(len(genome), 1),
    )
    hits = column.tocoo().tocsc()

    all_window_scores = np.array(hits.todense()).reshape(-1)
    sample_hit_scores = np.random.choice(all_window_scores, size=10000)

    fitted_gamma = gamma(*gamma.fit(sample_hit_scores))
    min_bin_score = fitted_gamma.ppf(gamma_threshold)

    keep = (hits.data >= min_bin_score) & (hits.data > 0)
    hit_indices = hits.indices[keep]

    return hit_indices, factor_name
def using_bigwig(cls, species, query_genes, bigwig_path, rp_map='enhanced_10K', isd_method='chipseq',
                 background_list=None, background_strategy='all', num_background_genes=3000,
                 seed=2556, verbose=4, log=None):
    '''
    *classmethod*
    **lisa.FromCoverage.using_bigwig** (species, query_genes, bigwig_path, rp_map = 'enhanced_10K', isd_method = 'chipseq', background_list = None, background_strategy = 'all', num_background_genes = 3000, seed = 2556, verbose = 4, log = None)

    Run LISA FromCoverage test using a bigwig coverage file.

    Parameters:
        species: {'hg38', 'mm10'}
        query_genes (list): Genes-of-interest, in either Symbol or RefSeqID format. Must provide between 20 to 500 genes.
        bigwig_path (str): Path to bigwig file
        rp_map, isd_method, verbose: Passed unchanged to the class constructor.
        background_list (list): Passed to ``predict``. ``None`` (the default) is replaced by a new empty list on every call.
        background_strategy, num_background_genes, seed: Passed unchanged to ``predict``.
        log: Logger shared by the bigwig conversion and the model. A new ``Log()`` is created when omitted.

    Returns:
        results (lisa.core.utils.LISA_Results): With each key representing a table column, sorted by "summary_p_value" field. The dictionary can be passed directly to the pandas constructor: ``results_df = pd.DataFrame(results.to_dict())``.
        metadata (dict): Test metadata. Includes query genes provided and background genes that were selected.
    '''
    if log is None:
        log = Log()

    # A literal [] default would be one list object shared by every call;
    # build a fresh one so nothing downstream can leak state between calls.
    if background_list is None:
        background_list = []

    coverage_array = cls.convert_bigwig(bigwig_path, species, log=log)

    model = cls(species, coverage_array, rp_map=rp_map, isd_method=isd_method,
                verbose=verbose, log=log)

    return model.predict(query_genes,
                         background_list=background_list,
                         background_strategy=background_strategy,
                         num_background_genes=num_background_genes,
                         seed=seed)