def load_modisco_results(modisco_dir):
    """Load a saved TF-MoDISco result and rebuild the track set it refers to.

    Args:
      modisco_dir: directory path `output_dir` in `basepair.cli.modisco.modisco_run`
        contains: modisco.h5, strand_distances.h5, kwargs.json

    Returns:
      Tuple `(mr, tasks, grad_type)` where `mr` is the TfModiscoResults object
      containing the original track_set, `tasks` is the list of task names
      found under `targets/profile` in the importance-score file and
      `grad_type` is the (comma-separated) gradient type string stored in
      kwargs.json.
    """
    from modisco.tfmodisco_workflow import workflow

    modisco_kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    grad_type = modisco_kwargs['grad_type']
    grad_types = grad_type.split(",")

    # load used strand distance filter
    included_samples = HDF5Reader.load(
        os.path.join(modisco_dir, "strand_distances.h5"))['included_samples']

    # load importance scores
    d = HDF5Reader.load(modisco_kwargs['imp_scores'])
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    tasks = list(d['targets']['profile'])
    one_hot = d['inputs']['seq'] if isinstance(d['inputs'], dict) else d['inputs']

    # Tracks are keyed by "<task>/<grad type>"; the same names have to be
    # handed to prep_track_set (as `modisco_run` does), otherwise the lookup
    # of each task name in the score dictionaries misses.
    task_names = [f"{task}/{gt}" for task in tasks for gt in grad_types]

    thr_one_hot = one_hot[included_samples]
    thr_hypothetical_contribs = {
        f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[included_samples]
        for task in tasks
        for gt in grad_types
    }
    thr_contrib_scores = {
        name: hyp * thr_one_hot
        for name, hyp in thr_hypothetical_contribs.items()
    }

    track_set = workflow.prep_track_set(
        task_names=task_names,
        contrib_scores=thr_contrib_scores,
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot)

    with h5py.File(os.path.join(modisco_dir, "modisco.h5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)
    return mr, tasks, grad_type
def get_eval_predictions(tf, model, filter_dnase=False):
    """Return `(y_true, y_pred)` for the given TF / model pair.

    Args:
      tf: transcription factor name; used to locate the prediction file,
        the labels bed file and (via `TF2CT`) the DNase peak file.
      model: model name; `<eval_dir>/<tf>/<model>.h5` holds the predictions.
      filter_dnase: if True, keep only the intervals that overlap the relaxed
        DNase peaks (bedtools intersect with `f=.5`).

    Returns:
      Tuple of the true labels and the predictions read from `/preds`.
    """
    bed_columns = ['chr', 'start', 'end', 'y_true']

    with HDF5Reader(os.path.join(eval_dir, tf, model + ".h5")) as reader:
        y_pred = reader.f['/preds'][:]

    labels_bed_file = os.path.join(root_dir,
                                   get_dl_kwargs(tf)['intervals_file'])
    df_unfiltered = pd.read_csv(labels_bed_file, sep="\t", header=None)
    df_unfiltered.columns = bed_columns

    if not filter_dnase:
        return df_unfiltered.y_true.values, y_pred[:]

    # Filter the DNase peaks based on the overlaps
    dnase_peaks = (f'{ddir}/raw/tfbinding/eval/tf-DREAM/'
                   f'DNASE.{TF2CT[tf]}.relaxed.narrowPeak.gz')
    overlapping = BedTool(labels_bed_file).intersect(
        BedTool(dnase_peaks), u=True, wa=True, f=.5)
    df_filtered = pd.read_csv(overlapping.fn, sep="\t", header=None)
    df_filtered.columns = bed_columns
    df_filtered['filtered'] = True

    # Rows without a partner in the filtered table get NaN in `filtered`,
    # which compares unequal to True.
    merged = df_unfiltered.merge(df_filtered,
                                 how='left',
                                 on=list(df_unfiltered.columns))
    keep = merged.filtered.eq(True)
    return df_unfiltered.y_true.values[keep], y_pred[keep]
def modisco_score(modisco_dir,
                  imp_scores,
                  output_tsv,
                  output_seqlets_pkl=None,
                  seqlet_len=25,
                  n_cores=1,
                  method="rank",
                  trim_pattern=False):
    """Find seqlet instances using modisco

    Args:
      modisco_dir: modisco run directory, read with `load_modisco_results`.
      imp_scores: path to the HDF5 file with the importance scores to scan.
      output_tsv: tab-separated output file; its directory also receives the
        log file.
      output_seqlets_pkl: if given, the seqlets are additionally pickled here.
      seqlet_len, n_cores, method, trim_pattern: forwarded to `find_instances`.

    Returns:
      `(seqlets, df)`; `([], None)` when no seqlets were found.
    """
    add_file_logging(os.path.dirname(output_tsv), logger, 'modisco-score')
    mr, tasks, grad_type = load_modisco_results(modisco_dir)

    # load importance scores we want to score
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    inputs = d['inputs']
    one_hot = inputs['seq'] if isinstance(inputs, dict) else inputs

    hypothetical_contribs = {}
    contrib_scores = {}
    for task in tasks:
        for gt in grad_type.split(","):
            key = f"{task}/{gt}"
            hypothetical_contribs[key] = mean(d['hyp_imp'][task][gt])
            contrib_scores[key] = hypothetical_contribs[key] * one_hot

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             one_hot,
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)
    if len(seqlets) == 0:
        print("ERROR: no seqlets found!!")
        return [], None

    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)

    # Attach the genomic range of each example to its seqlets
    ranges = pd.DataFrame(d['metadata']['range'])
    ranges.columns = ["example_" + col for col in ranges.columns]
    df = labelled_seqlets2df(seqlets).merge(ranges,
                                            left_on="example_idx",
                                            how='left',
                                            right_on="example_id")
    df.to_csv(output_tsv, sep='\t')
    return seqlets, df
def __init__(self, fpath):
    """Open the HDF5 file at `fpath` and keep the opened reader in `self.f`.

    Args:
      fpath: path passed to `HDF5Reader`.
    """
    self.fpath = fpath
    # example ranges. loaded when needed
    self.ranges = None

    self.f = HDF5Reader(fpath)
    self.f.open()
def load_included_samples(modisco_dir):
    """Reconstruct the boolean mask of examples used by a modisco run.

    The mask starts as all-True (one entry per example in the importance
    score file named in kwargs.json) and is AND-ed with every filter the run
    recorded: the strand-distance filter, the `filter_npy` vector and the
    `subset_tasks` interval filter.

    Args:
      modisco_dir: modisco run directory containing kwargs.json and,
        optionally, strand_distances.h5

    Returns:
      Boolean numpy array with one entry per example.
    """
    modisco_dir = Path(modisco_dir)
    kwargs = read_json(modisco_dir / "kwargs.json")

    d = ImpScoreFile(kwargs["imp_scores"])
    try:
        interval_from_task = d.get_ranges().interval_from_task
        n = len(d)
    finally:
        # release the file handle even if reading the ranges fails
        d.close()

    included_samples = np.ones((n, ), dtype=bool)

    distances_file = modisco_dir / "strand_distances.h5"
    if not kwargs.get("skip_dist_filter", False) and distances_file.exists():
        included_samples = HDF5Reader.load(
            distances_file)['included_samples'] & included_samples

    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"]) & included_samples

    if kwargs.get("subset_tasks", None) is not None and kwargs.get(
            "filter_subset_tasks", False):
        included_samples = interval_from_task.isin(
            kwargs['subset_tasks']).values & included_samples

    return included_samples
def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    """Write the same batch twice and check what HDF5Reader reads back."""
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)

    writer = HDF5BatchWriter(tmpfile, chunk_size=4)
    for _ in range(2):
        writer.batch_write(batch)
    writer.close()

    def twice(arr):
        # the batch was written two times in a row
        return np.concatenate([arr, arr])

    metadata = dl_batch['metadata']
    with HDF5Reader(tmpfile) as f:
        first_batch = list(f.batch_iter(2))[0]
        assert np.all(
            first_batch['metadata']['gene_id'] == metadata['gene_id'][:2])

        out = f.load_all()
        assert np.all(
            out['metadata']['gene_id'] == twice(metadata['gene_id']))
        assert np.all(out['metadata']['ranges']["chr"] == twice(
            metadata['ranges']['chr']))
        assert np.all(out['metadata']['ranges']["start"] == twice(
            metadata['ranges']['start']))
        assert np.all(out['preds'][:3] == pred_batch_array)
def test_test_example(example, tmpdir):
    """kipoi test ..., add also output file writing
    """
    py3_only = {"rbp", "non_bedinput_model", "iris_model_template"}
    if sys.version_info[0] == 2 and example in py3_only:
        pytest.skip("example not supported on python 2 ")

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)

    args = ["python", "./kipoi/__main__.py", "test"]
    args += ["--batch_size=4", example_dir]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    assert subprocess.call(args=args) == 0

    if example != 'pyt':
        return

    # python interface, write also the output file
    output_file = os.path.join(example_dir, 'preds.h5')
    kipoi.cli.main.cli_test("test", args[3:] + ["-o", output_file])
    assert os.path.exists(output_file)

    preds = HDF5Reader.load(output_file)
    for group in ('inputs', 'metadata', 'preds'):
        assert group in preds
def test_predict_example(example, tmpdir):
    """kipoi predict ...
    """
    # TODO - test -out
    # Traceback (most recent call last):
    #   File ".../kipoi/__main__.py", line 60, in <module>
    #     main()
    #   File ".../kipoi/__main__.py", line 56, in main
    #     command_fn(args.command, sys.argv[2:])
    #   File ".../site-packages/kipoi/pipeline.py", line 273, in cli_predict
    #     pred_batch = model.predict_on_batch(batch['inputs'])
    #   File ".../site-packages/kipoi/model.py", line 22, in predict_on_batch
    #     raise NotImplementedError
    # NotImplementedError
    py3_only = {"rbp", "non_bedinput_model", "iris_model_template"}
    if sys.version_info[0] == 2 and example in py3_only:
        pytest.skip("rbp example not supported on python 2 ")

    example_dir = "examples/{0}".format(example)
    file_format = "tsv" if example == "rbp" else "hdf5"

    print(example)
    print("tmpdir: {0}".format(tmpdir))
    tmpfile = str(tmpdir.mkdir("example").join("out.{0}".format(file_format)))

    # run the command from within the example_files directory
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "predict",
        "../",  # directory
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args=test.json",
        "--output",
        tmpfile,
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    workdir = os.path.realpath(example_dir + "/example_files")
    assert subprocess.call(args=args, cwd=workdir) == 0

    assert os.path.exists(tmpfile)

    if file_format != "hdf5":
        data = pd.read_csv(tmpfile, sep="\t")
        expected_columns = [
            'metadata/ranges/chr',
            'metadata/ranges/end',
            'metadata/ranges/id',
            'metadata/ranges/start',
            'metadata/ranges/strand',
            'preds/0',
        ]
        assert list(data.columns) == expected_columns
    else:
        data = HDF5Reader.load(tmpfile)
        assert {'metadata', 'preds'} <= set(data.keys())
def load(cls, file_path):
    """Build an instance from the contents and attributes of an hdf5 file.

    Args:
      file_path: path of the hdf5 file, opened with `HDF5Reader`.

    Returns:
      `cls(data, attrs)` where `data` is everything returned by
      `load_all()` and `attrs` an OrderedDict copy of the file attributes.
    """
    with HDF5Reader(file_path) as reader:
        contents = reader.load_all()
        # copy the attributes while the file is still open
        file_attrs = OrderedDict(reader.f.attrs)
    return cls(contents, file_attrs)
def load(cls, modisco_dir):
    """Instantiate ModiscoData from tf-modisco run folder

    Args:
      modisco_dir: run folder containing kwargs.json and results.hdf5

    Returns:
      `cls(mr, d, included_samples, tasks)`; `mr` is returned opened.
    """
    run_kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))

    # deeplift hdffile
    imp_data = HDF5Reader.load(run_kwargs['imp_scores'])
    sample_filter = np.load(run_kwargs["filter_npy"])

    # load modisco
    result = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
    result.open()

    task_names = list(imp_data['grads'].keys())
    return cls(result, imp_data, sample_filter, task_names)
def test_preproc_example(example, tmpdir):
    """kipoi preproc ...

    Runs `kipoi preproc` through the command line for the example, checks the
    written hdf5 file against the dataloader's output schema and, for the
    `pyt` example, repeats the call through the python entry point.
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("example not supported on python 2 ")
    if example in {"extended_coda", "kipoi_dataloader_decorator"}:
        # extended_coda will anyway be tested in models
        pytest.skip(
            "randomly failing on circleci without any reason. Skipping this test."
        )

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)
    # example_dir = "example/models/{0}".format(example)

    tmpfile = str(tmpdir.mkdir("output", ).join("out.h5"))

    # run the command from within the example_files directory
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "preproc",
        "../",  # directory
        "--source=dir",
        "--batch_size=4",
        "--num_workers=2",
        "--dataloader_args=test.json",
        "--output",
        tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))
    assert returncode == 0

    assert os.path.exists(tmpfile)

    data = HDF5Reader.load(tmpfile)

    with open(example_dir + "/dataloader.yaml", "r") as f:
        # yaml.load() without a Loader is rejected by recent PyYAML releases
        ex_descr = yaml.safe_load(f)

    if example not in {"pyt", "sklearn_iris"}:
        assert data["inputs"].keys(
        ) == ex_descr["output_schema"]["inputs"].keys()

    if example == 'pyt':
        # Redirect the output of the second run. The output path is not
        # necessarily the last argument: INSTALL_FLAG may follow it.
        args[args.index("--output") + 1] = tmpfile + "2.h5"
        with kipoi.utils.cd(os.path.join(example_dir, "example_files")):
            kipoi.cli.main.cli_preproc("preproc", args[3:])
def __init__(self,
             file_path,
             include_samples=None,
             default_imp_score='weighted'):
    """Open `file_path` with HDF5Reader and set up the instance state.

    Args:
      file_path: path of the hdf5 file to open.
      include_samples: stored as-is on the instance (default None).
      default_imp_score: stored as-is on the instance (default 'weighted').
    """
    self.file_path = file_path
    self.include_samples = include_samples
    self.default_imp_score = default_imp_score
    self._hyp_contrib_cache = dict()

    self.f = HDF5Reader(self.file_path)
    self.f.open()
    # use the hdf5 file handle
    self.data = self.f.f
def test_predict_activation_example(example, tmpdir):
    """Kipoi predict --layer=x with a specific output layer specified

    Runs the prediction through the command line and, for the `pyt` example,
    once more through the python entry point with a second output file.
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")
    if example in {'kipoi_dataloader_decorator'}:
        pytest.skip(
            "Automatically-dowloaded input files skipped for prediction")

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)
    # example_dir = "example/models/{0}".format(example)

    print(example)
    print("tmpdir: {0}".format(tmpdir))
    tmpfile = str(tmpdir.mkdir("output").join("out.h5"))

    # run the command from within the example_files directory
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "predict",
        "../",  # directory
        "--source=dir",
        "--layer",
        predict_activation_layers[example],
        "--batch_size=4",
        "--num_workers=2",
        "--dataloader_args=test.json",
        "--output",
        tmpfile
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))
    assert returncode == 0

    assert os.path.exists(tmpfile)

    data = HDF5Reader.load(tmpfile)
    assert {'metadata', 'preds'} <= set(data.keys())

    if example == 'pyt':
        # Redirect the output of the second run. The output path is not
        # necessarily the last argument: INSTALL_FLAG may follow it.
        args[args.index("--output") + 1] = tmpfile + "2.h5"
        with kipoi.utils.cd(os.path.join(example_dir, "example_files")):
            kipoi.cli.main.cli_predict("predict", args[3:])
def test_preproc_example(example, new_dataloader_kwargs_format, tmpdir):
    """kipoi preproc ...

    Same as the plain preproc test, but when `new_dataloader_kwargs_format`
    is set the dataloader arguments are passed as `key=value` items instead
    of a json file (only wired up for the `rbp` example).
    """
    if example in {"rbp", "non_bedinput_model", "iris_model_template"
                   } and sys.version_info[0] == 2:
        pytest.skip("example not supported on python 2 ")
    if example in {"extended_coda", "kipoi_dataloader_decorator"}:
        # extended_coda will anyway be tested in models
        pytest.skip(
            "randomly failing on circleci without any reason. Skipping this test."
        )

    example_dir = cp_tmpdir("example/models/{0}".format(example), tmpdir)
    # example_dir = "example/models/{0}".format(example)

    tmpfile = str(tmpdir.mkdir("output", ).join("out.h5"))

    # key=value dataloader arguments per example.
    # NOTE: an unreachable branch previously also listed, for extended_coda:
    #   intervals_file=intervals.tsv
    #   input_data_sources={'H3K27AC_subsampled':'H3K27AC_subsampled.bw'}
    #   batch_size=4
    # extended_coda is skipped above, so it is not enabled here.
    explicit_dataloader_args = {
        "rbp": [
            "intervals_file=intervals.tsv",
            "fasta_file=hg38_chr22.fa",
            "preproc_transformer=../dataloader_files/encodeSplines.pkl",
            "gtf_file=gencode_v25_chr22.gtf.pkl.gz",
            "target_file=targets.tsv",
        ],
    }
    if new_dataloader_kwargs_format and example in explicit_dataloader_args:
        dataloader_cli = ["--dataloader_args"
                          ] + explicit_dataloader_args[example]
    else:
        dataloader_cli = ["--dataloader_args=test.json"]

    # run the command from within the example_files directory
    args = [
        "python",
        os.path.abspath("./kipoi/__main__.py"),
        "preproc",
        "../",  # directory
        "--source=dir",
        "--batch_size=4",
        "--num_workers=2",
    ] + dataloader_cli + ["--output", tmpfile]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)
    returncode = subprocess.call(args=args,
                                 cwd=os.path.realpath(example_dir +
                                                      "/example_files"))
    assert returncode == 0

    assert os.path.exists(tmpfile)

    data = HDF5Reader.load(tmpfile)

    with open(example_dir + "/dataloader.yaml", "r") as f:
        # yaml.load() without a Loader is rejected by recent PyYAML releases
        ex_descr = yaml.safe_load(f)

    if example not in {"pyt", "sklearn_iris"}:
        assert data["inputs"].keys(
        ) == ex_descr["output_schema"]["inputs"].keys()

    if example == 'pyt':
        # Redirect the output of the second run. The output path is not
        # necessarily the last argument: INSTALL_FLAG may follow it.
        args[args.index("--output") + 1] = tmpfile + "2.h5"
        with kipoi_utils.utils.cd(os.path.join(example_dir, "example_files")):
            kipoi.cli.main.cli_preproc("preproc", args[3:])
def test_predict_variants_example(example, restricted_bed, file_format,
                                  new_dataloader_kwargs_format, tmpdir):
    """kipoi predict ...

    Runs `score_variants` through the command line, compares the produced
    vcf with the reference vcf and reads back the extra output file.
    """
    testable = {"rbp", "non_bedinput_model"}
    if sys.version_info[0] == 2 or example not in testable:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    example_dir = "tests/models/{0}/".format(example)
    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    if restricted_bed and (example != "rbp"):
        pytest.skip("Resticted_bed only available for rbp_eclip")

    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    relative_paths = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        key: example_dir + rel
        for key, rel in relative_paths.items()
    }

    # the two argument formats differ only in how the dataloader kwargs
    # are spelled out on the command line
    if new_dataloader_kwargs_format:
        dataloader_cli = ["--dataloader_args"] + [
            "{0}={1}".format(key, val)
            for key, val in dataloader_kwargs.items()
        ]
    else:
        import json
        dataloader_kwargs_str = json.dumps(dataloader_kwargs)
        dataloader_cli = ["--dataloader_args='%s'" % dataloader_kwargs_str]

    args = [
        "python",
        os.path.abspath("./kipoi_veff/cli.py"),
        "score_variants",
        # "./",  # directory
        example_dir,
        "--source=dir",
        "--batch_size=4",
    ] + dataloader_cli + [
        "--input_vcf",
        temp(example_dir + "/example_files/variants.vcf", tmpdir),
        # this one was now gone in the master?!
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile,
    ]

    # run the command
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    if restricted_bed:
        args += [
            "--restriction_bed",
            example_dir + "/example_files/restricted_regions.bed"
        ]

    returncode = subprocess.call(args=args, cwd=".")
    assert returncode == 0

    assert os.path.exists(tmpfile)
    assert os.path.exists(vcf_tmpfile)

    reference_name = ("variants_ref_out2.vcf"
                      if restricted_bed else "variants_ref_out.vcf")
    # assert filecmp.cmp(example_dir + "/example_files/" + reference_name, vcf_tmpfile)
    compare_vcfs(example_dir + "/example_files/" + reference_name,
                 vcf_tmpfile)

    if file_format == "hdf5":
        data = HDF5Reader.load(tmpfile)
    else:
        # The text output holds several tables, each introduced by a header
        # line containing "KPVEP_"; collect their line ranges first.
        head_line_id = "KPVEP_"
        table_labels = []
        table_starts = []
        table_ends = []
        tables = {}
        with open(tmpfile, "r") as ifh:
            for line_no, line in enumerate(ifh):
                if head_line_id not in line:
                    continue
                if table_starts:
                    # the previous table ends right before this header
                    table_ends.append(line_no - 1)
                table_labels.append(line.rstrip()[len(head_line_id):])
                table_starts.append(line_no + 1)
            table_ends.append(line_no)
        for label, start, end in zip(table_labels, table_starts, table_ends):
            tables[label] = pd.read_csv(tmpfile,
                                        sep="\t",
                                        skiprows=start,
                                        nrows=end - start,
                                        index_col=0)
def modisco_score2_single_binary(modisco_dir,
                                 output_file,
                                 imp_scores=None,
                                 trim_frac=0.08,
                                 n_jobs=20):
    """
    Equivalent of modisco_score2

    Scans every modisco pattern against the importance scores and sequences
    referenced in `<modisco_dir>/kwargs.json` and writes the pattern
    instances, annotated with the genomic range of each example, to a
    parquet file.

    Args:
      modisco_dir: modisco run directory containing results.hdf5, kwargs.json
        and centroid_seqlet_matches.csv
      output_file: path of the parquet file to write
      imp_scores: currently unused; the importance-score file is always taken
        from kwargs.json
      trim_frac: passed to `trim_seq_ic` of each pattern
      n_jobs: number of parallel jobs used for scanning

    Returns:
      None
    """
    cm_path = os.path.join(modisco_dir, 'centroid_seqlet_matches.csv')
    dfm_norm = pd.read_csv(cm_path)

    mr = ModiscoResult(os.path.join(modisco_dir, "results.hdf5"))
    mr.open()

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile
    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']
    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]

    if kwargs.get("filter_npy", None) is not None:
        included_samples = np.load(kwargs["filter_npy"])
    else:
        # no filter recorded for the run -> keep every example
        included_samples = np.ones((len(one_hot), ), dtype=bool)

    # NOTE(review): the hypothetical scores are always read from the
    # 'deeplift' group while the contribution scores follow `grad_type`;
    # confirm this asymmetry is intended.
    hyp_contrib = {
        f"{task}":
        d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    seq = one_hot[included_samples]

    meta = d['metadata']
    ranges = pd.DataFrame({
        "chrom": meta['range']['chr'][:][included_samples],
        "start": meta['range']['start'][:][included_samples],
        "end": meta['range']['end'][:][included_samples],
        "strand": meta['range']['strand'][:][included_samples],
        # One index per *selected* example. Using the mask length would
        # mismatch the other columns whenever some examples are dropped.
        "idx": np.arange(len(seq)),
        "interval_from_task":
        meta['interval_from_task'][:][included_samples],
    })

    print("Scanning for patterns")
    dfl = []
    mr_patterns = mr.patterns()  # [:2]
    for pattern_name in tqdm(mr_patterns):
        pattern = mr.get_pattern(pattern_name).trim_seq_ic(trim_frac)
        match, importance = pattern.scan_importance(contrib,
                                                    hyp_contrib,
                                                    tasks,
                                                    n_jobs=n_jobs,
                                                    verbose=False)
        seq_match = pattern.scan_seq(seq, n_jobs=n_jobs, verbose=False)
        dfm = pattern.get_instances(
            tasks,
            match,
            importance,
            seq_match,
            norm_df=dfm_norm[dfm_norm.pattern == pattern_name],
            verbose=False,
            plot=False)
        dfl.append(dfm)

    print("Merging")
    # merge and write the results
    dfp = pd.concat(dfl)

    print("Append ranges")
    ranges.columns = ["example_" + v for v in ranges.columns]
    dfp = dfp.merge(ranges, on="example_idx", how='left')

    dfp.info()
    dfp.to_parquet(output_file)

    return None
def test_predict_variants_example_single_model(file_format, tmpdir):
    """kipoi predict ...

    Calls `cli_score_variants` in-process for the second example model and
    checks the vcf and the extra output file it writes.
    """
    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    import json

    examples = "rbp", "non_bedinput_model"
    example_dirs = ["tests/models/{0}/".format(ex) for ex in examples]
    main_example_dir = example_dirs[1]

    tmpdir_here = tmpdir.mkdir("example")

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    tmpfile = str(tmpdir_here.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(tmpdir_here.join("out.{0}".format("vcf")))

    relative_paths = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv"
    }
    dataloader_kwargs = {
        key: main_example_dir + rel
        for key, rel in relative_paths.items()
    }
    dataloader_kwargs_str = json.dumps(dataloader_kwargs)

    args = [
        "python",
        os.path.abspath("./kipoi_veff/cli.py"),
        "score_variants",
        # "./",  # directory
        example_dirs[1],
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % dataloader_kwargs_str,
        "--input_vcf",
        main_example_dir + "/example_files/variants.vcf",
        # this one was now gone in the master?!
        "--output_vcf",
        vcf_tmpfile,
        "--extra_output",
        tmpfile,
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    # run the command (everything after the sub-command name)
    kipoi_veff.cli.cli_score_variants('score_variants', args[3:])

    for example_dir in example_dirs[1:2]:
        # assert filecmp.cmp(example_dir + "/example_files/variants_ref_out.vcf", vcf_tmpfile)
        model_name_safe = example_dir.replace("/", "_")  # currently unused
        vcf_tmpfile_model = vcf_tmpfile
        assert os.path.exists(vcf_tmpfile_model)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile_model)

        ending = tmpfile.split('.')[-1]  # currently unused
        extra_output = tmpfile
        assert os.path.exists(extra_output)

        # reading the file back checks that it is parseable
        if file_format != "hdf5":
            data = pd.read_table(extra_output)
        else:
            data = HDF5Reader.load(extra_output)
def modisco_run(
        imp_scores,
        output_dir,
        null_imp_scores=None,
        hparams=None,
        override_hparams="",
        grad_type="weighted",
        subset_tasks=None,
        filter_subset_tasks=False,
        filter_npy=None,
        exclude_chr="",
        seqmodel=False,  # interpretation glob
        # hparams=None,
        num_workers=10,
        max_strand_distance=0.1,
        overwrite=False,
        skip_dist_filter=False,
        use_all_seqlets=False,
        merge_tasks=False,
        gpu=None,
):
    """
    Run modisco

    Writes into `output_dir`: kwargs.json (the arguments of this call),
    hparams.yaml (the hyper-parameters used), strand_distances.h5 (unless
    `skip_dist_filter`) and modisco.h5 (the results).

    Args:
      imp_scores: path to the hdf5 file of importance scores
      output_dir: output file directory
      null_imp_scores: Path to the null importance scores
      hparams: None, modisco hyper - parameters: either a path to modisco.yaml
        or a ModiscoHParams object
      override_hparams: hyper - parameters overriding the settings in the hparams file
      grad_type: for which output to compute the importance scores
        (comma-separated list; must contain '/' when `seqmodel` is set)
      subset_tasks: comma-separated list of task names to use as a subset
      filter_subset_tasks: if True, run modisco only in the regions for that TF
      filter_npy: path to a npy file containing a boolean vector used for subsetting
      exclude_chr: comma-separated list of chromosomes to exclude
      seqmodel: If enabled, then the importance scores came from `imp-score-seqmodel`
      num_workers: passed as `n_cores` to the seqlets-to-patterns factory
      max_strand_distance: examples whose mean strand distance is not below
        this value are dropped by the distance filter
      overwrite: passed to `remove_exists` for the two output h5 files
      skip_dist_filter: if True, distances are not used to filter
      use_all_seqlets: if True, don't restrict the number of seqlets
      merge_tasks: if True, importance scores for the tasks will be merged
      gpu: which gpu to use. If None, don't use any GPU's

    Note: when using subset_tasks, modisco will run on all the importance scores.
      If you wish to run it only for the importance scores for a particular task
      you should subset it to the peak regions of interest using `filter_npy`
    """
    plt.switch_backend('agg')
    add_file_logging(output_dir, logger, 'modisco-run')
    import os
    if gpu is not None:
        create_tf_session(gpu)
    else:
        # Don't use any GPU's
        os.environ['CUDA_VISIBLE_DEVICES'] = ''
    os.environ['MKL_THREADING_LAYER'] = 'GNU'

    # import theano
    import modisco
    import modisco.tfmodisco_workflow.workflow

    if seqmodel:
        assert '/' in grad_type

    if subset_tasks == '':
        logger.warning("subset_tasks == ''. Not using subset_tasks")
        subset_tasks = None

    if subset_tasks == 'all':
        # Use all subset tasks e.g. don't subset
        subset_tasks = None

    if subset_tasks is not None:
        subset_tasks = subset_tasks.split(",")
        if len(subset_tasks) == 0:
            raise ValueError("Provide one or more subset_tasks. Found None")

    if filter_subset_tasks and subset_tasks is None:
        print("Using filter_subset_tasks=False since `subset_tasks` is None")
        filter_subset_tasks = False

    if exclude_chr:
        exclude_chr = exclude_chr.split(",")
    else:
        exclude_chr = []

    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / "modisco.h5"
    remove_exists(output_path, overwrite)

    output_distances = output_dir / "strand_distances.h5"
    remove_exists(output_distances, overwrite)

    if filter_npy is not None:
        filter_npy = os.path.abspath(filter_npy)

    # save the hyper-parameters
    write_json(
        dict(
            imp_scores=os.path.abspath(imp_scores),
            grad_type=grad_type,
            output_dir=str(output_dir),
            subset_tasks=subset_tasks,
            filter_subset_tasks=filter_subset_tasks,
            hparams=hparams,
            null_imp_scores=null_imp_scores,
            # TODO - pack into hyper-parameters as well?
            filter_npy=filter_npy,
            exclude_chr=",".join(exclude_chr),
            skip_dist_filter=skip_dist_filter,
            use_all_seqlets=use_all_seqlets,
            max_strand_distance=max_strand_distance,
            gpu=gpu),
        os.path.join(output_dir, "kwargs.json"))

    print("-" * 40)
    # parse the hyper-parameters
    if hparams is None:
        print("Using default hyper-parameters")
        hp = ModiscoHParams()
    else:
        if isinstance(hparams, str):
            print(f"Loading hyper-parameters from file: {hparams}")
            hp = ModiscoHParams.load(hparams)
        else:
            assert isinstance(hparams, ModiscoHParams)
            hp = hparams
    if override_hparams:
        print(f"Overriding the following hyper-parameters: {override_hparams}")
    # Always convert: `hp.values()` is used below when dumping the
    # hyper-parameters; parsing an empty override string changes nothing.
    hp = tf.contrib.training.HParams(
        **hp.get_modisco_kwargs()).parse(override_hparams)

    if use_all_seqlets:
        hp.max_seqlets_per_metacluster = None

    # save the hyper-parameters
    print("Using the following hyper-parameters for modisco:")
    print("-" * 40)
    related_dump_yaml(ModiscoHParams(**hp.values()),
                      os.path.join(output_dir, "hparams.yaml"),
                      verbose=True)
    print("-" * 40)

    # TODO - replace with imp_scores
    d = HDF5Reader.load(imp_scores)
    if 'hyp_imp' not in d:
        # backcompatibility
        d['hyp_imp'] = d['grads']

    if seqmodel:
        tasks = list(d['targets'])
    else:
        tasks = list(d['targets']['profile'])

    if subset_tasks is not None:
        # validate that all the `subset_tasks`
        # are present in `tasks`
        for st in subset_tasks:
            if st not in tasks:
                raise ValueError(
                    f"subset task {st} not found in tasks: {tasks}")
        logger.info(
            f"Using the following tasks: {subset_tasks} instead of the original tasks: {tasks}"
        )
        tasks = subset_tasks

    if isinstance(d['inputs'], dict):
        one_hot = d['inputs']['seq']
    else:
        one_hot = d['inputs']

    n = len(one_hot)

    # --------------------
    # apply filters
    if not skip_dist_filter:
        print("Using profile prediction for the strand filtering")
        grad_type_filtered = 'weighted'
        # One row per task with two strand tracks; each entry is the
        # `correlation` value between the flattened tracks of an example.
        distances = np.array([
            np.array([
                correlation(
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][0][i]),
                    np.ravel(d['hyp_imp'][task][grad_type_filtered][1][i]))
                for i in range(n)
            ]) for task in tasks
            if len(d['hyp_imp'][task][grad_type_filtered]) == 2
        ]).T.mean(axis=-1)  # average the distances across tasks

        dist_filter = distances < max_strand_distance
        print(f"Fraction of sequences kept: {dist_filter.mean()}")

        HDF5BatchWriter.dump(output_distances, {
            "distances": distances,
            "included_samples": dist_filter
        })
    else:
        dist_filter = np.ones((n, ), dtype=bool)

    # add also the filter numpy
    if filter_npy is not None:
        print(f"Loading a filter file from {filter_npy}")
        filter_vec = np.load(filter_npy)
        dist_filter = dist_filter & filter_vec

    if filter_subset_tasks:
        assert subset_tasks is not None
        interval_from_task = pd.Series(d['metadata']['interval_from_task'])
        print(
            f"Subsetting the intervals accoring to subset_tasks: {subset_tasks}"
        )
        print(f"Number of original regions: {dist_filter.sum()}")
        dist_filter = dist_filter & interval_from_task.isin(
            subset_tasks).values
        print(
            f"Number of filtered regions after filter_subset_tasks: {dist_filter.sum()}"
        )

    # filter by chromosome
    if exclude_chr:
        logger.info(f"Excluding chromosomes: {exclude_chr}")
        chromosomes = d['metadata']['range']['chr']
        dist_filter = dist_filter & (
            ~pd.Series(chromosomes).isin(exclude_chr)).values
    # -------------------------------------------------------------
    # setup importance scores
    grad_types = grad_type.split(",")

    if seqmodel:
        thr_one_hot = one_hot[dist_filter]
        # here every grad type has the form "<group>/<key>"
        thr_hypothetical_contribs = {
            f"{task}/{gt}":
            d['hyp_imp'][task][gt.split("/")[0]][gt.split("/")[1]][dist_filter]
            for task in tasks for gt in grad_types
        }
        thr_contrib_scores = {
            f"{task}/{gt}":
            thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
            for task in tasks for gt in grad_types
        }
        task_names = [f"{task}/{gt}" for task in tasks for gt in grad_types]
    else:
        if merge_tasks:
            # stack all (task, grad type) tracks into one "merged" task;
            # the one-hot sequences are repeated accordingly
            thr_one_hot = np.concatenate([
                one_hot[dist_filter] for task in tasks for gt in grad_types
            ])
            thr_hypothetical_contribs = {
                "merged":
                np.concatenate([
                    mean(d['hyp_imp'][task][gt])[dist_filter]
                    for task in tasks for gt in grad_types
                ])
            }
            thr_contrib_scores = {
                "merged": thr_hypothetical_contribs['merged'] * thr_one_hot
            }
            task_names = ['merged']
        else:
            thr_one_hot = one_hot[dist_filter]
            thr_hypothetical_contribs = {
                f"{task}/{gt}": mean(d['hyp_imp'][task][gt])[dist_filter]
                for task in tasks for gt in grad_types
            }
            thr_contrib_scores = {
                f"{task}/{gt}":
                thr_hypothetical_contribs[f"{task}/{gt}"] * thr_one_hot
                for task in tasks for gt in grad_types
            }
            task_names = [
                f"{task}/{gt}" for task in tasks for gt in grad_types
            ]

    if null_imp_scores is not None:
        logger.info(f"Using null_imp_scores: {null_imp_scores}")
        null_isf = ImpScoreFile(null_imp_scores)
        try:
            null_per_pos_scores = {
                f"{task}/{gt}": v.sum(axis=-1)
                for gt in grad_types
                for task, v in null_isf.get_contrib(imp_score=gt).items()
                if task in tasks
            }
        finally:
            # the summed scores are new arrays; the file is no longer needed
            null_isf.close()
    else:
        # default Null distribution. Requires modisco 5.0
        logger.info("Using default null_imp_scores")
        null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(
            num_to_samp=10000)

    # -------------------------------------------------------------
    # run modisco
    tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
        # Modisco defaults
        sliding_window_size=hp.sliding_window_size,
        flank_size=hp.flank_size,
        target_seqlet_fdr=hp.target_seqlet_fdr,
        min_passing_windows_frac=hp.min_passing_windows_frac,
        max_passing_windows_frac=hp.max_passing_windows_frac,
        min_metacluster_size=hp.min_metacluster_size,
        max_seqlets_per_metacluster=hp.max_seqlets_per_metacluster,
        seqlets_to_patterns_factory=modisco.tfmodisco_workflow.
        seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=hp.trim_to_window_size,  # default: 30
            initial_flank_to_add=hp.initial_flank_to_add,  # default: 10
            kmer_len=hp.kmer_len,  # default: 8
            num_gaps=hp.num_gaps,  # default: 3
            num_mismatches=hp.num_mismatches,  # default: 2
            n_cores=num_workers,
            final_min_cluster_size=hp.final_min_cluster_size)  # default: 30
    )(
        task_names=task_names,
        contrib_scores=thr_contrib_scores,  # -> task score
        hypothetical_contribs=thr_hypothetical_contribs,
        one_hot=thr_one_hot,
        null_per_pos_scores=null_per_pos_scores)
    # -------------------------------------------------------------
    # save the results
    # An explicit mode is required: h5py >= 3 opens files read-only by
    # default. "a" is the former default; the context manager makes sure the
    # file is flushed and closed.
    with h5py.File(output_path, "a") as grp:
        tfmodisco_results.save_hdf5(grp)
def test_grad_predict_example(example):
    """Exercise ``kipoi postproc grad`` on a single example model.

    For each output format (tsv, hdf5) the command is run twice from the
    example's ``example_files`` directory: once with an explicit ``--layer``
    and once with ``--final_layer``. After every run the output file is
    checked and then removed. For hdf5 output, the gradients are additionally
    exported through ``kipoi postproc gr_inp_to_file``.
    """
    py2_unsupported = {"rbp", "non_bedinput_model", "iris_model_template"}
    if sys.version_info[0] == 2 and example in py2_unsupported:
        pytest.skip("rbp example not supported on python 2 ")

    example_dir = "examples/{0}".format(example)
    kipoi_main = os.path.abspath("./kipoi/__main__.py")
    run_dir = os.path.realpath(example_dir + "/example_files")

    # Non-gradient columns that the tsv output has to contain.
    metadata_cols = [
        'metadata/ranges/chr',
        'metadata/ranges/end',
        'metadata/ranges/id',
        'metadata/ranges/start',
        'metadata/ranges/strand',
    ]

    for file_format in ("tsv", "hdf5"):
        print(example)
        tmpfile = os.path.realpath(
            str("./grad_outputs.{0}".format(file_format)))
        bedgraph_temp_file = os.path.realpath(str("./grad_x_input.bed"))

        base_cmd = [
            "python", kipoi_main, "postproc", "grad",
            "../",  # directory
            "--source=dir",
            "--batch_size=4",
            "--dataloader_args=test.json",
            "--output", tmpfile,
        ]
        if INSTALL_FLAG:
            base_cmd.append(INSTALL_FLAG)

        # Two ways of selecting the layer the gradient is taken from.
        layer_selectors = (
            ["--layer", predict_activation_layers[example]],
            ["--final_layer"],
        )

        for selector in layer_selectors:
            status = subprocess.call(args=base_cmd + selector, cwd=run_dir)
            assert status == 0
            assert os.path.exists(tmpfile)

            if file_format != "hdf5":
                table = pd.read_csv(tmpfile, sep="\t")
                columns = table.columns
                is_input = columns.str.contains("inputs/")
                is_pred = columns.str.contains("preds/")
                # Every "preds/" column must have an "inputs/" counterpart.
                renamed_inputs = columns.str.replace(
                    "inputs/", "preds/").values[is_input]
                assert np.all(np.in1d(columns.values[is_pred],
                                      renamed_inputs))
                remaining = columns.values[~is_pred & ~is_input]
                assert np.all(np.in1d(metadata_cols, remaining))
            else:
                data = HDF5Reader.load(tmpfile)
                assert {'metadata', 'preds', 'inputs'} <= set(data.keys())

                # Here we can attempt to write a bedgraph file:
                bg_cmd = [
                    "python", kipoi_main, "postproc", "gr_inp_to_file",
                    "../",  # directory
                    "--source=dir",
                    '--output', bedgraph_temp_file,
                    "--input_file", tmpfile,
                ]
                model_input = grad_inputs[example]
                if model_input is not None:
                    bg_cmd += ["--model_input", model_input]
                status = subprocess.call(args=bg_cmd, cwd=run_dir)
                assert status == 0
                assert os.path.exists(bedgraph_temp_file)
                os.unlink(bedgraph_temp_file)

            os.unlink(tmpfile)
def test_predict_variants_example_multimodel(file_format, tmpdir):
    """Run ``kipoi postproc score_variants`` on two example models at once.

    Checks that the command succeeds, that the extra output file exists, that
    one VCF per model was written and matches the reference VCF of that
    example, and that the extra output can be read back (hdf5 via
    ``HDF5Reader``, otherwise as a series of "KPVEP_"-headed tsv tables).
    """
    if sys.version_info[0] == 2:
        pytest.skip(
            "Only rbp example testable at the moment, which only runs on py3")

    example_dirs = [
        "examples/{0}/".format(name)
        for name in ("rbp", "non_bedinput_model")
    ]
    main_example_dir = example_dirs[1]

    # non_bedinput_model is not compatible with restricted bed files as
    # alterations in region generation have no influence on that model
    out_dir = tmpdir.mkdir("example")
    tmpfile = str(out_dir.join("out.{0}".format(file_format)))
    vcf_tmpfile = str(out_dir.join("out.{0}".format("vcf")))

    # Dataloader inputs, given relative to the main example directory.
    relative_inputs = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_intervals.tsv",
    }
    dataloader_kwargs = {}
    for key, rel_path in relative_inputs.items():
        dataloader_kwargs[key] = main_example_dir + rel_path

    import json
    dataloader_kwargs_str = json.dumps(dataloader_kwargs)

    args = [
        "python", os.path.abspath("./kipoi/__main__.py"),
        "postproc", "score_variants",
        # "./",  # directory
        example_dirs[0], example_dirs[1],
        "--source=dir",
        "--batch_size=4",
        "--dataloader_args='%s'" % dataloader_kwargs_str,
        "--input_vcf",
        main_example_dir + "/example_files/variants.vcf",
        # this one was now gone in the master?!
        "--output_vcf", vcf_tmpfile,
        "--extra_output", tmpfile,
    ]
    if INSTALL_FLAG:
        args.append(INSTALL_FLAG)

    # run the command from two levels above the main example directory
    work_dir = os.path.realpath(main_example_dir) + "/../../"
    returncode = subprocess.call(args=args, cwd=work_dir)
    assert returncode == 0
    assert os.path.exists(tmpfile)

    vcf_stem = vcf_tmpfile[:-4]  # path without the ".vcf" suffix
    for example_dir in example_dirs:
        # assert filecmp.cmp(example_dir + "/example_files/variants_ref_out.vcf", vcf_tmpfile)
        vcf_tmpfile_model = vcf_stem + example_dir.replace("/", "_") + ".vcf"
        assert os.path.exists(vcf_tmpfile_model)
        compare_vcfs(example_dir + "/example_files/variants_ref_out.vcf",
                     vcf_tmpfile_model)

    if file_format == "hdf5":
        data = HDF5Reader.load(tmpfile)
        return

    # Text output: several tables stacked in one file, each introduced by a
    # header line containing "KPVEP_" followed by the table label.
    head_line_id = "KPVEP_"
    table_labels, table_starts, table_ends = [], [], []
    with open(tmpfile, "r") as ifh:
        for line_no, line in enumerate(ifh):
            if head_line_id not in line:
                continue
            if table_starts:
                # a new header closes the previous table
                table_ends.append(line_no - 1)
            table_labels.append(line.rstrip()[len(head_line_id):])
            table_starts.append(line_no + 1)
    # the last table runs until the final line of the file
    table_ends.append(line_no)

    tables = {
        label: pd.read_csv(tmpfile,
                           sep="\t",
                           skiprows=start,
                           nrows=end - start,
                           index_col=0)
        for label, start, end in zip(table_labels, table_starts, table_ends)
    }
def modisco_instances_to_bed(modisco_h5,
                             instances_parq,
                             imp_score_h5,
                             output_dir,
                             trim_frac=0.08):
    """Export scored regions and per-pattern motif instances as BED-like files.

    Files written into `output_dir` (tab-separated, no header):
      - scored_regions.bed: de-duplicated example regions sorted by
        chromosome and start
      - <pattern>.bed.gz: one gzip-compressed file per pattern found in the
        instances table (sub-directories are created as needed), sorted by
        chromosome and pattern start
      - README: the column names of the files above

    Args:
      modisco_h5: path handed to `ModiscoResult`
      instances_parq: instances file handed to `load_instances`
      imp_score_h5: importance-score HDF5 file; only the dataset
        `/metadata/interval_from_task` is read from it
      output_dir: output directory, created if it does not exist
      trim_frac: forwarded to `append_pattern_loc`
    """
    from basepair.modisco.pattern_instances import load_instances
    add_file_logging(output_dir, logger, 'modisco-instances-to-bed')
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE(review): `mr` is opened but never closed here; close it once
    # ModiscoResult's clean-up API has been confirmed.
    mr = ModiscoResult(modisco_h5)
    mr.open()

    print("load task_id")
    # Only the interval metadata is needed, so the file is read once and
    # closed right away. No aliasing of 'grads' to 'hyp_imp' is done because
    # importance scores are not used anywhere in this function.
    with HDF5Reader(imp_score_h5) as d:
        peak_ids = d.f['/metadata/interval_from_task'][:]
    id_hash = pd.DataFrame({
        "peak_id": peak_ids,
        "example_idx": np.arange(peak_ids.shape[0])
    })

    # load the instances data frame
    print("load all instances")
    df = load_instances(instances_parq, motifs=None, dedup=True)
    df = df.merge(id_hash, on="example_idx")  # append peak_id

    patterns = df.pattern.unique().tolist()
    pattern_pssms = {
        pattern: mr.get_pssm(*pattern.split("/"))
        for pattern in patterns
    }
    append_pattern_loc(df, pattern_pssms, trim_frac=trim_frac)

    # write out the results
    example_cols = [
        'example_chr', 'example_start', 'example_end', 'example_id', 'peak_id'
    ]
    df_examples = df[example_cols].drop_duplicates().sort_values(
        ["example_chr", "example_start"])
    df_examples.to_csv(output_dir / "scored_regions.bed",
                       sep='\t',
                       header=False,
                       index=False)

    # Shift pattern coordinates by the example start.
    df["pattern_start_rel"] = df.pattern_start + df.example_start
    df["pattern_end_rel"] = df.pattern_end + df.example_start
    df["strand"] = df.revcomp.astype(bool).map({True: "-", False: "+"})

    # TODO - update this - ?
    pattern_cols = [
        'example_chr', 'pattern_start_rel', 'pattern_end_rel', 'example_id',
        'percnormed_score', 'strand', 'peak_id', 'seqlet_score'
    ]

    # The README has to name the file that is actually written above.
    (output_dir / "README").write_text(
        "scored_regions.bed columns: " + ", ".join(example_cols) + "\n" +
        "metacluster_<>/pattern_<>.bed columns: " + ", ".join(pattern_cols))

    df_pattern = df[pattern_cols]
    for pattern in patterns:
        out_path = output_dir / (pattern + ".bed.gz")
        # pattern names contain "/", so the parent directory may be missing
        out_path.parent.mkdir(parents=True, exist_ok=True)
        dfp = df_pattern[df.pattern == pattern].drop_duplicates().sort_values(
            ["example_chr", "pattern_start_rel"])
        dfp.to_csv(out_path,
                   compression='gzip',
                   sep='\t',
                   header=False,
                   index=False)
def modisco_score_single_binary(modisco_dir,
                                output_tsv,
                                output_seqlets_pkl=None,
                                seqlet_len=25,
                                n_cores=1,
                                method="rank",
                                trim_pattern=False):
    """Find and tabulate seqlet instances (equivalent of modisco_score).

    Reads `kwargs.json` and `results.hdf5` from `modisco_dir`, loads the
    importance scores referenced by `kwargs['imp_scores']`, restricts them to
    the samples selected by `kwargs['filter_npy']` (all samples when no
    filter is configured) and scores seqlet instances with `find_instances`.

    Args:
      modisco_dir: directory containing `kwargs.json` and `results.hdf5`
      output_tsv: path of the tab-separated instance table to write
      output_seqlets_pkl: if given, the seqlets are also pickled to this path
      seqlet_len: forwarded to `find_instances`
      n_cores: forwarded to `find_instances`
      method: forwarded to `find_instances`
      trim_pattern: forwarded to `find_instances`

    Returns:
      Tuple `(seqlets, df)`: the output of `find_instances` and the data
      frame that was written to `output_tsv`.
    """
    from modisco.tfmodisco_workflow import workflow

    kwargs = read_json(os.path.join(modisco_dir, "kwargs.json"))
    d = HDF5Reader.load(kwargs['imp_scores'])  # deeplift hdffile

    inputs = d['inputs']
    one_hot = inputs['seq'] if isinstance(inputs, dict) else inputs

    tasks = list(d['grads'].keys())
    grad_type = list(d['grads'][tasks[0]].keys())[0]

    filter_npy = kwargs.get("filter_npy", None)
    if filter_npy is not None:
        included_samples = np.load(filter_npy)
    else:
        # No filter configured: keep every sample. Without this default the
        # name was unbound and the function failed with a NameError.
        included_samples = np.ones(len(one_hot), dtype=bool)
    thr_one_hot = one_hot[included_samples]

    # NOTE(review): the hypothetical scores are always read from the
    # 'deeplift' group while the contribution scores use `gt` -- confirm
    # whether this should be `gt` as well.
    hypothetical_contribs = {
        f"{task}":
        d['grads'][task]['deeplift']['hyp_contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }
    contrib_scores = {
        f"{task}": d['grads'][task][gt]['contrib_scores'][included_samples]
        for task in tasks for gt in grad_type.split(",")
    }

    print(tasks)
    track_set = workflow.prep_track_set(
        task_names=tasks,
        contrib_scores=contrib_scores,
        hypothetical_contribs=hypothetical_contribs,
        one_hot=thr_one_hot)

    with h5py.File(os.path.join(modisco_dir, "results.hdf5"), "r") as grp:
        mr = workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set)

    seqlets = find_instances(mr,
                             tasks,
                             contrib_scores,
                             hypothetical_contribs,
                             thr_one_hot,
                             seqlet_len=seqlet_len,
                             n_cores=n_cores,
                             method=method,
                             trim_pattern=trim_pattern)
    if output_seqlets_pkl:
        write_pkl(seqlets, output_seqlets_pkl)

    df = labelled_seqlets2df(seqlets)

    # Attach the genomic range metadata of each example.
    dfm = pd.DataFrame(d['metadata']['range'])
    dfm.columns = ["example_" + v for v in dfm.columns]
    dfm['example_id'] = d['metadata']['interval_from_task']
    # NOTE(review): this joins `example_idx` against `example_id`, and the
    # metadata is not restricted to `included_samples` -- verify the join key.
    df = df.merge(dfm,
                  left_on="example_idx",
                  how='left',
                  right_on="example_id")

    df.to_csv(output_tsv, sep='\t')
    return seqlets, df