def test_sequential_model_loading():
    m2 = kipoi.get_model("example/models/extended_coda", source='dir')
    m1 = kipoi.get_model("example/models/kipoi_dataloader_decorator", source='dir')
    with cd(m2.source_dir):
        next(m2.default_dataloader.init_example().batch_iter())
    with cd(m1.source_dir):
        next(m1.default_dataloader.init_example().batch_iter())
def test_predict_pipeline():
    model = kipoi.get_model("Basset", source="kipoi")
    dl_kwargs = model.default_dataloader.example_kwargs
    with cd(model.source_dir):
        ret = model.pipeline.predict(dl_kwargs)
    assert isinstance(ret, np.ndarray)
    with cd(model.source_dir):
        ret = model.pipeline.predict(dl_kwargs, layer="11")
    assert isinstance(ret, list)
    # with a model that does not implement LayerActivationMixin it should fail:
    hal_model = kipoi.get_model("HAL", source="kipoi")
    hal_dl_kwargs = hal_model.default_dataloader.example_kwargs
    with pytest.raises(Exception):
        # use hal_model here, not model: HAL does not implement LayerActivationMixin
        ret = hal_model.pipeline.predict(hal_dl_kwargs, layer="11")
def get_example_data(example, layer, writer=None):
    example_dir = "examples/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)

    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    outputs = []
    with cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)

        # Loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):
            # make the prediction
            pred_batch = model.input_grad(batch['inputs'], avg_func="sum",
                                          layer=layer, final_layer=False)
            # write out the predictions, metadata (, inputs, targets)
            # always keep the inputs so that input*grad can be generated!
            output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
            if writer is not None:
                writer.batch_write(output_batch)
            outputs.append(output_batch)

    if writer is not None:
        writer.close()
    return numpy_collate(outputs)
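# Usage sketch (hedged): collect gradient predictions for one example model.
# "rbp" matches the examples used elsewhere in this section; the layer name is
# a hypothetical placeholder and is model-specific.
def example_get_example_data():
    return get_example_data("rbp", layer="some_layer", writer=None)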
def test_loading():
    model_path = "example/models/pyt/model_files/"
    model_path_class_model = "example/models/pyt_class/"
    # load model and weights explicitly
    with pytest.raises(Exception):
        m1 = PyTorchModel(weights=model_path + "only_weights.pth")
    m1 = PyTorchModel(module_file=model_path + "pyt.py", weights=model_path + "only_weights.pth")
    with cd(model_path):
        m1 = PyTorchModel(module_obj="pyt.simple_model", weights="only_weights.pth")
    m1 = PyTorchModel(module_file=model_path + "pyt.py", weights=model_path + "only_weights.pth",
                      module_obj="simple_model")
    m1 = PyTorchModel(module_file=THISFILE, weights=PYT_NET_MODEL_WEIGHTS_FILE, module_class="PyTNet")
    m1 = PyTorchModel(module_file=THISFILE, weights=PYT_NET_MODEL_WEIGHTS_FILE, module_class="PyTNet",
                      module_kwargs={})
    m1 = PyTorchModel(module_file=THISFILE, weights=PYT_NET_MODEL_WEIGHTS_FILE, module_class="PyTNet",
                      module_kwargs="{}")
    # test loading a class from the full yaml definition with module_kwargs:
    mh = kipoi.get_model(model_path_class_model, "dir")
    # Load the test files from model source
    mh.pipeline.predict_example(batch_size=3)
def test_predict_to_file(tmpdir):
    h5_tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    model = kipoi.get_model("Basset", source="kipoi")
    dl_kwargs = model.default_dataloader.example_kwargs
    with cd(model.source_dir):
        model.pipeline.predict_to_file(h5_tmpfile, dl_kwargs)
    preds = kipoi.readers.HDF5Reader.load(h5_tmpfile)
    assert 'preds' in preds
def test_gradient_pipeline():
    model = kipoi.get_model("Basset", source="kipoi")
    dl_kwargs = model.default_dataloader.example_kwargs
    with cd(model.source_dir):
        ret = model.pipeline.input_grad(dl_kwargs, final_layer=True, avg_func="sum")
    assert all(k in ret for k in ['targets', 'metadata', 'inputs', 'grads'])
def __init__(self, data, model, source="kipoi", grad_preds=None): """ Arguments: data: model input data batch model: model name as used for running `model.input_grad(...)` source: model source as used for running `model.input_grad(...)` grad_preds: return value of `model.input_grad(...)`. Can alternatively already be present in `data` argument under the key `preds`. In that case `grad_preds` may be None. """ self.data = data if grad_preds is not None: self.data['grads'] = grad_preds else: assert 'grads' in self.data # TODO: Instead of copying from kipoi.model should we rather have a get_model_descr # TODO-cont: funcion that is also called from get_model # Taken from get_model source_name = source source = kipoi.config.get_source(source) md = source.get_model_descr(model) if ":" in md.default_dataloader: dl_source, dl_path = md.default_dataloader.split(":") else: dl_source = source_name dl_path = md.default_dataloader # allow to use relative and absolute paths for referring to the dataloader default_dataloader_path = os.path.join("/" + model, dl_path)[1:] # This one loads the model!! # default_dataloader = kipoi.get_dataloader_factory(default_dataloader_path, # dl_source) # TODO: Is there a nicer way of getting ahold of the dataloader description? yaml_path = source.pull_dataloader(default_dataloader_path) dataloader_dir = os.path.dirname(yaml_path) from kipoi.components import DataLoaderDescription with cd(dataloader_dir): dl = DataLoaderDescription.load(os.path.basename(yaml_path)) default_dataloader = dl try: self.mie = ModelInfoExtractor(md, default_dataloader) except: logger.warn( "Model is not enabled for variant effect prediction hence it is unclear whether there is a DNA " "sequence input, so (automatic) seqlogo plots are not available for this model." ) self.mie = None self.md = md self.dataloader = default_dataloader # how can the correct model input be selected self.get_dataset, self.model_input_keylist = self._get_ds_extractor( md.schema.inputs)
def get_dataloader_descr(model_name, source):
    from kipoi.utils import cd
    src = kipoi.get_source(source)
    md = kipoi.get_model_descr(model_name, source=source)
    if isinstance(md.default_dataloader, str):
        dl_path = os.path.join(model_name, md.default_dataloader)
        return kipoi.get_dataloader_descr(dl_path, source=source)
    else:
        with cd(src.get_model_dir(model_name)):
            return md.default_dataloader.get()
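# Minimal usage sketch for get_dataloader_descr above (assumes the "Basset"
# model is available from the public "kipoi" source, as in the tests in this
# section):
def example_get_dataloader_descr():
    dl_descr = get_dataloader_descr("Basset", source="kipoi")
    # the description lists the dataloader arguments declared in dataloader.yaml
    print(dl_descr.args)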
def test_mutation_map():
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2")
    # Take the rbp model
    model_dir = "examples/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    #
    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    dataloader_arguments = {k: model_dir + v for k, v in dataloader_arguments.items()}
    #
    # Run the actual predictions
    vcf_path = model_dir + "example_files/first_variant.vcf"
    #
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(model, Dataloader)
    vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(model_info)
    mdmm = mm._generate_mutation_map(model,
                                     Dataloader,
                                     vcf_path,
                                     dataloader_args=dataloader_arguments,
                                     evaluation_function=analyse_model_preds,
                                     batch_size=32,
                                     vcf_to_region=vcf_to_region,
                                     evaluation_function_kwargs={'diff_types': {'diff': Diff("mean")}})
    with cd(model.source_dir):
        mdmm.save_to_file("example_files/first_variant_mm_totest.hdf5")

        from kipoi.postprocessing.variant_effects.utils.generic import read_hdf5
        reference = read_hdf5("example_files/first_variant_mm.hdf5")
        # compare the freshly generated file against the reference
        obs = read_hdf5("example_files/first_variant_mm_totest.hdf5")
        compare_rec(reference[0], obs[0])

        import matplotlib.pyplot
        matplotlib.pyplot.switch_backend('agg')
        mdmm.plot_mutmap(0, "seq", "diff", "rbp_prb")
        os.unlink("example_files/first_variant_mm_totest.hdf5")
def test_var_eff_pred_varseq():
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2")
    model_dir = "examples/var_seqlen_model/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)
    #
    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    #
    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
        "intervals_file": "example_files/variant_centered_intervals.tsv"
    }
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out.vcf"
    #
    with cd(model.source_dir):
        vcf_path = kipoi.postprocessing.variant_effects.ensure_tabixed_vcf(vcf_path)
        model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(model, Dataloader)
        writer = kipoi.postprocessing.variant_effects.VcfWriter(model, vcf_path, out_vcf_fpath)
        vcf_to_region = None
        with pytest.raises(Exception):
            # This has to raise an exception as the sequence length is None.
            vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(model_info)
        res = sp.predict_snvs(model,
                              Dataloader,
                              vcf_path,
                              dataloader_args=dataloader_arguments,
                              evaluation_function=analyse_model_preds,
                              batch_size=32,
                              vcf_to_region=vcf_to_region,
                              evaluation_function_kwargs={'diff_types': {'diff': Diff("mean")}},
                              sync_pred_writer=writer)
        writer.close()
        # pass
        # assert filecmp.cmp(out_vcf_fpath, ref_out_vcf_fpath)
        compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        os.unlink(out_vcf_fpath)
def get_dataloader_factory(dataloader):
    # pull the dataloader & get the dataloader directory
    yaml_path = './model/dataloader.yaml'
    dataloader_dir = './model/'
    # --------------------------------------------
    # Setup dataloader description
    with cd(dataloader_dir):  # move to the dataloader directory temporarily
        dl = DataLoaderDescription.load(os.path.basename(yaml_path))
        file_path, obj_name = tuple(dl.defined_as.split("::"))
        CustomDataLoader = getattr(load_module(file_path), obj_name)

    # check that dl.type is correct
    if dl.type not in AVAILABLE_DATALOADERS:
        raise ValueError("dataloader type: {0} is not in supported dataloaders:{1}".
                         format(dl.type, list(AVAILABLE_DATALOADERS.keys())))

    # check that the extractor arguments match yaml arguments
    if not getargs(CustomDataLoader) == set(dl.args.keys()):
        raise ValueError("DataLoader arguments: \n{0}\n don't match ".format(set(getargs(CustomDataLoader))) +
                         "the specification in the dataloader.yaml file:\n{0}".format(set(dl.args.keys())))

    # check that CustomDataLoader indeed inherits from the right DataLoader
    if dl.type in DATALOADERS_AS_FUNCTIONS:
        # transform the functions into objects
        assert isinstance(CustomDataLoader, types.FunctionType)
        CustomDataLoader = AVAILABLE_DATALOADERS[dl.type].from_fn(CustomDataLoader)
    else:
        if not issubclass(CustomDataLoader, AVAILABLE_DATALOADERS[dl.type]):
            raise ValueError("DataLoader doesn't inherit from the specified dataloader: {0}".
                             format(AVAILABLE_DATALOADERS[dl.type].__name__))

    # Inherit the attributes from dl
    CustomDataLoader.type = dl.type
    CustomDataLoader.defined_as = dl.defined_as
    CustomDataLoader.args = dl.args
    CustomDataLoader.info = dl.info
    CustomDataLoader.output_schema = dl.output_schema
    CustomDataLoader.dependencies = dl.dependencies
    CustomDataLoader.postprocessing = dl.postprocessing
    CustomDataLoader._yaml_path = yaml_path
    CustomDataLoader.source_dir = dataloader_dir
    # CustomDataLoader.print_args = classmethod(print_dl_kwargs)

    return CustomDataLoader
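# Usage sketch (hedged): get_dataloader_factory above returns a configured
# dataloader *class* read from ./model/dataloader.yaml; instantiate it with the
# arguments declared in that yaml. The argument dict below is illustrative.
def example_dataloader_factory_usage(dataloader_arguments):
    Dl = get_dataloader_factory("model")
    dl = Dl(**dataloader_arguments)
    # draw a single batch, mirroring the batch_iter usage in the tests above
    return next(dl.batch_iter(batch_size=4, num_workers=0))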
def test_var_eff_pred2():
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2")
    # Take the rbp model
    model_dir = "examples/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)
    #
    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")
    #
    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    #
    # Run the actual predictions
    vcf_path = "example_files/variants.vcf"
    out_vcf_fpath = "example_files/variants_generated2.vcf"
    ref_out_vcf_fpath = "example_files/variants_ref_out2.vcf"
    restricted_regions_fpath = "example_files/restricted_regions.bed"
    #
    with cd(model.source_dir):
        pbd = pb.BedTool(restricted_regions_fpath)
        model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(model, Dataloader)
        vcf_to_region = kipoi.postprocessing.variant_effects.SnvPosRestrictedRg(model_info, pbd)
        writer = kipoi.postprocessing.variant_effects.utils.io.VcfWriter(model, vcf_path, out_vcf_fpath)
        res = sp.predict_snvs(model,
                              Dataloader,
                              vcf_path,
                              dataloader_args=dataloader_arguments,
                              evaluation_function=analyse_model_preds,
                              batch_size=32,
                              vcf_to_region=vcf_to_region,
                              evaluation_function_kwargs={'diff_types': {'diff': Diff("mean")}},
                              sync_pred_writer=writer)
        writer.close()
        # pass
        # assert filecmp.cmp(out_vcf_fpath, ref_out_vcf_fpath)
        compare_vcfs(out_vcf_fpath, ref_out_vcf_fpath)
        os.unlink(out_vcf_fpath)
def test_gradient_function_model(example):
    """Test extractor"""
    if example == "rbp" and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2")
    import keras
    backend = keras.backend._BACKEND
    if backend == 'theano' and example == "rbp":
        pytest.skip("rbp example not supported with theano")
    #
    example_dir = "examples/{0}".format(example)
    # install the dependencies
    # - TODO maybe put it implicitly in load_dataloader?
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)
    #
    Dl = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    test_kwargs = get_test_kwargs(example_dir)
    #
    # install the dependencies
    # - TODO maybe put it implicitly in load_extractor?
    if INSTALL_REQ:
        install_model_requirements(example_dir, source="dir")
    #
    # get model
    model = kipoi.get_model(example_dir, source="dir")
    #
    with cd(example_dir + "/example_files"):
        # initialize the dataloader
        dataloader = Dl(**test_kwargs)
        #
        # sample a batch of data
        it = dataloader.batch_iter()
        batch = next(it)
        # predict with a model
        model.predict_on_batch(batch["inputs"])
        if backend != 'theano':
            model.input_grad(batch["inputs"], Slice_conv()[:, 0], pre_nonlinearity=True)
        model.input_grad(batch["inputs"], Slice_conv()[:, 0], pre_nonlinearity=False)
        model.input_grad(batch["inputs"], 0, pre_nonlinearity=False)  # same as Slice_conv()[:, 0]
        model.input_grad(batch["inputs"], avg_func="sum")
def get_dataloader_descr(model_name, source='kipoi'):
    """Not yet nicely integrated with Kipoi

    Args:
        model_name: model name as a string

    Returns:
        (model output schema, list of required files)
    """
    dl_skip_arguments = {
        "kipoiseq.dataloaders.SeqIntervalDl": ['alphabet_axis', 'dummy_axis', 'alphabet', 'dtype']
    }
    md = kipoi.get_model_descr(model_name)
    src = kipoi.get_source(source)

    # get dataloader
    if isinstance(md.default_dataloader, str):
        dataloader = kipoi.get_dataloader_descr(os.path.join(model_name, md.default_dataloader),
                                                source=source)
        dataloader_name = md.default_dataloader
        dataloader_args = dataloader.args
    else:
        with cd(src.get_model_dir(model_name)):
            dataloader = md.default_dataloader.get()
        dataloader_name = md.default_dataloader.defined_as
        dataloader_args = OrderedDict([(k, v) for k, v in dataloader.args.items()
                                       if k not in list(md.default_dataloader.default_args) +
                                       dl_skip_arguments.get(dataloader_name, [])])

        if md.default_dataloader.defined_as == 'kipoiseq.dataloaders.SeqIntervalDl':
            # HACK - cleanup some values for SeqIntervalDl
            if md.default_dataloader.default_args.get("ignore_targets", False):
                dataloader_args.pop('label_dtype', None)

    required_files = []
    if 'fasta_file' in dataloader.args:
        required_files.append("fasta_file")
    if 'gtf_file' in dataloader.args:
        required_files.append("gtf_file")

    return get_output_schema(md.schema.targets), required_files
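# Usage sketch (hedged): unpack the (output schema, required files) pair that
# the helper above returns; "Basset" is used as an example model here as in
# this section's tests.
def example_required_files():
    output_schema, required_files = get_dataloader_descr("Basset")
    # required_files can only contain the two file kinds checked for above
    assert set(required_files) <= {"fasta_file", "gtf_file"}
    return output_schema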
def test_activation_function_model(example):
    """Test extractor"""
    if example == "rbp" and sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2")
    #
    import keras
    backend = keras.backend._BACKEND
    if backend == 'theano' and example == "rbp":
        pytest.skip("rbp example not supported with theano")
    #
    example_dir = "examples/{0}".format(example)
    # install the dependencies
    # - TODO maybe put it implicitly in load_dataloader?
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)
    #
    Dl = kipoi.get_dataloader_factory(example_dir, source="dir")
    #
    test_kwargs = get_test_kwargs(example_dir)
    #
    # install the dependencies
    # - TODO maybe put it implicitly in load_extractor?
    if INSTALL_REQ:
        install_model_requirements(example_dir, source="dir")
    #
    # get model
    model = kipoi.get_model(example_dir, source="dir")
    #
    with cd(example_dir + "/example_files"):
        # initialize the dataloader
        dataloader = Dl(**test_kwargs)
        #
        # sample a batch of data
        it = dataloader.batch_iter()
        batch = next(it)
        # predict with a model
        model.predict_on_batch(batch["inputs"])
        model.predict_activation_on_batch(batch["inputs"], layer=len(model.model.layers) - 2)
        if example == "rbp":
            model.predict_activation_on_batch(batch["inputs"], layer="flatten_6")
def test_loading_old(tmpdir):
    import torch
    # load model in different ways...
    with pytest.raises(Exception):
        OldPyTorchModel()
    OldPyTorchModel(build_fn=lambda: get_simple_model())
    model_path = "example/models/pyt/model_files/"
    # load model and weights explicitly
    m1 = OldPyTorchModel(file=model_path + "pyt.py", weights=model_path + "only_weights.pth", build_fn="get_model")
    # load model and weights through the model loader
    with cd("example/models/pyt"):
        m2 = OldPyTorchModel(file="model_files/pyt.py", build_fn="get_model_w_weights")
    # assert that they are identical
    check_same_weights(m1.model.state_dict(), m2.model.state_dict())
    # now test whether loading a full model works
    tmpfile = str(tmpdir.mkdir("pytorch").join("full_model.pth"))
    m = get_simple_model()
    torch.save(m, tmpfile)
    km = OldPyTorchModel(weights=tmpfile)
    check_same_weights(m.state_dict(), km.model.state_dict())
def modified_files(git_range, source_folder, relative=True):
    """Returns files under the models dir that have been modified within the
    git range. Filenames are returned with the `source_folder` included.

    Args:
        git_range: list or tuple of length 1 or 2.
            For example, ['00232ffe', '10fab113'], or commonly ['master', 'HEAD']
            or ['master']. If length 2, then the commits are provided to
            `git diff` using the triple-dot syntax, `commit1...commit2`.
            If length 1, the comparison is any changes in the working tree
            relative to the commit.
        source_folder: str. Root of the model source/git repo.
        relative: if True, return the relative path
    """
    assert isinstance(git_range, list)
    cmds = ['diff', '--name-only'] + git_range

    with cd(source_folder):
        code, lines = _call_command("git", cmds, use_stdout=True,
                                    return_logs_with_stdout=True)
    assert code == 0
    modified = [os.path.join(source_folder, line) for line in lines]

    # exclude files that were deleted in the git-range
    existing = list(filter(os.path.exists, modified))

    # if the only diff is that files were deleted, we can have ['model/'], so
    # filter on existing *files*
    existing = list(filter(os.path.isfile, existing))

    if relative:
        return [os.path.relpath(f, source_folder) for f in existing]
    else:
        return existing
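# Usage sketch (hedged): list files changed in the working tree relative to
# master for a local model-source checkout; "." stands in for the repo root.
def example_modified_files():
    for f in modified_files(["master"], source_folder=".", relative=True):
        print(f)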
def merge_deps(models, dataloaders=None, source="kipoi", vep=False, gpu=False): """Setup the dependencies """ deps = Dependencies() for model in models: logger.info("Loading model: {0} description".format(model)) parsed_source, parsed_model = parse_source_name(source, model) sub_models = list_subcomponents(parsed_model, parsed_source, "model") if len(sub_models) == 0: raise ValueError("Model {0} not found in source {1}".format( parsed_model, parsed_source)) if len(sub_models) > 1: logger.info( "Found {0} models under the model name: {1}. Merging dependencies for all" .format(len(sub_models), parsed_model)) for sub_model in sub_models: model_descr = kipoi.get_model_descr(sub_model, parsed_source) model_dir = kipoi.get_source(parsed_source).get_model_dir( sub_model) deps = deps.merge(model_descr.dependencies) # handle the dataloader=None case if dataloaders is None or not dataloaders: if isinstance(model_descr.default_dataloader, DataLoaderImport): # dataloader specified by the import deps = deps.merge( model_descr.default_dataloader.dependencies) if model_descr.default_dataloader.parse_dependencies: # add dependencies specified in the yaml file # load from the dataloader description if you can try: with cd(model_dir): dataloader_descr = model_descr.default_dataloader.get( ) deps = deps.merge(dataloader_descr.dependencies) except ImportError as e: # package providing the dataloader is not installed yet if model_descr.default_dataloader.defined_as.startswith( "kipoiseq."): logger.info( "kipoiseq not installed. Using default kipoiseq dependencies for the dataloader: {}" .format(model_descr.default_dataloader. defined_as)) deps = deps.merge(KIPOISEQ_DEPS) else: logger.warn( "Unable to extract dataloader description. " "Make sure the package containing the dataloader `{}` is installed" .format(model_descr.default_dataloader. defined_as)) else: dataloader = os.path.normpath( os.path.join(sub_model, str(model_descr.default_dataloader))) logger.info("Inferred dataloader name: {0} from".format( dataloader) + " the model.") dataloader_descr = kipoi.get_dataloader_descr( dataloader, parsed_source) deps = deps.merge(dataloader_descr.dependencies) if dataloaders is not None or dataloaders: for dataloader in dataloaders: parsed_source, parsed_dataloader = parse_source_name( source, dataloader) sub_dataloaders = list_subcomponents(parsed_dataloader, parsed_source, "dataloader") if len(sub_dataloaders) == 0: raise ValueError( "Dataloader: {0} not found in source {1}".format( parsed_dataloader, parsed_source)) if len(sub_dataloaders) > 1: logger.info( "Found {0} dataloaders under the dataloader name: {1}. Merging dependencies for all" .format(len(sub_dataloaders), parsed_dataloader)) for sub_dataloader in sub_dataloaders: dataloader_descr = kipoi.get_dataloader_descr( sub_dataloader, parsed_source) deps = deps.merge(dataloader_descr.dependencies) # add Kipoi to the dependencies deps = KIPOI_DEPS.merge(deps) if vep: # add vep dependencies logger.info("Adding the vep dependencies") deps = VEP_DEPS.merge(deps) if gpu: logger.info("Using gpu-compatible dependencies") deps = deps.gpu() if platform == "darwin": logger.info("Using osx-type dependencies") deps = deps.osx() return deps
def model_list(model_name):
    """Models list view"""
    from kipoi.utils import cd
    source = current_app.config['SOURCE']
    df = get_model_list(source)
    model_name = model_name.rstrip('/')
    vtype_path = get_view(model_name, df)
    if vtype_path is None:
        # run 404
        return
        # pass
    else:
        vtype, path = vtype_path

    # render the model detail view
    if vtype == "model":
        # Model info retrieved from kipoi
        model = kipoi.get_model_descr(model_name, source=source)
        src = kipoi.get_source(source)
        model_dir = kipoi.utils.relative_path(src.get_model_dir(model_name), src.local_path)
        model_url = github_dir_tree(src.remote_url, model_dir)
        # Model dataloaders info retrieved from kipoi
        if model.default_dataloader:
            if isinstance(model.default_dataloader, str):
                dl_rel_path = True
                dataloader = kipoi.get_dataloader_descr(os.path.join(model_name, model.default_dataloader),
                                                        source=source)
                dataloader_name = model.default_dataloader
                dataloader_args = dataloader.args
            else:
                dl_rel_path = False
                with cd(src.get_model_dir(model_name)):
                    dataloader = model.default_dataloader.get()
                dataloader_name = model.default_dataloader.defined_as
                dataloader_args = OrderedDict([(k, v) for k, v in dataloader.args.items()
                                               if k not in list(model.default_dataloader.default_args) +
                                               dl_skip_arguments.get(dataloader_name, [])])

                if model.default_dataloader.defined_as == 'kipoiseq.dataloaders.SeqIntervalDl':
                    # HACK - cleanup some values for SeqIntervalDl
                    if model.default_dataloader.default_args.get("ignore_targets", False):
                        dataloader_args.pop('label_dtype', None)
        else:
            dataloader = None
            dataloader_name = ''
            dataloader_args = {}
            dl_rel_path = False

        title = model_name.split('/')

        # obtain snippets
        code_snippets = get_snippets(model_name, source)
        if model_name == "SeqVec/embedding2structure":
            code_snippets["docker"] = ''
            code_snippets["singularity"] = ''
            code_snippets["cli"] = ''
            code_snippets["python"] = ''
            code_snippets["R"] = ''

        # reading the README content
        readme_dir = kipoi.get_source(current_app.config['SOURCE']).get_model_dir(model_name)
        try:
            # the README filename casing varies, so match it case-insensitively:
            filelists = os.listdir(readme_dir)
            readmeindx = [x.lower() for x in filelists].index("readme.md")
            filecontent = open(os.path.join(readme_dir, filelists[readmeindx]), "r").read()
            readmecontent = render_markdown(filecontent)
            # remove the title because there is already a title
            readmecontent = re.sub("<[hH][12]>.*</[hH][12]>", "", readmecontent, count=1)
            readmecontent = Markup(readmecontent)
        except IOError:
            readmecontent = ""
        except ValueError:
            readmecontent = ""

        return render_template("models/model_details.html",
                               model_name=model_name,
                               model=model,
                               contributors=update_contributors(model.info.contributors, model.info.authors),
                               authors=update_authors(model.info.authors, model.info.cite_as),
                               dataloader=dataloader,
                               dataloader_args=dataloader_args,
                               dataloader_name=dataloader_name,
                               model_url=model_url,
                               dl_rel_path=dl_rel_path,
                               cite_as=update_cite_as(model.info.cite_as),
                               title=title,
                               code_snippets=code_snippets,
                               readmecontent=readmecontent,
                               model_postprocessing=available_postprocessing(model_name))

    # run the normal model list view on a subsetted table
    elif vtype == "model_list":
        model_df = get_model_list(source)
        # TODO - augment the results

        # Filter the results
        model_df = model_df[model_df.model.str.contains("^" + path + "/")]
        filtered_models = model_df.to_dict(orient='records')
        filtered_models = [update_cite_as_dict(x) for x in filtered_models]
        # update contributors
        filtered_models = [update_contributors_as_dict(x) for x in filtered_models]
        # update authors
        filtered_models = [update_authors_as_dict(x) for x in filtered_models]

        # get readme file
        readme_dir = os.path.join(kipoi.get_source(current_app.config['SOURCE']).local_path, model_name)
        try:
            filelists = os.listdir(readme_dir)
            readmeindx = [x.lower() for x in filelists].index("readme.md")
            filecontent = open(os.path.join(readme_dir, filelists[readmeindx]), "r").read()
            readmecontent = render_markdown(filecontent)
        except IOError:
            readmecontent = ""
        except ValueError:
            readmecontent = ""

        return render_template("models/index.html",
                               models=filtered_models,
                               readmecontent=readmecontent)

    # redirect to the group list
    elif vtype == "group_list":
        return redirect(url_for('models.list_groups', group_name=path))
def merge_deps(models,
               dataloaders=None,
               source="kipoi",
               vep=False,
               interpret=False,
               gpu=False):
    """Setup the dependencies
    """
    special_envs, only_models = split_models_special_envs(models)
    deps = Dependencies()

    # Treat the handcrafted environments differently
    for special_env in special_envs:
        from related import from_yaml
        logger.info("Loading environment definition: {0}".format(special_env))

        # Load and merge the handcrafted deps.
        yaml_path = os.path.join(kipoi.get_source(source).local_path, special_env + ".yaml")
        if not os.path.exists(yaml_path):
            raise ValueError("Environment definition file {0} not found in source {1}".
                             format(yaml_path, source))
        with open(yaml_path, "r", encoding="utf-8") as fh:
            special_env_deps = Dependencies.from_env_dict(from_yaml(fh))
        deps = deps.merge(special_env_deps)

    for model in only_models:
        logger.info("Loading model: {0} description".format(model))
        parsed_source, parsed_model = parse_source_name(source, model)

        sub_models = list_subcomponents(parsed_model, parsed_source, "model")
        if len(sub_models) == 0:
            raise ValueError("Model {0} not found in source {1}".format(parsed_model, parsed_source))
        if len(sub_models) > 1:
            logger.info("Found {0} models under the model name: {1}. Merging dependencies for all".
                        format(len(sub_models), parsed_model))

        for sub_model in sub_models:
            model_descr = kipoi.get_model_descr(sub_model, parsed_source)
            model_dir = kipoi.get_source(parsed_source).get_model_dir(sub_model)
            deps = deps.merge(model_descr.dependencies)

            # handle the dataloader=None case
            if dataloaders is None or not dataloaders:
                if isinstance(model_descr.default_dataloader, DataLoaderImport):
                    # dataloader specified by the import
                    deps = deps.merge(model_descr.default_dataloader.dependencies)

                    if model_descr.default_dataloader.parse_dependencies:
                        # add dependencies specified in the yaml file
                        # load from the dataloader description if you can
                        try:
                            with cd(model_dir):
                                dataloader_descr = model_descr.default_dataloader.get()
                            deps = deps.merge(dataloader_descr.dependencies)
                        except ImportError as e:
                            # package providing the dataloader is not installed yet
                            if model_descr.default_dataloader.defined_as.startswith("kipoiseq."):
                                logger.info("kipoiseq not installed. Using default kipoiseq dependencies "
                                            "for the dataloader: {}".format(model_descr.default_dataloader.defined_as))
                                deps = deps.merge(KIPOISEQ_DEPS)
                            else:
                                logger.warn("Unable to extract dataloader description. "
                                            "Make sure the package containing the dataloader `{}` is installed".
                                            format(model_descr.default_dataloader.defined_as))
                else:
                    dataloader = os.path.normpath(os.path.join(sub_model, str(model_descr.default_dataloader)))
                    logger.info("Inferred dataloader name: {0} from the model.".format(dataloader))
                    dataloader_descr = kipoi.get_dataloader_descr(dataloader, parsed_source)
                    deps = deps.merge(dataloader_descr.dependencies)

    if dataloaders is not None or dataloaders:
        for dataloader in dataloaders:
            parsed_source, parsed_dataloader = parse_source_name(source, dataloader)
            sub_dataloaders = list_subcomponents(parsed_dataloader, parsed_source, "dataloader")
            if len(sub_dataloaders) == 0:
                raise ValueError("Dataloader: {0} not found in source {1}".format(parsed_dataloader, parsed_source))
            if len(sub_dataloaders) > 1:
                logger.info("Found {0} dataloaders under the dataloader name: {1}. Merging dependencies for all".
                            format(len(sub_dataloaders), parsed_dataloader))
            for sub_dataloader in sub_dataloaders:
                dataloader_descr = kipoi.get_dataloader_descr(sub_dataloader, parsed_source)
                deps = deps.merge(dataloader_descr.dependencies)

    # add Kipoi to the dependencies
    deps = KIPOI_DEPS.merge(deps)

    if vep:
        # add vep dependencies
        logger.info("Adding the vep dependencies")
        deps = VEP_DEPS.merge(deps)

    if interpret:
        # add interpret dependencies
        logger.info("Adding the interpret dependencies")
        deps = INTERPRET_DEPS.merge(deps)

    if gpu:
        logger.info("Using gpu-compatible dependencies")
        deps = deps.gpu()

    if platform == "darwin":
        logger.info("Using osx-type dependencies")
        deps = deps.osx()

    return deps
def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data
    """
    assert command == "create_mutation_map"
    parser = argparse.ArgumentParser('kipoi postproc {}'.format(command),
                                     description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('-r', '--regions_file',
                        help='Region definition as VCF or bed file. Not a required input.')
    # TODO - rename path to fpath
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i", "--install_req", action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument('-o', '--output', required=True,
                        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument('-s', "--scores", default="diff", nargs="+",
                        help="Scoring method to be used. Only scoring methods selected in the model yaml file are "
                             "available, except for `diff`, which is always available. Select the scoring function "
                             "by the `name` tag defined in the model yaml file.")
    parser.add_argument('-k', "--score_kwargs", default="", nargs="+",
                        help="JSON definition of the kwargs for the scoring functions selected in --scores. The "
                             "definition can either be given as JSON on the command line or as the path of a .json "
                             "file. The individual JSONs are expected to be supplied in the same order as the labels "
                             "defined in --scores. If the defaults or no arguments should be used, define '{}' for "
                             "that respective scoring method.")
    parser.add_argument('-l', "--seq_length", type=int, default=None,
                        help="Optional parameter: Model input sequence length - necessary if the model does not "
                             "have a pre-defined input sequence length.")

    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    dataloader_arguments = parse_json_file_str(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")

    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" % args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi.postprocessing.variant_effects.ModelInfoExtractor(model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi.postprocessing.variant_effects.SnvCenteredRg(model_info,
                                                                               seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the bed-file based region generator
            bed_to_region = kipoi.postprocessing.variant_effects.BedOverlappingRg(model_info,
                                                                                  seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception("Regions file has to be a .vcf, .vcf.gz, or .bed file.")

    if model_info.use_seq_only_rc:
        logger.info('Model SUPPORTS simple reverse complementation of input DNA sequences.')
    else:
        logger.info('Model DOES NOT support simple reverse complementation of input DNA sequences.')

    from kipoi.postprocessing.variant_effects.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(model,
                                  Dl,
                                  vcf_fpath=vcf_region_file,
                                  bed_fpath=bed_region_file,
                                  batch_size=args.batch_size,
                                  num_workers=args.num_workers,
                                  dataloader_args=dataloader_arguments,
                                  vcf_to_region=vcf_to_region,
                                  bed_to_region=bed_to_region,
                                  evaluation_function_kwargs={'diff_types': dts})
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')
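# Example CLI invocation (hedged: the model argument is added by `add_model`
# above, so its exact form may differ; file paths are illustrative):
#
#   kipoi postproc create_mutation_map Basset \
#       --regions_file variants.vcf \
#       --batch_size 32 \
#       --scores diff \
#       --output mutation_map.hdf5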