def test_var_eff_pred_varseq(tmpdir):
    """Score the test VCF with DeepSEA/variantEffects and check an output VCF appears.

    The dataloader is `SeqIntervalDl` instead of the model's default one. The
    annotated VCF is written into a fresh sub-directory of `tmpdir`.
    """
    model_name = "DeepSEA/variantEffects"
    if INSTALL_REQ:
        install_model_requirements(model_name, "kipoi", and_dataloaders=True)

    model = kipoi.get_model(model_name, source="kipoi")
    # The preprocessor
    Dataloader = SeqIntervalDl

    raw_dl_args = {
        "intervals_file": "example_files/intervals.bed",
        "fasta_file": "example_files/hg38_chr22.fa",
        "required_seq_len": 1000,
        "alphabet_axis": 1,
        "dummy_axis": 2,
        "label_dtype": str,
    }
    # Only string values are prefixed with the model source directory; every
    # other value (ints, the `str` type itself) is passed through untouched.
    dataloader_arguments = {}
    for arg_name, arg_value in raw_dl_args.items():
        if isinstance(arg_value, str):
            arg_value = model.source_dir + "/" + arg_value
        dataloader_arguments[arg_name] = arg_value

    out_dir = tmpdir.mkdir("variants_generated")
    out_vcf_fpath = str(out_dir.join("out.vcf"))
    vcf_path = kipoi_veff.ensure_tabixed_vcf("tests/data/variants.vcf")

    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
    writer = kipoi_veff.VcfWriter(model,
                                  vcf_path,
                                  out_vcf_fpath,
                                  standardise_var_id=True)

    sp.predict_snvs(model,
                    Dataloader,
                    vcf_path,
                    dataloader_args=dataloader_arguments,
                    batch_size=32,
                    vcf_to_region=vcf_to_region,
                    sync_pred_writer=writer)
    writer.close()

    assert os.path.exists(out_vcf_fpath)
def add_scores(self, snp_vcf_path='../data/snp_vcfs', out_dir=None):
    """Annotate every VCF found in `self.snp_vcf_path` with `self.model` predictions.

    The part of each file name before the first dot is taken as the
    chromosome name. It selects the fasta file
    `../data/fasta_files/chr<chrom>.fa` and names the output
    `<out_dir>/<chrom>.vcf`.

    # Arguments
        snp_vcf_path: NOTE(review): this argument is never read; the input
            directory always comes from `self.snp_vcf_path`. Confirm with
            the callers which of the two is intended before changing it.
        out_dir: output directory, created if missing. Defaults to
            `../data/model_scores/<self.model_name>`.
    """
    if out_dir is None:
        out_dir = '../data/model_scores/' + self.model_name
    # EAFP instead of "exists then makedirs": no race with a concurrent
    # creator, and an unrelated OSError (e.g. permissions) still surfaces.
    try:
        os.makedirs(out_dir)
    except OSError:
        if not os.path.isdir(out_dir):
            raise

    for file_name in os.listdir(self.snp_vcf_path):
        chrom = file_name.split('.')[0]
        Dataloader = self.model.default_dataloader
        vcf_path = self.snp_vcf_path + '/' + file_name
        out_vcf_fpath = out_dir + '/' + chrom + '.vcf'
        print(vcf_path)
        print(out_vcf_fpath)

        writer = VcfWriter(self.model, vcf_path, out_vcf_fpath)
        try:
            model_info = kipoi_veff.ModelInfoExtractor(self.model, Dataloader)
            # vcf_to_region will generate a variant-centered regions when
            # presented a VCF record.
            vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
            dataloader_arguments = {
                "fasta_file": '../data/fasta_files/chr' + chrom + '.fa'
            }
            sp.predict_snvs(self.model,
                            Dataloader,
                            vcf_path,
                            batch_size=32,
                            dataloader_args=dataloader_arguments,
                            vcf_to_region=vcf_to_region,
                            sync_pred_writer=writer)
        finally:
            # Release the writer for this chromosome even if the prediction
            # fails, so one bad input does not leak an open output file.
            writer.close()
def test_mutation_map():
    """Generate a mutation map for the rbp example model and compare it to the stored reference.

    The map is saved to `example_files/first_variant_mm_totest.hdf5` inside
    the model source directory, read back and compared against
    `example_files/first_variant_mm.hdf5`. Plotting is then smoke-tested.
    """
    if sys.version_info[0] == 2:
        pytest.skip("rbp example not supported on python 2 ")

    # Take the rbp model
    model_dir = "tests/models/rbp/"
    if INSTALL_REQ:
        install_model_requirements(model_dir, "dir", and_dataloaders=True)

    model = kipoi.get_model(model_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(model_dir, source="dir")

    dataloader_arguments = {
        "fasta_file": "example_files/hg38_chr22.fa",
        "preproc_transformer": "dataloader_files/encodeSplines.pkl",
        "gtf_file": "example_files/gencode_v25_chr22.gtf.pkl.gz",
    }
    dataloader_arguments = {
        k: model_dir + v
        for k, v in dataloader_arguments.items()
    }

    # Run the actual predictions
    vcf_path = model_dir + "example_files/first_variant.vcf"

    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)
    mdmm = mm._generate_mutation_map(
        model,
        Dataloader,
        vcf_path,
        dataloader_args=dataloader_arguments,
        evaluation_function=analyse_model_preds,
        batch_size=32,
        vcf_to_region=vcf_to_region,
        evaluation_function_kwargs={'diff_types': {
            'diff': Diff("mean")
        }})

    with cd(model.source_dir):
        from kipoi_veff.utils.generic import read_hdf5
        # Import the submodule explicitly: a bare `import matplotlib` does
        # not guarantee that `matplotlib.pyplot` is available as attribute.
        import matplotlib.pyplot as plt

        generated_fpath = "example_files/first_variant_mm_totest.hdf5"
        try:
            mdmm.save_to_file(generated_fpath)
            reference = read_hdf5("example_files/first_variant_mm.hdf5")
            # Read back the file written above; reading the reference a
            # second time would compare the reference with itself.
            obs = read_hdf5(generated_fpath)
            compare_rec(reference[0], obs[0])

            plt.switch_backend('agg')
            mdmm.plot_mutmap(0, "seq", "diff", "rbp_prb")
        finally:
            # Do not leave the generated file behind when a check fails.
            if os.path.exists(generated_fpath):
                os.unlink(generated_fpath)
import pandas as pd

# Score the variants of a test VCF with one DeepBind model and open the
# annotated result for parsing.
model_name = "DeepBind/Homo_sapiens/TF/D00299.003_SELEX_ATF7"
# get the model
model = kipoi.get_model(model_name)
# get the dataloader factory
Dataloader = model.default_dataloader

vcf_path = "../data/test.vcf"
# The output vcf path, based on the input file name: the last four
# characters of the input path are replaced by "<model name>.vcf", with the
# slashes of the model name turned into underscores.
out_vcf_fpath = vcf_path[:-4] + "{}.vcf".format(model_name.replace("/", "_"))

# The writer object that will output the annotated VCF
writer = VcfWriter(model, vcf_path, out_vcf_fpath)

# Information extraction from dataloader and model
model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
# vcf_to_region will generate a variant-centered regions when presented a VCF record.
vcf_to_region = kipoi_veff.SnvCenteredRg(model_info)

dataloader_arguments = {"fasta_file": "../data/fasta_files/chr1.fa"}

sp.predict_snvs(
    model,
    Dataloader,
    vcf_path,
    batch_size=32,
    dataloader_args=dataloader_arguments,
    vcf_to_region=vcf_to_region,
    # evaluation_function_kwargs={'diff_types': {'diff': Diff("mean"), 'deepsea_effect': DeepSEA_effect("mean")}},
    sync_pred_writer=writer,
)

vcf_reader = KipoiVCFParser(out_vcf_fpath)
def cli_score_variants(command, raw_args):
    """CLI interface to score variants.

    Parses `raw_args`, validates the input/output paths and then either
    re-runs the same command through `singularity_command` (when
    `--singularity` is given) or calls `kipoi_veff.score_variants`.

    # Arguments
        command: name of the sub-command; has to be "score_variants".
        raw_args: list of command line arguments following the sub-command.

    # Returns
        None. The process exits with status 1 when neither `--output_vcf`
        nor `--extra_output` is given or when the extra output has an
        unsupported file ending.

    # Raises
        ValueError: if `--score_kwargs` is combined with `--scores all` or
            if its number of entries differs from the number of `--scores`.
    """
    # Updated argument names:
    # - scoring -> scores
    # - --vcf_path -> --input_vcf, -i
    # - --out_vcf_fpath -> --output_vcf, -o
    # - --output -> -e, --extra_output
    # - remove - -install_req
    # - scoring_kwargs -> score_kwargs
    AVAILABLE_FORMATS = [k for k in writers.FILE_SUFFIX_MAP if k != 'bed']
    assert command == "score_variants"
    parser = argparse.ArgumentParser(
        'kipoi veff {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    parser.add_argument('model', help='Model name.')
    parser.add_argument(
        '--source',
        default="kipoi",
        choices=list(kipoi.config.model_sources().keys()),
        help='Model source to use. Specified in ~/.kipoi/config.yaml' +
        " under model_sources. " +
        "'dir' is an additional source referring to the local folder.")

    add_dataloader(parser=parser, with_args=True)

    parser.add_argument('-i', '--input_vcf', required=True, help='Input VCF.')
    parser.add_argument('-o',
                        '--output_vcf',
                        help='Output annotated VCF file path.',
                        default=None)
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument(
        '-r',
        '--restriction_bed',
        default=None,
        help="Regions for prediction can only be subsets of this bed file")
    parser.add_argument(
        '-e',
        '--extra_output',
        type=str,
        default=None,
        required=False,
        help=
        "Additional output files in other (non-vcf) formats. File format is inferred from the file path ending"
        + ". Available file formats are: {0}".format(", ".join(
            ["." + k for k in AVAILABLE_FORMATS])))
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are "
        "available except for `diff` which is always available. Select scoring function by the "
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definition can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l',
        "--seq_length",
        type=int,
        default=None,
        help=
        "Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")
    parser.add_argument(
        '--std_var_id',
        action="store_true",
        help="If set then variant IDs in the annotated"
        " VCF will be replaced with a standardised, unique ID.")
    parser.add_argument(
        "--model_outputs",
        type=str,
        default=None,
        nargs="+",
        help=
        "Optional parameter: Only return predictions for the selected model outputs. Naming "
        "according to the definition in model.yaml > schema > targets > column_labels"
    )
    parser.add_argument(
        "--model_outputs_i",
        type=int,
        default=None,
        nargs="+",
        help=
        "Optional parameter: Only return predictions for the selected model outputs. Give integer "
        "indices of the selected model output(s).")
    parser.add_argument(
        "--singularity",
        action='store_true',
        help="Run `kipoi predict` in the appropriate singularity container. "
        "Containers will get downloaded to ~/.kipoi/envs/ or to "
        "$SINGULARITY_CACHEDIR if set")

    args = parser.parse_args(raw_args)

    # Check that all the folders exist
    file_exists(args.input_vcf, logger)

    if args.output_vcf is None and args.extra_output is None:
        logger.error(
            "One of the two needs to be specified: --output_vcf or --extra_output"
        )
        sys.exit(1)

    if args.extra_output is not None:
        # dirname() of a bare file name is "", which never exists; resolve
        # against the working directory first.
        dir_exists(os.path.dirname(os.path.abspath(args.extra_output)), logger)
        ending = args.extra_output.split('.')[-1]
        if ending not in AVAILABLE_FORMATS:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, args.extra_output, AVAILABLE_FORMATS))
            sys.exit(1)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info(
            "Running kipoi veff {} in the singularity container".format(
                command))
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']

        dataloader_kwargs = parse_json_file_str_or_arglist(
            args.dataloader_args)

        # collect the paths of all files that will be written
        output_files = []
        if args.output_vcf is not None:
            output_files.append(args.output_vcf)
        if args.extra_output is not None:
            output_files.append(args.extra_output)

        singularity_command(['kipoi', 'veff', command] + raw_args,
                            model=args.model,
                            dataloader_kwargs=dataloader_kwargs,
                            output_files=output_files,
                            source=args.source,
                            dry_run=False)
        return None

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    score_kwargs = []
    if len(args.score_kwargs) > 0:
        # Check if all scoring functions should be used:
        if args.scores == ["all"]:
            raise ValueError(
                "`--score_kwargs` cannot be defined in combination will `--scoring all`!"
            )
        # One kwargs definition is needed per selected score. Compare with
        # the scores themselves: comparing the raw kwargs list with its own
        # parsed version could never fail.
        if len(args.score_kwargs) != len(args.scores):
            raise ValueError(
                "When defining `--score_kwargs` a JSON representation of arguments (or the "
                "path of a file containing them) must be given for every "
                "`--scores` function.")
        score_kwargs = [parse_json_file_str(el) for el in args.score_kwargs]

    # VCF writer
    output_vcf_model = None
    if args.output_vcf is not None:
        dir_exists(os.path.dirname(os.path.abspath(args.output_vcf)), logger)
        output_vcf_model = args.output_vcf

    # Other writers (the target directory was already checked above)
    extra_writers = []
    if args.extra_output is not None:
        writer = writers.get_writer(args.extra_output, metadata_schema=None)
        assert writer is not None
        extra_writers = [SyncBatchWriter(writer)]

    dataloader_arguments = parse_json_file_str_or_arglist(args.dataloader_args)

    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader,
                                          args.dataloader_source)
    else:
        Dl = model.default_dataloader

    # Load effect prediction related model info; only used for logging here
    model_info = kipoi_veff.ModelInfoExtractor(model, Dl)

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    if output_vcf_model is not None:
        logger.info('Annotated VCF will be written to %s.' %
                    str(output_vcf_model))

    # Named outputs take precedence over integer indices
    model_outputs = None
    if args.model_outputs is not None:
        model_outputs = args.model_outputs
    elif args.model_outputs_i is not None:
        model_outputs = args.model_outputs_i

    kipoi_veff.score_variants(model,
                              dataloader_arguments,
                              args.input_vcf,
                              output_vcf=output_vcf_model,
                              output_writers=extra_writers,
                              scores=args.scores,
                              score_kwargs=score_kwargs,
                              num_workers=args.num_workers,
                              batch_size=args.batch_size,
                              seq_length=args.seq_length,
                              std_var_id=args.std_var_id,
                              restriction_bed=args.restriction_bed,
                              return_predictions=False,
                              model_outputs=model_outputs)

    logger.info('Successfully predicted samples')
def cli_create_mutation_map(command, raw_args):
    """CLI interface to calculate mutation map data.

    Parses `raw_args`, selects a region generator from the ending of the
    regions file (`.vcf` / `.vcf.gz` or `.bed`), runs
    `_generate_mutation_map` and saves the result to `--output`.

    # Arguments
        command: name of the sub-command; has to be "create_mutation_map".
        raw_args: list of command line arguments following the sub-command.

    # Returns
        None

    # Raises
        Exception: if `--regions_file` is missing, does not exist or has an
            unsupported file ending.
    """
    assert command == "create_mutation_map"
    parser = argparse.ArgumentParser(
        'kipoi veff {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument(
        '-r',
        '--regions_file',
        help='Region definition as VCF or bed file. Not a required input.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='Batch size to use in prediction')
    parser.add_argument(
        "-n",
        "--num_workers",
        type=int,
        default=0,
        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-i",
                        "--install_req",
                        action='store_true',
                        help="Install required packages from requirements.txt")
    parser.add_argument(
        '-o',
        '--output',
        required=True,
        help="Output HDF5 file. To be used as input for plotting.")
    parser.add_argument(
        '-s',
        "--scores",
        default="diff",
        nargs="+",
        help=
        "Scoring method to be used. Only scoring methods selected in the model yaml file are "
        "available except for `diff` which is always available. Select scoring function by the "
        "`name` tag defined in the model yaml file.")
    parser.add_argument(
        '-k',
        "--score_kwargs",
        default="",
        nargs="+",
        help=
        "JSON definition of the kwargs for the scoring functions selected in --scores. The "
        "definition can either be in JSON in the command line or the path of a .json file. The "
        "individual JSONs are expected to be supplied in the same order as the labels defined in "
        "--scores. If the defaults or no arguments should be used define '{}' for that respective "
        "scoring method.")
    parser.add_argument(
        '-l',
        "--seq_length",
        type=int,
        default=None,
        help=
        "Optional parameter: Model input sequence length - necessary if the model does not have a "
        "pre-defined input sequence length.")
    parser.add_argument(
        "--singularity",
        action='store_true',
        help="Run `kipoi predict` in the appropriate singularity container. "
        "Containers will get downloaded to ~/.kipoi/envs/ or to "
        "$SINGULARITY_CACHEDIR if set")

    args = parser.parse_args(raw_args)

    # extract args for kipoi.variant_effects.predict_snvs
    logger.debug("DL ARGS %s", args.dataloader_args)
    dataloader_arguments = parse_json_file_str_or_arglist(args.dataloader_args)

    if args.output is None:
        raise Exception("Output file `--output` has to be set!")

    # NOTE(review): the help text calls the regions file optional, but
    # everything below needs it (realpath, existence check, format
    # detection). Fail with a clear message instead of a TypeError from
    # os.path.realpath(None).
    if args.regions_file is None:
        raise Exception("Regions file `--regions_file` has to be set!")

    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info(
            "Running kipoi veff {} in the singularity container".format(
                command))
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        # Pass the output as a list of paths, the same way
        # `cli_score_variants` does.
        singularity_command(['kipoi', 'veff', command] + raw_args,
                            args.model,
                            dataloader_arguments,
                            output_files=[args.output],
                            source=args.source,
                            dry_run=False)
        return None
    # --------------------------------------------
    # install args
    if args.install_req:
        kipoi.pipeline.install_model_requirements(args.model,
                                                  args.source,
                                                  and_dataloaders=True)
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    # Resolve against the current working directory before changing into
    # the model source directory.
    regions_file = os.path.realpath(args.regions_file)
    output = os.path.realpath(args.output)
    with cd(model.source_dir):
        if not os.path.exists(regions_file):
            raise Exception("Regions inputs file does not exist: %s" %
                            args.regions_file)

        # Check that all the folders exist
        file_exists(regions_file, logger)
        dir_exists(os.path.dirname(output), logger)

        if args.dataloader is not None:
            Dl = kipoi.get_dataloader_factory(args.dataloader,
                                              args.dataloader_source)
        else:
            Dl = model.default_dataloader

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    # TODO - why is this function not a method of the model class?
    dts = get_scoring_fns(model, args.scores, args.score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi_veff.ModelInfoExtractor(model, Dl)
    manual_seq_len = args.seq_length

    # Select the appropriate region generator and vcf or bed file input
    args.file_format = regions_file.split(".")[-1]
    bed_region_file = None
    vcf_region_file = None
    bed_to_region = None
    vcf_to_region = None
    if args.file_format == "vcf" or regions_file.endswith("vcf.gz"):
        vcf_region_file = regions_file
        if model_info.requires_region_definition:
            # Select the SNV-centered region generator
            vcf_to_region = kipoi_veff.SnvCenteredRg(model_info,
                                                     seq_length=manual_seq_len)
            logger.info('Using variant-centered sequence generation.')
    elif args.file_format == "bed":
        if model_info.requires_region_definition:
            # Select the bed-overlap based region generator
            bed_to_region = kipoi_veff.BedOverlappingRg(
                model_info, seq_length=manual_seq_len)
            logger.info('Using bed-file based sequence generation.')
        bed_region_file = regions_file
    else:
        raise Exception(
            "Unsupported regions file format: %s. Expected a .vcf, .vcf.gz "
            "or .bed file." % args.regions_file)

    if model_info.use_seq_only_rc:
        logger.info(
            'Model SUPPORTS simple reverse complementation of input DNA sequences.'
        )
    else:
        logger.info(
            'Model DOES NOT support simple reverse complementation of input DNA sequences.'
        )

    from kipoi_veff.mutation_map import _generate_mutation_map
    mdmm = _generate_mutation_map(
        model,
        Dl,
        vcf_fpath=vcf_region_file,
        bed_fpath=bed_region_file,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        dataloader_args=dataloader_arguments,
        vcf_to_region=vcf_to_region,
        bed_to_region=bed_to_region,
        evaluation_function_kwargs={'diff_types': dts},
    )
    mdmm.save_to_file(output)

    logger.info('Successfully generated mutation map data')
def score_variants(
        model,
        dl_args,
        input_vcf,
        output_vcf=None,
        output_writers=None,
        scores=["logit_ref", "logit_alt", "ref", "alt", "logit", "diff"],
        score_kwargs=None,
        num_workers=0,
        batch_size=32,
        source='kipoi',
        seq_length=None,
        std_var_id=False,
        restriction_bed=None,
        return_predictions=False,
        model_outputs=None):
    """Score variants: annotate the vcf file using
    model predictions for the reference and alternative alleles

    The functional elements that generate a score from a set of predictions for reference and
    alternative allele are defined in the `scores` argument.

    This function is the python version of the command-line call `score_variants` and is a convenience version
    of the `predict_snvs` function:

    Prediction of effects of SNV based on a VCF. If desired the VCF can be stored with the predicted values as
    annotation. For a detailed description of the requirements in the yaml files please take a look at
    the core `kipoi` documentation on how to write a `dataloader.yaml` file or at the documentation of
    `kipoi-veff` in the section: `overview/#model-and-dataloader-requirements`.

    # Arguments
        model: model string or a model class instance
        dl_args: dataloader arguments as a dictionary
        input_vcf: input vcf file path
        output_vcf: output vcf file path
        output_writers: a list of output writers to use. The list passed by
            the caller is not modified.
        scores: list of score names to compute. See `kipoi_veff.scores`.
            The default list is only read, never modified.
        score_kwargs: optional, list of kwargs that corresponds to the entries in score.
        num_workers: number of parallel workers to use for dataloading
        batch_size: batch_size for dataloading
        source: model source name
        std_var_id: If true then variant IDs in the annotated VCF will be replaced with a standardised, unique ID.
        seq_length: If model accepts variable input sequence length then this value has to be set!
        restriction_bed: If dataloader can be run with regions generated from the VCF then only variants that overlap
            regions defined in `restriction_bed` will be tested.
        return_predictions: return generated predictions also as pandas dataframe.
        model_outputs: If set then either a boolean filter or a named filter for model outputs that are reported.

    # Returns
        dict: containing a pandas DataFrame containing the calculated values
            for each model output (target) column VCF SNV line. If `return_predictions == False`, returns None.

    # Raises
        ValueError: if neither `output_vcf` nor a non-empty `output_writers`
            is given.
    """
    # Validate first: without an output target there is nothing to do, so
    # fail before the model is loaded and before the VCF is tabix-indexed.
    if output_vcf is None and not output_writers:
        raise ValueError(
            "Either output_vcf or output_writers need to be specified")

    import kipoi
    in_vcf_path_abs = os.path.realpath(input_vcf)
    if isinstance(model, str):
        model = kipoi.get_model(model, source=source, with_dataloader=True)
    Dataloader = model.default_dataloader
    vcf_path_tbx = ensure_tabixed_vcf(
        in_vcf_path_abs)  # TODO - run this within the function

    if output_writers is None:
        output_writers = []
    elif isinstance(output_writers, list):
        # Work on a copy so the VcfWriter appended below does not leak into
        # the caller's list.
        output_writers = list(output_writers)

    if output_vcf is not None:
        out_vcf_path_abs = os.path.realpath(output_vcf)
        output_writers.append(
            VcfWriter(model,
                      in_vcf_path_abs,
                      out_vcf_path_abs,
                      standardise_var_id=std_var_id))

    dts = get_scoring_fns(model, scores, score_kwargs)

    # Load effect prediction related model info
    model_info = kipoi_veff.ModelInfoExtractor(model, Dataloader)
    vcf_to_region = _get_vcf_to_region(model_info, restriction_bed, seq_length)

    return predict_snvs(model,
                        Dataloader,
                        vcf_path_tbx,
                        batch_size=batch_size,
                        dataloader_args=dl_args,
                        num_workers=num_workers,
                        vcf_to_region=vcf_to_region,
                        evaluation_function_kwargs={
                            'diff_types': dts,
                            'output_filter': model_outputs
                        },
                        sync_pred_writer=output_writers,
                        return_predictions=return_predictions)