def predict_example(self, batch_size=32, output_file=None, **kwargs):
    """Run model prediction for the example file

    # Arguments
        batch_size: batch size used for prediction
        output_file: if not None, inputs and predictions are stored to `output_file` path
        **kwargs: Further arguments passed to the writer; `keep_metadata=True`
            also attaches the batch metadata to the returned predictions
    """
    logger.info('Initialized data generator. Running batches...')

    from kipoi.writers import get_writer
    from kipoi.cli.main import prepare_batch

    if output_file is not None:
        output_file = os.path.abspath(output_file)
        if os.path.exists(output_file):
            raise ValueError("Output file: {} already exists.".format(output_file))

    with cd(self.dataloader_cls.source_dir):
        # init the dataloader
        dl = self.dataloader_cls.init_example()
        logger.info('Returned data schema correct')

        if output_file is not None:
            writer = get_writer(output_file, dl.get_output_schema().metadata, **kwargs)

        it = dl.batch_iter(batch_size=batch_size)

        # test that all predictions go through
        pred_list = []
        keep_metadata = bool(kwargs.get('keep_metadata', False))
        for i, batch in enumerate(tqdm(it)):
            if i == 0 and not self.dataloader_cls.get_output_schema().compatible_with_batch(batch):
                logger.warning("First batch of data is not compatible with the dataloader schema.")
            pred_batch = self.model.predict_on_batch(batch['inputs'])
            if keep_metadata and 'metadata' in batch:
                pred_list.append({'preds': pred_batch,
                                  'metadata': batch['metadata']})
            else:
                pred_list.append(pred_batch)
            if output_file is not None:
                output_batch = prepare_batch(batch, pred_batch,
                                             keep_inputs=True,
                                             keep_metadata=keep_metadata)
                writer.batch_write(output_batch)

        if output_file is not None:
            writer.close()

    logger.info('predict_example done!')
    return numpy_collate_concat(pred_list)
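# A minimal usage sketch for `predict_example` (hedged: assumes the "Basset"
# model from the Kipoi zoo and that this pipeline is exposed as
# `model.pipeline`, as returned by `kipoi.get_model`):
#
#   import kipoi
#   model = kipoi.get_model("Basset")
#   preds = model.pipeline.predict_example(batch_size=16)
#
# Predictions are collated with `numpy_collate_concat`, so `preds` mirrors the
# model's output schema (an array, list, or dict of arrays).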
def predict_to_file(self, output_file, dataloader_kwargs, batch_size=32,
                    keep_inputs=False, keep_metadata=False, **kwargs):
    """Make predictions and write them iteratively to a file

    # Arguments
        output_file: output file path. The file format is inferred from the file
            path ending. Available file formats are: 'bed', 'h5', 'hdf5', 'tsv'
        dataloader_kwargs: Keyword arguments passed to the dataloader
        batch_size: Batch size used for the dataloader
        keep_inputs: if True, inputs and targets will also be written to the output file.
        keep_metadata: if True, metadata will also be written to the output file.
        **kwargs: Further arguments passed to batch_iter
    """
    from kipoi.writers import get_writer
    from kipoi.cli.main import prepare_batch

    # setup dataloader
    validate_kwargs(self.dataloader_cls, dataloader_kwargs)
    dl = self.dataloader_cls(**dataloader_kwargs)
    it = dl.batch_iter(batch_size=batch_size, **kwargs)

    writer = get_writer(output_file, dl.get_output_schema().metadata, **kwargs)

    for i, batch in enumerate(tqdm(it)):
        if i == 0 and not self.dataloader_cls.get_output_schema().compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")
        pred_batch = self.model.predict_on_batch(batch['inputs'])
        output_batch = prepare_batch(batch, pred_batch,
                                     keep_inputs=keep_inputs,
                                     keep_metadata=keep_metadata)
        writer.batch_write(output_batch)
    writer.close()
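# A usage sketch for `predict_to_file` (hedged: the dataloader kwargs below are
# hypothetical and depend on the model's dataloader; the file suffix selects
# the writer):
#
#   model.pipeline.predict_to_file(
#       "predictions.h5",
#       dataloader_kwargs={"intervals_file": "intervals.bed",
#                          "fasta_file": "hg38.fa"},
#       batch_size=64,
#       keep_inputs=True)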
def cli_predict(command, raw_args):
    """CLI interface to predict
    """
    assert command == "predict"
    parser = argparse.ArgumentParser('kipoi {}'.format(command),
                                     description='Run the model prediction.')
    add_model(parser)
    add_dataloader(parser, with_args=True)
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument("-k", "--keep_inputs", action='store_true',
                        help="Keep the inputs in the output file.")
    parser.add_argument("-l", "--layer",
                        help="Which output layer to use to make the predictions. If specified, "
                             "`model.predict_activation_on_batch` will be invoked instead of "
                             "`model.predict_on_batch`")
    parser.add_argument("--singularity", action='store_true',
                        help="Run `kipoi predict` in the appropriate singularity container. "
                             "Containers will get downloaded to ~/.kipoi/envs/ or to "
                             "$SINGULARITY_CACHEDIR if set")
    parser.add_argument('-o', '--output', required=True, nargs="+",
                        help="Output files. The file format is inferred from the file path ending. "
                             "Available file formats are: " +
                             ", ".join(["." + k for k in writers.FILE_SUFFIX_MAP]))
    args = parser.parse_args(raw_args)

    dataloader_kwargs = parse_json_file_str_or_arglist(args.dataloader_args, parser)

    # setup the files
    if not isinstance(args.output, list):
        args.output = [args.output]
    for o in args.output:
        ending = o.split('.')[-1]
        if ending not in writers.FILE_SUFFIX_MAP:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, o, writers.FILE_SUFFIX_MAP))
            sys.exit(1)
        dir_exists(os.path.dirname(o), logger)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info("Running kipoi predict in the singularity container")
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        singularity_command(['kipoi', command] + raw_args,
                            args.model,
                            dataloader_kwargs,
                            output_files=args.output,
                            source=args.source,
                            dry_run=False)
        return None

    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)
    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader
    dataloader_kwargs = kipoi.pipeline.validate_kwargs(Dl, dataloader_kwargs)
    dl = Dl(**dataloader_kwargs)

    # setup batching
    it = dl.batch_iter(batch_size=args.batch_size,
                       num_workers=args.num_workers)

    # Setup the writers
    use_writers = []
    for output in args.output:
        writer = writers.get_writer(output,
                                    metadata_schema=dl.get_output_schema().metadata)
        if writer is None:
            logger.error("Unknown file format: {0}".format(output))
            sys.exit(1)
        use_writers.append(writer)
    output_writers = writers.MultipleBatchWriter(use_writers)

    # Loop through the data, make predictions, save the output
    for i, batch in enumerate(tqdm(it)):
        # validate the data schema in the first iteration
        if i == 0 and not Dl.get_output_schema().compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")

        # make the prediction
        if args.layer is None:
            pred_batch = model.predict_on_batch(batch['inputs'])
        else:
            pred_batch = model.predict_activation_on_batch(batch['inputs'],
                                                           layer=args.layer)

        # write out the predictions, metadata (, inputs, targets)
        output_batch = prepare_batch(batch, pred_batch, keep_inputs=args.keep_inputs)
        output_writers.batch_write(output_batch)

    output_writers.close()
    logger.info('Done! Predictions stored in {0}'.format(",".join(args.output)))
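# Example invocation of the CLI above (hedged sketch; the model name and file
# paths are placeholders):
#
#   kipoi predict Basset \
#       --dataloader_args='{"intervals_file": "intervals.bed", "fasta_file": "hg38.fa"}' \
#       --batch_size=32 -n 4 \
#       -o predictions.tsv predictions.h5
#
# Several `-o` targets are fanned out via `writers.MultipleBatchWriter`, which
# forwards each batch to every underlying writer.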
def cli_score_variants(command, raw_args):
    """CLI interface to score variants
    """
    # Updated argument names:
    # - scoring -> scores
    # - --vcf_path -> --input_vcf, -i
    # - --out_vcf_fpath -> --output_vcf, -o
    # - --output -> -e, --extra_output
    # - removed --install_req
    # - scoring_kwargs -> score_kwargs
    AVAILABLE_FORMATS = [k for k in writers.FILE_SUFFIX_MAP if k != 'bed']
    assert command == "score_variants"
    parser = argparse.ArgumentParser(
        'kipoi veff {}'.format(command),
        description='Predict effect of SNVs using ISM.')
    parser.add_argument('model', help='Model name.')
    parser.add_argument('--source', default="kipoi",
                        choices=list(kipoi.config.model_sources().keys()),
                        help="Model source to use. Specified in ~/.kipoi/config.yaml "
                             "under model_sources. 'dir' is an additional source "
                             "referring to the local folder.")
    add_dataloader(parser=parser, with_args=True)
    parser.add_argument('-i', '--input_vcf', required=True,
                        help='Input VCF.')
    parser.add_argument('-o', '--output_vcf', default=None,
                        help='Output annotated VCF file path.')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size to use in prediction')
    parser.add_argument("-n", "--num_workers", type=int, default=0,
                        help="Number of parallel workers for loading the dataset")
    parser.add_argument('-r', '--restriction_bed', default=None,
                        help="Regions for prediction can only be subsets of this bed file")
    parser.add_argument('-e', '--extra_output', type=str, default=None, required=False,
                        help="Additional output file in other (non-vcf) formats. The file format "
                             "is inferred from the file path ending. Available file formats "
                             "are: {0}".format(", ".join(["." + k for k in AVAILABLE_FORMATS])))
    parser.add_argument('-s', "--scores", default="diff", nargs="+",
                        help="Scoring method to be used. Only scoring methods selected in the "
                             "model yaml file are available, except for `diff` which is always "
                             "available. Select the scoring function by the `name` tag defined "
                             "in the model yaml file.")
    parser.add_argument('-k', "--score_kwargs", default="", nargs="+",
                        help="JSON definition of the kwargs for the scoring functions selected "
                             "in `--scores`. The definition can either be given as JSON on the "
                             "command line or as the path of a .json file. The individual JSONs "
                             "are expected to be supplied in the same order as the labels "
                             "defined in `--scores`. If the defaults or no arguments should be "
                             "used, define '{}' for that respective scoring method.")
    parser.add_argument('-l', "--seq_length", type=int, default=None,
                        help="Optional parameter: Model input sequence length - necessary if "
                             "the model does not have a pre-defined input sequence length.")
    parser.add_argument('--std_var_id', action="store_true",
                        help="If set, variant IDs in the annotated VCF will be replaced with a "
                             "standardised, unique ID.")
    parser.add_argument("--model_outputs", type=str, default=None, nargs="+",
                        help="Optional parameter: Only return predictions for the selected model "
                             "outputs. Naming according to the definition in "
                             "model.yaml > schema > targets > column_labels")
    parser.add_argument("--model_outputs_i", type=int, default=None, nargs="+",
                        help="Optional parameter: Only return predictions for the selected model "
                             "outputs. Give integer indices of the selected model output(s).")
    parser.add_argument("--singularity", action='store_true',
                        help="Run `kipoi veff score_variants` in the appropriate singularity "
                             "container. Containers will get downloaded to ~/.kipoi/envs/ or to "
                             "$SINGULARITY_CACHEDIR if set")
    args = parser.parse_args(raw_args)

    # OBSOLETE
    # Make sure all the multi-model arguments like source, dataloader etc. fit together
    # _prepare_multi_model_args(args)

    # Check that all the folders exist
    file_exists(args.input_vcf, logger)
    if args.output_vcf is None and args.extra_output is None:
        logger.error("One of the two needs to be specified: --output_vcf or --extra_output")
        sys.exit(1)

    if args.extra_output is not None:
        dir_exists(os.path.dirname(args.extra_output), logger)
        ending = args.extra_output.split('.')[-1]
        if ending not in AVAILABLE_FORMATS:
            logger.error("File ending: {0} for file {1} not from {2}".format(
                ending, args.extra_output, AVAILABLE_FORMATS))
            sys.exit(1)

    # singularity_command
    if args.singularity:
        from kipoi.cli.singularity import singularity_command
        logger.info("Running kipoi veff {} in the singularity container".format(command))
        # Drop the singularity flag
        raw_args = [x for x in raw_args if x != '--singularity']
        dataloader_kwargs = parse_json_file_str_or_arglist(args.dataloader_args)
        # create output files
        output_files = []
        if args.output_vcf is not None:
            output_files.append(args.output_vcf)
        if args.extra_output is not None:
            output_files.append(args.extra_output)
        singularity_command(['kipoi', 'veff', command] + raw_args,
                            model=args.model,
                            dataloader_kwargs=dataloader_kwargs,
                            output_files=output_files,
                            source=args.source,
                            dry_run=False)
        return None

    if not isinstance(args.scores, list):
        args.scores = [args.scores]

    score_kwargs = []
    if len(args.score_kwargs) > 0:
        score_kwargs = args.score_kwargs
        if len(args.scores) >= 1:
            # Check if all scoring functions should be used:
            if args.scores == ["all"]:
                if len(score_kwargs) >= 1:
                    raise ValueError("`--score_kwargs` cannot be defined in combination "
                                     "with `--scores all`!")
            else:
                score_kwargs = [parse_json_file_str(el) for el in score_kwargs]
                if not len(args.scores) == len(score_kwargs):
                    raise ValueError("When defining `--score_kwargs`, a JSON representation of "
                                     "arguments (or the path of a file containing them) must be "
                                     "given for every `--scores` function.")

    # VCF writer
    output_vcf_model = None
    if args.output_vcf is not None:
        dir_exists(os.path.dirname(args.output_vcf), logger)
        output_vcf_model = args.output_vcf

    # Other writers
    if args.extra_output is not None:
        dir_exists(os.path.dirname(args.extra_output), logger)
        extra_output = args.extra_output
        writer = writers.get_writer(extra_output, metadata_schema=None)
        assert writer is not None
        extra_writers = [SyncBatchWriter(writer)]
    else:
        extra_writers = []

    dataloader_arguments = parse_json_file_str_or_arglist(args.dataloader_args)

    # --------------------------------------------
    # load model & dataloader
    model = kipoi.get_model(args.model, args.source)

    if args.dataloader is not None:
        Dl = kipoi.get_dataloader_factory(args.dataloader, args.dataloader_source)
    else:
        Dl = model.default_dataloader

    # Load effect prediction related model info
    model_info = kipoi_veff.ModelInfoExtractor(model, Dl)

    if model_info.use_seq_only_rc:
        logger.info('Model SUPPORTS simple reverse complementation of input DNA sequences.')
    else:
        logger.info('Model DOES NOT support simple reverse complementation of input DNA sequences.')

    if output_vcf_model is not None:
        logger.info('Annotated VCF will be written to %s.' % str(output_vcf_model))

    model_outputs = None
    if args.model_outputs is not None:
        model_outputs = args.model_outputs
    elif args.model_outputs_i is not None:
        model_outputs = args.model_outputs_i

    kipoi_veff.score_variants(model,
                              dataloader_arguments,
                              args.input_vcf,
                              output_vcf=output_vcf_model,
                              output_writers=extra_writers,
                              scores=args.scores,
                              score_kwargs=score_kwargs,
                              num_workers=args.num_workers,
                              batch_size=args.batch_size,
                              seq_length=args.seq_length,
                              std_var_id=args.std_var_id,
                              restriction_bed=args.restriction_bed,
                              return_predictions=False,
                              model_outputs=model_outputs)

    logger.info('Successfully predicted samples')
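# Example invocation of `score_variants` (hedged sketch; the model, VCF paths,
# and dataloader arguments are placeholders, and the available `-s` score names
# depend on the model yaml):
#
#   kipoi veff score_variants DeepSEA/variantEffects \
#       --dataloader_args='{"fasta_file": "hg19.fa"}' \
#       -i input.vcf -o annotated.vcf \
#       -s diff --batch_size=32
#
# At least one of `-o/--output_vcf` or `-e/--extra_output` must be supplied,
# matching the check at the top of the function.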