def test_HDF5BatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.h5"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = HDF5BatchWriter(tmpfile, chunk_size=4)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    with HDF5Reader(tmpfile) as f:
        assert np.all(list(f.batch_iter(2))[0]['metadata']['gene_id'] ==
                      dl_batch['metadata']['gene_id'][:2])
        out = f.load_all()
        assert np.all(out['metadata']['gene_id'] == np.concatenate(
            [dl_batch['metadata']['gene_id'],
             dl_batch['metadata']['gene_id']]))
        assert np.all(out['metadata']['ranges']["chr"] == np.concatenate(
            [dl_batch['metadata']['ranges']['chr'],
             dl_batch['metadata']['ranges']['chr']]))
        assert np.all(out['metadata']['ranges']["start"] == np.concatenate(
            [dl_batch['metadata']['ranges']['start'],
             dl_batch['metadata']['ranges']['start']]))
        assert np.all(out['preds'][:3] == pred_batch_array)
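# A minimal round-trip sketch of the HDF5 writer/reader pair exercised above,
# usable outside of pytest. It assumes only the kipoi.writers.HDF5BatchWriter /
# kipoi.readers.HDF5Reader interfaces the test relies on; the batch layout
# below (nested dict of numpy arrays) is a hypothetical stand-in for a real
# dataloader batch.
def hdf5_roundtrip_sketch(path="out.h5"):
    import numpy as np
    from kipoi.writers import HDF5BatchWriter
    from kipoi.readers import HDF5Reader

    batch = {'preds': np.arange(6).reshape(3, 2),
             'metadata': {'gene_id': np.array([b'g1', b'g2', b'g3'])}}
    writer = HDF5BatchWriter(path, chunk_size=4)
    writer.batch_write(batch)  # appends one batch to the file
    writer.close()
    with HDF5Reader(path) as f:
        out = f.load_all()  # loads all written batches back into memory
    return out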
def predict_example(self, batch_size=32, output_file=None, **kwargs):
    """Run model prediction for the example file

    # Arguments
        batch_size: batch size used for the dataloader
        output_file: if not None, inputs and predictions are stored to `output_file` path
        **kwargs: Further arguments passed to batch_iter
    """
    logger.info('Initialized data generator. Running batches...')
    from kipoi.writers import get_writer
    from kipoi.cli.main import prepare_batch

    keep_metadata = bool(kwargs.get('keep_metadata', False))
    if output_file is not None:
        output_file = os.path.abspath(output_file)
        if os.path.exists(output_file):
            raise ValueError("Output file: {} already exists.".format(output_file))
    with cd(self.dataloader_cls.source_dir):
        # init the dataloader
        dl = self.dataloader_cls.init_example()
        logger.info('Returned data schema correct')
        if output_file is not None:
            writer = get_writer(output_file, dl.get_output_schema().metadata, **kwargs)

        it = dl.batch_iter(batch_size=batch_size)
        # test that all predictions go through
        pred_list = []
        for i, batch in enumerate(tqdm(it)):
            if i == 0 and not self.dataloader_cls.get_output_schema().compatible_with_batch(batch):
                logger.warning("First batch of data is not compatible with the dataloader schema.")
            pred_batch = self.model.predict_on_batch(batch['inputs'])
            if keep_metadata and 'metadata' in batch:
                pred_list.append({'preds': pred_batch,
                                  'metadata': batch['metadata']})
            else:
                pred_list.append(pred_batch)
            if output_file is not None:
                output_batch = prepare_batch(batch, pred_batch,
                                             keep_inputs=True,
                                             keep_metadata=keep_metadata)
                writer.batch_write(output_batch)
        if output_file is not None:
            writer.close()
    logger.info('predict_example done!')
    return numpy_collate_concat(pred_list)
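# Usage sketch for predict_example. Assumption: the method is reached through a
# model's `.pipeline` handle, which is consistent with the `self.model` /
# `self.dataloader_cls` attributes it uses; "example/model" is a hypothetical
# model directory, not a real path from this repo.
def predict_example_usage_sketch():
    import kipoi
    model = kipoi.get_model("example/model", source="dir")
    # Runs the dataloader's bundled example files end to end, optionally
    # streaming inputs and predictions to an HDF5 file along the way.
    preds = model.pipeline.predict_example(batch_size=16,
                                           output_file="example_preds.h5",
                                           keep_metadata=True)
    return preds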
def get_example_data(example, layer, writer=None):
    example_dir = "examples/{0}".format(example)
    if INSTALL_REQ:
        install_model_requirements(example_dir, "dir", and_dataloaders=True)
    model = kipoi.get_model(example_dir, source="dir")
    # The preprocessor
    Dataloader = kipoi.get_dataloader_factory(example_dir, source="dir")

    with open(example_dir + "/example_files/test.json", "r") as ifh:
        dataloader_arguments = json.load(ifh)
    for k in dataloader_arguments:
        dataloader_arguments[k] = "example_files/" + dataloader_arguments[k]

    outputs = []
    with cd(model.source_dir):
        dl = Dataloader(**dataloader_arguments)
        it = dl.batch_iter(batch_size=32, num_workers=0)
        # Loop through the data, make predictions, save the output
        for i, batch in enumerate(tqdm(it)):
            # make the prediction
            pred_batch = model.input_grad(batch['inputs'], avg_func="sum",
                                          layer=layer, final_layer=False)
            # write out the predictions, metadata (, inputs, targets)
            # always keep the inputs so that input*grad can be generated!
            output_batch = prepare_batch(batch, pred_batch, keep_inputs=True)
            if writer is not None:
                writer.batch_write(output_batch)
            outputs.append(output_batch)
    if writer is not None:
        writer.close()
    return numpy_collate(outputs)
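# Sketch of pairing get_example_data with a streaming writer so the per-batch
# gradients land on disk as they are computed instead of only being collated in
# memory. Assumes the HDF5BatchWriter used elsewhere in this suite; "tal" and
# "conv1d" are hypothetical example/layer names.
def save_example_grads_sketch():
    from kipoi.writers import HDF5BatchWriter
    writer = HDF5BatchWriter("grads.h5")
    # get_example_data closes the writer itself after the last batch.
    return get_example_data("tal", "conv1d", writer=writer)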
def test_BedBatchWriter(dl_batch, pred_batch_array, metadata_schema, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.tsv"))
    writer = BedBatchWriter(tmpfile, metadata_schema=metadata_schema)
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    df = pd.read_csv(tmpfile, sep="\t")
    assert list(df.columns) == [
        'chr', 'start', 'end', 'name', 'score', 'strand',
        'preds/0', 'preds/1', 'preds/2'
    ]
    assert list(df['name']) == [0, 1, 2, 0, 1, 2]
def test_TsvBatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.tsv"))
    writer = TsvBatchWriter(tmpfile)
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    df = pd.read_csv(tmpfile, sep="\t")
    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == [0, 1, 2, 0, 1, 2]
def test_ParquetBatchWriter_array(dl_batch, pred_batch_array, tmpdir):
    tmpfile = str(tmpdir.mkdir("example").join("out.pq"))
    writer = ParquetBatchWriter(tmpfile)
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    df = pd.read_parquet(tmpfile, engine='fastparquet')
    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == ['0', '1', '2', '0', '1', '2']
def test_MultipleBatchWriter(dl_batch, pred_batch_array, tmpdir):
    tmpdir = tmpdir.mkdir("example")
    h5_tmpfile = str(tmpdir.join("out.h5"))
    tsv_tmpfile = str(tmpdir.join("out.tsv"))
    batch = prepare_batch(dl_batch, pred_batch_array)
    writer = MultipleBatchWriter([TsvBatchWriter(tsv_tmpfile),
                                  HDF5BatchWriter(h5_tmpfile)])
    writer.batch_write(batch)
    writer.batch_write(batch)
    writer.close()
    assert os.path.exists(h5_tmpfile)
    assert os.path.exists(tsv_tmpfile)
    df = pd.read_csv(tsv_tmpfile, sep="\t")
    assert set(list(df.columns)) == {
        'metadata/ranges/id', 'metadata/ranges/strand', 'metadata/ranges/chr',
        'metadata/ranges/start', 'metadata/ranges/end', 'metadata/gene_id',
        'preds/0', 'preds/1', 'preds/2'
    }
    assert list(df['metadata/ranges/id']) == [0, 1, 2, 0, 1, 2]
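# Sketch of the fan-out pattern the test above exercises: MultipleBatchWriter
# forwards each batch_write()/close() call to every child writer, so a single
# prediction pass can emit several output formats at once. Uses only the writer
# classes already imported by this test module; `batch` is assumed to come from
# prepare_batch as in the tests.
def multi_format_writer_sketch(batch):
    from kipoi.writers import MultipleBatchWriter, TsvBatchWriter, HDF5BatchWriter
    writer = MultipleBatchWriter([TsvBatchWriter("out.tsv"),
                                  HDF5BatchWriter("out.h5")])
    writer.batch_write(batch)  # one call updates both files
    writer.close()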
def predict_to_file(self, output_file, dataloader_kwargs, batch_size=32,
                    keep_inputs=False, keep_metadata=False, **kwargs):
    """Make predictions and write them iteratively to a file

    # Arguments
        output_file: output file path. File format is inferred from the file path ending.
            Available file formats are: 'bed', 'h5', 'hdf5', 'tsv'
        dataloader_kwargs: Keyword arguments passed to the dataloader
        batch_size: Batch size used for the dataloader
        keep_inputs: if True, inputs and targets will also be written to the output file.
        keep_metadata: if True, metadata will also be written to the output file.
        **kwargs: Further arguments passed to batch_iter
    """
    from kipoi.writers import get_writer
    from kipoi.cli.main import prepare_batch

    # setup dataloader
    validate_kwargs(self.dataloader_cls, dataloader_kwargs)
    dl = self.dataloader_cls(**dataloader_kwargs)
    it = dl.batch_iter(batch_size=batch_size, **kwargs)
    writer = get_writer(output_file, dl.get_output_schema().metadata, **kwargs)

    for i, batch in enumerate(tqdm(it)):
        if i == 0 and not self.dataloader_cls.get_output_schema().compatible_with_batch(batch):
            logger.warning("First batch of data is not compatible with the dataloader schema.")
        pred_batch = self.model.predict_on_batch(batch['inputs'])
        output_batch = prepare_batch(batch, pred_batch,
                                     keep_inputs=keep_inputs,
                                     keep_metadata=keep_metadata)
        writer.batch_write(output_batch)
    writer.close()
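# Usage sketch for predict_to_file (same assumption as above: the method is
# reached via a model's `.pipeline` handle). The model directory and the
# dataloader kwargs are hypothetical placeholders; the output format is picked
# from the file suffix, so ".tsv" here selects the TSV writer.
def predict_to_file_usage_sketch():
    import kipoi
    model = kipoi.get_model("example/model", source="dir")
    model.pipeline.predict_to_file("preds.tsv",
                                   dataloader_kwargs={"fasta_file": "hg19.fa"},
                                   batch_size=64,
                                   keep_metadata=True)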