def flatten_batch(batch, nested_sep="/"):
    """Convert a nested batch of numpy arrays into a dictionary of
    1-dimensional numpy arrays.

    Args:
        batch: batch of data
        nested_sep: separator to use when flattening the nested dictionary
            structure into a single key

    Returns:
        A dictionary of 1-dimensional numpy arrays.
    """
    def array2array_dict(arr):
        """Convert a numpy array into a dictionary of numpy arrays

        >>> arr = np.arange(9).reshape((1, 3, 3))
        >>> assert array2array_dict(arr)["0"]["1"][0] == arr[:, 0, 1][0]
        """
        if isinstance(arr, np.ndarray):
            if arr.ndim <= 1:
                return arr
            else:
                return collections.OrderedDict(
                    [(str(i), array2array_dict(arr[:, i]))
                     for i in range(arr.shape[1])])
        elif isinstance(arr, pd.DataFrame):
            # iterate over columns; each column becomes a 1-d numpy array
            return {k: v.values for k, v in arr.items()}
        else:
            raise ValueError("Unknown data type")

    return flatten(map_nested(batch, array2array_dict), separator=nested_sep)
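# A minimal usage sketch for flatten_batch (hypothetical data). It assumes the
# function above is importable together with its kipoi-style `flatten` and
# `map_nested` helpers; a (3, 2) array under key "a" is split into the 1-d
# columns "a/0" and "a/1", while the already 1-d "b" is kept as-is.
def example_flatten_batch():
    batch = {"a": np.arange(6).reshape((3, 2)), "b": np.ones(3)}
    flat = flatten_batch(batch, nested_sep="/")
    assert sorted(flat) == ["a/0", "a/1", "b"]
    assert (flat["a/0"] == np.array([0, 2, 4])).all()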
def batch_write(self, batch):
    fbatch = flatten(batch, separator="/")

    batch_sizes = [fbatch[k].shape[0] for k in fbatch]
    # assert that all arrays share the same batch size
    assert len(pd.Series(batch_sizes).unique()) == 1
    batch_size = batch_sizes[0]

    if self.first_pass:
        # create one resizable dataset per flattened key
        for k in fbatch:
            if fbatch[k].dtype.type in [np.string_, np.str_, np.unicode_]:
                dtype = self.string_type
            else:
                dtype = fbatch[k].dtype

            self.f.create_dataset(k,
                                  shape=(0,) + fbatch[k].shape[1:],
                                  dtype=dtype,
                                  maxshape=(None,) + fbatch[k].shape[1:],
                                  compression=self.compression,
                                  chunks=(self.chunk_size,) + fbatch[k].shape[1:])
        self.first_pass = False

    # add data to the buffer
    if self.write_buffer is None:
        self.write_buffer = fbatch
        self.write_buffer_size = batch_size
    else:
        self.write_buffer = numpy_collate_concat(
            [self.write_buffer, fbatch])
        self.write_buffer_size += batch_size
    if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
        self._flush_buffer()
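# Usage sketch for the buffered HDF5 writer above (hypothetical class name
# `HDF5BatchWriter` and constructor arguments; the real interface may differ).
# Batches are accumulated in memory and flushed to disk in chunk-sized writes:
#
#   writer = HDF5BatchWriter("preds.h5", chunk_size=1024)
#   for batch in batch_iterator:
#       writer.batch_write(batch)
#   writer.close()  # flushes the remaining buffer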
def evaluate(self, batch_size=256, shuffle=True, num_workers=8, save=True):
    """Evaluate the model on the validation set

    Args:
        batch_size: batch size used when loading the validation data
        shuffle: if True, shuffle the validation data
        num_workers: number of parallel workers used for data loading
        save: if True, write the metrics to `self.evaluation_path`
    """
    print("Started loading validation dataset")
    X_valid, y_valid = self.valid_dataset.load_all(batch_size=batch_size,
                                                   num_workers=num_workers)
    # Alternative: evaluate on a single batch only
    # it = self.valid_dataset.batch_train_iter(batch_size=batch_size,
    #                                          shuffle=shuffle,
    #                                          num_workers=num_workers)
    # X_valid, y_valid = next(it)
    print("Finished loading validation dataset")

    metric_res = self.model.score(X_valid, y_valid)
    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        if self.cometml_experiment:
            self.cometml_experiment.log_multiple_metrics(flatten(metric_res),
                                                         prefix="best/")
    return metric_res
def __init__(self, file_path, metadata_schema, header=True):
    """
    Args:
        file_path (str): File path of the output tsv file
        metadata_schema: Metadata schema of the dataloader. Used to find the
            genomic ranges object
        header (bool): if True, write a header to the output file
    """
    self.file_path = file_path
    self.header = header
    self.first_pass = True

    f_dl_schema = flatten(metadata_schema)
    range_keys = [
        "metadata/" + k for k in f_dl_schema
        if f_dl_schema[k].type == MetadataType.GENOMIC_RANGES
    ]
    if len(range_keys) > 1:
        raise ValueError(
            "Found multiple genomic ranges in metadata: {0}. For writing to "
            "the bed file exactly one genomic range has to exist".format(
                range_keys))
    elif len(range_keys) == 0:
        raise ValueError(
            "Found no genomic ranges in metadata. For writing to the "
            "bed file exactly one genomic range has to exist")
    self.ranges_key = range_keys[0]
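# A sketch of the metadata schema this writer expects (hypothetical names and
# values; kipoi-style metadata where exactly one entry has type
# MetadataType.GENOMIC_RANGES):
#
#   metadata_schema = {"ranges": MetadataStruct(type=MetadataType.GENOMIC_RANGES,
#                                               doc="variant position")}
#   writer = BedBatchWriter("out.tsv", metadata_schema)
#   # writer.ranges_key == "metadata/ranges"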
def test_unflatten_with_list_issue15():
    """https://github.com/amirziai/flatten/issues/15"""
    dic = {
        "Required": {
            "a": "1",
            "b": ["1", "2", "3"],
            "c": {
                "d": {
                    "e": [[{"s1": 1}, {"s2": 2}],
                          [{"s3": 1}, {"s4": 2}]]
                }
            },
            "f": ["1", "2"]
        },
        "Optional": {
            "x": "1",
            "y": ["1", "2", "3"]
        }
    }
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic
def test_blog_example():
    dic = {"a": 1, "b": 2, "c": [{"d": ['2', 3, 4], "e": [{"f": 1, "g": 2}]}]}
    expected = {
        'a': 1,
        'b': 2,
        'c_0_d_0': '2',
        'c_0_d_1': 3,
        'c_0_d_2': 4,
        'c_0_e_0_f': 1,
        'c_0_e_0_g': 2
    }
    actual = flatten(dic)
    assert actual == expected
def test_list_and_dict():
    dic = {'a': 1, 'b': 2, 'c': [{'d': [2, 3, 4], 'e': [{'f': 1, 'g': 2}]}]}
    expected = {
        'a': 1,
        'b': 2,
        'c_0_d_0': 2,
        'c_0_d_1': 3,
        'c_0_d_2': 4,
        'c_0_e_0_f': 1,
        'c_0_e_0_g': 2
    }
    actual = flatten(dic)
    assert actual == expected
def test_flatten_dict(nested_dict):
    fd = flatten(nested_dict)
    assert dict(fd) == {
        'a': 1,
        'b_c': 3,
        'b_d_0': 1,
        'b_d_1': 2,
        'b_d_2': 3,
        'b_e_0_f': 1,
        'b_e_1_g': 4
    }
    assert unflatten_list(fd) == dict(nested_dict)
def parse_schema(schema):
    """Parse a model schema by removing unneeded fields and reordering them."""
    if isinstance(schema, kipoi.components.ArraySchema):
        return {"list": [schema], "type": "Single numpy array"}
    elif isinstance(schema, list):
        return {"list": schema, "type": "List of numpy arrays"}
    elif isinstance(schema, dict):
        flattened_schema = flatten_json.flatten(dd=schema, separator='/')
        schema_list = []
        for key, value in flattened_schema.items():
            value = value.get_config()
            value["name"] = key
            schema_list.append(value)
        return {"list": schema_list, "type": "Dictionary of numpy arrays"}
    else:
        raise ValueError("Unknown schema type: {0}".format(type(schema)))
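# A usage sketch for parse_schema (hypothetical schema values). For a nested
# dictionary of ArraySchema objects, the keys are flattened with "/" and each
# entry is turned into its config dict plus a "name" field:
#
#   schema = {"seq": ArraySchema(shape=(101, 4), doc="one-hot DNA sequence")}
#   parse_schema(schema)
#   # -> {"list": [{"shape": (101, 4), "doc": "...", "name": "seq"}],
#   #     "type": "Dictionary of numpy arrays"}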
def test_unflatten_with_list_deep():
    dic = {
        'a': [{
            'b': [{
                'c': [{
                    'a': 5,
                    'b': {
                        'a': [1, 2, 3]
                    },
                    'c': {
                        'x': 3
                    }
                }]
            }]
        }]
    }
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic
def __init__(self, file_path, metadata_schema, header=True):
    self.file_path = file_path
    self.header = header
    self.first_pass = True

    f_dl_schema = flatten(metadata_schema)
    range_keys = [
        "metadata/" + k for k in f_dl_schema
        if f_dl_schema[k].type == MetadataType.GENOMIC_RANGES
    ]
    if len(range_keys) > 1:
        raise ValueError(
            "Found multiple genomic ranges in metadata: {0}. For writing to "
            "the bed file exactly one genomic range has to exist".format(
                range_keys))
    elif len(range_keys) == 0:
        raise ValueError(
            "Found no genomic ranges in metadata. For writing to the "
            "bed file exactly one genomic range has to exist")
    self.ranges_key = range_keys[0]
def evaluate(self, metric, scaler_path=None, eval_type=np.float32, save=True):
    """Evaluate the model on the validation set

    Args:
        metric: a function accepting (y_true, y_pred) and returning the
            evaluation metric(s)
        scaler_path: optional path to a pickled scaler applied to X before
            evaluation
        eval_type: dtype used for evaluation
        save: if True, write the metrics to `self.evaluation_path`
    """
    print("Started loading validation dataset")
    X_valid, y_valid = self.valid_dataset.load_all()

    if scaler_path:
        scaler = load_pickle(scaler_path)
        print("Started scaling X.")
        X_infl = X_valid.astype(np.float32)
        X_infl = scaler.transform(X_infl)
        if eval_type is not np.float32:
            X_valid = X_infl.astype(np.float16)
            # clip to stay below the float16 maximum (~65504) and avoid inf
            if isinstance(X_valid, csr_matrix):
                X_valid.data = np.minimum(X_valid.data, 65500)
            else:
                X_valid = np.minimum(X_valid, 65500)
        else:
            X_valid = X_infl
        del X_infl
        print("Finished scaling X.")

    print("Finished loading validation dataset. Shape:", X_valid.shape,
          "Fraction of positives:", y_valid.sum() / y_valid.shape[0])

    y_pred = self.model.predict(X_valid)
    metric_res = metric(y_valid, y_pred)
    print("metric_res", metric_res, np.amax(X_valid))
    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        if self.cometml_experiment:
            self.cometml_experiment.log_multiple_metrics(flatten(metric_res),
                                                         prefix="best/")
    return metric_res
def batch_write(self, batch):
    """Write a batch of data to the HDF5 file

    # Arguments
        batch: batch of data. Either a single `np.array` or a list/dict thereof.
    """
    fbatch = flatten(batch, separator="/")

    batch_sizes = [fbatch[k].shape[0] for k in fbatch]
    # assert that all arrays share the same batch size
    assert len(pd.Series(batch_sizes).unique()) == 1
    batch_size = batch_sizes[0]

    if self.first_pass:
        # create one resizable dataset per flattened key
        for k in fbatch:
            if fbatch[k].dtype.type in [np.string_, np.str_, np.unicode_]:
                dtype = self.string_type
            else:
                dtype = fbatch[k].dtype

            self.f.create_dataset(k,
                                  shape=(0,) + fbatch[k].shape[1:],
                                  dtype=dtype,
                                  maxshape=(None,) + fbatch[k].shape[1:],
                                  compression=self.compression,
                                  chunks=(self.chunk_size,) + fbatch[k].shape[1:])
        self.first_pass = False

    # add data to the buffer
    if self.write_buffer is None:
        self.write_buffer = [fbatch]
        self.write_buffer_size = batch_size
    else:
        self.write_buffer.append(fbatch)
        self.write_buffer_size += batch_size
    if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
        self._flush_buffer()
def evaluate(self, batch_size=256, shuffle=True, num_workers=8, save=True):
    """Evaluate the model on the validation set

    Args:
        batch_size: batch size used for the validation iterators
        shuffle: if True, shuffle the validation data
        num_workers: number of parallel workers
        save: if True, write the metrics to `self.evaluation_path`
    """
    print("Started loading validation dataset")
    iterators = [d.batch_iter(batch_size) for d in self.valid_datasets]

    metric_res_b = []
    print("Loading and evaluating")
    for batch_num in tqdm(range(len(self.valid_datasets[0]))):
        batch = None
        for it in iterators:
            # Concatenate features from all kipoi datasets. We assume that
            # the variants have been curated, i.e. they appear in exactly
            # the same order in every dataset.
            if batch is None:
                batch = next(it)
            else:
                batch = np.concatenate([batch, next(it)], axis=1)
        X_batch, y_batch = batch[:, 1:], batch[:, 0]
        metric_res_b.append(self.model.test_on_batch(X_batch, y_batch))
    metric_res = np.average(metric_res_b)

    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        if self.cometml_experiment:
            self.cometml_experiment.log_multiple_metrics(flatten(metric_res),
                                                         prefix="best/")
    return metric_res
def evaluate(self,
             metric,
             batch_size=256,
             num_workers=8,
             eval_train=False,
             eval_skip=[],
             save=True,
             **kwargs):
    """Evaluate the model on the validation set

    Args:
        metric: a function accepting (y_true, y_pred) and returning the
            evaluation metric(s)
        batch_size: batch size used for evaluation
        num_workers: number of parallel workers used for data loading
        eval_train: if True, also compute the evaluation metrics on the
            training set
        eval_skip: list of dataset names to skip during evaluation
        save: save the json file to the output directory
    """
    if len(kwargs) > 0:
        logger.warning(
            f"Extra kwargs were provided to trainer.evaluate(): {kwargs}")
    # Save the complete model -> HACK
    self.seq_model.save(os.path.join(self.output_dir, 'seq_model.pkl'))

    # construct a list of datasets to evaluate
    if eval_train:
        eval_datasets = [('train', self.train_dataset)] + self.valid_dataset
    else:
        eval_datasets = self.valid_dataset

    # skip some datasets for evaluation
    try:
        if len(eval_skip) > 0:
            logger.info(f"Using eval_skip: {eval_skip}")
            eval_datasets = [(k, v) for k, v in eval_datasets
                             if k not in eval_skip]
    except (TypeError, ValueError):
        logger.warning(
            f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}")

    metric_res = OrderedDict()
    for d in eval_datasets:
        if len(d) == 2:
            dataset_name, dataset = d
            eval_metric = None  # Ignore the provided metric
        elif len(d) == 3:
            # a specialized evaluation metric was passed
            dataset_name, dataset, eval_metric = d
        else:
            raise ValueError(
                "Valid dataset needs to be a list of tuples of 2 or 3 elements "
                "(name, dataset) or (name, dataset, metric)")
        logger.info(f"Evaluating dataset: {dataset_name}")
        metric_res[dataset_name] = self.seq_model.evaluate(
            dataset,
            eval_metric=eval_metric,
            num_workers=num_workers,
            batch_size=batch_size)

    if save:
        write_json(metric_res, self.evaluation_path, indent=2)
        logger.info("Saved metrics to {}".format(self.evaluation_path))

    if self.cometml_experiment is not None:
        self.cometml_experiment.log_multiple_metrics(
            flatten(metric_res, separator='/'), prefix="eval/")

    if self.wandb_run is not None:
        self.wandb_run.summary.update(
            flatten(prefix_dict(metric_res, prefix="eval/"), separator='/'))

    return metric_res
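# A wiring sketch for the trainer's evaluate() above (hypothetical dataset
# objects). `valid_dataset` holds (name, dataset) or (name, dataset, metric)
# tuples, and `eval_skip` filters datasets by name:
#
#   trainer.valid_dataset = [("valid", valid_ds),
#                            ("test", test_ds, specialized_metric)]
#   res = trainer.evaluate(metric=None, eval_skip=["test"])  # evaluates "valid" only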
def test_list():
    dic = {'a': 1, 'b': [{'c': [2, 3]}]}
    expected = {'a': 1, 'b_0_c_0': 2, 'b_0_c_1': 3}
    actual = flatten(dic)
    assert actual == expected
def test_custom_separator():
    dic = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    expected = {'a': '1', 'b': '2', 'c*c1': '3', 'c*c2': '4'}
    actual = flatten(dic, '*')
    assert actual == expected
def test_one_flatten_utf8_dif():
    a = {u'eñe': 1}
    info = dict(info=a)
    expected = {u'info_{}'.format(u'eñe'): 1}
    actual = flatten(info)
    assert actual == expected
def test_one_flatten_utf8():
    dic = {'a': '1', u'ñ': u'áéö', 'c': {u'c1': '3', 'c2': '4'}}
    expected = {'a': '1', u'ñ': u'áéö', 'c_c1': '3', 'c_c2': '4'}
    actual = flatten(dic)
    assert actual == expected
def test_one_flatten():
    dic = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    expected = {'a': '1', 'b': '2', 'c_c1': '3', 'c_c2': '4'}
    actual = flatten(dic)
    assert actual == expected
def test_no_flatten():
    dic = {'a': '1', 'b': '2', 'c': 3}
    expected = dic
    actual = flatten(dic)
    assert actual == expected
def test_unflatten_with_list_nested():
    dic = {"a": [[{"b": 1}], [{"d": 1}]]}
    dic_flatten = flatten(dic)
    actual = unflatten_list(dic_flatten)
    assert actual == dic