Example #1
0
def flatten_batch(batch, nested_sep="/"):
    """Convert the nested batch of numpy arrays into a dictionary of 1-dimensional numpy arrays

    Every leaf of `batch` is first expanded by `array2array_dict` (applied
    through `map_nested`), and the resulting nested structure is then collapsed
    into a single-level dictionary by `flatten`.

    Args:
      batch: batch of data
      nested_sep: What separator to use for flattening the nested dictionary structure
          into a single key

    Returns:
      A dictionary of 1-dimensional numpy arrays.

    Raises:
      ValueError: if a leaf is neither a `np.ndarray` nor a `pd.DataFrame`.
    """
    def array2array_dict(arr):
        """Convert a numpy array (or DataFrame) into a dictionary of numpy arrays

        Arrays with at most one dimension are returned unchanged. Higher
        dimensional arrays are split recursively along axis 1, keyed by the
        stringified index. A DataFrame is split into one array per column.

        >>> arr = np.arange(9).reshape((1, 3, 3))
        >>> assert array2array_dict(arr)["0"]["1"][0] == arr[:, 0, 1][0]
        """
        if isinstance(arr, np.ndarray):
            if arr.ndim <= 1:
                return arr
            # Peel off axis 1 one index at a time; axis 0 (the batch axis) is kept.
            return collections.OrderedDict(
                (str(i), array2array_dict(arr[:, i]))
                for i in range(arr.shape[1]))
        if isinstance(arr, pd.DataFrame):
            # `to_dict("records")` returns a list of rows, which has no
            # `.items()`; iterate over (column, Series) pairs instead.
            return {k: v.values for k, v in arr.items()}
        raise ValueError("Unknown data type")

    return flatten(map_nested(batch, array2array_dict),
                   separator=nested_sep)
Example #2
0
    def batch_write(self, batch):
        """Add a batch to the write buffer and flush once a full chunk is buffered.

        On the first call one resizable dataset per flattened key is created on
        `self.f`. Every call then merges the flattened batch into
        `self.write_buffer` and triggers `self._flush_buffer()` as soon as at
        least `self.chunk_size` rows are buffered.

        Args:
          batch: nested batch of arrays; it is flattened with "/" as the key
            separator. All arrays must share the same first dimension.

        Raises:
          AssertionError: if the flattened arrays do not all have the same
            first dimension.
        """
        fbatch = flatten(batch, separator="/")

        batch_sizes = [fbatch[k].shape[0] for k in fbatch]
        # all arrays have to agree on the leading (batch) dimension
        assert len(set(batch_sizes)) == 1, \
            "all arrays in the batch need the same first dimension"
        batch_size = batch_sizes[0]

        if self.first_pass:
            # create one growable dataset per flattened key
            for k in fbatch:
                trailing_shape = fbatch[k].shape[1:]
                # np.string_ / np.unicode_ were removed in NumPy 2.0;
                # np.bytes_ / np.str_ are the same types under their current names.
                if fbatch[k].dtype.type in (np.bytes_, np.str_):
                    dtype = self.string_type
                else:
                    dtype = fbatch[k].dtype

                self.f.create_dataset(k,
                                      shape=(0, ) + trailing_shape,
                                      dtype=dtype,
                                      maxshape=(None, ) + trailing_shape,
                                      compression=self.compression,
                                      chunks=(self.chunk_size, ) +
                                      trailing_shape)
            self.first_pass = False

        # add data to the buffer
        if self.write_buffer is None:
            self.write_buffer = fbatch
            self.write_buffer_size = batch_size
        else:
            self.write_buffer = numpy_collate_concat(
                [self.write_buffer, fbatch])
            self.write_buffer_size += batch_size

        if self.write_buffer is not None and self.write_buffer_size >= self.chunk_size:
            self._flush_buffer()
Example #3
0
    def evaluate(self, batch_size=256, shuffle=True, num_workers=8, save=True):
        """Score the model on the fully loaded validation set.

        Args:
          batch_size: forwarded to `self.valid_dataset.load_all`
          shuffle: accepted but not used by this method
          num_workers: forwarded to `self.valid_dataset.load_all`
          save: if True, write the metrics as JSON to `self.evaluation_path`

        Returns:
          The value returned by `self.model.score` for the validation data.
        """
        print("Started loading validation dataset")
        loaded = self.valid_dataset.load_all(batch_size=batch_size,
                                             num_workers=num_workers)
        X_valid, y_valid = loaded
        print("Finished loading validation dataset")

        metric_res = self.model.score(X_valid, y_valid)

        if save:
            write_json(metric_res, self.evaluation_path, indent=2)

        experiment = self.cometml_experiment
        if experiment:
            experiment.log_multiple_metrics(flatten(metric_res), prefix="best/")

        return metric_res
Example #4
0
    def __init__(self, file_path, metadata_schema, header=True):
        """Store the output settings and locate the single genomic-ranges field.

        Args:
          file_path (str): File path of the output file
          metadata_schema: Nested metadata schema. It is flattened and every
            entry's `.type` is compared against `MetadataType.GENOMIC_RANGES`.
          header: stored on the instance as `self.header`
            (presumably whether a header line is written - confirm against the writer)

        Raises:
          ValueError: if the schema contains zero or more than one genomic
            ranges entry.
        """
        self.file_path = file_path
        self.header = header
        self.first_pass = True

        f_dl_schema = flatten(metadata_schema)
        range_keys = [
            "metadata/" + k for k in f_dl_schema
            if f_dl_schema[k].type == MetadataType.GENOMIC_RANGES
        ]
        if len(range_keys) > 1:
            # Parenthesise the concatenated literals so that .format() applies
            # to the whole message; otherwise "{0}" is never substituted.
            raise ValueError(
                ("Found multiple genomic ranges in metadata: {0}. For writing to the "
                 "bed file exactly one genomic range has to exist").format(
                     range_keys))
        elif len(range_keys) == 0:
            raise ValueError(
                "Found no genomic ranges in metadata. For writing to the " +
                "bed file exactly one genomic range has to exist")
        self.ranges_key = range_keys[0]
Example #5
0
def test_unflatten_with_list_issue15():
    """Round-trip regression test for https://github.com/amirziai/flatten/issues/15"""
    list_of_lists = [[{"s1": 1}, {"s2": 2}], [{"s3": 1}, {"s4": 2}]]
    required = {
        "a": "1",
        "b": ["1", "2", "3"],
        "c": {"d": {"e": list_of_lists}},
        "f": ["1", "2"],
    }
    optional = {"x": "1", "y": ["1", "2", "3"]}
    original = {"Required": required, "Optional": optional}

    # flatten followed by unflatten_list has to reproduce the input exactly
    round_tripped = unflatten_list(flatten(original))
    assert round_tripped == original
Example #6
0
def test_blog_example():
    """Flatten a dict mixing scalars, a list of values and a list of dicts."""
    nested = {"a": 1, "b": 2, "c": [{"d": ['2', 3, 4], "e": [{"f": 1, "g": 2}]}]}
    flat = dict(
        a=1,
        b=2,
        c_0_d_0='2',
        c_0_d_1=3,
        c_0_d_2=4,
        c_0_e_0_f=1,
        c_0_e_0_g=2,
    )
    assert flatten(nested) == flat
Example #7
0
def test_list_and_dict():
    """List indices and dict keys are both joined into the flattened key."""
    inner = {'d': [2, 3, 4], 'e': [{'f': 1, 'g': 2}]}
    nested = {'a': 1, 'b': 2, 'c': [inner]}
    flat = dict(
        a=1,
        b=2,
        c_0_d_0=2,
        c_0_d_1=3,
        c_0_d_2=4,
        c_0_e_0_f=1,
        c_0_e_0_g=2,
    )
    assert flatten(nested) == flat
Example #8
0
def test_flatten_dict(nested_dict):
    """Check the flattened form of the fixture and that it can be restored."""
    flat = flatten(nested_dict)
    expected_flat = dict(
        a=1,
        b_c=3,
        b_d_0=1,
        b_d_1=2,
        b_d_2=3,
        b_e_0_f=1,
        b_e_1_g=4,
    )

    assert dict(flat) == expected_flat
    # unflattening has to give back the original fixture
    assert unflatten_list(flat) == dict(nested_dict)
Example #9
0
def parse_schema(schema):
    """Normalise a model schema into a ``{"list": ..., "type": ...}`` description.

    A single `ArraySchema` is wrapped in a one-element list, a list is passed
    through as-is, and a dict is flattened with "/" as separator, after which
    each entry is replaced by its `get_config()` result extended with a
    "name" key holding the flattened key. Any other input yields None.
    """
    if isinstance(schema, kipoi.components.ArraySchema):
        return {"list": [schema], "type": "Single numpy array"}

    if isinstance(schema, list):
        return {"list": schema, "type": "List of numpy arrays"}

    if isinstance(schema, dict):
        def named_config(name, entry):
            # config of one schema entry, tagged with its flattened key
            config = entry.get_config()
            config["name"] = name
            return config

        flat = flatten_json.flatten(dd=schema, separator='/')
        entries = [named_config(name, entry) for name, entry in flat.items()]
        return {"list": entries, "type": "Dictionary of numpy arrays"}

    return None
Example #10
0
def test_unflatten_with_list_deep():
    """Round-trip a dict nested three single-element lists deep."""
    leaf = {'a': 5, 'b': {'a': [1, 2, 3]}, 'c': {'x': 3}}
    nested = {'a': [{'b': [{'c': [leaf]}]}]}

    restored = unflatten_list(flatten(nested))
    assert restored == nested
Example #11
0
    def __init__(self, file_path, metadata_schema, header=True):
        """Store the output settings and locate the single genomic-ranges field.

        Args:
          file_path: path of the output file, stored as `self.file_path`
          metadata_schema: nested metadata schema. It is flattened and every
            entry's `.type` is compared against `MetadataType.GENOMIC_RANGES`.
          header: stored on the instance as `self.header`

        Raises:
          ValueError: if the schema contains zero or more than one genomic
            ranges entry.
        """
        self.file_path = file_path
        self.header = header
        self.first_pass = True

        f_dl_schema = flatten(metadata_schema)
        range_keys = [
            "metadata/" + k for k in f_dl_schema
            if f_dl_schema[k].type == MetadataType.GENOMIC_RANGES
        ]
        if len(range_keys) > 1:
            # Parenthesise the concatenated literals so that .format() applies
            # to the whole message; otherwise "{0}" is never substituted.
            raise ValueError(
                ("Found multiple genomic ranges in metadata: {0}. For writing to the "
                 "bed file exactly one genomic range has to exist").format(
                     range_keys))
        elif len(range_keys) == 0:
            raise ValueError(
                "Found no genomic ranges in metadata. For writing to the " +
                "bed file exactly one genomic range has to exist")
        self.ranges_key = range_keys[0]
Example #12
0
    def evaluate(self, metric, scaler_path=None, eval_type=np.float32, save=True):
        """Evaluate the model on the validation set

        Args:
          metric: callable invoked as `metric(y_valid, y_pred)`; its result is
            returned (and optionally saved / logged)
          scaler_path: optional path of a pickled scaler. When given, the
            validation features are cast to float32 and passed through
            `scaler.transform` before prediction.
          eval_type: if it is not `np.float32`, the scaled features are cast to
            float16 and capped at 65500 from above
          save: if True, write the metrics as JSON to `self.evaluation_path`

        Returns:
          The value returned by `metric`.
        """
        print("Started loading validation dataset")

        X_valid, y_valid = self.valid_dataset.load_all()

        if scaler_path:
            # NOTE(review): load_pickle presumably unpickles the file - only
            # use scaler files from a trusted source.
            scaler = load_pickle(scaler_path)
            print("Started scaling X.")
            X_infl = scaler.transform(X_valid.astype(np.float32))

            if eval_type is not np.float32:
                X_valid = X_infl.astype(np.float16)
                # values that overflowed in the float16 cast are capped at 65500
                if isinstance(X_valid, csr_matrix):
                    X_valid.data = np.minimum(X_valid.data, 65500)
                else:
                    X_valid = np.minimum(X_valid, 65500)
            else:
                # Previously the scaled matrix was dropped in this case and the
                # model was evaluated on the unscaled features.
                X_valid = X_infl
            del X_infl
            print("Finished scaling X.")

        print("Finished loading validation dataset. Shape: ", X_valid.shape, "True values:", y_valid.sum()/y_valid.shape[0])

        y_pred = self.model.predict(X_valid)
        metric_res = metric(y_valid, y_pred)
        print("metric_res", metric_res, np.amax(X_valid))

        if save:
            write_json(metric_res, self.evaluation_path, indent=2)

        if self.cometml_experiment:
            self.cometml_experiment.log_multiple_metrics(flatten(metric_res), prefix="best/")

        return metric_res
Example #13
0
    def batch_write(self, batch):
        """Add a batch to the write buffer and flush once a full chunk is buffered.

        On the first call one resizable, chunked dataset per flattened key is
        created on `self.f`. Every call then appends the flattened batch to the
        `self.write_buffer` list and triggers `self._flush_buffer()` as soon as
        at least `self.chunk_size` rows are buffered.

        # Arguments
            batch: batch of data. Either a single `np.array` or a list/dict thereof.
              All arrays must share the same first dimension.

        # Raises
            AssertionError: if the flattened arrays do not all have the same
              first dimension.
        """
        fbatch = flatten(batch, separator="/")

        batch_sizes = [fbatch[k].shape[0] for k in fbatch]
        # all arrays have to agree on the leading (batch) dimension
        assert len(set(batch_sizes)) == 1, \
            "all arrays in the batch need the same first dimension"
        batch_size = batch_sizes[0]

        if self.first_pass:
            # create one growable dataset per flattened key
            for k in fbatch:
                trailing_shape = fbatch[k].shape[1:]
                # np.string_ / np.unicode_ were removed in NumPy 2.0;
                # np.bytes_ / np.str_ are the same types under their current names.
                if fbatch[k].dtype.type in (np.bytes_, np.str_):
                    dtype = self.string_type
                else:
                    dtype = fbatch[k].dtype

                self.f.create_dataset(k,
                                      shape=(0, ) + trailing_shape,
                                      dtype=dtype,
                                      maxshape=(None, ) + trailing_shape,
                                      compression=self.compression,
                                      chunks=(self.chunk_size, ) +
                                      trailing_shape)
            self.first_pass = False

        # add data to the buffer (kept as a list of flattened batches)
        if self.write_buffer is None:
            self.write_buffer = [fbatch]
            self.write_buffer_size = batch_size
        else:
            self.write_buffer.append(fbatch)
            self.write_buffer_size += batch_size

        # the buffer is always a non-empty list at this point
        if self.write_buffer_size >= self.chunk_size:
            self._flush_buffer()
Example #14
0
    def evaluate(self, batch_size=256, shuffle=True, num_workers=8, save=True):
        """Evaluate the model batch-by-batch on the validation datasets

        One batch is drawn from every dataset in `self.valid_datasets`, the
        batches are concatenated along axis 1, column 0 is used as the target
        and the remaining columns as features, and `self.model.test_on_batch`
        is called on the result. The per-batch results are averaged.

        Args:
          batch_size: forwarded to each dataset's `batch_iter`
          shuffle: accepted but not used by this method
          num_workers: accepted but not used by this method
          save: if True, write the averaged metric as JSON to `self.evaluation_path`

        Returns:
          `np.average` over the per-batch results of `test_on_batch`.

        Raises:
          ValueError: if no batch could be drawn from the datasets.
        """
        print("Started loading validation dataset")
        # Keep the iterators in their own list: appending them to
        # self.valid_datasets while looping over it never terminates.
        batch_iters = [d.batch_iter(batch_size) for d in self.valid_datasets]

        metric_res_b = []
        print("Loading and training")
        # Connecting features from all kipoi datasets
        # we assume that the variants have been curated,
        # i.e. the same in the exact order.
        # zip stops with the shortest iterator; this assumes batch_iter is
        # finite - TODO confirm.
        for parts in tqdm(zip(*batch_iters)):
            batch = np.concatenate(parts, axis=1)
            X_batch, y_batch = batch[:, 1:], batch[:, 0]
            # no sample weights are defined in this method, so none are passed
            metric_res_b.append(self.model.test_on_batch(X_batch, y_batch))

        if not metric_res_b:
            raise ValueError("No validation batches available for evaluation")

        metric_res = np.average(metric_res_b)

        if save:
            write_json(metric_res, self.evaluation_path, indent=2)

        if self.cometml_experiment:
            # NOTE(review): metric_res is a scalar here; verify that `flatten`
            # accepts non-dict input.
            self.cometml_experiment.log_multiple_metrics(flatten(metric_res), prefix="best/")

        return metric_res
Example #15
0
    def evaluate(self,
                 metric,
                 batch_size=256,
                 num_workers=8,
                 eval_train=False,
                 eval_skip=[],
                 save=True,
                 **kwargs):
        """Evaluate the model on the validation set
        Args:
          metric: accepted for interface compatibility; it is not used here.
            Datasets given as 2-tuples are evaluated with `eval_metric=None`,
            3-tuples carry their own metric.
          batch_size: forwarded to `self.seq_model.evaluate`
          num_workers: forwarded to `self.seq_model.evaluate`
          eval_train: if True, also compute the evaluation metrics on the training set
          eval_skip: names of datasets to leave out of the evaluation
            (the default list is never mutated)
          save: save the json file to the output directory
          **kwargs: ignored; a warning is logged if any are given

        Returns:
          OrderedDict mapping dataset name -> result of `self.seq_model.evaluate`.

        Raises:
          ValueError: if an entry of the dataset list has neither 2 nor 3 elements.
        """
        if len(kwargs) > 0:
            logger.warning(
                f"Extra kwargs were provided to trainer.evaluate(): {kwargs}")
        # Save the complete model -> HACK
        self.seq_model.save(os.path.join(self.output_dir, 'seq_model.pkl'))

        # construct a list of datasets to evaluate
        if eval_train:
            eval_datasets = [('train', self.train_dataset)
                             ] + self.valid_dataset
        else:
            eval_datasets = self.valid_dataset

        # skip some datasets for evaluation
        try:
            if len(eval_skip) > 0:
                logger.info(f"Using eval_skip: {eval_skip}")
                # Filter on the first element only, so that both
                # (name, dataset) and (name, dataset, metric) entries work.
                eval_datasets = [d for d in eval_datasets
                                 if d[0] not in eval_skip]
        except Exception:
            # best effort: keep the unfiltered datasets if they can't be indexed
            logger.warning(
                f"eval datasets don't contain tuples. Unable to skip them using {eval_skip}"
            )

        metric_res = OrderedDict()
        for d in eval_datasets:
            if len(d) == 2:
                dataset_name, dataset = d
                eval_metric = None  # Ignore the provided metric
            elif len(d) == 3:
                # specialized evaluation metric was passed
                dataset_name, dataset, eval_metric = d
            else:
                raise ValueError(
                    "Valid dataset needs to be a list of tuples of 2 or 3 elements"
                    "(name, dataset) or (name, dataset, metric)")
            logger.info(f"Evaluating dataset: {dataset_name}")
            metric_res[dataset_name] = self.seq_model.evaluate(
                dataset,
                eval_metric=eval_metric,
                num_workers=num_workers,
                batch_size=batch_size)
        if save:
            write_json(metric_res, self.evaluation_path, indent=2)
            logger.info("Saved metrics to {}".format(self.evaluation_path))

        if self.cometml_experiment is not None:
            self.cometml_experiment.log_multiple_metrics(flatten(
                metric_res, separator='/'),
                                                         prefix="eval/")

        if self.wandb_run is not None:
            self.wandb_run.summary.update(
                flatten(prefix_dict(metric_res, prefix="eval/"),
                        separator='/'))

        return metric_res
Example #16
0
def test_list():
    """List positions become numeric components of the flattened key."""
    nested = {'a': 1, 'b': [{'c': [2, 3]}]}
    flat = dict(a=1, b_0_c_0=2, b_0_c_1=3)
    assert flatten(nested) == flat
Example #17
0
def test_custom_separator():
    """A separator passed positionally replaces the default one."""
    nested = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    flat = {
        'a': '1',
        'b': '2',
        'c*c1': '3',
        'c*c2': '4',
    }
    result = flatten(nested, '*')
    assert result == flat
Example #18
0
def test_one_flatten_utf8_dif():
    """A non-ASCII inner key is joined onto its parent key unchanged."""
    inner_key = u'eñe'
    wrapped = {'info': {inner_key: 1}}
    flat = {u'info_{}'.format(inner_key): 1}
    assert flatten(wrapped) == flat
Example #19
0
def test_one_flatten_utf8():
    """Non-ASCII keys and values survive one level of flattening."""
    nested = {
        'a': '1',
        u'ñ': u'áéö',
        'c': {u'c1': '3', 'c2': '4'},
    }
    flat = {
        'a': '1',
        u'ñ': u'áéö',
        'c_c1': '3',
        'c_c2': '4',
    }
    result = flatten(nested)
    assert result == flat
Example #20
0
def test_one_flatten():
    """One level of nesting is joined with the default "_" separator."""
    nested = {'a': '1', 'b': '2', 'c': {'c1': '3', 'c2': '4'}}
    flat = dict(a='1', b='2', c_c1='3', c_c2='4')
    assert flatten(nested) == flat
Example #21
0
def test_no_flatten():
    """An already flat dict comes back equal to the input."""
    already_flat = {'a': '1', 'b': '2', 'c': 3}
    result = flatten(already_flat)
    # the input itself is the expected value
    assert result == already_flat
Example #22
0
def test_unflatten_with_list_nested():
    """Round-trip a list of lists of dicts."""
    nested = {"a": [[{"b": 1}], [{"d": 1}]]}
    restored = unflatten_list(flatten(nested))
    assert restored == nested