def test_add_data_set(data_sets_dict):
    """
    Check that every DataSet registered through "add_data_set" can be read back from the DataBunch as an
    attribute carrying the name it was added under.

    :param dict data_sets_dict: A dictionary mapping DataSet names to the DataSets to add
    """
    bunch = DataBunch()

    # First register everything, then verify, so a later addition cannot hide a broken earlier one.
    for name, added in data_sets_dict.items():
        bunch.add_data_set(name, added)

    for name, expected in data_sets_dict.items():
        assert getattr(bunch, name) == expected
def read_data_bunch(read_data_set, params_dict):
    """
    Reads an arbitrary collection of named DataSets with the provider "read_data_set" and bundles them into a new
    DataBunch.

    Every entry of "params_dict" maps a DataSet name to the keyword arguments that are unpacked into
    "read_data_set" to produce the DataSet stored under that name.

    :param callable read_data_set: A function that reads source data and returns a DataSet
    :param dict params_dict: A dictionary mapping the DataSet names to the parameters according to which the
        DataSets are to be created. For example:
        {
            "train": {
                "path": "./example_data/train.csv",
                "input_format": ".csv",
                "data_wrappers_params_dict": {
                    "index": ["ID", "ID2"],
                    "features": ["field1_num", "field2_num", "field3_num"],
                    "targets": ["field4_target", "field5_target", "field6_target"],
                    "full_data": ["ID", "ID2", "field1_num", "field2_num", "field3_num",
                                  "field4_target", "field5_target", "field6_target"]
                }
            },
            "valid" : {...},
            "test" : {...}
        }
    :return: a DataBunch
    """
    return DataBunch({
        data_set_name: read_data_set(**read_params)
        for data_set_name, read_params in params_dict.items()
    })
def read_test_data_bunch(read_data_set, test_params):
    """
    Reads a single "test" DataSet with the provider "read_data_set" and wraps it in a new DataBunch.

    :param callable read_data_set: A function that reads source data and returns a DataSet
    :param dict test_params: The keyword arguments unpacked into "read_data_set" to read the "test" DataSet
    :return: a DataBunch holding the DataSet under the name "test"
    """
    test_data_set = read_data_set(**test_params)
    return DataBunch({"test": test_data_set})
def read_train_valid_test_data_bunch(read_data_set, train_params, valid_params, test_params):
    """
    Reads the "train", "valid" and "test" DataSets with the provider "read_data_set" and bundles them into a new
    DataBunch.

    :param callable read_data_set: A function that reads source data and returns a DataSet
    :param dict train_params: The keyword arguments unpacked into "read_data_set" to read the "train" DataSet
    :param dict valid_params: The keyword arguments unpacked into "read_data_set" to read the "valid" DataSet
    :param dict test_params: The keyword arguments unpacked into "read_data_set" to read the "test" DataSet
    :return: a DataBunch
    """
    # The DataSets are read in the order train, valid, test.
    train_data_set = read_data_set(**train_params)
    valid_data_set = read_data_set(**valid_params)
    test_data_set = read_data_set(**test_params)

    return DataBunch({
        "train": train_data_set,
        "valid": valid_data_set,
        "test": test_data_set,
    })
def transform(self, transform_then_slice, transformation_params):
    """
    Returns a new MockDataSet whose value is this one's value multiplied by 100, and records the arguments of
    the call on it.

    :param transform_then_slice: Stored unchanged on the returned MockDataSet
    :param transformation_params: Stored unchanged on the returned MockDataSet
    :return: the new MockDataSet
    """
    # NOTE(review): this takes "self" and builds a MockDataSet, so it is presumably a method of a MockDataSet
    # class whose header is not visible here - confirm its indentation against the enclosing class.
    scaled_data_set = MockDataSet(self.value * 100)
    scaled_data_set.transform_then_slice = transform_then_slice
    scaled_data_set.transformation_params = transformation_params
    return scaled_data_set


# Module-level objects fed to the parametrized test below.
data_sets_dict = {
    "train": MockDataSet(123),
    "valid": MockDataSet(456),
    "test": MockDataSet(789),
}
data_bunch = DataBunch(data_sets_dict)


@pytest.mark.parametrize(
    "data_bunch, data_set_names, params, transform_then_slice",
    [
        (data_bunch, ["train", "valid"], {"blah": "456"}, True),
    ],
)
def test_transform(data_bunch, data_set_names, params, transform_then_slice):
    """
    Check that transforming a DataBunch yields, for every requested DataSet name, a DataSet whose value is
    100 times the value of the DataSet of the same name in the original DataBunch.
    """
    transformed_bunch = data_bunch.transform(data_set_names, params, transform_then_slice)

    for name in data_set_names:
        actual_value = getattr(transformed_bunch, name).value
        expected_value = getattr(data_bunch, name).value * 100
        assert actual_value == expected_value
# Build one DataSet per split ("train", "valid", "test") from its df_* source. Every split exposes the same
# four wrappers: the full data plus its index, features and targets column subsets.
split_frames = (("train", df_train), ("valid", df_valid), ("test", df_test))

data_sets = {}
for split_name, split_df in split_frames:
    data_sets[split_name] = DataSet({
        "full_data": PandasDataWrapper(split_df, full_data_columns),
        "index": PandasDataWrapper(split_df[index_columns], index_columns),
        "features": PandasDataWrapper(split_df[features_columns], features_columns),
        "targets": PandasDataWrapper(split_df[targets_columns], targets_columns),
    })

data_bunch = DataBunch(data_sets_dict=data_sets)