def test_multi_output(artifact_dir):
    records_train = gen_records(NUM_SAMPLES_TRAIN)
    records_validation = gen_records(NUM_SAMPLES_VALIDATION)
    records_score = gen_records(NUM_SAMPLES_SCORE)

    loc = os.path.abspath(os.path.dirname(__file__))
    cfg = io_utils.load_json("config_multi_output.json", loc)

    bm = BarrageModel(artifact_dir)
    bm.train(cfg, records_train, records_validation)
    scores = bm.predict(records_score)

    classification = [np.argmax(score["classification"]) for score in scores]
    regression_1 = [score["regression"][0] for score in scores]
    regression_2 = [score["regression"][1] for score in scores]
    df_scores = pd.DataFrame(
        {
            "classification": classification,
            "regression_1": regression_1,
            "regression_2": regression_2,
        }
    )

    assert (df_scores["classification"] == records_score["y_cls"]).mean() > 0.5
    assert abs((df_scores["regression_1"] - records_score["y_reg_1"]).mean()) < 0.5
    assert abs((df_scores["regression_2"] - records_score["y_reg_2"]).mean()) < 0.5
def test_simple_output(artifact_dir, records_train, records_validation, records_score):
    loc = os.path.abspath(os.path.dirname(__file__))
    cfg = io_utils.load_json("config_single_output.json", loc)

    bm = BarrageModel(artifact_dir)
    bm.train(cfg, records_train, records_validation)
    scores = bm.predict(records_score)

    df_scores = pd.DataFrame(scores)
    assert (df_scores["softmax"] == records_score["label"]).mean() >= 0.90
def test_simple_output(artifact_dir):
    records_train = gen_records(NUM_SAMPLES_TRAIN)
    records_validation = gen_records(NUM_SAMPLES_VALIDATION)
    records_score = gen_records(NUM_SAMPLES_SCORE)

    loc = os.path.abspath(os.path.dirname(__file__))
    cfg = io_utils.load_json("config_single_output.json", loc)

    bm = BarrageModel(artifact_dir)
    bm.train(cfg, records_train, records_validation)
    scores = bm.predict(records_score)

    df_scores = pd.DataFrame(scores)
    records_score = pd.DataFrame(records_score)
    assert (df_scores["softmax"] == records_score["label"]).mean() >= 0.90
def predict(score_data, artifact_dir, output):
    """Barrage deep learning predict.

    Supported filetypes:
        1. .csv
        2. .json

    Args:
        score-data: filepath to score data [REQUIRED].
        artifact-dir: location to load artifacts [REQUIRED].
        output: filepath to save scores.
    """
    records_score = io_utils.load_data(score_data)
    scores = BarrageModel(artifact_dir).predict(records_score)
    io_utils.save_json(scores, output)
def train(config, train_data, validation_data, artifact_dir):
    """Barrage deep learning train.

    Supported filetypes:
        1. .csv
        2. .json

    Args:
        config: filepath to barrage config [REQUIRED].
        train-data: filepath to train data [REQUIRED].
        validation-data: filepath to validation data [REQUIRED].
        artifact-dir: location to save artifacts [REQUIRED].

    Note: artifact-dir cannot already exist.
    """
    cfg = io_utils.load_json(config)
    records_train = io_utils.load_data(train_data)
    records_validation = io_utils.load_data(validation_data)
    BarrageModel(artifact_dir).train(cfg, records_train, records_validation)
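For orientation, a minimal sketch of driving these helpers directly as plain Python functions (any CLI decorators are not shown in this excerpt); the file paths and the artifacts directory are hypothetical placeholders, not files shipped with the project.

# Hypothetical invocation of the helpers above; the paths are placeholders and must
# point at real .csv/.json files, and the artifact dir must not already exist for train.
train("config.json", "train.csv", "validation.csv", "artifacts")
predict("score.csv", "artifacts", "scores.json")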
    def postprocess(self, score):
        # Threshold 0.5 / Argmax the score
        if len(score) == 1:
            score[self.out_key] = float(score[self.out_key] > 0.5)
        else:
            score[self.out_key] = np.argmax(score[self.out_key])
        return score

    def load(self, path):
        self.tokenizer = io_utils.load_pickle("tokenizer.pkl", path)

    def save(self, path):
        io_utils.save_pickle(self.tokenizer, "tokenizer.pkl", path)


if __name__ == "__main__":
    records_train, records_val, records_test = get_data()

    # Train
    cfg = io_utils.load_json("config_sentiment.json")
    BarrageModel("artifacts").train(cfg, records_train, records_val)

    # Predict
    scores = BarrageModel("artifacts").predict(records_test)
    df_preds = pd.DataFrame(scores)
    acc = (df_preds["target"] == records_test["label"]).mean()
    print(f"Test set accuracy: {acc}")
"""MNIST dataset example."""
import numpy as np
from tensorflow.keras import datasets

from barrage import BarrageModel
from barrage.utils import io_utils


def get_data():
    """Load MNIST dataset."""
    (X_train, y_train), (X_val, y_val) = datasets.mnist.load_data()
    X_train = X_train[:, ..., np.newaxis]  # need image shape (28, 28, 1) not (28, 28)
    X_val = X_val[:, ..., np.newaxis]  # need image shape (28, 28, 1) not (28, 28)

    # Convert to list of dicts
    samples_train = X_train.shape[0]
    records_train = [
        {"x": X_train[ii, ...], "y": y_train[ii]} for ii in range(samples_train)
    ]
    samples_val = X_val.shape[0]
    records_val = [{"x": X_val[ii, ...], "y": y_val[ii]} for ii in range(samples_val)]
    return records_train, records_val


if __name__ == "__main__":
    records_train, records_val = get_data()

    # Train
    cfg = io_utils.load_json("config_mnist.json")
    BarrageModel("artifacts").train(cfg, records_train, records_val)
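The MNIST example stops after training; below is a hedged sketch of scoring the validation records afterwards, mirroring the predict step in the sentiment example. The "softmax" score key is an assumption (it depends on the output name in config_mnist.json), and the snippet assumes it runs in the same module so get_data, np, and BarrageModel are in scope.

# Hypothetical predict step, mirroring the sentiment example; the score key name
# ("softmax") depends on config_mnist.json and is an assumption, not confirmed here.
_, records_val = get_data()
scores = BarrageModel("artifacts").predict(records_val)
preds = [int(np.argmax(score["softmax"])) for score in scores]
labels = [record["y"] for record in records_val]
print("Validation accuracy:", np.mean([p == t for p, t in zip(preds, labels)]))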
def vanilla_iris():
    """Here we use sklearn to mean-variance normalize the dataset and train a
    simple model.
    """
    # Get data
    records_train, records_val = get_data()

    # For now we will use sklearn.preprocessing.StandardScaler because the dataset
    # fits in memory. However, this approach does not scale, and the overkill example
    # will demonstrate how to apply mean-var normalization with a dataset that does
    # not fit into memory.
    cols = ["i1", "i2", "i3", "i4"]
    scaler = StandardScaler().fit(records_train[cols])
    records_train[cols] = scaler.transform(records_train[cols])
    records_val[cols] = scaler.transform(records_val[cols])
    scaler_params = {"mean": scaler.mean_, "std": scaler.scale_}
    print(f"scaler params {scaler_params}")

    # Specify config
    cfg = {
        "dataset": {
            "loader": {
                # use built in KeySelector
                "import": "KeySelector",
                "params": {
                    "inputs": {
                        # name matches 'inputs' name
                        "iris": ["i1", "i2", "i3", "i4"]
                    },
                    "outputs": {
                        # name matches 'outputs' name
                        "flower": ["label"]
                    },
                },
            },
            "seed": 42,
        },
        "model": {
            # use sequential_from_config
            "network": {
                "import": "barrage.model.sequential_from_config",
                "params": {
                    "layers": [
                        {"import": "Input", "params": {"shape": 4, "name": "iris"}},
                        {"import": "Dense", "params": {"units": 25, "activation": "relu"}},
                        {"import": "Dense", "params": {"units": 25, "activation": "relu"}},
                        {"import": "Dense", "params": {"units": 25, "activation": "relu"}},
                        {
                            "import": "Dense",
                            "params": {"units": 3, "activation": "softmax", "name": "flower"},
                        },
                    ]
                },
            },
            "outputs": [
                {
                    # name matches 'outputs' name
                    "name": "flower",
                    "loss": {"import": "sparse_categorical_crossentropy"},
                    "metrics": [{"import": "accuracy"}],
                }
            ],
        },
        # use defaults for solver, services
    }

    # Train the model
    BarrageModel("artifacts_vanilla").train(cfg, records_train, records_val)
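Since the vanilla example normalizes outside the barrage config, any records scored later against artifacts_vanilla must first be scaled with the same fitted scaler. Below is a minimal sketch of such a helper; score_vanilla_iris is hypothetical (not part of the original example), and records_score is assumed to be a pandas DataFrame with the i1..i4 columns.

def score_vanilla_iris(records_score, scaler, cols=("i1", "i2", "i3", "i4")):
    # Hypothetical helper, not part of the original example: apply the scaler fit in
    # vanilla_iris before predicting, since normalization is not baked into the config.
    records_score = records_score.copy()
    records_score[list(cols)] = scaler.transform(records_score[list(cols)])
    return BarrageModel("artifacts_vanilla").predict(records_score)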
def overkill_iris():
    """Here we use a custom loader, transformer, and augmentation functions."""
    # Get data
    records_train, records_val = get_data()

    # Specify config
    cfg = {
        "dataset": {
            # use custom loader
            "loader": {"import": "example.CustomIrisLoader"},
            "transformer": {
                "import": "example.CustomInputMeanVarTransformer",
                "params": {"key": "iris"},
            },
            "augmentor": [
                {
                    "import": "example.add_input_noise",
                    "params": {"key": "iris", "scale": 0.1},
                }
            ],
            "seed": 42,
        },
        "model": {
            "network": {
                "import": "barrage.model.sequential_from_config",
                "params": {
                    "layers": [
                        {"import": "Input", "params": {"shape": 4, "name": "iris"}},
                        {"import": "Dense", "params": {"units": 25, "activation": "relu"}},
                        {"import": "Dense", "params": {"units": 25, "activation": "relu"}},
                        {"import": "Dense", "params": {"units": 25, "activation": "relu"}},
                        {
                            "import": "Dense",
                            "params": {"units": 3, "activation": "softmax", "name": "flower"},
                        },
                    ]
                },
            },
            "outputs": [
                {
                    "name": "flower",
                    "loss": {"import": "sparse_categorical_crossentropy"},
                    "metrics": [{"import": "accuracy"}],
                }
            ],
        },
        # specify solver
        "solver": {
            "optimizer": {"import": "Adam", "learning_rate": 1e-3},
            "batch_size": 32,
            "epochs": 50,
        },
        # choose best model based on 'val_accuracy'
        "services": {"best_checkpoint": {"monitor": "val_accuracy", "mode": "max"}},
    }

    # Train the model
    BarrageModel("artifacts_overkill").train(cfg, records_train, records_val)
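The module's entry point is not shown in this excerpt; below is a minimal sketch of a main guard that trains both configurations, assuming vanilla_iris and overkill_iris live in the same module.

if __name__ == "__main__":
    # Hypothetical entry point for this excerpt: train both iris configurations.
    vanilla_iris()
    overkill_iris()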