Example #1
def test_multi_output(artifact_dir):
    records_train = gen_records(NUM_SAMPLES_TRAIN)
    records_validation = gen_records(NUM_SAMPLES_VALIDATION)
    records_score = gen_records(NUM_SAMPLES_SCORE)

    loc = os.path.abspath(os.path.dirname(__file__))
    cfg = io_utils.load_json("config_multi_output.json", loc)

    bm = BarrageModel(artifact_dir)
    bm.train(cfg, records_train, records_validation)
    scores = bm.predict(records_score)

    classification = [np.argmax(score["classification"]) for score in scores]
    regression_1 = [score["regression"][0] for score in scores]
    regression_2 = [score["regression"][1] for score in scores]

    df_scores = pd.DataFrame({
        "classification": classification,
        "regression_1": regression_1,
        "regression_2": regression_2,
    })

    assert (df_scores["classification"] == records_score["y_cls"]).mean() > 0.5
    assert abs(
        (df_scores["regression_1"] - records_score["y_reg_1"]).mean()) < 0.5
    assert abs(
        (df_scores["regression_2"] - records_score["y_reg_2"]).mean()) < 0.5
Example #2
def test_simple_output(artifact_dir, records_train, records_validation,
                       records_score):
    loc = os.path.abspath(os.path.dirname(__file__))
    cfg = io_utils.load_json("config_single_output.json", loc)

    bm = BarrageModel(artifact_dir)
    bm.train(cfg, records_train, records_validation)
    scores = bm.predict(records_score)

    df_scores = pd.DataFrame(scores)
    assert (df_scores["softmax"] == records_score["label"]).mean() >= 0.90
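The artifact_dir and records_* fixtures are defined elsewhere (presumably a conftest.py). A plausible sketch of the directory fixture, given that Example #5 notes the artifact directory cannot already exist (the body is an assumption):

import pytest


@pytest.fixture
def artifact_dir(tmp_path):
    # Return a path that does not yet exist: BarrageModel creates the
    # artifact directory itself and requires that it not already exist.
    return str(tmp_path / "artifacts")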
Example #3
def test_simple_output(artifact_dir):
    records_train = gen_records(NUM_SAMPLES_TRAIN)
    records_validation = gen_records(NUM_SAMPLES_VALIDATION)
    records_score = gen_records(NUM_SAMPLES_SCORE)

    loc = os.path.abspath(os.path.dirname(__file__))
    cfg = io_utils.load_json("config_single_output.json", loc)

    bm = BarrageModel(artifact_dir)
    bm.train(cfg, records_train, records_validation)
    scores = bm.predict(records_score)

    df_scores = pd.DataFrame(scores)
    records_score = pd.DataFrame(records_score)
    assert (df_scores["softmax"] == records_score["label"]).mean() >= 0.90
Example #4
def predict(score_data, artifact_dir, output):
    """Barrage deep learning predict.

    Supported filetypes:

        1. .csv

        2. .json

    Args:

        score-data: filepath to score data [REQUIRED].

        artifact-dir: location to load artifacts [REQUIRED].

        output: filepath to save the predicted scores.
    """
    records_score = io_utils.load_data(score_data)
    scores = BarrageModel(artifact_dir).predict(records_score)
    io_utils.save_json(scores, output)
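The hyphenated argument names and [REQUIRED] markers in the docstring suggest this is the body of a CLI command. A sketch of how it might be wired with click (the decorators and parameter types are assumptions, not shown in the source):

import click


@click.command()
@click.argument("score_data", type=click.Path(exists=True))
@click.argument("artifact_dir", type=click.Path(exists=True))
@click.argument("output", type=click.Path())  # could equally be a --output option
def predict(score_data, artifact_dir, output):
    ...  # body as in Example #4 above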
Example #5
def train(config, train_data, validation_data, artifact_dir):
    """Barrage deep learning train.

    Supported filetypes:

        1. .csv

        2. .json

    Args:

        config: filepath to barrage config [REQUIRED].

        train-data: filepath to train data [REQUIRED].

        validation-data: filepath to validation data [REQUIRED].

        artifact-dir: location to save artifacts [REQUIRED].

    Note: artifact-dir cannot already exist.
    """
    cfg = io_utils.load_json(config)
    records_train = io_utils.load_data(train_data)
    records_validation = io_utils.load_data(validation_data)
    BarrageModel(artifact_dir).train(cfg, records_train, records_validation)
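io_utils.load_data is not shown; given the two supported filetypes named in the docstrings, it presumably dispatches on the file extension. A rough sketch under that assumption (the return types are a guess):

import json

import pandas as pd


def load_data(path):
    """Hypothetical loader dispatching on the extension of the data file."""
    if path.endswith(".csv"):
        return pd.read_csv(path)
    elif path.endswith(".json"):
        with open(path) as f:
            return json.load(f)
    raise ValueError(f"unsupported filetype: {path}")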
Example #6
    def postprocess(self, score):
        # Binary output (single unit): threshold the sigmoid score at 0.5.
        # Multi-class output: argmax the softmax vector.
        if len(score[self.out_key]) == 1:
            score[self.out_key] = float(score[self.out_key] > 0.5)
        else:
            score[self.out_key] = np.argmax(score[self.out_key])

        return score

    def load(self, path):
        self.tokenizer = io_utils.load_pickle("tokenizer.pkl", path)

    def save(self, path):
        io_utils.save_pickle(self.tokenizer, "tokenizer.pkl", path)


if __name__ == "__main__":
    records_train, records_val, records_test = get_data()

    # Train
    cfg = io_utils.load_json("config_sentiment.json")
    BarrageModel("artifacts").train(cfg, records_train, records_val)

    # Predict
    scores = BarrageModel("artifacts").predict(records_test)
    df_preds = pd.DataFrame(scores)

    acc = (df_preds["target"] == pd.DataFrame(records_test)["label"]).mean()
    print(f"Test set accuracy: {acc}")
Example #7
"""MNIST dataset example."""
import numpy as np
from tensorflow.keras import datasets

from barrage import BarrageModel
from barrage.utils import io_utils


def get_data():
    """Load MNIST dataset."""
    (X_train, y_train), (X_val, y_val) = datasets.mnist.load_data()
    X_train = X_train[:, ..., np.newaxis]  # need image shape (28, 28, 1) not (28, 28)
    X_val = X_val[:, ..., np.newaxis]  # need image shape (28, 28, 1) not (28, 28)

    # Convert to list of dicts
    samples_train = X_train.shape[0]
    records_train = [
        {"x": X_train[ii, ...], "y": y_train[ii]} for ii in range(samples_train)
    ]
    samples_val = X_val.shape[0]
    records_val = [{"x": X_val[ii, ...], "y": y_val[ii]} for ii in range(samples_val)]
    return records_train, records_val


if __name__ == "__main__":
    records_train, records_val = get_data()
    # Train
    cfg = io_utils.load_json("config_mnist.json")
    BarrageModel("artifacts").train(cfg, records_train, records_val)
Example #8
def vanilla_iris():
    """Here we use sklearn to mean variance normalize the dataset and train a
    simple model.
    """
    # Get data
    records_train, records_val = get_data()

    # For now we use sklearn.preprocessing.StandardScaler because the dataset
    # fits in memory. This approach does not scale, however; the overkill
    # example demonstrates how to apply mean-variance normalization to a
    # dataset that does not fit into memory.
    cols = ["i1", "i2", "i3", "i4"]
    scaler = StandardScaler().fit(records_train[cols])
    records_train[cols] = scaler.transform(records_train[cols])
    records_val[cols] = scaler.transform(records_val[cols])
    scaler_params = {"mean": scaler.mean_, "std": scaler.scale_}
    print(f"scaler params {scaler_params}")

    # Specify config
    cfg = {
        "dataset": {
            "loader": {
                # use built in KeySelector
                "import": "KeySelector",
                "params": {
                    "inputs": {
                        # name matches 'inputs' name
                        "iris": ["i1", "i2", "i3", "i4"]
                    },
                    "outputs": {
                        # name matches 'outputs' name
                        "flower": ["label"]
                    },
                },
            },
            "seed": 42,
        },
        "model": {
            # use sequential_from_config
            "network": {
                "import": "barrage.model.sequential_from_config",
                "params": {
                    "layers": [
                        {"import": "Input", "params": {"shape": 4, "name": "iris"}},
                        {
                            "import": "Dense",
                            "params": {"units": 25, "activation": "relu"},
                        },
                        {
                            "import": "Dense",
                            "params": {"units": 25, "activation": "relu"},
                        },
                        {
                            "import": "Dense",
                            "params": {"units": 25, "activation": "relu"},
                        },
                        {
                            "import": "Dense",
                            "params": {
                                "units": 3,
                                "activation": "softmax",
                                "name": "flower",
                            },
                        },
                    ]
                },
            },
            "outputs": [
                {
                    # name matches 'outputs' name
                    "name": "flower",
                    "loss": {"import": "sparse_categorical_crossentropy"},
                    "metrics": [{"import": "accuracy"}],
                }
            ],
        }
        # use defaults for solver, services
    }

    # Train the model
    BarrageModel("artifacts_vanilla").train(cfg, records_train, records_val)
Example #9
def overkill_iris():
    """Here we use a custom loader, transformer, and augmentation functions."""

    # Get data
    records_train, records_val = get_data()

    # Specify config
    cfg = {
        "dataset": {
            # use custom loader
            "loader": {"import": "example.CustomIrisLoader"},
            "transformer": {
                "import": "example.CustomInputMeanVarTransformer",
                "params": {"key": "iris"},
            },
            "augmentor": [
                {
                    "import": "example.add_input_noise",
                    "params": {"key": "iris", "scale": 0.1},
                }
            ],
            "seed": 42,
        },
        "model": {
            "network": {
                "import": "barrage.model.sequential_from_config",
                "params": {
                    "layers": [
                        {"import": "Input", "params": {"shape": 4, "name": "iris"}},
                        {
                            "import": "Dense",
                            "params": {"units": 25, "activation": "relu"},
                        },
                        {
                            "import": "Dense",
                            "params": {"units": 25, "activation": "relu"},
                        },
                        {
                            "import": "Dense",
                            "params": {"units": 25, "activation": "relu"},
                        },
                        {
                            "import": "Dense",
                            "params": {
                                "units": 3,
                                "activation": "softmax",
                                "name": "flower",
                            },
                        },
                    ]
                },
            },
            "outputs": [
                {
                    "name": "flower",
                    "loss": {"import": "sparse_categorical_crossentropy"},
                    "metrics": [{"import": "accuracy"}],
                }
            ],
        },
        # specify solver
        "solver": {
            "optimizer": {"import": "Adam", "learning_rate": 1e-3},
            "batch_size": 32,
            "epochs": 50,
        },
        # choose best model based on 'val_accuracy'
        "services": {"best_checkpoint": {"monitor": "val_accuracy", "mode": "max"}},
    }

    # Train the model
    BarrageModel("artifacts_overkill").train(cfg, records_train, records_val)