Beispiel #1
0
def run_inference():
    """Return predictions for the test split from a freshly trained model.

    The test features are extracted with ``divide_data`` first, then a
    model is obtained from ``run_training`` and asked to predict on them.
    """
    # Only the second element returned by load_data() is needed here.
    _, held_out = load_data()
    features = divide_data(held_out, 'test')

    # Training happens after the data is prepared, as in the original flow.
    return run_training().predict(features)
Beispiel #2
0
def run_svm(digits=None, gamma=.001, C=1, fd=sys.stdout):
    """Fit an SVC on the training split and write evaluation results to ``fd``.

    :param digits: indexable dataset; element 0 is the ``(x, y)`` training
        pair and element 2 the ``(x, y)`` test pair. When ``None`` the
        data is loaded from ``'mnist.pkl.gz'``.
    :param gamma: forwarded to ``svm.SVC``.
    :param C: forwarded to ``svm.SVC``.
    :param fd: writable text stream receiving progress and metrics.
    """
    if digits is None:
        digits = load_data('mnist.pkl.gz')

    fit_x, fit_y = digits[0]
    eval_x, eval_y = digits[2]
    model = svm.SVC(gamma=gamma, C=C)

    # --- training, timed in minutes ---
    fd.write("fitting\n")
    t0 = timeit.default_timer()
    model.fit(fit_x, fit_y)
    t1 = timeit.default_timer()
    elapsed_minutes = (t1 - t0) / 60.
    fd.write("fitting completed. Took  %.2fm\n" % elapsed_minutes)

    # --- prediction on the test pair ---
    fd.write("predicting\n")
    guesses = model.predict(eval_x)
    fd.write('prediction complete\n')

    # --- reporting ---
    fd.write("done\n\n")
    report = metrics.classification_report(eval_y, guesses)
    fd.write(
        "Classification report for classifier %s:\n%s\n" % (model, report))
    accuracy = metrics.accuracy_score(eval_y, guesses)
    fd.write("Accuracy: %s\n" % accuracy)
    confusion = metrics.confusion_matrix(eval_y, guesses)
    fd.write("Confusion matrix:\n%s" % confusion)
def test_load_data_subset_pass():
    """load_data must return only the requested columns."""
    from src.data import load_data

    wanted = ["number"]
    frame = load_data("unittests/data/mock_data.csv", wanted)
    returned_columns = list(frame.columns)

    # Convert the assertion failure into an explicit pytest failure message.
    try:
        assert returned_columns == ["number"]
    except AssertionError:
        pytest.fail("load_data didn't subset the return dataframe")
def test_load_data_returns_dataframe():
    """load_data must hand back a pandas DataFrame."""
    from src.data import load_data

    loaded = load_data("unittests/data/mock_data.csv")
    is_frame = isinstance(loaded, pd.DataFrame)

    # Convert the assertion failure into an explicit pytest failure message.
    try:
        assert is_frame
    except AssertionError:
        pytest.fail("Expected Pandas DataFrame from load_data()")
Beispiel #5
0
def test_aws():
    """Run the DBN and SVM experiments on small random subsets of the data.

    For each ``(train, valid, test)`` sample-size triple a subset is drawn
    from a seeded permutation of each split, the DBN is trained on it
    (log: ``data/aws_dbn_test.log``) and an SVM is fitted for several ``C``
    values (log: ``data/aws_svm_test.log``).
    """
    datasets = load_data('mnist.pkl.gz')
    trainx, trainy = datasets[0]
    validx, validy = datasets[1]
    testx, testy = datasets[2]
    numpy_rng = numpy.random.RandomState(123)

    # permutation() returns the shuffled indices. The previous code called
    # shuffle() on a temporary range() and threw the result away, so the
    # index lists were never actually shuffled (and shuffle() on a range
    # raises TypeError on Python 3).
    train_indices = numpy_rng.permutation(trainx.shape[0])
    valid_indices = numpy_rng.permutation(validx.shape[0])
    test_indices = numpy_rng.permutation(testx.shape[0])

    samples = [(10, 10, 10), (50, 10, 10), (100, 10, 10)]

    # Context managers guarantee both logs are closed even if a run fails.
    with open('data/aws_dbn_test.log', 'w+') as fd, \
            open('data/aws_svm_test.log', 'w+') as fe:
        for s in samples:
            n_train, n_valid, n_test = s
            train_pick = train_indices[:n_train]
            valid_pick = valid_indices[:n_valid]
            test_pick = test_indices[:n_test]

            data = ((trainx[train_pick], trainy[train_pick]),
                    (validx[valid_pick], validy[valid_pick]),
                    (testx[test_pick], testy[test_pick]))
            theano_datasets = prep_theano_data(data)

            fd.write("\n\n==========Samples: %s ===========\n\n" % str(s))
            fe.write("\n\n==========Samples: %s ===========\n\n" % str(s))
            test_DBN(pretraining_epochs=100, pretrain_lr=0.01, k=1,
                     training_epochs=1000, finetune_lr=0.1,
                     datasets=theano_datasets, batch_size=10,
                     hidden_layers=[100, 100, 100], fd=fd)

            for c in [.01, .1, 1, 10, 100]:
                fe.write("\n---------------C: %f -------------\n" % c)
                run_svm(digits=data, C=c, fd=fe)
Beispiel #6
0
def test_load_data():
    """Loaded file content must match FILE_DATA, ignoring spaces and trailing newlines."""

    # TODO: stripping spaces is a quick hack and needs a proper fix.
    def _normalise(text):
        return text.rstrip('\n').replace(" ", "")

    loaded = load_data(path.join('input', FILENAME))
    assert _normalise(loaded) == _normalise(FILE_DATA)
Beispiel #7
0
def run_training():
    """Fit and return a model using the parameters from ``best_param``.

    The classifier is selected by ``param['classifier']``; the full
    parameter mapping is forwarded to ``train_model`` together with the
    training features and labels.
    """
    param = best_param()

    # Only the training portion returned by load_data() is used.
    training_frame, _ = load_data()
    features, labels = divide_data(training_frame, 'train')

    return train_model(param['classifier'], param, features, labels)
Beispiel #8
0
    model_scores = []
    # K-fold CV
    kf = KFold(n_splits=8, random_state=SEED, shuffle=True)
    for train_index, test_index in kf.split(X):
        train_X, test_X = X.iloc[train_index], X.iloc[test_index]
        train_y, test_y = y.iloc[train_index], y.iloc[test_index]

        model.fit(train_X, train_y)
        print(f"Model ACC: {model.score(test_X, test_y)}%")
        model_scores.append(model.score(test_X, test_y))
    for sc in model_scores:
        print(f'{sc} %')
    print(f"Max ACC: {max(model_scores)}")
    return model


def run_training():
    """Fit and return a model using the parameters from ``best_param``.

    The classifier is selected by ``param['classifier']``; the full
    parameter mapping is forwarded to ``train_model`` together with the
    training features and labels.
    """
    param = best_param()

    # Only the training portion returned by load_data() is used.
    training_frame, _ = load_data()
    features, labels = divide_data(training_frame, 'train')

    return train_model(param['classifier'], param, features, labels)


if __name__ == '__main__':
    # Script entry point: run_training() performs exactly the steps that
    # used to be duplicated here (best_param -> load_data -> divide_data ->
    # train_model), so reuse it rather than keeping two copies in sync.
    run_training()
def test_preprocessing_pipeline():
    """
    Integration check of the preprocessing pipeline.

    Loads the fixture CSV, transforms it with the pipeline, verifies the
    reported feature names and output type, then compares the cleaned-up
    result with the stored expected CSV.

    :return: the pipeline that was exercised
    """
    setup_logging(logging.DEBUG)

    logging.debug("test_preprocessing_pipeline")

    input_csv = "integrationtests/test_data.csv"
    golden_csv = "integrationtests/test_preprocessing_expected.csv"

    logging.debug(f"test_data_path: {input_csv}")
    logging.debug(f"expected_output_path: {golden_csv}")
    logging.debug("Loading test data")

    raw_frame = load_data(input_csv, ["domain", "class"])

    steps = [preprocess()]
    pipeline = Pipeline(steps)

    logging.debug("Applying pipeline transformations")

    transformed = pipeline.transform(raw_frame)
    column_names = pipeline["preprocess"].get_feature_names()

    logging.debug("Pipeline transformation complete")
    logging.debug(f"column_names: {column_names}")

    # 1) the pipeline must report exactly these feature names, in this order
    try:
        assert column_names == ['class', 'domain']
    except AssertionError:
        message = f"Didn't get the expected `get_feature_names` from pipeline, got {column_names}"
        logging.exception(message)
        pytest.fail(message)

    # 2) the transform output must be a NumPy array
    try:
        assert isinstance(transformed, np.ndarray)
    except AssertionError:
        message = f"Didn't get expected type from pipeline, got {type(transformed)}"
        logging.exception(message)
        pytest.fail(message)

    logging.debug(transformed)
    logging.debug("Creating DataFrame from Pipeline output")

    actual_frame = pd.DataFrame(transformed, columns=column_names)

    logging.debug("Applying `post_process_cleanup`")

    actual_frame = post_process_cleanup(actual_frame)

    logging.debug("Loading validation DataFrame")

    golden_frame = pd.read_csv(golden_csv)

    # 3) the values must match the stored expectation (dtypes are not compared)
    try:
        pd.testing.assert_frame_equal(actual_frame,
                                      golden_frame,
                                      check_dtype=False)
    except AssertionError:
        message = "Data resulting from transformation did not match expected."
        logging.exception(message)
        pytest.fail(message)

    return pipeline
Beispiel #10
0
def main(mapfile, output_dir=None):
    """Split data into train and dev sets.

    Builds a stratified K-fold assignment table with one ``fold_NN`` column
    per fold; each cell is ``"train"`` or ``"test"`` for the row position
    given by the index.

    :param mapfile: path to a CSV file (read through ``load_data``) or an
        already loaded DataFrame. NOTE(review): a DataFrame passed in is
        modified in place (index renamed, target column popped).
    :param output_dir: existing directory; when given, the table is written
        to ``split_train_dev.csv`` inside it and nothing is returned.
    :return: the split DataFrame when ``output_dir`` is falsy, else ``None``.
    """

    if isinstance(mapfile, str):
        # NOTE(review): asserts are stripped under ``python -O``; they are
        # kept so the exception type seen by callers stays AssertionError.
        assert (os.path.isfile(mapfile)), FileNotFoundError
        # read file
        train_df = load_data(mapfile, sep=",", header=0, index_col=0)
    else:
        train_df = mapfile

    # set index
    train_df.index.name = "index"

    # load params
    params = load_params()
    params_split = params['train_test_split']
    params_split["random_seed"] = params["random_seed"]

    # get filenames and dependent variables (labels)
    train_labels = train_df.pop(params_split["target_class"])
    train_files = train_df

    # K-fold split into train and dev sets stratified by train_labels
    # using random seed for reproducibility
    skf = StratifiedKFold(n_splits=params_split['n_split'],
                          random_state=params_split['random_seed'],
                          shuffle=params_split['shuffle'])

    # create splits
    split_df = pd.DataFrame()
    for n_fold, (train_idx,
                 test_idx) in enumerate(skf.split(train_files, train_labels)):
        fold_name = f"fold_{n_fold + 1:02d}"

        # one single-column frame per role, indexed by row position
        train_part = pd.DataFrame({
            "image_id": train_idx,
            fold_name: "train"
        }).set_index("image_id")
        test_part = pd.DataFrame({
            "image_id": test_idx,
            fold_name: "test"
        }).set_index("image_id")

        # DataFrame.append was removed in pandas 2.0; pd.concat is the
        # equivalent row-wise stacking.
        temp_df = pd.concat([train_part, test_part])

        # stack the first fold onto the empty frame, join later folds as
        # additional columns
        if n_fold == 0:
            split_df = pd.concat([split_df, temp_df])
        else:
            split_df = split_df.join(temp_df)

    # sort by index
    split_df = split_df.sort_index()

    if output_dir:
        assert (os.path.isdir(output_dir)), NotADirectoryError
        output_dir = Path(output_dir).resolve()

        # save output dataframe with indices for train and dev sets
        split_df.to_csv(output_dir.joinpath("split_train_dev.csv"),
                        na_rep="nan")
    else:
        return split_df
Beispiel #11
0
def test_DBN(pretrain_lr=0.01,
             pretraining_epochs=100,
             k=1,
             finetune_lr=0.1,
             training_epochs=1000,
             hidden_layers=[1000, 1000, 1000],
             datasets=None,
             batch_size=10,
             fd=sys.stdout,
             normal_distro=False):
    """Pre-train and fine-tune a Deep Belief Network, logging progress to ``fd``.

    :param pretrain_lr: learning rate passed to each pre-training function.
    :param pretraining_epochs: pre-training epochs per layer.
    :param k: forwarded to ``dbn.pretraining_functions``.
    :param finetune_lr: learning rate for ``build_finetune_functions``.
    :param training_epochs: maximum number of fine-tuning epochs.
    :param hidden_layers: hidden layer sizes forwarded to ``DBN``. The
        default list is never mutated in this function.
    :param datasets: three ``(x, y)`` pairs (train, valid, test) of shared
        variables; loaded from ``'mnist.pkl.gz'`` when ``None``.
    :param batch_size: minibatch size.
    :param fd: writable text stream for all progress output.
    :param normal_distro: forwarded to ``DBN`` as ``normal``.
    """
    if datasets is None:
        datasets = prep_theano_data(load_data('mnist.pkl.gz'))

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # compute number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    numpy_rng = numpy.random.RandomState(123)

    fd.write('... building the model\n')
    # construct the Deep Belief Network
    dbn = DBN(numpy_rng=numpy_rng,
              n_ins=28 * 28,
              hidden_layers_size=hidden_layers,
              n_outs=10,
              normal=normal_distro)

    fd.write('... getting the pretraining functions\n')
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size,
                                                k=k)

    num_of_train_rows = (train_set_x.shape.eval())[0]
    num_of_valid_rows = (valid_set_x.shape.eval())[0]
    # Random subset of the training rows, as large as the validation set,
    # used for the free-energy comparison below. permutation() returns the
    # shuffled indices; the previous code shuffled a temporary range() and
    # discarded it, so the "random" subset was always the first rows (and
    # shuffle() on a range raises TypeError on Python 3).
    indices = numpy_rng.permutation(num_of_train_rows)
    train = train_set_x[indices[:num_of_valid_rows]]
    valid = valid_set_x
    fd.write('... pre-training the model\n')
    start_time = timeit.default_timer()
    # Pre-train layer-wise
    for i in range(dbn.n_layers):
        # go through pretraining epochs

        for epoch in range(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in range(n_train_batches):
                c.append(pretraining_fns[i](index=batch_index, lr=pretrain_lr))
            fd.write('Pre-training layer %i, epoch %d, cost %s\n' %
                     (i, epoch, numpy.mean(c, dtype='float64')))
            if (epoch % 5 == 0):
                # every fifth epoch, log the mean free energy of the
                # training subset and of the validation data
                train_free_energy = dbn.rbm_layers[i].free_energy(train)
                valid_free_energy = dbn.rbm_layers[i].free_energy(valid)
                fd.write(
                    'Pre-training layer %i, epoch %d, representative training free energy %s\n'
                    % (i, epoch,
                       numpy.mean(train_free_energy.eval(), dtype='float64')))
                fd.write(
                    'Pre-training layer %i, epoch %d, validation free energy %s\n'
                    % (i, epoch,
                       numpy.mean(valid_free_energy.eval(), dtype='float64')))

        # project both sets through this layer's weights so they can be
        # fed to the next layer's free-energy computation
        valid = T.dot(valid, dbn.rbm_layers[i].W)
        train = T.dot(train, dbn.rbm_layers[i].W)

    end_time = timeit.default_timer()

    # Trailing newlines added here and on the next message: without them
    # three log messages ran together on a single line.
    fd.write('The pretraining code for file ' + os.path.split(__file__)[1] +
             ' ran for %.2fm\n' % ((end_time - start_time) / 60.))

    fd.write('... getting the finetuning functions\n')
    train_fn, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

    fd.write('... finetuning the model\n')
    # early-stopping parameters

    # look at this many examples regardless
    patience = 4 * n_train_batches

    # wait this much longer when a new best is found
    patience_increase = 2.

    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995

    # go through this many minibatches before checking the network on
    # the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    # Initialised so the summary below cannot hit an unbound name when no
    # validation step ever improves (e.g. training_epochs == 0).
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):

            train_fn(minibatch_index)
            # global minibatch counter across epochs
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:

                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses,
                                                  dtype='float64')
                fd.write(
                    'epoch %i, minibatch %i/%i, validation error %f %%\n' %
                    (epoch, minibatch_index + 1, n_train_batches,
                     this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:

                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses, dtype='float64')
                    fd.write(('     epoch %i, minibatch %i/%i, test error of '
                              'best model %f %%\n') %
                             (epoch, minibatch_index + 1, n_train_batches,
                              test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    fd.write(('Optimization complete with best validation score of %f %%, '
              'obtained at iteration %i, '
              'with test performance %f %%\n') %
             (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    fd.write('The fine tuning code for file ' + os.path.split(__file__)[1] +
             ' ran for %.2fm\n' % ((end_time - start_time) / 60.))
Beispiel #12
0
def objective(trial):
    """
    Optuna objective: pick a classifier and its hyper-parameters from the
    trial, then score it with 3-fold cross-validation on the training data.

    :param trial: Optuna trial used to sample the search space
    :return: mean cross-validated accuracy, or ``None`` if the sampled
        classifier name is not handled
    """

    # load data
    train_X, train_y = divide_data(load_data()[0], 'train')

    classifier = trial.suggest_categorical('classifier', [
        'KNeighbor', 'DecisionTree', 'SVM', 'RandomForest', 'LightGBM',
        'Xgboost'
    ])

    # KFold
    kf = KFold(n_splits=3, random_state=SEED, shuffle=True)

    if classifier == 'KNeighbor':
        # KNN params
        knn_n_neighbors = trial.suggest_int('n_neighbors', 3, 10)
        knn_weights = trial.suggest_categorical('weights',
                                                ['uniform', 'distance'])
        knn_algorithm = trial.suggest_categorical(
            'algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
        knn_leaf_size = trial.suggest_int('leaf_size', 10, 40)
        knn_p = trial.suggest_int('p', 1, 2)

        model = KNeighborsClassifier(n_neighbors=knn_n_neighbors,
                                     weights=knn_weights,
                                     algorithm=knn_algorithm,
                                     leaf_size=knn_leaf_size,
                                     p=knn_p)

    elif classifier == 'DecisionTree':
        # DecisionTree params
        dt_criterion = trial.suggest_categorical('criterion',
                                                 ['gini', 'entropy'])
        dt_splitter = trial.suggest_categorical('splitter', ['best', 'random'])

        model = DecisionTreeClassifier(random_state=SEED,
                                       criterion=dt_criterion,
                                       splitter=dt_splitter)

    elif classifier == 'SVM':
        # SVM params
        svm_C = trial.suggest_categorical('svm_C', [0.1, 1, 10, 100, 1000])
        svm_degree = trial.suggest_categorical('svm_degree',
                                               [0, 1, 2, 3, 4, 5, 6])

        model = SVC(random_state=SEED, C=svm_C, degree=svm_degree)

    elif classifier == 'RandomForest':
        # RandomForest params
        rf_max_depth = trial.suggest_categorical('rf_max_depth',
                                                 [80, 90, 100, 110])
        rf_max_features = trial.suggest_categorical('rf_max_features', [2, 3])
        rf_min_samples_leaf = trial.suggest_categorical(
            'rf_min_sample_leaf', [8, 10, 12])
        rf_n_estimators = trial.suggest_categorical('rf_n_estimators',
                                                    [100, 200, 300, 1000])

        model = RandomForestClassifier(random_state=SEED,
                                       max_depth=rf_max_depth,
                                       max_features=rf_max_features,
                                       min_samples_leaf=rf_min_samples_leaf,
                                       n_estimators=rf_n_estimators)

    elif classifier == 'LightGBM':
        # LightGBM params
        lgbm_n_estimators = trial.suggest_categorical('lgbm_n_estimators',
                                                      [100, 500, 1000, 3000])
        lgbm_max_depth = trial.suggest_int('lgbm_max_depth', 20, 200)
        lgbm_learning_rate = trial.suggest_categorical('lgbm_learning_rate',
                                                       [0.01, 0.05, 0.1])
        lgbm_num_leaves = trial.suggest_categorical('lgbm_num_leaves',
                                                    [80, 100, 150, 200])
        lgbm_subsample = trial.suggest_categorical('lgbm_subsample',
                                                   [1, 0.8, 0.7, 0.5])
        lgbm_lambda_l1 = trial.suggest_categorical('lgbm_lambda_l1',
                                                   [0., 0.5, 0.8, 1])
        lgbm_lambda_l2 = trial.suggest_categorical('lgbm_lambda_l2',
                                                   [0., 0.5, 0.8, 1])

        # n_estimators was sampled but never handed to the model, so the
        # trial's recorded value had no influence on the score.
        model = lgb.LGBMClassifier(
            random_state=SEED,
            n_estimators=lgbm_n_estimators,
            max_depth=lgbm_max_depth,
            num_leaves=lgbm_num_leaves,
            learning_rate=lgbm_learning_rate,
            subsample=lgbm_subsample,
            lambda_l1=lgbm_lambda_l1,
            lambda_l2=lgbm_lambda_l2,
        )

    elif classifier == 'Xgboost':
        # Xgboost params
        xgb_max_depth = trial.suggest_int('xgb_max_depth', 10, 200)
        xgb_learning_rate = trial.suggest_categorical('xgb_learning_rate',
                                                      [0.01, 0.05, 0.1])

        model = xgb.XGBClassifier(
            random_state=SEED,
            max_depth=xgb_max_depth,
            learning_rate=xgb_learning_rate,
        )
    else:
        # Not reachable with the categorical choices above; kept as a
        # guard in case the list and the branches drift apart.
        return None

    return cross_val_score(model,
                           train_X,
                           train_y,
                           n_jobs=-1,
                           scoring='accuracy',
                           cv=kf).mean()
Beispiel #13
0
def test_it_should_load_data_when_file_exists():
    """load_data must return something other than None for an existing file."""
    existing_file = path.join(INPUT_PATH, FILENAME)
    assert load_data(existing_file) is not None
Beispiel #14
0
    fd.write("fitting\n")
    start = timeit.default_timer()
    classifier.fit(train_set_x, train_set_y)
    end = timeit.default_timer()
    fd.write("fitting completed. Took  %.2fm\n" % ((end - start) / 60.))
    fd.write("predicting\n")
    # Now predict the value of the digit on the second half:
    predicted = classifier.predict(test_set_x)
    fd.write('prediction complete\n')

    fd.write("done\n\n")
    fd.write(
        "Classification report for classifier %s:\n%s\n" %
        (classifier, metrics.classification_report(test_set_y, predicted)))
    fd.write("Accuracy: %s\n" % metrics.accuracy_score(test_set_y, predicted))
    fd.write("Confusion matrix:\n%s" %
             metrics.confusion_matrix(test_set_y, predicted))


if __name__ == '__main__':
    # Quick run: fit the SVM on the first 1000 training rows only and
    # write the report to hello.log.
    digits = load_data('mnist.pkl.gz')
    train_set_x, train_set_y = digits[0]
    train_set_x = train_set_x[:1000]
    train_set_y = train_set_y[:1000]

    # Replace the training pair in place; this requires load_data() to
    # return a mutable sequence.
    digits[0] = (train_set_x, train_set_y)

    # The context manager closes the log even if run_svm raises.
    with open("hello.log", 'w+') as fd:
        run_svm(digits=digits, C=1, gamma=.001, fd=fd)
Beispiel #15
0
def write_inference(result):
    """Attach ``result`` to the test data as column ``class`` and save it.

    The output is written to ``./build/inference.csv``.
    """
    # Only the second element returned by load_data() is used.
    _, test_frame = load_data()
    test_frame['class'] = result
    test_frame.to_csv('./build/inference.csv')
Beispiel #16
0
def randomData():
    """Return the raw-data computation for a randomly loaded dataset as JSON."""
    print('data')
    payload = calculate.raw_data(load_data(random=True))
    print("sending back random data: ", payload)
    return jsonify(payload)
Beispiel #17
0
def data():
    """Return the raw-data computation for the default dataset as JSON."""
    print('data')
    payload = calculate.raw_data(load_data())
    print("sending back raw data: ", payload)
    return jsonify(payload)
Beispiel #18
0
def d3Version():
    """Render ``d3.html.j2`` with the raw-data computation for the default dataset."""
    print('d3')
    payload = calculate.raw_data(load_data())
    return render_template('d3.html.j2', raw_data=payload)
Beispiel #19
0
def aws(name):
    """Run three rounds of DBN training followed by SVM fits on the full data.

    DBN output goes to ``data/<name>_dbn.log`` and SVM output to
    ``data/<name>_svm.log``.

    :param name: log-file prefix. NOTE(review): the argument is currently
        overwritten by a hard-coded run name below, so the value passed in
        has no effect; confirm whether that override is intentional.
    """

    datasets = load_data('mnist.pkl.gz')

    numpy_rng = numpy.random.RandomState(123)

    def getEvenData(data, sample_sizes):
        """Draw a class-balanced random subset from each split.

        ``sample_sizes`` holds the total sizes for (train, valid, test);
        each is divided evenly across the ten labels 0-9. Currently unused
        because the call in the loop below is commented out.
        """
        trainx, trainy = data[0]
        validx, validy = data[1]
        testx, testy = data[2]

        # permutation() returns the shuffled indices; the previous code
        # shuffled a temporary range() and discarded it.
        train_indices = numpy_rng.permutation(trainx.shape[0])
        valid_indices = numpy_rng.permutation(validx.shape[0])
        test_indices = numpy_rng.permutation(testx.shape[0])

        tx = trainx[train_indices]
        ty = trainy[train_indices]
        vx = validx[valid_indices]
        vy = validy[valid_indices]
        tex = testx[test_indices]
        tey = testy[test_indices]

        # Floor division: these values are used as slice bounds, which
        # must be integers on Python 3.
        train_each_class = sample_sizes[0] // 10
        valid_each_class = sample_sizes[1] // 10
        test_each_class = sample_sizes[2] // 10

        train_new_indices = numpy.where(ty == 0)[0][:train_each_class]
        valid_new_indices = numpy.where(vy == 0)[0][:valid_each_class]
        test_new_indices = numpy.where(tey == 0)[0][:test_each_class]

        for i in range(1, 10):
            train_new_indices = numpy.union1d(
                train_new_indices,
                numpy.where(ty == i)[0][:train_each_class])

            valid_new_indices = numpy.union1d(
                valid_new_indices,
                numpy.where(vy == i)[0][:valid_each_class])
            test_new_indices = numpy.union1d(
                test_new_indices,
                numpy.where(tey == i)[0][:test_each_class])

        newdata = ((tx[train_new_indices], ty[train_new_indices]),
                   (vx[valid_new_indices], vy[valid_new_indices]),
                   (tex[test_new_indices], tey[test_new_indices]))

        return newdata

    prelr = .04
    flr = .07
    s = (5000, 1000, 1000)
    l = [1000, 1000, 1000]
    k = 1
    name = "2017_03_23_woot"

    # Context managers guarantee both logs are closed even if a run fails.
    with open('data/' + name + '_dbn.log', 'w+') as fd, \
            open('data/' + name + '_svm.log', 'w+') as fe:
        for j in range(3):
            fd.write("_______RUN: %d" % j)
            fe.write("_______RUN: %d" % j)

            # data = getEvenData(datasets,s)
            theano_datasets = prep_theano_data(datasets)
            print('done prepping data')
            fd.write("\n\n==========Layers: %s ===========\n\n" % str(l))
            fe.write("\n\n==========Layers %s ===========\n\n" % str(l))
            test_DBN(pretraining_epochs=100,
                     pretrain_lr=prelr,
                     k=k,
                     training_epochs=1000,
                     finetune_lr=flr,
                     datasets=theano_datasets,
                     batch_size=10,
                     hidden_layers=l,
                     fd=fd,
                     normal_distro=False)

            for c in [10, 100]:
                fe.write("\n---------------C: %f -------------\n" % c)

                # The SVM gets the same data the DBN was trained on. The
                # previous code passed `data`, which is undefined here
                # because the getEvenData call above is commented out.
                run_svm(digits=datasets, C=c, fd=fe)
Beispiel #20
0
def main(mapfile_path, cv_idx_path,
         results_dir, model_dir,
         image_size=(28, 28, 1),
         batch_size=32,
         shuffle=False):
    """Train model and predict digits.

    Reads the mapfile and the cross-validation index table, trains the
    configured classifier on the ``fold_01`` train rows with the
    ``fold_01`` test rows as validation data, and writes the per-epoch
    training history to ``./reports/figures/logs.csv``.

    :param mapfile_path: CSV with at least ``filenames`` and ``label`` columns.
    :param cv_idx_path: CSV with a ``fold_01`` column of "train"/"test" markers.
    :param results_dir: existing directory (validated only).
    :param model_dir: existing directory (validated only).
    :param image_size: (height, width, channels); the first two entries
        are used as the generator target size.
    :param batch_size: generator batch size.
    :param shuffle: whether the data generators shuffle.
    :raises NotImplementedError: for any classifier other than "simple_mnist".
    """
    results_dir = Path(results_dir).resolve()
    model_dir = Path(model_dir).resolve()

    # NOTE(review): asserts are stripped under ``python -O``; they are kept
    # so the exception type seen by callers stays AssertionError.
    assert (os.path.isdir(results_dir)), NotADirectoryError
    assert (os.path.isdir(model_dir)), NotADirectoryError

    # read files
    mapfile_df, cv_idx = load_data([mapfile_path, cv_idx_path],
                                   sep=",", header=0,
                                   index_col=0, )
    # load params
    params = load_params()
    classifier = params["classifier"]
    # target_class = params["train_test_split"]["target_class"]
    model_params = params["model_params"]["cnn"]
    random_seed = params["random_seed"]

    # label column must be string
    mapfile_df["label"] = mapfile_df["label"].astype('str')

    # get train and dev indices (used as row positions with .iloc below)
    train_idx = cv_idx[cv_idx["fold_01"] == "train"].index.tolist()
    dev_idx = cv_idx[cv_idx["fold_01"] == "test"].index.tolist()
    train_df = mapfile_df.iloc[train_idx]
    dev_df = mapfile_df.iloc[dev_idx]

    # create train/dev data generators
    train_datagen = ImageDataGenerator(rescale=1. / 255)
    # preprocessing_function
    train_generator = train_datagen.flow_from_dataframe(dataframe=train_df,
                                                        x_col='filenames', y_col='label',
                                                        weight_col=None, target_size=image_size[0:2],
                                                        color_mode='grayscale', classes=None,
                                                        class_mode='categorical', batch_size=batch_size,
                                                        shuffle=shuffle, seed=random_seed,
                                                        interpolation='nearest',
                                                        validate_filenames=True)

    dev_datagen = ImageDataGenerator(rescale=1. / 255)
    # Rescaling is configured on the generator above; `rescale` is not a
    # flow_from_dataframe parameter, so it is no longer passed here.
    dev_generator = dev_datagen.flow_from_dataframe(dataframe=dev_df,
                                                    x_col='filenames', y_col='label',
                                                    weight_col=None, target_size=image_size[0:2],
                                                    color_mode='grayscale', classes=None,
                                                    class_mode='categorical', batch_size=batch_size,
                                                    shuffle=shuffle, seed=random_seed,
                                                    interpolation='nearest',
                                                    validate_filenames=True)

    # create model
    if classifier.lower() == "simple_mnist":
        # simple mnist parameters
        base_filter = 32
        fc_width = 512

        model = simple_mnist(base_filter, fc_width,
                             dropout_rate=model_params["dropout_rate"],
                             learn_rate=model_params["learn_rate"],
                             image_size=image_size)
    else:
        raise NotImplementedError

    # callbacks
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2,
                                                     patience=5, min_lr=0.0001)

    print(model_params["epochs"])
    history = model.fit(train_generator,
                        epochs=model_params["epochs"],
                        verbose=1,
                        shuffle=True,
                        callbacks=[reduce_lr],
                        validation_data=dev_generator)

    # TODO: add a custom GMPR metric and cross-validated training
    # (cross_validate with a scoring dict, pickling the CV estimators to
    # model_dir / "estimator.pkl"); the earlier commented-out sketch of
    # that flow was removed from here.

    # save training history; the index is numbered from the epochs that
    # were actually recorded, so it cannot mismatch the configured count
    logs_df = pd.DataFrame(data=history.history)
    logs_df.index = range(1, len(logs_df) + 1)
    logs_df.index.name = "epochs"
    logs_df.to_csv(Path("./reports/figures/logs.csv").resolve())
Beispiel #21
0
def test_it_should_print_when_file_does_not_exist(capsys):
    """A missing file must produce the expected message on stdout."""
    load_data('unknown_file.txt')

    # readouterr() yields (out, err); only stdout is checked.
    stdout_text = capsys.readouterr()[0]
    assert stdout_text.rstrip() == "Logging: file does not exist..."
Beispiel #22
0
def test_rbm(learning_rate=0.1,
             training_epochs=15,
             dataset='mnist.pkl.gz',
             batch_size=20,
             n_chains=20,
             n_samples=10,
             output_folder='rbm_plots',
             n_hidden=500):
    """Train an RBM on `dataset`, then draw and plot samples from it.

    After every training epoch the current weights are rendered as a tiled
    image and saved as ``filters_at_epoch_<epoch>.png``. Once training is
    done, Gibbs chains initialised from test examples are run and the
    mean-field visible activations are saved as ``samples.png``. Both kinds
    of image are written inside `output_folder`.

    The visible layer is hard-coded to 28 * 28 units and images are tiled
    as 28x28 pixels.

    :param learning_rate: learning rate forwarded to `rbm.get_cost_updates`
    :param training_epochs: number of passes over the training minibatches
    :param dataset: dataset identifier/path forwarded to `load_data`
    :param batch_size: number of rows per training minibatch (also the
        number of rows of the persistent chain used during training)
    :param n_chains: number of consecutive test examples used to
        initialise the sampling chains (one plotted column per chain)
    :param n_samples: number of rows of samples to plot; each row is taken
        after `plot_every` (1000) Gibbs steps
    :param output_folder: directory that receives the PNG files; created
        if it does not exist
    :param n_hidden: number of hidden units of the RBM

    The working directory is switched to `output_folder` while the function
    runs and is always restored to its previous value afterwards, even if
    training or sampling raises.
    """
    datasets = load_data(dataset)

    # only the inputs are needed; the labels are ignored
    train_set_x, _ = datasets[0]
    test_set_x, _ = datasets[2]

    # compute number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # initialize storage for the persistent chain (state = hidden
    # layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x,
              n_visible=28 * 28,
              n_hidden=n_hidden,
              numpy_rng=rng,
              theano_rng=theano_rng)

    # get the cost and the gradient corresponding to one step of CD-15
    cost, updates = rbm.get_cost_updates(learning_rate=learning_rate,
                                         persistent=persistent_chain,
                                         k=15)

    #################################
    #     Training the RBM          #
    #################################
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)

    # Remember where we started so the working directory can be restored
    # exactly, whatever `output_folder` looks like (nested, absolute, ...)
    # and whether or not an exception is raised below.
    original_cwd = os.getcwd()
    os.chdir(output_folder)
    try:
        # start-snippet-5
        # it is ok for a theano function to have no output
        # the purpose of train_rbm is solely to update the RBM parameters
        train_rbm = theano.function(
            [index],
            cost,
            updates=updates,
            givens={
                x: train_set_x[index * batch_size:(index + 1) * batch_size]
            },
            name='train_rbm')

        plotting_time = 0.
        start_time = timeit.default_timer()

        # go through training epochs
        for epoch in range(training_epochs):

            # go through the training set, collecting the cost per batch
            mean_cost = [train_rbm(batch_index)
                         for batch_index in range(n_train_batches)]

            print('Training epoch %d, cost is ' % epoch, numpy.mean(mean_cost))

            # Plot filters after each training epoch
            plotting_start = timeit.default_timer()
            # Construct image from the weight matrix
            image = Image.fromarray(
                tile_raster_images(X=rbm.W.get_value(borrow=True).T,
                                   img_shape=(28, 28),
                                   tile_shape=(10, 10),
                                   tile_spacing=(1, 1)))
            image.save('filters_at_epoch_%i.png' % epoch)
            plotting_stop = timeit.default_timer()
            plotting_time += (plotting_stop - plotting_start)

        end_time = timeit.default_timer()

        # time spent plotting is excluded from the reported training time
        pretraining_time = (end_time - start_time) - plotting_time

        print('Training took %f minutes' % (pretraining_time / 60.))
        # end-snippet-5 start-snippet-6
        #################################
        #     Sampling from the RBM     #
        #################################
        # find out the number of test samples
        number_of_test_samples = test_set_x.get_value(borrow=True).shape[0]

        # pick random test examples, with which to initialize the
        # persistent chain
        test_idx = rng.randint(number_of_test_samples - n_chains)
        persistent_vis_chain = theano.shared(
            numpy.asarray(
                test_set_x.get_value(borrow=True)[test_idx:test_idx +
                                                  n_chains],
                dtype=theano.config.floatX))
        # end-snippet-6 start-snippet-7
        plot_every = 1000
        # define one step of Gibbs sampling (mf = mean-field) define a
        # function that does `plot_every` steps before returning the
        # sample for plotting
        ([presig_hids, hid_mfs, hid_samples, presig_vis, vis_mfs,
          vis_samples], updates) = theano.scan(
              rbm.gibbs_vhv,
              outputs_info=[None, None, None, None, None,
                            persistent_vis_chain],
              n_steps=plot_every,
              name="gibbs_vhv")

        # add to updates the shared variable that takes care of our
        # persistent chain :.
        updates.update({persistent_vis_chain: vis_samples[-1]})
        # construct the function that implements our persistent chain.
        # we generate the "mean field" activations for plotting and the
        # actual samples for reinitializing the state of our persistent chain
        sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]],
                                    updates=updates,
                                    name='sample_fn')

        # create a space to store the image for plotting ( we need to leave
        # room for the tile_spacing as well)
        image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1),
                                 dtype='uint8')
        for idx in range(n_samples):
            # generate `plot_every` intermediate samples that we discard,
            # because successive samples in the chain are too correlated;
            # only the mean-field activations are plotted
            vis_mf, _ = sample_fn()
            print(' ... plotting sample %d' % idx)
            image_data[29 * idx:29 * idx + 28, :] = tile_raster_images(
                X=vis_mf,
                img_shape=(28, 28),
                tile_shape=(1, n_chains),
                tile_spacing=(1, 1))

        # construct image
        image = Image.fromarray(image_data)
        image.save('samples.png')
        # end-snippet-7
    finally:
        os.chdir(original_cwd)
Beispiel #23
0
def train_model(
    path,
    x_columns,
    y_column,
    output_path="models",
    test_size=0.3,
    random_state=None,
    cross_validation_folds=5,
    verbose=2,
):
    """Fit a grid-search pipeline on the data at `path` and persist it.

    Steps, in order: load the requested columns, preprocess them, make a
    train/test split stratified on the target, fit the grid-search
    pipeline on the training part, write out the best parameters, evaluate
    on both splits and finally dump the fitted pipeline to
    ``<output_path>/trained.model`` (which is also registered via
    `cdsw.track_file`).

    Args:
        path: location of the input data, forwarded to `load_data`.
        x_columns: list of feature column names.
        y_column: name of the target column.
        output_path: directory prefix for the parameter file, evaluation
            output and the dumped model.
        test_size: forwarded to `train_test_split`.
        random_state: seed forwarded to `train_test_split` and
            `get_base_estimator`.
        cross_validation_folds: forwarded to `model_grid_search_cv`.
        verbose: verbosity forwarded to `model_grid_search_cv`.

    Raises:
        Exception: any error raised while fitting the pipeline is logged
            with its traceback and then re-raised.
    """
    logging.info("Begin `train_model`")
    logging.debug(f"Load Data from {path}")
    logging.debug(f"x_columns: {x_columns}")
    logging.debug(f"y_column: {y_column}")

    wanted_columns = x_columns + [y_column]
    frame = load_data(path, wanted_columns)

    logging.info("Preprocessing Inputs")
    frame, new_X_columns = preprocess_model_inputs(frame, x_columns, y_column)

    logging.debug(f"new_X_columns: {new_X_columns}")
    features = frame[new_X_columns]
    target = frame[y_column]

    logging.info("Splitting data into test and train sets")
    logging.info(f"test_size: {test_size}")
    logging.info(f"random_state: {random_state}")

    # stratify on the target so both splits share its class distribution
    split = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )
    train_features, test_features, train_target, test_target = split

    search_space = get_grid_search_params()

    logging.info("Getting Grid Search CV Pipeline")
    logging.debug(f"cross_validation_folds: {cross_validation_folds}")

    base_estimator = get_base_estimator(random_state)
    fitted_pipeline = model_grid_search_cv(
        new_X_columns,
        base_estimator,
        search_space,
        verbose=verbose,
        cross_validation_folds=cross_validation_folds,
    )

    try:
        fitted_pipeline.fit(train_features, train_target)
    except Exception as fit_error:
        logging.exception("Exception during pipeline fitting")
        raise fit_error

    grid_search = fitted_pipeline["grid_search_cv"]

    logging.info("Finished Grid Search CV")
    logging.info(f"Best Score: {grid_search.best_score_}")
    logging.info(f"Best Params: {grid_search.best_params_}")

    write_out_model_params(grid_search.best_params_, output_path)

    for split_name, split_features, split_target in (
        ("train", train_features, train_target),
        ("test", test_features, test_target),
    ):
        eval_predictions(split_name, fitted_pipeline, split_features,
                         split_target, output_path)

    model_file = f"{output_path}/trained.model"
    logging.info(f"Writing out model to {model_file}")

    joblib.dump(fitted_pipeline, model_file)
    cdsw.track_file(model_file)

    logging.info("End `train_model`")
Beispiel #24
0
            if patience <= iter:
                done_looping = True
                break

    end_time = timeit.default_timer()
    fd.write(('Optimization complete with best validation score of %f %%, '
              'obtained at iteration %i, '
              'with test performance %f %%\n') %
             (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    fd.write('The fine tuning code for file ' + os.path.split(__file__)[1] +
             ' ran for %.2fm\n' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    datasets = load_data('mnist.pkl.gz')
    fd = open('dbn.log', 'w+')
    # trainx,trainy = datasets[0]
    # validx,validy = datasets[1]
    # testx,testy = datasets[2]
    #
    # datasets = ((trainx[:100],trainy[:100]),(validx[:50],validy[:50]),(testx[:50],testy[:50]))
    theano_datasets = prep_theano_data(datasets)

    test_DBN(pretraining_epochs=100,
             pretrain_lr=0.01,
             k=1,
             training_epochs=1000,
             finetune_lr=0.1,
             datasets=theano_datasets,
             batch_size=10,