def run_inference():
    """Train a model and return its predictions for the test split.

    The data is loaded, the test portion is turned into a feature matrix via
    ``divide_data(..., 'test')``, a model is obtained from ``run_training()``
    and its ``predict`` output on those features is returned.
    """
    # Only the second element returned by load_data() is needed here.
    _, held_out = load_data()
    features = divide_data(held_out, 'test')
    return run_training().predict(features)
def run_svm(digits=None, gamma=.001, C=1, fd=sys.stdout):
    """Fit an SVC on the training split and write an evaluation report to *fd*.

    :param digits: indexable dataset; ``digits[0]`` is the ``(x, y)`` training
        pair and ``digits[2]`` the ``(x, y)`` test pair. When ``None`` the
        data is read with ``load_data('mnist.pkl.gz')``.
    :param gamma: kernel coefficient forwarded to ``svm.SVC``.
    :param C: regularisation parameter forwarded to ``svm.SVC``.
    :param fd: writable text stream receiving progress lines and the report.
    """
    if digits is None:
        digits = load_data('mnist.pkl.gz')
    x_train, y_train = digits[0]
    x_test, y_test = digits[2]
    model = svm.SVC(gamma=gamma, C=C)

    fd.write("fitting\n")
    fit_started = timeit.default_timer()
    model.fit(x_train, y_train)
    fit_minutes = (timeit.default_timer() - fit_started) / 60.
    fd.write("fitting completed. Took %.2fm\n" % fit_minutes)

    fd.write("predicting\n")
    # Predict the labels of the held-out test split.
    y_pred = model.predict(x_test)
    fd.write('prediction complete\n')
    fd.write("done\n\n")

    # Each metric is computed right before it is written so that a failure
    # in a later metric does not suppress the earlier report sections.
    report = metrics.classification_report(y_test, y_pred)
    fd.write("Classification report for classifier %s:\n%s\n"
             % (model, report))
    accuracy = metrics.accuracy_score(y_test, y_pred)
    fd.write("Accuracy: %s\n" % accuracy)
    confusion = metrics.confusion_matrix(y_test, y_pred)
    fd.write("Confusion matrix:\n%s" % confusion)
def test_load_data_subset_pass():
    """load_data must return only the requested columns."""
    from src.data import load_data

    wanted = ["number"]
    frame = load_data("unittests/data/mock_data.csv", wanted)
    # Fail with an explicit message instead of a bare assertion error.
    if list(frame.columns) != wanted:
        pytest.fail("load_data didn't subset the return dataframe")
def test_load_data_returns_dataframe():
    """load_data must hand back a pandas DataFrame."""
    from src.data import load_data

    loaded = load_data("unittests/data/mock_data.csv")
    # Fail with an explicit message instead of a bare assertion error.
    if not isinstance(loaded, pd.DataFrame):
        pytest.fail("Expected Pandas DataFrame from load_data()")
def test_aws():
    """Benchmark the DBN and the SVM on small random subsets of the data.

    For each ``(train, valid, test)`` sample-size triple a random subset is
    drawn, ``test_DBN`` is run on it (log: ``data/aws_dbn_test.log``) and
    ``run_svm`` is run for several values of ``C``
    (log: ``data/aws_svm_test.log``).
    """
    datasets = load_data('mnist.pkl.gz')
    trainx, trainy = datasets[0]
    validx, validy = datasets[1]
    testx, testy = datasets[2]

    numpy_rng = numpy.random.RandomState(123)
    # permutation() returns the shuffled indices. The previous code shuffled
    # a temporary range() object, which left the index sequences unshuffled
    # (and fails on Python 3, where range objects are immutable).
    train_indices = numpy_rng.permutation(trainx.shape[0])
    valid_indices = numpy_rng.permutation(validx.shape[0])
    test_indices = numpy_rng.permutation(testx.shape[0])

    samples = [(10, 10, 10), (50, 10, 10), (100, 10, 10)]

    # Context managers guarantee both logs are flushed and closed even when
    # a training run raises.
    with open('data/aws_dbn_test.log', 'w+') as fd, \
            open('data/aws_svm_test.log', 'w+') as fe:
        for s in samples:
            train_sel = train_indices[:s[0]]
            valid_sel = valid_indices[:s[1]]
            test_sel = test_indices[:s[2]]
            data = ((trainx[train_sel], trainy[train_sel]),
                    (validx[valid_sel], validy[valid_sel]),
                    (testx[test_sel], testy[test_sel]))
            theano_datasets = prep_theano_data(data)

            header = "\n\n==========Samples: %s ===========\n\n" % str(s)
            fd.write(header)
            fe.write(header)

            test_DBN(pretraining_epochs=100, pretrain_lr=0.01, k=1,
                     training_epochs=1000, finetune_lr=0.1,
                     datasets=theano_datasets, batch_size=10,
                     hidden_layers=[100, 100, 100], fd=fd)

            for c in [.01, .1, 1, 10, 100]:
                fe.write("\n---------------C: %f -------------\n" % c)
                run_svm(digits=data, C=c, fd=fe)
def test_load_data():
    """Loaded file content must equal FILE_DATA, ignoring spaces and trailing newlines."""

    # TODO: replace is a quick hack, needs fix
    def _normalise(text):
        return text.rstrip('\n').replace(" ", "")

    loaded = load_data(path.join('input', FILENAME))
    assert _normalise(loaded) == _normalise(FILE_DATA)
def run_training():
    """Train the classifier selected by ``best_param()`` and return it.

    The training split is converted with ``divide_data(..., 'train')`` into
    features and labels, which are passed to ``train_model`` together with
    the parameter mapping and its ``'classifier'`` entry.
    """
    best = best_param()
    # Only the first element returned by load_data() is needed here.
    training_frame, _ = load_data()
    features, labels = divide_data(training_frame, 'train')
    return train_model(best['classifier'], best, features, labels)
model_scores = [] # K-fold CV kf = KFold(n_splits=8, random_state=SEED, shuffle=True) for train_index, test_index in kf.split(X): train_X, test_X = X.iloc[train_index], X.iloc[test_index] train_y, test_y = y.iloc[train_index], y.iloc[test_index] model.fit(train_X, train_y) print(f"Model ACC: {model.score(test_X, test_y)}%") model_scores.append(model.score(test_X, test_y)) for sc in model_scores: print(f'{sc} %') print(f"Max ACC: {max(model_scores)}") return model def run_training(): param = best_param() train_data, test_data = load_data() train_X, train_y = divide_data(train_data, 'train') model = train_model(param['classifier'], param, train_X, train_y) return model if __name__ == '__main__': param = best_param() train_data, test_data = load_data() train_X, train_y = divide_data(train_data, 'train') train_model(param['classifier'], param, train_X, train_y)
def test_preprocessing_pipeline():
    """
    Verify we can read in and preprocess the data as expected.

    Loads ``integrationtests/test_data.csv`` restricted to the ``domain`` and
    ``class`` columns, runs it through a pipeline containing the
    ``preprocess()`` step, and checks the feature names, the output type and
    finally the cleaned-up data against
    ``integrationtests/test_preprocessing_expected.csv``.

    :return: the pipeline that was exercised
    """
    setup_logging(logging.DEBUG)
    logging.debug("test_preprocessing_pipeline")
    test_data_path = "integrationtests/test_data.csv"
    expected_output_path = "integrationtests/test_preprocessing_expected.csv"
    logging.debug(f"test_data_path: {test_data_path}")
    logging.debug(f"expected_output_path: {expected_output_path}")
    logging.debug("Loading test data")
    df = load_data(test_data_path, ["domain", "class"])
    # NOTE(review): the step is later looked up as pipeline["preprocess"], so
    # preprocess() presumably returns a ("preprocess", transformer) pair —
    # confirm against its definition.
    pipeline = Pipeline([
        preprocess(),
    ])
    logging.debug("Applying pipeline transformations")
    # NOTE(review): transform() is called without a preceding fit() —
    # presumably the preprocess step is stateless; confirm.
    pipeline_output = pipeline.transform(df)
    column_names = pipeline["preprocess"].get_feature_names()
    logging.debug("Pipeline transformation complete")
    logging.debug(f"column_names: {column_names}")
    # Each check is wrapped so the failure is logged (with traceback, via
    # logging.exception inside the except block) before pytest is told.
    try:
        assert (column_names == ['class', 'domain'])
    except AssertionError:
        message = f"Didn't get the expected `get_feature_names` from pipeline, got {column_names}"
        logging.exception(message)
        pytest.fail(message)
    try:
        assert (isinstance(pipeline_output, np.ndarray))
    except AssertionError:
        message = f"Didn't get expected type from pipeline, got {type(pipeline_output)}"
        logging.exception(message)
        pytest.fail(message)
    logging.debug(pipeline_output)
    logging.debug("Creating DataFrame from Pipeline output")
    result_df = pd.DataFrame(pipeline_output, columns=column_names)
    logging.debug("Applying `post_process_cleanup`")
    result_df = post_process_cleanup(result_df)
    logging.debug("Loading validation DataFrame")
    expected_df = pd.read_csv(expected_output_path)
    # Values must match the stored expectation; dtypes are deliberately not
    # compared (check_dtype=False).
    try:
        pd.testing.assert_frame_equal(result_df, expected_df, check_dtype=False)
    except AssertionError:
        message = "Data resulting from transformation did not match expected."
        logging.exception(message)
        pytest.fail(message)
    return pipeline
def main(mapfile, output_dir=None):
    """Split data into train and dev sets.

    A stratified K-fold split is computed and stored as one column per fold
    (``fold_01``, ``fold_02``, ...) whose values are ``"train"`` or ``"test"``
    for every row position (index ``image_id``).

    :param mapfile: path to a CSV file (read with ``load_data``) or an
        already loaded DataFrame. A DataFrame argument is copied, not
        modified.
    :param output_dir: when truthy, an existing directory in which
        ``split_train_dev.csv`` is written.
    :return: the split DataFrame when ``output_dir`` is not given, otherwise
        ``None``.
    :raises FileNotFoundError: ``mapfile`` is a string that is not a file.
    :raises NotADirectoryError: ``output_dir`` is given but not a directory.
    """
    if isinstance(mapfile, str):
        # Raise explicitly: an assert would vanish under ``python -O``.
        if not os.path.isfile(mapfile):
            raise FileNotFoundError(mapfile)
        # read file
        train_df = load_data(mapfile, sep=",", header=0, index_col=0)
    else:
        # Work on a copy: the label column is popped below and the caller's
        # frame must not lose it.
        train_df = mapfile.copy()

    # set index
    train_df.index.name = "index"

    # load params (read-only; the loaded mapping is not modified)
    params = load_params()
    params_split = params['train_test_split']
    random_seed = params["random_seed"]

    # get filenames and dependent variables (labels)
    train_labels = train_df.pop(params_split["target_class"])
    train_files = train_df

    # K-fold split into train and dev sets stratified by train_labels
    # using random seed for reproducibility
    skf = StratifiedKFold(n_splits=params_split['n_split'],
                          random_state=random_seed,
                          shuffle=params_split['shuffle'])

    # create splits
    split_df = pd.DataFrame()
    fold_indices = skf.split(train_files, train_labels)
    for n_fold, (train_idx, test_idx) in enumerate(fold_indices):
        fold_name = f"fold_{n_fold + 1:02d}"

        # One column for this fold. pd.concat replaces DataFrame.append,
        # which was removed in pandas 2.0.
        temp_df = pd.concat([
            pd.DataFrame({"image_id": train_idx, fold_name: "train"}),
            pd.DataFrame({"image_id": test_idx, fold_name: "test"}),
        ]).set_index("image_id")

        # first fold starts the table, later folds are joined as columns
        split_df = temp_df if n_fold == 0 else split_df.join(temp_df)

    # sort by index
    split_df = split_df.sort_index()

    if output_dir:
        if not os.path.isdir(output_dir):
            raise NotADirectoryError(output_dir)
        output_dir = Path(output_dir).resolve()

        # save output dataframe with indices for train and dev sets
        split_df.to_csv(output_dir.joinpath("split_train_dev.csv"),
                        na_rep="nan")
        return None
    return split_df
def test_DBN(pretrain_lr=0.01, pretraining_epochs=100, k=1, finetune_lr=0.1,
             training_epochs=1000, hidden_layers=None, datasets=None,
             batch_size=10, fd=sys.stdout, normal_distro=False):
    """Pre-train and fine-tune a Deep Belief Network, logging progress to *fd*.

    :param pretrain_lr: learning rate used during layer-wise pre-training.
    :param pretraining_epochs: number of pre-training epochs per layer.
    :param k: ``k`` forwarded to ``dbn.pretraining_functions``.
    :param finetune_lr: learning rate used during fine-tuning.
    :param training_epochs: maximum number of fine-tuning epochs.
    :param hidden_layers: hidden layer sizes; defaults to
        ``[1000, 1000, 1000]``.
    :param datasets: ``(train, valid, test)`` pairs as produced by
        ``prep_theano_data``; loaded from ``mnist.pkl.gz`` when ``None``.
    :param batch_size: minibatch size.
    :param fd: writable text stream receiving all log lines.
    :param normal_distro: forwarded to ``DBN`` as ``normal``.
    """
    # A fresh list per call avoids sharing one mutable default between calls.
    if hidden_layers is None:
        hidden_layers = [1000, 1000, 1000]
    if datasets is None:
        datasets = prep_theano_data(load_data('mnist.pkl.gz'))

    train_set_x, _ = datasets[0]
    valid_set_x, _ = datasets[1]

    # compute number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    numpy_rng = numpy.random.RandomState(123)
    fd.write('... building the model\n')
    # construct the Deep Belief Network
    dbn = DBN(numpy_rng=numpy_rng,
              n_ins=28 * 28,
              hidden_layers_size=hidden_layers,
              n_outs=10,
              normal=normal_distro)

    fd.write('... getting the pretraining functions\n')
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x,
                                                batch_size=batch_size,
                                                k=k)

    num_of_train_rows = (train_set_x.shape.eval())[0]
    num_of_valid_rows = (valid_set_x.shape.eval())[0]
    # Random training subset of the same size as the validation set, used to
    # compare free energies. permutation() returns the shuffled indices; the
    # previous shuffle(range(...)) shuffled a throwaway object (and raises on
    # Python 3), so the subset was never random.
    indices = numpy_rng.permutation(num_of_train_rows)
    train = train_set_x[indices[:num_of_valid_rows]]
    valid = valid_set_x

    fd.write('... pre-training the model\n')
    start_time = timeit.default_timer()
    # Pre-train layer-wise
    for i in range(dbn.n_layers):
        # go through pretraining epochs
        for epoch in range(pretraining_epochs):
            # go through the training set
            c = []
            for batch_index in range(n_train_batches):
                c.append(pretraining_fns[i](index=batch_index,
                                            lr=pretrain_lr))
            fd.write('Pre-training layer %i, epoch %d, cost %s\n'
                     % (i, epoch, numpy.mean(c, dtype='float64')))

            if (epoch % 5 == 0):
                train_free_energy = dbn.rbm_layers[i].free_energy(train)
                valid_free_energy = dbn.rbm_layers[i].free_energy(valid)
                fd.write(
                    'Pre-training layer %i, epoch %d, representative training free energy %s\n'
                    % (i, epoch,
                       numpy.mean(train_free_energy.eval(), dtype='float64')))
                fd.write(
                    'Pre-training layer %i, epoch %d, validation free energy %s\n'
                    % (i, epoch,
                       numpy.mean(valid_free_energy.eval(), dtype='float64')))

        # Project both monitoring sets through this layer's weights so they
        # have the input width expected by the next layer's RBM.
        valid = T.dot(valid, dbn.rbm_layers[i].W)
        train = T.dot(train, dbn.rbm_layers[i].W)

    end_time = timeit.default_timer()
    fd.write('The pretraining code for file ' + os.path.split(__file__)[1] +
             ' ran for %.2fm' % ((end_time - start_time) / 60.))

    fd.write('... getting the finetuning functions')
    train_fn, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr)

    fd.write('... finetuning the model\n')
    # early-stopping parameters
    # look as this many examples regardless
    patience = 4 * n_train_batches
    # wait this much longer when a new best is found
    patience_increase = 2.
    # a relative improvement of this much is considered significant
    improvement_threshold = 0.995
    # go through this many minibatches before checking the network on
    # the validation set; in this case we check every epoch
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = numpy.inf
    # Initialised so the final report cannot hit an unbound name when no
    # validation step ever runs (e.g. training_epochs == 0).
    best_iter = 0
    test_score = 0.
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0

    while (epoch < training_epochs) and (not done_looping):
        epoch = epoch + 1
        for minibatch_index in range(n_train_batches):
            train_fn(minibatch_index)
            # global minibatch counter (renamed from ``iter`` so the builtin
            # is not shadowed)
            iteration = (epoch - 1) * n_train_batches + minibatch_index

            if (iteration + 1) % validation_frequency == 0:
                validation_losses = validate_model()
                this_validation_loss = numpy.mean(validation_losses,
                                                  dtype='float64')
                fd.write(
                    'epoch %i, minibatch %i/%i, validation error %f %%\n'
                    % (epoch, minibatch_index + 1, n_train_batches,
                       this_validation_loss * 100.))

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if (this_validation_loss <
                            best_validation_loss * improvement_threshold):
                        patience = max(patience,
                                       iteration * patience_increase)

                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iteration

                    # test it on the test set
                    test_losses = test_model()
                    test_score = numpy.mean(test_losses, dtype='float64')
                    fd.write(('     epoch %i, minibatch %i/%i, test error of '
                              'best model %f %%\n')
                             % (epoch, minibatch_index + 1, n_train_batches,
                                test_score * 100.))

            if patience <= iteration:
                done_looping = True
                break

    end_time = timeit.default_timer()
    fd.write(('Optimization complete with best validation score of %f %%, '
              'obtained at iteration %i, '
              'with test performance %f %%\n')
             % (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    fd.write('The fine tuning code for file ' + os.path.split(__file__)[1] +
             ' ran for %.2fm\n' % ((end_time - start_time) / 60.))
def objective(trial):
    """
    Fine tuning w. Optuna

    Samples a classifier family and its hyper-parameters from the trial,
    builds the model and scores it with 3-fold cross-validated accuracy on
    the training split.

    :param trial: optuna trial used to sample the hyper-parameters
    :return: mean cross-validated accuracy, or ``None`` when the sampled
        classifier name is not handled
    """
    # load data
    train_X, train_y = divide_data(load_data()[0], 'train')

    classifier = trial.suggest_categorical('classifier', [
        'KNeighbor', 'DecisionTree', 'SVM', 'RandomForest', 'LightGBM',
        'Xgboost'
    ])

    # KFold
    kf = KFold(n_splits=3, random_state=SEED, shuffle=True)

    if classifier == 'KNeighbor':
        # KNN params
        knn_n_neighbors = trial.suggest_int('n_neighbors', 3, 10)
        knn_weights = trial.suggest_categorical('weights',
                                                ['uniform', 'distance'])
        knn_algorithm = trial.suggest_categorical(
            'algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
        knn_leaf_size = trial.suggest_int('leaf_size', 10, 40)
        knn_p = trial.suggest_int('p', 1, 2)

        model = KNeighborsClassifier(n_neighbors=knn_n_neighbors,
                                     weights=knn_weights,
                                     algorithm=knn_algorithm,
                                     leaf_size=knn_leaf_size,
                                     p=knn_p)
    elif classifier == 'DecisionTree':
        # DecisionTree params
        dt_criterion = trial.suggest_categorical('criterion',
                                                 ['gini', 'entropy'])
        dt_splitter = trial.suggest_categorical('splitter',
                                                ['best', 'random'])

        model = DecisionTreeClassifier(random_state=SEED,
                                       criterion=dt_criterion,
                                       splitter=dt_splitter)
    elif classifier == 'SVM':
        # SVM params
        # NOTE(review): no kernel is passed, so SVC uses its default kernel;
        # `degree` presumably only matters for a polynomial kernel — confirm
        # whether tuning it here has any effect.
        svm_C = trial.suggest_categorical('svm_C', [0.1, 1, 10, 100, 1000])
        svm_degree = trial.suggest_categorical('svm_degree',
                                               [0, 1, 2, 3, 4, 5, 6])

        model = SVC(random_state=SEED, C=svm_C, degree=svm_degree)
    elif classifier == 'RandomForest':
        # RandomForest params
        rf_max_depth = trial.suggest_categorical('rf_max_depth',
                                                 [80, 90, 100, 110])
        rf_max_features = trial.suggest_categorical('rf_max_features', [2, 3])
        # The trial key 'rf_min_sample_leaf' (singular) is kept as-is because
        # code consuming the study's best parameters may look it up by name.
        rf_min_samples_leaf = trial.suggest_categorical(
            'rf_min_sample_leaf', [8, 10, 12])
        rf_n_estimators = trial.suggest_categorical('rf_n_estimators',
                                                    [100, 200, 300, 1000])

        model = RandomForestClassifier(random_state=SEED,
                                       max_depth=rf_max_depth,
                                       max_features=rf_max_features,
                                       min_samples_leaf=rf_min_samples_leaf,
                                       n_estimators=rf_n_estimators)
    elif classifier == 'LightGBM':
        # LightGBM params
        lgbm_n_estimators = trial.suggest_categorical(
            'lgbm_n_estimators', [100, 500, 1000, 3000])
        lgbm_max_depth = trial.suggest_int('lgbm_max_depth', 20, 200)
        lgbm_learning_rate = trial.suggest_categorical(
            'lgbm_learning_rate', [0.01, 0.05, 0.1])
        lgbm_num_leaves = trial.suggest_categorical('lgbm_num_leaves',
                                                    [80, 100, 150, 200])
        lgbm_subsample = trial.suggest_categorical('lgbm_subsample',
                                                   [1, 0.8, 0.7, 0.5])
        lgbm_lambda_l1 = trial.suggest_categorical('lgbm_lambda_l1',
                                                   [0., 0.5, 0.8, 1])
        lgbm_lambda_l2 = trial.suggest_categorical('lgbm_lambda_l2',
                                                   [0., 0.5, 0.8, 1])

        model = lgb.LGBMClassifier(
            random_state=SEED,
            # Previously sampled but never forwarded, so the search over the
            # number of estimators had no effect on the model.
            n_estimators=lgbm_n_estimators,
            max_depth=lgbm_max_depth,
            num_leaves=lgbm_num_leaves,
            learning_rate=lgbm_learning_rate,
            subsample=lgbm_subsample,
            lambda_l1=lgbm_lambda_l1,
            lambda_l2=lgbm_lambda_l2,
        )
    elif classifier == 'Xgboost':
        # Xgboost params
        xgb_max_depth = trial.suggest_int('xgb_max_depth', 10, 200)
        xgb_learning_rate = trial.suggest_categorical('xgb_learning_rate',
                                                      [0.01, 0.05, 0.1])

        model = xgb.XGBClassifier(
            random_state=SEED,
            max_depth=xgb_max_depth,
            learning_rate=xgb_learning_rate,
        )
    else:
        # Not reachable with the categorical choices above; kept as a guard.
        return None

    return cross_val_score(model,
                           train_X,
                           train_y,
                           n_jobs=-1,
                           scoring='accuracy',
                           cv=kf).mean()
def test_it_should_load_data_when_file_exists():
    """load_data must return something other than None for the known input file."""
    target = path.join(INPUT_PATH, FILENAME)
    assert load_data(target) is not None
fd.write("fitting\n") start = timeit.default_timer() classifier.fit(train_set_x, train_set_y) end = timeit.default_timer() fd.write("fitting completed. Took %.2fm\n" % ((end - start) / 60.)) fd.write("predicting\n") # Now predict the value of the digit on the second half: predicted = classifier.predict(test_set_x) fd.write('prediction complete\n') fd.write("done\n\n") fd.write( "Classification report for classifier %s:\n%s\n" % (classifier, metrics.classification_report(test_set_y, predicted))) fd.write("Accuracy: %s\n" % metrics.accuracy_score(test_set_y, predicted)) fd.write("Confusion matrix:\n%s" % metrics.confusion_matrix(test_set_y, predicted)) if __name__ == '__main__': digits = load_data('mnist.pkl.gz') fd = open("hello.log", 'w+') train_set_x, train_set_y = digits[0] train_set_x = train_set_x[:1000] train_set_y = train_set_y[:1000] digits[0] = (train_set_x, train_set_y) run_svm(digits=digits, C=1, gamma=.001, fd=fd) fd.close()
def write_inference(result):
    """Attach *result* as the ``class`` column of the test data and save it.

    The augmented test data is written to ``./build/inference.csv``.

    :param result: values assigned to the ``class`` column of the test data
    """
    # Only the second element returned by load_data() is needed here.
    _, frame = load_data()
    frame['class'] = result
    frame.to_csv('./build/inference.csv')
def randomData():
    """Return the raw data computed from a ``random=True`` load as JSON."""
    print('data')
    payload = calculate.raw_data(load_data(random=True))
    print("sending back random data: ", payload)
    return jsonify(payload)
def data():
    """Return the raw data computed from the default load as JSON."""
    print('data')
    payload = calculate.raw_data(load_data())
    print("sending back raw data: ", payload)
    return jsonify(payload)
def d3Version():
    """Render ``d3.html.j2`` with the raw data computed from the default load."""
    print('d3')
    payload = calculate.raw_data(load_data())
    return render_template('d3.html.j2', raw_data=payload)
def aws(name):
    """Run three DBN + SVM benchmark passes on the data and log the results.

    DBN output goes to ``data/<name>_dbn.log`` and SVM output to
    ``data/<name>_svm.log``.

    :param name: prefix of the two log files. It is now honoured; it used to
        be overwritten by a hard-coded value inside the function.
    """
    datasets = load_data('mnist.pkl.gz')
    numpy_rng = numpy.random.RandomState(123)

    def getEvenData(data, sample_sizes):
        """Return a shuffled subset of *data* with equal counts per digit.

        ``sample_sizes`` is ``(train, valid, test)``; each split receives up
        to a tenth of its size for every label 0-9.
        NOTE: currently not called; see the commented hint in the run loop.
        """
        trainx, trainy = data[0]
        validx, validy = data[1]
        testx, testy = data[2]

        # permutation() returns shuffled indices; shuffle(range(...)) only
        # shuffled a temporary and fails on Python 3.
        train_order = numpy_rng.permutation(trainx.shape[0])
        valid_order = numpy_rng.permutation(validx.shape[0])
        test_order = numpy_rng.permutation(testx.shape[0])

        tx, ty = trainx[train_order], trainy[train_order]
        vx, vy = validx[valid_order], validy[valid_order]
        tex, tey = testx[test_order], testy[test_order]

        def pick(labels, per_class):
            """Indices of the first *per_class* rows of every label 0-9."""
            chosen = numpy.where(labels == 0)[0][:per_class]
            for digit in range(1, 10):
                chosen = numpy.union1d(
                    chosen, numpy.where(labels == digit)[0][:per_class])
            return chosen

        # Integer division: the counts are used as slice bounds.
        train_sel = pick(ty, sample_sizes[0] // 10)
        valid_sel = pick(vy, sample_sizes[1] // 10)
        test_sel = pick(tey, sample_sizes[2] // 10)

        return ((tx[train_sel], ty[train_sel]),
                (vx[valid_sel], vy[valid_sel]),
                (tex[test_sel], tey[test_sel]))

    prelr = .04
    flr = .07
    layer_sizes = [1000, 1000, 1000]
    k = 1

    # Context managers guarantee both logs are flushed and closed even when
    # a training run raises.
    with open('data/' + name + '_dbn.log', 'w+') as fd, \
            open('data/' + name + '_svm.log', 'w+') as fe:
        for j in range(3):
            fd.write("_______RUN: %d" % j)
            fe.write("_______RUN: %d" % j)
            # To benchmark on a class-balanced subset instead of the full
            # data, build it with getEvenData(datasets, (5000, 1000, 1000))
            # and pass it to both prep_theano_data and run_svm.
            theano_datasets = prep_theano_data(datasets)
            print('done prepping data')
            fd.write("\n\n==========Layers: %s ===========\n\n"
                     % str(layer_sizes))
            fe.write("\n\n==========Layers %s ===========\n\n"
                     % str(layer_sizes))
            test_DBN(pretraining_epochs=100, pretrain_lr=prelr, k=k,
                     training_epochs=1000, finetune_lr=flr,
                     datasets=theano_datasets, batch_size=10,
                     hidden_layers=layer_sizes, fd=fd, normal_distro=False)

            for c in [10, 100]:
                fe.write("\n---------------C: %f -------------\n" % c)
                # The SVM gets the same data the DBN was trained on. The
                # previous ``digits=data`` referenced a name that was never
                # assigned and raised NameError.
                run_svm(digits=datasets, C=c, fd=fe)
def main(mapfile_path, cv_idx_path, results_dir, model_dir,
         image_size=(28, 28, 1), batch_size=32, shuffle=False):
    """Train model and predict digits.

    Trains the configured classifier on the ``fold_01`` train rows,
    validates on the ``fold_01`` test rows and writes the per-epoch training
    history to ``./reports/figures/logs.csv``.

    :param mapfile_path: CSV with at least ``filenames`` and ``label`` columns.
    :param cv_idx_path: CSV holding the fold assignments (``fold_01`` column).
    :param results_dir: existing directory (validated only).
    :param model_dir: existing directory (validated only).
    :param image_size: image shape; the first two entries are the target size.
    :param batch_size: generator batch size.
    :param shuffle: whether the generators shuffle their rows.
    :raises NotADirectoryError: ``results_dir`` or ``model_dir`` is missing.
    :raises NotImplementedError: the configured classifier is not supported.
    """
    results_dir = Path(results_dir).resolve()
    model_dir = Path(model_dir).resolve()

    # Raise explicitly: an assert would vanish under ``python -O``.
    # NOTE(review): both directories are only validated here; the history is
    # written to a fixed path below.
    for required_dir in (results_dir, model_dir):
        if not os.path.isdir(required_dir):
            raise NotADirectoryError(str(required_dir))

    # read files
    mapfile_df, cv_idx = load_data([mapfile_path, cv_idx_path],
                                   sep=",", header=0, index_col=0)

    # load params
    params = load_params()
    classifier = params["classifier"]
    model_params = params["model_params"]["cnn"]
    random_seed = params["random_seed"]

    # label column must be string
    mapfile_df["label"] = mapfile_df["label"].astype('str')

    # get train and dev indices
    fold = cv_idx["fold_01"]
    train_idx = cv_idx[fold == "train"].index.tolist()
    dev_idx = cv_idx[fold == "test"].index.tolist()

    # the fold index values are used as row positions of the mapfile
    train_df = mapfile_df.iloc[train_idx]
    dev_df = mapfile_df.iloc[dev_idx]

    def _make_generator(frame):
        """Build a rescaling grayscale image generator over *frame*."""
        # Rescaling is configured on the ImageDataGenerator; the extra
        # ``rescale`` keyword formerly passed to flow_from_dataframe for the
        # dev set is not one of that method's parameters and was dropped.
        datagen = ImageDataGenerator(rescale=1. / 255)
        return datagen.flow_from_dataframe(dataframe=frame,
                                           x_col='filenames',
                                           y_col='label',
                                           weight_col=None,
                                           target_size=image_size[0:2],
                                           color_mode='grayscale',
                                           classes=None,
                                           class_mode='categorical',
                                           batch_size=batch_size,
                                           shuffle=shuffle,
                                           seed=random_seed,
                                           interpolation='nearest',
                                           validate_filenames=True)

    # create train/dev data generators
    train_generator = _make_generator(train_df)
    dev_generator = _make_generator(dev_df)

    # create model
    if classifier.lower() == "simple_mnist":
        # simple mnist parameters
        base_filter = 32
        fc_width = 512

        model = simple_mnist(base_filter, fc_width,
                             dropout_rate=model_params["dropout_rate"],
                             learn_rate=model_params["learn_rate"],
                             image_size=image_size)
    else:
        raise NotImplementedError

    # callbacks
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                     factor=0.2,
                                                     patience=5,
                                                     min_lr=0.0001)

    print(model_params["epochs"])
    history = model.fit(train_generator,
                        epochs=model_params["epochs"],
                        verbose=1,
                        shuffle=True,
                        callbacks=[reduce_lr],
                        validation_data=dev_generator)

    # TODO - add custom metric for GMPR and train with cross validation over
    # all folds (scoring dict, cross_validate, pickling the fold estimators
    # into model_dir).

    # save training history
    logs_df = pd.DataFrame(data=history.history,
                           index=range(1, model_params["epochs"] + 1))
    logs_df.index.name = "epochs"
    logs_df.to_csv(Path("./reports/figures/logs.csv").resolve())
def test_it_should_print_when_file_does_not_exist(capsys):
    """Loading a missing file must print the 'file does not exist' notice."""
    load_data('unknown_file.txt')
    printed = capsys.readouterr()[0]
    assert printed.rstrip() == "Logging: file does not exist..."
def test_rbm(learning_rate=0.1, training_epochs=15, dataset='mnist.pkl.gz',
             batch_size=20, n_chains=20, n_samples=10,
             output_folder='rbm_plots', n_hidden=500):
    """Train an RBM, then sample from it, saving the plots in *output_folder*.

    After every training epoch the filters are saved as
    ``filters_at_epoch_<n>.png``; the generated samples are saved as
    ``samples.png``.

    :param learning_rate: learning rate used for training the RBM.
    :param training_epochs: number of training epochs.
    :param dataset: dataset name/path handed to ``load_data``.
    :param batch_size: minibatch size (also the persistent chain's row count).
    :param n_chains: number of parallel Gibbs chains used for sampling.
    :param n_samples: number of samples plotted per chain.
    :param output_folder: directory receiving the images; created if missing.
    :param n_hidden: number of hidden units.
    """
    datasets = load_data(dataset)
    train_set_x, _ = datasets[0]
    test_set_x, _ = datasets[2]

    # compute number of minibatches for training
    n_train_batches = train_set_x.get_value(borrow=True).shape[0] // batch_size

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix('x')  # the data is presented as rasterized images

    rng = numpy.random.RandomState(123)
    theano_rng = RandomStreams(rng.randint(2**30))

    # initialize storage for the persistent chain (state = hidden
    # layer of chain)
    persistent_chain = theano.shared(numpy.zeros((batch_size, n_hidden),
                                                 dtype=theano.config.floatX),
                                     borrow=True)

    # construct the RBM class
    rbm = RBM(input=x, n_visible=28 * 28, n_hidden=n_hidden,
              numpy_rng=rng, theano_rng=theano_rng)

    # get the cost and the gradient corresponding to 15 sampling steps on
    # the persistent chain
    cost, updates = rbm.get_cost_updates(learning_rate=learning_rate,
                                         persistent=persistent_chain, k=15)

    #################################
    #     Training the RBM          #
    #################################
    if not os.path.isdir(output_folder):
        os.makedirs(output_folder)
    # Images are saved relative to the output folder. Remember where we came
    # from so the working directory is restored exactly, also for nested or
    # absolute output folders and when an error occurs ('../' was only
    # correct for a single-level relative folder).
    original_cwd = os.getcwd()
    os.chdir(output_folder)
    try:
        # it is ok for a theano function to have no output
        # the purpose of train_rbm is solely to update the RBM parameters
        train_rbm = theano.function(
            [index],
            cost,
            updates=updates,
            givens={x: train_set_x[index * batch_size:
                                   (index + 1) * batch_size]},
            name='train_rbm')

        plotting_time = 0.
        start_time = timeit.default_timer()

        # go through training epochs
        for epoch in range(training_epochs):
            # go through the training set
            mean_cost = []
            for batch_index in range(n_train_batches):
                mean_cost += [train_rbm(batch_index)]

            print('Training epoch %d, cost is ' % epoch,
                  numpy.mean(mean_cost))

            # Plot filters after each training epoch
            plotting_start = timeit.default_timer()
            # Construct image from the weight matrix
            image = Image.fromarray(
                tile_raster_images(X=rbm.W.get_value(borrow=True).T,
                                   img_shape=(28, 28),
                                   tile_shape=(10, 10),
                                   tile_spacing=(1, 1)))
            image.save('filters_at_epoch_%i.png' % epoch)
            plotting_stop = timeit.default_timer()
            plotting_time += (plotting_stop - plotting_start)

        end_time = timeit.default_timer()
        # plotting time is excluded from the reported training time
        pretraining_time = (end_time - start_time) - plotting_time
        print('Training took %f minutes' % (pretraining_time / 60.))

        #################################
        #     Sampling from the RBM     #
        #################################
        # find out the number of test samples
        number_of_test_samples = test_set_x.get_value(borrow=True).shape[0]

        # pick random test examples, with which to initialize the
        # persistent chain
        test_idx = rng.randint(number_of_test_samples - n_chains)
        persistent_vis_chain = theano.shared(
            numpy.asarray(
                test_set_x.get_value(borrow=True)[test_idx:
                                                  test_idx + n_chains],
                dtype=theano.config.floatX))

        plot_every = 1000
        # define one step of Gibbs sampling (mf = mean-field) define a
        # function that does `plot_every` steps before returning the
        # sample for plotting
        ([presig_hids, hid_mfs, hid_samples,
          presig_vis, vis_mfs, vis_samples], updates) = theano.scan(
            rbm.gibbs_vhv,
            outputs_info=[None, None, None, None, None,
                          persistent_vis_chain],
            n_steps=plot_every,
            name="gibbs_vhv")

        # add to updates the shared variable that takes care of our
        # persistent chain
        updates.update({persistent_vis_chain: vis_samples[-1]})
        # construct the function that implements our persistent chain.
        # we generate the "mean field" activations for plotting and the
        # actual samples for reinitializing the state of our persistent chain
        sample_fn = theano.function([], [vis_mfs[-1], vis_samples[-1]],
                                    updates=updates, name='sample_fn')

        # create a space to store the image for plotting ( we need to leave
        # room for the tile_spacing as well)
        image_data = numpy.zeros((29 * n_samples + 1, 29 * n_chains - 1),
                                 dtype='uint8')
        for idx in range(n_samples):
            # generate `plot_every` intermediate samples that we discard,
            # because successive samples in the chain are too correlated
            vis_mf, vis_sample = sample_fn()
            print(' ... plotting sample %d' % idx)
            image_data[29 * idx:29 * idx + 28, :] = tile_raster_images(
                X=vis_mf,
                img_shape=(28, 28),
                tile_shape=(1, n_chains),
                tile_spacing=(1, 1))

        # construct image
        image = Image.fromarray(image_data)
        image.save('samples.png')
    finally:
        os.chdir(original_cwd)
def train_model(
        path,
        x_columns,
        y_column,
        output_path="models",
        test_size=0.3,
        random_state=None,
        cross_validation_folds=5,
        verbose=2,
):
    """Fit a grid-search pipeline on the data at *path* and persist it.

    The data is loaded and preprocessed, split into stratified train/test
    sets, a grid-search CV pipeline is fitted on the train set, the best
    parameters and train/test evaluations are written below *output_path*,
    and the fitted pipeline is dumped to ``<output_path>/trained.model`` and
    registered with ``cdsw.track_file``.

    :param path: location of the input data handed to ``load_data``.
    :param x_columns: list of feature column names.
    :param y_column: name of the target column.
    :param output_path: directory prefix for all written artefacts.
    :param test_size: forwarded to ``train_test_split``.
    :param random_state: seed for the split and the base estimator.
    :param cross_validation_folds: number of CV folds for the grid search.
    :param verbose: verbosity forwarded to the grid search.
    :raises Exception: whatever the pipeline fit raises, after logging it.
    """
    logging.info("Begin `train_model`")
    logging.debug(f"Load Data from {path}")
    logging.debug(f"x_columns: {x_columns}")
    logging.debug(f"y_column: {y_column}")

    frame = load_data(path, x_columns + [y_column])

    logging.info("Preprocessing Inputs")
    frame, new_X_columns = preprocess_model_inputs(frame, x_columns, y_column)
    logging.debug(f"new_X_columns: {new_X_columns}")

    features = frame[new_X_columns]
    target = frame[y_column]

    logging.info("Splitting data into test and train sets")
    logging.info(f"test_size: {test_size}")
    logging.info(f"random_state: {random_state}")
    X_train, X_test, y_train, y_test = train_test_split(
        features,
        target,
        test_size=test_size,
        random_state=random_state,
        stratify=target,
    )

    grid_params = get_grid_search_params()

    logging.info("Getting Grid Search CV Pipeline")
    logging.debug(f"cross_validation_folds: {cross_validation_folds}")
    pipeline = model_grid_search_cv(
        new_X_columns,
        get_base_estimator(random_state),
        grid_params,
        verbose=verbose,
        cross_validation_folds=cross_validation_folds)

    # Log the failure with its traceback, then let it propagate.
    try:
        pipeline.fit(X_train, y_train)
    except Exception as e:
        logging.exception("Exception during pipeline fitting")
        raise e

    logging.info("Finished Grid Search CV")
    logging.info(f"Best Score: {pipeline['grid_search_cv'].best_score_}")
    logging.info(f"Best Params: {pipeline['grid_search_cv'].best_params_}")

    write_out_model_params(pipeline["grid_search_cv"].best_params_,
                           output_path)

    for split_name, split_X, split_y in (("train", X_train, y_train),
                                         ("test", X_test, y_test)):
        eval_predictions(split_name, pipeline, split_X, split_y, output_path)

    model_file = f"{output_path}/trained.model"
    logging.info(f"Writing out model to {model_file}")
    joblib.dump(pipeline, model_file)
    cdsw.track_file(model_file)

    logging.info("End `train_model`")
if patience <= iter: done_looping = True break end_time = timeit.default_timer() fd.write(('Optimization complete with best validation score of %f %%, ' 'obtained at iteration %i, ' 'with test performance %f %%\n') % (best_validation_loss * 100., best_iter + 1, test_score * 100.)) fd.write('The fine tuning code for file ' + os.path.split(__file__)[1] + ' ran for %.2fm\n' % ((end_time - start_time) / 60.)) if __name__ == '__main__': datasets = load_data('mnist.pkl.gz') fd = open('dbn.log', 'w+') # trainx,trainy = datasets[0] # validx,validy = datasets[1] # testx,testy = datasets[2] # # datasets = ((trainx[:100],trainy[:100]),(validx[:50],validy[:50]),(testx[:50],testy[:50])) theano_datasets = prep_theano_data(datasets) test_DBN(pretraining_epochs=100, pretrain_lr=0.01, k=1, training_epochs=1000, finetune_lr=0.1, datasets=theano_datasets, batch_size=10,