def wrapper_kaggle(epochs=40, validate_ratio=0.1, save_prob=True):
    # train a dropout MLP on the Kaggle SST features; when save_prob is set,
    # the per-class probabilities are pickled so they can be reused as features
    train_x_1, test_x_1 = read_all_predict_score()
    train_x_2, train_y, test_x_2 = read_aggregated_vectors()
    train_x_3, train_y, test_x_3 = read_aggregated_vectors(google=False)
    old_train_x = train_x_1  # np.concatenate((train_x_1, train_x_2, train_x_3), axis=1)
    test_x = test_x_1  # np.concatenate((test_x_1, test_x_2, test_x_3), axis=1)
    old_train_y = np.asarray(train_y)

    # split train validate data
    sss_indices = StratifiedShuffleSplit(y=old_train_y, n_iter=1, test_size=validate_ratio)
    for indices in sss_indices:
        # n_iter=1, so this yields a single train/validation split
        train_index, test_index = indices
    train_x = old_train_x[train_index]
    validate_x = old_train_x[test_index]
    train_y = old_train_y[train_index]
    validate_y = old_train_y[test_index]

    # add validation set for training
    # train_x = np.concatenate((train_x, validate_x))
    # train_y = np.concatenate((train_y, validate_y))

    # get dataset info
    dim = train_x[0].shape[0]
    n_out = len(np.unique(validate_y))
    datasets = (train_x, train_y, validate_x, validate_y, test_x)
    n_layers = 1
    print "input dimension is %d, output dimension is %d" % (dim, n_out)

    return_val = train_dropout_net(
        datasets=datasets,
        use_bias=True,
        n_epochs=epochs,
        dim=dim,
        lr_rate=0.02,
        n_out=n_out,
        dropout=True,
        dropout_rates=[0.7],
        n_hidden=[100],
        activations=[tanh] * n_layers,
        batch_size=50,
        update_rule='adagrad',
        no_test_y=True,
        save_prob=save_prob
    )

    if not save_prob:
        save_csv(return_val)
    else:
        # reassemble the train/validation probabilities in the original row order
        train_prob, validate_prob, test_prob = return_val
        saved_train_prob = np.zeros((old_train_x.shape[0], n_out))
        saved_train_prob[train_index] = train_prob
        saved_train_prob[test_index] = validate_prob
        save_path = "D:/data/nlpdata/pickled_data/" + SST_KAGGLE + "_prob.pkl"
        print "saving probability feature to %s" % save_path
        f = open(Path(save_path), "wb")
        pkl.dump((saved_train_prob, test_prob), f, -1)
        f.close()
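
# A minimal sketch (not part of the original pipeline) of reading back the
# probability features that wrapper_kaggle() pickles above. The path and the
# (train_prob, test_prob) tuple layout follow the pkl.dump call in wrapper_kaggle;
# the helper name itself is hypothetical.
def load_kaggle_prob_features():
    load_path = "D:/data/nlpdata/pickled_data/" + SST_KAGGLE + "_prob.pkl"
    print "loading probability feature from %s" % load_path
    f = open(Path(load_path), "rb")
    train_prob, test_prob = pkl.load(f)
    f.close()
    return train_prob, test_prob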

def train(data=SST_KAGGLE, alg='logcv'):
    # fit a scikit-learn classifier on the aggregated word vectors and
    # write the Kaggle test predictions via save_csv
    train_x, train_y, test_x = read_aggregated_vectors(google=True, data=data)
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    test_x = np.asarray(test_x)
    print "shape for training data is", train_x.shape

    if alg == 'svm':
        clf = SVC(verbose=1)
    elif alg == 'log':
        clf = LogisticRegression(verbose=1)
    elif alg == 'logcv':
        clf = LogisticRegressionCV(cv=5, verbose=1)
    else:
        raise NotImplementedError

    print "training..."
    clf.fit(train_x, train_y)
    # clf.fit(validate_x, validate_y)
    predicted = clf.predict(test_x)
    save_csv(predicted)
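
# A minimal sketch (an assumption, not in the original script) for comparing the
# classifier choices exposed by train() on the same aggregated vectors before
# making a submission. cross_val_score lives in the old sklearn.cross_validation
# module that the surrounding code appears to target; the helper name is hypothetical.
def compare_classifiers_kaggle(data=SST_KAGGLE):
    from sklearn.cross_validation import cross_val_score
    train_x, train_y, _ = read_aggregated_vectors(google=True, data=data)
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y)
    for name, clf in [('svm', SVC()), ('log', LogisticRegression()),
                      ('logcv', LogisticRegressionCV(cv=5))]:
        scores = cross_val_score(clf, train_x, train_y, cv=5)
        print "%s mean cv accuracy: %f" % (name, scores.mean())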

def average_word_kaggle_dataset(google=True):
    train_x, train_y, test_x = read_aggregated_vectors(data=SST_KAGGLE, google=google)
    return np.asarray(train_x), np.asarray(train_y), np.asarray(test_x)
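
# A minimal usage sketch; the __main__ guard is an assumption about how this
# module would be exercised. It inspects the averaged-vector arrays returned by
# average_word_kaggle_dataset() and reuses the plain LogisticRegression path from
# train() to produce a submission through save_csv.
if __name__ == "__main__":
    avg_train_x, avg_train_y, avg_test_x = average_word_kaggle_dataset(google=True)
    print "averaged train shape:", avg_train_x.shape
    print "averaged test shape:", avg_test_x.shape
    clf = LogisticRegression()
    clf.fit(avg_train_x, avg_train_y)
    save_csv(clf.predict(avg_test_x))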