def wrapper_kaggle(epochs=40, validate_ratio=0.1, save_prob=True):
    """Train a dropout network on the prediction-score features.

    The features returned by ``read_all_predict_score()`` are split into a
    training part and a validation part with a single stratified shuffle
    split, then handed to ``train_dropout_net``.

    Args:
        epochs: Passed to ``train_dropout_net`` as ``n_epochs``.
        validate_ratio: Passed to ``StratifiedShuffleSplit`` as ``test_size``.
        save_prob: Passed through to ``train_dropout_net``.  When false, the
            value returned by the trainer is handed to ``save_csv``.  When
            true, the returned (train, validate, test) probabilities are
            pickled: the train and validation rows are first written back
            into one array at their original row positions.

    Returns:
        None.  Results are written to disk.
    """
    train_x_1, test_x_1 = read_all_predict_score()
    # The aggregated-vector features are loaded but not fed to the network at
    # the moment; only the labels (``train_y``) of the second call are used.
    # The calls are kept so that any work the readers do still happens.
    train_x_2, train_y, test_x_2 = read_aggregated_vectors()
    train_x_3, train_y, test_x_3 = read_aggregated_vectors(google=False)

    # To train on all three feature sets, concatenate them along axis 1 here
    # instead of using the prediction scores alone.
    old_train_x = train_x_1
    test_x = test_x_1
    old_train_y = np.asarray(train_y)

    # Split train / validation data.  With n_iter=1 the loop body runs once
    # and simply captures the single pair of index arrays.
    sss_indices = StratifiedShuffleSplit(
        y=old_train_y, n_iter=1, test_size=validate_ratio
    )
    for indices in sss_indices:
        train_index, test_index = indices
    train_x = old_train_x[train_index]
    validate_x = old_train_x[test_index]
    train_y = old_train_y[train_index]
    validate_y = old_train_y[test_index]

    # Dataset info needed to size the network.
    dim = train_x[0].shape[0]
    n_out = len(np.unique(validate_y))
    datasets = (train_x, train_y, validate_x, validate_y, test_x)

    n_layers = 1
    print("input dimension is %d, output dimension is %d" % (dim, n_out))

    return_val = train_dropout_net(
        datasets=datasets,
        use_bias=True,
        n_epochs=epochs,
        dim=dim,
        lr_rate=0.02,
        n_out=n_out,
        dropout=True,
        dropout_rates=[0.7],
        n_hidden=[100],
        activations=[tanh] * n_layers,
        batch_size=50,
        update_rule='adagrad',
        no_test_y=True,
        save_prob=save_prob
    )

    if not save_prob:
        save_csv(return_val)
        return

    train_prob, validate_prob, test_prob = return_val
    # Put the probabilities back at the original row positions so the saved
    # array lines up with the unsplit training data.
    saved_train_prob = np.zeros((old_train_x.shape[0], n_out))
    saved_train_prob[train_index] = train_prob
    saved_train_prob[test_index] = validate_prob

    save_path = "D:/data/nlpdata/pickled_data/" + SST_KAGGLE + "_prob.pkl"
    print("saving probability feature to %s" % save_path)
    # ``with`` guarantees the file is closed even if pickling fails.
    with open(Path(save_path), "wb") as f:
        # Protocol -1 selects the highest pickle protocol available.
        pkl.dump((saved_train_prob, test_prob), f, -1)
def wrapper_kaggle(valid_portion=0.1):
    """Train the LeCun-style network and write its predictions as a CSV.

    Features come from ``read_all_predict_score()`` and labels from
    ``read_sst_kaggle_pickle()``.  A stratified ``train_test_split`` holds
    out ``valid_portion`` of the training rows for validation, the network
    is trained with ``train_lecun_net``, and the returned predictions are
    written, paired with consecutive phrase ids, to a fixed CSV path.

    Args:
        valid_portion: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        None.  The predictions are written to disk.
    """
    import csv

    features, test_x = read_all_predict_score()
    _train_phrases, labels, _test_phrases = read_sst_kaggle_pickle()
    labels = np.asarray(labels)

    fit_x, validate_x, fit_y, validate_y = train_test_split(
        features, labels, test_size=valid_portion, stratify=labels
    )

    dim = fit_x[0].shape
    # Same text as ``print "input dimension is", dim``.
    print(" ".join(["input dimension is", str(dim)]))

    img_size = (18, 5)
    n_out = len(np.unique(validate_y))

    # NOTE(review): the keyword is spelled ``user_bias`` here, kept exactly as
    # written -- confirm it matches the signature of ``train_lecun_net``.
    net_config = dict(
        img_size=img_size,
        datasets=(fit_x, fit_y, validate_x, validate_y, test_x),
        filter_size=(7, 2),
        pool_size=(2, 1),
        n_epochs=10,
        lr_rate=0.05,
        n_out=n_out,
        dropout_rate=0.5,
        n_hidden=500,
        nkerns=10,
        activation=leaky_relu,
        batch_size=100,
        update_rule="adagrad",
        user_bias=True,
        no_test_y=True,
    )
    best_prediction = train_lecun_net(**net_config)

    save_path = Path("C:/Users/Song/Course/571/hw3/kaggle_result.csv")
    # One phrase id per prediction, counting up from the first id.
    phrase_ids = np.arange(156061, 222353)
    with open(save_path, "wb") as out_file:
        writer = csv.writer(out_file, delimiter=",")
        writer.writerow(["PhraseId", "Sentiment"])
        writer.writerows(
            [phrase_id, sentiment]
            for phrase_id, sentiment in zip(phrase_ids, best_prediction)
        )