def train(data_matrix, save_path, title, hidden_size=256, lr=0.001,
          saved_model_path=None, RESUME=False, batch_size=256, n_epochs=30):
    tf.reset_default_graph()
    _, input_placeholder, labels_placeholder, train_op, loss_op = build_model(
        data_matrix, train=True, hidden_size=hidden_size, lr=lr)
    saver = tf.train.Saver()
    avg_loss_list = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        if RESUME:
            # restoring a checkpoint overwrites the freshly initialized variables
            saver.restore(sess, saved_model_path)
            print("Model restored.")
        minibatches = util.get_minibatches_lm(data_matrix, batch_size)
        for i in range(n_epochs):
            batch_loss_list = []
            print "Epoch " + str(i + 1) + ": "
            for j, tup in enumerate(minibatches):
                # label_data = np.zeros((len(tup[1]), util.vocab_size))
                # label_data[np.arange(len(tup[1])), tup[1]] = 1
                _, loss = sess.run(
                    [train_op, loss_op],
                    feed_dict={
                        input_placeholder: tup[0],
                        labels_placeholder: tup[1]
                    })
                batch_loss_list.append(loss)
            avg_loss_list.append(np.mean(batch_loss_list))
            print "=====>loss: " + str(avg_loss_list[i]) + " "
            # checkpoint whenever this epoch's average loss is a new minimum
            if i > 0 and avg_loss_list[i] < min(avg_loss_list[:i]):
                tmp_path = save_path + "--smallest loss"
                saver.save(sess, tmp_path)
                print "New min loss at epoch %s! Model saved in path %s" % (str(i + 1), tmp_path)
        saver.save(sess, save_path)
        print("Final model saved in path: %s" % save_path)
    generatePlots(range(len(avg_loss_list)), avg_loss_list, "Number of Epochs",
                  "Cross-Entropy Loss", title)
    util.dumpVar("losses/ " + title + " " + today + ".pkl", avg_loss_list)
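
# Example invocation (illustrative only -- the pickle name, save path, and title
# below are assumptions, not files shipped with this repo):
#
#     train_matrix = util.openPkl("train_matrix.pkl")
#     train(train_matrix, "models/seq2seq_lm", "seq2seq LM training loss",
#           hidden_size=256, lr=0.001, batch_size=256, n_epochs=30)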
def trainAndSaveModel(X_train, y_train, y_label_index, max_iterations=7000, folds=False):
    n_features = X_train.shape[1]
    n_classes = len(util.classes)
    avg_coefficients = np.zeros((n_classes, n_features))
    avg_intercepts = np.zeros(n_classes)
    data_kfold = util.split_data(y_index=y_label_index)
    train_accuracies = []
    eval_accuracies = []
    train_predictions = []
    eval_predictions = []
    # note: the per-fold arrays below shadow the X_train/y_train arguments
    for i, (X_train, y_train, X_val, y_val) in enumerate(data_kfold):
        print("Fold", i + 1)
        clf = LogisticRegression(max_iter=max_iterations, multi_class='multinomial',
                                 solver='newton-cg')
        clf.fit(X_train, y_train)
        train_acc = clf.score(X_train, y_train)
        train_accuracies.append(train_acc)
        eval_acc = clf.score(X_val, y_val)
        eval_accuracies.append(eval_acc)
        avg_coefficients += clf.coef_
        avg_intercepts += clf.intercept_
        train_predictions.append(clf.predict(X_train))
        eval_predictions.append(clf.predict(X_val))
        if folds:
            util.outputConfusionMatrix(clf.predict(X_train), y_train,
                                       "../figures/fold_" + str(i + 1) + "_train")
            util.outputConfusionMatrix(clf.predict(X_val), y_val,
                                       "../figures/fold_" + str(i + 1) + "_eval")
        print("train accuracy:", train_acc)
        print("eval accuracy:", eval_acc)
    # average coefficients and intercepts across the K folds
    avg_coefficients /= util.K
    avg_intercepts /= util.K
    model = {
        "coeff_": avg_coefficients,
        "intercept_": avg_intercepts,
        "train_accuracies": train_accuracies,
        "eval_accuracies": eval_accuracies,
        "train_predictions": train_predictions,
        "eval_predictions": eval_predictions
    }
    util.dumpVar("../models/avg_logistic_model", model)
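
# Example invocation (illustrative; the pickles are the ones dumped by the
# chronological-split script below, and y_label_index=0 is an assumption):
#
#     X_train = util.openPkl("../data/X_train_time")
#     y_train = util.openPkl("../data/y_train_time")
#     trainAndSaveModel(X_train, y_train, y_label_index=0, folds=True)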
def train_classifier(data_matrix, data_labels, save_path, seq2seq_model_path, title,
                     max_sequence_length=70, hidden_size=100, lr=0.005,
                     batch_size=60, n_epochs=30):
    tf.reset_default_graph()
    # rebuild the seq2seq graph first so its variables can be restored below;
    # this saver only covers the seq2seq variables
    build_model(data_matrix)
    saver = tf.train.Saver()
    pred, input_placeholder, labels_placeholder, train_op, loss_op = build_classifier(
        data_matrix, data_labels, max_sequence_length)
    # second saver covers the full graph, including the classifier variables
    saver2 = tf.train.Saver()
    avg_loss_list = []
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        saver.restore(sess, seq2seq_model_path)
        print("seq2seq model restored.")
        minibatches = util.get_minibatches_seq_test(data_matrix, data_labels,
                                                    batch_size, max_sequence_length)
        for i in range(n_epochs):
            batch_loss_list = []
            print "Epoch " + str(i + 1) + ": "
            for tup in minibatches:
                _, loss = sess.run(
                    [train_op, loss_op],
                    feed_dict={
                        input_placeholder: np.transpose(tup[0]),
                        labels_placeholder: tup[1]
                    })
                batch_loss_list.append(loss)
                print "loss: ", loss
            avg_loss_list.append(np.mean(batch_loss_list))
            print "=====>loss: " + str(avg_loss_list[i]) + " "
            # checkpoint whenever this epoch's average loss is a new minimum
            if i > 0 and avg_loss_list[i] < min(avg_loss_list[:i]):
                tmp_path = save_path + "--smallest loss"
                saver2.save(sess, tmp_path)
                print "New min loss at epoch %s! Model saved in path %s" % (str(i + 1), tmp_path)
        saver2.save(sess, save_path)
        print("Final model saved in path: %s" % save_path)
        print tf.trainable_variables()
    util.dumpVar("losses/ " + title + " " + today + ".pkl", avg_loss_list)
    generatePlots(range(len(avg_loss_list)), avg_loss_list, "Number of Epochs",
                  "Cross-Entropy Loss", title)
def create_vocab_dict_helper(train_data, news_names):
    vocab_dict = {}
    # reserve index 0 for the UNK token; real tokens start at index 1
    vocab_dict[util.UNK] = 0
    num_tokens_so_far = 1
    for news_source in news_names:
        print "Adding articles from " + news_source
        all_articles = train_data[news_source]
        num_articles_so_far = 0
        for article in all_articles:
            if num_articles_so_far % 100 == 0:
                print "=======>" + "Adding article " + str(num_articles_so_far)
            for token in article:
                token = token.lower()
                if token not in vocab_dict:
                    vocab_dict[token] = num_tokens_so_far
                    num_tokens_so_far += 1
            num_articles_so_far += 1
    util.dumpVar("vocab_dict.pkl", vocab_dict)
def create_embed_matrix(glove_filename):
    print "Opening vocab_dict..."
    vocab_dict = util.openPkl("vocab_dict.pkl")
    print "Done opening vocab_dict!"
    print "Creating embed_matrix..."
    vocab_size = len(vocab_dict)
    embed_matrix = glove.loadWordVectors(vocab_dict, filepath=glove_filename,
                                         dimensions=util.glove_dimensions)
    print "Done creating embed_matrix!"
    print "Cleaning up the embeddings_matrix..."
    # words w/o glove vectors never got updated in the matrix and still have an
    # empty field, so give them zero vectors and remap the words to UNK
    for word in vocab_dict:
        embed_matrix_word_index = vocab_dict[word]
        if len(embed_matrix[embed_matrix_word_index]) < util.glove_dimensions:
            embed_matrix[embed_matrix_word_index] = [
                float(0) for x in range(util.glove_dimensions)
            ]
            vocab_dict[word] = vocab_dict[util.UNK]
    print "Done cleaning up the data!"
    util.dumpVar("embeddings_matrix.pkl", embed_matrix)
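
# Example invocation (illustrative; the GloVe file path and dimensionality are
# assumptions about where the pretrained vectors live on disk):
#
#     create_embed_matrix("glove/glove.6B.100d.txt")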
    # stalemate
    elif row[outcome_column_num] == 5:
        data[i][outcome_column_num] = 4
    # other
    else:
        data[i][outcome_column_num] = 5

y = data[:, y_label_columns]
X = np.delete(data, y_label_columns, axis=1)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True)
# split chronologically instead of shuffling: first 80% train, last 20% test
num_examples = X.shape[0]
print(X.shape)
indices = list(range(num_examples))
train_indices = indices[:int(num_examples * .8)]
test_indices = indices[int(num_examples * .8):]
X_train_time = X[train_indices]
y_train_time = y[train_indices]
X_test_time = X[test_indices]
y_test_time = y[test_indices]
print(X_train_time.shape)
print(X_test_time.shape)
util.dumpVar("../data/X_train_time", X_train_time)
util.dumpVar("../data/y_train_time", y_train_time)
util.dumpVar("../data/X_test_time", X_test_time)
util.dumpVar("../data/y_test_time", y_test_time)
if __name__ == '__main__':
    vocab_dict = util.openPkl("vocab_dict.pkl")
    reverse_dict = {}
    # print vocab_dict[util.UNK]
    # vocab_dict maps word -> index; we now want index -> word
    for word, index in vocab_dict.iteritems():
        if word != util.UNK:
            reverse_dict[index] = word
    reverse_dict[0] = util.UNK
    # print reverse_dict[3]
    # print vocab_dict[reverse_dict[3]]
    util.dumpVar("reverse_dict.pkl", reverse_dict)
    # embed_matrix_mod = util.openPkl("embed_matrix_mod.pkl")
    # print embed_matrix_mod[0]
    # print embed_matrix_mod[1]
    # print embed_matrix_mod[len(embed_matrix_mod)-1]
def build_matrices(embed_matrix_filename, vocab_dict_filename, short=True):
    print "Opening embed_matrix..."
    embed_matrix = util.openPkl(embed_matrix_filename)
    print "Done opening embed_matrix!"
    print "Opening vocab_dict..."
    vocab_dict = util.openPkl(vocab_dict_filename)
    print "Done opening vocab_dict!"

    train_matrix_pkl = "train_matrix"
    train_labels_pkl = "train_labels"
    test_matrix_pkl = "test_matrix"
    test_labels_pkl = "test_labels"
    dev_matrix_pkl = "dev_matrix"
    dev_labels_pkl = "dev_labels"
    if short:
        train_data_filename = "Data/data_train_short.pkl"
        test_data_filename = "Data/data_test_short.pkl"
        dev_data_filename = "Data/data_dev_short.pkl"
        train_matrix_pkl = addShort(train_matrix_pkl)
        train_labels_pkl = addShort(train_labels_pkl)
        test_matrix_pkl = addShort(test_matrix_pkl)
        test_labels_pkl = addShort(test_labels_pkl)
        dev_matrix_pkl = addShort(dev_matrix_pkl)
        dev_labels_pkl = addShort(dev_labels_pkl)
    else:
        train_data_filename = "Data/data_train_full.pkl"
        test_data_filename = "Data/data_test_full.pkl"
        dev_data_filename = "Data/data_dev_full.pkl"

    train_matrix, train_labels = build_matrices_helper(train_data_filename, vocab_dict, embed_matrix)
    test_matrix, test_labels = build_matrices_helper(test_data_filename, vocab_dict, embed_matrix)
    dev_matrix, dev_labels = build_matrices_helper(dev_data_filename, vocab_dict, embed_matrix)

    util.dumpVar(train_matrix_pkl + ".pkl", train_matrix)
    util.dumpVar(train_labels_pkl + ".pkl", train_labels)
    util.dumpVar(test_matrix_pkl + ".pkl", test_matrix)
    util.dumpVar(test_labels_pkl + ".pkl", test_labels)
    util.dumpVar(dev_matrix_pkl + ".pkl", dev_matrix)
    util.dumpVar(dev_labels_pkl + ".pkl", dev_labels)
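
# Example invocation (illustrative; the pickle names are the ones produced by
# create_vocab_dict_helper and create_embed_matrix above):
#
#     build_matrices("embeddings_matrix.pkl", "vocab_dict.pkl", short=True)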