def train_modelsoftmax_regression(df, name):
    # Train word vectors using all reviews as data.
    model, df = train_word2vec(df)

    # Train the softmax logistic regression classifier and predict using
    # cross validation on the training set.
    # print(df.head())
    # print(df["vectorized_texts"])
    features = np.array(df["vectorized_texts"]).reshape(-1, 1)
    labels = df['funniness_category'].values

    # Train and predict with softmax logistic regression.
    labels_train, y_train_predict = train_and_predict_softmax_logistic_regression(
        features, labels)

    # Confusion matrix.
    create_confusion_matrices(labels_train,
                              y_train_predict,
                              feature_representation="word2vec",
                              classifier_type="Softmax regression",
                              condition=False if name == 'no_cond' else True)
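# A minimal sketch of what train_and_predict_softmax_logistic_regression could
# look like, assuming scikit-learn is available and that `features` is a 2-D
# numeric array. The helper is only called above, not defined in this snippet,
# so the signature, solver and cv=5 below are illustrative assumptions rather
# than the repository's actual implementation.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict


def train_and_predict_softmax_logistic_regression(features, labels):
    """Fit a softmax (multinomial) logistic regression and return the labels
    together with cross-validated predictions on the training data."""
    clf = LogisticRegression(multi_class="multinomial", solver="lbfgs",
                             max_iter=1000)
    y_pred = cross_val_predict(clf, features, labels, cv=5)
    return labels, y_pred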
def main(passed_args=None):
    parser = argparse.ArgumentParser(
        description="train a neural network on tweets against prices")
    parser.add_argument(
        "--word2vec",
        "-w",
        dest="word2vec",
        action="store_true",
        default=False,
        help="toggle this option if you are obtaining the dataset using word2vec",
    )
    parser.add_argument(
        "--tune",
        "-t",
        dest="tuning",
        action="store_true",
        default=False,
        help="toggle this option if you are tuning hyperparameters",
    )
    parser.add_argument(
        "--rnn",
        "-r",
        dest="train_rnn",
        action="store_true",
        default=False,
        help="toggle this option to train the RNN",
    )
    parser.add_argument(
        "--predict",
        "-d",
        dest="predict",
        action="store_true",
        default=False,
        help="toggle this option if you are making predictions",
    )
    parser.add_argument(
        "--markowitz",
        "-m",
        dest="markowitz",
        action="store_true",
        default=False,
        help="toggle this option if you are doing Markowitz portfolio optimisation",
    )
    parser.add_argument(
        "--glove",
        "-g",
        dest="glove",
        action="store_true",
        default=False,
        help="toggle this option if you are obtaining the dataset using GloVe",
    )
    parser.add_argument(
        "--metrics",
        "-f",
        dest="metrics",
        action="store_true",
        default=False,
        help="toggle this option if you are evaluating the metrics",
    )
    args = parser.parse_args(passed_args)

    if args.word2vec:
        # Prepare the Word2Vec model.
        if not os.path.exists(PATH_TO_WORD2VEC):
            w2v.train_word2vec()
        # Prepare all required data.
        prices = d.load_prices()
        w2v_model = w2v.load_word2vec()
        for stock in stock_universe:
            d.get_return_by_stock(stock, prices)
            d.load_tweets_by_stock(stock)
            w2v.get_padded_embeddings(stock, w2v_model)
        sys.exit()

    if args.glove:
        # Prepare all required data.
        prices = d.load_prices()
        w2v_model = w2v.load_glove_model(
            path_to_glove="~/Downloads/GloVe-1.2/glove.twitter.27B.50d.txt",
            path_to_output="./temp/glove_pretrained_w2vformat.txt",
        )
        for stock in stock_universe:
            d.get_return_by_stock(stock, prices)
            d.load_tweets_by_stock(stock)
            w2v.get_padded_embeddings(
                stock,
                w2v_model,
                path_to_output="./temp/padded_embeddings/glove_pretrained",
            )
        sys.exit()

    if args.tuning:
        hyperparam_list = get_hyperparam_list(NN_HYPERPARAM_DICT)
        best_hyperparam_list = []
        for stock in stock_universe:
            print(stock)
            x = pd.read_pickle("temp/padded_embeddings/glove_pretrained/pickle/" +
                               stock + ".pickle")
            y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
            torch_dataset = nn.get_tensor_dataset(x, y)
            # Evaluate every hyperparameter setting on a held-out validation
            # set and keep the one with the lowest final validation loss.
            tuning_list = []
            for hyperparam in hyperparam_list:
                train_set, _ = nn.train_test_split(torch_dataset,
                                                   hyperparam["TEST_SIZE"])
                train_set, validation_set = nn.train_test_split(
                    train_set, hyperparam["VALIDATION_SIZE"])
                _, _, validation_losses = nn.train_nn(train_set, validation_set,
                                                      hyperparam)
                tuning_list.append((hyperparam, validation_losses[-1]))
            tuning_list.sort(key=operator.itemgetter(1))
            best_hyperparam = tuning_list[0][0]
            best_hyperparam_list.append((stock, best_hyperparam))
        with open("./temp/best-hyperparam-glove-pretrained.txt", "wb") as f:
            pickle.dump(best_hyperparam_list, f)
        print(best_hyperparam_list)
        sys.exit()

    if args.predict:
        if os.path.exists("./temp/best-hyperparam-glove.txt"):
            with open("./temp/best-hyperparam-glove.txt", "rb") as f:
                best_hyperparam_list = pickle.load(f)
            best_hyperparam_dict = dict(best_hyperparam_list)
            for stock in stock_universe:
                hyperparam = best_hyperparam_dict[stock]
                x = pd.read_pickle("temp/padded_embeddings/glove/pickle/" +
                                   stock + ".pickle")
                y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
                torch_dataset = nn.get_tensor_dataset(x, y)
                _, test_set = nn.train_test_split(torch_dataset,
                                                  hyperparam["TEST_SIZE"])
                results = nn.predict_nn(test_set,
                                        "temp/nn/glove/" + stock + ".pth")
                results_df = pd.DataFrame(results)
                results_df.columns = ["y", "pred", "loss"]
                if not os.path.exists("./output/glove"):
                    os.makedirs("./output/glove")
                results_df.to_csv("./output/glove/" + stock + ".csv")
        sys.exit()

    if args.train_rnn:
        eval_only = True
        hyperparam_list = get_hyperparam_list(RNN_HYPERPARAM_DICT)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        for hyperparam in hyperparam_list:
            for stock in stock_universe:
                print(stock)
                returns = pd.read_pickle("temp/returns/pickle/" + stock +
                                         ".pickle")
                returns = nn.normalise(
                    torch.tensor(np.stack(returns.values, axis=0),
                                 device=device))
                vectorised_seq, vocab = rnn.get_vectorised_seq_by_stock(stock)
                input_size = len(vocab)
                encoder, feedforward, results = rnn.train_rnn(
                    vectorised_seq,
                    returns,
                    input_size,
                    hyperparam,
                    eval_only=eval_only,
                    path_to_encoder="temp/rnn/encoder/" + stock + ".pth",
                    path_to_feedforward="temp/rnn/feedforward/" + stock + ".pth",
                )
                if not eval_only:
                    if not os.path.exists("temp/rnn"):
                        os.makedirs("temp/rnn/encoder")
                        os.makedirs("temp/rnn/feedforward")
                    torch.save(encoder.state_dict(),
                               "temp/rnn/encoder/" + stock + ".pth")
                    torch.save(
                        feedforward.state_dict(),
                        "temp/rnn/feedforward/" + stock + ".pth",
                    )
                results_df = pd.DataFrame(results)
                results_df.columns = ["returns", "pred", "loss"]
                if not os.path.exists("./output/rnn"):
                    os.makedirs("./output/rnn")
                results_df.to_csv("./output/rnn/" + stock + ".csv")
        sys.exit()

    if args.markowitz:
        model_dict = {
            "dtm": "purple",
            "tfidf": "pink",
            "word2vec": "black",
            "glove": "blue",
            "glove_pretrained": "green",
            "rnn": "orange",
            "actual": "red",
        }
        mean_var_dict = d.get_etf_mean_var()
        p.plot_frontier_with_points(model_dict, mean_var_dict)
        # p.plot_frontier(model_dict)
        sys.exit()

    if args.metrics:
        models = ["rnn", "glove", "glove_pretrained", "word2vec", "dtm", "tfidf"]
        for model in models:
            me.get_metrics_summary(model)
        sys.exit()

    # Default: train the feedforward network on the GloVe embeddings using the
    # best hyperparameters found during tuning.
    if os.path.exists("./temp/best-hyperparam-glove.txt"):
        with open("./temp/best-hyperparam-glove.txt", "rb") as f:
            best_hyperparam_list = pickle.load(f)
        best_hyperparam_dict = dict(best_hyperparam_list)
        for stock in stock_universe:
            print(stock)
            hyperparam = best_hyperparam_dict[stock]
            x = pd.read_pickle("temp/padded_embeddings/glove/pickle/" + stock +
                               ".pickle")
            y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
            torch_dataset = nn.get_tensor_dataset(x, y)
            train_set, test_set = nn.train_test_split(torch_dataset,
                                                      hyperparam["TEST_SIZE"])
            model, _, _ = nn.train_nn(train_set, test_set, hyperparam)
            if not os.path.exists("temp/nn/glove"):
                os.makedirs("temp/nn/glove")
            torch.save(model.state_dict(), "temp/nn/glove/" + stock + ".pth")
    sys.exit()
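# A minimal sketch of get_hyperparam_list, which main() calls but which is not
# defined in this snippet. It assumes NN_HYPERPARAM_DICT / RNN_HYPERPARAM_DICT
# map each hyperparameter name (e.g. "TEST_SIZE", "VALIDATION_SIZE") to a list
# of candidate values; the repository's actual implementation may differ.
import itertools


def get_hyperparam_list(hyperparam_dict):
    """Expand {name: [candidates]} into a list of {name: value} settings,
    one per combination in the Cartesian product of the candidate lists."""
    keys = list(hyperparam_dict.keys())
    return [
        dict(zip(keys, values))
        for values in itertools.product(*(hyperparam_dict[k] for k in keys))
    ]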
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # No separate argument for a labels file; always use the one derived from
    # the input account.
    parser.add_argument("-a", "--account", type=str,
                        help="Account used to train the word2vec model.")
    args = parser.parse_args()
    if args.account is None:
        print("Missing account: it must be provided.")
        sys.exit(1)
    else:
        # TODO: check whether the account is valid; should we keep a
        # collection that stores all known accounts?
        pass

    input_segment_file = INPUT_SEGMENT_FILE.format(name=args.account)
    sentences_file = SENTENCES_FILE.format(name=args.account)
    output_model = WORD2VEC_MODEL_FILE.format(name=args.account)
    output_word_vector = WORD2VEC_VECTOR_FILE.format(name=args.account)

    train_word2vec(
        input_segment_file,
        sentences_file,
        output_model,
        output_word_vector,
    )

    word_vector_file = output_word_vector
    word2vec_file = WORD2VEC_FILE.format(name=args.account)
    bin2pkl(word_vector_file, word2vec_file)
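# A minimal sketch of bin2pkl, which is called above but not shown here. It
# assumes the word-vector file was written in the binary word2vec format
# (e.g. via gensim's save_word2vec_format) and that gensim 4.x is installed;
# the real helper may store the vectors in a different structure.
import pickle

from gensim.models import KeyedVectors


def bin2pkl(word_vector_file, word2vec_file):
    """Load binary word2vec vectors and pickle them as a {word: vector} dict."""
    kv = KeyedVectors.load_word2vec_format(word_vector_file, binary=True)
    word_vectors = {word: kv[word] for word in kv.key_to_index}
    with open(word2vec_file, "wb") as f:
        pickle.dump(word_vectors, f)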
# y_test and vocabulary_inv are kept for later use.

# Output shapes
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('Vocabulary Size: {:d}'.format(len(vocabulary_inv)))

# Word2Vec parameters (see train_word2vec)
embedding_dim = 300
min_word_count = 1
context = 10

# Prepare embedding-layer weights for the non-static model
embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                   vocabulary_inv,
                                   num_features=embedding_dim,
                                   min_word_count=min_word_count,
                                   context=context)

# =========================== Create model ===========================
# Model hyperparameters
embedding_dim = 300
filter_sizes = (3, 4, 5)
num_filters = 100
dropout_prob = (0.5, 0.8)
hidden_dims = 50
vocab_size = len(vocabulary_inv)
batch_size = 64
num_epochs = 10

# Create model
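# A hedged sketch of the model the "# Create model" comment leads into: a
# Kim (2014)-style CNN with parallel convolutions over the embedded sequence,
# matching the hyperparameters above. It assumes Keras 2 is the framework,
# that embedding_weights is a (vocab_size, embedding_dim) NumPy array, and
# that the labels are binary; the original model-building code is not shown
# here, so treat this as an illustration, not the repository's architecture.
from keras.layers import (Concatenate, Conv1D, Dense, Dropout, Embedding,
                          GlobalMaxPooling1D, Input)
from keras.models import Model

sequence_length = x_train.shape[1]
inputs = Input(shape=(sequence_length,), dtype="int32")
# Initialise the embedding layer with the word2vec weights (non-static model).
embedded = Embedding(vocab_size, embedding_dim,
                     weights=[np.asarray(embedding_weights)])(inputs)
embedded = Dropout(dropout_prob[0])(embedded)

# One convolution + global-max-pool branch per filter size, concatenated.
conv_blocks = [
    GlobalMaxPooling1D()(Conv1D(num_filters, size, activation="relu")(embedded))
    for size in filter_sizes
]
merged = Dropout(dropout_prob[1])(Concatenate()(conv_blocks))
hidden = Dense(hidden_dims, activation="relu")(merged)
outputs = Dense(1, activation="sigmoid")(hidden)

model = Model(inputs, outputs)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
# model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
#           validation_data=(x_test, y_test))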
# Join the preprocessed tokens back into one document and classify it
# (`text`, `classifiers_word2vec` and `model` come from the surrounding,
# elided context).
preprocessed = ''
for t in text:
    preprocessed = preprocessed + " " + t
preprocessed = [preprocessed]
predict_class_word2vec(preprocessed, classifiers_word2vec, model)

# Get user input.
# https://dl.acm.org/ft_gateway.cfm?id=1298095&ftid=461125&dwn=1&CFID=10092248&CFTOKEN=cdee3755fc691456-5AEDDACC-0F06-2584-E8F69728D3802547
##
print('start training models.......')
dataset_path = 'D:\\Oulu\\NLP\\project\\workspace\\out.txt'
# classifiers, features = train_all_models(dataset_path)
dataset_path_word2vec = 'D:\\Oulu\\NLP\\project\\workspace\\out_lemmas.txt'
classifiers_word2vec, model = train_word2vec(dataset_path_word2vec)

url = input("please input the url of an ACM pdf document\n")
file = retrieve_acm(url)
# show_results(file, classifiers)
show_result_word2vec(file, classifiers_word2vec, model)
# print(text)

url = input("please input another url or type exit\n")
while url != 'exit':
    file = retrieve_acm(url)
    # show_results(file, classifiers)
    show_result_word2vec(file, classifiers_word2vec, model)
    url = input("please input another url or type exit\n")
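# A minimal sketch of predict_class_word2vec, which the fragment above calls
# but does not define. It assumes `model` is a gensim Word2Vec model and
# `classifiers` is an iterable of fitted scikit-learn classifiers; the real
# helper may aggregate vectors or report results differently.
import numpy as np


def predict_class_word2vec(documents, classifiers, model):
    """Average the word vectors of each document and print every
    classifier's predicted class for it."""
    for doc in documents:
        tokens = [t for t in doc.split() if t in model.wv]
        if not tokens:
            print("no known words in document, skipping")
            continue
        doc_vector = np.mean([model.wv[t] for t in tokens],
                             axis=0).reshape(1, -1)
        for clf in classifiers:
            print(type(clf).__name__, clf.predict(doc_vector)[0])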