Example #1
def main():
    import os
    import pickle

    import numpy as np
    from sklearn.preprocessing import LabelEncoder

    # config, load_word2vec() and preprocessing() are assumed to be defined
    # at module level in the surrounding project.

    print("-" * 20 + " " + config.WORD2VEC_MODEL +
          " embedding model is loading " + "-" * 20)
    os.makedirs("preprocessed_data", exist_ok=True)
    try:
        with open("preprocessed_data/word.vec", 'rb') as file:
            word2vec = pickle.load(file)
            wordvec_index = word2vec['wordvec_index']
            word_vectors = word2vec['word_vectors']
    except (OSError, pickle.UnpicklingError):
        # No cached embeddings yet: load them and write the cache.
        wordvec_index, word_vectors = load_word2vec()
        word2vec = {
            "wordvec_index": wordvec_index,
            "word_vectors": word_vectors
        }
        with open("preprocessed_data/word.vec", 'wb') as file:
            pickle.dump(word2vec, file)

    train_set = preprocessing(config.TRAIN_DATA_PATH, wordvec_index)
    test_set = preprocessing(config.TEST_DATA_PATH, wordvec_index)

    # Fit one encoder on the union of train and test labels so both splits
    # share a single label-to-index mapping (calling fit twice would simply
    # overwrite the first fit).
    lbe = LabelEncoder()
    lbe.fit(np.concatenate([train_set['labels'], test_set['labels']]))
    train_set['labels'] = np.array(lbe.transform(train_set['labels']),
                                   dtype='int32')
    test_set['labels'] = np.array(lbe.transform(test_set['labels']),
                                  dtype='int32')
    print(
        "train:\n\ttoken_mat.shape: {}, pos_mat1.shape: {}, pos_mat2.shape: {}"
        .format(train_set['token_mat'].shape, train_set['pos_mat1'].shape,
                train_set['pos_mat2'].shape))
    print(
        "test:\n\ttoken_mat.shape: {}, pos_mat1.shape: {}, pos_mat2.shape: {}".
        format(test_set['token_mat'].shape, test_set['pos_mat1'].shape,
               test_set['pos_mat2'].shape))
    print("data in train/test: {}/{}".format(len(train_set['labels']),
                                             len(test_set['labels'])))
    print("There are {} classes in dataset".format(len(lbe.classes_)))

    data = {
        "wordvec_index": wordvec_index,
        "word_vectors": word_vectors,
        "train_set": train_set,
        "test_set": test_set,
        "label_encoder": lbe
    }

    with open('preprocessed_data/data.pkl', 'wb') as file:
        pickle.dump(data, file)
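
A downstream training script would typically reload the bundle written above; a minimal sketch, assuming a separate consumer script (the dictionary keys are the ones defined in data):

import pickle

with open("preprocessed_data/data.pkl", "rb") as file:
    data = pickle.load(file)

train_set, test_set = data["train_set"], data["test_set"]
label_encoder = data["label_encoder"]           # fitted LabelEncoder
num_classes = len(label_encoder.classes_)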
Example #2
import torch
from torch.autograd import Variable


def predict(config, text, code, model=None, embedding_input=None):
    # load_model, preprocess_text, load_word2vec, np_sentence_to_list and
    # get_char_for_binary are assumed to be project-level helpers.
    if model is None:
        model = load_model(config, code)

    preprocessed = preprocess_text(text)

    if embedding_input is None:
        embedding = []
        word_model = load_word2vec(config.embeddings_model)
        for word in preprocessed.split(' '):
            # Membership check on the KeyedVectors object is equivalent to
            # searching index2word, but much faster.
            if word in word_model.wv:
                vec = word_model.wv[word]
                embedding.append(vec)

        embedding_input = Variable(
            torch.Tensor(np_sentence_to_list(embedding)))

    pred = model(embedding_input)
    # Index of the highest-scoring class for the (single) input sentence.
    pred_label = pred.data.max(1)[1].numpy()[0]
    pred_char = get_char_for_binary(code, pred_label)
    return pred_char
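
A possible call site, shown only as a sketch: the config object, the embeddings model it points to, and the task code "binary_task" are assumptions, not values from the original project.

# Hypothetical usage; predict() loads the model and embeddings on demand.
predicted_char = predict(config, "some input sentence", code="binary_task")
print(predicted_char)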
Example #3
def main(passed_args=None):
    parser = argparse.ArgumentParser(
        description="train a neural network on tweets against prices")
    parser.add_argument(
        "--word2vec",
        "-w",
        dest="word2vec",
        action="store_true",
        default=False,
        help="toggle this option if you are obtaining dataset using word2vec",
    )
    parser.add_argument(
        "--tune",
        "-t",
        dest="tuning",
        action="store_true",
        default=False,
        help="toogle this option if you are tuning hyperparameters",
    )
    parser.add_argument(
        "--rnn",
        "-r",
        dest="train_rnn",
        action="store_true",
        default=False,
        help="toogle this option to train rnn",
    )
    parser.add_argument(
        "--predict",
        "-d",
        dest="predict",
        action="store_true",
        default=False,
        help="toogle this option if you are making predictions",
    )
    parser.add_argument(
        "--markowitz",
        "-m",
        dest="markowitz",
        action="store_true",
        default=False,
        help=
        "toogle this option if you are doing Markowitz portfolio optimisation",
    )
    parser.add_argument(
        "--glove",
        "-g",
        dest="glove",
        action="store_true",
        default=False,
        help="toogle this option if you are obtaining dataset using glove",
    )
    parser.add_argument(
        "--metrics",
        "-f",
        dest="metrics",
        action="store_true",
        default=False,
        help="toogle this option if you are evaluating the metrics",
    )
    args = parser.parse_args(passed_args)
    if args.word2vec:
        # prepare Word2Vec model
        if not os.path.exists(PATH_TO_WORD2VEC):
            w2v.train_word2vec()

        # prepare all data required
        prices = d.load_prices()
        w2v_model = w2v.load_word2vec()
        for stock in stock_universe:
            d.get_return_by_stock(stock, prices)
            d.load_tweets_by_stock(stock)
            w2v.get_padded_embeddings(stock, w2v_model)
        sys.exit()

    if args.glove:
        # prepare all data required
        prices = d.load_prices()
        w2v_model = w2v.load_glove_model(
            path_to_glove="~/Downloads/GloVe-1.2/glove.twitter.27B.50d.txt",
            path_to_output="./temp/glove_pretrained_w2vformat.txt",
        )
        for stock in stock_universe:
            d.get_return_by_stock(stock, prices)
            d.load_tweets_by_stock(stock)
            w2v.get_padded_embeddings(
                stock,
                w2v_model,
                path_to_output="./temp/padded_embeddings/glove_pretrained",
            )
        sys.exit()

    if args.tuning:
        hyperparam_list = get_hyperparam_list(NN_HYPERPARAM_DICT)
        best_hyperparam_list = []
        for stock in stock_universe:
            print(stock)
            x = pd.read_pickle(
                "temp/padded_embeddings/glove_pretrained/pickle/" + stock +
                ".pickle")
            y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
            torch_dataset = nn.get_tensor_dataset(x, y)
            # Collect (hyperparam, final validation loss) pairs across the
            # whole grid; the list must be created before the loop, otherwise
            # only the last configuration would survive the sort below.
            tuning_list = []
            for hyperparam in hyperparam_list:
                train_set, _ = nn.train_test_split(torch_dataset,
                                                   hyperparam["TEST_SIZE"])
                train_set, validation_set = nn.train_test_split(
                    train_set, hyperparam["VALIDATION_SIZE"])
                _, _, validation_losses = nn.train_nn(train_set,
                                                      validation_set,
                                                      hyperparam)
                tuning_list.append((hyperparam, validation_losses[-1]))
            tuning_list.sort(key=operator.itemgetter(1))
            best_hyperparam = tuning_list[0][0]
            best_hyperparam_list.append((stock, best_hyperparam))
        with open("./temp/best-hyperparam-glove-pretrained.txt", "wb") as f:
            pickle.dump(best_hyperparam_list, f)
        print(best_hyperparam_list)
        sys.exit()

    if args.predict:
        if os.path.exists("./temp/best-hyperparam-glove.txt"):
            with open("./temp/best-hyperparam-glove.txt", "rb") as f:
                best_hyperparam_list = pickle.load(f)
                best_hyperparam_dict = dict(best_hyperparam_list)
        for stock in stock_universe:
            hyperparam = best_hyperparam_dict[stock]
            x = pd.read_pickle("temp/padded_embeddings/glove/pickle/" + stock +
                               ".pickle")
            y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
            torch_dataset = nn.get_tensor_dataset(x, y)
            _, test_set = nn.train_test_split(torch_dataset,
                                              hyperparam["TEST_SIZE"])
            results = nn.predict_nn(test_set,
                                    "temp/nn/glove/" + stock + ".pth")
            results_df = pd.DataFrame(results)
            results_df.columns = ["y", "pred", "loss"]
            if not os.path.exists("./output/glove"):
                os.makedirs("./output/glove")
            results_df.to_csv("./output/glove/" + stock + ".csv")
        sys.exit()

    if args.train_rnn:
        eval_only = True
        hyperparam_list = get_hyperparam_list(RNN_HYPERPARAM_DICT)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        for hyperparam in hyperparam_list:
            for stock in stock_universe:
                print(stock)
                returns = pd.read_pickle("temp/returns/pickle/" + stock +
                                         ".pickle")
                returns = nn.normalise(
                    torch.tensor(np.stack(returns.values, axis=0),
                                 device=device))
                vectorised_seq, vocab = rnn.get_vectorised_seq_by_stock(stock)
                input_size = len(vocab)
                encoder, feedforward, results = rnn.train_rnn(
                    vectorised_seq,
                    returns,
                    input_size,
                    hyperparam,
                    eval_only=eval_only,
                    path_to_encoder="temp/rnn/encoder/" + stock + ".pth",
                    path_to_feedforward="temp/rnn/feedforward/" + stock +
                    ".pth",
                )
                if not eval_only:
                    # exist_ok also covers the case where temp/rnn exists but
                    # the encoder/feedforward subdirectories do not.
                    os.makedirs("temp/rnn/encoder", exist_ok=True)
                    os.makedirs("temp/rnn/feedforward", exist_ok=True)
                    torch.save(encoder.state_dict(),
                               "temp/rnn/encoder/" + stock + ".pth")
                    torch.save(
                        feedforward.state_dict(),
                        "temp/rnn/feedforward/" + stock + ".pth",
                    )
                results_df = pd.DataFrame(results)
                results_df.columns = ["returns", "pred", "loss"]
                if not os.path.exists("./output/rnn"):
                    os.makedirs("./output/rnn")
                results_df.to_csv("./output/rnn/" + stock + ".csv")
        sys.exit()

    if args.markowitz:
        model_dict = {
            "dtm": "purple",
            "tfidf": "pink",
            "word2vec": "black",
            "glove": "blue",
            "glove_pretrained": "green",
            "rnn": "orange",
            "actual": "red",
        }
        mean_var_dict = d.get_etf_mean_var()
        p.plot_frontier_with_points(model_dict, mean_var_dict)
        # p.plot_frontier(model_dict)
        sys.exit()

    if args.metrics:
        models = [
            "rnn", "glove", "glove_pretrained", "word2vec", "dtm", "tfidf"
        ]
        for model in models:
            me.get_metrics_summary(model)
        sys.exit()

    if os.path.exists("./temp/best-hyperparam-glove.txt"):
        with open("./temp/best-hyperparam-glove.txt", "rb") as f:
            best_hyperparam_list = pickle.load(f)
            best_hyperparam_dict = dict(best_hyperparam_list)

    for stock in stock_universe:
        print(stock)
        hyperparam = best_hyperparam_dict[stock]
        x = pd.read_pickle("temp/padded_embeddings/glove/pickle/" + stock +
                           ".pickle")
        y = pd.read_pickle("temp/returns/pickle/" + stock + ".pickle")
        torch_dataset = nn.get_tensor_dataset(x, y)
        train_set, test_set = nn.train_test_split(torch_dataset,
                                                  hyperparam["TEST_SIZE"])
        model, _, _ = nn.train_nn(train_set, test_set, hyperparam)
        if not os.path.exists("temp/nn/glove"):
            os.makedirs("temp/nn/glove")
        torch.save(model.state_dict(), "temp/nn/glove/" + stock + ".pth")
    sys.exit()
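
Because main() accepts passed_args, the individual stages can also be driven programmatically; a sketch, assuming the module is importable (each stage ends with sys.exit(), so run one flag per process):

# Hypothetical invocation: build GloVe-based padded embeddings, then exit.
main(["--glove"])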
Example #4
def CNN():
    # Load the word2vec input/output embedding matrices; num_review, numwords,
    # mode and dimension are assumed to be module-level settings.
    W_in, W_out = w2v.load_word2vec(num_review, numwords, mode, dimension)

    print(gd.get_splited_reviews(datas, w2i)[0])
Example #5
# pad every review to a fixed length (pad_sequences from
# keras.preprocessing.sequence is assumed to be imported above)
max_length = 150
review_pad = pad_sequences(sequences, maxlen=max_length)

# train test split
print('Step9: train and test set generation...')
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(review_pad,
                                                    y,
                                                    test_size=0.20,
                                                    random_state=0)

# word2vec Embedding Matrix
print('Step10: Generating word2vec embedding matrix...')
num_words = len(tokenizer_word_index) + 1
embedding_matrix_w2v = word2vec.load_word2vec(
    working_directory + '/' + 'embeddings' + '/' + 'embeddings_w2v.txt',
    tokenizer_word_index=tokenizer_word_index,
    EMBEDDING_DIM=EMBEDDING_DIM)

# build and train the LSTM on top of the word2vec embedding matrix
print('Step11: designing lstm+w2v model...')

w2v_lstm = designing_network.model_architecture_word2vec(
    embedding_matrix_w2v,
    num_words,
    EMBEDDING_DIM=EMBEDDING_DIM,
    max_length=max_length)
w2v_lstm, history = designing_network.fit_network(w2v_lstm, X_train, X_test,
                                                  y_train, y_test)
designing_network.save_network_model(w2v_lstm,
                                     modelname='w2v_lstm',
                                     directory=model_directory)
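
word2vec.load_word2vec here is expected to return a Keras-style embedding matrix rather than a gensim model; a minimal sketch under that assumption (row 0 stays zero for the padding index, and out-of-vocabulary words keep zero vectors):

import numpy as np

def load_word2vec(path, tokenizer_word_index, EMBEDDING_DIM):
    # Hypothetical loader: map each tokenizer index to its pre-trained vector.
    vectors = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            if len(parts) == EMBEDDING_DIM + 1:
                vectors[parts[0]] = np.asarray(parts[1:], dtype="float32")
    embedding_matrix = np.zeros((len(tokenizer_word_index) + 1, EMBEDDING_DIM))
    for word, idx in tokenizer_word_index.items():
        if word in vectors:
            embedding_matrix[idx] = vectors[word]
    return embedding_matrix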
Example #6
                        help='No. of hidden units [128]')
    parser.add_argument('--decay',
                        default=1.0,
                        type=float,
                        help='Learning rate decay [1.0]')
    parser.add_argument('--dropout',
                        default=0.0,
                        type=float,
                        help='Probability of dropping [0.0]')
    args = parser.parse_args()

    n_epochs = args.epochs
    batch_size = args.batch
    lr = args.lr
    n_hidden = args.hidden
    decay = args.decay
    dropout = args.dropout

    dataset = args.dataset

    train_filename = '../data/{}/train'.format(dataset)
    val_filename = '../data/{}/dev.out'.format(dataset)
    out_filename = '../data/{}/dev.p5.out'.format(dataset)
    test_filename = '../data/{}/test.in'.format(dataset)
    test_out_filename = '../data/{}/test.p5.out'.format(dataset)

    word2vec_dir = 'weights/word2vec/{}'.format(dataset)
    w2v_W, w2v_U = load_word2vec(word2vec_dir)

    main()