def initialize_inputs(self):
        # Read data
        # Use the kaggle Bag of words vs Bag of popcorn data:
        # https://www.kaggle.com/c/word2vec-nlp-tutorial/data
        data_labeled = pd.read_csv(self.labeled_data_path,
                                   header=0,
                                   delimiter="\t",
                                   quoting=3,
                                   on_bad_lines="skip",
                                   encoding="utf-8")
        data_unlabeled = pd.read_csv(self.unlabeled_data_path,
                                     header=0,
                                     delimiter="\t",
                                     quoting=3,
                                     on_bad_lines="skip",
                                     encoding="utf-8")
        # The labeled and unlabeled reviews are combined to train the word2vec model
        data_combined = pd.concat([data_labeled, data_unlabeled])

        # Load or create(if not exists) word2vec model
        self.word2vecmodel = features_word2vec.get_word2vec_model(
            data_combined,
            "review",
            num_features=50,
            downsampling=1e-3,
            model_path=self.word2vecmodel_path)
        # Create the embedding weight matrix: row i holds the word2vec vector for word index i
        self.embedding_weights = features_word2vec.create_embedding_weights(
            self.word2vecmodel, writeEmbeddingFileName=self.embedding_path)

        # Map words to indices
        self.X = features_word2vec.get_indices_word2vec(
            data_labeled,
            "review",
            self.word2vecmodel,
            maxLength=100,
            writeIndexFileName=self.text2indices_path_label,
            padLeft=True)

        self.y = data_labeled["sentiment"]

        self.X_test_data = features_word2vec.get_indices_word2vec(
            data_unlabeled,
            "review",
            self.word2vecmodel,
            maxLength=100,
            writeIndexFileName=self.text2indices_path_unlabel,
            padLeft=True)
        # convert types
        self.embedding_weights = self.embedding_weights.astype("float32")
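A minimal sketch (not from the original repo) of how these prepared inputs would typically be consumed: the embedding matrix initializes a Keras Embedding layer, and the padded index sequences feed an LSTM classifier. The layer sizes and the tf.keras API choice are assumptions.

from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.models import Sequential

def build_sketch_model(embedding_weights):
    # embedding_weights: the (vocab_size, num_features) matrix built above
    vocab_size, embed_dim = embedding_weights.shape
    model = Sequential([
        # Initialize the layer with the pre-trained word2vec vectors
        Embedding(vocab_size, embed_dim,
                  embeddings_initializer=Constant(embedding_weights)),
        LSTM(64),                         # illustrative size
        Dense(1, activation="sigmoid"),   # binary sentiment output
    ])
    model.compile(loss="binary_crossentropy", optimizer="adam")
    return model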
Example #2
def data_prep():
    # Read data
    # Use the kaggle Bag of words vs Bag of popcorn data:
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/data

    data = pd.read_csv(labeled_data_path, header=0, delimiter="\t",
                       quoting=3, on_bad_lines="skip", encoding="utf-8")

    data2 = pd.read_csv(unlabeled_data_path, header=0, delimiter="\t",
                        quoting=3, on_bad_lines="skip", encoding="utf-8")

    # data and data2 are combined to train the word2vec model
    data3 = pd.concat([data, data2])

    model = features_word2vec.get_word2vec_model(data3, "review", num_features=50, downsampling=1e-3, model_path=model_path)
    embedding_weights = features_word2vec.create_embedding_weights(model)
    features = features_word2vec.get_indices_word2vec(data, "review", model, maxLength=500,
                                                      writeIndexFileName="./model/imdb_indices.pickle",
                                                      padLeft=True)
    return model, embedding_weights, features
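For intuition, here is what padLeft=True is assumed to produce (cf. the padding comment in Example #5): sequences shorter than maxLength are zero-padded on the left.

indices = [12, 7, 431]                      # word indices for a 3-token review
maxLength = 8
padded = [0] * (maxLength - len(indices)) + indices
print(padded)                               # [0, 0, 0, 0, 0, 12, 7, 431]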
Example #3
def data_prep_quora():
    # Read data
    # Use the Kaggle Quora Question Pairs data:
    # https://www.kaggle.com/c/quora-question-pairs/data

    data = pd.read_csv(labeled_data_path, delimiter=",", engine="python", encoding="utf8")
    data_concat = pd.concat([data[["question1"]],
                             data[["question2"]].rename(columns={"question2": "question1"})],
                            axis=0)
    model = features_word2vec.get_word2vec_model(data_concat, "question1", num_features=300,
                                                 downsampling=1e-3, model_path=model_path)

    embedding_weights = features_word2vec.create_embedding_weights(
        model, writeEmbeddingFileName="./model/embedding_weights_quora_tmp.pkl")

    features1 = features_word2vec.get_indices_word2vec(data, "question1", model, maxLength=maxSeqLength,
                                                       writeIndexFileName="./model/quora_indices1.pickle",
                                                       padLeft=True)
    features2 = features_word2vec.get_indices_word2vec(data, "question2", model, maxLength=maxSeqLength,
                                                       writeIndexFileName="./model/quora_indices2.pickle",
                                                       padLeft=True)
    label = data["is_duplicate"]

    return model, embedding_weights, features1, features2, label
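A hedged sketch (not from the original repo) of consuming the paired outputs: a shared embedding and LSTM encode both questions, and a dense head scores is_duplicate. Layer sizes and the tf.keras API are assumptions.

from tensorflow.keras import Input, Model
from tensorflow.keras.initializers import Constant
from tensorflow.keras.layers import Concatenate, Dense, Embedding, LSTM

def build_pair_model(embedding_weights, max_len):
    vocab_size, embed_dim = embedding_weights.shape
    # Shared weights: both questions pass through the same encoder
    embed = Embedding(vocab_size, embed_dim,
                      embeddings_initializer=Constant(embedding_weights))
    encoder = LSTM(128)                           # illustrative size
    q1 = Input(shape=(max_len,))
    q2 = Input(shape=(max_len,))
    merged = Concatenate()([encoder(embed(q1)), encoder(embed(q2))])
    out = Dense(1, activation="sigmoid")(merged)  # P(is_duplicate)
    return Model([q1, q2], out)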
Example #4
    # data and data2 are combined to train the word2vec model
    data2 = pd.concat([data, data2])

    # Construct word2vec model
    model_path = "./model/300features_40minwords_10context"

    if not os.path.isfile(model_path):
        model = features_word2vec.get_word2vec_model(data2, "review", num_features=300, downsampling=1e-3, model_name=model_path)
    else:
        # Once the model has been trained, it can be loaded from disk
        model = features_word2vec.load_word2vec_model(model_name=model_path)

    # Create the embedding weight matrix: row i holds
    # the word2vec vector for word index i
    embedding_weights = features_word2vec.create_embedding_weights(model)

    # Map words to indices
    features = features_word2vec.get_indices_word2vec(data, "review", model, maxLength=500,
                                                      writeIndexFileName="./model/imdb_indices.pickle",
                                                      padLeft=True)
    y = data["sentiment"]
    X_train, y_train, X_test, y_test = data_split.train_test_split_shuffle(y, features, test_size=0.1)

    # Vanilla LSTM
    model_lstm.classif_imdb(X_train, y_train, X_test, y_test, embedding_weights=embedding_weights,
                            dense_dim=256, nb_epoch=3)

    # Compare with LSTM + CNN
    #model_lstm.classif_imdb(X_train, y_train, X_test, y_test, embedding_weights=embedding_weights, dense_dim=256,
    #                        nb_epoch=3, include_cnn = True)

    # Compare with LSTM without embedding
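    # A plausible completion, mirroring Example #5 below: passing
    # embedding_weights=None trains the embedding layer from scratch.
    #model_lstm.classif_imdb(X_train, y_train, X_test, y_test, embedding_weights=None,
    #                        dense_dim=256, nb_epoch=3)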
Example #5
def func():
    from sklearn import metrics
    from lib import data_split, features_word2vec, model_lstm, model_randomforest
    import pandas as pd
    import os

    # Read data
    # Use the kaggle Bag of words vs Bag of popcorn data:
    # The data is downloaded from:
    # https://www.kaggle.com/c/word2vec-nlp-tutorial/data
    data = pd.read_csv("./data/labeledTrainData.tsv",
                       header=0,
                       delimiter="\t",
                       quoting=3,
                       encoding="utf-8")

    print("The labeled training set dimension is:\n")
    print(data.shape)

    data2 = pd.read_csv("./data/unlabeledTrainData.tsv",
                        header=0,
                        delimiter="\t",
                        quoting=3,
                        encoding="utf-8")

    print("The unlabeled training set dimension is:\n")
    print(data2.shape)

    # The labeled (data) and unlabeled (data2) reviews
    # are combined to train the word2vec model
    data2 = pd.concat([data2, data])
    print(data2.shape)

    model_path = "./model/300features_40minwords_10context"

    # If we have a pre-trained model we'd like to use, it can be loaded here directly.
    # Otherwise we will use the existing data to train it from scratch
    if not os.path.isfile(model_path):
        model = features_word2vec.get_word2vec_model(data2,
                                                     "review",
                                                     num_features=300,
                                                     downsampling=1e-3,
                                                     model_name=model_path)
    else:
        # Once the model has been trained, it can be loaded from disk
        model = features_word2vec.load_word2vec_model(model_name=model_path)
    embedding_weights = features_word2vec.create_embedding_weights(model)
    print(embedding_weights.shape)

    # We also need to prepare the word2vec features: each word is mapped
    # to an index consistent with the trained embedding.
    # Currently, we limit each review to 500 words.
    # By default, we pad the left-hand side of each vector with zeros,
    # e.g. [0, 0, 0, ..., 0.27, 0.89, 0.35]
    features = features_word2vec.get_indices_word2vec(
        data,
        "review",
        model,
        maxLength=500,
        writeIndexFileName="./model/imdb_indices.pickle",
        padLeft=True)

    # Now we separate data for training and validation
    y = data["sentiment"]
    X_train, y_train, X_test, y_test = data_split.train_test_split_shuffle(
        y, features, test_size=0.1)

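    # Vanilla LSTM with the pre-trained word2vec embedding (cf. Example #4)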
    model_lstm.classif_imdb(X_train,
                            y_train,
                            X_test,
                            y_test,
                            embedding_weights=embedding_weights,
                            dense_dim=256,
                            nb_epoch=3)

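    # Compare with LSTM + CNN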
    model_lstm.classif_imdb(X_train,
                            y_train,
                            X_test,
                            y_test,
                            embedding_weights=embedding_weights,
                            dense_dim=256,
                            nb_epoch=3,
                            include_cnn=True)

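    # Compare with an LSTM trained without the pre-trained embedding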
    model_lstm.classif_imdb(X_train,
                            y_train,
                            X_test,
                            y_test,
                            embedding_weights=None,
                            dense_dim=256,
                            nb_epoch=3)

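    # Baseline: average the word2vec vectors per review and fit a random forest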
    features_avg_word2vec = features_word2vec.get_avgfeatures_word2vec(
        data, "review", model)
    X_train, y_train, X_test, y_test = data_split.train_test_split_shuffle(
        y, features_avg_word2vec, test_size=0.1)
    model_randomforest.classif(X_train, y_train, X_test, y_test)