Example 1
import multiprocessing
import os
from multiprocessing import Process

# TwitterData and preprocess come from the project's own modules
# (see Example 3 for preprocess) and are assumed to be importable here.


def prepare_data(min_occurrences):
    training_data = None
    testing_data = None
    print("Loading data...")
    test_data_file_name = "data/processed_test_word2vec_bow_" + str(
        min_occurrences) + ".csv"
    train_data_file_name = "data/processed_train_word2vec_bow_" + str(
        min_occurrences) + ".csv"
    use_cache = os.path.isfile(train_data_file_name) and os.path.isfile(
        test_data_file_name)
    if use_cache:
        training_data = TwitterData()
        training_data.initialize(None, from_cached=train_data_file_name)
        training_data = training_data.data_model

        testing_data = TwitterData()
        testing_data.initialize(None, from_cached=test_data_file_name)
        testing_data = testing_data.data_model
        print("Loaded from cached files...")
    else:
        print("Preprocessing data...")
        with multiprocessing.Manager() as manager:

            results = manager.dict()

            preprocess_training = Process(target=preprocess,
                                          args=(
                                              results,
                                              "data/train.csv",
                                              False,
                                              "train",
                                              min_occurrences,
                                              train_data_file_name,
                                          ))
            preprocess_testing = Process(target=preprocess,
                                         args=(
                                             results,
                                             "data/test.csv",
                                             True,
                                             "test",
                                             min_occurrences,
                                             test_data_file_name,
                                         ))

            preprocess_training.start()
            preprocess_testing.start()
            print("Multiple processes started...")

            preprocess_testing.join()
            print("Preprocessed testing data...")

            preprocess_training.join()
            print("Preprocessed training data...")

            training_data = results["train"]
            testing_data = results["test"]

            print("Data preprocessed & cached...")

    return training_data, testing_data
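The cache file names encode the vocabulary threshold, so each min_occurrences value gets its own pair of CSVs. A minimal driver sketch under the data/ layout assumed above (the __main__ guard is required for multiprocessing on Windows):

if __name__ == "__main__":
    # The first call preprocesses data/train.csv and data/test.csv in
    # parallel and writes data/processed_{train,test}_word2vec_bow_3.csv;
    # a second call with the same threshold loads those cached files.
    train_model, test_model = prepare_data(min_occurrences=3)
    print(len(train_model), len(test_model))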
Example 2
def main():
    for m in range(3, 4):
        print("Preparing data with min_occurrences=" + str(m))
        training_data, testing_data = prepare_data(m)
        # log is the project's logging helper
        log("********************************************************")
        log("Validating for {0} min_occurrences:".format(m))
        # drop idx & id columns
        # if training_data.columns[0] == "idx":
        #     training_data = training_data.iloc[:, 1:]
        #
        # if testing_data.columns[0] == "idx":
        #     testing_data = testing_data.iloc[:, 1:]
        #
        # if "original_id" in training_data.columns:
        #     training_data.drop("original_id", axis=1, inplace=True)
        #
        # if "original_id" in testing_data.columns:
        #     testing_data.drop("original_id", axis=1, inplace=True)

        # rebuild the full feature pipeline from the raw CSV; word2vec is
        # assumed to be a pre-loaded embedding provider (see the sketch below)
        td = TwitterData()
        td.initialize("data\\train.csv")
        td.build_features()
        td.cleanup(TwitterCleanuper())
        td.tokenize()
        td.stem()
        td.build_wordlist()
        td.build_final_model(word2vec)

        td.data_model.head(5)  # preview the first rows of the final model

    print("Done!")
Example 3
def preprocess(results,
               data_path,
               is_testing,
               data_name,
               min_occurrences=5,
               cache_output=None):
    # run the full preprocessing pipeline and publish the resulting data
    # model in the shared results dict under data_name
    twitter_data = TwitterData()
    twitter_data.initialize(data_path, is_testing)
    twitter_data.build_features()
    twitter_data.cleanup(TwitterCleanuper())
    twitter_data.tokenize()
    twitter_data.stem()
    twitter_data.build_wordlist(min_occurrences=min_occurrences)
    #twitter_data.build_data_model()
    # twitter_data.build_ngrams()
    # twitter_data.build_ngram_model()
    # twitter_data.build_data_model(with_ngram=2)
    # word2vec = Word2VecProvider()
    # word2vec.load("H:\\Programowanie\\glove.twitter.27B.200d.txt")
    # twitter_data.build_word2vec_model(word2vec)
    # persist the processed data model so later runs can load it from cache
    if cache_output is not None:
        twitter_data.data_model.to_csv(cache_output,
                                       index_label="idx",
                                       float_format="%.6f")
    results[data_name] = twitter_data.data_model
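For debugging, preprocess can also run in the calling process with a plain dict instead of a Manager dict, since it only assigns one key. A minimal sketch assuming the same data/ layout (the cache file name is a placeholder):

results = {}
preprocess(results,
           "data/train.csv",
           False,
           "train",
           min_occurrences=5,
           cache_output="data/processed_train_debug.csv")  # placeholder name
train_model = results["train"]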