import os
import multiprocessing
from multiprocessing import Process

# TwitterData, TwitterCleanuper, Word2VecProvider and log are assumed to be
# provided by the project's own modules; they are not defined in this file.


def prepare_data(min_occurrences):
    training_data = None
    testing_data = None
    print("Loading data...")
    test_data_file_name = "data/processed_test_word2vec_bow_" + str(min_occurrences) + ".csv"
    train_data_file_name = "data/processed_train_word2vec_bow_" + str(min_occurrences) + ".csv"
    use_cache = os.path.isfile(train_data_file_name) and os.path.isfile(test_data_file_name)
    if use_cache:
        # Re-use the previously preprocessed data models cached as CSV files.
        training_data = TwitterData()
        training_data.initialize(None, from_cached=train_data_file_name)
        training_data = training_data.data_model
        testing_data = TwitterData()
        testing_data.initialize(None, from_cached=test_data_file_name)
        testing_data = testing_data.data_model
        print("Loaded from cached files...")
    else:
        print("Preprocessing data...")
        # Preprocess the training and testing sets in parallel, collecting the
        # resulting data models through a manager-backed shared dictionary.
        with multiprocessing.Manager() as manager:
            results = manager.dict()

            preprocess_training = Process(target=preprocess, args=(
                results, "data/train.csv", False, "train", min_occurrences, train_data_file_name,))
            preprocess_testing = Process(target=preprocess, args=(
                results, "data/test.csv", True, "test", min_occurrences, test_data_file_name,))

            preprocess_training.start()
            preprocess_testing.start()
            print("Multiple processes started...")

            preprocess_testing.join()
            print("Preprocessed testing data...")

            preprocess_training.join()
            print("Preprocessed training data...")

            training_data = results["train"]
            testing_data = results["test"]
        print("Data preprocessed & cached...")

    return training_data, testing_data
def main():
    for m in range(3, 4):
        print("Preparing data with min_occurrences=" + str(m))
        training_data, testing_data = prepare_data(m)
        log("********************************************************")
        log("Validating for {0} min_occurrences:".format(m))

        # drop idx & id columns
        # if training_data.columns[0] == "idx":
        #     training_data = training_data.iloc[:, 1:]
        #
        # if testing_data.columns[0] == "idx":
        #     testing_data = testing_data.iloc[:, 1:]
        #
        # if "original_id" in training_data.columns:
        #     training_data.drop("original_id", axis=1, inplace=True)
        #
        # if "original_id" in testing_data.columns:
        #     testing_data.drop("original_id", axis=1, inplace=True)

        # The word2vec provider must be loaded before building the final model;
        # the GloVe path below is taken from the preprocessing code and should
        # be adjusted to your local copy of the embeddings.
        word2vec = Word2VecProvider()
        word2vec.load("H:\\Programowanie\\glove.twitter.27B.200d.txt")

        td = TwitterData()
        td.initialize("data\\train.csv")
        td.build_features()
        td.cleanup(TwitterCleanuper())
        td.tokenize()
        td.stem()
        td.build_wordlist()
        td.build_final_model(word2vec)
        td.data_model.head(5)

    print("Done!")
def preprocess(results, data_path, is_testing, data_name, min_occurrences=5, cache_output=None):
    twitter_data = TwitterData()
    twitter_data.initialize(data_path, is_testing)
    twitter_data.build_features()
    twitter_data.cleanup(TwitterCleanuper())
    twitter_data.tokenize()
    twitter_data.stem()
    twitter_data.build_wordlist(min_occurrences=min_occurrences)
    # twitter_data.build_data_model()
    # twitter_data.build_ngrams()
    # twitter_data.build_ngram_model()
    # twitter_data.build_data_model(with_ngram=2)
    # word2vec = Word2VecProvider()
    # word2vec.load("H:\\Programowanie\\glove.twitter.27B.200d.txt")
    # twitter_data.build_word2vec_model(word2vec)
    print(cache_output)
    if cache_output is not None:
        twitter_data.data_model.to_csv(cache_output, index_label="idx", float_format="%.6f")
    results[data_name] = twitter_data.data_model
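# A minimal entry-point guard (not in the original snippet): prepare_data()
# spawns worker processes, so the module must be importable without side
# effects; otherwise multiprocessing's "spawn" start method (the default on
# Windows) would re-run main() in every child process.
if __name__ == "__main__":
    main()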