def workflow(dataset):
    # run the full text-preprocessing pipeline over the dataset, step by step
    dataset = remove_empty_words_1(dataset)
    dataset = tokenize_data(dataset)
    dataset = remove_stop_words(dataset)
    dataset = stemming(dataset)
    dataset = lemmatization(dataset)
    dataset = remove_garbage(dataset)
    return dataset
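The helper functions behind these steps are not shown in this snippet, so here is a minimal, self-contained sketch of what each stage commonly looks like using NLTK. The function bodies below, and the assumption that the dataset is a list of raw text strings, are illustrative only and not the repository's actual implementation.

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# one-time downloads of the NLTK resources used below
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")


def tokenize_data(docs):
    # split each document into lowercase word tokens
    return [nltk.word_tokenize(doc.lower()) for doc in docs]


def remove_stop_words(tokenized_docs):
    # drop common English function words
    stop_words = set(stopwords.words("english"))
    return [[tok for tok in doc if tok not in stop_words] for doc in tokenized_docs]


def stemming(tokenized_docs):
    # reduce tokens to crude stems (e.g. "running" -> "run")
    stemmer = PorterStemmer()
    return [[stemmer.stem(tok) for tok in doc] for doc in tokenized_docs]


def lemmatization(tokenized_docs):
    # map tokens to dictionary lemmas
    lemmatizer = WordNetLemmatizer()
    return [[lemmatizer.lemmatize(tok) for tok in doc] for doc in tokenized_docs]


def remove_garbage(tokenized_docs):
    # keep only purely alphabetic tokens, discarding punctuation and numbers
    return [[tok for tok in doc if tok.isalpha()] for doc in tokenized_docs]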
def testing_data():
    pos_train_data_path = os.path.join(PATH, "dataset", "pos", "")
    neg_train_data_path = os.path.join(PATH, "dataset", "neg", "")
    dataset = load_data(pos_train_data_path, neg_train_data_path)
    dataset = remove_empty_words_1(dataset)
    dataset = tokenize_data(dataset)
    dataset = remove_stop_words(dataset)
    dataset = stemming(dataset)
    dataset = lemmatization(dataset)
    dataset = remove_garbage(dataset)
    return dataset
def testing_data():
    logging.debug('{} - ({}) processing testing data started'.format(
        script_name, str(datetime.datetime.now())))
    pos_train_data_path = os.path.join(PATH, "dataset", "pos", "")
    neg_train_data_path = os.path.join(PATH, "dataset", "neg", "")
    dataset = load_data(pos_train_data_path, neg_train_data_path)
    dataset = remove_empty_words_1(dataset)
    dataset = tokenize_data(dataset)
    dataset = remove_stop_words(dataset)
    dataset = stemming(dataset)
    dataset = lemmatization(dataset)
    dataset = remove_garbage(dataset)
    logging.debug('{} - ({}) processing testing data finished'.format(
        script_name, str(datetime.datetime.now())))
    return dataset
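These debug calls assume that logging has already been configured and that script_name is defined somewhere in the module. A minimal sketch of what that setup might look like follows; the log file name and format string are assumptions, not taken from the project.

import logging
import os

# hypothetical module-level setup; the project's real configuration may differ
script_name = os.path.basename(__file__)

logging.basicConfig(
    filename="preprocessing.log",  # assumed log file name
    level=logging.DEBUG,           # DEBUG level so logging.debug() calls are emitted
    format="%(message)s",
)

As a side note, the logging module can also stamp each record itself via %(asctime)s in the format string, which is an alternative to formatting datetime.datetime.now() by hand inside every message.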