# Query, Vocab, Tokenizer, Lemmatizer and get_ngrams are project-local helpers.
def load_queries(queries_filename):
    """Load tab-separated queries (id, text, optional synonym text) and build a vocab."""
    queries = {}
    vocab = Vocab()
    tokenizer = Tokenizer()
    lemmatizer = Lemmatizer()
    with open(queries_filename, 'r') as file:
        for l in file:
            l_arr = l.rstrip('\n').split('\t')
            q = Query()
            q.id = int(l_arr[0])
            q_text = l_arr[1]
            q_syn_text = l_arr[2] if len(l_arr) > 2 else ''
            # strip so queries without synonyms carry no trailing space
            q.text = (q_text + ' ' + q_syn_text).strip()
            q.tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_text))
            q.synonim_tokens = lemmatizer.fit_transform(tokenizer.fit_transform(q_syn_text))
            queries[q.id] = q

    # create vocab
    for q in queries.values():
        tokens = q.tokens + q.synonim_tokens
        vocab.add_phrase(tuple(q.tokens))
        for tkn in tokens:
            vocab.add1(tkn)
        # ordered, inverted and gapped bigrams over the combined token list
        grams, inv_grams, gap_grams = get_ngrams(tokens, 2, inverted=True, with_gap=True)
        for g in grams + inv_grams + gap_grams:
            vocab.add2(g)
    return queries, vocab
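# A minimal usage sketch, assuming a tab-separated file with lines of the form
# "<id>\t<query text>\t<optional synonym text>". The file name and id below are
# hypothetical, not taken from the source.
#
#     queries, vocab = load_queries("queries.tsv")
#     q = queries[12]
#     print(q.text)    # query text plus synonym text, if any
#     print(q.tokens)  # lemmatized tokens of the query text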
def lemmatize(self, stop_words=None):
    """Tokenize and lemmatize every stored query, keyed by integer query id."""
    tokenizer = Tokenizer(stop_words=stop_words)
    lemmatizer = Lemmatizer(stop_words=stop_words)
    self.lemmatized_queries = dict()
    for q_id in self.queries.dict.keys():
        q = self.queries.get(q_id)
        tok_q = tokenizer.fit_transform(q)
        lem_q = lemmatizer.fit_transform(tok_q)
        self.lemmatized_queries[int(q_id)] = lem_q
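# A hypothetical call site (the owning class is not shown in the source; it is
# assumed to hold a queries container exposing .dict and .get):
#
#     collection.lemmatize(stop_words={"the", "a", "an"})
#     print(collection.lemmatized_queries[12])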
    removeStopWords=args["removeStopWords"],
    doSpellingCorrection=args["doSpellingCorrection"],
    removeNewLine=args["removeNewLine"],
    removePunctuation=args["removePunctuation"],
    removeHtmlTags=args["removeHtmlTags"],
    minTextLength=args["minTextLength"])
predict_df["processed"] = preprocessor.fit_transform(predict_df["text_german"])
predict_df = predict_df.dropna(subset=["processed"], axis=0)

print("Tokenize")
tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                      ngram=preperation_ngram,
                      fasttextFile=args["fasttext_file"],
                      doLower=args["doLower"])
predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])

## for testing purposes
#train_df = train_df.sample(100)
#val_df = val_df.sample(20)
#test_df = test_df.sample(20)

## apply the model
labels = [
    "price_pos", "price_neg", "quality_pos", "quality_neg",
    "restaurant_pos", "restaurant_neg", "food_pos", "food_neg",
    "drinks_pos", "drinks_neg", "ambience_pos", "ambience_neg",
    "service_pos", "service_neg",
]
sentimentDict = {
    "pos": "positiv",
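# Each label above combines an aspect with a polarity suffix, and sentimentDict
# maps the suffix to a German display string ("positiv" = "positive"). A
# hypothetical helper to split a label back apart (illustrative only, not part
# of the source):
#
#     aspect, sentiment = "price_pos".rsplit("_", 1)   # -> ("price", "pos")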
    val_df.to_pickle(val_pre_path)
    test_df.to_pickle(test_pre_path)
else:
    train_df = pd.read_pickle(train_pre_path)
    val_df = pd.read_pickle(val_pre_path)
    test_df = pd.read_pickle(test_pre_path)

## get the data column: the only column that is not a target
data_column = list(set(train_df.columns) - set(args["targets"]))[0]

if run_tokenization:
    ## do tokenization
    print("Tokenize")
    tokenizer = Tokenizer(tokenizeStr=tokenizer_model[0],
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    train_df[data_column] = tokenizer.fit_transform(train_df[data_column])
    val_df[data_column] = tokenizer.transform(val_df[data_column])
    test_df[data_column] = tokenizer.transform(test_df[data_column])

    ## save the preprocessed data
    if not os.path.exists(os.path.join(args["data_path"], "temp")):
        os.makedirs(os.path.join(args["data_path"], "temp"))
    train_df.to_pickle(train_tok_path)
    val_df.to_pickle(val_tok_path)
    test_df.to_pickle(test_tok_path)
else:
    train_df = pd.read_pickle(train_tok_path)
    val_df = pd.read_pickle(val_tok_path)
    test_df = pd.read_pickle(test_tok_path)
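# Note on the fit_transform / transform split above: whatever vocabulary or
# statistics the tokenizer learns must come from the training split only, so
# val/test reuse the fitted state instead of refitting. A minimal sketch of
# that contract (illustrative; the real Tokenizer is project code):

class _FitTransformSketch:
    def fit_transform(self, texts):
        # learn the vocabulary from the training texts, then tokenize them
        self.vocab = {tok for text in texts for tok in text.split()}
        return [text.split() for text in texts]

    def transform(self, texts):
        # tokenize with the already-learned vocabulary; unseen tokens are dropped
        return [[tok for tok in text.split() if tok in self.vocab]
                for text in texts]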
np.save(train_pre_path.format("data"), train_data, allow_pickle=True)
np.save(val_pre_path.format("data"), val_data, allow_pickle=True)
np.save(test_pre_path.format("data"), test_data, allow_pickle=True)
np.save(train_pre_path.format("target"), train_target, allow_pickle=True)
np.save(val_pre_path.format("target"), val_target, allow_pickle=True)
np.save(test_pre_path.format("target"), test_target, allow_pickle=True)

if run_tokenization:
    ## do tokenization
    print("Tokenize")
    tokenizer = Tokenizer(args=tokenizer_model,
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    train_data = tokenizer.fit_transform(train_data)
    val_data = tokenizer.transform(val_data)
    test_data = tokenizer.transform(test_data)

    ## save the preprocessed data; vectorizers may return sparse matrices,
    ## which np.save cannot store directly, so route those through save_npz
    if not os.path.exists(os.path.join(args["data_path"], "temp")):
        os.makedirs(os.path.join(args["data_path"], "temp"))
    if sparse.issparse(train_data):
        sparse.save_npz(train_tok_path.format("data"), train_data)
    else:
        np.save(train_tok_path.format("data"), train_data)
    np.save(train_tok_path.format("target"), train_target)
    if sparse.issparse(val_data):
        sparse.save_npz(val_tok_path.format("data"), val_data)
    else:
        np.save(val_tok_path.format("data"), val_data)
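# A hypothetical loader mirroring the save logic above, assuming the path
# templates carry no extension (save_npz appends ".npz" and np.save appends
# ".npy" when the name lacks one):

import os
import numpy as np
from scipy import sparse

def load_split_data(path_template):
    npz_path = path_template.format("data") + ".npz"
    if os.path.exists(npz_path):
        return sparse.load_npz(npz_path)
    return np.load(path_template.format("data") + ".npy", allow_pickle=True)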