# Exploratory cells: push one raw sample `xx` through the helpers in `utils`.
# Bare expressions (np.array(xxx), xx_clean, x.shape) only display a value
# when run as notebook cells; they have no effect in a plain script.

# input must be 2D list -> hence the extra [] around xx
xxx = utils.prepare_words([xx], contractions)
np.array(xxx)


# In[ ]:


# In[13]:

# Two-step variant: clean_sentence first, then replace_contr on its result.
xx_clean = utils.clean_sentence(xx)
xx_clean = utils.replace_contr(xx_clean, contractions)
xx_clean


# In[14]:

# Same replace_contr call on the raw sample; overwrites the In[13] result.
xx_clean = utils.replace_contr(xx, contractions)
xx_clean


# In[15]:

# Tokenise with a fixed sentence_size of 30 using the word_to_idx mapping.
x = utils.token_sens(xx_clean, sentence_size=30, word_to_idx=word_to_idx)
x.shape
# ## preprocess

# In[4]:

# Load the object that is later handed to utils.replace_contr.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load contractions.pkl from a trusted location.
with open('./model/contractions.pkl', 'rb') as f:
    contractions = pickle.load(f)


# In[5]:

# clean, contract, split to sentence, token - if in dict, ok; if not in dict, replace with <digit>

# Preview the first (up to) ten negative samples after cleaning.
# Slicing instead of indexing 0..9 avoids an IndexError when neg_data holds
# fewer than ten items.
for sample in neg_data[:10]:
    # clean_sentence is given a one-element list, so s[0] is the single result.
    s = utils.clean_sentence([sample])
    s = utils.replace_contr(s, contractions)
    print(repr(s[0]))
    print('\n')


# In[6]:

# Clean every positive sample; tqdm only adds a progress bar.
pos_data_clean = []
for x in tqdm(pos_data, total=len(pos_data)):
    x = utils.clean_sentence([x])
    x = utils.replace_contr(x, contractions)
    pos_data_clean.append(x[0])

neg_data_clean = []
# In[7]:

# Load the object that is later handed to utils.replace_contr.
# NOTE(review): pickle.load runs arbitrary code from the file; keep it trusted.
with open('../model/contractions.pkl', 'rb') as f:
    contractions = pickle.load(f)


# In[8]:

# clean, contract, split to sentence, token - if in dict, ok; if not in dict, replace with <digit>


# In[9]:

# NOTE(review): each item is passed to utils.clean_sentence as-is here, while
# other fragments in this file wrap it as [x] first -- confirm that the items
# of pos_data / neg_data already have the shape clean_sentence expects.

pos_data_clean = []
for x in tqdm(pos_data, total=len(pos_data)):
    x = utils.replace_contr(utils.clean_sentence(x), contractions)
    # Optional extra steps, use these if need to train "phrase":
    #   utils.split_document(x[0]) followed by utils.refine_document(x)
    pos_data_clean.append(x[0])

neg_data_clean = []
for x in tqdm(neg_data, total=len(neg_data)):
    x = utils.replace_contr(utils.clean_sentence(x), contractions)
    # Optional extra steps, use these if need to train "phrase":
    #   utils.split_document(x[0]) followed by utils.refine_document(x)
    neg_data_clean.append(x[0])

# Sanity check: number of cleaned positive and negative samples.
print(len(pos_data_clean), len(neg_data_clean))
# NOTE(review): the `def` header was not present in the visible source; the
# name and single argument are reconstructed from the call sites
# (`replace_digit(x)`) and the body's use of `txt` -- confirm before merging.
def replace_digit(txt):
    """Remove numeric tokens from *txt*.

    Numbers (digits optionally followed by commas, dots or percent signs) are
    first rewritten to the placeholder ``digit_char``; every run of
    placeholders is then deleted together with the whitespace around it.

    Args:
        txt: input string.

    Returns:
        The string with numeric tokens removed.
    """
    digit_p = r'[0-9]+[,]*[,]*[.]*[%]*[%]*'
    txt_clean = re.sub(digit_p, 'digit_char', txt, flags=re.MULTILINE)
    # Match the literal placeholder with a group.  The previous pattern used
    # the character class [digit_char], which matches any run of the letters
    # d/i/g/t/_/c/h/a/r and therefore also deleted parts of ordinary words.
    # NOTE(review): whitespace on both sides is consumed, so the neighbouring
    # words are joined; substitute ' ' instead if the text is space-delimited.
    # NOTE(review): unit suffixes (a commented-out hint listed the CJK
    # characters for yuan and ton) are not part of digit_p.
    txt_clean = re.sub(r'\s*(?:digit_char)+\s*', '', txt_clean, flags=re.MULTILINE)
    return txt_clean


# Clean the training texts: drop numbers, then apply utils.clean_sentence and
# utils.replace_contr.  The helpers receive a one-element list, so x[0] is
# the single processed text.
X_train_clean = []
for x in tqdm(X_train, total=len(X_train)):
    x = replace_digit(x)
    x = utils.clean_sentence([x])
    x = utils.replace_contr(x, contractions)
    # Optional extra steps, use these if need to train "phrase":
    #   utils.split_document(x[0]) followed by utils.refine_document(x)
    X_train_clean.append(x[0])

# Same pipeline for the test texts.
X_test_clean = []
for x in tqdm(X_test, total=len(X_test)):
    x = replace_digit(x)
    x = utils.clean_sentence([x])
    x = utils.replace_contr(x, contractions)
    # Optional extra steps, use these if need to train "phrase":
    #   utils.split_document(x[0]) followed by utils.refine_document(x)
    X_test_clean.append(x[0])