# Scratch cells: try the text-preparation helpers on the single sample `xx`.

# prepare_words expects a 2D list, hence the sample is wrapped as [xx].
xxx = utils.prepare_words([xx], contractions)
np.array(xxx)  # bare expression: displayed as the cell output in a notebook


# In[ ]:


# In[13]:

# Clean the sample first, then run contraction replacement on the result.
xx_clean = utils.replace_contr(
    utils.clean_sentence(xx),
    contractions,
)
xx_clean


# In[14]:

# Same replacement on the raw sample (no clean_sentence step);
# this overwrites the xx_clean produced by the previous cell.
xx_clean = utils.replace_contr(xx, contractions)
xx_clean


# In[15]:

# Tokenise with sentence_size=30 using the word_to_idx mapping.
x = utils.token_sens(
    xx_clean,
    sentence_size=30,
    word_to_idx=word_to_idx,
)
x.shape
# In[7]:

# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so '../model/contractions.pkl' must only ever come from a trusted source.
with open('../model/contractions.pkl', 'rb') as f:
    contractions = pickle.load(f)


# In[8]:

# Planned steps: clean, replace contractions, split into sentences, tokenize
# (token in dict: keep it; token not in dict: replace with <digit>).


# In[9]:

def _clean_documents(documents, contractions):
    """Clean every document and apply contraction replacement.

    Each document is passed through ``utils.clean_sentence`` and then
    ``utils.replace_contr``; element ``[0]`` of that result is what gets
    collected.

    Args:
        documents: Sized iterable of raw documents (``len()`` is used for
            the progress-bar total).
        contractions: Contraction table forwarded to ``utils.replace_contr``.

    Returns:
        A list with one cleaned entry per input document, in input order.
    """
    cleaned = []
    for doc in tqdm(documents, total=len(documents)):
        doc = utils.clean_sentence(doc)
        doc = utils.replace_contr(doc, contractions)
        # To train on "phrases", additionally run here:
        #   doc = utils.split_document(doc[0])
        #   doc, _ = utils.refine_document(doc)
        cleaned.append(doc[0])
    return cleaned


# Positive and negative sets go through the exact same cleaning steps.
pos_data_clean = _clean_documents(pos_data, contractions)
neg_data_clean = _clean_documents(neg_data, contractions)

print(len(pos_data_clean), len(neg_data_clean))