# Input must be a 2D list (per the original note), hence xx is wrapped in an
# outer list before being passed to prepare_words with the contractions object.
xxx = utils.prepare_words([xx], contractions)
# Bare expression (notebook-style cell output): builds an array from xxx for
# inspection; the array itself is not stored.
np.array(xxx)


# In[ ]:





# In[13]:


# Run clean_sentence on xx first, then pass its result to replace_contr
# together with the contractions object.
xx_clean = utils.replace_contr(utils.clean_sentence(xx), contractions)
# Bare expression: displays xx_clean when run as a notebook cell.
xx_clean


# In[14]:


# Apply replace_contr directly to xx (no clean_sentence step in this cell);
# this rebinds xx_clean.
xx_clean = utils.replace_contr(xx, contractions)
# Bare expression: displays xx_clean when run as a notebook cell.
xx_clean


# In[15]:


# Call token_sens on xx_clean with sentence_size fixed at 30 and the
# word_to_idx lookup. Presumably this converts the text into index
# sequences -- TODO confirm against utils.token_sens.
x = utils.token_sens(
    xx_clean,
    sentence_size=30,
    word_to_idx=word_to_idx,
)
# Bare expression: shows the shape of the result when run as a notebook cell.
x.shape
Exemple #2
0
# ## preprocess

# In[4]:


# Load the pickled contractions object from the local model directory.
# NOTE: pickle.load can execute arbitrary code while deserializing; only use
# it on a file that comes from a trusted source.
with open('./model/contractions.pkl', 'rb') as contractions_file:
    contractions = pickle.load(contractions_file)


# In[5]:


# Pipeline: clean, replace contractions, split into sentences, tokenize - tokens found in the dict are kept; tokens not in the dict are replaced with <digit>

# Preview the first ten entries of neg_data after clean_sentence and
# replace_contr have been applied.
for sample_idx in range(10):
    # clean_sentence is given a one-element list; element 0 of the final
    # result is what gets printed.
    preview = utils.replace_contr(
        utils.clean_sentence([neg_data[sample_idx]]),
        contractions,
    )
    # repr() prints the raw form of the entry rather than its rendered text.
    print(repr(preview[0]))
    print('\n')
# In[6]:


# Build the cleaned positive set: each entry of pos_data is wrapped in a
# one-element list, passed through clean_sentence and replace_contr, and
# element 0 of the result is kept. tqdm reports progress over pos_data.
pos_data_clean = [
    utils.replace_contr(utils.clean_sentence([doc]), contractions)[0]
    for doc in tqdm(pos_data, total=len(pos_data))
]

neg_data_clean = []
Exemple #3
0
# In[7]:

# Load the pickled contractions object from the parent-level model directory.
# NOTE: pickle.load can execute arbitrary code while deserializing; only use
# it on a file that comes from a trusted source.
with open('../model/contractions.pkl', 'rb') as contractions_file:
    contractions = pickle.load(contractions_file)

# In[8]:

# Pipeline: clean, replace contractions, split into sentences, tokenize - tokens found in the dict are kept; tokens not in the dict are replaced with <digit>

# In[9]:

# Build the cleaned positive set: every entry of pos_data goes through
# clean_sentence and then replace_contr, and element 0 of the result is kept.
# Disabled optional steps (apply to the replace_contr result before taking
# element 0 if needed to train "phrase"):
#   x = utils.split_document(x[0])
#   x, _ = utils.refine_document(x)
pos_data_clean = [
    utils.replace_contr(utils.clean_sentence(doc), contractions)[0]
    for doc in tqdm(pos_data, total=len(pos_data))
]

# Build the cleaned negative set: every entry of neg_data goes through
# clean_sentence and then replace_contr, and element 0 of the result is kept.
# Disabled optional steps (apply to the replace_contr result before taking
# element 0 if needed to train "phrase"):
#   x = utils.split_document(x[0])
#   x, _ = utils.refine_document(x)
neg_data_clean = [
    utils.replace_contr(utils.clean_sentence(doc), contractions)[0]
    for doc in tqdm(neg_data, total=len(neg_data))
]

# Report how many cleaned entries each set holds.
print(len(pos_data_clean), len(neg_data_clean))
Exemple #4
0
    digit_p = r'[0-9]+[,]*[,]*[.]*[%]*[%]*'
    txt_clean = re.sub(digit_p, 'digit_char', txt, flags=re.MULTILINE)
    #txt_clean = re.sub(r'\s*[digit_char]+\s*', 'digit_char', txt_clean, flags=re.MULTILINE)
    txt_clean = re.sub(r'\s*[digit_char]+\s*',
                       '',
                       txt_clean,
                       flags=re.MULTILINE)  #[元,吨]*
    return txt_clean


# Build the cleaned training set. Each X_train[idx] is passed through
# replace_digit, wrapped in a one-element list for clean_sentence, then
# through replace_contr; element 0 of the result is kept.
# Index-based access is kept deliberately so lookups match X_train[idx].
# Disabled optional steps (apply to the replace_contr result before taking
# element 0 if needed to train "phrase"):
#   x = utils.split_document(x[0])
#   x, _ = utils.refine_document(x)
X_train_clean = [
    utils.replace_contr(
        utils.clean_sentence([replace_digit(X_train[idx])]),
        contractions,
    )[0]
    for idx in tqdm(range(len(X_train)), total=len(X_train))
]

# Build the cleaned test set. Each X_test[idx] is passed through
# replace_digit, wrapped in a one-element list for clean_sentence, then
# through replace_contr; element 0 of the result is kept.
# Index-based access is kept deliberately so lookups match X_test[idx].
# Disabled optional steps (apply to the replace_contr result before taking
# element 0 if needed to train "phrase"):
#   x = utils.split_document(x[0])
#   x, _ = utils.refine_document(x)
X_test_clean = [
    utils.replace_contr(
        utils.clean_sentence([replace_digit(X_test[idx])]),
        contractions,
    )[0]
    for idx in tqdm(range(len(X_test)), total=len(X_test))
]