def test_process_thai_sparse(self):
    """process_thai with the default (sparse) rules: repeated characters and
    emoji collapse into xx* marker tokens and whitespace tokens are dropped."""
    source = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146"
    # Pipeline walkthrough (default sparse rules):
    #   pre_rules_th_sparse ->
    #       "👍👍👍 # Ana มาก xxrep น้้อยน้อย .1146"
    #   word_tokenize(engine="newmm") ->
    #       ["👍👍👍", " ", "#", " ", "Ana", " ", "มาก", "xxrep",
    #        " ", "น้อย", "น้อย", " ", ".", "1146"]
    #   post_rules_th drops whitespace tokens and marks repeats ->
    #       ["xxwrep", "👍", "#", "ana", "มาก",
    #        "xxrep", "xxwrep", "น้อย", ".", "1146"]
    expected_tokens = [
        "xxwrep",
        "👍",
        "#",
        "ana",
        "มาก",
        "xxrep",
        "xxwrep",
        "น้อย",
        ".",
        "1146",
    ]
    self.assertEqual(process_thai(source), expected_tokens)
def predictbyid():
    """Flask view: predict from all tweets of a given user ID.

    GET renders the input form (predictById.html). POST reads the user ID
    from the form, fetches that user's tweets, builds TF-IDF plus scaled
    word-count features, and renders the prediction results.
    """
    if request.method == 'POST':
        # User ID typed into the form's text box.
        user = request.form['userID']
        # Tweets for that user; expected to be a DataFrame with a
        # "Tweets" column (empty when the user has no tweets).
        posts = getTweetFromUser(user)
        if posts.shape[0] != 0:
            # Tokenize each tweet and join the tokens with "|" so the
            # counts below fall out of a simple split.
            posts["processed"] = posts.Tweets.map(
                lambda tweet: "|".join(process_thai(tweet)))
            # Word count and unique-word count per tweet.
            posts["wc"] = posts.processed.map(
                lambda joined: len(joined.split("|")))
            posts["uwc"] = posts.processed.map(
                lambda joined: len(set(joined.split("|"))))
            # TF-IDF features from the raw tweets, concatenated with the
            # scaled numeric count features.
            tf_input = tfidf_fit.transform(posts["Tweets"])
            num_input = scaler_fit.transform(
                posts[["wc", "uwc"]].astype(float))
            t_input = np.concatenate(
                [num_input, tf_input.toarray()], axis=1)
            # Predict and hand the (list-like) result to the template.
            result = predict(t_input, posts)
            return render_template('output.html',
                                   output_result=result,
                                   length=len(result))
        else:
            # No tweets found for this user ID.
            return render_template('output.html', length=0)
    return render_template('predictById.html')
def test_process_thai_2(self):
    """process_thai with the dense-feature rules (pre_rules_th /
    post_rules_th): whitespace tokens are kept and repeats get counts."""
    source = "👍👍👍 #AnA มากกกก น้อยน้อย ().1146"
    tokens = process_thai(
        source,
        pre_rules=pre_rules_th,
        post_rules=post_rules_th,
        tok_func=_pythainlp_tokenizer.word_tokenize,
    )
    # Pipeline walkthrough:
    #   pre_rules_th ->
    #       "👍👍👍 # Ana มากxxrep4 น้้อยน้อย .1146"
    #   word_tokenize(engine="newmm") ->
    #       ["👍👍👍", " ", "#", "Ana", " ", "มาก", "xxrep", "4",
    #        " ", "น้อย", "น้อย", " ", ".", "1146"]
    #   post_rules_th runs replace_wrep_post before ungroup_emoji, so the
    #   three repeated emoji are NOT collapsed into an "xxwrep num" token.
    expect = [
        "👍", "👍", "👍", " ", "#", " ", "ana",
        " ", "มาก", "xxrep", "4", " ", "xxwrep",
        "1", "น้อย", " ", ".", "1146",
    ]
    self.assertEqual(tokens, expect)
def predictbysentence():
    """Flask view: classify the sentiment of a single submitted sentence.

    GET renders the input form (predictBySentence.html). POST cleans the
    submitted text, builds the same TF-IDF plus scaled word-count features
    used by predictbyid, and renders the predicted sentiment label
    ("Negative" / "Positive" / "Neutral").
    """
    result = ""
    if request.method == 'POST':
        # Sentence typed into the form's text box.
        text = request.form['texts']
        # Clean the raw text before feature extraction.
        texts = cleanText(text)
        # Single-row DataFrame so the feature pipeline matches predictbyid.
        posts = pd.DataFrame({"texts": [texts]})
        if posts.shape[0] != 0:
            # Tokenize and join with "|" so counts fall out of a split.
            posts["processed"] = posts.texts.map(
                lambda x: "|".join(process_thai(x)))
            # Word count and unique-word count.
            posts["wc"] = posts.processed.map(lambda x: len(x.split("|")))
            posts["uwc"] = posts.processed.map(
                lambda x: len(set(x.split("|"))))
            tf_input = tfidf_fit.transform(posts["texts"])
            num_input = scaler_fit.transform(
                posts[["wc", "uwc"]].astype(float))
            t_input = np.concatenate(
                [num_input, tf_input.toarray()], axis=1)
            # Per-class probabilities; kept for parity with the original
            # code although they are not currently passed to the template.
            output_pd = pd.DataFrame(model.predict_proba(t_input))
            output_pd.columns = model.classes_
            # FIX: model.predict returns an array, and `output == "neg"`
            # compares elementwise; truthiness of the resulting array is
            # fragile (raises ValueError for anything but one element).
            # Index the single prediction explicitly instead.
            label = model.predict(t_input)[0]
            # Map the model's class code to a display label; unknown
            # classes fall back to the original default of "".
            result = {
                "neg": "Negative",
                "pos": "Positive",
                "neu": "Neutral",
            }.get(label, "")
            return render_template('outputBySentence.html',
                                   result=result, text=texts)
        else:
            # Unreachable with the single-row frame built above, but kept
            # for symmetry with predictbyid.
            return render_template('outputBySentence.html', length=0)
    return render_template('predictBySentence.html')