def sentenceTokenize(inputSentence):
    # Tokenize
    tokenized = word_tokenize(inputSentence)
    newTokenize = []
    for w in tokenized:
        newTokenize += word_tokenize(w, engine='newmm')
    return " ".join(newTokenize)
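# Usage sketch (added for illustration, not part of the original snippet):
# assumes PyThaiNLP is installed; the expected output is indicative, based on
# the newmm segmentations shown in the tests elsewhere in this section.
from pythainlp.tokenize import word_tokenize

print(sentenceTokenize("ฉันรักภาษาไทย"))  # -> "ฉัน รัก ภาษาไทย"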
def time_question_features(self, text):
    """
    Provide an analysis of significant features in the string.
    """
    features = {}

    # A list of all words from the known sentences
    all_words = " ".join(self.positive + self.negative).split()

    # A list of the first word in each of the known sentences
    all_first_words = []
    for sentence in self.positive + self.negative:
        all_first_words.append(sentence.split(' ', 1)[0])

    for word in word_tokenize(text):
        features['first_word({})'.format(word)] = (word in all_first_words)

    for word in word_tokenize(text):
        features['contains({})'.format(word)] = (word in all_words)

    for letter in 'abcdefghijklmnopqrstuvwxyzกขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรลวศษสหฬอฮุูึๆไำะัํี๊ฯโเ้็่๋า.แิื์ใๅ':
        features['count({})'.format(letter)] = text.lower().count(letter)
        features['has({})'.format(letter)] = (letter in text.lower())

    return features
def token_en2words(set_index, txt_test_data, engine='newmm'):
    # Round-trip through JSON to normalize the input
    txt_test_data = json.dumps(txt_test_data)
    txt_test_data = json.loads(txt_test_data)
    temp = 0
    save_token = []
    for idx in range(len(txt_test_data)):
        # Tokenize entities
        if idx in set_index:
            e_token = txt_test_data[temp:idx]
            # Skip empty text
            if len(e_token) > 0:
                words_tokenize = []
                if engine in ['newmm', 'deepcut', 'attacut']:
                    words_tokenize = word_tokenize(e_token, engine=engine)
                elif engine == 'bpe':
                    pass
                else:
                    raise ValueError('Tokenizer mismatch')
                words_tokenize = space_tokenizer(words_tokenize)
                save_token.extend(words_tokenize)
            temp = idx
    words_tokenize = word_tokenize(txt_test_data[temp:], engine=engine)
    words_tokenize = space_tokenizer(words_tokenize)
    save_token.extend(words_tokenize)
    return save_token
def test_word_tokenize_newmm_longtext(self):
    self.assertIsInstance(
        word_tokenize(self.long_text, engine="newmm"), list
    )
    self.assertIsInstance(
        word_tokenize(self.long_text, engine="newmm-safe"), list
    )
def token_en2words(set_index, txt_test_data, engine='newmm'):
    # Round-trip through JSON to normalize the input
    txt_test_data = json.dumps(txt_test_data)
    txt_test_data = json.loads(txt_test_data)
    temp = 0
    save_token = []
    for idx in range(len(txt_test_data)):
        # Tokenize entities
        if idx in set_index:
            e_token = txt_test_data[temp:idx]
            # Skip empty text
            if len(e_token) > 0:
                # Tokenize each sentence
                if engine in ['newmm', 'longest', 'deepcut', 'icu', 'ulmfit', 'attacut']:
                    words_tokenize = word_tokenize(e_token, engine=engine)
                else:
                    # raising a bare string is invalid in Python 3
                    raise ValueError('Tokenizer mismatch')
                save_token.extend(words_tokenize)
            temp = idx
    save_token.extend(word_tokenize(txt_test_data[temp:], engine=engine))
    return save_token
def tokenize(start_index, end_index, open_tsv='thairath1.tsv', write_tsv='tokenized1.tsv'):
    """
    Tokenize headline (line[1]) and article (line[-1]).
    """
    # Build an id list for duplicate checking
    file = open(write_tsv, 'r', encoding='utf-8')
    lines = list(csv.reader(file, delimiter='\t'))
    id_list = [line[0] for line in lines]
    file.close()

    open_file = open(open_tsv, 'r', encoding='utf-8')
    write_file = open(write_tsv, 'a', encoding='utf-8')  # append mode
    lines = list(csv.reader(open_file, delimiter='\t'))
    writer = csv.writer(write_file, lineterminator='\n', delimiter='\t')
    for line in lines[start_index:end_index]:
        if line[0] not in id_list:
            headline = [line[0], '\t'.join(word_tokenize(line[1]))]
            writer.writerow(headline)
            article = [line[0], '\t'.join(word_tokenize(line[-1]))]
            writer.writerow(article)
    open_file.close()
    write_file.close()
def can_process(self, statement):
    # Tokenize once instead of once per keyword
    tokens = word_tokenize(statement.text)
    return any(keyword in tokens for keyword in ('พยากรณ์', 'พยากรณ์อากาศ', 'อากาศ'))
def test_word_tokenize_mm(self):
    self.assertEqual(multi_cut.segment(None), [])
    self.assertEqual(multi_cut.segment(""), [])
    self.assertEqual(word_tokenize("", engine="mm"), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )
    self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS"))
def test_deepcut(self):
    self.assertEqual(deepcut.segment(None), [])
    self.assertEqual(deepcut.segment(""), [])
    self.assertIsNotNone(deepcut.segment("ทดสอบ", DEFAULT_WORD_DICT_TRIE))
    self.assertIsNotNone(deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
    self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
    self.assertIsNotNone(
        word_tokenize("ทดสอบ", engine="deepcut", custom_dict=DEFAULT_WORD_DICT_TRIE)
    )
def main():
    try:
        with open('assets/type_1_refactor_naming_elements/original.txt',
                  'r', encoding='utf8') as original:
            original_contents = original.readlines()
            original_contents = [
                ''.join(content.strip().split('|'))
                for content in original_contents
            ]
        with open('assets/type_1_refactor_naming_elements/naming_list.txt',
                  'r', encoding='utf8') as source:
            contents = source.readlines()
            contents = [content.strip() for content in contents]
        tokenized_contents = [
            list(word_tokenize(content)) for content in contents
        ]
        original_tokenized_contents = [
            list(word_tokenize(content)) for content in original_contents
        ]
        with open('assets/type_1_refactor_naming_elements/cosine_values.txt',
                  'w', encoding='utf8') as result:
            for sentence in tokenized_contents:
                for original_sentence in original_tokenized_contents:
                    vector1 = count(sentence)
                    vector2 = count(original_sentence)
                    cosine = get_cosine(vector1, vector2)
                    if cosine >= 0.3:
                        result.write('"' + ''.join(sentence) + '" COMPARED TO "'
                                     + ''.join(original_sentence) + '" = ' + str(cosine))
                        result.write('\n')
        with open('assets/type_1_refactor_naming_elements/cosine_values_only_value.txt',
                  'w', encoding='utf8') as result:
            for sentence in tokenized_contents:
                for original_sentence in original_tokenized_contents:
                    vector1 = count(sentence)
                    vector2 = count(original_sentence)
                    cosine = get_cosine(vector1, vector2)
                    if cosine >= 0.3:
                        result.write(str(cosine))
                        result.write('\n')
    except Exception as e:
        print(e)
def json_example():
    req_data = request.get_json()
    message = req_data['message']
    # Strip special characters (note the escaped backslash in the set)
    removeSpecialChars = message.translate(
        {ord(c): "" for c in "!@#$%^&*()[]{};:,./<>?\\|`~-=_+"}
    )
    classifier = initialize()
    tokenize = word_tokenize(removeSpecialChars)
    label = classifier.classify(extract_features(tokenize))
    return jsonify({'sentiment_label': label, 'word_tokenize': tokenize})
def test_tag(self):
    self.assertEqual(
        pos_tag(word_tokenize("คุณกำลังประชุม"), engine='old'),
        [('คุณ', 'PPRS'), ('กำลัง', 'XVBM'), ('ประชุม', 'VACT')])
    self.assertEqual(
        pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
        [[('ผม', 'PPRS'), ('กิน', 'VACT'), ('ข้าว', 'NCMN')],
         [('แมว', 'NCMN'), ('วิ่ง', 'VACT')]])
    if sys.version_info >= (3, 4):
        self.assertEqual(
            str(type(pos_tag(word_tokenize("ผมรักคุณ"), engine='artagger'))),
            "<class 'list'>")
def test_word_tokenize_mm(self):
    self.assertEqual(multi_cut.segment(None), [])
    self.assertEqual(multi_cut.segment(""), [])
    self.assertEqual(word_tokenize("", engine="mm"), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="mm"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )
    self.assertIsNotNone(multi_cut.mmcut("ทดสอบ"))
    self.assertIsNotNone(multi_cut.find_all_segment("รถไฟฟ้ากรุงเทพมหานครBTS"))
    self.assertEqual(multi_cut.find_all_segment(None), [])
def test_segment_newmm(self):
    self.assertEqual(
        word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย', engine='newmm'),
        [u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
    self.assertEqual(
        word_tokenize('สวัสดีครับ สบายดีไหมครับ', engine='newmm'),
        [u'สวัสดี', u'ครับ', u' ', u'สบายดี', u'ไหม', u'ครับ'])
    self.assertEqual(
        word_tokenize('จุ๋มง่วงนอนยัง', engine='newmm'),
        [u'จุ๋ม', u'ง่วงนอน', u'ยัง'])
    self.assertEqual(
        word_tokenize('จุ๋มง่วง', engine='newmm'),
        [u'จุ๋ม', u'ง่วง'])
    self.assertEqual(
        word_tokenize('จุ๋ม ง่วง', engine='newmm', whitespaces=False),
        [u'จุ๋ม', u'ง่วง'])
def initialize():
    # Open example positive, negative, and neutral reviews
    pos_reviews_file = codecs.open('pos.txt', 'r', "utf-8")
    neg_reviews_file = codecs.open('neg.txt', 'r', "utf-8")
    neu_reviews_file = codecs.open('neu.txt', 'r', "utf-8")

    # Store positive reviews in a list
    pos_reviews = []
    for each_review in pos_reviews_file:
        each_review = ' '.join(word_tokenize(each_review))
        if each_review.endswith('\n'):
            each_review = each_review[:-1]
        if not each_review == '':
            pos_reviews.append([each_review, 'pos'])

    # Store negative reviews in a list
    neg_reviews = []
    for each_review in neg_reviews_file:
        each_review = ' '.join(word_tokenize(each_review))
        if each_review.endswith('\n'):
            each_review = each_review[:-1]
        if not each_review == '':
            neg_reviews.append([each_review, 'neg'])

    # Store neutral reviews in a list
    neu_reviews = []
    for each_review in neu_reviews_file:
        each_review = ' '.join(word_tokenize(each_review))
        if each_review.endswith('\n'):
            each_review = each_review[:-1]
        if not each_review == '':
            neu_reviews.append([each_review, 'neu'])

    # Remove words whose length is < 3 and combine the lists
    all_reviews = []
    for (review, sentiment) in pos_reviews + neg_reviews + neu_reviews:
        reviews_filtered = [
            w.lower() for w in word_tokenize(review) if len(w) >= 3
        ]
        all_reviews.append((reviews_filtered, sentiment))

    # Get the feature set
    global review_features
    review_features = get_word_features(get_words_in_reviews(all_reviews))
    # review_features = remove_punctuation(review_features)

    # Get the training set and train the classifier
    training_set = nltk.classify.apply_features(extract_features, all_reviews)
    classifier = nltk.NaiveBayesClassifier.train(training_set)
    return classifier
def process_one_pantip(text_list, min_seq_length=5, max_seq_length=300, sep_func=sent_tokenize):
    word_counts = []
    texts = []
    for text in text_list:
        text = text.strip()
        word_count = len(word_tokenize(text))
        if word_count > max_seq_length:
            # Split long texts into sentences and process each piece
            sub_text = [process_transformers(i) for i in sep_func(text)]
            sub_word_count = [len(word_tokenize(i)) for i in sub_text]
            texts += sub_text
            word_counts += sub_word_count
        else:
            texts.append(process_transformers(text))
            word_counts.append(word_count)
    return pd.DataFrame({"text": texts, "wc": word_counts})
def pos_tag_api():
    sent = request.args.get('sent', 0, type=str)
    txt = ""
    for i in sent.split('<br>'):
        txt += " ".join("%s/%s" % tup for tup in pos_tag(word_tokenize(i))) + "<br>"
    return jsonify(result=txt)
def text2conll2002(text):
    text = text.replace(' ', '<space>')
    text = text.replace("''", '"')
    text = text.replace("’", '"').replace("‘", '"')
    tag = tokenizer.tokenize(text)
    j = 0
    conll2002 = ""
    for tagopen, text, tagclose in tag:
        word_cut = word_tokenize(text)
        i = 0
        while i < len(word_cut):
            if word_cut[i] == "''" or word_cut[i] == '"':
                pass
            elif i == 0 and tagopen != 'word':
                conll2002 += word_cut[i]
                # conll2002 += '\t' + pos_tag2[j][1]
                conll2002 += '\t' + 'B-' + 'NP'  # tagopen
            elif tagopen != 'word':
                conll2002 += word_cut[i]
                # conll2002 += '\t' + pos_tag2[j][1]
                conll2002 += '\t' + 'I-' + 'NP'  # tagopen
            else:
                conll2002 += word_cut[i]
                # conll2002 += '\t' + pos_tag2[j][1]
                conll2002 += '\t' + 'O'
            conll2002 += '\n'
            # j += 1
            i += 1
    return postag(conll2002)
def test_word_tokenize_icu(self):
    self.assertEqual(tokenize_pyicu.segment(None), [])
    self.assertEqual(tokenize_pyicu.segment(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
        ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
    )
def test_word_tokenize_deepcut(self):
    self.assertEqual(tokenize_deepcut.segment(None), [])
    self.assertEqual(tokenize_deepcut.segment(""), [])
    self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", DEFAULT_DICT_TRIE))
    self.assertIsNotNone(tokenize_deepcut.segment("ทดสอบ", ["ทด", "สอบ"]))
    self.assertIsNotNone(word_tokenize("ทดสอบ", engine="deepcut"))
def getTokensesFromPandas(dataSample, columnName):
    tokenses = []
    for index, row in dataSample.iterrows():
        tokens = word_tokenize(row[columnName])
        tokenses.append(tokens)
    return tokenses
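# Usage sketch (added for illustration): assumes pandas and PyThaiNLP are
# installed; the DataFrame and its "text" column are hypothetical.
import pandas as pd

df = pd.DataFrame({"text": ["ฉันรักภาษาไทย"]})
print(getTokensesFromPandas(df, "text"))  # -> [['ฉัน', 'รัก', 'ภาษาไทย']]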
def test_word_tokenize_attacut(self):
    self.assertEqual(attacut.segment(None), [])
    self.assertEqual(attacut.segment(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="attacut"),
        ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
    )
def romanization(data, engine='royin'):
    """
    :param str data: Thai text to be romanized
    :param str engine: choose among 'royin', 'pyicu', and 'thai2rom'.
        'royin' romanizes according to the standard of the Thai Royal Institute.
        'pyicu' romanizes according to the International Phonetic Alphabet.
        'thai2rom' is a deep-learning Thai romanization engine.
    :return: English (more or less) text that spells out how the Thai text should read.
    """
    listword = []
    if engine == 'royin':
        from .royin import romanization
    elif engine == 'pyicu':
        from .pyicu import romanization
    elif engine == 'thai2rom':
        from pythainlp.romanization.thai2rom import thai2rom
        thai = thai2rom()
        return thai.romanization(data)
    else:
        raise Exception("No such engine.")
    try:
        word_list = word_tokenize(data)
        for word in word_list:
            listword.append(romanization(word))
    except Exception:
        listword = [romanization(data)]
    return ''.join(listword)
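# Usage sketch (added for illustration): the function above uses relative
# imports, so it is shown doctest-style; the output follows the Royal
# Institute ('royin') scheme and is indicative only.
# >>> romanization("ภาษาไทย", engine='royin')
# 'phasathai'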
def getVector(sentence):
    words = word_tokenize(sentence)
    # words = word_tokenize(sentence, engine='icu')

    # Collect vectors for in-vocabulary words only
    vectors = []
    for w in words:
        if w in w2vModel.wv:
            vectors.append(w2vModel.wv[w])
    if len(vectors) == 0:
        return []

    # Average the word vectors to get a sentence vector
    npArray = np.array(vectors)
    avg = np.mean(npArray, axis=0)
    return avg
def createBOW(ls_txt, corpus):
    # Extend the default dictionary with a few domain-specific words
    custom_dict = set(thai_words())
    word = ['ราเมง', 'อิเกีย', 'คาปูชิโน่', 'น้ำมัน', 'หอยลาย', 'ปุ้มปุ้ย']
    for i in word:
        custom_dict.add(i)
    trie = dict_trie(dict_source=custom_dict)

    BOW_t = [list() for i in range(len(ls_txt))]
    l = 0
    for i in ls_txt:
        tmp = word_tokenize(i, engine='dict', custom_dict=trie)
        for j in corpus:
            if j in tmp:
                BOW_t[l].append(tmp.count(j))
                tmp.remove(j)
            else:
                BOW_t[l].append(0)
        # Count any leftover tokens as an 'Other' feature
        if len(tmp) != 0:
            BOW_t[l].append(len(tmp))
        else:
            BOW_t[l].append(0)
        l += 1
    return list(BOW_t)
def prepro(txt, wanto):
    cut_cum = word_tokenize(txt)
    # Drop URL fragments and other Twitter noise, then keep wanted tokens
    ff = list(
        filter(
            lambda x: x not in ("http", "https", ":", " ", '://', 't', '.', 'co', 'RT', '\n', '...'),
            cut_cum))
    return list(filter(lambda x: x in wanto, ff))
def receive_message():
    if request.method == 'GET':
        # Before allowing people to message your bot, Facebook has implemented
        # a verify token that confirms all requests that your bot receives
        # came from Facebook.
        token_sent = request.args.get("hub.verify_token")
        return verify_fb_token(token_sent)
    # If the request was not GET, it must be POST, and we can proceed with
    # sending a message back to the user.
    else:
        # Get whatever message a user sent the bot
        output = request.get_json()
        for event in output['entry']:
            print(event)
            messaging = event['messaging']
            for message in messaging:
                if message.get('message'):
                    # Facebook Messenger ID of the user, so we know where to
                    # send the response
                    recipient_id = message['sender']['id']
                    if message['message'].get('text'):
                        msg_input = message['message'].get('text')
                        dict_count_thai = isthai(msg_input)
                        response_sent_text = get_message()
                        if dict_count_thai['thai'] > 0:
                            list_tokenized = word_tokenize(msg_input, engine='newmm')
                            response_sent_text = ' '.join(list_tokenized)
                        send_message(recipient_id, response_sent_text)
                        # send_message("1834191463278166", response_sent_text)
                    # If the user sends a GIF, photo, video, or any other
                    # non-text item
                    if message['message'].get('attachments'):
                        response_sent_nontext = get_message()
                        send_message(recipient_id, response_sent_nontext)
    return "Message Processed"
def sentence_vectorizer(text: str, use_mean: bool = True):
    """
    Get a sentence vector from text.
    Words not in the vocabulary are skipped.

    :param string text: text input
    :param boolean use_mean: if `True`, use the mean of all word vectors;
        otherwise use the summation
    :return: sentence vector of the given input text
    """
    words = word_tokenize(text, engine="ulmfit")
    vec = np.zeros((1, WV_DIM))
    for word in words:
        if word == " ":
            word = "xxspace"
        elif word == "\n":
            word = "xxeol"
        if word in _MODEL.wv.index2word:
            vec += _MODEL.wv.word_vec(word)
        else:
            pass
    if use_mean:
        vec /= len(words)
    return vec
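# Usage sketch (added for illustration): assumes the module-level `_MODEL`
# word vectors and `WV_DIM` used above are loaded (e.g. a thai2vec model with
# WV_DIM = 300, which is an assumption); shown doctest-style.
# >>> sentence_vectorizer("ฉันรักภาษาไทย").shape
# (1, 300)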
def tokenize(self, text):
    """
    :meth: tokenize text with the selected engine
    :param str text: text to tokenize
    :return: tokenized text
    """
    return [t for t in word_tokenize(self.sub_br(text), engine=self.engine)]
def test_icu(self):
    self.assertEqual(pyicu.segment(None), [])
    self.assertEqual(pyicu.segment(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="icu"),
        ["ฉัน", "รัก", "ภาษา", "ไทย", "เพราะ", "ฉัน", "เป็น", "คน", "ไทย"],
    )
def text2conll2002(text, pos=True):
    """
    Convert text into CoNLL-2002 format.
    """
    text = toolner_to_tag(text)
    text = text.replace("''", '"')
    text = text.replace("’", '"').replace("‘", '"')  # .replace('"', "")
    tag = tokenizer.tokenize(text)
    j = 0
    conll2002 = ""
    for tagopen, text, tagclose in tag:
        word_cut = word_tokenize(text, engine=thaicut)  # use the newmm word tokenizer
        i = 0
        txt5 = ""
        while i < len(word_cut):
            if word_cut[i] == "''" or word_cut[i] == '"':
                pass
            elif i == 0 and tagopen != 'word':
                txt5 += word_cut[i]
                txt5 += '\t' + 'B-' + tagopen
            elif tagopen != 'word':
                txt5 += word_cut[i]
                txt5 += '\t' + 'I-' + tagopen
            else:
                txt5 += word_cut[i]
                txt5 += '\t' + 'O'
            txt5 += '\n'
            # j += 1
            i += 1
        conll2002 += txt5
    if not pos:
        return conll2002
    return postag(conll2002)
def split_word(text):
    th_stop = tuple(thai_stopwords())
    en_stop = tuple(get_stop_words('en'))
    p_stemmer = PorterStemmer()

    tokens = word_tokenize(text, engine='newmm')

    # Remove Thai and English stop words
    tokens = [i for i in tokens if not i in th_stop and not i in en_stop]

    # Stem English words
    tokens = [p_stemmer.stem(i) for i in tokens]

    # Map Thai words to the first lemma of their WordNet synset, if any
    tokens_temp = []
    for i in tokens:
        w_syn = wordnet.synsets(i)
        if (len(w_syn) > 0) and (len(w_syn[0].lemma_names('tha')) > 0):
            tokens_temp.append(w_syn[0].lemma_names('tha')[0])
        else:
            tokens_temp.append(i)
    tokens = tokens_temp

    # Remove numbers
    tokens = [i for i in tokens if not i.isnumeric()]

    # Remove tokens containing spaces
    tokens = [i for i in tokens if not ' ' in i]

    return tokens
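# Usage sketch (added for illustration): assumes the imports the function
# relies on (pythainlp's word_tokenize, thai_stopwords, and wordnet, NLTK's
# PorterStemmer, stop_words' get_stop_words); shown doctest-style with an
# indicative, not verified, result.
# >>> split_word("ฉันรักภาษาไทย 123")
# ['รัก', 'ภาษาไทย']   # stop words, numbers, and spaced tokens removed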
def get_ner(self, text, postag=True):
    """
    Get NER from Thai NER.

    :param string text: Thai text
    :param boolean postag: include POS tags (True) or not (False)
    :return: list of NER tuples

    **Example**::
        >>> from pythainlp.ner import thainer
        >>> ner = thainer()
        >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
        [('วันที่', 'JSBR', 'O'), (' ', 'NCMN', 'O'), ('15', 'NCNM', 'B-DATE'),
         (' ', 'NCMN', 'I-DATE'), ('ก.ย.', 'CMTR', 'I-DATE'),
         (' ', 'NCMN', 'I-DATE'), ('61', 'NCNM', 'I-DATE'), (' ', 'NCMN', 'O'),
         ('ทดสอบ', 'VACT', 'O'), ('ระบบ', 'NCMN', 'O'), ('เวลา', 'NCMN', 'O'),
         (' ', 'NCMN', 'O'), ('14', 'NCNM', 'B-TIME'), (':', 'PUNC', 'I-TIME'),
         ('49', 'NCNM', 'I-TIME'), (' ', 'NCMN', 'I-TIME'), ('น.', 'CMTR', 'I-TIME')]
        >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", postag=False)
        [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
         ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
         ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'),
         ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'),
         ('น.', 'I-TIME')]
    """
    self.word_cut = word_tokenize(text, engine=thaicut)
    self.list_word = pos_tag(self.word_cut, engine='perceptron')
    self.X_test = self.extract_features(
        [(data, self.list_word[i][1]) for i, data in enumerate(self.word_cut)])
    self.y_ = self.crf.predict_single(self.X_test)
    if postag:
        return [(self.word_cut[i], self.list_word[i][1], data)
                for i, data in enumerate(self.y_)]
    return [(self.word_cut[i], data) for i, data in enumerate(self.y_)]
def process_nlp_prediction(request):
    text = request.POST.get("text")
    # Strip HTML tags, hashes, ellipses, and punctuation; collapse whitespace
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"…", "", text)
    for c in string.punctuation:
        text = re.sub(r"\{}".format(c), "", text)
    text = " ".join(text.split())

    language = detect(text)
    if language == "th":
        vocabulary = pickle.load(open("././nlp-vocabulary.pkl", "rb"))
        NLP_model = pickle.load(open("././nlp-model.pkl", "rb"))
        featurized_test_sentence = {
            i: (i in word_tokenize(text.lower())) for i in vocabulary
        }
        response = {
            "test_sent": text,
            "result": NLP_model.classify(featurized_test_sentence),
        }
    else:
        response = {
            "test_sent": text,
            "result": "Sorry!! This language is not supported, please send a message in Thai.",
        }
    return JsonResponse(response)
def make_doc(self, text):
    try:
        from pythainlp.tokenize import word_tokenize
    except ImportError:
        raise ImportError(
            "The Thai tokenizer requires the PyThaiNLP library: "
            "https://github.com/wannaphongcom/pythainlp/")
    words = [x for x in list(word_tokenize(text, "newmm"))]
    return Doc(self.vocab, words=words, spaces=[False] * len(words))
def test_word_tokenize_longest(self):
    self.assertEqual(longest.segment(None), [])
    self.assertEqual(longest.segment(""), [])
    self.assertIsNotNone(longest.segment("กรุงเทพฯมากๆเพราโพาง BKKฯ"))
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="longest"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )
def sentence_vectorizer(ss, dim=300, use_mean=False):
    s = word_tokenize(ss)
    vec = np.zeros((1, dim))
    for word in s:
        if word in get_model().wv.index2word:
            vec += get_model().wv.word_vec(word)
        else:
            pass
    if use_mean:
        vec /= len(s)
    return vec
def test_pos_tag(self):
    tokens = ["ผม", "รัก", "คุณ"]

    self.assertEqual(pos_tag(None), [])
    self.assertEqual(pos_tag([]), [])

    self.assertEqual(unigram.tag(None, corpus="pud"), [])
    self.assertEqual(unigram.tag([], corpus="pud"), [])
    self.assertEqual(unigram.tag(None, corpus="orchid"), [])
    self.assertEqual(unigram.tag([], corpus="orchid"), [])

    self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="orchid"))
    self.assertIsNotNone(pos_tag(tokens, engine="unigram", corpus="pud"))
    self.assertIsNotNone(pos_tag([""], engine="unigram", corpus="pud"))
    self.assertEqual(
        pos_tag(word_tokenize("คุณกำลังประชุม"), engine="unigram"),
        [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
    )

    self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="orchid"))
    self.assertIsNotNone(pos_tag(tokens, engine="perceptron", corpus="pud"))
    self.assertEqual(perceptron.tag(None, corpus="pud"), [])
    self.assertEqual(perceptron.tag([], corpus="pud"), [])
    self.assertEqual(perceptron.tag(None, corpus="orchid"), [])
    self.assertEqual(perceptron.tag([], corpus="orchid"), [])

    self.assertIsNotNone(pos_tag(None, engine="artagger"))
    self.assertIsNotNone(pos_tag([], engine="artagger"))
    self.assertIsNotNone(pos_tag(tokens, engine="artagger"))
    self.assertEqual(
        pos_tag(word_tokenize("คุณกำลังประชุม"), engine="artagger"),
        [("คุณ", "PPRS"), ("กำลัง", "XVBM"), ("ประชุม", "VACT")],
    )

    self.assertEqual(pos_tag_sents(None), [])
    self.assertEqual(pos_tag_sents([]), [])
    self.assertEqual(
        pos_tag_sents([["ผม", "กิน", "ข้าว"], ["แมว", "วิ่ง"]]),
        [
            [("ผม", "PPRS"), ("กิน", "VACT"), ("ข้าว", "NCMN")],
            [("แมว", "NCMN"), ("วิ่ง", "VACT")],
        ],
    )
def get_sentiment(ss, return_score=False):
    s = word_tokenize(ss)
    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cpu()
    t = Variable(t, volatile=False)
    m.reset()
    pred, *_ = m(t)
    result = pred.data.cpu().numpy().reshape(-1)
    if return_score:
        return softmax(result)
    else:
        return np.argmax(result)
def summarize(self, text, n, tokenize):
    sents = sent_tokenize(text)
    word_sent = [word_tokenize(s, tokenize) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
def test_word_tokenize_newmm(self):
    self.assertEqual(newmm.segment(None), [])
    self.assertEqual(newmm.segment(""), [])
    self.assertEqual(
        word_tokenize("ฉันรักภาษาไทยเพราะฉันเป็นคนไทย", engine="newmm"),
        ["ฉัน", "รัก", "ภาษาไทย", "เพราะ", "ฉัน", "เป็น", "คนไทย"],
    )
    self.assertEqual(
        word_tokenize(
            "สวัสดีครับ สบายดีไหมครับ", engine="newmm", keep_whitespace=True
        ),
        ["สวัสดี", "ครับ", " ", "สบายดี", "ไหม", "ครับ"],
    )
    self.assertEqual(
        word_tokenize("จุ๋มง่วงนอนยัง", engine="newmm"), ["จุ๋ม", "ง่วงนอน", "ยัง"]
    )
    self.assertEqual(word_tokenize("จุ๋มง่วง", engine="newmm"), ["จุ๋ม", "ง่วง"])
    self.assertEqual(
        word_tokenize("จุ๋ม ง่วง", engine="newmm", keep_whitespace=False),
        ["จุ๋ม", "ง่วง"],
    )
def summarize(self, text: str, n: int, tokenizer: str = "newmm") -> List[str]:
    sents = sent_tokenize(text)
    word_tokenized_sents = [word_tokenize(sent, engine=tokenizer) for sent in sents]
    self.__freq = self.__compute_frequencies(word_tokenized_sents)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_tokenized_sents):
        for w in sent:
            if w in self.__freq:
                ranking[i] += self.__freq[w]
    summaries_idx = self.__rank(ranking, n)
    return [sents[j] for j in summaries_idx]
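# Usage sketch (added for illustration): the enclosing class is not shown, so
# the name `FrequencySummarizer` here is hypothetical; `n` selects how many
# top-ranked sentences to return.
# >>> summarizer = FrequencySummarizer()
# >>> summarizer.summarize(some_long_thai_text, n=2, tokenizer="newmm")
# [sentence_a, sentence_b]  # the two highest-scoring sentences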
def sentiment(text):
    """
    Thai sentiment analysis, using data from
    https://github.com/wannaphongcom/lexicon-thai/tree/master/ข้อความ/

    Takes a string and returns 'pos' or 'neg'.
    """
    with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm:
        vocabulary = dill.load(in_strm)
    with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm:
        classifier = dill.load(in_strm)
    text = set(word_tokenize(text)) - set(stopwords.words('thai'))
    featurized_test_sentence = {i: (i in text) for i in vocabulary}
    return classifier.classify(featurized_test_sentence)
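# Usage sketch (added for illustration): input and output mirror the docstring
# of the engine-aware variant of this function later in this section.
# >>> sentiment("วันนี้อากาศดีจัง")
# 'pos'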
def document_vector(ss, m, stoi, tok_engine='newmm'):
    """
    :meth: `document_vector` gets a document vector using a pretrained ULMFit model

    :param str ss: sentence to extract embeddings from
    :param m: PyTorch model
    :param dict stoi: string-to-integer dict, e.g.
        {'_unk_':0, '_pad_':1, 'first_word':2, 'second_word':3, ...}
    :param str tok_engine: tokenization engine (recommend using `newmm` if you
        are using a pretrained ULMFit model)
    :return: `numpy.array` of the document vector, sized 300
    """
    s = word_tokenize(ss)
    t = LongTensor([stoi[i] for i in s]).view(-1, 1).cuda()
    t = Variable(t, volatile=False)
    m.reset()
    pred, *_ = m[0](t)
    # Get the average of the last LSTM layer along bptt
    res = to_np(torch.mean(pred[-1], 0).view(-1))
    return res
def get_ner(
    self, text: str, pos: bool = True
) -> Union[List[Tuple[str, str]], List[Tuple[str, str, str]]]:
    """
    Get named entities in text.

    :param string text: Thai text
    :param boolean pos: include Part-Of-Speech tags (True) or not (False)
    :return: list of strings with name labels (and part-of-speech tags)

    **Example**::
        >>> from pythainlp.tag.named_entity import ThaiNameTagger
        >>> ner = ThaiNameTagger()
        >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.")
        [('วันที่', 'NOUN', 'O'), (' ', 'PUNCT', 'O'), ('15', 'NUM', 'B-DATE'),
         (' ', 'PUNCT', 'I-DATE'), ('ก.ย.', 'NOUN', 'I-DATE'),
         (' ', 'PUNCT', 'I-DATE'), ('61', 'NUM', 'I-DATE'), (' ', 'PUNCT', 'O'),
         ('ทดสอบ', 'VERB', 'O'), ('ระบบ', 'NOUN', 'O'), ('เวลา', 'NOUN', 'O'),
         (' ', 'PUNCT', 'O'), ('14', 'NOUN', 'B-TIME'), (':', 'PUNCT', 'I-TIME'),
         ('49', 'NUM', 'I-TIME'), (' ', 'PUNCT', 'I-TIME'), ('น.', 'NOUN', 'I-TIME')]
        >>> ner.get_ner("วันที่ 15 ก.ย. 61 ทดสอบระบบเวลา 14:49 น.", pos=False)
        [('วันที่', 'O'), (' ', 'O'), ('15', 'B-DATE'), (' ', 'I-DATE'),
         ('ก.ย.', 'I-DATE'), (' ', 'I-DATE'), ('61', 'I-DATE'), (' ', 'O'),
         ('ทดสอบ', 'O'), ('ระบบ', 'O'), ('เวลา', 'O'), (' ', 'O'),
         ('14', 'B-TIME'), (':', 'I-TIME'), ('49', 'I-TIME'), (' ', 'I-TIME'),
         ('น.', 'I-TIME')]
    """
    self.__tokens = word_tokenize(text, engine=_WORD_TOKENIZER)
    self.__pos_tags = pos_tag(
        self.__tokens, engine="perceptron", corpus="orchid_ud"
    )
    self.__x_test = self.__extract_features(self.__pos_tags)
    self.__y = self.crf.predict_single(self.__x_test)

    if pos:
        return [
            (self.__pos_tags[i][0], self.__pos_tags[i][1], data)
            for i, data in enumerate(self.__y)
        ]
    return [(self.__pos_tags[i][0], data) for i, data in enumerate(self.__y)]
def sentiment(text, engine='old'):
    """
    :param str text: Thai text
    :param str engine: sentiment analysis engine ('old' or 'ulmfit')
    :return: 'pos' or 'neg'

    **Example**::
        >>> from pythainlp.sentiment import sentiment
        >>> text = "วันนี้อากาศดีจัง"
        >>> sentiment(text)
        'pos'
        >>> sentiment(text, 'ulmfit')
        'pos'
        >>> text = "วันนี้อารมณ์เสียมาก"
        >>> sentiment(text)
        'neg'
        >>> sentiment(text, 'ulmfit')
        'neg'
    """
    if engine == 'old':
        with open(os.path.join(templates_dir, 'vocabulary.data'), 'rb') as in_strm:
            vocabulary = dill.load(in_strm)
        with open(os.path.join(templates_dir, 'sentiment.data'), 'rb') as in_strm:
            classifier = dill.load(in_strm)
        text = set(word_tokenize(text)) - set(stopwords.words('thai'))
        featurized_test_sentence = {i: (i in text) for i in vocabulary}
        return classifier.classify(featurized_test_sentence)
    elif engine == 'ulmfit':
        from pythainlp.sentiment import ulmfit_sent
        tag = ulmfit_sent.get_sentiment(text)
        return "neg" if tag == 0 else "pos"
    else:
        raise Exception("No such engine.")
def romanization(data, engine='royin'):
    """
    :param str data: Thai text to be romanized
    :param str engine: choose among 'royin', 'pyicu', and 'thai2rom'.
        'royin' romanizes according to the standard of the Thai Royal Institute.
        'pyicu' romanizes according to the International Phonetic Alphabet.
        'thai2rom' is a deep-learning Thai romanization engine.
    :return: English (more or less) text that spells out how the Thai text should read.
    """
    word_list = word_tokenize(data)
    listword = []
    i = 0
    if engine == 'royin':
        from .royin import romanization
    elif engine == 'pyicu':
        from .pyicu import romanization
    elif engine == 'thai2rom':
        from pythainlp.romanization.thai2rom import thai2rom
        thai = thai2rom()
        return thai.romanization(data)
    else:
        raise Exception("No such engine.")
    while i < len(word_list):
        listword.append(romanization(word_list[i]))
        i += 1
    return ''.join(listword)
def test_keywords(self):
    self.assertEqual(
        find_keyword(word_tokenize("แมวกินปลาอร่อยรู้ไหมว่าแมวเป็นแมวรู้ไหมนะแมว", engine='newmm')),
        {u'แมว': 4})
def test_segment_mm(self):
    self.assertEqual(
        word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย', engine='mm'),
        [u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
from pythainlp.tokenize import word_tokenize

textTest = "ฉันรักคนไทยที่กินข้าว"
print(word_tokenize("นี่ข้าวใคร"))
    # (Fragment: tail of a helper that saves a marisa_trie.Trie with dill,
    # then reloads it from `path`.)
    dill.dump(marisa_trie.Trie(data), dill_file)
    dill_file.close()
    with open(path, 'rb') as dill_file:
        data = dill.load(dill_file)
    return data


def test_segmenter(segmenter, test):
    """Word segmentation test harness."""
    words = test
    result = segmenter
    correct = (result == words)
    if not correct:
        print('expected', words)
        print('got ', result)
    return correct


if __name__ == "__main__":
    from pythainlp.tokenize import word_tokenize
    text = "ฉันเป็นคนและฉันรักภาษาไทยฉันอยู่ประเทศไทยฉันศึกษาอยู่ที่มหาวิทยาลัยพายุฝนกำลังมาต้องหลบแล้วล่ะคุณสบายดีไหม"
    test = ["ฉัน", "เป็น", "คน", "และ", "ฉัน", "รัก", "ภาษาไทย", "ฉัน", "อยู่",
            "ประเทศไทย", "ฉัน", "ศึกษา", "อยู่", "ที่", "มหาวิทยาลัย", "พายุฝน",
            "กำลัง", "มา", "ต้อง", "หลบ", "แล้ว", "ล่ะ", "คุณ", "สบายดี", "ไหม"]
    print("icu :")
    pyicu = test_segmenter(word_tokenize(text, engine='icu'), test)
    print(pyicu)
    print("newmm :")
    newmm = test_segmenter(word_tokenize(text, engine='newmm'), test)
    print(newmm)
    print("mm :")
    mm = test_segmenter(word_tokenize(text, engine='mm'), test)
    print(mm)
def test_segment_Wordcut(self):
    if sys.version_info >= (3, 4) and sys.platform != "win32" and sys.platform != "win64":
        self.assertEqual(
            word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย', engine='wordcutpy'),
            [u'ฉัน', u'รัก', u'ภาษา', u'ไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คน', u'ไทย'])
def Text(str1):
    if not isinstance(str1, list):
        str1 = word_tokenize(str(str1))
    return nltk.Text(str1)
def test_segment_longest_matching(self):
    self.assertEqual(
        word_tokenize('ฉันรักภาษาไทยเพราะฉันเป็นคนไทย', engine='longest-matching'),
        [u'ฉัน', u'รัก', u'ภาษาไทย', u'เพราะ', u'ฉัน', u'เป็น', u'คนไทย'])
def tokenize(self, x):
    return [t for t in word_tokenize(self.sub_br(x), engine=self.engine)]