def review_to_words(review, filename):
    """
    Convert a raw review to a string of meaningful words.
    :param review: raw review text (UTF-8 byte string)
    :param filename: path to a stop-word list, one word per line
    :return: meaningful_words joined into a single string
    """
    # 1. Split into individual words.
    # words = review.lower().split()
    words = review.split()
    # 2. Load the stop-word list. (Searching a set is much faster than
    #    searching a list, so a set could be used here instead.)
    with open(filename, "r") as f3:
        dict_data = f3.read()
    array = dict_data.splitlines()
    # 3. Remove stop words.
    meaningful_words = [w for w in words if w not in array]
    # 4. Join the words back into one string separated by spaces and return.
    return " ".join(meaningful_words)
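# Usage sketch (assumptions: 'datavn/question_stopwords.txt' is the stop-word
# list referenced by load_data below, and the input is a plain byte string;
# the sample question is only illustrative).
if __name__ == '__main__':
    sample_question = 'ai la tac gia cua truyen kieu ?'
    print review_to_words(sample_question, 'datavn/question_stopwords.txt')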
def remove_stop_postag(dataset, output_dir):
    utils.mkdir(output_dir)
    stack = os.listdir(dataset)
    # print 'loading data in ' + dataset
    total_doc = 0
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(dataset, file_name)
        if (os.path.isdir(file_path)):
            # If this is a directory, push its contents onto the stack.
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            with open(file_path, 'r', encoding='utf-8') as fr:
                data = unicodedata.normalize('NFKC', fr.read().strip())
                original_content = tokenizer.predict(data)
                content = map(lambda x: ViPosTagger.postagging(x),
                              spliter.split(original_content))
                clean_content = []
                for info in content:
                    sen = []
                    for i in xrange(len(info[0])):
                        if is_exist(info[1][i]):
                            sen.append(info[0][i])
                    clean_content.append(u' '.join(sen))
            with open(os.path.join(output_dir, os.path.basename(file_name)),
                      'w', encoding='utf-8') as fw:
                if len(clean_content) > 0:
                    fw.write(u'\n'.join(clean_content))
                else:
                    fw.write(original_content)
            total_doc += 1
def load_data(filename):
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = question.replace("\n", "")
            s1 = ViPosTagger.postagging(
                unicode(question, encoding='utf-8'))  # POS tagging
            r1 = []
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            z = ' '.join(r1)
            col4.append(z)
            col1.append(label1)
            col2.append(label2)
            # col3.append(question)
    d = {"label1": col1, "label2": col2, "question": col4}
    train = pd.DataFrame(d)
    if filename == 'datavn/train':
        joblib.dump(train, 'model_pos/train_rf_pos12.pkl')
    else:
        joblib.dump(train, 'model_pos/test_rf_pos12.pkl')
    return train
def load_data(filename, dict):
    res = []
    col1 = []
    col2 = []
    col3 = []
    col4 = []
    with open(filename, 'r') as f, open(dict, "w") as f2:
        for line in f:
            label1, p, label2, question = line.split(" ", 3)
            question = review_to_words(question,
                                       'datavn/question_stopwords.txt')
            # question = review_add_pos(question, 'datavn/question_stopwords.txt')
            col1.append(label1)
            col2.append(label2)
            col3.append(question)
        ngram = ngrams_array(col3, 2)  # dictionary of words and their occurrence counts
        dict_arr = []  # list of words with frequency < 1
        for x in ngram:
            p = ngram.get(x)
            if p < 1:
                dict_arr.append(x)
                f2.write(x + "\n")
        col4 = []
        for q in col3:
            r1 = []
            r2 = []
            q = review_to_words2(q, dict, 2)  # q is a single sentence
            q1 = [' '.join(x) for x in ngrams(q, 1)]  # q1: list of 1-grams
            s1 = ViPosTagger.postagging(
                ViTokenizer.tokenize(unicode(q, encoding='utf-8')))  # POS tagging
            for i1, i2 in zip(s1[0], s1[1]):
                t1 = i1 + "_" + i2
                t1 = t1.encode('utf-8')
                r1.append(t1)
            s2 = ' '.join(i for i in s1[1])  # POS tags of the sentence as a string, e.g. "N V E N"
            q2 = [' '.join(x) for x in ngrams(q, 2)]    # q2: list of word 2-grams
            s22 = [' '.join(x) for x in ngrams(s2, 2)]  # s22: list of tag 2-grams
            q3 = (' '.join(x.replace(' ', '_') for x in q2)).split()
            s3 = (' '.join(x.replace(' ', '_') for x in s22)).split()
            for i1, i2 in zip(q3, s3):
                t2 = i1 + "_" + i2
                r2.append(t2)
            y = r1 + r2
            # z1 = [' '.join(x) for x in y]
            z = ' '.join(y)
            col4.append(z)
            # col4.append(q)
    d = {"label1": col1, "label2": col2, "question": col4}
    train = pd.DataFrame(d)
    return train
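# The helpers ngrams() and ngrams_array() used by load_data above are not shown
# in this file. The sketch below is an assumption of what they roughly do
# (token n-grams and n-gram counts over a list of sentences); the underscored
# names are illustrative and the real helpers may differ in detail.
def _ngrams_sketch(text, n):
    # Return the list of n-gram tuples over whitespace tokens.
    tokens = text.split()
    return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]


def _ngrams_array_sketch(sentences, n):
    # Count n-gram occurrences across a list of sentences.
    counts = {}
    for sen in sentences:
        for gram in _ngrams_sketch(sen, n):
            key = ' '.join(gram)
            counts[key] = counts.get(key, 0) + 1
    return counts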
def review_add_pos(review, filename):
    words = review.split()
    with open(filename, "r") as f3:
        dict_data = f3.read()
    array = dict_data.splitlines()
    meaningful_words = [w for w in words if w not in array]
    b = " ".join(meaningful_words)  # sentence after stop-word removal
    words_list = b.split()
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(b, encoding='utf-8')))  # POS tagging
    a = tup[1]
    c = words_list + a
    return " ".join(c)
def test_ner(crf, test_sent):
    from tokenizer.tokenizer import Tokenizer
    token = Tokenizer()
    token.run()
    arr_featurized_sent = []
    postaged_sent = ViPosTagger.postagging(token.predict(test_sent))
    print postaged_sent
    test_arr = []
    for i in xrange(len(postaged_sent[0])):
        test_arr.append((postaged_sent[0][i], postaged_sent[1][i]))
    print test_arr
    featurized_sent = sent2features(test_arr)
    arr_featurized_sent.append(featurized_sent)
    predict = crf.predict(arr_featurized_sent)
    return zip(test_arr, predict[0])
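# Usage sketch: 'model/crf.model' is a hypothetical path to a CRF trained with
# sklearn-crfsuite and saved via joblib; both the path and the joblib import
# are assumptions, and the sentence is only an illustration.
if __name__ == '__main__':
    from sklearn.externals import joblib
    crf = joblib.load('model/crf.model')
    for (word, postag), label in test_ner(crf, u'Ha Noi la thu do cua Viet Nam'):
        print word, postag, label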
def count_tokens():
    print('count tokens...')
    statistic = {name: {} for name in my_map.name2label.keys()}
    stack = os.listdir(tokenized_dataset)
    print 'loading data in ' + tokenized_dataset
    while (len(stack) > 0):
        file_name = stack.pop()
        file_path = os.path.join(tokenized_dataset, file_name)
        if (os.path.isdir(file_path)):
            utils.push_data_to_stack(stack, file_path, file_name)
        else:
            print('\r%s' % (file_path)),
            sys.stdout.flush()
            with open(file_path, 'r', encoding='utf-8') as fp:
                label = utils.get_dir_name(file_path)
                for sen in fp:
                    sen = sen.strip()
                    tag = ViPosTagger.postagging(sen)
                    # Keep only tokens tagged as nouns (N).
                    tokens = [tag[0][i] for i in xrange(len(tag[0]))
                              if tag[1][i] == u'N']
                    update_count_tokens(statistic, label, tokens)
def ner_crf(question):
    text = ViPosTagger.postagging(question)
    detect = []
    ar = []
    for i in range(len(text[0])):
        ar.append((text[0][i], text[1][i]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/crf_ner_no_accent/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for i in range(len(detect[0])):
        k = detect[0][i][0]
        v = y_detect[0][i]
        pred.append((k, v))
    return pred
def detect_entity(question):
    text = ViPosTagger.postagging(ViTokenizer.tokenize(question))
    detect = []
    ar = []
    for i in range(len(text[0])):
        ar.append((text[0][i], text[1][i]))
    detect.append(ar)
    X_detect = [sent2features(s) for s in detect]
    tagger = pycrfsuite.Tagger()
    tagger.open('./adapter/ner_crf/crf.model')
    y_detect = [tagger.tag(xseq) for xseq in X_detect]
    pred = []
    for i in range(len(detect[0])):
        k = detect[0][i][0]
        k = k.replace("_", " ")  # restore spaces inside multi-word tokens
        v = y_detect[0][i]
        pred.append((k, v))
    return pred
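# Usage sketch: relies on the './adapter/ner_crf/crf.model' file already opened
# by detect_entity above; the question string is only an illustration.
if __name__ == '__main__':
    for token, label in detect_entity(u'Truong dai hoc Bach Khoa Ha Noi o dau ?'):
        print token, label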
# -*- coding: utf8 -*-
import codecs
import sys

sys.stdout = codecs.getwriter('utf_8')(sys.stdout)
sys.stdin = codecs.getreader('utf_8')(sys.stdin)

import pyvi
from pyvi.pyvi import ViTokenizer, ViPosTagger

# print(ViTokenizer.tokenize(u"Tôi ăn xôi xéo"))
output = ViPosTagger.postagging(ViTokenizer.tokenize(u"Tôi ăn rất nhiều cơm"))
print(output[0][0])
from pyvi.pyvi import ViTokenizer, ViPosTagger

# with open('test.txt', 'r') as f:
#     tf = f.read().splitlines()
#     l = []
#     ar = []
#     for i in range(len(tf)):
#         if tf[i] != "":
#             a = tf[i].split(' ')
#             l.append(tuple(a))
#     ar.append(l)
# print(ar)

with open('predict.txt', 'r') as f:
    tf = f.read()

text = ViPosTagger.postagging(ViTokenizer.tokenize(tf))
test = []
ar = []
for i in range(len(text[0])):
    l = []
    l.append(text[0][i])
    l.append(text[1][i])
    ar.append(tuple(l))
test.append(ar)


def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
def extract(
        id="text",
        content="text",
        part_index="int",
        chap_index="int",
        sec_index="int",
        law_index="int",
        item_index="int",
        start_index="int",
        end_index="int",
):
    sent_index = 0
    for s in content[start_index:end_index].split("\n"):
        if s != "":
            # Find candidate sentence boundaries within the line.
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            lent = divlaw.lenIterator(it)
            it = re.finditer(r"(.(?!(\.\s)))+.{2}", s, re.I)
            listIndex = []
            if item_index is None:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, 0)
            else:
                position = "{}_{}_{}_{}_{}".format(
                    part_index + 1, chap_index + 1, sec_index + 1,
                    law_index + 1, item_index + 1)
            if lent > 0:
                for i in it:
                    listIndex.append(i.start())
                    if (len(s) - i.end()) > 5:
                        listIndex.append(i.end())
                        lent += 1
            else:
                listIndex.append(0)
            for j in range(0, lent):
                if j != (lent - 1):
                    string = handle_string.to_unicode(s[listIndex[j]:listIndex[j + 1]])
                else:
                    string = handle_string.to_unicode(s[listIndex[j]:])
                string = string.replace("\\", '')
                # Tokenize and POS-tag the segment once, then split the result.
                tagged = ViPosTagger.postagging(ViTokenizer.tokenize(string))
                tokenize = tagged[0]
                pos_tag = tagged[1]
                tk = []
                sent_index += 1
                for token in tokenize:
                    tk.append(token.encode('utf-8'))
                if '' in tk:
                    continue
                yield [id, position, sent_index - 1, " ".join(tk), tk, pos_tag]
# -*- encoding: utf8 -*-
from pyvi.pyvi import ViPosTagger, ViTokenizer

a = ViPosTagger.postagging(
    ViTokenizer.tokenize(u"Trường đại học Bách Khoa Hà Nội"))
print a
def add_pos_sen(review):
    tup = ViPosTagger.postagging(
        ViTokenizer.tokenize(unicode(review, encoding='utf-8')))  # POS tagging
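    # The original function body ends after the call above. The return below is
    # a hedged completion (an assumption, not the original code), mirroring
    # review_add_pos: the tokenized words followed by their POS tags.
    return " ".join(tup[0] + tup[1])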
# -*- coding=utf-8 -*-
from pyvi.pyvi import ViTokenizer, ViPosTagger
import requests

test_sent = "Đệ nhất phu nhân Mỹ Melania Trump cảm ơn Chelsea Clinton, con gái của cựu Tổng thống Bill Clinton và cựu Ngoại trưởng Hillary Clinton, vì đã lên tiếng bênh vực con trai 11 tuổi của bà trước những ý kiến trái chiều của dư luận."

url = "http://ai.topica.vn:9119/get_mlbka"
headers = {
    'cache-control': "no-cache",
    'postman-token': "dd327f89-2a5f-bf16-c115-590b590e32c3"
}
response = requests.request("POST", url, data=test_sent, headers=headers)
tach_tu_anh_son = response.text

# POS-tag the externally tokenized text and the locally tokenized text,
# then print both for comparison.
postag_as = ViPosTagger.postagging(tach_tu_anh_son)
y = zip(postag_as[0], postag_as[1])
print repr(y).decode('unicode-escape')

postaged_sent = ViPosTagger.postagging(
    ViTokenizer.tokenize(test_sent.decode('utf-8')))
x = zip(postaged_sent[0], postaged_sent[1])
print repr(x).decode('unicode-escape')