def convert_st_to_bow(self, st):
    """Convert a sentence to a bag-of-words count vector over self.words."""
    bow = [0] * len(self.words)
    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))
    # Keep the original casing only when the input is a single proper noun (Np)
    # outside SKIP_WORDS; otherwise retag the lowercased sentence.
    if not (len(tagger[1]) == 1 and tagger[1][0] == 'Np'
            and tagger[0][0] not in SKIP_WORDS):
        tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st.lower()))
    for i, tag in enumerate(tagger[1]):
        if tag in REPLACE:
            tagger[0][i] = REPLACE[tag]
        if tagger[0][i] in self.words:
            bow[self.words.index(tagger[0][i])] = tagger[0].count(tagger[0][i])
    return np.array(bow)
def separate_sentence(self):
    # stockslist.txt holds one financial ticker symbol (plus metadata) per line.
    symbol_arr = []
    with open('./data/stockslist.txt', encoding='utf-8') as acro_file:
        for line in acro_file:
            symbol_arr.append(line.rstrip('\n').split(',')[0].lower())
    self.data = self.execute_special_character(self.data)
    # Split the text on newlines, then on spaces, into small fragments.
    new_sentences = []
    for sentence in self.data.split('\n'):
        new_sentences.extend(sentence.split(' '))
    all_words = []
    for sentence in new_sentences:
        words = ViPosTagger.postagging(ViTokenizer.tokenize(sentence))
        words = self.tokenizer_tunning(words, 1)
        for word in words:
            if not self.is_stop_word(word):
                # Map any known ticker symbol to the placeholder token 'ssi'.
                all_words.append('ssi' if word in symbol_arr else word)
    return all_words
def res_sentence(self, test_sentence):
    test_sentence = ViTokenizer.tokenize(test_sentence)
    test_sentence, pos = ViPosTagger.postagging(test_sentence)
    new_words, pos = self.process(test_sentence, pos)
    X_test = self.sent2features(new_words, pos)
    new_tags = self.crf.predict_single(X_test)
    st1, st2 = [], []
    for i, word in enumerate(new_words):
        if new_tags[i] == 'O':
            # Start a new 'O' chunk at the beginning or right after an entity;
            # otherwise glue consecutive 'O' tokens together with '_'.
            if i == 0 or new_tags[i - 1] != 'O':
                st1.append(word)
                st2.append('O')
            else:
                st1[-1] = st1[-1] + '_' + word
        elif new_tags[i][0] == 'B':
            # 'B-XXX' opens a new entity chunk labelled XXX.
            st1.append(word)
            st2.append(new_tags[i][2:].upper())
        elif new_tags[i][0] == 'I':
            # 'I-XXX' extends the current entity chunk.
            st1[-1] = st1[-1] + '_' + word
    return st1, st2
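# A minimal self-contained sketch (added for illustration, not part of the
# original source) of the BIO-merging scheme res_sentence implements: 'B-*'
# opens a chunk, 'I-*' extends the previous chunk with '_', and runs of 'O'
# tokens are likewise glued together. Words and tags here are hypothetical.
def _demo_bio_merge():
    words = ['ông', 'Nguyễn', 'Văn', 'A', 'phát', 'biểu']
    tags = ['O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O']
    merged, labels = [], []
    for i, word in enumerate(words):
        if tags[i].startswith('I-') or (tags[i] == 'O' and i > 0
                                        and tags[i - 1] == 'O'):
            merged[-1] = merged[-1] + '_' + word   # extend the open chunk
        else:
            merged.append(word)
            labels.append(tags[i][2:].upper() if tags[i].startswith('B-') else 'O')
    print(merged, labels)
    # -> ['ông', 'Nguyễn_Văn_A', 'phát_biểu'] ['O', 'PER', 'O']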
def sentence_segment(self, text, sw_file='./stopwords',
                     candidate_pos=('Nc', 'Np', 'S', 'R', 'A', 'C', 'V', 'I')):
    """Keep only the words whose POS tag is in candidate_pos."""
    # Load the stopword list.
    with open(sw_file, 'r') as f:
        sw = [line.strip() for line in f.readlines()]
    # Word-segment the text and split it into rough sentences on full stops.
    text = ViTokenizer.tokenize(text)
    text = text.replace('‘', ' ').replace('’', ' ')
    sentences = []
    for part in text.split('.'):
        words, tags = ViPosTagger.postagging(part)
        sentence = [w for w, tag in zip(words, tags)
                    if len(w) > 0 and w not in sw and tag in candidate_pos]
        sentences.append(sentence)
    # Drop sentences shorter than the co-occurrence window.
    return [s for s in sentences if len(s) >= self.window_size]
def text_postag(text):
    # ViPosTagger returns a (tokens, tags) pair; map each token to its tag.
    tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    return dict(zip(tokens, tags))
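# Hedged usage sketch (added; not from the original source). The exact
# segmentation and tags depend on the installed pyvi model, so the output
# shown below is only indicative.
def _demo_text_postag():
    mapping = text_postag('Hà Nội là thủ đô của Việt Nam')
    print(mapping)
    # e.g. {'Hà_Nội': 'Np', 'là': 'V', 'thủ_đô': 'N', 'của': 'E', 'Việt_Nam': 'Np'}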
def pos_pyvi():
    # Read the gold-standard file: each line holds word/TAG pairs.
    with open("test_gan_nhan.txt", "r", encoding="utf-8") as file:
        st = file.readlines()
    for i in range(len(st)):
        st[i] = st[i].split()
        for j in range(len(st[i])):
            st[i][j] = st[i][j].split('/')
    # Re-tag each sentence with pyvi.
    s1 = []
    kq = []
    for i in range(len(st)):
        tmp = ""
        for j in range(len(st[i])):
            tmp = tmp + st[i][j][0] + ' '
        s1.append(tmp)
        s = ViPosTagger.postagging(tmp)
        kq.append([])
        for j in range(len(s[0])):
            kq[i].append([s[0][j], s[1][j]])
    # Score: compare the first letter of each predicted tag against the gold
    # tag; 'CH' is kept whole, and a leading 'F' is mapped back to 'CH'.
    H = 0  # hits
    T = 0  # total tokens
    for i in range(len(st)):
        for j in range(len(kq[i])):
            if kq[i][j][1] != "CH":
                kq[i][j][1] = kq[i][j][1][0]
            if kq[i][j][1] == "F":
                kq[i][j][1] = "CH"
        for j in range(len(kq[i])):
            T += 1
            if j < len(st[i]) and st[i][j][1] == kq[i][j][1]:
                H += 1
    # "Độ chính xác của gán nhãn từ loại của Pyvi là" = pyvi's POS-tagging accuracy.
    print("Do chinh xac cua gan nhan tu loai cua Pyvi la: " + str(float(H) / float(T)))
def load_data(data):
    train_data = []
    for line in data:
        # Treat commas as sentence breaks, then split on full stops.
        line = line.replace(',', '.')
        for sentence in line.split('.'):
            # The hyphen is escaped so it is not read as a '$-@' character
            # range (which would also strip digits).
            sentence = re.sub(r'[():;/%$\-@!*&^?><_#+]', ' ', sentence)
            sentence = sentence.replace('"', ' ').replace("'", " ")
            sentence, pos_tag = ViPosTagger.postagging(sentence)
            # Replace proper nouns (Np), classifiers (Nc), unknown words (X),
            # abbreviations (Ny) and numerals (M) with the tag itself.
            for index, pos in enumerate(pos_tag):
                if pos in ('Np', 'Nc', 'X', 'Ny', 'M'):
                    sentence[index] = pos
            sentence = [word.lower().rstrip('\n') for word in sentence if word != '']
            if len(sentence) > 1:
                train_data.append(sentence)
    return train_data
def pyvi_prc(text):
    tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(text))
    result = {}
    for i in range(len(tokens)):
        # Undo pyvi's compound-word joining before using the token as a key.
        result[tokens[i].replace('_', ' ')] = tags[i]
    return result
def get_POS_feature(text):
    # Return the POS tag sequence of an already-tokenized text.
    tag_pos = ViPosTagger.postagging(text)
    return list(tag_pos[1])
def add_postag(comment):
    # Lowercase, tokenize, and suffix every token with its POS tag.
    comment = comment.lower()
    tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(comment))
    X = [token + "_" + tag for token, tag in zip(tokens, tags)]
    return " ".join(X)
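# Hedged usage sketch (added): add_postag exposes POS information to a
# downstream bag-of-words model by fusing each token with its tag. The tags
# shown are indicative only and depend on the pyvi model.
def _demo_add_postag():
    print(add_postag('Sản phẩm rất tốt'))
    # e.g. 'sản_phẩm_N rất_R tốt_A'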
def pos(query):
    # Strip punctuation, then keep only noun-like, adjective and verb tokens.
    # Note: inside a character class the '|' characters are matched literally,
    # not as alternation.
    query = re.sub(r'[?|$|.|!|<|=|,|\-|\'|\“|\”]', r'', query)
    b = ViPosTagger.postagging(ViTokenizer.tokenize(query))
    important = ['N', 'Nc', 'Ny', 'Np', 'Nu', 'A', 'V']
    result = [b[0][i] for i in range(len(b[1])) if b[1][i] in important]
    return ' '.join(result)
def get_Word_based_POS(text):
    # Keep only words whose tag contains N (noun), V (verb) or A (adjective).
    vocab, list_pos = ViPosTagger.postagging(text)
    return [vocab[i] for i, pos in enumerate(list_pos)
            if "N" in pos or "V" in pos or "A" in pos]
def processing_text(text):
    # Normalize spacing around full stops, then tokenize and tag.
    normalized_text = '. '.join([line.strip() for line in text.split('.')])
    tokenized_text, pos_seqs = ViPosTagger.postagging(
        ViTokenizer.tokenize(normalized_text))
    # Replace proper nouns (Np), unit nouns (Nu) and numerals (M) with the tag.
    for i, tag in enumerate(pos_seqs):
        if tag in ['Np', 'Nu', 'M']:
            tokenized_text[i] = tag
    # Drop punctuation tokens.
    return ' '.join(
        [token for token in tokenized_text if token not in string.punctuation])
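# Hedged usage sketch (added): names, units and numbers are collapsed into
# their tag names so they no longer inflate the vocabulary, and punctuation
# is dropped. The output shown is indicative; actual tags depend on pyvi.
def _demo_processing_text():
    print(processing_text('Anh Nam mua 3 kg gạo.'))
    # e.g. 'Anh Np mua M Nu gạo'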
def tokenize(text: str):
    tokenized = ViTokenizer.tokenize(text)
    words, labels = ViPosTagger.postagging(tokenized)
    # Undo pyvi's compound joining and drop empty tokens.
    words = [w.replace('_', ' ') for w in words]
    return [w for w in words if w]
def posvt():
    m = Frame.m
    txt = m.get()
    seg = ViPosTagger.postagging(ViTokenizer.tokenize(txt))
    print(seg)
    # seg is a (tokens, tags) pair, so join word/tag pairs for the clipboard
    # (joining seg directly would fail, since its items are lists).
    pyperclip.copy(" ".join("%s/%s" % (w, t) for w, t in zip(seg[0], seg[1])))
    root10 = tk.Tk()
    root10.title('Result (POS-VT)')
    label0 = tk.Label(root10, text=seg, font=16)
    label0.pack(fill="x")
    root10.mainloop()
def extract_name(text, stopwords):
    # Collect every proper noun (Np) as a candidate name.
    tokenized_text = ViTokenizer.tokenize(text)
    tokenized_text = clean_text(tokenized_text, stopwords)
    words, tags = ViPosTagger.postagging(tokenized_text)
    res = []
    for word, tag in zip(words, tags):
        if tag == "Np":
            res.append(word.replace("_", " "))
    return res
def load_new_stopwords(text):
    # POS tags whose tokens are added to the stopword list.
    not_labels = ['A', 'L', 'R', 'T', 'E', 'M', 'I']
    stop_words = utils.load_stop_words('stopwords.txt')
    stop_words += list(string.punctuation)
    tokens, postag = ViPosTagger.postagging_tokens(text.split())
    stop_word_mini = [tokens[i].lower() for i in range(len(tokens))
                      if postag[i] in not_labels]
    return set(stop_words + stop_word_mini)
def _sanitize(self):
    '''
    Trims nonessential words such as 'and', 'or', 'for'.
    Parts-of-speech types:
    http://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    For Vietnamese, see: https://pypi.org/project/pyvi/
    '''
    words_list = []
    # A pure-ASCII query is assumed to be English and is tagged with nltk.
    if len(self.query_str) == len(self.query_str.encode('utf-8')):
        tags_to_keep = [
            'NN', 'NNS', 'NNP', 'NNPS',               # noun types
            'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',  # verb types
            'JJ', 'JJR', 'JJS',                       # adjective types
            'RB', 'RBR', 'RBS',                       # adverbs
            'CD', 'FW'                                # numbers, foreign words
        ]
        tokens = nltk.word_tokenize(self.query_str)
        for word, tag in nltk.pos_tag(tokens):
            if tag in tags_to_keep:
                words_list.append(word)
    else:
        # Otherwise assume Vietnamese and tag with pyvi.
        tags_to_keep = ['N', 'Ny', 'Np', 'V', 'A', 'R', 'M', 'X']
        tokens = ViTokenizer.tokenize(self.query_str)
        words, tags = ViPosTagger.postagging(tokens)
        for word, tag in zip(words, tags):
            if tag in tags_to_keep:
                words_list.append(word.replace('_', ' '))
    new_query_str = ' '.join(words_list)
    # Fall back to the original query if everything was trimmed away.
    if len(new_query_str) == 0:
        new_query_str = self.query_str
    self.deleted_words += len(self.query_str.split()) - len(new_query_str.split())
    self.query_str = new_query_str
def create_and_train(self):
    # Build the vocabulary and the (pattern, class) documents.
    for key, value in self.json_data.items():
        if 'patterns' not in value:
            continue
        for pattern in value['patterns']:
            tagger = ViPosTagger.postagging(ViTokenizer.tokenize(pattern))
            w = []
            for i, j in enumerate(tagger[1]):
                if j in REPLACE:
                    tagger[0][i] = REPLACE[j]
                if j not in POS_TAG and tagger[0][i] not in STOP_WORDS:
                    w.append(tagger[0][i])
            self.words.extend(w)
            self.documents.append((w, key))
        self.classes.append(key)
    self.words = sorted(list(set(self.words)))
    # Encode each document as a word-count vector plus a one-hot class vector.
    training = []
    for doc in self.documents:
        st_out = [0] * len(self.words)
        for w in doc[0]:
            st_out[self.words.index(w)] = doc[0].count(w)
        class_out = [0] * len(self.classes)
        class_out[self.classes.index(doc[1])] = 1
        training.append([st_out, class_out])
    random.shuffle(training)
    training = np.array(training)
    self.train_x = list(training[:, 0])
    self.train_y = list(training[:, 1])
    with open(SAVE_FILE, "wb") as f:
        pickle.dump({"documents": self.documents,
                     "classes": self.classes,
                     "words": self.words}, f)
    self.model = Model()
    self.model.train(self.train_x, self.train_y)
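# A minimal sketch (added) of the encoding create_and_train produces: one
# word-count row over the sorted vocabulary per document, paired with a
# one-hot class vector. The vocabulary, document and classes are hypothetical.
def _demo_training_row():
    words = sorted({'giá', 'mua', 'bán'})          # hypothetical vocabulary
    doc, key = ['mua', 'giá', 'mua'], 'hoi_gia'    # hypothetical document
    classes = ['chao_hoi', 'hoi_gia']
    st_out = [doc.count(w) for w in words]
    class_out = [1 if c == key else 0 for c in classes]
    print(st_out, class_out)  # [0, 1, 2] over ['bán', 'giá', 'mua']; [0, 1]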
def get_topic_word_in_sentence(sentence):
    # Split tokens into topic words (noun-tagged) and everything else.
    topic_words, other_words = [], []
    # Tokenization is delegated to an external service here instead of
    # calling ViTokenizer.tokenize(sentence) locally.
    token = requests.post(url=url_token, data={"text": sentence}).text.split()
    tokens = " ".join(t for t in token if len(t) > 1)
    token_pos = ViPosTagger.postagging(tokens)
    for word, tag in zip(token_pos[0], token_pos[1]):
        if tag in noun:
            topic_words.append(word)
        else:
            other_words.append(word)
    return topic_words, other_words
def locKiTuDacBiet(s):
    # "Lọc ký tự đặc biệt" = filter special characters. Skip blocklisted
    # tokens entirely, strip a few punctuation characters, then keep only
    # words whose POS tag is not in the excluded set.
    blocklist = {
        'href', 'class', 'hashtag-link', '\n', 'thì', 'là', 'ở', 'đi', 'tao',
        'mày', 'cây', 'đến', 'vừng', 'bán', 'đồ ăn', 'Đồ ăn', 'cơm_chiên',
        'vô', 'cách', 'đây', 'Vị_trí', 'bánh_bao', 'Kem', 'từ', 'ngoài',
        'của', 'xe', 'thứ', 'hôm', 'đó', 'kho', 'quẹt', 'buổi_sáng',
        'Xe_đẩy', 'decor', 'i', 'o', 'đươ', 'c', 'n', 'cu', '_', 'service',
        'Menu', 'bad', 'ㅠ', 'bill', 'Matcha', 'green', 'almond', 'chocolate',
        'PERFECT', 'kpop', 'SG', 'upstair', 'driving', 'in', 'to', 'check',
        'say', 'ran'
    }
    s2 = ""
    if s not in blocklist:
        # Drop '!', '/', '.' and "'" characters.
        s1 = "".join(ch for ch in s if ch not in "!/.'")
        tokens, tags = ViPosTagger.postagging(ViTokenizer.tokenize(u"%s" % s1))
        excluded_tags = {'N', 'Np', 'P', 'E', 'T', 'L', 'M'}
        for word, tag in zip(tokens, tags):
            if tag not in excluded_tags:
                s2 += word + " "
    return s2
def fetch_data(st, t):
    # Pull a value of type t ('NAME', 'NUMBER' or 'DATE') out of the sentence.
    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))
    for i, j in enumerate(tagger[1]):
        if j == 'Np' and t == 'NAME':
            return tagger[0][i]
        if j == 'M':
            if t == 'NUMBER':
                return tagger[0][i]
            if t == 'DATE':
                str_date = tagger[0][i]
                try:
                    return datetime.datetime.strptime(str_date, "%d/%m/%Y")
                except ValueError:
                    # "You did not enter the date in day/month/year format,
                    # or the date is invalid."
                    return (
                        "Bạn chưa nhập đúng định dạng ngày theo ngày/tháng/năm hoặc ngày nhập không hợp lệ",
                        1)
    return False
def get_kqxs(st, session, data):
    # Take the last proper noun (Np) in the sentence as the region/province;
    # default to 'Bắc' so p is always defined.
    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st.title()))
    p = tagger[0][-1] if tagger[1][-1] == 'Np' else 'Bắc'
    url = 'http://xskt.com.vn/rss-feed/'
    rss = {
        'Bắc': 'mien-bac-xsmb.rss',
        'Nam': 'mien-nam-xsmn.rss',
        'Trung': 'mien-trung-xsmt.rss',
        'Bình_Định': 'binh-dinh-xsbdi.rss',
        'Đắc_Lắk': 'dac-lac-xsdlk.rss',
        'Đà Nẵng': 'da-nang-xsdng.rss',
        'Đắc_Nông': 'dac-nong-xsdno.rss',
        'TP.HCM': 'tp-hcm-xshcm.rss',
        'Sài_Gòn': 'tp-hcm-xshcm.rss',
        'Hcm': 'tp-hcm-xshcm.rss',
        'Quảng_Ngãi': 'quang-ngai-xsqng.rss',
        'Quảng_Nam': 'quang-nam-xsqnm.rss'
    }
    try:
        link = url + rss[p]
    except KeyError:
        # Build the feed name from the unaccented province name plus its initials.
        p = no_accent_vietnamese(p.lower().replace("_", " "))
        vt = ""
        for i in p.split():
            vt += i[0]
        link = url + p.replace(" ", "-") + "-xs{0}.rss".format(vt)
    feed = feedparser.parse(link)
    content = ""
    # Fall back to the northern-region feed if the guessed feed is empty.
    if len(feed['items']) == 0:
        feed = feedparser.parse(url + rss['Bắc'])
    for item in feed['items']:
        content += "<div><strong>" + item['title'] + "</strong></div>"
        content += "<div>" + item['summary'].replace("[", "<br/>[").replace(
            "8:", " 8:") + "</div>"
    return content
def get_bmi(st, track, data):
    st = st.lower()
    # Put a space before the units so pyvi sees them as separate tokens.
    if "cm" in st:
        st = st.replace("cm", " cm")
    else:
        st = st.replace("m", " m")
    st = st.replace("kg", " kg")
    arr = []
    tagger = ViPosTagger.postagging(ViTokenizer.tokenize(st))
    for i, j in enumerate(tagger[1]):
        if j == 'M':                      # numeral: height or weight value
            arr.append(float(tagger[0][i]))
        if j == 'Nu' and tagger[0][i] == 'cm':
            arr[0] = arr[0] / 100         # convert the height from cm to m
    if len(arr) < 2:
        # "Give me your height (m) and weight (kg)."
        return ({"text": "Cho em chỉ số về chiều cao (m) và cân nặng (kg)"}, 1)
    arr = sorted(arr)
    h, w = arr[0], arr[1]   # the smaller value is the height in metres
    bmi = w / (h * h)
    if bmi < 18.5:
        tt = "gầy"                  # underweight
    elif bmi < 24.9:
        tt = "bình thường"          # normal
    elif bmi < 29.9:
        tt = "hơi béo"              # overweight
    elif bmi < 34.9:
        tt = "béo phì cấp độ 1"     # obesity class 1
    elif bmi < 39.9:
        tt = "béo phì cấp độ 2"     # obesity class 2
    else:
        tt = "béo phì cấp độ 3"     # obesity class 3
    # "Your BMI is ... Current status: ..."
    return ({
        "text": "Chỉ số BMI của bạn là: %.2f.\nTình trạng hiện tại: %s" % (bmi, tt)
    }, 1)
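# Worked example (added): for an input such as '170cm 65kg' the height is
# converted to 1.7 m, so BMI = 65 / (1.7 * 1.7) ≈ 22.49, which falls in the
# 'bình thường' (normal, 18.5-24.9) band.
def _demo_bmi():
    h, w = sorted([170 / 100, 65.0])   # height in metres, weight in kg
    print(round(w / (h * h), 2))       # 22.49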
def get_L_sentences(self, s):
    from pyvi import ViTokenizer, ViPosTagger
    LSeg, LPOS = ViPosTagger.postagging(ViTokenizer.tokenize(s))
    LRtn = []
    for i, (segment, pos) in enumerate(zip(LSeg, LPOS), start=1):
        LRtn.append(
            CubeItem(index=i,
                     word=segment.replace('_', ' '),
                     lemma=segment.replace('_', ' '),
                     upos=DPOSToCube[pos],
                     xpos='',
                     attrs='',
                     head='',
                     label='',
                     # With 1-based indices the final token is i == len(LSeg);
                     # it carries the 'SpaceAfter=No' marker.
                     space_after='_' if i != len(LSeg) else 'SpaceAfter=No'))
    return [LRtn]
def get_data_for_training(self):
    with open(self.PATH, encoding='utf-8') as json_file:
        training_data = json.load(json_file)
    for pattern in training_data:
        wTV = ViPosTagger.postagging(ViTokenizer.tokenize(pattern['sentence']))
        self.words.extend(wTV[0])
        self.documents.append((wTV[0], pattern['class']))
        if pattern['class'] not in self.classes:
            self.classes.append(pattern['class'])
    self.output_empty = [0] * len(self.classes)
    self.words = [w.lower() for w in self.words if w not in self.ignore_words]
    self.words = sorted(list(set(self.words)))
    # Prepare for predicting.
    self.load_w()
def Process_V_Sentence(sent):
    # Segment the sentence into words and tag each word.
    processed_sent = []
    list_tagged_word = ViPosTagger.postagging(ViTokenizer.tokenize(sent))
    list_word = list_tagged_word[0]   # the words
    list_pos = list_tagged_word[1]    # the tag of each word
    for i in range(len(list_word)):
        # Keep everything except punctuation (F) and pronouns (P).
        if list_pos[i] not in ['F', 'P']:
            # Lowercase everything except abbreviations (Ny) and proper nouns (Np).
            if list_pos[i] not in ['Ny', 'Np']:
                list_word[i] = list_word[i].lower()
            list_word[i] = list_word[i].replace('_', ' ')
            processed_sent.append(list_word[i])
    # Remove the stopwords from the sentence.
    processed_sent = Eliminate_V_Stop_Word("v_stopwords.txt", processed_sent)
    return processed_sent
def Get_List_V_Sent(paragraph):
    # Split a tagged paragraph back into sentences at terminal punctuation.
    list_sents = []
    words_in_sent = []
    list_tagged_word = ViPosTagger.postagging(ViTokenizer.tokenize(paragraph))
    list_word = list_tagged_word[0]   # the words
    list_pos = list_tagged_word[1]    # the tag of each word
    for i in range(len(list_word)):
        if list_pos[i] != 'F':
            # Add the word (with compounds unjoined) to the current sentence.
            list_word[i] = list_word[i].replace('_', ' ')
            words_in_sent.append(list_word[i])
        if (list_pos[i] == 'F') and (list_word[i] in '.!?'):
            # A sentence-terminating punctuation mark closes the sentence.
            sentence = ' '.join(words_in_sent)
            sentence += list_word[i]
            list_sents.append(sentence)
            words_in_sent = []
    return list_sents
def create_pos(sentence):
    sentence = preprocess(sentence)
    pos = ViPosTagger.postagging(ViTokenizer.tokenize(sentence))
    result = [(pos[0][i], pos[1][i]) for i in range(len(pos[0]))]
    result = postprocess_pos(result)
    # Merge numeral/punctuation/numeral triples (e.g. '1 , 5') into one numeral.
    final = []
    i = 0
    while i < len(result):
        if (i < len(result) - 2
                and pos_tags[result[i][-1]] == pos_tags[result[i + 2][-1]] == "numeral"
                and pos_tags[result[i + 1][-1]] == "punctuation"):
            temp = (result[i][0] + result[i + 1][0] + result[i + 2][0],
                    name_to_pos["numeral"])
            i += 3
        else:
            temp = result[i]
            i += 1
        final.append(temp)
    return final
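# A minimal sketch (added) of the numeral-merging step in create_pos: a
# numeral/punctuation/numeral triple such as '1 , 5' becomes the single token
# '1,5'. The coarse tag names below stand in for the pos_tags/name_to_pos
# mappings defined elsewhere in the project.
def _demo_merge_numerals():
    result = [('1', 'numeral'), (',', 'punctuation'), ('5', 'numeral'),
              ('kg', 'noun')]
    final, i = [], 0
    while i < len(result):
        if (i < len(result) - 2 and result[i][1] == result[i + 2][1] == 'numeral'
                and result[i + 1][1] == 'punctuation'):
            final.append((result[i][0] + result[i + 1][0] + result[i + 2][0],
                          'numeral'))
            i += 3
        else:
            final.append(result[i])
            i += 1
    print(final)  # [('1,5', 'numeral'), ('kg', 'noun')]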
def remove_stopword_sent(self, sent):
    new_tokens = []
    new_pos = []
    # Expand acronyms before tagging.
    sent = self.restore_acronym(sent)
    s = ViPosTagger.postagging(ViTokenizer.tokenize(sent))
    for i in range(len(s[0])):
        if s[0][i] in self.stock_code:
            # Treat known stock codes as proper nouns.
            new_tokens.append(s[0][i])
            new_pos.append("Np")
        elif not self.is_stop_word(s[0][i]):
            # Keep non-stopwords with their original tag.
            new_tokens.append(s[0][i])
            new_pos.append(s[1][i])
    return (new_tokens, new_pos)