def find(text):
    text = preprocess(text)
    text = ViTokenizer.tokenize(text)
    stopwords = pickle.load(open('RESTful/stopwords', 'rb'))
    vocal = pickle.load(open('RESTful/vocal', 'rb'))
    model = pickle.load(open('RESTful/model', 'rb'))
    vectorizer = TfidfVectorizer(stop_words=stopwords, vocabulary=vocal)
    corpus = [text]
    x = vectorizer.fit_transform(corpus)
    y = model.predict(x)

    with open('data/data.txt') as file:
        content = file.read()
    lines = content.split('\n')

    result = {}
    index = 0
    # keep sampling random lines until 5 results with the same predicted label are found
    while index < 5:
        i = np.random.randint(0, len(lines) - 50)  # random_integers is deprecated; randint excludes the upper bound
        line = lines[i]
        origin_line = line
        line = preprocess(line)
        line = ViTokenizer.tokenize(line)
        corpus = [line]
        x_find = vectorizer.fit_transform(corpus)
        y_find = model.predict(x_find)
        if y_find == y:
            result[index] = origin_line
            index += 1
    return result

def preprocess(txt, tokenize=True):
    try:
        txt = re.sub(RE_HTML_TAG, ' ', txt)
        txt = re.sub('&.{3,4};', ' ', txt)  # strip HTML entities such as &amp; or &nbsp;
        if tokenize:
            txt = ViTokenizer.tokenize(txt)
        txt = txt.lower()
        txt = re.sub(RE_CLEAR, ' ', txt)
        return txt.strip()
    except Exception:
        traceback.print_exc()
        return ''

def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    text = message.get(attribute)

    # normalize the position of Vietnamese tone marks (e.g. 'òa' -> 'oà')
    tone_map = {
        'òa': 'oà', 'óa': 'oá', 'ỏa': 'oả', 'õa': 'oã', 'ọa': 'oạ',
        'òe': 'oè', 'óe': 'oé', 'ỏe': 'oẻ', 'õe': 'oẽ', 'ọe': 'oẹ',
        'ùy': 'uỳ', 'úy': 'uý', 'ủy': 'uỷ', 'ũy': 'uỹ', 'ụy': 'uỵ',
    }
    for old, new in tone_map.items():
        text = text.replace(old, new)

    if self.tokenizer == 'underthesea':
        from underthesea import word_tokenize
        words = word_tokenize(text, format="text").split()
    else:
        from pyvi import ViTokenizer
        words = ViTokenizer.tokenize(text).split()

    text = ' '.join(words)
    return self._convert_words_to_tokens(words, text)

def makeSummary(sentences, best_sentence, query, summary_length, lambta, IDF):
    summary = [best_sentence]
    sum_len = len(ViTokenizer.tokenize(best_sentence.getOriginalWords()).split())

    # keep adding the sentence with the highest MMR score until the word count exceeds summary_length
    while sum_len <= summary_length:
        MMRval = {}
        for sent in sentences:
            MMRval[sent] = MMRScore(sent, query, summary, lambta, IDF)
        maxxer = max(MMRval, key=MMRval.get)
        summary.append(maxxer)
        sentences.remove(maxxer)
        sum_len += len(ViTokenizer.tokenize(maxxer.getOriginalWords()).split())

    return summary

def tokenize_nmt(text, num_examples=None):
    """Tokenize the English-Vietnamese dataset."""
    source, target = [], []
    for i, line in enumerate(text.split('\n')):
        if num_examples and i > num_examples:
            break
        parts = line.split('\t')
        if len(parts) == 2:
            source.append(parts[0].split(' '))
            segmented = ViTokenizer.tokenize(parts[1])
            target.append(segmented.split(' '))
    return source, target

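# A small usage sketch for tokenize_nmt; the two-line corpus below is made up for
# illustration, and pyvi's ViTokenizer is assumed to be importable:
sample_text = "hello\txin chào\nthank you\tcảm ơn bạn"
src, tgt = tokenize_nmt(sample_text)
# src holds English token lists, tgt holds Vietnamese word-segmented token lists
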
def process(str):
    str = tran(str)
    str = str.lower()
    str = ''.join(c for c in str if c not in punctuation)
    tach = ViTokenizer.tokenize(str)
    filtered_words = [
        word.replace("_", " ") for word in tach.split(" ")
        if word not in list_stopword
    ]
    return [str, filtered_words]

def text_process(line_data):
    line_data = nlp.convert_unicode(line_data)
    # line_data = nlp.chuan_hoa_dau_cau_tieng_viet(line_data)  # disabled: produces broken tokens such as chinh_tri_vẻ
    line_data = ViTokenizer.tokenize(line_data)
    line_data = line_data.lower()
    line_data = re.sub(r'\d', '', line_data).strip()
    line_data = re.sub(
        r'[^\s\wáàảãạăắằẳẵặâấầẩẫậéèẻẽẹêếềểễệóòỏõọôốồổỗộơớờởỡợíìỉĩịúùủũụưứừửữựýỳỷỹỵđ_]',
        ' ', line_data)
    line_data = re.sub(r'\s+', ' ', line_data).strip()
    line_data = remove_stopword(line_data)
    return line_data

def get_tokenizer(sentense):
    '''read the text, then tokenize it'''
    sentense = sentense.lower()
    sentense = ViTokenizer.tokenize(sentense)
    temp = sentense.strip().split()
    delete_stop_words(temp)
    return temp

def add_data_file():
    file = request.files['file']
    if file:
        filename = secure_filename(file.filename)
        file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
        file.save(file_path)
    else:
        return jsonify('NOT FILE')

    with open(file_path) as json_file:
        data = json.load(json_file)
    data = list(data)
    for field in data:
        field["content"] = ViTokenizer.tokenize(field["content"])
        field["author"] = field["author"].strip().replace(' ', '_')
        field["title"] = ViTokenizer.tokenize(field["title"])
        field["description"] = ViTokenizer.tokenize(field["description"])
        field["topic"] = ViTokenizer.tokenize(field["topic"])
    solr.add(data)
    return jsonify("OK")

def get_data(path):
    doc_data = get_datasets_localdata(path)
    X, y = doc_data.data, doc_data.target
    sw = stop_words(r"stopwords.txt")
    documents = []
    for x in X:
        doc = ViTokenizer.tokenize(x)
        doc = gensim.utils.simple_preprocess(doc)
        doc = " ".join(
            [word for word in doc if word.encode('utf-8') not in sw])
        documents.append(doc)
    return documents, y

def vi_term_tokenize(self, text):
    tokens = []
    text = self.__remove_html_tags(text)
    terms = ViTokenizer.tokenize(text)
    for term in terms.split(" "):
        if term.lower() not in stop_words.STOP_WORDS:
            # keep multi-word terms (joined with '_') and alphabetic terms of length >= 3
            if ("_" in term) or (term.isalpha() and len(term) >= 3):
                tokens.append(term)
    tokens = self.__standarlize_duplicate_token(tokens)
    return tokens

def str_idx(corpus, dic):
    X = []
    for i in corpus:
        ints = []
        m = ViTokenizer.tokenize(i).split(' ')
        for k in m:
            try:
                ints.append(dic[k])
            except KeyError:
                # out-of-vocabulary words fall back to index 2
                ints.append(2)
        X.append(ints)
    return X

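# A minimal usage sketch for str_idx; the dictionary below is illustrative, with real
# code building it from a training vocabulary and reserving index 2 for unknown words:
sample_dic = {'tôi': 3, 'thích': 4, 'đọc': 5, 'sách': 6}
encoded = str_idx(['tôi thích đọc sách'], sample_dic)
# each sentence becomes a list of integer ids; tokens missing from sample_dic map to 2
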
def word_segment(text, sw_file='./stopwords'):
    # Get stopwords
    with open(sw_file, 'r') as f:
        sw = [line.strip() for line in f.readlines()]

    # Word segmentation
    text = ViTokenizer.tokenize(text)
    text = gensim.utils.simple_preprocess(text)
    text = [w for w in text if w not in sw]
    text = " ".join(text)
    return text

def clean_data(content):
    list_words = ViTokenizer.tokenize(content).split()

    # Get stopwords
    stopwords = []
    f = open('stopwords.txt', 'r', encoding="utf-8")
    for word in f:
        stopwords.append(word.strip())
    f.close()

    # Keep only the words that are not stopwords
    words = []
    for word in list_words:
        if word not in stopwords:
            words.append(word)
    return ' '.join(words)

def get_data(path):
    doc_data = get_datasets_localdata(path)
    X, y = doc_data.data, doc_data.target
    sw = stop_words(r"stopwords.txt")
    documents = []
    for x in X:
        doc = ViTokenizer.tokenize(x)
        doc = re.sub(r'^https?:\/\/.*[\r\n]*', '', doc, flags=re.MULTILINE)  # drop URLs
        doc = re.sub(r" \d+", " ", doc)  # drop standalone numbers
        doc = gensim.utils.simple_preprocess(doc)
        doc = " ".join([word for word in doc if word.encode('utf-8') not in sw])
        documents.append(doc)
    return documents, y

def preprocess_vi(chatbot, statement):
    """
    Tokenize the statement text with pyvi and store the result as extra data.
    """
    import pyvi.ViTokenizer as tokenizer

    tokenized_text = tokenizer.tokenize(statement.text)
    statement.add_extra_data('tokenized_text', tokenized_text)
    # statement.text = statement.text.lower()
    return statement

def read_data_from_file_to_list(file_name):
    X = []
    y = []
    with open(file_name) as lines:
        for line in lines:
            try:
                json_data = json.loads(line)
                X.append(ViTokenizer.tokenize(json_data['comment']))
                star_num = int(json_data['star'])
                y.append(get_sentiment_from_star(star_num))
            except Exception:
                # skip malformed lines, but print them for inspection
                print(line)
    return X, y

def tokenize(self, text, index=-1):
    """
    :param text: raw input string
    :return: list of word-segmented tokens
    """
    if index != -1:
        logging.debug('Tokenize count: %s', index)
        if index == 23730:
            logging.debug('F*****g text: %s', text)
    result = ViTokenizer.tokenize(text).split(' ')
    return result

def preprocess(self, txt, tokenize=True):
    txt = re.sub('&.{3,4};', ' ', txt)
    txt = utils.convertwindown1525toutf8(txt)
    if tokenize:
        txt = ViTokenizer.tokenize(txt)
    txt = txt.lower()
    txt = self.replace_common_token(txt)
    txt = self.remove_emoji(txt)
    txt = re.sub(RE_CLEAR_1, ' ', txt)
    txt = re.sub(RE_CLEAR_2, ' ', txt)
    txt = re.sub(RE_CLEAR_3, ' ', txt)
    txt = utils.chuan_hoa_dau_cau_tieng_viet(txt)
    return txt.strip()

def bag_of_words(s, words):
    bag = [0 for _ in range(len(words))]

    s_words = ViTokenizer.tokenize(s).split()
    # s_words = nltk.word_tokenize(s)
    s_words = [word.lower() for word in s_words]

    # mark every vocabulary word that appears in the sentence
    for se in s_words:
        for i, w in enumerate(words):
            if w == se:
                bag[i] = 1

    return numpy.array(bag)

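# Usage sketch for bag_of_words; the vocabulary below is illustrative, while a real
# chatbot would build `words` from its training data (numpy and pyvi assumed installed):
from pyvi import ViTokenizer
vocab = ViTokenizer.tokenize("xin chào bạn khỏe không").split()
vector = bag_of_words("chào bạn", vocab)
# vector has one entry per vocabulary word: 1 if it occurs in the sentence, else 0
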
def extract_name(text, stopwords):
    tokenized_text = ViTokenizer.tokenize(text)
    tokenized_text = clean_text(tokenized_text, stopwords)
    words, tags = ViPosTagger.postagging(tokenized_text)
    res = []
    for i in range(len(words)):
        # Np is the proper-noun tag in pyvi's POS tagger
        if tags[i] == "Np":
            res.append(words[i].replace("_", " "))
    return res

def sent_embedding_with_w2v(text, sentences):
    w2v = Word2Vec.load("/home/thangnd/git/python/Vietnamese_doc_summarization_basic/vi/vi.bin")
    vocab = w2v.wv.vocab
    X = []
    for sentence in sentences:
        sentence = ViTokenizer.tokenize(sentence)
        words = sentence.split(" ")
        sentence_vec = np.zeros((100))
        for word in words:
            if word in vocab:
                sentence_vec += w2v.wv[word]
        X.append(sentence_vec)
    return X

def word_count(text_list):
    text = " ".join(text_list)
    text = ViTokenizer.tokenize(text)
    counts = dict()
    for word in text.split():
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

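# Usage sketch for word_count (pyvi assumed installed); the input strings are illustrative:
freqs = word_count(["tôi thích đọc sách", "tôi thích âm nhạc"])
# freqs maps each word-segmented token to its number of occurrences across the list
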
def predict_specific_content():
    file_path = os.path.join(dir_path, "specific_test.txt")
    content = []
    with open(file_path, 'r', encoding="utf-8") as file:
        line = file.readlines()
    line = ' '.join(line)
    line = gensim.utils.simple_preprocess(line)
    line = ' '.join(line)
    line = ViTokenizer.tokenize(line)
    content.append(line)

    content_data_tfidf = tfidf_vector.transform(content)
    prediction = trained_model.predict(content_data_tfidf)
    return prediction[0]

def get_top_n_words_tf(doc, n=None):
    with open("stopwords_vn.txt") as f:
        content = f.readlines()
    stopwords = frozenset([x.strip() for x in content])

    words = ViTokenizer.tokenize(convert_text(doc))
    vec = CountVectorizer(stop_words=stopwords).fit([words])
    bag_of_words = vec.transform([words])
    sum_words = bag_of_words.sum(axis=0)

    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    topn = words_freq[:n]
    topn = [item[0] for item in topn]
    return topn

def getKeyWord(self):
    doc = self.text
    content = u''
    for i in self.getContent().lower().split('\n'):
        t = ViTokenizer.tokenize(i)
        content = content + t + u'\n'
    tfidfDict = tfidf(content)
    listkey_init = top(10, tfidfDict)
    # candidate terms: roughly the top 4/5 of the vocabulary by tf-idf
    tm = list(top(len(list(tfidfDict.keys())) * 4 // 5, tfidfDict).keys())
    global pharse
    pharse = []
    for word in listkey_init:
        generatePharse(word, content, doc, tm)
    return pharse

def tachTu(file):
    with io.open(file, 'r', encoding='utf8') as f:
        data = json.load(f)

    # word-segment every comment, keeping its rating
    newData = []
    for i in range(0, len(data)):
        newString = ViTokenizer.tokenize(data[i]['comment'])
        newData.append({
            'rating': data[i]['rating'],
            'comment': newString
        })

    with io.open(file, 'w', encoding='utf8') as f:
        json.dump(newData, f)

def search_synonym(query):
    try:
        solr = connect_solr()
        list_words = ViTokenizer.tokenize(query).split()
        stopwords = utils.get_stopwords()

        # keep only the words that are not stopwords
        words = []
        for word in list_words:
            if word not in stopwords:
                words.append(word)
    except Exception:
        print("[ERROR] search synonym error: Something went wrong!")

def get_tokenizer(link):
    '''read the text file, then tokenize it'''
    with open(link, 'r', encoding='utf-8') as f:
        sentense = f.read()
    sentense = sentense.lower()
    sentense = ViTokenizer.tokenize(sentense)
    temp = sentense.strip().split()
    delete_stop_words(temp)
    return temp

def preprocess_text(text):
    text = parse_html_v2(text)
    text = text.lower()
    text = remove_links_content(text)
    text = remove_emails(text)
    text = remove_special_tags(text)            # remove content between {}
    text = remove_punctuation(text)             # remove all punctuation
    text = split_alphanum(text)                 # add a space between words and numbers
    text = strip_numeric(text)                  # remove digits
    text = strip_non_alphanum(text)             # remove non-alphabetic characters
    text = strip_short(text, minsize=2)         # remove words shorter than minsize
    text = remove_multiple_space(text).strip()  # collapse whitespace and strip
    text = ViTokenizer.tokenize(text)
    return text