from vncorenlp import VnCoreNLP


def simple_usage():
    # Uncomment this line for debugging
    # logging.basicConfig(level=logging.DEBUG)

    # Raw string so the backslashes in the Windows path are not treated as escapes
    vncorenlp_file = r'D:\study\PlagismDetector\PlagismDetector/VnCoreNLP/VnCoreNLP-1.1.1.jar'
    sentences = 'VTV đồng ý chia sẻ bản quyền World Cup 2018 cho HTV để khai thác. ' \
                'Nhưng cả hai nhà đài đều phải chờ sự đồng ý của FIFA mới thực hiện được điều này.'

    # Use "with ... as" to close the server automatically
    with VnCoreNLP(vncorenlp_file) as vncorenlp:
        print('Tokenizing:', vncorenlp.tokenize(sentences))
        print('POS Tagging:', vncorenlp.pos_tag(sentences))
        print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
        print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
        print('Annotating:', vncorenlp.annotate(sentences))
        print('Language:', vncorenlp.detect_language(sentences))

    # This way, you have to close the server manually by calling close()
    vncorenlp = VnCoreNLP(vncorenlp_file)
    print('Tokenizing:', vncorenlp.tokenize(sentences))
    print('POS Tagging:', vncorenlp.pos_tag(sentences))
    print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
    print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
    print('Annotating:', vncorenlp.annotate(sentences))
    print('Language:', vncorenlp.detect_language(sentences))
    # Do not forget to close the server
    vncorenlp.close()
from vncorenlp import VnCoreNLP

# word_emb_phoBert, sent_emb_sif, SIFRank and SIFRank_plus are assumed to be
# provided by the accompanying SIFRank project modules.


class SIFRank4VN:

    def __init__(self):
        # path = os.path.dirname(os.path.realpath('__file__'))
        self.vncorenlp = VnCoreNLP(
            "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
            annotators="wseg,pos",
            max_heap_size='-Xmx500m')
        self.phoBERT = word_emb_phoBert.WordEmbeddings()
        self.SIF = sent_emb_sif.SentEmbeddings(self.phoBERT,
                                               lamda=1.0,
                                               embeddings_type='bert')

    def sifrank_extract(self, text, nphrase=15, ratio=0.6):
        keyphrases = SIFRank(text, self.SIF, self.vncorenlp,
                             N=nphrase, ratio=ratio)
        return keyphrases

    def sifrank_plus_extract(self, text, nphrase=15, ratio=0.6):
        keyphrases = SIFRank_plus(text, self.SIF, self.vncorenlp,
                                  N=nphrase, ratio=ratio)
        return keyphrases

    def close_vncorenlp(self):
        self.vncorenlp.close()
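# A minimal usage sketch for the class above, assuming the jar and SIFRank modules
# are available at the paths shown; the sample text is only illustrative. The
# extractor spawns a VnCoreNLP server on construction, so create it once and close
# it when done.
if __name__ == '__main__':
    extractor = SIFRank4VN()
    try:
        doc = 'Trí tuệ nhân tạo đang thay đổi cách chúng ta xử lý ngôn ngữ tự nhiên.'
        print(extractor.sifrank_extract(doc, nphrase=5))
        print(extractor.sifrank_plus_extract(doc, nphrase=5))
    finally:
        extractor.close_vncorenlp()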
import re

import pandas as pd
from vncorenlp import VnCoreNLP


def nlp_tokenize(path):
    data = pd.read_excel(path)
    data = data[['ID', 'Content', 'ID người đăng']]
    data = data.dropna()
    data['Content'] = data['Content'].str.strip()
    data['Content'] = data['Content'].str.lower()
    data['status'] = data['Content']
    for i in range(len(data['status'])):
        # Keep only word characters; raw string avoids the invalid '\W' escape warning
        data['status'].iloc[i] = re.sub(r'\W+', ' ', data['Content'].iloc[i])
        data['Content'].iloc[i] = data['status'].iloc[i]

    vncorenlp_file = r'VnCoreNLP/VnCoreNLP-1.1.1.jar'
    vncorenlp = VnCoreNLP(vncorenlp_file)
    # content = vncorenlp.tokenize(content)
    for i in range(len(data['status'])):
        data['status'].iloc[i] = vncorenlp.tokenize(data['status'].iloc[i])

    key_word = []
    for i in data['status']:
        key_word = key_word + i
    vncorenlp.close()
    return key_word, data[['Content', 'ID', 'ID người đăng']]
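# A small sketch of calling nlp_tokenize; 'posts.xlsx' is a hypothetical file that
# must contain the 'ID', 'Content' and 'ID người đăng' columns the function expects.
if __name__ == '__main__':
    key_word, posts = nlp_tokenize('posts.xlsx')
    print('Number of tokenized sentences collected:', len(key_word))
    print(posts.head())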
import csv
import os
import sys

from vncorenlp import VnCoreNLP

sys.path.insert(1, os.getcwd() + "/PreprocessingComponent")
from PreprocessingComponent.pdfminer3 import Pdf_extract

# Resolve the jar path depending on where the script is launched from
if "PreprocessingComponent" in os.getcwd():
    vncorenlp_file = os.getcwd() + '/VnCoreNLP/VnCoreNLP-1.1.1.jar'
else:
    vncorenlp_file = os.getcwd(
    ) + '/PreprocessingComponent/VnCoreNLP/VnCoreNLP-1.1.1.jar'

vncorenlp = VnCoreNLP(vncorenlp_file,
                      annotators="wseg,pos,ner,parse",
                      max_heap_size='-Xmx4g',
                      port=6000)
# annotator = VnCoreNLP(vncorenlp_file, annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g', port=6000)
"""def previous_and_next(some_iterable):
    prevs, items, nexts = tee(some_iterable, 3)
    prevs = chain([None], prevs)
    nexts = chain(islice(nexts, 1, None), [None])
    return zip(prevs, items, nexts)


# NOTE:
# The input of each processing function is a file name; the output of the ...2txt functions is a list of sentences.
# Each element of the outer list (one sentence) is an inner list of words segmented by VnCoreNLP.

#### ------------------------------------ helper functions for the docx2txt function


# Get paragraph string. Input is a paragraph element
def para_string(para):
    string = ""
    # if (str(para)[21:34] not in str(wp_tbl)):# and (str(para)[21:34] not in str(wp_txbx)):
    # res.append([os.path.basename(filename), b, num_word])  # filename, list of sentences, word count of each sentence
    # print("Run time of file ", filename, " took: --- %s seconds ---" % (time.time() - start_time))
    return pos_tag, os.path.basename(filename), b, num_word


def rtf2txt(filename):
    # Read the RTF file that was passed in (the original opened a hard-coded "yourfile.rtf")
    with open(filename) as infile:
        for line in infile:
            print(line)


if __name__ == '__main__':
    list_filename = [
        "docFile_test/bacho1.docx",
        "docFile_test/sample1.doc",
        "docFile_test/sacvui.pdf",
        "docFile_test/bacho.docx",
    ]
    for filename in list_filename:
        # Time each file separately
        start_time = time.time()
        a = preprocess(filename)
        print(a)
        print("\n\nRun time of file ", filename,
              " took: --- %s seconds ---" % (time.time() - start_time))
        # print("File name: ", a)
        # print("\n List of sentences in the file: ", b)
        # print("\n Word counts of the file: ", c)
    vncorenlp.close()
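# A small sketch of consuming the preprocess() return value directly; the tuple
# layout (POS tags, base file name, sentence list, per-sentence word counts) follows
# the return statement above, and the file path is only illustrative.
def show_summary(path="docFile_test/bacho.docx"):
    pos_tag, name, sentences, num_word = preprocess(path)
    print("File:", name)
    print("Number of sentences:", len(sentences))
    print("Words per sentence:", num_word)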
# VnCoreNLP, Token, NER_TAGS, wrong_entity, popular_phrase_part and
# popular_prefix_named_entity are assumed to be imported or defined earlier in this module.
class Extractor:

    def __init__(self, jarfile='VnCoreNLP-1.1.1.jar'):
        print('Init VnCoreNLP Annotator...')
        self.annotator = VnCoreNLP(jarfile,
                                   annotators="wseg,pos,ner,parse",
                                   max_heap_size='-Xmx2g')

    def stop(self):
        self.annotator.close()

    def _pos_tagging(self, text):
        pos_tagged_text = self.annotator.pos_tag(text)
        return pos_tagged_text

    def _ner(self, text):
        ner_text = self.annotator.ner(text)
        return ner_text

    def _lemmatize(self, doc, allowed_postags=('N', 'Np', 'V')):
        sentences = []
        ignores = set()
        for sent in doc:
            new_sent = []
            for word, tag in sent:
                new_sent.append(word)
                if tag not in allowed_postags:
                    ignores.add(word)
            sentences.append(new_sent)
        return sentences, ignores

    def _get_named_entities(self, text):
        endline = ('.', 'O')
        old_tag = ''
        entity_segments = []
        entities = []
        for sent in text:
            sent.append(endline)
            for word, tag in sent:
                # not a segment of a named entity
                if len(tag) < 3 or tag[-3:] not in NER_TAGS:
                    if entity_segments:
                        entity = ' '.join(entity_segments)
                        if (entity, old_tag) not in entities and not any(
                                p in entity.lower() for p in wrong_entity):
                            entities.append((entity, old_tag))
                        entity_segments = []
                        old_tag = ''
                    continue
                # is a segment of a named entity
                tag = tag[-3:]
                if tag != old_tag:
                    if entity_segments:
                        entity = ' '.join(entity_segments)
                        if (entity, old_tag) not in entities and not any(
                                p in entity.lower() for p in wrong_entity):
                            entities.append((entity, old_tag))
                        entity_segments = []
                    old_tag = tag
                entity_segments.append(word)
        return entities

    def extract(self, text):
        annotated_text = self.annotator.annotate(text)
        ner_text = [[(word['form'], word['nerLabel']) for word in sent]
                    for sent in annotated_text['sentences']]
        pos_tagged_text = [[(word['form'], word['posTag']) for word in sent]
                           for sent in annotated_text['sentences']]
        return self._get_named_entities(ner_text), self._lemmatize(
            pos_tagged_text)

    def annotate(self, doc):
        annotated_doc = self.annotator.annotate(doc)
        return [[
            Token(word['form'], word['nerLabel'], word['posTag'])
            for word in sent
        ] for sent in annotated_doc['sentences']]

    def get_long_tokens(self,
                        annotated_doc,
                        pos_tags=('N', 'Ny', 'Np', 'Nc', 'Y', 'Z', 'A'),
                        min_word_number=2,
                        max_word_count=6):
        eos = Token('.', '.', '.')  # end of sentence
        long_tokens = []
        for sent in annotated_doc:
            sent.append(eos)
            for i, token in enumerate(sent):
                if token.posTag in pos_tags:
                    tokens = [token.form]
                    for next_token in sent[i + 1:]:
                        if next_token.posTag in pos_tags:
                            tokens.append(next_token.form)
                        else:
                            new_long_token = ' '.join(tokens).lower()
                            if len(tokens) >= min_word_number and len(
                                    tokens) <= max_word_count and not any(
                                        p in new_long_token.replace('_', ' ')
                                        for p in popular_phrase_part
                                    ) and not any(new_long_token in p
                                                  for p in long_tokens):
                                long_tokens.append(new_long_token)
                            break
        return long_tokens

    def merge_name_entities(self, annotated_doc):
        remake_doc = [[(token.form, token.nerLabel) for token in sent]
                      for sent in annotated_doc]
        ners = self._get_named_entities(remake_doc)
        new_doc = []
        for sent in annotated_doc:
            raw_sent = ' '.join([token.form for token in sent]).lower()
            pos_tags = [token.posTag for token in sent]
            for ner, _ in ners:
                ner = ner.lower()
                i = raw_sent.find(ner)
                while i > -1 and ner.count(' ') > 0:
                    raw_sent = raw_sent.replace(ner, ner.replace(' ', '_'), 1)
                    i = raw_sent.count(' ', 0, i)
                    pos_tags[i:i + ner.count(' ') + 1] = ['N']
                    i = raw_sent.find(ner)
            new_sent = raw_sent.split(' ')
            if len(new_sent) != len(pos_tags):
                raise Exception('Length mismatch when merging named entities')
            new_doc.append([(new_sent[i], pos_tags[i])
                            for i in range(len(new_sent))])
        return ners, new_doc

    def merge_noun_phrases(self, tokenized_doc, noun_phrases=()):
        new_doc = []
        for sent in tokenized_doc:
            raw_sent = ' '.join([word for word, tag in sent]).lower()
            pos_tags = [tag for word, tag in sent]
            for np in noun_phrases:
                i = raw_sent.replace('_', ' ').find(np.replace('_', ' '))
                while i > -1 and raw_sent[i:i + len(np)].count(' ') > 0:
                    j = raw_sent.count(' ', 0, i)
                    pos_tags[j:j + raw_sent[i:i + len(np)].count(' ') +
                             1] = ['N']
                    raw_sent = raw_sent[:i] + np.replace(
                        ' ', '_') + raw_sent[i + len(np):]
                    i = raw_sent.replace('_', ' ').find(np.replace('_', ' '),
                                                        i + 1)
            new_sent = raw_sent.split()
            if len(new_sent) != len(pos_tags):
                raise Exception('Length mismatch when merging noun phrases')
            new_doc.append([(new_sent[i], pos_tags[i])
                            for i in range(len(new_sent))])
        return new_doc

    def get_most_noun_phrases(self, noun_phrases, threshold=2):
        appearances = {}
        for np in noun_phrases:
            appearances[np] = appearances.get(np, 0) + 1
        return [np for np, app in appearances.items() if app >= threshold]

    def analyse_about(self, about):
        annotated_doc = self.annotate(about)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        phrases = self.get_long_tokens(annotated_doc,
                                       pos_tags=('N', 'Np', 'Nc', 'A', 'V'),
                                       min_word_number=2,
                                       max_word_count=5)
        named_entities, _ = self.merge_name_entities(annotated_doc)
        return noun_phrases, phrases, named_entities

    def analyse_content(self, doc, noun_phrases_in_about):
        annotated_doc = self.annotate(doc)
        named_entities, new_doc = self.merge_name_entities(annotated_doc)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        popular_entity_noun_phrases = [
            p for p in noun_phrases if any(
                p.startswith(popular_prefix)
                for popular_prefix in popular_prefix_named_entity)
        ]
        most_noun_phrases = self.get_most_noun_phrases(noun_phrases)
        merged_doc = self.merge_noun_phrases(
            new_doc,
            noun_phrases=popular_entity_noun_phrases +
            noun_phrases_in_about + most_noun_phrases)
        while len(merged_doc) > 0 and not merged_doc[0]:
            del merged_doc[0]
        return self._lemmatize(merged_doc), noun_phrases, named_entities
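# A minimal sketch of driving the Extractor above; the jar path and sample text are
# illustrative only, and analyse_about() is called the same way the class itself defines it.
if __name__ == '__main__':
    extractor = Extractor(jarfile='VnCoreNLP-1.1.1.jar')
    try:
        about = 'Công ty cổ phần sữa Việt Nam Vinamilk công bố báo cáo tài chính quý 1.'
        noun_phrases, phrases, named_entities = extractor.analyse_about(about)
        print('Noun phrases:', noun_phrases)
        print('Named entities:', named_entities)
    finally:
        extractor.stop()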
    print(result)
    return result


# app, annotator, args, model, inference and test are defined earlier in this module.
@app.route('/tener', methods=['POST'])
def tener():
    text = request.get_data()
    text = text.decode('utf8')
    _text = json.loads(text)
    text = _text['text']
    # print(type(text))

    # Word-segment the input with VnCoreNLP, then flatten the sentences back into one string
    __text = annotator.tokenize(text)
    _text = []
    for sen in __text:
        _text += sen
    text = " ".join(_text)
    print(text)

    if args.inference:
        _result = inference(text, model)
    else:
        _result = test(text, model)

    result = {}
    result['sentence'] = _result
    return jsonify(result)


if __name__ == '__main__':
    app.run()
    # app.run() blocks until the server stops; close the annotator on shutdown
    annotator.close()
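# A small client-side sketch for calling the /tener endpoint above. It assumes the
# Flask app is running on its default address (http://127.0.0.1:5000) and only
# illustrates the expected JSON payload shape {"text": ...}; run it as a separate script.
#
# import json
# import requests
#
# resp = requests.post('http://127.0.0.1:5000/tener',
#                      data=json.dumps({'text': 'Hà Nội là thủ đô của Việt Nam.'}))
# print(resp.json()['sentence'])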