def simple_usage(vncorenlp_file=r'D:\study\PlagismDetector\PlagismDetector/VnCoreNLP/VnCoreNLP-1.1.1.jar'):
    """Demonstrate both ways of driving a VnCoreNLP server.

    The sample text is run through every annotator twice: first with the
    client used as a context manager (the server is closed automatically),
    then with a manually managed client that is closed explicitly.

    Args:
        vncorenlp_file: Path to the VnCoreNLP jar file. The default is
            written as a raw string so that the Windows path separators
            are never interpreted as escape sequences.
    """
    # Uncomment this line for debugging
    # logging.basicConfig(level=logging.DEBUG)
    sentences = ('VTV đồng ý chia sẻ bản quyền World Cup 2018 cho HTV để khai thác. '
                 'Nhưng cả hai nhà đài đều phải chờ sự đồng ý của FIFA mới thực hiện được điều này.')

    def run_all(client):
        # Print the output of every annotator for the sample text.
        print('Tokenizing:', client.tokenize(sentences))
        print('POS Tagging:', client.pos_tag(sentences))
        print('Named-Entity Recognizing:', client.ner(sentences))
        print('Dependency Parsing:', client.dep_parse(sentences))
        print('Annotating:', client.annotate(sentences))
        print('Language:', client.detect_language(sentences))

    # Use "with ... as" to close the server automatically
    with VnCoreNLP(vncorenlp_file) as vncorenlp:
        run_all(vncorenlp)

    # In this way, you have to close the server manually by calling close function
    vncorenlp = VnCoreNLP(vncorenlp_file)
    try:
        run_all(vncorenlp)
    finally:
        # Do not forget to close the server, even when an annotator call fails
        vncorenlp.close()
class Extractor:
    """Extract named entities and noun phrases from text via VnCoreNLP.

    The instance owns a ``VnCoreNLP`` annotator; call :meth:`stop` (or use
    the instance as a context manager) to close it when done.
    """

    def __init__(self, jarfile='VnCoreNLP-1.1.1.jar'):
        """Create the underlying VnCoreNLP annotator.

        Args:
            jarfile: Path to the VnCoreNLP jar file.
        """
        print('Init VnCoreNLP Annotator...')
        self.annotator = VnCoreNLP(jarfile,
                                   annotators="wseg,pos,ner,parse",
                                   max_heap_size='-Xmx2g')

    def __enter__(self):
        """Return the extractor itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the annotator on leaving a ``with`` block; never suppresses errors."""
        self.stop()

    def stop(self):
        """Close the underlying annotator."""
        self.annotator.close()

    def _pos_tagging(self, text):
        """Return the annotator's POS tagging of ``text``."""
        return self.annotator.pos_tag(text)

    def _ner(self, text):
        """Return the annotator's named-entity tagging of ``text``."""
        return self.annotator.ner(text)

    def _lemmatize(self, doc, allowed_postags=('N', 'Np', 'V')):
        """Split tagged sentences into words and collect the words to ignore.

        Args:
            doc: Iterable of sentences, each an iterable of ``(word, tag)``
                pairs.
            allowed_postags: Tags whose words are kept; every word carrying
                any other tag is added to the ignore set.

        Returns:
            A ``(sentences, ignores)`` tuple: ``sentences`` is a list of word
            lists (all words, in order) and ``ignores`` is the set of words
            that appeared at least once with a tag outside
            ``allowed_postags``.
        """
        sentences = []
        ignores = set()
        for sent in doc:
            words = []
            for word, tag in sent:
                words.append(word)
                if tag not in allowed_postags:
                    ignores.add(word)
            sentences.append(words)
        return sentences, ignores

    @staticmethod
    def _record_entity(segments, tag, entities, seen):
        """Append the entity built from ``segments`` unless duplicate or rejected."""
        entity = ' '.join(segments)
        key = (entity, tag)
        if key in seen:
            return
        lowered = entity.lower()
        if any(p in lowered for p in wrong_entity):
            return
        seen.add(key)
        entities.append(key)

    def _get_named_entities(self, text):
        """Collect the distinct named entities found in NER-tagged sentences.

        Consecutive words whose tag ends with the same three-character label
        from ``NER_TAGS`` are joined with spaces into one entity. Entities
        containing any ``wrong_entity`` fragment (case-insensitively) are
        dropped.

        Args:
            text: Iterable of sentences, each an iterable of ``(word, tag)``
                pairs. The sentences are not modified.

        Returns:
            A list of unique ``(entity, label)`` tuples in order of first
            appearance.
        """
        endline = ('.', 'O')
        entities = []
        seen = set()
        segments = []
        current_tag = ''
        for sent in text:
            # Work on a padded copy: the trailing marker flushes an entity
            # that runs up to the end of the sentence without mutating the
            # caller's list.
            for word, tag in list(sent) + [endline]:
                # not a segment of a named entity
                if len(tag) < 3 or tag[-3:] not in NER_TAGS:
                    if segments:
                        self._record_entity(segments, current_tag, entities,
                                            seen)
                        segments = []
                        current_tag = ''
                    continue
                # is a segment of a named entity
                label = tag[-3:]
                if label != current_tag:
                    if segments:
                        self._record_entity(segments, current_tag, entities,
                                            seen)
                        segments = []
                    current_tag = label
                segments.append(word)
        return entities

    def extract(self, text):
        """Annotate ``text`` and return its named entities and word lists.

        Returns:
            A tuple ``(entities, (sentences, ignores))`` combining the
            results of :meth:`_get_named_entities` and :meth:`_lemmatize`.
        """
        annotated_text = self.annotator.annotate(text)
        sentences = annotated_text['sentences']
        ner_text = [[(word['form'], word['nerLabel']) for word in sent]
                    for sent in sentences]
        pos_tagged_text = [[(word['form'], word['posTag']) for word in sent]
                           for sent in sentences]
        return (self._get_named_entities(ner_text),
                self._lemmatize(pos_tagged_text))

    def annotate(self, doc):
        """Annotate ``doc`` and return its sentences as lists of ``Token``.

        Each token is built from the ``form``, ``nerLabel`` and ``posTag``
        entries of the corresponding annotated word.
        """
        annotated_doc = self.annotator.annotate(doc)
        return [[
            Token(word['form'], word['nerLabel'], word['posTag'])
            for word in sent
        ] for sent in annotated_doc['sentences']]

    def get_long_tokens(self,
                        annotated_doc,
                        pos_tags=('N', 'Ny', 'Np', 'Nc', 'Y', 'Z', 'A'),
                        min_word_number=2,
                        max_word_count=6):
        """Collect multi-word phrases made of consecutive matching tokens.

        Starting from every token whose ``posTag`` is in ``pos_tags``, the
        following tokens are joined (space-separated, lowercased) for as
        long as their tag also matches. A phrase is kept when its word
        count lies within ``[min_word_number, max_word_count]``, it contains
        no ``popular_phrase_part`` fragment (underscores treated as spaces)
        and it is not a substring of a phrase kept earlier.

        Args:
            annotated_doc: Iterable of sentences of tokens exposing ``form``
                and ``posTag``. The sentences are not modified.
            pos_tags: POS tags allowed inside a phrase.
            min_word_number: Minimum number of tokens in a phrase.
            max_word_count: Maximum number of tokens in a phrase.

        Returns:
            The list of kept phrases in order of discovery.
        """
        eos = Token('.', '.', '.')  # end of sentence
        long_tokens = []
        for sent in annotated_doc:
            # Pad a copy instead of appending to the caller's sentence, so
            # repeated calls on the same document do not pile up markers.
            padded = list(sent) + [eos]
            for i, token in enumerate(padded):
                if token.posTag not in pos_tags:
                    continue
                forms = [token.form]
                for next_token in padded[i + 1:]:
                    if next_token.posTag in pos_tags:
                        forms.append(next_token.form)
                        continue
                    # The run ended: decide whether to keep it, then stop.
                    candidate = ' '.join(forms).lower()
                    if (min_word_number <= len(forms) <= max_word_count
                            and not any(
                                p in candidate.replace('_', ' ')
                                for p in popular_phrase_part)
                            and not any(candidate in kept
                                        for kept in long_tokens)):
                        long_tokens.append(candidate)
                    break
        return long_tokens

    def merge_name_entities(self, annotated_doc):
        """Join multi-word named entities into single ``'N'``-tagged tokens.

        Every sentence is lowercased; each occurrence of a multi-word named
        entity has its spaces replaced by underscores and its POS tags
        collapsed into a single ``'N'``.

        Args:
            annotated_doc: Iterable of sentences of tokens exposing ``form``,
                ``nerLabel`` and ``posTag``.

        Returns:
            A tuple ``(ners, new_doc)``: the entities returned by
            :meth:`_get_named_entities` and the rewritten document as lists
            of ``(word, tag)`` tuples. Empty sentences are kept as empty
            lists.

        Raises:
            Exception: If the merged words and tags of a sentence end up
                with different lengths.
        """
        remake_doc = [[(token.form, token.nerLabel) for token in sent]
                      for sent in annotated_doc]
        ners = self._get_named_entities(remake_doc)
        new_doc = []
        for sent in annotated_doc:
            if not sent:
                # ''.split(' ') yields [''], which would trip the length
                # check below; an empty sentence simply stays empty.
                new_doc.append([])
                continue
            raw_sent = ' '.join(token.form for token in sent).lower()
            pos_tags = [token.posTag for token in sent]
            for ner, _ in ners:
                ner = ner.lower()
                gaps = ner.count(' ')
                if not gaps:
                    # Single-word entities need no merging.
                    continue
                joined = ner.replace(' ', '_')
                i = raw_sent.find(ner)
                while i > -1:
                    raw_sent = raw_sent.replace(ner, joined, 1)
                    # Number of spaces before the match == index of its
                    # first word in the tag list.
                    first = raw_sent.count(' ', 0, i)
                    pos_tags[first:first + gaps + 1] = ['N']
                    i = raw_sent.find(ner)
            new_sent = raw_sent.split(' ')
            if len(new_sent) != len(pos_tags):
                raise Exception('Wrong went merge NE')
            new_doc.append(list(zip(new_sent, pos_tags)))
        return ners, new_doc

    def merge_noun_phrases(self, tokenized_doc, noun_phrases=()):
        """Join occurrences of the given phrases into ``'N'``-tagged tokens.

        Matching ignores the difference between spaces and underscores.
        Each matched span that still contains a space is replaced by the
        phrase with underscores, and its tags collapse into one ``'N'``.
        For a given phrase the search stops at the first occurrence that
        is already fully joined.

        Args:
            tokenized_doc: Iterable of sentences of ``(word, tag)`` pairs.
            noun_phrases: Phrases to merge.

        Returns:
            The rewritten document as lists of ``(word, tag)`` tuples, with
            all words lowercased.

        Raises:
            Exception: If the merged words and tags of a sentence end up
                with different lengths.
        """
        new_doc = []
        for sent in tokenized_doc:
            raw_sent = ' '.join(word for word, _ in sent).lower()
            pos_tags = [tag for _, tag in sent]
            for phrase in noun_phrases:
                spaced = phrase.replace('_', ' ')
                joined = phrase.replace(' ', '_')
                width = len(phrase)
                i = raw_sent.replace('_', ' ').find(spaced)
                while i > -1:
                    gaps = raw_sent[i:i + width].count(' ')
                    if gaps == 0:
                        break
                    # Spaces before the match give the index of its first
                    # word in the tag list.
                    j = raw_sent.count(' ', 0, i)
                    pos_tags[j:j + gaps + 1] = ['N']
                    raw_sent = raw_sent[:i] + joined + raw_sent[i + width:]
                    i = raw_sent.replace('_', ' ').find(spaced, i + 1)
            new_sent = raw_sent.split()
            if len(new_sent) != len(pos_tags):
                raise Exception('Wrong went merge NE')
            new_doc.append(list(zip(new_sent, pos_tags)))
        return new_doc

    def get_most_noun_phrases(self, noun_phrases, threshold=2):
        """Return the phrases occurring at least ``threshold`` times.

        The result keeps the order in which the phrases first appear.
        """
        appearances = {}
        for phrase in noun_phrases:
            appearances[phrase] = appearances.get(phrase, 0) + 1
        return [
            phrase for phrase, count in appearances.items()
            if count >= threshold
        ]

    def analyse_about(self, about):
        """Annotate ``about`` and extract its phrases and named entities.

        Returns:
            A tuple ``(noun_phrases, phrases, named_entities)`` where
            ``noun_phrases`` uses the default tag set with 2-4 words,
            ``phrases`` uses the tags ``N, Np, Nc, A, V`` with 2-5 words,
            and ``named_entities`` comes from :meth:`merge_name_entities`.
        """
        annotated_doc = self.annotate(about)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        phrases = self.get_long_tokens(annotated_doc,
                                       pos_tags=('N', 'Np', 'Nc', 'A', 'V'),
                                       min_word_number=2,
                                       max_word_count=5)
        named_entities, _ = self.merge_name_entities(annotated_doc)
        return noun_phrases, phrases, named_entities

    def analyse_content(self, doc, noun_phrases_in_about):
        """Annotate ``doc`` and merge its entities and noun phrases.

        The phrases merged into the document are: those starting with a
        ``popular_prefix_named_entity`` prefix, those passed in
        ``noun_phrases_in_about`` and those occurring at least twice.
        Leading empty sentences are removed from the merged document.

        Args:
            doc: Text to analyse.
            noun_phrases_in_about: Iterable of additional phrases to merge.

        Returns:
            A tuple ``((sentences, ignores), noun_phrases, named_entities)``
            whose first item is the :meth:`_lemmatize` result for the
            merged document.
        """
        annotated_doc = self.annotate(doc)
        named_entities, new_doc = self.merge_name_entities(annotated_doc)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        popular_entity_noun_phrases = [
            p for p in noun_phrases
            if any(
                p.startswith(popular_prefix)
                for popular_prefix in popular_prefix_named_entity)
        ]
        most_noun_phrases = self.get_most_noun_phrases(noun_phrases)
        # Unpacking accepts any iterable for noun_phrases_in_about, not
        # only a list.
        merged_doc = self.merge_noun_phrases(
            new_doc,
            noun_phrases=[
                *popular_entity_noun_phrases,
                *noun_phrases_in_about,
                *most_noun_phrases,
            ])
        while merged_doc and not merged_doc[0]:
            del merged_doc[0]
        return self._lemmatize(merged_doc), noun_phrases, named_entities