from vncorenlp import VnCoreNLP


def simple_usage():
    # Uncomment this line for debugging
    # logging.basicConfig(level=logging.DEBUG)

   
    vncorenlp_file = r'D:\study\PlagismDetector\PlagismDetector/VnCoreNLP/VnCoreNLP-1.1.1.jar'
    
    sentences = 'VTV đồng ý chia sẻ bản quyền World Cup 2018 cho HTV để khai thác. ' \
                'Nhưng cả hai nhà đài đều phải chờ sự đồng ý của FIFA mới thực hiện được điều này.'

    # Use "with ... as" to close the server automatically
    with VnCoreNLP(vncorenlp_file) as vncorenlp:
        print('Tokenizing:', vncorenlp.tokenize(sentences))
        print('POS Tagging:', vncorenlp.pos_tag(sentences))
        print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
        print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
        print('Annotating:', vncorenlp.annotate(sentences))
        print('Language:', vncorenlp.detect_language(sentences))

    # Without "with ... as", you have to close the server manually by calling close()
    vncorenlp = VnCoreNLP(vncorenlp_file)

    print('Tokenizing:', vncorenlp.tokenize(sentences))
    print('POS Tagging:', vncorenlp.pos_tag(sentences))
    print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
    print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
    print('Annotating:', vncorenlp.annotate(sentences))
    print('Language:', vncorenlp.detect_language(sentences))

    # Do not forget to close the server
    vncorenlp.close()
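
# A minimal entry point (not part of the original snippet) so this example can be
# run directly as a script:
if __name__ == '__main__':
    simple_usage()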
Example #2
    def vn_ner(self):
        annotator = VnCoreNLP(address=DEFAULT_LOCAL_ADDRESS,
                              port=DEFAULT_VI_NER_PORT)
        for line in self.textMap.keys():
            taggedText = annotator.annotate(line)
            try:
                taggedText = taggedText['sentences'][0]
                for value in taggedText:
                    if value['nerLabel'] in ['B-PER', 'I-PER']:
                        self.textMap[line][self.PER_KEY] += 1
                    if value['nerLabel'] in ['B-LOC', 'I-LOC']:
                        self.textMap[line][self.LOC_KEY] += 1
                    if value['nerLabel'] in ['B-ORG', 'I-ORG']:
                        self.textMap[line][self.ORG_KEY] += 1
            except Exception as e:
                print("Unable to annotate " + str(line))
                print(e)
                return e
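
# Rough sketch (an assumption, not from the original source) of the state vn_ner()
# relies on: self.textMap maps each raw text line to a dict of entity counters, and
# PER_KEY / LOC_KEY / ORG_KEY are whatever counter keys the surrounding class defines.
example_text_map = {
    'Ông Nguyễn Khắc Chúc làm việc tại Hà Nội.': {'PER': 0, 'LOC': 0, 'ORG': 0},
}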
Example #3
    "/hdd/Zalo_Team_HoaChan/resources/VnCoreNLP/VnCoreNLP-1.1.1.jar",
    annotators="wseg,pos,ner,parse",
    max_heap_size='-Xmx2g')

# To perform word segmentation, POS tagging and then NER
# annotator = VnCoreNLP("<FULL-PATH-to-VnCoreNLP-jar-file>", annotators="wseg,pos,ner", max_heap_size='-Xmx2g')
# To perform word segmentation and then POS tagging
# annotator = VnCoreNLP("<FULL-PATH-to-VnCoreNLP-jar-file>", annotators="wseg,pos", max_heap_size='-Xmx2g')
# To perform word segmentation only
# annotator = VnCoreNLP("<FULL-PATH-to-VnCoreNLP-jar-file>", annotators="wseg", max_heap_size='-Xmx500m')

# Input
text = "Tuy vậy, 1 phút sau, Chelsea đã cụ thể hóa sức ép liên tục bằng bàn thắng. Người lập công cho The Blues là đội trưởng Cahill với cú đánh đầu cận thành từ tình huống phạt góc."

# To perform word segmentation, POS tagging, NER and then dependency parsing
annotated_text = annotator.annotate(text)

# To perform word segmentation only
word_segmented_text = annotator.tokenize(text)

print('annotated_text: ', annotated_text)
print('word_segmented_text: ', word_segmented_text)

# Each token in annotated_text['sentences'] is a dict like this:
{
    'index': 7,
    'form': 'Cahill',
    'posTag': 'Np',
    'nerLabel': 'B-PER',
    'head': 6,
    'depLabel': 'nmod'
}
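
# A short sketch (not from the original) of walking the annotate() output: the
# 'sentences' field is a list of sentences, each a list of token dicts with the
# fields shown above.
for sentence in annotated_text['sentences']:
    for token in sentence:
        print(token['form'], token['posTag'], token['nerLabel'], token['depLabel'])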
Example #4
from vncorenlp import VnCoreNLP
import time

annotator = VnCoreNLP(
    '/home/misa/PycharmProjects/lightning-projects/VnCoreNLP/VnCoreNLP-1.1.jar'
)
text = "Ông Nguyễn Khắc Chúc  đang làm việc tại Đại học Quốc gia Hà Nội. Bà Lan, vợ ông Chúc, cũng làm việc tại đây."
annotated_text = annotator.annotate(text)
word_segmented_text = annotator.tokenize(text)
for i in range(10):
    semi_text = text[:len(text) - i]  # i == 0 keeps the full text; i > 0 trims i characters
    start_time = time.time()
    annotated_text = annotator.annotate(semi_text)
    word_segmented_text = annotator.tokenize(semi_text)
    print(str(time.time() - start_time))
print()
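# A variant of the timing loop above (a sketch, not from the original): collect the
# per-call durations and report the mean instead of printing each one.
timings = []
for i in range(10):
    semi_text = text[:len(text) - i]
    start_time = time.time()
    annotator.annotate(semi_text)
    annotator.tokenize(semi_text)
    timings.append(time.time() - start_time)
print('mean time per iteration: %.4f s' % (sum(timings) / len(timings)))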
Example #5
class TextRank4KeywordVN():
    """Extract keywords from text"""
    
    def __init__(self, stopwords, ngrams=1, window_size=3, candidate_pos=["N", "Np"], num_keywords=5, use_vncorenlp=True):
        self.d = 0.85  # damping coefficient, usually 0.85
        self.min_diff = 1e-5  # convergence threshold
        self.steps = 10  # iteration steps
        self.node_weight = None  # stores keywords and their weights
        self.ngrams = ngrams
        self.window_size = window_size
        self.candidate_pos = candidate_pos
        self.num_keywords = num_keywords
        self.stopwords = stopwords
        self.use_vncorenlp = use_vncorenlp
        if self.use_vncorenlp:
            self.annotator = VnCoreNLP(VNCORENLP_JAR_PATH, annotators="wseg,pos", max_heap_size='-Xmx2g')
    
    # Tokenize a sentence and filter out stopwords
    def filtering_sentence(self, sent, stopwords, keyword, lower=False):
        sent = re.sub(r'[^\w\s]','',sent)
        filtered_words = []
        if lower:
            words = word_tokenize(sent, format="text").split(" ")
            words = [word.lower() for word in words]
        else:
            words = word_tokenize(sent, format="text").split(" ")
        for word in words:
            if word not in stopwords and (keyword not in word) and (word not in keyword) and not (word.isnumeric()) and word not in punctuation:
                try:
                    if detect(word) == "vi":
                        filtered_words.append(word)   
                except Exception:
                    continue 
                # filtered_words.append(word)
        return ' '.join(filtered_words)
    
    # POS-tag the text for the segmentation step
    def pos_tagging_sentence(self, sent):
        if self.use_vncorenlp:
            sent = sent.replace("_", " ")
            temp = self.annotator.annotate(sent)
            return [(element["form"], element["posTag"]) for sent in temp["sentences"] for element in sent]
        else:
            return pos_tag(sent)

    def sentence_segment(self, doc, lower):
        """Store those words only in cadidate_pos"""
        sentences = []
        for sent in doc:
            postag = self.pos_tagging_sentence(sent)
            words = [x[0] for x in postag]
            selected_words = []
            res = []
            for i in range(len(words)):
                # Keep words only with a candidate POS tag
                if postag[i][1] in self.candidate_pos or words[i] in ["trình_duyệt"]:
                    if lower is True:
                        selected_words.append(words[i].lower())
                    else:
                        selected_words.append(words[i])
            if self.ngrams == 1:
                sentences.append(selected_words)
            else:
                for i in range(len(selected_words) - self.ngrams + 1):
                    word = ''
                    for j in range(self.ngrams):
                        word += selected_words[i+j]
                        if j != self.ngrams - 1:
                            word += ' '
                    res.append(word)
                sentences.append(res)
        return sentences
        
    def analyze(self, text, keyword, lower=False):
        """Main function to analyze text"""
        doc = sent_tokenize(text)
        doc = [self.filtering_sentence(sent, self.stopwords, keyword, lower) for sent in doc]

        # Filter sentences
        sentences = self.sentence_segment(doc, lower) # list of list of words
        
        # Build vocabulary
        vocab = get_vocab(sentences)
        
        # Get token_pairs from windows
        token_pairs = get_token_pairs(sentences, self.window_size)
        
        # Get normalized matrix
        g = get_matrix(vocab, token_pairs)
        
        # Initialization of weights (PageRank values)
        pr = np.array([1] * len(vocab))
        
        # Iteration
        previous_pr = 0
        for epoch in range(self.steps):
            pr = (1-self.d) + self.d * np.dot(g, pr)
            if abs(previous_pr - sum(pr)) < self.min_diff:
                break
            else:
                previous_pr = sum(pr)

        # Get weight for each node
        node_weight = dict()
        for word, index in vocab.items():
            node_weight[word] = pr[index]
        
        self.node_weight = node_weight
        return get_keywords(self.node_weight, self.num_keywords)
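
# A minimal usage sketch for TextRank4KeywordVN (assumptions, not from the original:
# VNCORENLP_JAR_PATH and the helpers get_vocab, get_token_pairs, get_matrix and
# get_keywords are defined in the surrounding module; the stopwords and keyword
# below are illustrative only).
sample_stopwords = ['và', 'là', 'của']
tr = TextRank4KeywordVN(sample_stopwords, ngrams=1, window_size=3, num_keywords=5)
keywords = tr.analyze('Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội.',
                      keyword='Hà Nội', lower=True)
print(keywords)
tr.annotator.close()  # shut down the underlying VnCoreNLP server
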
class Extractor:
    def __init__(self, jarfile='VnCoreNLP-1.1.1.jar'):
        print('Init VnCoreNLP Annotator...')
        self.annotator = VnCoreNLP(jarfile,
                                   annotators="wseg,pos,ner,parse",
                                   max_heap_size='-Xmx2g')

    def stop(self):
        self.annotator.close()

    def _pos_tagging(self, text):
        pos_tagged_text = self.annotator.pos_tag(text)
        return pos_tagged_text

    def _ner(self, text):
        ner_text = self.annotator.ner(text)
        return ner_text

    def _lemmatize(self, doc, allowed_postags=('N', 'Np', 'V')):
        sentences = []
        ignores = set()
        for sent in doc:
            new_sent = []
            for word, tag in sent:
                new_sent.append(word)
                if tag not in allowed_postags:
                    ignores.add(word)
            sentences.append(new_sent)
        return sentences, ignores

    def _get_named_entities(self, text):
        endline = ('.', 'O')
        old_tag = ''
        entity_segments = []
        entities = []

        for sent in text:
            sent.append(endline)
            for word, tag in sent:
                # not a segment of a named entity
                if len(tag) < 3 or tag[-3:] not in NER_TAGS:
                    if entity_segments:
                        entity = ' '.join(entity_segments)
                        if (entity, old_tag) not in entities and not any(
                                p in entity.lower() for p in wrong_entity):
                            entities.append((entity, old_tag))
                        entity_segments = []
                        old_tag = ''
                    continue

                # is a segment of a named entity
                tag = tag[-3:]
                if tag != old_tag:
                    if entity_segments:
                        entity = ' '.join(entity_segments)
                        if (entity, old_tag) not in entities and not any(
                                p in entity.lower() for p in wrong_entity):
                            entities.append((entity, old_tag))
                        entity_segments = []

                old_tag = tag
                entity_segments.append(word)

        return entities

    def extract(self, text):
        annotated_text = self.annotator.annotate(text)
        ner_text = [[(word['form'], word['nerLabel']) for word in sent]
                    for sent in annotated_text['sentences']]
        pos_tagged_text = [[(word['form'], word['posTag']) for word in sent]
                           for sent in annotated_text['sentences']]
        return self._get_named_entities(ner_text), self._lemmatize(
            pos_tagged_text)

    def annotate(self, doc):
        annotated_doc = self.annotator.annotate(doc)
        return [[
            Token(word['form'], word['nerLabel'], word['posTag'])
            for word in sent
        ] for sent in annotated_doc['sentences']]

    def get_long_tokens(self,
                        annotated_doc,
                        pos_tags=('N', 'Ny', 'Np', 'Nc', 'Y', 'Z', 'A'),
                        min_word_number=2,
                        max_word_count=6):
        eos = Token('.', '.', '.')  # end of sentence
        long_tokens = []
        for sent in annotated_doc:
            sent.append(eos)
            for i, token in enumerate(sent):
                if token.posTag in pos_tags:
                    tokens = [token.form]
                    for next_token in sent[i + 1:]:
                        if next_token.posTag in pos_tags:
                            tokens.append(next_token.form)
                        else:
                            new_long_token = ' '.join(tokens).lower()
                            if len(tokens) >= min_word_number and len(
                                    tokens) <= max_word_count and not any(
                                        p in new_long_token.replace('_', ' ')
                                        for p in popular_phrase_part
                                    ) and not any(new_long_token in p
                                                  for p in long_tokens):
                                long_tokens.append(new_long_token)
                            break
        return long_tokens

    def merge_name_entities(self, annotated_doc):
        remake_doc = [[(token.form, token.nerLabel) for token in sent]
                      for sent in annotated_doc]
        ners = self._get_named_entities(remake_doc)
        new_doc = []
        for sent in annotated_doc:
            raw_sent = ' '.join([token.form for token in sent]).lower()
            pos_tags = [token.posTag for token in sent]
            for ner, _ in ners:
                ner = ner.lower()
                i = raw_sent.find(ner)
                while i > -1 and ner.count(' ') > 0:
                    raw_sent = raw_sent.replace(ner, ner.replace(' ', '_'), 1)
                    i = raw_sent.count(' ', 0, i)
                    pos_tags[i:i + ner.count(' ') + 1] = ['N']
                    i = raw_sent.find(ner)

            new_sent = raw_sent.split(' ')
            if len(new_sent) != len(pos_tags):
                raise Exception('Something went wrong when merging named entities')
            new_doc.append([(new_sent[i], pos_tags[i])
                            for i in range(len(new_sent))])
        return ners, new_doc

    def merge_noun_phrases(self, tokenized_doc, noun_phrases=()):
        new_doc = []
        for sent in tokenized_doc:
            raw_sent = ' '.join([word for word, tag in sent]).lower()
            pos_tags = [tag for word, tag in sent]
            for np in noun_phrases:
                i = raw_sent.replace('_', ' ').find(np.replace('_', ' '))
                while i > -1 and raw_sent[i:i + len(np)].count(' ') > 0:
                    j = raw_sent.count(' ', 0, i)
                    pos_tags[j:j + raw_sent[i:i + len(np)].count(' ') +
                             1] = ['N']
                    raw_sent = raw_sent[:i] + np.replace(
                        ' ', '_') + raw_sent[i + len(np):]
                    i = raw_sent.replace('_',
                                         ' ').find(np.replace('_', ' '), i + 1)

            new_sent = raw_sent.split()
            if len(new_sent) != len(pos_tags):
                raise Exception('Something went wrong when merging noun phrases')
            new_doc.append([(new_sent[i], pos_tags[i])
                            for i in range(len(new_sent))])
        return new_doc

    def get_most_noun_phrases(self, noun_phrases, threshold=2):
        appearances = {}
        for np in noun_phrases:
            appearances[np] = appearances.get(np, 0) + 1
        return [np for np, app in appearances.items() if app >= threshold]

    def analyse_about(self, about):
        annotated_doc = self.annotate(about)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        phrases = self.get_long_tokens(annotated_doc,
                                       pos_tags=('N', 'Np', 'Nc', 'A', 'V'),
                                       min_word_number=2,
                                       max_word_count=5)
        named_entities, _ = self.merge_name_entities(annotated_doc)
        return noun_phrases, phrases, named_entities

    def analyse_content(self, doc, noun_phrases_in_about):
        annotated_doc = self.annotate(doc)
        named_entities, new_doc = self.merge_name_entities(annotated_doc)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        popular_entity_noun_phrases = [
            p for p in noun_phrases if any(
                p.startswith(popular_prefix)
                for popular_prefix in popular_prefix_named_entity)
        ]
        most_noun_phrases = self.get_most_noun_phrases(noun_phrases)
        merged_doc = self.merge_noun_phrases(
            new_doc,
            noun_phrases=popular_entity_noun_phrases + noun_phrases_in_about +
            most_noun_phrases)
        while len(merged_doc) > 0 and not merged_doc[0]:
            del merged_doc[0]
        return self._lemmatize(merged_doc), noun_phrases, named_entities
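
# A possible usage sketch for Extractor (assumptions, not from the original: the jar
# path and the input text are illustrative, and NER_TAGS, wrong_entity,
# popular_phrase_part and popular_prefix_named_entity are assumed to be defined in
# the surrounding module).
extractor = Extractor(jarfile='VnCoreNLP-1.1.1.jar')
entities, (lemmatized_sentences, ignored_words) = extractor.extract(
    'Ông Nguyễn Khắc Chúc đang làm việc tại Đại học Quốc gia Hà Nội.')
print(entities)              # e.g. [('Nguyễn_Khắc_Chúc', 'PER'), ('Đại_học_Quốc_gia_Hà_Nội', 'ORG')]
print(lemmatized_sentences)  # word-segmented sentences kept by _lemmatize()
extractor.stop()             # shut down the underlying VnCoreNLP server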