Code example #1
from vncorenlp import VnCoreNLP


def simple_usage():
    # Uncomment this line for debugging (also add `import logging` at the top)
    # logging.basicConfig(level=logging.DEBUG)

    # Use a raw string so the Windows backslashes are not treated as escapes
    vncorenlp_file = r'D:\study\PlagismDetector\PlagismDetector/VnCoreNLP/VnCoreNLP-1.1.1.jar'
    
    sentences = 'VTV đồng ý chia sẻ bản quyền World Cup 2018 cho HTV để khai thác. ' \
                'Nhưng cả hai nhà đài đều phải chờ sự đồng ý của FIFA mới thực hiện được điều này.'

    # Use "with ... as" to close the server automatically
    with VnCoreNLP(vncorenlp_file) as vncorenlp:
        print('Tokenizing:', vncorenlp.tokenize(sentences))
        print('POS Tagging:', vncorenlp.pos_tag(sentences))
        print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
        print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
        print('Annotating:', vncorenlp.annotate(sentences))
        print('Language:', vncorenlp.detect_language(sentences))

    # In this way, you have to close the server manually by calling close function
    vncorenlp = VnCoreNLP(vncorenlp_file)

    print('Tokenizing:', vncorenlp.tokenize(sentences))
    print('POS Tagging:', vncorenlp.pos_tag(sentences))
    print('Named-Entity Recognizing:', vncorenlp.ner(sentences))
    print('Dependency Parsing:', vncorenlp.dep_parse(sentences))
    print('Annotating:', vncorenlp.annotate(sentences))
    print('Language:', vncorenlp.detect_language(sentences))

    # Do not forget to close the server
    vncorenlp.close()
Code example #2
File: test.py Project: toanloi2569/SIFRank4VN
# Project-local modules (word_emb_phoBert, sent_emb_sif, SIFRank, SIFRank_plus) come
# from the SIFRank4VN repository; VnCoreNLP is provided by the vncorenlp package.
class SIFRank4VN():
    def __init__(self):
        # path = os.path.dirname(os.path.realpath('__file__'))
        self.vncorenlp = VnCoreNLP(
            "auxiliary_data/VnCoreNLP-master/VnCoreNLP-1.1.1.jar",
            annotators="wseg, pos",
            max_heap_size='-Xmx500m')
        self.phoBERT = word_emb_phoBert.WordEmbeddings()
        self.SIF = sent_emb_sif.SentEmbeddings(self.phoBERT,
                                               lamda=1.0,
                                               embeddings_type='bert')

    def sifrank_extract(self, text, nphrase=15, ratio=0.6):
        keyphrases = SIFRank(text,
                             self.SIF,
                             self.vncorenlp,
                             N=nphrase,
                             ratio=ratio)
        return keyphrases

    def sifrank_plus_extract(self, text, nphrase=15, ratio=0.6):
        keyphrases = SIFRank_plus(text,
                                  self.SIF,
                                  self.vncorenlp,
                                  N=nphrase,
                                  ratio=ratio)
        return keyphrases

    def close_vncorenlp(self):
        self.vncorenlp.close()
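
A minimal usage sketch for the class above (hypothetical, not part of the project): the sample sentence is a placeholder and the relative paths are assumed to match the SIFRank4VN repository layout.

# Hypothetical usage of SIFRank4VN; sample text is a placeholder
extractor = SIFRank4VN()
try:
    text = "Dịch vụ thanh toán điện tử đang phát triển nhanh tại Việt Nam."
    print(extractor.sifrank_extract(text, nphrase=10))
    print(extractor.sifrank_plus_extract(text, nphrase=10))
finally:
    extractor.close_vncorenlp()  # always shut down the VnCoreNLP server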
Code example #3
import re

import pandas as pd
from vncorenlp import VnCoreNLP


def nlp_tokenize(path):
    data = pd.read_excel(path)
    data = data[['ID', 'Content', 'ID người đăng']]
    data = data.dropna()
    data['Content'] = data['Content'].str.strip()
    data['Content'] = data['Content'].str.lower()
    data['status'] = data['Content']

    # Strip non-word characters; use a raw-string regex and whole-column assignment
    # instead of chained .iloc writes, which trigger SettingWithCopy warnings
    data['status'] = data['Content'].apply(lambda s: re.sub(r'\W+', ' ', s))
    data['Content'] = data['status']

    vncorenlp_file = r'VnCoreNLP/VnCoreNLP-1.1.1.jar'
    vncorenlp = VnCoreNLP(vncorenlp_file)
    # content = vncorenlp.tokenize(content)
    # tokenize() returns a list of word-segmented sentences for each post
    data['status'] = data['status'].apply(vncorenlp.tokenize)
    key_word = []
    for i in data['status']:
        key_word = key_word + i

    vncorenlp.close()
    return key_word, data[['Content', 'ID', 'ID người đăng']]
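
A sketch of how the function might be called; the Excel file name is hypothetical and must contain the 'ID', 'Content' and 'ID người đăng' columns the function selects.

# Hypothetical input file with the required columns
key_words, posts = nlp_tokenize('posts.xlsx')
print(len(key_words), 'tokenized sentences collected')
print(posts.head())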
Code example #4
import csv
import os
import sys

from vncorenlp import VnCoreNLP

sys.path.insert(1, os.getcwd() + "/PreprocessingComponent")

from PreprocessingComponent.pdfminer3 import Pdf_extract

if ("PreprocessingComponent" in os.getcwd()):
    vncorenlp_file = os.getcwd() + '/VnCoreNLP/VnCoreNLP-1.1.1.jar'
else:
    vncorenlp_file = (os.getcwd() +
                      '/PreprocessingComponent/VnCoreNLP/VnCoreNLP-1.1.1.jar')
vncorenlp = VnCoreNLP(vncorenlp_file,
                      annotators="wseg,pos,ner,parse",
                      max_heap_size='-Xmx4g',
                      port=6000)
# annotator = VnCoreNLP(vncorenlp_file, annotators="wseg,pos,ner,parse", max_heap_size='-Xmx2g',port=6000)
"""def previous_and_next(some_iterable):
    prevs, items, nexts = tee(some_iterable, 3)
    prevs = chain([None], prevs)
    nexts = chain(islice(nexts, 1, None), [None])
    return zip(prevs, items, nexts)


# NOTE:
# the input of each processing function is a file name; the output of the ...2txt
# functions is a list of sentences.
# each element of the outer list (one sentence) is an inner list of the words
# segmented by VnCoreNLP.

#### ------------------------------------ helper functions for the docx2txt function
# Get paragraph string. Input is paragraph element
def para_string(para):
    string = ""
    # if (str(para)[21:34] not in str(wp_tbl)):# and (str(para)[21:34] not in str(wp_txbx)):
Code example #5
    # res.append([os.path.basename(filename), b, num_word])  # filename, list of sentences, word count of each sentence
    # print("Run time of file ", filename, " is: --- %s seconds ---" % (time.time() - start_time))
    return pos_tag, os.path.basename(filename), b, num_word


def rtf2txt(filename):
    # Open the file passed in (the original opened a hard-coded "yourfile.rtf")
    with open(filename) as infile:
        for line in infile:
            print(line)


if __name__ == '__main__':
    list_filename = [
        "docFile_test/bacho1.docx",
        "docFile_test/sample1.doc",
        "docFile_test/sacvui.pdf",
        "docFile_test/bacho.docx",
    ]
    for filename in list_filename:
        start_time = time.time()  # reset per file so the elapsed time is per file
        a = preprocess(filename)
        print(a)
        print("\n\nRun time of file ", filename,
              " is: --- %s seconds ---" % (time.time() - start_time))
    # print("Tên file là: ", a)
    # print("\n Danh sách các câu của file là: ", b)
    # print("\n Danh sách số từ của file là: ", c)

vncorenlp.close()
Code example #6
# Assumes project-level definitions not shown here: a Token class and the constants
# NER_TAGS, wrong_entity, popular_phrase_part and popular_prefix_named_entity.
class Extractor:
    def __init__(self, jarfile='VnCoreNLP-1.1.1.jar'):
        print('Init VnCoreNLP Annotator...')
        self.annotator = VnCoreNLP(jarfile,
                                   annotators="wseg,pos,ner,parse",
                                   max_heap_size='-Xmx2g')

    def stop(self):
        self.annotator.close()

    def _pos_tagging(self, text):
        pos_tagged_text = self.annotator.pos_tag(text)
        return pos_tagged_text

    def _ner(self, text):
        ner_text = self.annotator.ner(text)
        return ner_text

    def _lemmatize(self, doc, allowed_postags=('N', 'Np', 'V')):
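        # Despite the name, this keeps every word; words whose POS tag is outside
        # allowed_postags are only recorded in the `ignores` set.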
        sentences = []
        ignores = set()
        for sent in doc:
            new_sent = []
            for word, tag in sent:
                new_sent.append(word)
                if tag not in allowed_postags:
                    ignores.add(word)
            sentences.append(new_sent)
        return sentences, ignores

    def _get_named_entities(self, text):
        endline = ('.', 'O')
        old_tag = ''
        entity_segments = []
        entities = []

        for sent in text:
            sent.append(endline)
            for word, tag in sent:
                # not a segment of a named entity
                if len(tag) < 3 or tag[-3:] not in NER_TAGS:
                    if entity_segments:
                        entity = ' '.join(entity_segments)
                        if (entity, old_tag) not in entities and not any(
                                p in entity.lower() for p in wrong_entity):
                            entities.append((entity, old_tag))
                        entity_segments = []
                        old_tag = ''
                    continue

                # is a segment of a named entity
                tag = tag[-3:]
                if tag != old_tag:
                    if entity_segments:
                        entity = ' '.join(entity_segments)
                        if (entity, old_tag) not in entities and not any(
                                p in entity.lower() for p in wrong_entity):
                            entities.append((entity, old_tag))
                        entity_segments = []

                old_tag = tag
                entity_segments.append(word)

        return entities

    def extract(self, text):
        annotated_text = self.annotator.annotate(text)
        ner_text = [[(word['form'], word['nerLabel']) for word in sent]
                    for sent in annotated_text['sentences']]
        pos_tagged_text = [[(word['form'], word['posTag']) for word in sent]
                           for sent in annotated_text['sentences']]
        return self._get_named_entities(ner_text), self._lemmatize(
            pos_tagged_text)

    def annotate(self, doc):
        annotated_doc = self.annotator.annotate(doc)
        return [[
            Token(word['form'], word['nerLabel'], word['posTag'])
            for word in sent
        ] for sent in annotated_doc['sentences']]

    def get_long_tokens(self,
                        annotated_doc,
                        pos_tags=('N', 'Ny', 'Np', 'Nc', 'Y', 'Z', 'A'),
                        min_word_number=2,
                        max_word_count=6):
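        # Rough phrase chunker: collect runs of consecutive tokens whose POS tags all
        # belong to pos_tags, keeping runs of min_word_number..max_word_count words.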
        eos = Token('.', '.', '.')  # end of sentence
        long_tokens = []
        for sent in annotated_doc:
            sent.append(eos)
            for i, token in enumerate(sent):
                if token.posTag in pos_tags:
                    tokens = [token.form]
                    for next_token in sent[i + 1:]:
                        if next_token.posTag in pos_tags:
                            tokens.append(next_token.form)
                        else:
                            new_long_token = ' '.join(tokens).lower()
                            if len(tokens) >= min_word_number and len(
                                    tokens) <= max_word_count and not any(
                                        p in new_long_token.replace('_', ' ')
                                        for p in popular_phrase_part
                                    ) and not any(new_long_token in p
                                                  for p in long_tokens):
                                long_tokens.append(new_long_token)
                            break
        return long_tokens

    def merge_name_entities(self, annotated_doc):
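        # Rejoin multi-word named entities into single underscore-joined tokens and
        # collapse their POS tags to a single 'N'.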
        remake_doc = [[(token.form, token.nerLabel) for token in sent]
                      for sent in annotated_doc]
        ners = self._get_named_entities(remake_doc)
        new_doc = []
        for sent in annotated_doc:
            raw_sent = ' '.join([token.form for token in sent]).lower()
            pos_tags = [token.posTag for token in sent]
            for ner, _ in ners:
                ner = ner.lower()
                i = raw_sent.find(ner)
                while i > -1 and ner.count(' ') > 0:
                    raw_sent = raw_sent.replace(ner, ner.replace(' ', '_'), 1)
                    i = raw_sent.count(' ', 0, i)
                    pos_tags[i:i + ner.count(' ') + 1] = ['N']
                    i = raw_sent.find(ner)

            new_sent = raw_sent.split(' ')
            if len(new_sent) != len(pos_tags):
                raise Exception('Token/POS mismatch while merging named entities')
            new_doc.append([(new_sent[i], pos_tags[i])
                            for i in range(len(new_sent))])
        return ners, new_doc

    def merge_noun_phrases(self, tokenized_doc, noun_phrases=()):
        new_doc = []
        for sent in tokenized_doc:
            raw_sent = ' '.join([word for word, tag in sent]).lower()
            pos_tags = [tag for word, tag in sent]
            for np in noun_phrases:
                i = raw_sent.replace('_', ' ').find(np.replace('_', ' '))
                while i > -1 and raw_sent[i:i + len(np)].count(' ') > 0:
                    j = raw_sent.count(' ', 0, i)
                    pos_tags[j:j + raw_sent[i:i + len(np)].count(' ') +
                             1] = ['N']
                    raw_sent = raw_sent[:i] + np.replace(
                        ' ', '_') + raw_sent[i + len(np):]
                    i = raw_sent.replace('_',
                                         ' ').find(np.replace('_', ' '), i + 1)

            new_sent = raw_sent.split()
            if len(new_sent) != len(pos_tags):
                raise Exception('Token/POS mismatch while merging noun phrases')
            new_doc.append([(new_sent[i], pos_tags[i])
                            for i in range(len(new_sent))])
        return new_doc

    def get_most_noun_phrases(self, noun_phrases, threshold=2):
        appearances = {}
        for np in noun_phrases:
            appearances[np] = appearances.get(np, 0) + 1
        return [np for np, app in appearances.items() if app >= threshold]

    def analyse_about(self, about):
        annotated_doc = self.annotate(about)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        phrases = self.get_long_tokens(annotated_doc,
                                       pos_tags=('N', 'Np', 'Nc', 'A', 'V'),
                                       min_word_number=2,
                                       max_word_count=5)
        named_entities, _ = self.merge_name_entities(annotated_doc)
        return noun_phrases, phrases, named_entities

    def analyse_content(self, doc, noun_phrases_in_about):
        annotated_doc = self.annotate(doc)
        named_entities, new_doc = self.merge_name_entities(annotated_doc)
        noun_phrases = self.get_long_tokens(annotated_doc,
                                            min_word_number=2,
                                            max_word_count=4)
        popular_entity_noun_phrases = [
            p for p in noun_phrases if any(
                p.startswith(popular_prefix)
                for popular_prefix in popular_prefix_named_entity)
        ]
        most_noun_phrases = self.get_most_noun_phrases(noun_phrases)
        merged_doc = self.merge_noun_phrases(
            new_doc,
            noun_phrases=popular_entity_noun_phrases + noun_phrases_in_about +
            most_noun_phrases)
        while len(merged_doc) > 0 and not merged_doc[0]:
            del merged_doc[0]
        return self._lemmatize(merged_doc), noun_phrases, named_entities
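
A minimal usage sketch for the Extractor class (hypothetical; it assumes the project-level constants mentioned above are defined and that the VnCoreNLP jar sits at the given path):

# Hypothetical usage; the sample sentence is a placeholder
extractor = Extractor(jarfile='VnCoreNLP-1.1.1.jar')
try:
    doc = 'Thủ tướng Nguyễn Xuân Phúc thăm chính thức Nhật Bản.'
    entities, (sentences, ignored) = extractor.extract(doc)
    print('Named entities:', entities)
    print('Sentences kept for further processing:', sentences)
finally:
    extractor.stop()  # always release the VnCoreNLP server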
Code example #7
    print(result)
    return result


@app.route('/tener', methods=['POST'])
def tener():
    text = request.get_data()
    text = text.decode('utf8')
    _text = json.loads(text)
    text = _text['text']
    # print(type(text))
    __text = annotator.tokenize(text)
    _text = []
    for sen in __text:
        _text += sen
    text = " ".join(_text)
    print(text)
    if args.inference:
        _result = inference(text, model)
    else:
        _result = test(text, model)
    result = {}
    result['sentence'] = _result
    return jsonify(result)


if __name__ == '__main__':
    app.run()

annotator.close()
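
A client-side sketch for calling the /tener route above (hypothetical; the host and port are Flask's defaults, and the payload key matches the 'text' field the handler reads):

import requests  # hypothetical client script, not part of the service

resp = requests.post('http://127.0.0.1:5000/tener',
                     json={'text': 'Hà Nội là thủ đô của Việt Nam.'})
print(resp.json()['sentence'])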