Example #1
from flashtext import KeywordProcessor
from spacy.tokens import Doc, Span, Token


class Entity(object):

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None, label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity is None:
                # char_span() returns None when the match does not align with
                # token boundaries; skip those matches
                continue
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc)) for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
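The class above wires flashtext matches into spaCy's extension attributes. A minimal usage sketch, assuming spaCy 2.x (where a component instance can be passed straight to nlp.add_pipe); the keywords and label are illustrative:

# Minimal pipeline sketch (spaCy 2.x assumed; keywords and label are illustrative)
import spacy

nlp = spacy.blank('en')
entity = Entity(nlp, keywords_list=['java', 'product management'], label='SKILL')
nlp.add_pipe(entity, last=True)

doc = nlp('I know java and product management')
print(doc._.has_entities)  # True
print(doc._.entities)      # [(text, token_index, description), ...]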
Example #2
 def test_file_format_two(self):
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
     sentence = 'I know java and product management'
     keywords_extracted = keyword_processor.extract_keywords(sentence)
     self.assertEqual(keywords_extracted, ['java', 'product management'],
                      "Failed file format two test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format two test")
Example #3
 def test_file_format_one_first_occ(self):
     keyword_processor = KeywordProcessor()
     keyword_processor.add_keyword_from_file('test/keywords_format_one.txt')
     sentence = 'I know java_2e and product management techniques'
     keywords_extracted = keyword_processor.extract_keywords(
         sentence, stop_at_first_occ=True)
     self.assertEqual(keywords_extracted, ['java'],
                      "Failed file format one test")
     sentence_new = keyword_processor.replace_keywords(sentence)
     self.assertEqual(sentence_new, "I know java and product management",
                      "Failed file format one test")
Example #4
class Entity(object):

    name = 'entity'

    def __init__(self,
                 nlp,
                 keywords_list=[],
                 keywords_dict={},
                 keywords_file=None,
                 label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc',
                        'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        # Add attributes
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text,
                                                          span_info=True)
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            # char_span() returns None when the match does not align with token
            # boundaries, so only keep valid spans
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                # Extend doc.ents with the new entity instead of replacing the
                # existing entities
                doc.ents = list(doc.ents) + [entity]
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
Example #5
 def test_add_keyword_file_missing(self):
     keyword_processor = KeywordProcessor()
     with pytest.raises(IOError):
         keyword_processor.add_keyword_from_file('missing_file')
Example #6
import argparse

from flashtext import KeywordProcessor

my_parser = argparse.ArgumentParser()
my_parser.add_argument('--file',
                       action='store',
                       default=0,
                       type=str,
                       required=True)
my_parser.add_argument('--old-file', action='store', type=str)
my_parser.add_argument('--vowels', action='store', default=0, type=int)
my_parser.add_argument('--max-size', action='store', default=10, type=int)
my_parser.add_argument('--keywords', action='store', type=str)
args = my_parser.parse_args()

# setup keyword processor
keywords = KeywordProcessor(case_sensitive=False)
if args.keywords:
    keywords.add_keyword_from_file(args.keywords)


def count_vowels(sentence):
    count = 0
    for letter in sentence:
        if letter in ['a', 'e', 'i', 'o', 'u']:
            count += 1
    return count


domains = []
with open(args.file) as infile:
    for domain in infile:
        domains.append(domain.rstrip())
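The snippet above stops after reading the domain list; nothing is actually filtered yet. Purely as a guess at the script's intent (not part of the original), a continuation might filter the domains using the parsed thresholds and the keyword processor:

# Hypothetical continuation, not in the original script
for domain in domains:
    if args.max_size and len(domain) > args.max_size:
        continue  # too long
    if args.vowels and count_vowels(domain) > args.vowels:
        continue  # too many vowels
    if args.keywords and keywords.extract_keywords(domain):
        continue  # contains a keyword from the supplied file
    print(domain)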
Example #7
class Entity(object):

    name = 'entity'

    def __init__(self,
                 keywords_list=[],
                 keywords_dict={},
                 keywords_file=None,
                 label='',
                 attrs=('has_entities', 'is_entity', 'entity_desc',
                        'entities')):
        """Initialise the pipeline component.
        """
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs

        # Set up the KeywordProcessor
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        # Register attribute on the Doc and Span
        Doc.set_extension(self._has_entities,
                          getter=self.has_entities,
                          force=True)
        Doc.set_extension(self._entities,
                          getter=self.iter_entities,
                          force=True)
        Span.set_extension(self._has_entities,
                           getter=self.has_entities,
                           force=True)
        Span.set_extension(self._entities,
                           getter=self.iter_entities,
                           force=True)

        # Register attribute on the Token.
        Token.set_extension(self._is_entity, default=False, force=True)
        Token.set_extension(self._entity_desc,
                            getter=self.get_entity_desc,
                            force=True)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.keyword_processor.extract_keywords(doc.text,
                                                          span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            # Generate a Span representing the entity and set its label.
            # doc.char_span() is used instead of Span() because the keyword processor
            # returns character offsets, not token indices.
            entity = doc.char_span(start, end, label=self.label)
            # char_span() returns None when the match does not align with token
            # boundaries; skip those matches entirely
            if entity is None:
                continue
            # Set the custom attribute on each token of the entity
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            # Extend doc.ents with the new entity (append, do not replace existing ones)
            doc.ents = list(doc.ents) + [entity]

        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()

        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
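A brief usage sketch for this variant (again assuming spaCy 2.x; the keyword dictionary simply maps a clean name to the surface forms flashtext should match and is illustrative):

# Usage sketch (spaCy 2.x assumed; keywords are illustrative)
import spacy

nlp = spacy.blank('en')
nlp.add_pipe(Entity(keywords_dict={'java': ['java', 'java_2e']}, label='SKILL'))

doc = nlp('I know java_2e quite well')
for token in doc:
    if token._.is_entity:
        print(token.text, token._.entity_desc)
print(doc._.has_entities)  # True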
Example #8
# -*- coding: utf-8 -*-
__author__ = 'xujian'

import jieba.posseg as pseg
from jieba import analyse, load_userdict
from app.util.pre_model import RESOURCES_NET_KEYS, USER_STOP, INDEX_SX_APP_CONTNAME_SET
from flashtext import KeywordProcessor
from app.common.config import W2V_VOCABULARY_PATH, ALL_STAR_NAME_PATH, INDEX_SX_APP_CONTENT_NAME_PATH, USER_DIC_PATH
import re

# Load keywords into the processor
resources_net_processor = KeywordProcessor()
resources_net_processor.add_keywords_from_list(RESOURCES_NET_KEYS)
resources_net_processor.add_keywords_from_list(list(INDEX_SX_APP_CONTNAME_SET))
resources_net_processor.add_keyword_from_file(USER_DIC_PATH)

# Load user dictionaries into jieba
load_userdict(W2V_VOCABULARY_PATH)
load_userdict(ALL_STAR_NAME_PATH)
load_userdict(INDEX_SX_APP_CONTENT_NAME_PATH)
load_userdict(USER_DIC_PATH)


# Word segmentation and POS tagging; each word and its POS tag form a tuple
def postagger(sentence):
    pos_data = pseg.cut(sentence)
    pos_list = []
    for w in pos_data:
        if w.word.strip() == '':
            continue
        # sym2name() maps the jieba POS flag to a readable name; it is defined
        # elsewhere in the original module
        pos_list.append((w.word, sym2name(w.flag)))
    return pos_list
Example #9
from flashtext import KeywordProcessor


def flashtext_init(keyword_path):
    kp = KeywordProcessor()
    kp.add_keyword_from_file(keyword_path)
    return kp
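Usage is then just (the path is illustrative):

kp = flashtext_init('keywords.txt')  # hypothetical keyword file
print(kp.extract_keywords('some text that mentions the keywords'))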