from flashtext import KeywordProcessor
from spacy.tokens import Doc, Span, Token


class Entity(object):

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None,
                 label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            # char_span() returns None when the character offsets don't line up
            # with token boundaries; skip those matches instead of crashing
            if entity is None:
                continue
            for token in entity:
                token._.set(self._is_entity, True)
            spans.append(entity)
            doc.ents = list(doc.ents) + [entity]
        for span in spans:
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
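# Why the component above works: with span_info=True, flashtext returns
# (clean_name, start, end) character offsets, which is exactly what
# doc.char_span() expects. A minimal sketch of that return shape; the
# 'Big Apple' -> 'New York' mapping is an illustrative example, not part
# of the component:
from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keyword('Big Apple', 'New York')
print(kp.extract_keywords('I love Big Apple', span_info=True))
# -> [('New York', 7, 16)]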
def test_file_format_two(self):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword_from_file('test/keywords_format_two.txt')
    sentence = 'I know java and product management'
    keywords_extracted = keyword_processor.extract_keywords(sentence)
    self.assertEqual(keywords_extracted, ['java', 'product management'],
                     "Failed file format two test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed file format two test")
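# A "format two" keyword file is assumed to hold one raw keyword per line with
# no clean-name mapping, which is why extraction and replacement both return
# the keywords verbatim above. A self-contained sketch of the same behaviour
# without the fixture file (keyword values copied from the test):
from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keywords_from_list(['java', 'product management'])
print(kp.extract_keywords('I know java and product management'))
# -> ['java', 'product management']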
def test_file_format_one_first_occ(self):
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keyword_from_file('test/keywords_format_one.txt')
    sentence = 'I know java_2e and product management techniques'
    keywords_extracted = keyword_processor.extract_keywords(
        sentence, stop_at_first_occ=True)
    self.assertEqual(keywords_extracted, ['java'],
                     "Failed file format one test")
    sentence_new = keyword_processor.replace_keywords(sentence)
    self.assertEqual(sentence_new, "I know java and product management",
                     "Failed file format one test")
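# A "format one" file is assumed to map each keyword to a clean name via '=>'
# (e.g. 'java_2e=>java'), and stop_at_first_occ=True makes extraction return
# only the first match. The same mapping can be built in code; a sketch
# reproducing the replace step of the test above:
from flashtext import KeywordProcessor

kp = KeywordProcessor()
kp.add_keywords_from_dict({'java': ['java_2e'],
                           'product management': ['product management techniques']})
print(kp.replace_keywords('I know java_2e and product management techniques'))
# -> 'I know java and product management'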
class Entity(object):

    name = 'entity'

    def __init__(self, nlp, keywords_list=[], keywords_dict={}, keywords_file=None,
                 label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        # Add attributes
        Doc.set_extension(self._has_entities, getter=self.has_entities)
        Doc.set_extension(self._entities, getter=self.iter_entities)
        Span.set_extension(self._has_entities, getter=self.has_entities)
        Span.set_extension(self._entities, getter=self.iter_entities)
        Token.set_extension(self._is_entity, default=False)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc)

    def __call__(self, doc):
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        for _, start, end in matches:
            entity = doc.char_span(start, end, label=self.label)
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                # Overwrite doc.ents and add entity – be careful not to replace!
                doc.ents = list(doc.ents) + [entity]
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
def test_add_keyword_file_missing(self):
    keyword_processor = KeywordProcessor()
    with pytest.raises(IOError):
        keyword_processor.add_keyword_from_file('missing_file')
import argparse

from flashtext import KeywordProcessor

my_parser = argparse.ArgumentParser()
my_parser.add_argument('--file', action='store', type=str, required=True)
my_parser.add_argument('--old-file', action='store', type=str)
my_parser.add_argument('--vowels', action='store', default=0, type=int)
my_parser.add_argument('--max-size', action='store', default=10, type=int)
my_parser.add_argument('--keywords', action='store', type=str)
args = my_parser.parse_args()

# Set up the keyword processor
keywords = KeywordProcessor(case_sensitive=False)
if args.keywords:
    keywords.add_keyword_from_file(args.keywords)


def count_vowels(sentence):
    count = 0
    for letter in sentence:
        if letter in ('a', 'e', 'i', 'o', 'u'):
            count += 1
    return count


domains = []
with open(args.file) as infile:  # renamed from 'input' to avoid shadowing the built-in
    for domain in infile:
        domains.append(domain.rstrip())
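# A hypothetical invocation of the script above (the script and file names
# are made up for illustration):
#
#   python filter_domains.py --file domains.txt --keywords keywords.txt --vowels 3
#
# --file is the only required flag; --vowels and --max-size fall back to their
# defaults (0 and 10) when omitted.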
class Entity(object):

    name = 'entity'

    def __init__(self, keywords_list=[], keywords_dict={}, keywords_file=None,
                 label='', attrs=('has_entities', 'is_entity', 'entity_desc', 'entities')):
        """Initialise the pipeline component."""
        self._has_entities, self._is_entity, self._entity_desc, self._entities = attrs

        # Set up the KeywordProcessor
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_list(keywords_list)
        self.keyword_processor.add_keywords_from_dict(keywords_dict)
        if keywords_file:
            self.keyword_processor.add_keyword_from_file(keywords_file)
        self.label = label

        # Register attribute on the Doc and Span
        Doc.set_extension(self._has_entities, getter=self.has_entities, force=True)
        Doc.set_extension(self._entities, getter=self.iter_entities, force=True)
        Span.set_extension(self._has_entities, getter=self.has_entities, force=True)
        Span.set_extension(self._entities, getter=self.iter_entities, force=True)

        # Register attribute on the Token.
        Token.set_extension(self._is_entity, default=False, force=True)
        Token.set_extension(self._entity_desc, getter=self.get_entity_desc, force=True)

    def __call__(self, doc):
        """Apply the pipeline component on a Doc object and modify it if matches
        are found. Return the Doc, so it can be processed by the next component
        in the pipeline, if available.
        """
        matches = self.keyword_processor.extract_keywords(doc.text, span_info=True)
        spans = []  # keep spans here to merge them later
        for _, start, end in matches:
            # Generate Span representing the entity & set label.
            # Using doc.char_span() instead of Span() because the keyword processor
            # returns index values based on character positions, not words.
            entity = doc.char_span(start, end, label=self.label)
            # Set custom attribute on each token of the entity
            if entity:
                for token in entity:
                    token._.set(self._is_entity, True)
                spans.append(entity)
                # Overwrite doc.ents and add entity – be careful not to replace!
                doc.ents = list(doc.ents) + [entity]
        for span in spans:
            # Iterate over all spans and merge them into one token. This is done
            # after setting the entities – otherwise, it would cause mismatched
            # indices!
            span.merge()
        return doc

    def has_entities(self, tokens):
        return any(token._.get(self._is_entity) for token in tokens)

    def iter_entities(self, tokens):
        return [(t.text, i, t._.get(self._entity_desc))
                for i, t in enumerate(tokens) if t._.get(self._is_entity)]

    def get_entity_desc(self, token):
        return token.text
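# A minimal usage sketch for the component above, assuming spaCy 2.x
# (span.merge() and passing a component instance to nlp.add_pipe() were removed
# in spaCy 3.0) and an installed 'en_core_web_sm' model; the keyword list and
# label are illustrative:
import spacy

nlp = spacy.load('en_core_web_sm')
entity = Entity(keywords_list=['python', 'java'], label='TECH')
nlp.add_pipe(entity, last=True)

doc = nlp(u'I write python at work')
print(doc._.has_entities)  # True
print(doc._.entities)      # [('python', 2, 'python')]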
# -*- coding: utf-8 -*-
__author__ = 'xujian'

import re

import jieba.posseg as pseg
from jieba import analyse, load_userdict
from flashtext import KeywordProcessor

from app.util.pre_model import RESOURCES_NET_KEYS, USER_STOP, INDEX_SX_APP_CONTNAME_SET
from app.common.config import W2V_VOCABULARY_PATH, ALL_STAR_NAME_PATH, \
    INDEX_SX_APP_CONTENT_NAME_PATH, USER_DIC_PATH

# Load keywords
resources_net_processor = KeywordProcessor()
resources_net_processor.add_keywords_from_list(RESOURCES_NET_KEYS)
resources_net_processor.add_keywords_from_list(list(INDEX_SX_APP_CONTNAME_SET))
resources_net_processor.add_keyword_from_file(USER_DIC_PATH)

# Load user dictionaries
load_userdict(W2V_VOCABULARY_PATH)
load_userdict(ALL_STAR_NAME_PATH)
load_userdict(INDEX_SX_APP_CONTENT_NAME_PATH)
load_userdict(USER_DIC_PATH)


# Segment and POS-tag a sentence; each word and its POS tag form a tuple
def postagger(sentence):
    pos_data = pseg.cut(sentence)
    pos_list = []
    for w in pos_data:
        if w.word.strip() == '':
            continue
        # sym2name() maps jieba's POS flag to a readable name (defined elsewhere)
        pos_list.append((w.word, sym2name(w.flag)))
    # print pos_list[:]
    return pos_list
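# A minimal, self-contained sketch of the jieba POS interface that postagger()
# relies on (the sentence is illustrative; pseg.cut() yields pairs whose .word
# is the token and .flag is jieba's POS tag):
import jieba.posseg as pseg

for w in pseg.cut(u'我爱自然语言处理'):
    print(w.word, w.flag)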
from flashtext import KeywordProcessor


def flashtext_init(keyword_path):
    kp = KeywordProcessor()
    kp.add_keyword_from_file(keyword_path)
    return kp
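# Usage sketch for the helper above; 'keywords.txt' is a hypothetical path to a
# flashtext keyword file (one keyword per line, or 'keyword=>clean_name'):
kp = flashtext_init('keywords.txt')
print(kp.extract_keywords('some text containing keywords'))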