import json
import os
from datetime import datetime
from logging import Logger
from pathlib import Path
from typing import IO, Any, AnyStr, Callable, Dict, Iterator, List, Optional

from dateutil.parser import parse
from duckling import DucklingWrapper
from flair.data import Sentence
from flair.models import SequenceTagger

# AbstractStep, file_iter, oh, load_entities, prepare_keyword_processor,
# match_regexprs, convert_name_to_underscore, ENTITY_DATE, ENTITY_NUMBER,
# ENTITY_PERSON and ENABLED_SYSTEM_ENTITIES are project-local and imported
# from elsewhere in the package.


class ExtractEntitiesStep(AbstractStep):
    """
    Extract entities from collected text.
    """

    def __init__(self,
                 name: str,
                 source_key: Optional[str] = None,
                 overwrite: bool = False,
                 source_iter: Callable[[List[str]], Iterator[IO[AnyStr]]] = file_iter,
                 output_handler: Callable[[str, Dict[str, Any]], None] = oh):
        super().__init__(name, source_key, overwrite)
        self.__source_iter = source_iter
        self.__output_handler = output_handler
        root_path = Path(__file__).parent.parent
        entities_path = str(root_path / 'config/entities.csv')
        self.entity_reverse_lookup, synonyms, self.regexprs = load_entities(entities_path)
        self.keyword_processor = prepare_keyword_processor(synonyms)
        duckling_entities = {ENTITY_DATE, ENTITY_NUMBER}
        tagger_entities = {ENTITY_PERSON}
        # only load the heavyweight models that enabled entities actually need
        if duckling_entities.intersection(ENABLED_SYSTEM_ENTITIES):
            self.d = DucklingWrapper()
        if tagger_entities.intersection(ENABLED_SYSTEM_ENTITIES):
            self.tagger = SequenceTagger.load('ner')

    def process_file(self, file: IO[AnyStr], path: str,
                     control_data: Dict[str, Any], logger: Logger,
                     accumulator: Dict[str, Any]) -> None:
        logger.debug('process file: {}'.format(file.name))
        input_doc = json.load(file)
        metadata = input_doc['metadata']
        record_id = metadata['record_id']
        data = input_doc['data']
        text = data['text']
        nlp_text = []
        for t in text:
            entities = []
            # flashtext returns (keyword, start, end) tuples with span_info=True
            keywords_found = self.keyword_processor.extract_keywords(t, span_info=True)
            for keyword in keywords_found:
                entities.append({
                    'entity': self.entity_reverse_lookup[keyword[0]],
                    'location': keyword[1:],
                    'value': keyword[0],
                    'confidence': 1.0
                })

            matches = match_regexprs(t, self.regexprs)
            for match in matches:
                match['entity'] = self.entity_reverse_lookup[match['value']]

            entities.extend(matches)
            entities.extend(self.match_system_entities(t))

            # is the span of an entity contained within the span
            # of another entity?
            def is_contained(entity):
                start, end = entity['location']
                for ent in entities:
                    s, e = ent['location']
                    # exclude exact span matches
                    if (start == s and end < e) or (start > s and end == e) \
                            or (start > s and end < e):
                        return True

                return False

            def is_valid(entity):
                # remove spurious dates: spans shorter than 8 characters or
                # with a year outside a plausible range
                if entity['entity'] == 'sys-date':
                    start, end = entity['location']
                    if (end - start) < 8:
                        return False

                    value = entity['value']
                    if isinstance(value, str):
                        try:
                            date = parse(value)
                        except ValueError:
                            return False

                        year = date.year
                        if year < 1990 or year > 2025:
                            return False

                return True

            # keep the entity with the longest span where an entity
            # is contained within the span of another
            pruned_entities = [
                ent for ent in entities
                if not is_contained(ent) and is_valid(ent)
            ]
            nlp_text.append({'text': t, 'entities': pruned_entities})

        now = datetime.utcnow().isoformat()
        write_root_dir = control_data['job']['write_root_dir']
        step_name = convert_name_to_underscore(self.name)
        output_filename = '{}_{}.json'.format(step_name, record_id)
        output_path = os.path.join(write_root_dir, step_name, output_filename)
        data = {'nlp_text': nlp_text}
        content = {'metadata': metadata, 'data': data}
        accumulator['files_output'].append({
            'filename': output_filename,
            'input': path,
            'path': output_path,
            'status': 'processed',
            'time': now
        })
        self.__output_handler(output_path, content)

    def run(self, control_data: Dict[str, Any], logger: Logger,
            accumulator: Dict[str, Any]) -> None:
        file_paths = [x['path'] for x in control_data[self.source_key]]
        step_name = convert_name_to_underscore(self.name)
        processed_file_paths = {}
        if step_name in control_data:
            for x in control_data[step_name]:
                if x['status'] == 'processed':
                    processed_file_paths[x['input']] = x
        for file, path in self.__source_iter(file_paths):
            if not self._overwrite and path in processed_file_paths:
                # already processed in a previous run; reuse the earlier result
                accumulator['files_output'].append(processed_file_paths[path])
                continue

            self.process_file(file, path, control_data, logger, accumulator)

    def match_system_entities(self, utter):
        matches = []
        if ENTITY_DATE in ENABLED_SYSTEM_ENTITIES:
            results = self.d.parse_time(utter)
            for result in results:
                matches.append({
                    'entity': 'sys-date',
                    'location': [result['start'], result['end']],
                    'value': result['value']['value'],
                    'confidence': 1.0
                })
        if ENTITY_NUMBER in ENABLED_SYSTEM_ENTITIES:
            results = self.d.parse_number(utter)
            for result in results:
                matches.append({
                    'entity': 'sys-number',
                    'location': [result['start'], result['end']],
                    'value': result['value']['value'],
                    'confidence': 1.0
                })
        if ENTITY_PERSON in ENABLED_SYSTEM_ENTITIES:
            sentence = Sentence(utter)
            self.tagger.predict(sentence)
            for entity in sentence.get_spans('ner'):
                if entity.tag == 'PER':
                    matches.append({
                        'entity': 'sys-person',
                        'location': [entity.start_pos, entity.end_pos],
                        'value': entity.text,
                        'confidence': entity.score
                    })
        return matches
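# --- Usage sketch (an assumption, not part of the pipeline source) ---
# A minimal example of wiring the step into a job; the paths, source key
# and step name are hypothetical. The control_data layout follows the
# fields the class reads above: a file list under the source key and
# 'job.write_root_dir'.
import logging

control_data = {
    'job': {'write_root_dir': '/tmp/out'},
    'files': [{'path': '/tmp/in/collect_text_1.json'}],
}
accumulator = {'files_output': []}

step = ExtractEntitiesStep('Extract Entities', source_key='files')
step.run(control_data, logging.getLogger(__name__), accumulator)

# Each input record should be written under
# <write_root_dir>/<step_name>/<step_name>_<record_id>.json with the shape
# {'metadata': ..., 'data': {'nlp_text': [{'text': ..., 'entities': [...]}]}}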
import json
import random

from duckling import DucklingWrapper
from padatious import IntentContainer

# This fragment starts mid-script: `sentence` (the example being built),
# `txt`/`ent` (the product entity under construction), `df` (a one-column
# DataFrame of product names), `middle`/`end` (phrase fragments) and
# `train_data` are defined in the preceding, omitted code.
        ent['value'] = txt
        ent['entity'] = 'product'
        sentence['entities'].append(ent)
        sentence['text'] += txt + " "

        # randomly chain further products onto the sentence
        while random.random() > .5:
            m = random.choice(middle)
            sentence['text'] += m
            txt = df.sample().iloc[0, 0]
            ent = dict()
            ent['start'] = len(sentence['text'])
            ent['end'] = len(sentence['text'] + txt)
            ent['value'] = txt
            ent['entity'] = 'product'
            sentence['entities'].append(ent)
            sentence['text'] += txt + " "

        sentence['text'] += random.choice(end)
        train_data['rasa_nlu_data']['common_examples'].append(sentence)

with open('result.json', 'w+') as fp:
    json.dump(train_data, fp)

container = IntentContainer('intent_cache')

d = DucklingWrapper()
print(d.parse('Bring me 250 ml sugar'))
print(d.parse_time(u"Let's meet at 11:45am"))
print(d.parse_number(u'Bring me one conserve of ravioli'))
print(d.parse_quantity(u'Bring me 100 g of sugar'))
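# --- Sketch of the omitted setup (an assumption; the generator fragment
# above starts mid-loop) --- train_data needs the standard Rasa NLU JSON
# skeleton, df a single column of product names, and middle/end some phrase
# fragments; every value here is illustrative.
import pandas as pd

df = pd.DataFrame({'product': ['ravioli', 'sugar', 'olive oil']})
middle = [' and ', ' plus ']
end = ['please.', 'thank you.']
train_data = {'rasa_nlu_data': {'common_examples': []}}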