def analyze(self, document, meta):
    """Tag ``document`` with entities extracted by polyglot NER.

    Skips sources with entity generation disabled. Extraction failures
    are logged and abort the analysis without raising (best-effort).
    """
    if not document.source.generate_entities:
        return
    begin_time = time()
    try:
        entities = self.extract_entities(document, meta)
    except Exception as ex:
        # Best-effort: a failed extraction must not break the pipeline.
        log.warning(ex)
        return
    # Replace any references previously produced by this analyzer.
    Reference.delete_document(document.id, origin=self.origin)
    for name, weight, schema in entities:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        # Fix: stamp the origin so the delete above can find these rows
        # on the next run (previously left unset, so they were never
        # cleaned up by the origin-filtered delete).
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)
    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Polyglot tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Polyglot found no entities on %r (%sms)",
                 document, duration_time)
def analyze(self, document, meta):
    """Tag ``document`` with entities extracted by polyglot NER.

    Skips sources with entity generation disabled. Extraction failures
    are logged and abort the analysis without raising (best-effort).
    """
    if not document.source.generate_entities:
        return
    begin_time = time()
    try:
        entities = self.extract_entities(document, meta)
    except Exception as ex:
        # Best-effort: a failed extraction must not break the pipeline.
        log.warning(ex)
        return
    # Replace any references previously produced by this analyzer.
    Reference.delete_document(document.id, origin=self.origin)
    for name, weight, schema in entities:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        # Fix: stamp the origin so the delete above can find these rows
        # on the next run (previously left unset, so they were never
        # cleaned up by the origin-filtered delete).
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)
    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Polyglot tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Polyglot found no entities on %r (%sms)",
                 document, duration_time)
def analyze(self, document, meta):
    """Tag ``document`` with entities matched by the cached regexes.

    Counts regex matches per entity across all text parts, replaces the
    references previously produced by this analyzer, and logs timing.
    """
    begin_time = time()
    self.cache.generate()
    entities = defaultdict(int)
    for text, rec in document.text_parts():
        text = normalize_strong(text)
        # Skip unusable or too-short fragments.
        if text is None or len(text) <= 2:
            continue
        for rex in self.cache.regexes:
            for match in rex.finditer(text):
                # NOTE(review): group(2) is assumed to carry the matched
                # entity token -- confirm against the cache's patterns.
                match = match.group(2)
                for entity_id in self.cache.matches.get(match, []):
                    entities[entity_id] += 1
    # Replace any references previously produced by this analyzer.
    Reference.delete_document(document.id, origin=self.origin)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)
    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Regex tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Regex found no entities on %r (%sms)",
                 document, duration_time)
def analyze(self, document, meta):
    """Tag ``document`` with entities matched by the cached regexes.

    Counts regex matches per entity across all text parts, replaces the
    references previously produced by this analyzer, and logs timing.
    """
    begin_time = time()
    self.cache.generate()
    entities = defaultdict(int)
    for text, rec in document.text_parts():
        text = normalize_strong(text)
        # Skip unusable or too-short fragments.
        if text is None or len(text) <= 2:
            continue
        for rex in self.cache.regexes:
            for match in rex.finditer(text):
                # NOTE(review): group(2) is assumed to carry the matched
                # entity token -- confirm against the cache's patterns.
                match = match.group(2)
                for entity_id in self.cache.matches.get(match, []):
                    entities[entity_id] += 1
    # Replace any references previously produced by this analyzer.
    Reference.delete_document(document.id, origin=self.origin)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)
    duration_time = int((time() - begin_time) * 1000)
    if len(entities):
        log.info("Regex tagged %r with %d entities (%sms)",
                 document, len(entities), duration_time)
    else:
        log.info("Regex found no entities on %r (%sms)",
                 document, duration_time)
def finalize(self):
    """Persist the accumulated regex matches as document references.

    Replaces any references this analyzer produced on a previous run.
    """
    Reference.delete_document(self.document.id, origin=self.origin)
    for entity_id, weight in self.entities.items():
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    # Fix: typo "extraced" -> "extracted" in the log message.
    log.info('Regex extracted %s entities.', len(self.entities))
def finalize(self):
    """Persist the accumulated regex matches as document references.

    Replaces any references this analyzer produced on a previous run.
    """
    Reference.delete_document(self.document.id, origin=self.origin)
    for entity_id, weight in self.entities.items():
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    # Fix: typo "extraced" -> "extracted" in the log message.
    log.info('Regex extracted %s entities.', len(self.entities))
def save(self, document, meta, entities):
    """Store ``entities`` as references on ``document`` and delegate
    persistence of the document itself to the parent analyzer.

    All existing references of the document are replaced wholesale.
    """
    count = len(entities)
    if count:
        log.info("Tagged %r with %d entities", document, count)
    # Drop every existing reference before writing the new set.
    Reference.delete_document(document.id)
    for entity_id, weight in entities.items():
        reference = Reference()
        reference.document_id = document.id
        reference.entity_id = entity_id
        reference.weight = weight
        db.session.add(reference)
    super(EntityAnalyzer, self).save(document, meta)
def finalize(self):
    """Resolve accumulated entity mentions and persist them as references.

    For each entity name the most frequently observed schema wins, and
    the number of mentions becomes the reference weight. Replaces any
    references this analyzer produced on a previous run.
    """
    output = []
    for entity_name, schemas in self.entities.items():
        # Pick the schema observed most often for this name.
        schema = max(set(schemas), key=schemas.count)
        output.append((entity_name, len(schemas), schema))
    Reference.delete_document(self.document.id, origin=self.origin)
    for name, weight, schema in output:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    # Fix: typo "extraced" -> "extracted" in the log message.
    log.info('Polyglot extracted %s entities.', len(output))
def finalize(self):
    """Resolve accumulated entity mentions and persist them as references.

    For each entity name the most frequently observed schema wins, and
    the number of mentions becomes the reference weight. Replaces any
    references this analyzer produced on a previous run.
    """
    output = []
    for entity_name, schemas in self.entities.items():
        # Pick the schema observed most often for this name.
        schema = max(set(schemas), key=schemas.count)
        output.append((entity_name, len(schemas), schema))
    Reference.delete_document(self.document.id, origin=self.origin)
    for name, weight, schema in output:
        entity_id = self.load_entity(name, schema)
        if entity_id is None:
            continue
        ref = Reference()
        ref.document_id = self.document.id
        ref.entity_id = entity_id
        ref.origin = self.origin
        ref.weight = weight
        db.session.add(ref)
    # Fix: typo "extraced" -> "extracted" in the log message.
    log.info('Polyglot extracted %s entities.', len(output))
def analyze(self, document, meta):
    """Tag ``document`` with entities found by the compiled matchers.

    Counts matches per entity across all text parts, then replaces the
    document's references with the new set.
    """
    entities = defaultdict(int)
    for text, rec in document.text_parts():
        text = normalize_strong(text)
        # Idiom fix: ``not text`` covers both None and the empty string
        # (was ``text is None or not len(text)``) -- identical behavior.
        if not text:
            continue
        for rex, matches in self.matchers:
            for match in rex.finditer(text):
                # NOTE(review): group(2) is assumed to carry the matched
                # entity token -- confirm against the matcher patterns.
                match = match.group(2)
                for entity_id in matches.get(match, []):
                    entities[entity_id] += 1
    if len(entities):
        log.info("Tagged %r with %d entities", document, len(entities))
    Reference.delete_document(document.id)
    for entity_id, weight in entities.items():
        ref = Reference()
        ref.document_id = document.id
        ref.entity_id = entity_id
        ref.weight = weight
        db.session.add(ref)
    self.save(document, meta)