def process_doc(doc):
    """Save a new document and attach NLP, tagger and feed entities to it.

    Returns without doing anything when ``doc.title`` is None or when
    ``is_duplicate(doc)`` is true. Otherwise the document is saved and:

    1. each name returned by ``portal.nlpextractor.extract_entities`` that
       matches no existing Entity name or pattern (case-insensitive) is
       stored as a new, disabled Entity with its own Pattern and attached
       to the document;
    2. each entity returned by ``Tagger.extract_entities`` is attached;
    3. each entity of ``doc.feed`` is attached;

    after which the document is saved again.

    The text that is analysed is the title, followed by a space and the
    body when the body is not None.

    NOTE(review): another ``process_doc`` is defined later in this file and
    replaces this one at import time.
    """
    text = doc.title
    if text is None:
        return
    if doc.body is not None:
        text = text + ' ' + doc.body
    else:
        # A missing body is stored as an empty string.
        doc.body = ''
    if is_duplicate(doc):
        return

    # Saved before any entity is attached -- presumably so the document has
    # a primary key for the many-to-many rows; confirm against the model.
    doc.save()

    # Named entities found by the NLP extractor.
    nlp_entities = portal.nlpextractor.extract_entities(text)
    for nlp_entity in nlp_entities:
        known = (Entity.objects.filter(name__iexact=nlp_entity) |
                 Entity.objects.filter(pattern__pattern__iexact=nlp_entity))
        # exists() answers "is there any row?" without loading the rows.
        if known.exists():
            continue
        print('adding new NLP entity: ' + nlp_entity)
        # New entities start disabled.
        e = Entity(name=nlp_entity)
        e.enabled = False
        e.save()
        p = Pattern(pattern=nlp_entity)
        p.entity = e
        p.save()
        # and attach to this document
        doc.entities.add(e)

    # Entities matched through the database models.
    entities = Tagger.extract_entities(text)
    if entities is not None and len(entities) > 0:
        print('found ' + str(len(entities)) + ' matching entities for document')
        print(str([entity.name for entity in entities]))
        for entity in entities:
            doc.entities.add(entity)

    # Entities inherited from the feed the document came from.
    # NOTE(review): if doc.entities is a Django ManyToManyField, add() does
    # not duplicate an existing relation, so no explicit check is made here
    # -- confirm against the model definition.
    for e in doc.feed.entities.all():
        doc.entities.add(e)
    doc.save()
def process_entity(id):
    """Fetch the Entity whose id equals ``id`` and pass it to the Tagger.

    The lookup is ``Entity.objects.get(id=id)``; whatever that call raises
    for a missing id propagates to the caller.

    NOTE(review): an identical ``process_entity`` is defined again later in
    this file; the later definition wins at import time.
    """
    Tagger.process_entity(Entity.objects.get(id=id))
def process_entity(id):
    """Run ``Tagger.process_entity`` on the Entity with the given id.

    The entity is loaded with ``Entity.objects.get(id=id)``; any exception
    from that lookup is not caught here.

    NOTE(review): this redefines an earlier, identical ``process_entity``
    in the same file.
    """
    entity = Entity.objects.get(id=id)
    Tagger.process_entity(entity)
def process_doc(doc):
    """Persist a document and link it to NLP, tagger and feed entities.

    Nothing happens when ``doc.title`` is None or ``is_duplicate(doc)`` is
    true. Otherwise the document is saved, and then:

    * names from ``portal.nlpextractor.extract_entities`` that match no
      existing Entity name or pattern (case-insensitive) become new,
      disabled Entity rows with a Pattern, and are attached to the document;
    * entities from ``Tagger.extract_entities`` are attached together with
      their parent, when they have one;
    * entities of ``doc.feed`` and their parents are attached unless an
      entity of the same name was already recorded in the previous step or
      earlier in this one.

    The document is saved a second time only if something was attached.
    The analysed text is the title plus, when the body is not None, a space
    and the body.
    """
    combined = doc.title
    if combined is None:
        return
    if doc.body is None:
        # A missing body is stored as an empty string.
        doc.body = ''
    else:
        combined = combined + ' ' + doc.body
    if is_duplicate(doc):
        return

    # First save happens before any entity is attached.
    doc.save()
    dirty = False

    # --- entities proposed by the NLP extractor ---
    candidates = portal.nlpextractor.extract_entities(combined)
    for candidate in candidates:
        by_name = Entity.objects.filter(name__iexact=candidate)
        by_pattern = Entity.objects.filter(pattern__pattern__iexact=candidate)
        if len(by_name | by_pattern) != 0:
            # Already known under that name or pattern: nothing to create.
            continue
        print('adding new NLP entity: ' + candidate)
        created = Entity(name=candidate)
        created.enabled = False
        created.save()
        created_pattern = Pattern(pattern=candidate)
        created_pattern.entity = created
        created_pattern.save()
        # Link the new entity to the document.
        doc.entities.add(created)
        dirty = True
        created.save()

    # --- entities matched through the database models ---
    tagged = Tagger.extract_entities(combined)
    # Names attached from here on; used to skip repeats in the feed step.
    seen_names = set()
    if tagged is not None and len(tagged) > 0:
        print('found ' + str(len(tagged)) + ' matching entities for document')
        print(str([item.name for item in tagged]))
        for item in tagged:
            doc.entities.add(item)
            if item.parent is not None:
                doc.entities.add(item.parent)
                seen_names.add(item.parent.name)
            seen_names.add(item.name)
            item.save()
            dirty = True

    # --- entities inherited from the feed source ---
    for feed_entity in doc.feed.entities.all():
        if feed_entity.name not in seen_names:
            doc.entities.add(feed_entity)
            seen_names.add(feed_entity.name)
            dirty = True
            feed_entity.save()
        if feed_entity.parent is not None and feed_entity.parent.name not in seen_names:
            doc.entities.add(feed_entity.parent)
            seen_names.add(feed_entity.parent.name)
            dirty = True
            # The feed entity itself (not its parent) is what gets saved here.
            feed_entity.save()

    if dirty:
        doc.save()
def process_entities():
    """Run ``Tagger.process_entity`` over the entities of every EntityType.

    Iterates all EntityType rows and, for each one, every Entity filtered by
    ``entity_type`` equal to that type.

    NOTE(review): entities whose entity_type matches no EntityType row are
    never visited by this loop -- confirm that is intended.
    """
    for entity_type in EntityType.objects.all():
        typed_entities = Entity.objects.filter(entity_type=entity_type)
        for entity in typed_entities:
            Tagger.process_entity(entity)