Example #1
def process_doc(doc):

    text = doc.title

    if text is None:
        return
    
    if doc.body is not None:
        text = text + ' ' + doc.body
    else:
        doc.body=''

    if is_duplicate(doc):
        return

    # print('process_doc: ' + doc.title)
    
    # save doc
    doc.save()
    
    # get named entities using NLP
    nlp_entities = portal.nlpextractor.extract_entities(text)

    
    # see if no such entity or pattern already exists, then add it as a disabled new entity
    for nlp_entity in nlp_entities:
        x = (Entity.objects.filter(name__iexact=nlp_entity)
             | Entity.objects.filter(pattern__pattern__iexact=nlp_entity))
        if len(x) == 0:
            print('adding new NLP entity: ' + nlp_entity)
            e = Entity(name=nlp_entity)
            e.enabled = False
            e.save()
            p = Pattern(pattern=nlp_entity)
            p.entity = e
            p.save()
            # and attach to this document
            doc.entities.add(e)

    # get entities using database models
    entities = Tagger.extract_entities(text)

    if entities is not None:
        if len(entities) > 0:
            print('found ' + str(len(entities)) + ' matching entities for document')
            print(str([entity.name for entity in entities]))
            for entity in entities:
                doc.entities.add(entity)

    # add entities from feed source
    for e in doc.feed.entities.all():
        # TODO: make sure we don't add entities the doc already has
        doc.entities.add(e)

    doc.save()
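
The examples never show the models they query. Below is a minimal sketch, assuming a standard Django app, of the schema implied by the lookups above (name__iexact, pattern__pattern__iexact, enabled, parent, doc.entities, doc.feed.entities); field types, max_length values, and the exact Feed/Document definitions are assumptions, not the project's actual code.

from django.db import models


class Entity(models.Model):
    name = models.CharField(max_length=255)
    enabled = models.BooleanField(default=True)
    # Example #4 walks entity.parent, so a self-referencing FK is assumed here.
    parent = models.ForeignKey('self', null=True, blank=True,
                               on_delete=models.SET_NULL)


class Pattern(models.Model):
    # Queried as pattern__pattern__iexact, i.e. the text column is named
    # "pattern" and the default reverse query name from Entity is used.
    pattern = models.CharField(max_length=255)
    entity = models.ForeignKey(Entity, on_delete=models.CASCADE)


class Feed(models.Model):
    entities = models.ManyToManyField(Entity, blank=True)


class Document(models.Model):
    title = models.CharField(max_length=255, null=True, blank=True)
    body = models.TextField(null=True, blank=True)
    feed = models.ForeignKey(Feed, on_delete=models.CASCADE)
    entities = models.ManyToManyField(Entity, blank=True)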
Example #2
def process_entity(id):
    e = Entity.objects.get(id=id)
    Tagger.process_entity(e)
Example #3
def process_entity(id):
    e = Entity.objects.get(id=id)
    Tagger.process_entity(e)
Example #4
def process_doc(doc):

    text = doc.title

    if text is None:
        return

    if doc.body is not None:
        text = text + ' ' + doc.body
    else:
        doc.body = ''

    if is_duplicate(doc):
        return

    # save doc
    doc.save()

    need_save = False
    # get named entities using NLP
    nlp_entities = portal.nlpextractor.extract_entities(text)

    # see if no such entity or pattern already exists, then add it as a disabled new entity
    for nlp_entity in nlp_entities:
        x = (Entity.objects.filter(name__iexact=nlp_entity)
             | Entity.objects.filter(pattern__pattern__iexact=nlp_entity))
        if len(x) == 0:
            print('adding new NLP entity: ' + nlp_entity)
            e = Entity(name=nlp_entity)
            e.enabled = False
            e.save()
            p = Pattern(pattern=nlp_entity)
            p.entity = e
            p.save()
            # and attach to this document
            doc.entities.add(e)
            need_save = True
            e.save()

    # get entities using database models
    entities = Tagger.extract_entities(text)

    doc_entity_names = set()
    if entities is not None:
        if len(entities) > 0:
            print('found ' + str(len(entities)) +
                  ' matching entities for document')
            print(str([entity.name for entity in entities]))
            for entity in entities:
                doc.entities.add(entity)
                if entity.parent is not None:
                    doc.entities.add(entity.parent)
                    doc_entity_names.add(entity.parent.name)
                doc_entity_names.add(entity.name)
                entity.save()
                need_save = True

    # add entities from feed source
    for e in doc.feed.entities.all():
        # make sure we don't add entities the doc already has
        if e.name not in doc_entity_names:
            doc.entities.add(e)
            doc_entity_names.add(e.name)
            need_save = True
            e.save()
        if e.parent is not None:
            if e.parent.name not in doc_entity_names:
                doc.entities.add(e.parent)
                doc_entity_names.add(e.parent.name)
                need_save = True
                e.save()

    if need_save:
        doc.save()
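
Both versions of process_doc() call is_duplicate(doc), which is not included in these examples. A hypothetical sketch, assuming duplicates are detected by an existing document in the same feed with the same title (the real project may key on a URL or content hash instead):

def is_duplicate(doc):
    # Treat the document as a duplicate if another saved document in the same
    # feed already carries the same title (case-insensitive).
    return (Document.objects
            .filter(feed=doc.feed, title__iexact=doc.title)
            .exclude(pk=doc.pk)
            .exists())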
Example #5
def process_entities():
    for t in EntityType.objects.all():
        for e in Entity.objects.filter(entity_type=t):
            Tagger.process_entity(e)
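
A hypothetical end-to-end sketch of how these helpers could be wired together when a feed is ingested; fetch_items() and the Feed.url field are assumptions used only for illustration, not part of the examples above.

def ingest_feed(feed):
    # Create a Document per fetched item and run the tagging pipeline on it;
    # process_doc() handles duplicate checks, entity extraction, and saving.
    for item in fetch_items(feed.url):
        doc = Document(feed=feed, title=item.get('title'),
                       body=item.get('body'))
        process_doc(doc)

    # Optionally refresh per-entity processing afterwards (Example #5);
    # whether the project actually does this here is an assumption.
    process_entities()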