Beispiel #1
0
    def generate_scored_mappings(cls, origins=None, threshold=.5):
        """Do a cross-product comparison of entities and generate mappings.

        Each active entity is compared against the similar entities returned
        by the index. Every pair scoring above ``threshold`` is saved as a
        generated mapping without a judgement and committed immediately.

        :param origins: optional collection of origins. When non-empty, only
            entities whose ``origin`` is in it are used as the left-hand side
            of a comparison.
        :param threshold: pairs whose score is less than or equal to this
            value are discarded.
        """
        # ``None`` replaces the former shared mutable ``[]`` default.
        origins = origins or []
        index = EntityIndex()
        index.build()
        # NOTE(review): the lookup table is not restricted by ``origins``;
        # presumably so candidates from any origin can be resolved below.
        q = Entity.find_by_origins(origins=[])
        q = q.filter(Entity.active == True)  # noqa
        entities = {e.uid: e for e in q.all()}
        decided = cls.get_decided()
        for entity in entities.values():
            if origins and entity.origin not in origins:
                continue

            # Never re-propose a pair this entity already takes part in.
            skip = set()
            for pair in decided:
                if entity.uid in pair:
                    skip.update(pair)

            for uid in index.search_similar(entity, skip=skip):
                match = entities.get(uid)
                if match is None:
                    # The index returned a uid that is not among the active
                    # entities loaded above; there is nothing to compare to.
                    continue
                score = entity.compare(match)
                if score <= threshold:
                    continue

                project.log.info("Candidate [%.3f]: %s <-> %s", score,
                                 entity.name, match.name)
                cls.save(entity.uid,
                         match.uid,
                         judgement=None,
                         score=score,
                         generated=True)
                # Remember the pair so later entities skip it as well.
                decided.add((entity.uid, match.uid))
                session.commit()
Beispiel #2
0
 def emit_entity(self, data):
     """Save ``data`` as an entity for this emitter and commit the session.

     ``data`` is converted with ``dict()`` and handed to ``Entity.save``
     together with the emitter's ``origin``, ``query_uid`` and
     ``match_uid``. Whatever ``Entity.save`` returns is passed back.
     """
     payload = dict(data)
     saved = Entity.save(payload,
                         self.origin,
                         query_uid=self.query_uid,
                         match_uid=self.match_uid)
     session.commit()
     return saved
Beispiel #3
0
 def emit_judgement(self, uida, uidb, judgement, score=None, decided=False):
     """Store a judgement for the pair ``uida``/``uidb`` and commit.

     The arguments are forwarded to ``Mapping.save``; its return value is
     handed back to the caller after the session has been committed.
     """
     options = {'decided': decided, 'score': score}
     saved = Mapping.save(uida, uidb, judgement, **options)
     session.commit()
     return saved
Beispiel #4
0
 def emit_document(self, entity_uid, url, title, publisher=None):
     """Save a document for ``entity_uid`` under this emitter's origin.

     Forwards the arguments and ``self.origin`` to ``Document.save``,
     commits the session and returns what ``Document.save`` produced.
     """
     saved = Document.save(entity_uid, url, title, self.origin,
                           publisher=publisher)
     session.commit()
     return saved
Beispiel #5
0
def enrich_documents(origin, entity):
    """Replace the stored documents of ``entity`` with fresh search results.

    Existing documents are deleted for every uid in ``entity.uids`` and the
    deletion is committed. If the entity's schema is one of ``PERSON``,
    ``COMPANY``, ``ORGANIZATION`` or ``OTHER``, a search query is built for
    it and each hit is emitted through ``origin.emit_document``.

    :param origin: emitter providing ``emit_document`` and ``log``.
    :param entity: entity providing ``uid``, ``uids``, ``schema``, ``name``.
    """
    # Delete per uid: the loop used to pass ``entity.uid`` on every
    # iteration, leaving the loop variable unused.
    for uid in entity.uids:
        Document.delete_by_entity(uid)
    session.commit()

    if entity.schema not in [PERSON, COMPANY, ORGANIZATION, OTHER]:
        return

    total = 0
    query = search_entity(entity)
    for url, title, publisher in search_documents(query):
        origin.emit_document(entity.uid, url, title, publisher=publisher)
        total += 1
    origin.log.info('Query [%s]: %s -> %s', entity.name, query, total)
Beispiel #6
0
def enrich(origin, entity):
    """Geocode the addresses of ``entity`` that have no normalized form.

    For every uid in ``entity.uids`` the addresses whose ``normalized``
    column is NULL are sent to ``geocode``. The first result, if any, is
    written back to the address. The session is committed once per uid.
    """
    client = googlemaps.Client(key=API_KEY)
    for uid in entity.uids:
        pending = Address.find_by_entity(uid)
        pending = pending.filter(Address.normalized == None)  # noqa
        for address in pending:
            origin.log.info("Geocoding [%s] %s", entity.name, address.clean)
            results = geocode(client, address.clean)
            if not len(results):
                origin.log.info("No results: %s" % address.clean)
                continue
            # Only the first geocoding result is used.
            first = next(iter(results))
            normalized = first['formatted_address']
            location = first['geometry']['location']
            address.update(normalized=normalized,
                           latitude=location['lat'],
                           longitude=location['lng'])
        session.commit()
Beispiel #7
0
 def find_undecided(cls, limit=10, offset=0):
     """Collect up to ``limit`` mappings that still await a manual decision.

     Mappings of the current project that are undecided and have no
     judgement are walked in descending score order, starting at
     ``offset``. A mapping whose pair is already in the decided set, or
     whose ``left`` or ``right`` is ``None``, is deleted instead of
     returned. The session is committed before returning.
     """
     decided = cls.get_decided()
     query = (session.query(cls)
              .filter(cls.project == project.name)
              .filter(cls.decided == False)  # noqa
              .filter(cls.judgement == None)  # noqa
              .order_by(cls.score.desc())
              .offset(offset))
     candidates = []
     for mapping in query.yield_per(limit):
         pair = (mapping.left_uid, mapping.right_uid)
         stale = (pair in decided or
                  mapping.left is None or
                  mapping.right is None)
         if not stale:
             candidates.append(mapping)
             if len(candidates) == limit:
                 break
         else:
             mapping.delete()
     session.commit()
     return candidates
Beispiel #8
0
    def emit_entity(self, data):
        """Emit an entity, keeping it inactive until its mapping is confirmed.

        Enrichment results are first held as inactive and become active only
        once the judgement between the query and result entities is
        confirmed. If there is no mapping or no judgement yet and the emitted
        entity is the match entity, a mapping without a judgement is saved,
        scored by comparing the query entity with the emitted one.
        """
        entity = super(ResultEmitter, self).emit_entity(data)

        confirmed = (self.mapping is not None and
                     self.mapping.decided and
                     self.mapping.judgement is not False)
        if not confirmed:
            entity.active = False

        unjudged = self.mapping is None or self.mapping.judgement is None
        if unjudged:
            entity.active = False
        if unjudged and entity.uid == self.match_uid:
            # Generate a tentative mapping.
            query = Entity.get(self.query_uid)
            if query is not None:
                tentative_score = query.compare(entity)
                Mapping.save(self.match_uid,
                             self.query_uid,
                             None,
                             score=tentative_score)
        session.commit()
        return entity
Beispiel #9
0
 def clear(self):
     """Call ``Entity.delete_by_origin`` for this emitter, then commit.

     The emitter's ``origin``, ``query_uid`` and ``match_uid`` are passed
     along to scope the deletion.
     """
     scope = {'query_uid': self.query_uid, 'match_uid': self.match_uid}
     Entity.delete_by_origin(self.origin, **scope)
     session.commit()
Beispiel #10
0
 def emit_link(self, data):
     """Save ``data`` as a link for this emitter's origin and commit.

     ``data`` is converted with ``dict()`` before it is passed to
     ``Link.save``; the value ``Link.save`` returns is handed back.
     """
     payload = dict(data)
     link = Link.save(payload, self.origin)
     session.commit()
     return link
Beispiel #11
0
def mappings_cleanup():
    """Delete undecided generated mappings.

    Delegates the work to ``Mapping.cleanup()`` and then commits the
    session.
    """
    Mapping.cleanup()
    session.commit()
Beispiel #12
0
def mappings_apply():
    """Apply mapped canonical IDs to all entities.

    Delegates the work to ``Mapping.canonicalize()`` and then commits the
    session.
    """
    Mapping.canonicalize()
    session.commit()