Example #1
0
def index_package(package, plain_text, normalized_text):
    """Build the search document for ``package`` and send it to the index.

    The document carries the package id and collection, a fixed set of
    fields copied from ``package.source.meta``, the plain and normalized
    text (when those resources exist), an HTML summary, the entity tags
    returned by ``EntityTag.by_package`` and the attributes produced by
    ``generate_attributes``.

    Packages without a source, or with neither a title nor a name in the
    source metadata, are logged as errors and skipped.

    :param package: package to index; ``id``, ``collection`` and
        ``source`` are read from it.
    :param plain_text: resource exposing ``exists()`` and ``fh()``; its
        content becomes the ``text`` field and the summary fallback.
    :param normalized_text: resource exposing ``exists()`` and ``fh()``;
        its content becomes the ``normalized`` field.
    :returns: ``None``.
    """
    es.json_encoder = JSONEncoder
    body = {
        'id': package.id,
        'collection': package.collection
    }
    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return

    meta = source.meta
    body['name'] = meta.get('name')
    body['slug'] = meta.get('slug')
    # Fall back to the name when the metadata carries no explicit title.
    body['title'] = meta.get('title') or body['name']

    # The title depends only on the metadata, so bail out before reading
    # the text resources of a package that will not be indexed anyway.
    if not body['title']:
        log.error("No title for package %r, skipping", package)
        return

    for field in ('source_url', 'created_at', 'updated_at', 'filed_at',
                  'extension', 'mime_type'):
        body[field] = meta.get(field)

    if plain_text.exists():
        body['text'] = plain_text.fh().read()
        # Prefer an explicit summary; otherwise summarise the text itself.
        summary = meta.get('summary') or body['text']
        body['summary'] = html_summary(summary)

    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()

    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(meta)

    log.info("Indexing: %r", body['title'])
    es.index(es_index, DOC_TYPE, body, package.id)
                   
Example #2
0
def index_package(package, plain_text, normalized_text):
    """Build the search document for ``package`` and send it to the index.

    The document carries the package id and collection, a fixed set of
    fields copied from ``package.source.meta``, the plain and normalized
    text (when those resources exist), an HTML summary, the entity tags
    returned by ``EntityTag.by_package`` and the attributes produced by
    ``generate_attributes``.

    Packages without a source, or with neither a title nor a name in the
    source metadata, are logged as errors and skipped.

    :param package: package to index; ``id``, ``collection`` and
        ``source`` are read from it.
    :param plain_text: resource exposing ``exists()`` and ``fh()``; its
        content becomes the ``text`` field and the summary fallback.
    :param normalized_text: resource exposing ``exists()`` and ``fh()``;
        its content becomes the ``normalized`` field.
    :returns: ``None``.
    """
    es.json_encoder = JSONEncoder
    body = {
        'id': package.id,
        'collection': package.collection
    }
    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return

    meta = source.meta
    body['name'] = meta.get('name')
    body['slug'] = meta.get('slug')
    # Fall back to the name when the metadata carries no explicit title.
    body['title'] = meta.get('title') or body['name']

    # The title depends only on the metadata, so bail out before reading
    # the text resources of a package that will not be indexed anyway.
    if not body['title']:
        log.error("No title for package %r, skipping", package)
        return

    for field in ('source_url', 'created_at', 'updated_at', 'filed_at',
                  'extension', 'mime_type'):
        body[field] = meta.get(field)

    if plain_text.exists():
        body['text'] = plain_text.fh().read()
        # Prefer an explicit summary; otherwise summarise the text itself.
        summary = meta.get('summary') or body['text']
        body['summary'] = html_summary(summary)

    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()

    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(meta)

    log.info("Indexing: %r", body['title'])
    es.index(es_index, DOC_TYPE, body, package.id)
Example #3
0
    def analyze(self, normalized):
        """Replace the entity tags of the package behind ``normalized``.

        The package's existing tags are deleted, every pattern yielded by
        ``self.expressions()`` is run over the normalized text, and one
        ``EntityTag`` per distinct entity id found is added to the
        session and committed.
        """
        text = normalized.data()
        package = normalized.package
        EntityTag.delete_set(package.collection, package.id)

        found = set()
        for pattern, lookup in self.expressions():
            for hit in pattern.finditer(text):
                # Every pattern is expected to define exactly three
                # groups; the middle one is the key into ``lookup``.
                _, key, _ = hit.groups()
                found.update(lookup[key])

        for entity_id in found:
            tag = EntityTag()
            tag.collection = package.collection
            tag.package_id = package.id
            tag.entity_id = entity_id
            db.session.add(tag)

        db.session.commit()

        if found:
            log.info("Tagged %r with %d entities", package.id,
                     len(found))