Example #1
0
 def _inner(body):
     """Fill in ``body['attributes']`` from SEDAR metadata when it is empty.

     Metadata is collected for ``body['source_url']``. If ``body`` already
     holds a non-empty ``attributes`` value, ``None`` is returned and
     ``body`` is left untouched. Otherwise the attributes are generated
     from the collected metadata, stored on ``body``, and the result of
     ``fix_date(body)`` is returned.
     """
     # NOTE(review): the metadata is collected before the guard below, so
     # the call is made even for documents that end up being skipped --
     # confirm whether that ordering is intentional.
     sedar_meta = sedar.collect_sedar_metadata(body['source_url'])

     already_has_attributes = len(body['attributes']) > 0
     if already_has_attributes:
         return None

     body['attributes'] = generate_attributes(sedar_meta)
     return fix_date(body)
Example #2
0
 def _inner(body):
     """Fill in ``body['attributes']`` from SEDAR metadata when it is empty.

     Metadata is collected for ``body['source_url']``. If ``body`` already
     holds a non-empty ``attributes`` value, ``None`` is returned and
     ``body`` is left untouched. Otherwise the attributes are generated
     from the collected metadata, stored on ``body``, and the result of
     ``fix_date(body)`` is returned.
     """
     # NOTE(review): the metadata is collected before the guard below, so
     # the call is made even for documents that end up being skipped --
     # confirm whether that ordering is intentional.
     sedar_meta = sedar.collect_sedar_metadata(body['source_url'])

     already_has_attributes = len(body['attributes']) > 0
     if already_has_attributes:
         return None

     body['attributes'] = generate_attributes(sedar_meta)
     return fix_date(body)
Example #3
0
def index_package(package, plain_text, normalized_text):
    """Assemble a search document for ``package`` and submit it to ``es``.

    The document starts from the package id and collection, copies a fixed
    set of fields out of ``package.source.meta``, adds the plain and
    normalized text when those resources exist, and finally attaches the
    entity tags and generated attributes before indexing.

    Nothing is indexed (and an error is logged) when the package has no
    source or when neither a title nor a name is available. The function
    always returns ``None``.

    :param package: object exposing ``id``, ``collection`` and ``source``.
    :param plain_text: resource exposing ``exists()`` and ``fh()``; its
        content becomes ``text`` and the fallback input for ``summary``.
    :param normalized_text: resource exposing ``exists()`` and ``fh()``;
        its content becomes ``normalized``.
    """
    es.json_encoder = JSONEncoder

    body = dict(id=package.id, collection=package.collection)

    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return

    # Fields copied verbatim from the source metadata. 'title' sits between
    # the two groups because it falls back to the already-copied 'name'.
    for field in ('name', 'slug'):
        body[field] = source.meta.get(field)
    body['title'] = source.meta.get('title') or body['name']
    for field in ('source_url', 'created_at', 'updated_at', 'filed_at',
                  'extension', 'mime_type'):
        body[field] = source.meta.get(field)

    # NOTE(review): the handles returned by fh() are read but never
    # explicitly closed here -- confirm who owns their lifetime.
    if plain_text.exists():
        text = plain_text.fh().read()
        body['text'] = text
        # Prefer an explicit summary from the metadata, else use the text.
        body['summary'] = html_summary(source.meta.get('summary') or text)

    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()

    title = body['title']
    if not title:
        log.error("No title for package %r, skipping", package)
        return

    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(source.meta)

    log.info("Indexing: %r", title)
    es.index(es_index, DOC_TYPE, body, package.id)
                   
Example #4
0
def index_package(package, plain_text, normalized_text):
    """Assemble a search document for ``package`` and submit it to ``es``.

    The document starts from the package id and collection, copies a fixed
    set of fields out of ``package.source.meta``, adds the plain and
    normalized text when those resources exist, and finally attaches the
    entity tags and generated attributes before indexing.

    Nothing is indexed (and an error is logged) when the package has no
    source or when neither a title nor a name is available. The function
    always returns ``None``.

    :param package: object exposing ``id``, ``collection`` and ``source``.
    :param plain_text: resource exposing ``exists()`` and ``fh()``; its
        content becomes ``text`` and the fallback input for ``summary``.
    :param normalized_text: resource exposing ``exists()`` and ``fh()``;
        its content becomes ``normalized``.
    """
    es.json_encoder = JSONEncoder

    body = dict(id=package.id, collection=package.collection)

    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return

    # Fields copied verbatim from the source metadata. 'title' sits between
    # the two groups because it falls back to the already-copied 'name'.
    for field in ('name', 'slug'):
        body[field] = source.meta.get(field)
    body['title'] = source.meta.get('title') or body['name']
    for field in ('source_url', 'created_at', 'updated_at', 'filed_at',
                  'extension', 'mime_type'):
        body[field] = source.meta.get(field)

    # NOTE(review): the handles returned by fh() are read but never
    # explicitly closed here -- confirm who owns their lifetime.
    if plain_text.exists():
        text = plain_text.fh().read()
        body['text'] = text
        # Prefer an explicit summary from the metadata, else use the text.
        body['summary'] = html_summary(source.meta.get('summary') or text)

    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()

    title = body['title']
    if not title:
        log.error("No title for package %r, skipping", package)
        return

    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(source.meta)

    log.info("Indexing: %r", title)
    es.index(es_index, DOC_TYPE, body, package.id)