Example #1
0
def index_collection(collection):
    """Write a collection's metadata into the search index.

    A collection flagged as deleted is removed from the index instead
    of being (re-)written.
    """
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    body = dict(
        foreign_id=collection.foreign_id,
        created_at=collection.created_at,
        updated_at=collection.updated_at,
        label=collection.label,
        summary=collection.summary,
        category=collection.category,
        countries=collection.countries,
        languages=collection.languages,
        managed=collection.managed,
        roles=collection.roles,
    )
    creator = collection.creator
    if creator is not None:
        body['creator'] = dict(id=creator.id,
                               type=creator.type,
                               name=creator.name)
    # Fold in aggregate statistics (counts etc.) for this collection.
    for key, value in get_collection_stats(collection.id).items():
        body[key] = value
    es.index(index=es_index,
             doc_type=TYPE_COLLECTION,
             id=collection.id,
             body=body)
Example #2
0
def index_lead(lead):
    """Index a lead.

    The document ID is derived deterministically from the entity and
    match identifiers so that re-indexing the same lead overwrites the
    existing copy rather than creating a duplicate.
    """
    hash_sum = sha1()
    # BUG FIX: hashlib requires bytes on Python 3; the IDs are str (or
    # missing), so they must be encoded before hashing.
    hash_sum.update((lead.get('entity_id') or '').encode('utf-8'))
    hash_sum.update((lead.get('match_id') or '').encode('utf-8'))
    lead_id = hash_sum.hexdigest()
    es.index(index=es_index, doc_type=TYPE_LEAD, id=lead_id, body=lead)
Example #3
0
def index_entity(entity):
    """Serialise an entity and store it in the search index."""
    body = entity.to_index()
    # The document ID is passed separately to es.index, so any 'id'
    # key inside the body is redundant and dropped.
    body.pop('id', None)
    body['doc_count'] = get_count(entity)
    body = finalize_index(body, entity.schema)
    es.index(index=es_index, doc_type=TYPE_ENTITY, id=entity.id, body=body)
Example #4
0
def index_document(document_id):
    """Load a document by its ID and write it to the search index.

    Text documents additionally get their pages bulk-indexed; tabular
    documents get their records bulk-indexed. Bulk failures are logged
    but do not abort the main document write.
    """
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)

    body = document.to_dict()
    body['entities'] = generate_entities(document)
    # Transliterated copies support accent-insensitive matching.
    body['title_latin'] = latinize_text(body.get('title'))
    body['summary_latin'] = latinize_text(body.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=body,
             id=document.id)
    clear_children(document)

    try:
        if document.type == Document.TYPE_TEXT:
            children = generate_pages(document)
        elif document.type == Document.TYPE_TABULAR:
            children = generate_records(document)
        else:
            children = None
        if children is not None:
            bulk(es, children, stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
Example #5
0
def index_entity(entity):
    """Index an entity, or drop it from the index if it was deleted.

    Returns the indexed body with 'id' and '$type' added, so callers
    can re-use the serialised form.
    """
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    # Properties start with the display name and then absorb all of
    # the entity's structured data, normalised to lists.
    properties = {'name': [entity.name]}
    for key, value in entity.data.items():
        properties[key] = ensure_list(value)

    body = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        '$bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': properties
    }

    body = finalize_index(body, entity.schema)
    es.index(index=es_index, doc_type=TYPE_ENTITY, id=entity.id, body=body)
    body['id'] = entity.id
    body['$type'] = TYPE_ENTITY
    return body
Example #6
0
def index_safe(index, id, body):
    """Index a single document and retry until it has been stored."""
    attempt = 0
    while True:
        try:
            es.index(index=index, doc_type='doc', id=str(id), body=body)
        except Exception as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
        else:
            body['id'] = str(id)
            return body
        # Only reached after a failure: wait before the next attempt.
        backoff_cluster(failures=attempt)
        attempt += 1
Example #7
0
def index_doc(index, id, body):
    """Index a single document and retry until it has been stored."""
    doc_id = str(id)
    while True:
        try:
            es.index(index=index, doc_type='doc', id=doc_id, body=body)
        except TransportError as terr:
            log.warning("Index error [%s:%s]: %s", index, id, terr)
            # Fixed delay between attempts; loops until success.
            time.sleep(RETRY_DELAY)
        else:
            body['id'] = doc_id
            return body
Example #8
0
File: util.py Project: we1l1n/aleph
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
        else:
            body['id'] = str(id)
            # Strip the (potentially huge) text payload before return.
            body.pop('text', None)
            return body
Example #9
0
File: util.py Project: pudo/aleph
def index_safe(index, id, body, **kwargs):
    """Store one document, retrying with backoff on transport errors."""
    doc_id = str(id)
    for failures in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=failures)
            continue
        body['id'] = doc_id
        # Do not hand the bulky extracted text back to the caller.
        body.pop('text', None)
        return body
Example #10
0
def index_single(obj, data, texts):
    """Indexing aspects common to entities and documents."""
    # Attach access control and bookkeeping fields shared by both types.
    data.update({
        'bulk': False,
        'roles': obj.collection.roles,
        'collection_id': obj.collection.id,
        'created_at': obj.created_at,
        'updated_at': obj.updated_at,
    })
    data = finalize_index(data, obj.model, texts)
    data = clean_dict(data)
    obj_id = str(obj.id)
    es.index(index=entity_index(), doc_type='doc', id=obj_id, body=data)
    data['id'] = obj_id
    return data
Example #11
0
def index_document(document, index_records=True):
    """Index a document's metadata, text and entities.

    When index_records is true, the document's records are cleared and
    re-indexed as well.
    """
    log.info("Index document: %r", document)
    body = document.to_index_dict()
    body['text'] = get_text(document)
    body['entities'] = generate_entities(document)
    # Transliterated copies support accent-insensitive search.
    body['title_latin'] = latinize_text(body.get('title'))
    body['summary_latin'] = latinize_text(body.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=body, id=document.id)

    if not index_records:
        return
    clear_records(document.id)
    bulk_op(generate_records(document))
Example #12
0
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored.

    Client-side errors (bad request, forbidden) are re-raised at once,
    since retrying cannot fix them; other transport failures are logged
    and retried with backoff.
    """
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body["id"] = str(id)
            # Strip the bulky text payload from the returned body.
            body.pop("text", None)
            return body
        except TransportError as exc:
            # BUG FIX: TransportError.status_code is an int for HTTP
            # errors (or the string "N/A" for connection failures), so
            # the old comparison against ("400", "403") never matched
            # and fatal client errors were retried forever.
            if exc.status_code in (400, 403):
                raise
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
Example #13
0
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in range(REQUEST_RETRIES):
        try:
            es.index(index=index, doc_type='doc', id=id, body=body, **kwargs)
        except RequestError:
            # A malformed request will never succeed; surface it now.
            raise
        except Exception as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff_cluster(failures=attempt)
        else:
            body['id'] = str(id)
            body.pop('text', None)
            return body
Example #14
0
def index_document(document):
    """Index a document and its entity references.

    Documents still pending ingest are skipped entirely.
    """
    if document.status == Document.STATUS_PENDING:
        return

    log.info("Index document: %r", document)
    body = document.to_index_dict()
    body['entities'] = [
        {'id': entity_id, 'collection_id': collection_id}
        for entity_id, collection_id in Reference.index_references(document.id)
    ]
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=body, id=document.id)
Example #15
0
def index_package(package, plain_text, normalized_text):
    """Index a package with its source metadata and extracted text.

    Packages without a source or without a usable title are skipped
    with an error log entry.
    """
    es.json_encoder = JSONEncoder
    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return

    meta = source.meta
    body = {
        'id': package.id,
        'collection': package.collection,
        'name': meta.get('name'),
        'slug': meta.get('slug'),
        # Fall back to the plain name when no explicit title is set.
        'title': meta.get('title') or meta.get('name'),
    }
    for field in ('source_url', 'created_at', 'updated_at', 'filed_at',
                  'extension', 'mime_type'):
        body[field] = meta.get(field)

    if plain_text.exists():
        body['text'] = plain_text.fh().read()
        summary = meta.get('summary') or body.get('text')
        body['summary'] = html_summary(summary)

    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()

    if not body['title']:
        log.error("No title for package %r, skipping", package)
        return

    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(meta)

    log.info("Indexing: %r", body['title'])
    es.index(es_index, DOC_TYPE, body, package.id)
                   
Example #16
0
def index_package(package, plain_text, normalized_text):
    """Assemble and store the search-index body for a package."""
    es.json_encoder = JSONEncoder
    body = {
        'id': package.id,
        'collection': package.collection
    }
    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return

    meta = source.meta
    body.update({
        'name': meta.get('name'),
        'slug': meta.get('slug'),
        # The display title falls back to the bare name.
        'title': meta.get('title') or meta.get('name'),
        'source_url': meta.get('source_url'),
        'created_at': meta.get('created_at'),
        'updated_at': meta.get('updated_at'),
        'filed_at': meta.get('filed_at'),
        'extension': meta.get('extension'),
        'mime_type': meta.get('mime_type'),
    })

    if plain_text.exists():
        body['text'] = plain_text.fh().read()
        body['summary'] = html_summary(meta.get('summary') or body.get('text'))

    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()

    if not body['title']:
        log.error("No title for package %r, skipping", package)
        return

    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(meta)

    log.info("Indexing: %r", body['title'])
    es.index(es_index, DOC_TYPE, body, package.id)
Example #17
0
def rebuild_test_index(n=100):
    """Rebuild the development index with *n* random documents.

    The assertion refuses to run against anything but 'aleph_dev', so
    a production index cannot be wiped by accident.
    """
    n = int(n)
    assert aleph.core.es_index == 'aleph_dev'
    aleph.search.delete_index()
    aleph.search.init_search()
    perpage = 10
    created = 0
    for offset in range(0, n, perpage):
        for result in random_docs(howmany=perpage, offset=offset):
            new = es.index(
                index='aleph_dev',
                doc_type=result['_type'],
                body=result['_source'],
            )
            # es.index reports whether a fresh document was created.
            assert new['created']
            created += 1
    # BUG FIX: previously this printed the last inner-loop index 'i'
    # (at most perpage - 1, never the total) and raised NameError when
    # zero documents were produced.
    print('created %s docs' % created)
Example #18
0
def rebuild_test_index(n=100):
    """Drop and rebuild the 'aleph_dev' test index with *n* random docs.

    Guarded by an assertion so it can never run against a non-dev
    index.
    """
    n = int(n)
    assert aleph.core.es_index == 'aleph_dev'
    aleph.search.delete_index()
    aleph.search.init_search()
    docswanted = n
    perpage = 10
    total = 0
    for offset in range(0, docswanted, perpage):
        for result in random_docs(howmany=perpage,
                                  offset=offset):
            new = es.index(
                index='aleph_dev',
                doc_type=result['_type'],
                body=result['_source'],
            )
            assert new['created']
            total += 1
    # BUG FIX: the final count used to print the last per-page index
    # 'i' (never the real total; NameError for n == 0). Count every
    # successfully created document instead.
    print('created %s docs' % total)
Example #19
0
def replace_es(query, updatefunc, index='aleph_test', howmany=10):
    """Re-index documents matching *query* after mapping them through
    *updatefunc*; falsy return values skip the document."""
    perpage = 50
    # NOTE(review): hard-coded resume offset; with the default
    # howmany=10 the range below is empty and nothing is processed.
    # Looks like a one-off resume point — confirm intent.
    start = 522050
    for offset in range(start, howmany, perpage):
        print('# %s' % offset)
        page = es.search(index=index,
                         body=query,
                         from_=offset,
                         size=min(perpage, howmany))
        for hit in page['hits']['hits']:
            replacement = updatefunc(hit['_source'])
            if not replacement:
                print('skipping item')
                continue
            outcome = es.index(index=hit['_index'],
                               doc_type=hit['_type'],
                               id=hit['_id'],
                               body=replacement)
            # Replacing an existing document must never create one.
            assert outcome['created'] == False
Example #20
0
def replace_es(query, updatefunc, index='aleph_test', howmany=10):
    """Rewrite matching documents in place via *updatefunc*."""
    perpage = 50
    # NOTE(review): hard-coded offset far beyond the default howmany —
    # with howmany=10 the loop body never executes; confirm intent.
    start = 522050
    for offset in range(start, howmany, perpage):
        print('# %s' % offset)
        results = es.search(index=index,
                            body=query,
                            from_=offset,
                            size=min(perpage, howmany))
        hits = results['hits']['hits']
        for hit in hits:
            newbody = updatefunc(hit['_source'])
            if not newbody:
                print('skipping item')
                continue
            updated = es.index(index=hit['_index'],
                               doc_type=hit['_type'],
                               id=hit['_id'],
                               body=newbody)
            # An in-place replacement must never create a new doc.
            assert updated['created'] == False
Example #21
0
def index_collection(collection):
    """Index a collection's metadata along with aggregate statistics.

    Deleted collections are removed from the index instead. The
    statistics (entity counts per schema, top countries and languages)
    are computed by querying the entities index itself.
    """
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'summary': collection.summary,
        'category': collection.category,
        'countries': collection.countries,
        'languages': collection.languages,
        'managed': collection.managed,
        'roles': collection.roles,
        'schemata': {},
    }

    # Free-text snippets that make the collection findable by search.
    texts = [
        collection.label, collection.foreign_id, collection.summary,
        collection.category
    ]

    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)

    # Compute some statistics on the content of a collection.
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [{
                    'term': {
                        'collection_id': collection.id
                    }
                }, {
                    'term': {
                        'schemata': Entity.THING
                    }
                }]
            }
        },
        'aggs': {
            'schema': {
                'terms': {
                    'field': 'schema',
                    'size': 1000
                }
            },
            'countries': {
                'terms': {
                    'field': 'countries',
                    'size': 500
                }
            },
            'languages': {
                'terms': {
                    'field': 'languages',
                    'size': 100
                }
            },
        }
    }
    result = es.search(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']

    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']

    # if no countries or langs are given, take the most common from the data.
    if not data.get('countries'):
        countries = aggregations['countries']['buckets']
        data['countries'] = [c['key'] for c in countries]

    if not data.get('languages'):
        # NOTE(review): variable name re-used from the branch above;
        # this actually holds language buckets, not countries.
        countries = aggregations['languages']['buckets']
        data['languages'] = [c['key'] for c in countries]

    # Index both the raw texts and their normalised (match) forms.
    texts.extend([match_form(t) for t in texts])
    data['text'] = index_form(texts)
    es.index(index=collection_index(),
             doc_type='doc',
             id=collection.id,
             body=data)
Example #22
0
def index_document(document):
    """Serialise a document's metadata and tags into the search index.

    Pending documents (still being ingested) and documents of type
    TYPE_OTHER are skipped. Returns the indexed body with 'id' and
    '$type' added, or None when the document was skipped.
    """
    if document.status == Document.STATUS_PENDING:
        return

    # FIXME: TYPE_OTHER documents are silently excluded from search.
    if document.type == Document.TYPE_OTHER:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    data = {
        'schema': document.SCHEMA,
        'schemata': [document.SCHEMA],
        'collection_id': document.collection_id,
        # Roles of the owning collection drive access control filters.
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name_sort': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        '$children': document.children.count(),
        'text': index_form(document.text_parts())
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }

    # Map extracted tags onto index fields via TAG_FIELDS; tags of an
    # unknown type are logged and dropped.
    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)

    index_names(data)
    es.index(index=es_index, doc_type=TYPE_DOCUMENT, body=data, id=document.id)
    data['id'] = document.id
    data['$type'] = TYPE_DOCUMENT
    return data