Beispiel #1
0
def delete_source(source_id):
    """Bulk-delete the indexed records and documents of one source.

    Hits are found with a term query on ``source_id``. Record hits are
    turned into delete actions first (forwarding each hit's ``_parent``
    value), then document hits follow.
    """
    source_query = {
        'query': {'term': {'source_id': source_id}},
        '_source': False,
    }

    def deletes():
        # First pass: records. The delete action carries the hit's _parent.
        record_hits = scan(get_es(),
                           query=source_query,
                           index=get_es_index(),
                           doc_type=[TYPE_RECORD])
        for hit in record_hits:
            action = {'_op_type': 'delete', '_index': get_es_index()}
            action['_parent'] = hit.get('_parent')
            action['_type'] = hit.get('_type')
            action['_id'] = hit.get('_id')
            yield action

        # Second pass: documents, which are deleted without a _parent key.
        document_hits = scan(get_es(),
                             query=source_query,
                             index=get_es_index(),
                             doc_type=[TYPE_DOCUMENT])
        for hit in document_hits:
            action = {'_op_type': 'delete', '_index': get_es_index()}
            action['_type'] = hit.get('_type')
            action['_id'] = hit.get('_id')
            yield action

    bulk_op(deletes())
Beispiel #2
0
def index_items(entities, links):
    """Bulk-index entities and links, retrying until the bulk call succeeds.

    A ``BulkIndexError`` is logged as a warning and the whole operation is
    attempted again after a ten second pause. There is no retry limit.
    """
    indexed = False
    while not indexed:
        try:
            bulk_op(_index_updates(entities, links))
        except BulkIndexError as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)
        else:
            indexed = True
Beispiel #3
0
def delete_entity_references(entity_id):
    """Run the bulk actions of ``document_updates`` for one entity.

    A term query on ``entities.id`` is handed to ``document_updates``
    together with the entity id; the resulting actions go through
    ``bulk_op`` and ``flush_index`` is called afterwards.

    NOTE(review): presumably the updates strip the entity from every
    matching document -- confirm against ``document_updates``.
    """
    matching = {'query': {'term': {'entities.id': entity_id}}}
    updates = document_updates(matching, entity_id)
    bulk_op(updates)
    flush_index()
def clear_records(document_id):
    """Bulk-run the deletes generated for ``document_id`` until they succeed.

    Each ``BulkIndexError`` is logged as a warning, followed by a ten second
    sleep and another attempt. The loop has no upper bound.
    """
    while True:
        try:
            bulk_op(generate_deletes(document_id))
        except BulkIndexError as exc:
            log.warning('Clear records error: %s', exc)
            time.sleep(10)
        else:
            return
def index_records(document):
    """Clear a document's records, then bulk-index freshly generated ones.

    The bulk step is repeated (after a warning and a ten second sleep) for
    as long as it raises ``BulkIndexError``.
    """
    clear_records(document.id)
    finished = False
    while not finished:
        try:
            bulk_op(generate_records(document))
            finished = True
        except BulkIndexError as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)
Beispiel #6
0
def index_bulk(collection, entities, chunk_size=500):
    """Bulk-index a collection's entities, retrying on indexing errors.

    ``chunk_size`` is forwarded to ``bulk_op``. ``BulkIndexError`` and
    ``TransportError`` are logged as warnings and retried after a ten second
    sleep, without any retry limit.
    """
    while True:
        try:
            bulk_op(_index_updates(collection, entities),
                    chunk_size=chunk_size)
        except (BulkIndexError, TransportError) as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)
        else:
            return
Beispiel #7
0
def index_bulk(collection, entities, chunk_size=200):
    """Index a set of entities, backing off and retrying on failure.

    The bulk actions are produced by ``_index_updates(collection, entities)``
    and sent through ``bulk_op`` with the given ``chunk_size``. When
    ``BulkIndexError`` or ``TransportError`` is raised, the error is logged
    as a warning, ``backoff_cluster`` is called with the number of the
    failed attempt, and the operation is tried again. There is no retry
    limit.

    NOTE(review): if ``entities`` is a one-shot iterator, a retry can only
    see the items that were not consumed by the failed attempt.
    """
    for attempt in count():
        try:
            # Keep the actions under their own name. Rebinding ``entities``
            # here would make every retry feed the previous attempt's
            # actions back into ``_index_updates`` instead of the caller's
            # entities.
            actions = _index_updates(collection, entities)
            bulk_op(actions, chunk_size=chunk_size)
            return
        except (BulkIndexError, TransportError) as exc:
            log.warning('Indexing error: %s', exc)
        backoff_cluster(failures=attempt)
Beispiel #8
0
def index_records(document):
    """Re-index a document's records if the document supports them.

    Does nothing when ``document.supports_records`` is falsy. Otherwise the
    existing records are cleared and the generated records are bulk-indexed;
    a ``BulkIndexError`` is logged with its traceback and the bulk step is
    repeated after sleeping ``RETRY_DELAY``.
    """
    if document.supports_records:
        clear_records(document.id)
        pending = True
        while pending:
            try:
                bulk_op(generate_records(document))
                pending = False
            except BulkIndexError as exc:
                log.exception(exc)
                time.sleep(RETRY_DELAY)
Beispiel #9
0
def index_records(document):
    """Re-index a document's records when ``document.has_records()`` is true.

    Existing records are cleared first. The bulk indexing step is retried
    after a warning and a ten second sleep whenever it raises
    ``BulkIndexError``.
    """
    if document.has_records():
        clear_records(document.id)
        while True:
            try:
                bulk_op(generate_records(document))
            except BulkIndexError as exc:
                log.warning('Indexing error: %s', exc)
                time.sleep(10)
            else:
                break
Beispiel #10
0
def index_document(document, index_records=True):
    """Write one document to the index, optionally along with its records.

    The indexed body is ``document.to_index_dict()`` extended with the
    generated entities and the ``latinize_text`` results for title and
    summary. Unless ``index_records`` is false, the document's records are
    then cleared and bulk-indexed again.
    """
    log.info("Index document: %r", document)
    body = document.to_index_dict()
    body['entities'] = generate_entities(document)
    for target, source in (('title_latin', 'title'),
                           ('summary_latin', 'summary')):
        body[target] = latinize_text(body.get(source))

    client = get_es()
    client.index(index=get_es_index(),
                 doc_type=TYPE_DOCUMENT,
                 body=body,
                 id=document.id)

    if not index_records:
        return
    clear_records(document.id)
    bulk_op(generate_records(document))
Beispiel #11
0
def index_document(document, index_records=True):
    """Index a document's body and, if requested, rebuild its records.

    The body starts from ``document.to_index_dict()`` and gains the text
    from ``get_text``, the generated entities, and ``latinize_text`` results
    for the title and summary before it is sent to ``es.index``.
    """
    log.info("Index document: %r", document)

    doc = document.to_index_dict()
    doc['text'] = get_text(document)
    doc['entities'] = generate_entities(document)
    title = doc.get('title')
    doc['title_latin'] = latinize_text(title)
    summary = doc.get('summary')
    doc['summary_latin'] = latinize_text(summary)

    es.index(
        index=es_index,
        doc_type=TYPE_DOCUMENT,
        body=doc,
        id=document.id
    )

    # Records are only rebuilt when the caller asks for it.
    if index_records:
        clear_records(document.id)
        record_actions = generate_records(document)
        bulk_op(record_actions)
Beispiel #12
0
def index_records(document):
    """Rebuild the indexed records of a document that supports records.

    After clearing the old records, the generated records are bulk-indexed
    and the records index is refreshed. Any ``Exception`` raised by these
    two steps is logged as a warning, ``backoff_cluster`` is called with the
    number of the failed attempt, and the steps are tried again.
    """
    if document.supports_records:
        clear_records(document.id)
        for attempt in count():
            try:
                bulk_op(generate_records(document))
                refresh_index(records_index())
            except Exception as exc:
                log.warning('Failed to index records: %s', exc)
            else:
                return
            backoff_cluster(failures=attempt)
Beispiel #13
0
def update_entity_references(entity, max_query=1000):
    """Run bulk document updates for the documents referencing ``entity``.

    Document ids are read from ``Reference`` rows that point at the entity
    while the entity is in ``Entity.STATE_ACTIVE``. They are then processed
    in slices of at most ``max_query`` ids, each slice becoming one ``ids``
    query handed to ``document_updates``. ``flush_index`` runs at the end.
    """
    reference_rows = (
        db.session.query(Reference.document_id)
        .filter(Reference.entity_id == entity.id)
        .filter(Entity.id == Reference.entity_id)
        .filter(Entity.state == Entity.STATE_ACTIVE)
    )
    doc_ids = [str(row.document_id) for row in reference_rows]

    for start in range(0, len(doc_ids), max_query):
        batch = doc_ids[start:start + max_query]
        ids_query = {'query': {'ids': {'values': batch}}}
        bulk_op(document_updates(ids_query, entity.id, entity.collection_id))

    flush_index()
Beispiel #14
0
def index_document(document_id):
    """Load a document by id, index its body and rebuild its records.

    When ``Document.by_id`` returns ``None`` the miss is logged and nothing
    is indexed.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Index document: %r", document)
    body = document.to_index_dict()
    body['entities'] = generate_entities(document)
    for target, source in (('title_latin', 'title'),
                           ('summary_latin', 'summary')):
        body[target] = latinize_text(body.get(source))

    client = get_es()
    client.index(index=get_es_index(),
                 doc_type=TYPE_DOCUMENT,
                 body=body,
                 id=document.id)

    # Replace the document's records: clear first, then bulk-index anew.
    clear_records(document)
    record_actions = generate_records(document)
    bulk_op(record_actions)
Beispiel #15
0
def delete_collection(collection_id):
    """Bulk-delete every indexed item whose ``collection_id`` matches.

    ``flush_index`` is called both before and after the scan-driven bulk
    delete.
    """
    collection_query = {
        'query': {'term': {'collection_id': collection_id}},
        '_source': False,
    }

    def deletes():
        # One delete action per scan hit, addressed by the hit's type and id.
        for hit in scan(es, query=collection_query, index=es_index):
            action = {'_op_type': 'delete'}
            action['_index'] = six.text_type(es_index)
            action['_type'] = hit.get('_type')
            action['_id'] = hit.get('_id')
            yield action

    flush_index()
    bulk_op(deletes())
    flush_index()
Beispiel #16
0
def index_records(document):
    """Rebuild the indexed records of a document that supports records.

    A ``TransportError`` while clearing the old records is logged and
    otherwise ignored. Bulk indexing of the generated records is repeated
    every ten seconds for as long as it raises ``BulkIndexError``.
    """
    if not document.supports_records:
        return

    # Clearing is best effort: a transport failure must not stop indexing.
    try:
        clear_records(document.id)
    except TransportError as terr:
        log.exception(terr)

    indexed = False
    while not indexed:
        try:
            bulk_op(generate_records(document))
        except BulkIndexError as exc:
            log.exception(exc)
            time.sleep(10)
        else:
            indexed = True
Beispiel #17
0
def update_entity_references(entity_id, max_query=1000):
    """Run bulk document updates for all documents referencing an entity.

    Queries the document ids of ``Reference`` rows for ``entity_id`` while
    the entity is in ``Entity.STATE_ACTIVE``, joined with the collection ids
    that ``collection_entity_table`` lists for the entity. The result is
    grouped into a mapping of document id (as string) to collection ids.
    The document ids are then processed in slices of at most ``max_query``,
    each slice becoming one ``ids`` query passed to ``document_updates``
    along with the entity id and the full mapping. ``flush_es`` is called
    once all slices have been processed.

    :param entity_id: id of the entity whose referencing documents are
        updated.
    :param max_query: maximum number of document ids per ``ids`` query.
    """
    q = db.session.query(Reference.document_id)
    q = q.filter(Reference.entity_id == entity_id)
    q = q.filter(Entity.id == entity_id)
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    q = q.filter(collection_entity_table.c.entity_id == Entity.id)
    q = q.add_column(collection_entity_table.c.collection_id)
    references = defaultdict(list)
    for row in q:
        references[str(row.document_id)].append(row.collection_id)

    # Materialise the keys as a list. On Python 3 ``dict.keys()`` returns a
    # view that cannot be sliced, so slicing it below would raise TypeError.
    ids = list(references)
    for i in range(0, len(ids), max_query):
        es_query = {'query': {'ids': {'values': ids[i:i + max_query]}}}
        bulk_op(document_updates(es_query, entity_id, references))
    flush_es()
Beispiel #18
0
def index_document(document_id):
    """Index the document with the given id and re-index its records.

    Logs and returns early if ``Document.by_id`` finds nothing. Otherwise
    the document's index dict is enriched with generated entities and
    ``latinize_text`` results for title and summary, written to the index,
    and the document's records are cleared and bulk-indexed again.
    """
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return

    log.info("Index document: %r", document)

    payload = document.to_index_dict()
    payload['entities'] = generate_entities(document)
    title = payload.get('title')
    payload['title_latin'] = latinize_text(title)
    summary = payload.get('summary')
    payload['summary_latin'] = latinize_text(summary)

    get_es().index(
        index=get_es_index(),
        doc_type=TYPE_DOCUMENT,
        body=payload,
        id=document.id
    )

    clear_records(document)
    bulk_op(generate_records(document))
Beispiel #19
0
def delete_collection(collection_id):
    """Bulk-delete a collection's records, documents and entities.

    ``flush_index`` is called first; afterwards every scan hit of the three
    types that matches the ``collection_id`` term query is turned into a
    delete action which forwards the hit's ``_parent`` value.
    """
    collection_query = {
        'query': {'term': {'collection_id': collection_id}},
        '_source': False,
    }

    def deletes():
        doc_types = [TYPE_RECORD, TYPE_DOCUMENT, TYPE_ENTITY]
        hits = scan(es,
                    query=collection_query,
                    index=es_index,
                    doc_type=doc_types)
        for hit in hits:
            action = {'_op_type': 'delete'}
            action['_index'] = six.text_type(es_index)
            action['_parent'] = hit.get('_parent')
            action['_type'] = hit.get('_type')
            action['_id'] = hit.get('_id')
            yield action

    flush_index()
    bulk_op(deletes())
Beispiel #20
0
def delete_collection(collection_id):
    """Bulk-delete the records, documents and entities of one collection.

    Calls ``flush_es`` and then feeds ``bulk_op`` with one delete action per
    scan hit matching the ``collection_id`` term query.
    """
    by_collection = {
        'query': {'term': {'collection_id': collection_id}},
        '_source': False,
    }

    def deletes():
        hits = scan(get_es(),
                    query=by_collection,
                    index=get_es_index(),
                    doc_type=[TYPE_RECORD, TYPE_DOCUMENT, TYPE_ENTITY])
        for hit in hits:
            # The hit's _parent value is forwarded on the delete action.
            action = {'_op_type': 'delete', '_index': get_es_index()}
            action['_parent'] = hit.get('_parent')
            action['_type'] = hit.get('_type')
            action['_id'] = hit.get('_id')
            yield action

    flush_es()
    bulk_op(deletes())
Beispiel #21
0
def delete_dataset(dataset_name):
    """Bulk-delete the links, entities and leads of one dataset.

    The index is refreshed before the scan-driven bulk delete starts.
    Progress is logged each time the zero-based hit counter reaches a
    non-zero multiple of 10000.
    """
    dataset_query = {
        'query': {'term': {'dataset': dataset_name}},
        '_source': False,
    }

    def deletes():
        hits = scan(es,
                    query=dataset_query,
                    index=es_index,
                    doc_type=[TYPE_LINK, TYPE_ENTITY, TYPE_LEAD])
        for position, hit in enumerate(hits):
            action = {'_op_type': 'delete'}
            action['_index'] = str(es_index)
            action['_type'] = hit.get('_type')
            action['_id'] = hit.get('_id')
            yield action
            # Progress is reported once the consumer asks for the next item.
            if position and not position % 10000:
                log.info("Delete %s: %s", dataset_name, position)

    es.indices.refresh(index=es_index)
    bulk_op(deletes())
Beispiel #22
0
def delete_source(source_id):
    """Bulk-delete all indexed records, then documents, of a source.

    Both passes use the same ``source_id`` term query. Only the record
    deletes carry a ``_parent`` key, taken from the scan hit.
    """
    source_query = {
        'query': {'term': {'source_id': source_id}},
        '_source': False,
    }

    def deletes():
        # (doc type, whether the delete action forwards the hit's _parent)
        passes = ((TYPE_RECORD, True), (TYPE_DOCUMENT, False))
        for doc_type, forward_parent in passes:
            hits = scan(get_es(), query=source_query, index=get_es_index(),
                        doc_type=[doc_type])
            for hit in hits:
                action = {'_op_type': 'delete', '_index': get_es_index()}
                if forward_parent:
                    action['_parent'] = hit.get('_parent')
                action['_type'] = hit.get('_type')
                action['_id'] = hit.get('_id')
                yield action

    bulk_op(deletes())
Beispiel #23
0
def delete_entity_references(entity_id):
    """Bulk-run ``document_updates`` for one entity, then call ``flush_es``.

    The updates are driven by a term query on ``entities.id``.
    NOTE(review): presumably they remove the entity from each matching
    document -- confirm against ``document_updates``.
    """
    referencing = {'query': {'term': {'entities.id': entity_id}}}
    updates = document_updates(referencing, entity_id)
    bulk_op(updates)
    flush_es()