def delete_source(source_id):
    """Delete all documents from a particular source."""
    q = {'query': {'term': {'source_id': source_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_DOCUMENT]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    bulk_op(deletes())

def index_items(entities, links):
    """Index a set of links or entities."""
    while True:
        try:
            bulk_op(_index_updates(entities, links))
            break
        except BulkIndexError as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)

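# `bulk_op` is used by every variant in this file but never shown. A minimal
# sketch, assuming it is a thin wrapper around elasticsearch.helpers.bulk()
# that streams an action generator to the cluster; the chunk size and the
# request timeout below are illustrative guesses, not values from the source.
from elasticsearch.helpers import bulk

def bulk_op(actions, chunk_size=500):
    # stats_only=True returns (success_count, error_count) instead of
    # per-item results; a BulkIndexError still propagates up to the
    # callers' retry loops above.
    bulk(get_es(), actions, stats_only=True, chunk_size=chunk_size,
         request_timeout=120.0)
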
def delete_entity_references(entity_id):
    """Remove an entity from the embedded references of indexed documents.

    This is used by the ``indexentities`` management command in order to
    clear out references to leftover entities in the index.
    """
    q = {'query': {'term': {'entities.id': entity_id}}}
    bulk_op(document_updates(q, entity_id))
    flush_index()

def clear_records(document_id):
    """Delete all records associated with the given document."""
    while True:
        try:
            bulk_op(generate_deletes(document_id))
            break
        except BulkIndexError as exc:
            log.warning('Clear records error: %s', exc)
            time.sleep(10)

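# `generate_deletes` is not shown either. A hypothetical reconstruction,
# mirroring the scan-and-yield pattern of delete_source() above: find every
# record child of the document and emit a bulk delete action for it. The
# 'document_id' field name is an assumption.
def generate_deletes(document_id):
    q = {'query': {'term': {'document_id': document_id}}, '_source': False}
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_RECORD]):
        yield {
            '_op_type': 'delete',
            '_index': get_es_index(),
            '_parent': res.get('_parent'),
            '_type': res.get('_type'),
            '_id': res.get('_id')
        }
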
def index_records(document):
    """Re-index all records of a document, clearing out the old ones first."""
    clear_records(document.id)
    while True:
        try:
            bulk_op(generate_records(document))
            break
        except BulkIndexError as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)

def index_bulk(collection, entities, chunk_size=500):
    """Index a set of entities."""
    while True:
        try:
            bulk_op(_index_updates(collection, entities),
                    chunk_size=chunk_size)
            break
        except (BulkIndexError, TransportError) as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)

def index_bulk(collection, entities, chunk_size=200):
    """Index a set of entities."""
    for attempt in count():
        try:
            # Re-generate the update actions on each attempt; rebinding
            # `entities` to the action generator would lose the original
            # items after a failed, partially-consumed pass.
            bulk_op(_index_updates(collection, entities),
                    chunk_size=chunk_size)
            return
        except (BulkIndexError, TransportError) as exc:
            log.warning('Indexing error: %s', exc)
            backoff_cluster(failures=attempt)

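# `backoff_cluster` is assumed to sleep for a growing interval as failures
# accumulate, giving an overloaded cluster room to recover. A minimal
# sketch; the exponential base and the 60-second cap are illustrative.
def backoff_cluster(failures=0):
    time.sleep(min(2 ** failures, 60))
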
def index_records(document):
    if not document.supports_records:
        return
    clear_records(document.id)
    while True:
        try:
            bulk_op(generate_records(document))
            return
        except BulkIndexError as exc:
            log.exception(exc)
            time.sleep(RETRY_DELAY)

def index_records(document):
    if not document.has_records():
        return
    clear_records(document.id)
    while True:
        try:
            bulk_op(generate_records(document))
            return
        except BulkIndexError as exc:
            log.warning('Indexing error: %s', exc)
            time.sleep(10)

def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))

def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['text'] = get_text(document)
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index, doc_type=TYPE_DOCUMENT,
             body=data, id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))

def index_records(document):
    if not document.supports_records:
        return
    clear_records(document.id)
    for attempt in count():
        try:
            bulk_op(generate_records(document))
            refresh_index(records_index())
            return
        except Exception as exc:
            log.warning('Failed to index records: %s', exc)
            backoff_cluster(failures=attempt)

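# `refresh_index` (like the flush_index()/flush_es() calls elsewhere in
# this file) is assumed to force an index refresh so that freshly written
# or deleted documents become visible to search immediately, rather than
# after Elasticsearch's periodic refresh. Sketch under that assumption;
# records_index() would return the name of the records index.
def refresh_index(index):
    get_es().indices.refresh(index=index)
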
def update_entity_references(entity, max_query=1000):
    """Same as above, but run in bulk for a particular entity."""
    q = db.session.query(Reference.document_id)
    q = q.filter(Reference.entity_id == entity.id)
    q = q.filter(Entity.id == Reference.entity_id)
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    documents = [str(r.document_id) for r in q]
    for i in range(0, len(documents), max_query):
        q = {'query': {'ids': {'values': documents[i:i + max_query]}}}
        bulk_op(document_updates(q, entity.id, entity.collection_id))
    flush_index()

def index_document(document_id):
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    get_es().index(index=get_es_index(), doc_type=TYPE_DOCUMENT,
                   body=data, id=document.id)
    clear_records(document.id)
    bulk_op(generate_records(document))

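# `generate_entities` is assumed to collect the active entities referenced
# by a document so they can be embedded in its index representation (the
# 'entities.id' term queries above match against exactly this field). A
# hypothetical reconstruction, written in the same query style as
# update_entity_references():
def generate_entities(document):
    entities = []
    q = db.session.query(Reference.entity_id)
    q = q.filter(Reference.document_id == document.id)
    q = q.filter(Entity.id == Reference.entity_id)
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    for ref in q.distinct():
        entities.append({'id': ref.entity_id})
    return entities
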
def delete_collection(collection_id):
    """Delete all documents from a particular collection."""
    q = {'query': {'term': {'collection_id': collection_id}}, '_source': False}

    def deletes():
        for res in scan(es, query=q, index=es_index):
            yield {
                '_op_type': 'delete',
                '_index': six.text_type(es_index),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    flush_index()
    bulk_op(deletes())
    flush_index()

def index_records(document):
    if not document.supports_records:
        return
    try:
        clear_records(document.id)
    except TransportError as terr:
        log.exception(terr)
    while True:
        try:
            bulk_op(generate_records(document))
            return
        except BulkIndexError as exc:
            log.exception(exc)
            time.sleep(10)

def update_entity_references(entity_id, max_query=1000):
    """Same as above, but run in bulk for a particular entity."""
    q = db.session.query(Reference.document_id)
    q = q.filter(Reference.entity_id == entity_id)
    q = q.filter(Entity.id == entity_id)
    q = q.filter(Entity.state == Entity.STATE_ACTIVE)
    q = q.filter(collection_entity_table.c.entity_id == Entity.id)
    q = q.add_column(collection_entity_table.c.collection_id)
    references = defaultdict(list)
    for row in q:
        references[str(row.document_id)].append(row.collection_id)
    # Materialize the keys so they can be sliced (dict views cannot be).
    ids = list(references.keys())
    for i in range(0, len(ids), max_query):
        q = {'query': {'ids': {'values': ids[i:i + max_query]}}}
        bulk_op(document_updates(q, entity_id, references))
    flush_es()

def delete_collection(collection_id):
    """Delete all documents from a particular collection."""
    q = {'query': {'term': {'collection_id': collection_id}}, '_source': False}

    def deletes():
        types = [TYPE_RECORD, TYPE_DOCUMENT, TYPE_ENTITY]
        for res in scan(es, query=q, index=es_index, doc_type=types):
            yield {
                '_op_type': 'delete',
                '_index': six.text_type(es_index),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    flush_index()
    bulk_op(deletes())

def delete_collection(collection_id):
    """Delete all documents from a particular collection."""
    q = {'query': {'term': {'collection_id': collection_id}}, '_source': False}

    def deletes():
        for res in scan(get_es(), query=q, index=get_es_index(),
                        doc_type=[TYPE_RECORD, TYPE_DOCUMENT, TYPE_ENTITY]):
            yield {
                '_op_type': 'delete',
                '_index': get_es_index(),
                '_parent': res.get('_parent'),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }

    flush_es()
    bulk_op(deletes())

def delete_dataset(dataset_name):
    """Delete all entries from a particular dataset."""
    q = {'query': {'term': {'dataset': dataset_name}}, '_source': False}

    def deletes():
        docs = scan(es, query=q, index=es_index,
                    doc_type=[TYPE_LINK, TYPE_ENTITY, TYPE_LEAD])
        for i, res in enumerate(docs):
            yield {
                '_op_type': 'delete',
                '_index': str(es_index),
                '_type': res.get('_type'),
                '_id': res.get('_id')
            }
            if i > 0 and i % 10000 == 0:
                log.info("Delete %s: %s", dataset_name, i)

    es.indices.refresh(index=es_index)
    bulk_op(deletes())

def delete_entity_references(entity_id):
    """Remove references to the given entity from all indexed documents."""
    q = {'query': {'term': {'entities.id': entity_id}}}
    bulk_op(document_updates(q, entity_id))
    flush_es()

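# `document_updates` is the helper every reference-update variant depends
# on. It is assumed to scan the documents matched by `q` and emit partial
# 'update' actions rewriting each document's embedded `entities` list. The
# sketch below covers only the deletion case used by this function; the
# removal logic is a guess at the real implementation, shown mainly for
# the shape of the bulk update action.
def document_updates(q, entity_id, collection_id=None):
    for res in scan(get_es(), query=q, index=get_es_index(),
                    doc_type=[TYPE_DOCUMENT]):
        source = res.get('_source', {})
        entities = [ent for ent in source.get('entities', [])
                    if ent.get('id') != entity_id]
        yield {
            '_op_type': 'update',
            '_index': get_es_index(),
            '_type': res.get('_type'),
            '_id': res.get('_id'),
            'doc': {'entities': entities}
        }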