def index_collection(collection):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'summary': collection.summary,
        'category': collection.category,
        'countries': collection.countries,
        'languages': collection.languages,
        'managed': collection.managed,
        'roles': collection.roles
    }
    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
    data.update(get_collection_stats(collection.id))
    es.index(index=es_index,
             doc_type=TYPE_COLLECTION,
             id=collection.id,
             body=data)
def index_lead(lead):
    """Index a lead."""
    # Derive a stable id from the entity/match pair so that re-indexing the
    # same lead overwrites the existing document instead of duplicating it.
    hash_sum = sha1()
    hash_sum.update(lead.get('entity_id') or '')
    hash_sum.update(lead.get('match_id') or '')
    lead_id = hash_sum.hexdigest()
    es.index(index=es_index,
             doc_type=TYPE_LEAD,
             id=lead_id,
             body=lead)
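# Hedged sketch of the id derivation used by index_lead() above: a SHA-1 over
# the entity/match pair gives a deterministic document id. The .encode() calls
# are an assumption for Python 3 (hashlib wants bytes); the original snippet
# feeds the strings in directly.
from hashlib import sha1

def lead_id_sketch(entity_id, match_id):
    hash_sum = sha1()
    hash_sum.update((entity_id or '').encode('utf-8'))
    hash_sum.update((match_id or '').encode('utf-8'))
    return hash_sum.hexdigest()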
def index_entity(entity):
    """Index an entity."""
    data = entity.to_index()
    data.pop('id', None)
    data['doc_count'] = get_count(entity)
    data = finalize_index(data, entity.schema)
    es.index(index=es_index,
             doc_type=TYPE_ENTITY,
             id=entity.id,
             body=data)
def index_document(document_id):
    clear_session()
    document = Document.by_id(document_id)
    if document is None:
        log.info("Could not find document: %r", document_id)
        return
    log.info("Index document: %r", document)
    data = document.to_dict()
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index,
             doc_type=TYPE_DOCUMENT,
             body=data,
             id=document.id)
    clear_children(document)
    try:
        if document.type == Document.TYPE_TEXT:
            bulk(es, generate_pages(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
        if document.type == Document.TYPE_TABULAR:
            bulk(es, generate_records(document), stats_only=True,
                 chunk_size=2000, request_timeout=60.0)
    except Exception as ex:
        log.exception(ex)
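# Hedged sketch of what generate_pages()/generate_records() above are assumed
# to yield: one action per page or record in the elasticsearch.helpers.bulk()
# action format. TYPE_RECORD, the field names and the page attributes are
# illustrative assumptions, not taken from the original code.
def generate_pages_sketch(document):
    for page in document.pages:
        yield {
            '_index': es_index,
            '_type': TYPE_RECORD,
            '_id': '%s:%s' % (document.id, page.number),
            '_source': {
                'document_id': document.id,
                'page': page.number,
                'text': page.text,
            },
        }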
def index_entity(entity):
    """Index an entity."""
    if entity.deleted_at is not None:
        return delete_entity(entity.id)

    data = {
        'foreign_ids': entity.foreign_ids,
        'data': entity.data,
        'created_at': entity.created_at,
        'updated_at': entity.updated_at,
        '$bulk': False,
        'roles': entity.collection.roles,
        'collection_id': entity.collection_id,
        'properties': {
            'name': [entity.name]
        }
    }
    for k, v in entity.data.items():
        data['properties'][k] = ensure_list(v)
    # data['$documents'] = get_count(entity)
    data = finalize_index(data, entity.schema)
    es.index(index=es_index,
             doc_type=TYPE_ENTITY,
             id=entity.id,
             body=data)
    data['id'] = entity.id
    data['$type'] = TYPE_ENTITY
    return data
def index_safe(index, id, body):
    """Index a single document and retry until it has been stored."""
    for attempt in count():
        try:
            es.index(index=index, doc_type='doc', id=str(id), body=body)
            body['id'] = str(id)
            return body
        except Exception as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff_cluster(failures=attempt)
def index_doc(index, id, body):
    """Index a single document and retry until it has been stored."""
    while True:
        try:
            es.index(index=index, doc_type='doc', id=str(id), body=body)
            body['id'] = str(id)
            return body
        except TransportError as terr:
            log.warning("Index error [%s:%s]: %s", index, id, terr)
            time.sleep(RETRY_DELAY)
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body['id'] = str(id)
            body.pop('text', None)
            return body
        except TransportError as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
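# Hedged sketch of the retry helpers used by index_safe() above; the real
# service_retries() and backoff() are defined elsewhere. This version simply
# caps the number of attempts and sleeps with a linearly growing delay.
import time

def service_retries(max_attempts=5):
    return range(max_attempts)

def backoff(failures=0, base=1.0):
    time.sleep(base * (failures + 1))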
def index_single(obj, data, texts):
    """Indexing aspects common to entities and documents."""
    data['bulk'] = False
    data['roles'] = obj.collection.roles
    data['collection_id'] = obj.collection.id
    data['created_at'] = obj.created_at
    data['updated_at'] = obj.updated_at
    data = finalize_index(data, obj.model, texts)
    data = clean_dict(data)
    es.index(index=entity_index(),
             doc_type='doc',
             id=str(obj.id),
             body=data)
    data['id'] = str(obj.id)
    return data
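# Hedged sketch of the clean_dict() helper used by index_single() above: it is
# assumed to drop None values and empty containers recursively so they are not
# written into the index. The real implementation may differ.
def clean_dict(data):
    if not isinstance(data, dict):
        return data
    cleaned = {}
    for key, value in data.items():
        value = clean_dict(value)
        if value is None or value == [] or value == {}:
            continue
        cleaned[key] = value
    return cleaned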
def index_document(document, index_records=True):
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['text'] = get_text(document)
    data['entities'] = generate_entities(document)
    data['title_latin'] = latinize_text(data.get('title'))
    data['summary_latin'] = latinize_text(data.get('summary'))
    es.index(index=es_index,
             doc_type=TYPE_DOCUMENT,
             body=data,
             id=document.id)
    if index_records:
        clear_records(document.id)
        bulk_op(generate_records(document))
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in service_retries():
        try:
            es.index(index=index, id=id, body=body, **kwargs)
            body["id"] = str(id)
            body.pop("text", None)
            return body
        except TransportError as exc:
            # Client errors (bad request, forbidden) will not succeed on a
            # retry, so re-raise them immediately. status_code is an integer.
            if exc.status_code in (400, 403):
                raise
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff(failures=attempt)
def index_safe(index, id, body, **kwargs):
    """Index a single document and retry until it has been stored."""
    for attempt in range(REQUEST_RETRIES):
        try:
            es.index(index=index, doc_type='doc', id=id, body=body, **kwargs)
            body['id'] = str(id)
            body.pop('text', None)
            return body
        except RequestError:
            raise
        except Exception as exc:
            log.warning("Index error [%s:%s]: %s", index, id, exc)
            backoff_cluster(failures=attempt)
def index_document(document):
    if document.status == Document.STATUS_PENDING:
        return
    log.info("Index document: %r", document)
    data = document.to_index_dict()
    data['entities'] = []
    for entity_id, collection_id in Reference.index_references(document.id):
        data['entities'].append({
            'id': entity_id,
            'collection_id': collection_id
        })
    es.index(index=es_index,
             doc_type=TYPE_DOCUMENT,
             body=data,
             id=document.id)
def index_package(package, plain_text, normalized_text):
    es.json_encoder = JSONEncoder
    body = {
        'id': package.id,
        'collection': package.collection
    }
    source = package.source
    if source is None:
        log.error("No source for package %r, skipping", package)
        return
    body['name'] = source.meta.get('name')
    body['slug'] = source.meta.get('slug')
    body['title'] = source.meta.get('title') or body['name']
    body['source_url'] = source.meta.get('source_url')
    body['created_at'] = source.meta.get('created_at')
    body['updated_at'] = source.meta.get('updated_at')
    body['filed_at'] = source.meta.get('filed_at')
    body['extension'] = source.meta.get('extension')
    body['mime_type'] = source.meta.get('mime_type')
    if plain_text.exists():
        body['text'] = plain_text.fh().read()
    summary = source.meta.get('summary') or body.get('text')
    body['summary'] = html_summary(summary)
    if normalized_text.exists():
        body['normalized'] = normalized_text.fh().read()
    if not body['title']:
        log.error("No title for package %r, skipping", package)
        return
    body['entities'] = EntityTag.by_package(package.collection, package.id)
    body['attributes'] = generate_attributes(source.meta)
    log.info("Indexing: %r", body['title'])
    es.index(es_index, DOC_TYPE, body, package.id)
def rebuild_test_index(n=100):
    n = int(n)
    assert aleph.core.es_index == 'aleph_dev'
    aleph.search.delete_index()
    aleph.search.init_search()
    docswanted = n
    perpage = 10
    for offset in range(0, docswanted, perpage):
        for i, result in enumerate(random_docs(howmany=perpage, offset=offset)):
            new = es.index(
                index='aleph_dev',
                doc_type=result['_type'],
                body=result['_source'],
            )
            assert new['created']
        print('created %s docs' % (i + 1))
def replace_es(query, updatefunc, index='aleph_test', howmany=10):
    perpage = 50
    start = 522050
    for offset in range(start, howmany, perpage):
        print('# %s' % offset)
        results = es.search(index=index, body=query,
                            from_=offset, size=min(perpage, howmany))
        for result in results['hits']['hits']:
            newbody = updatefunc(result['_source'])
            if not newbody:
                print('skipping item')
                continue
            updated = es.index(index=result['_index'],
                               doc_type=result['_type'],
                               id=result['_id'],
                               body=newbody)
            assert not updated['created']
def index_collection(collection):
    """Index a collection."""
    if collection.deleted_at is not None:
        return delete_collection(collection.id)

    data = {
        'foreign_id': collection.foreign_id,
        'created_at': collection.created_at,
        'updated_at': collection.updated_at,
        'label': collection.label,
        'summary': collection.summary,
        'category': collection.category,
        'countries': collection.countries,
        'languages': collection.languages,
        'managed': collection.managed,
        'roles': collection.roles,
        'schemata': {},
    }
    texts = [
        collection.label,
        collection.foreign_id,
        collection.summary,
        collection.category
    ]
    if collection.creator is not None:
        data['creator'] = {
            'id': collection.creator.id,
            'type': collection.creator.type,
            'name': collection.creator.name
        }
        texts.append(collection.creator.name)

    # Compute some statistics on the content of a collection.
    query = {
        'size': 0,
        'query': {
            'bool': {
                'filter': [
                    {'term': {'collection_id': collection.id}},
                    {'term': {'schemata': Entity.THING}}
                ]
            }
        },
        'aggs': {
            'schema': {'terms': {'field': 'schema', 'size': 1000}},
            'countries': {'terms': {'field': 'countries', 'size': 500}},
            'languages': {'terms': {'field': 'languages', 'size': 100}},
        }
    }
    result = es.search(index=entities_index(), body=query)
    aggregations = result.get('aggregations')
    data['count'] = result['hits']['total']

    # expose entities by schema count.
    for schema in aggregations['schema']['buckets']:
        data['schemata'][schema['key']] = schema['doc_count']

    # if no countries or langs are given, take the most common from the data.
    if not data.get('countries'):
        countries = aggregations['countries']['buckets']
        data['countries'] = [c['key'] for c in countries]

    if not data.get('languages'):
        languages = aggregations['languages']['buckets']
        data['languages'] = [l['key'] for l in languages]

    texts.extend([match_form(t) for t in texts])
    data['text'] = index_form(texts)
    es.index(index=collection_index(),
             doc_type='doc',
             id=collection.id,
             body=data)
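# Hedged usage sketch: the aggregation that index_collection() above runs
# while indexing can also be issued on its own to inspect a collection's
# contents. es and entities_index() are the same names used in the function
# above; collection_schema_counts() itself is illustrative.
def collection_schema_counts(collection_id):
    query = {
        'size': 0,
        'query': {'term': {'collection_id': collection_id}},
        'aggs': {'schema': {'terms': {'field': 'schema', 'size': 1000}}},
    }
    result = es.search(index=entities_index(), body=query)
    buckets = result['aggregations']['schema']['buckets']
    return {b['key']: b['doc_count'] for b in buckets}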
def index_document(document):
    if document.status == Document.STATUS_PENDING:
        return

    # FIXME:
    if document.type == Document.TYPE_OTHER:
        return

    log.info("Index document [%s]: %s", document.id, document.title)
    data = {
        'schema': document.SCHEMA,
        'schemata': [document.SCHEMA],
        'collection_id': document.collection_id,
        'roles': document.collection.roles,
        'type': document.type,
        'status': document.status,
        'content_hash': document.content_hash,
        'foreign_id': document.foreign_id,
        'error_message': document.error_message,
        'uploader_id': document.uploader_id,
        'created_at': document.created_at,
        'updated_at': document.updated_at,
        'title': document.title,
        'name_sort': document.title,
        'summary': document.summary,
        'author': document.author,
        'file_size': document.file_size,
        'file_name': document.file_title,
        'source_url': document.source_url,
        'languages': document.languages,
        'countries': document.countries,
        'keywords': document.keywords,
        'dates': document.dates,
        'extension': document.extension,
        'encoding': document.encoding,
        'mime_type': document.mime_type,
        'pdf_version': document.pdf_version,
        'columns': document.columns,
        '$children': document.children.count(),
        'text': index_form(document.text_parts())
    }
    if document.parent_id is not None:
        data['parent'] = {
            'id': document.parent_id,
            'type': document.parent.type,
            'title': document.parent.title,
        }

    q = db.session.query(DocumentTag)
    q = q.filter(DocumentTag.document_id == document.id)
    for tag in q.yield_per(5000):
        field = TAG_FIELDS.get(tag.type)
        if field is None:
            log.warning("Cannot index document tag: %r", tag)
            continue
        if field not in data:
            data[field] = []
        data[field].append(tag.text)

    index_names(data)
    es.index(index=es_index,
             doc_type=TYPE_DOCUMENT,
             body=data,
             id=document.id)
    data['id'] = document.id
    data['$type'] = TYPE_DOCUMENT
    return data
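# Hedged sketch of the TAG_FIELDS lookup used by index_document() above: a
# mapping from DocumentTag type constants to the index field that collects the
# tag text. The concrete keys and field names here are illustrative
# assumptions, not taken from the original code.
TAG_FIELDS_SKETCH = {
    'person': 'people',
    'organization': 'organizations',
    'email': 'emails',
    'phone': 'phones',
}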