def index_resources_by_type(resource_types, clear_index=True, index_name=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    index_name -- only applies to custom indexes and if given will try and just refresh the data in that index
    batch_size -- the number of records to index as a group, the larger the number the more memory required

    Return:
    'Passed' if the database count matched the index count for the last type indexed,
    'Failed' otherwise ('' if only a named custom index was refreshed)
    """
    status = ''
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # map of nodeid -> datatype computed once up front so each resource doesn't re-query it
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        if index_name is None:
            # restrict both the (optional) delete and the final count to this graph's documents
            q = Query(se=se)
            term = Term(field='graph_id', term=str(resource_type))
            q.add_query(term)
            if clear_index:
                q.delete(index='resources', refresh=True)

            # one "with" manages both bulk indexers; refresh=True flushes each on exit
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer, \
                    se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                for resource in resources:
                    document, terms = resource.get_documents_to_index(fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
                    doc_indexer.add(index='resources', id=document['resourceinstanceid'], data=document)
                    for term in terms:
                        term_indexer.add(index='terms', id=term['_id'], data=term['_source'])

            result_summary = {'database': len(resources), 'indexed': se.count(index='resources', body=q.dsl)}
            status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
            print("Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
                status, graph_name, result_summary['database'], result_summary['indexed'], (datetime.now() - start).seconds))

            # keep any registered custom indexes in sync as well
            for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(index['module'])(index['name'])
                es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
        else:
            # only refresh the single named custom index
            es_index = get_index(index_name)
            es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)

    return status
def test_bulk_indexer(self):
    """Bulk-index 1001 documents and verify they all show up in the index count."""
    engine = SearchEngineFactory().create()
    engine.create_index(index="bulk")
    documents = ({"id": n, "type": "prefLabel", "value": "test pref label"} for n in range(1001))
    with engine.BulkIndexer(batch_size=500, refresh=True) as indexer:
        for document in documents:
            indexer.add(index="bulk", id=document["id"], data=document)
    self.assertEqual(engine.count(index="bulk"), 1001)
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number the more memory required
    """
    start = datetime.now()
    print("Indexing resource to resource relations")

    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index="resource_relations")

    # column order here must match the SELECT column order below
    columns = (
        "resourcexid", "notes", "datestarted", "dateended", "relationshiptype",
        "resourceinstanceidfrom", "resourceinstanceidto", "modified", "created",
        "inverserelationshiptype", "tileid", "nodeid",
    )
    sql = """
        SELECT resourcexid, notes, datestarted, dateended, relationshiptype, resourceinstanceidfrom,
            resourceinstanceidto, modified, created, inverserelationshiptype, tileid, nodeid
        FROM public.resource_x_resource;
    """
    # use the cursor as a context manager so it is always closed, even on error
    with connection.cursor() as cursor:
        cursor.execute(sql)
        db_count = cursor.rowcount
        with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
            for resource_relation in cursor.fetchall():
                # build the document from the column names instead of 12 positional indexes
                doc = dict(zip(columns, resource_relation))
                resource_relations_indexer.add(index="resource_relations", id=doc["resourcexid"], data=doc)

    index_count = se.count(index="resource_relations")
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if db_count == index_count else "Failed",
                 db_count, index_count, (datetime.now() - start).seconds))
def test_bulk_indexer(self):
    """Push 1001 documents through the bulk indexer and confirm the index count."""
    engine = SearchEngineFactory().create()
    engine.create_index(index='bulk')
    with engine.BulkIndexer(batch_size=500, refresh=True) as indexer:
        for n in range(1001):
            record = {
                'id': n,
                'type': 'prefLabel',
                'value': 'test pref label',
            }
            indexer.add(index='bulk', id=record['id'], data=record)
    self.assertEqual(engine.count(index='bulk'), 1001)
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number the more memory required
    """
    start = datetime.now()
    # BUG FIX: the Python 2 `print "..."` statement is a SyntaxError under Python 3;
    # use the print() function, consistent with the rest of this module
    print("Indexing resource to resource relations")
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='resource_relations')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, resourceinstanceidfrom, notes, relationshiptype, resourceinstanceidto
            FROM public.resource_x_resource;
        """
        cursor = connection.cursor()
        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            doc = {
                'resourcexid': resource_relation[0],
                'resourceinstanceidfrom': resource_relation[1],
                'notes': resource_relation[2],
                'relationshiptype': resource_relation[3],
                'resourceinstanceidto': resource_relation[4]
            }
            resource_relations_indexer.add(index='resource_relations', doc_type='all', id=doc['resourcexid'], data=doc)

    index_count = se.es.count(index='resource_relations')['count']
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
        'Passed' if cursor.rowcount == index_count else 'Failed',
        cursor.rowcount, index_count, (datetime.now() - start).seconds))
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    start = datetime.now()
    print("Indexing concepts")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index="concepts")

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        # every valueid indexed is recorded here so orphans can be caught at the end
        indexed_values = []
        # 1) index the labels of Collections and ConceptSchemes; each is its own top concept
        for conceptValue in models.Value.objects.filter(
                Q(concept__nodetype="Collection") | Q(concept__nodetype="ConceptScheme"),
                valuetype__category="label"):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index="concepts", id=doc["id"], data=doc)
            indexed_values.append(doc["id"])

        # build a quoted, comma-joined list of the "label"-category value types
        # for interpolation into the raw SQL below (values come from the DValueType
        # table, not user input)
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(
                category="label").values_list("valuetype", flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # 2) for each top concept, recursively walk its narrower/hasTopConcept tree
        #    and index every label found, tagging each with its top concept id
        for conceptValue in models.Relation.objects.filter(
                relationtype="hasTopConcept"):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                        FROM relations d
                        JOIN values c ON(c.conceptid = d.conceptidto)
                        JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                        WHERE d.conceptidto = '{0}'
                        and c2.valuetype = 'prefLabel'
                        and c.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                    UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                        FROM relations  d
                        JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                        JOIN values v ON(v.conceptid = d.conceptidto)
                        JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                        WHERE v2.valuetype = 'prefLabel'
                        and v.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                # row order: valueid, value, conceptid, languageid, valuetype
                doc = {
                    "category": "label",
                    "conceptid": conceptValue[2],
                    "language": conceptValue[3],
                    "value": conceptValue[1],
                    "type": conceptValue[4],
                    "id": conceptValue[0],
                    "top_concept": topConcept,
                }
                concept_indexer.add(index="concepts", id=doc["id"], data=doc)
                indexed_values.append(doc["id"])

        # 3) we add this step to catch any concepts/values that are orphaned (have no parent concept)
        for conceptValue in models.Value.objects.filter(
                valuetype__category="label").exclude(
                valueid__in=indexed_values):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index="concepts", id=doc["id"], data=doc)

    # compare the database label count against the index count to report pass/fail
    cursor.execute(
        "SELECT count(*) from values WHERE valuetype in ({0})".format(
            valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index="concepts")
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if concept_count_in_db == index_count else "Failed",
                 concept_count_in_db, index_count,
                 (datetime.now() - start).seconds))
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number the more memory required
    """
    start = datetime.now()
    # BUG FIX: the Python 2 `print "..."` statement is a SyntaxError under Python 3;
    # use the print() function, consistent with the rest of this module
    print("Indexing concepts")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='concepts')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        # index the labels of Collections and ConceptSchemes; each is its own top concept
        for conceptValue in models.Value.objects.filter(
                Q(concept__nodetype='Collection') | Q(concept__nodetype='ConceptScheme'),
                valuetype__category='label'):
            doc = {
                'category': 'label',
                'conceptid': conceptValue.concept_id,
                'language': conceptValue.language_id,
                'value': conceptValue.value,
                'type': conceptValue.valuetype_id,
                'id': conceptValue.valueid,
                'top_concept': conceptValue.concept_id
            }
            concept_indexer.add(index='concepts', id=doc['id'], data=doc)

        # quoted, comma-joined list of "label" value types for the raw SQL below
        # (the unused `concept_strings` and `valueTypes2` locals were removed)
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(category='label').values_list('valuetype', flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # walk each concept tree from its top concept and index every label beneath it
        for conceptValue in models.Relation.objects.filter(relationtype='hasTopConcept'):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                        FROM relations d
                        JOIN values c ON(c.conceptid = d.conceptidto)
                        JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                        WHERE d.conceptidto = '{0}'
                        and c2.valuetype = 'prefLabel'
                        and c.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                    UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                        FROM relations  d
                        JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                        JOIN values v ON(v.conceptid = d.conceptidto)
                        JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                        WHERE v2.valuetype = 'prefLabel'
                        and v.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                # row order: valueid, value, conceptid, languageid, valuetype
                doc = {
                    'category': 'label',
                    'conceptid': conceptValue[2],
                    'language': conceptValue[3],
                    'value': conceptValue[1],
                    'type': conceptValue[4],
                    'id': conceptValue[0],
                    'top_concept': topConcept
                }
                concept_indexer.add(index='concepts', id=doc['id'], data=doc)

    # compare the database label count against the index count to report pass/fail
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index='concepts')
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
        'Passed' if concept_count_in_db == index_count else 'Failed',
        concept_count_in_db, index_count, (datetime.now() - start).seconds))
class BaseIndex(object):
    """Base class for custom Elasticsearch indexes; subclasses must implement get_documents_to_index."""

    def __init__(self, index_name=None):
        if index_name is None or index_name == "":
            raise SearchIndexError("Index name is not defined")

        self.se = SearchEngineFactory().create()
        self.index_metadata = None
        self.index_name = index_name

    def prepare_index(self):
        """
        Defines the Elastic Search mapping and settings for an index

        Raises:
        SearchIndexError -- if the subclass has not set self.index_metadata

        Return: None
        """
        if self.index_metadata is not None:
            self.se.create_index(index=self.index_name, body=self.index_metadata)
        else:
            raise SearchIndexError("No index metadata defined.")

    def get_documents_to_index(self, resourceinstance, tiles):
        """
        Gets a document to index into Elastic Search

        Arguments:
        resourceinstance -- resource instance object
        tiles -- list of tiles that make up the resource instance

        Return: tuple of (document, document id)
        """
        raise NotImplementedError

    def index_document(self, document=None, id=None):
        """
        Indexes a document into Elastic Search

        Keyword Arguments:
        document -- the document to index
        id -- the id of the document

        Return: None
        """
        if document is not None and id is not None:
            self.se.index_data(index=self.index_name, body=document, id=id)

    def index_resources(self, resources=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        """
        Indexes a list of resources in bulk to Elastic Search

        Keyword Arguments:
        resources -- the list of resource instances to index
        batch_size -- the number of records to index as a group, the larger the number the more memory required
        quiet -- Silences the status bar output during certain operations, use in celery operations for example

        Return: None
        """
        start = datetime.now()
        q = Query(se=self.se)
        self.se.refresh(index=self.index_name)
        # snapshot the count so "indexed" can be computed as a delta afterwards
        count_before = self.se.count(index=self.index_name, body=q.dsl)
        result_summary = {"database": len(resources), "indexed": 0}
        if quiet is False:
            bar = pyprind.ProgBar(len(resources), bar_char="█") if len(resources) > 1 else None
        with self.se.BulkIndexer(batch_size=batch_size, refresh=True) as indexer:
            for resource in resources:
                if quiet is False and bar is not None:
                    bar.update(item_id=resource)
                tiles = list(models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                # BUG FIX: the original tested the *builtin* `id` (always truthy)
                # instead of the doc_id returned by get_documents_to_index
                if document is not None and doc_id is not None:
                    indexer.add(index=self.index_name, id=doc_id, data=document)

        self.se.refresh(index=self.index_name)
        result_summary["indexed"] = self.se.count(index=self.index_name, body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print(f"Custom Index - {settings.ELASTICSEARCH_PREFIX}_{self.index_name}")
        print(
            f"    Status: {status}, In Database: {result_summary['database']}, Indexed: {result_summary['indexed']}, Took: {(datetime.now() - start).seconds} seconds"
        )

    def delete_resources(self, resources=None):
        """
        Deletes documents from an index based on the passed in list of resources
        Delete by query, so this is a single operation

        Keyword Arguments:
        resources -- a single resource instance or a list of resource instances
        """
        q = Query(se=self.se)
        if not isinstance(resources, list):
            resourcelist = [resources]
        else:
            resourcelist = resources
        list_of_ids_to_delete = []
        for resource in resourcelist:
            list_of_ids_to_delete.append(resource.pk)
        ids_query = Ids(ids=list_of_ids_to_delete)
        q.add_query(ids_query)
        q.delete(index=self.index_name)

    def delete_index(self):
        """
        Deletes this index from Elastic Search

        Return: None
        """
        self.se.delete_index(index=self.index_name)

    def reindex(self, graphids=None, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        """
        Reindexes the index.  By default this does nothing, it needs to be implemented in a subclass.
        By default you can pass in a list of graph ids to trigger the reindex.  This will loop through all resource instances of each graph type.

            Example subclass command:
            def reindex(self, clear_index=True):
                PARCEL_GRAPHID = "e3c35dca-5e72-11ea-a2d3-dca90488358a"
                super(CustomIndexName, self).reindex(graphids=[PARCEL_GRAPHID], clear_index=clear_index)

        Keyword Arguments:
        graphids -- list of graphs ids to trigger the reindex on, will get all resource instances of each graph id supplied
        clear_index -- True(default) to clear all documents out of the index before reindexing begins
        batch_size -- the number of records to index as a group, the larger the number the more memory required

        Return: None
        """
        if graphids is not None:
            if clear_index:
                self.delete_index()
                self.prepare_index()

            for graphid in graphids:
                resources = Resource.objects.filter(graph_id=graphid)
                self.index_resources(resources=resources, batch_size=batch_size, quiet=quiet)
        else:
            raise NotImplementedError
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number the more memory required
    """
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # map of nodeid -> datatype computed once up front so each resource doesn't re-query it
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
    }
    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        # BUG FIX: the Python 2 `print "..."` statement is a SyntaxError under Python 3;
        # use the print() function, consistent with the rest of this module
        print("Indexing resource type '{0}'".format(graph_name))
        result_summary = {'database': len(resources), 'indexed': 0}
        if clear_index:
            q = Query(se=se)
            q.delete(index='resource', doc_type=str(resource_type))

        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                for resource in resources:
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
                    doc_indexer.add(
                        index='resource', doc_type=document['graph_id'],
                        id=document['resourceinstanceid'], data=document)
                    for term in terms:
                        term_indexer.add(index='strings', doc_type='term', id=term['_id'], data=term['_source'])

        result_summary['indexed'] = se.es.count(index='resource', doc_type=str(resource_type))['count']
        status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
        print("Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
            status, graph_name, result_summary['database'], result_summary['indexed'],
            (datetime.now() - start).seconds))
class BaseIndex(object):
    """Base class for custom Elasticsearch indexes; subclasses must implement get_documents_to_index."""

    def __init__(self, index_name=None):
        # BUG FIX: the original used identity comparison (`index_name is ""`), which is
        # implementation-dependent and a SyntaxWarning on modern CPython; use equality
        if index_name is None or index_name == "":
            raise SearchIndexError("Index name is not defined")
        self.se = SearchEngineFactory().create()
        self.index_metadata = None
        self.index_name = index_name

    def prepare_index(self):
        """
        Defines the Elastic Search mapping and settings for an index

        Raises:
        SearchIndexError -- if the subclass has not set self.index_metadata

        Return: None
        """
        if self.index_metadata is not None:
            self.se.create_index(index=self.index_name, body=self.index_metadata)
        else:
            raise SearchIndexError("No index metadata defined.")

    def get_documents_to_index(self, resourceinstance, tiles):
        """
        Gets a document to index into Elastic Search

        Arguments:
        resourceinstance -- resource instance object
        tiles -- list of tiles that make up the resource instance

        Return: tuple of (document, document id)
        """
        raise NotImplementedError

    def index_document(self, document=None, id=None):
        """
        Indexes a document into Elastic Search

        Keyword Arguments:
        document -- the document to index
        id -- the id of the document

        Return: None
        """
        if document is not None and id is not None:
            self.se.index_data(index=self.index_name, body=document, id=id)

    def bulk_index(self, resources=None, resource_type=None, graph_name=None, clear_index=True):
        """
        Indexes a list of documents in bulk to Elastic Search

        Keyword Arguments:
        resources -- the list of resource instances to index
        resource_type -- the type of resources being indexed
        graph_name -- the name of the graph model that represents the resources being indexed
        clear_index -- True(default) to remove all index records of type "resource_type" before indexing,
            assumes that a field called "graph_id" exists on the indexed documents

        Return: None
        """
        start = datetime.now()
        q = Query(se=self.se)
        if clear_index:
            term = Term(field="graph_id", term=str(resource_type))
            q.add_query(term)
            q.delete(index=self.index_name, refresh=True)

        # fresh (unfiltered) query for counting; snapshot so "indexed" is a delta
        q = Query(se=self.se)
        count_before = self.se.count(index=self.index_name, body=q.dsl)
        result_summary = {"database": len(resources), "indexed": 0}
        with self.se.BulkIndexer(batch_size=settings.BULK_IMPORT_BATCH_SIZE, refresh=True) as indexer:
            for resource in resources:
                tiles = list(models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                # BUG FIX: the original tested the *builtin* `id` (always truthy)
                # instead of the doc_id returned by get_documents_to_index
                if document is not None and doc_id is not None:
                    indexer.add(index=self.index_name, id=doc_id, data=document)

        result_summary["indexed"] = self.se.count(index=self.index_name, body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print("Custom Index - %s:" % self.index_name)
        print(
            "    Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
            .format(status, graph_name, result_summary["database"],
                    result_summary["indexed"], (datetime.now() - start).seconds))

    def delete_index(self):
        """
        Deletes this index from Elastic Search

        Return: None
        """
        self.se.delete_index(index=self.index_name)