def delete_concept_values_index(concepts_to_delete):
    """
    Removes the indexed documents for each passed concept from the 'strings' index.

    Arguments:
    concepts_to_delete -- dict of concept objects keyed by id; each value's .id is
        used to match indexed documents by their 'conceptid' field

    NOTE(review): dict.itervalues() is Python 2 only -- confirm this module targets py2.
    """
    se = SearchEngineFactory().create()
    for concept in concepts_to_delete.itervalues():
        # delete-by-query: remove every 'concept' doc in 'strings' whose
        # conceptid matches this concept
        query = Query(se, start=0, limit=10000)
        term = Term(field='conceptid', term=concept.id)
        query.add_query(term)
        query.delete(index='strings', doc_type='concept')
def delete_index(self):
    """Remove this concept's label documents from the 'concept_labels' index
    and drop its terms from the search engine."""
    search_engine = SearchEngineFactory().create()
    label_query = Query(search_engine, start=0, limit=10000)
    # phrase-match every label belonging to this concept
    label_query.add_query(Match(field='conceptid', query=self.conceptid, type='phrase'))
    label_query.delete(index='concept_labels')
    search_engine.delete_terms(self.id)
def index_resources(clear_index=True, index_name=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources from the database

    Keyword Arguments:
    clear_index -- when True (and no index_name is given) wipe the terms index before reindexing
    index_name -- only applies to custom indexes; if given, just refresh the data in that index
    batch_size -- number of records indexed per bulk request; larger needs more memory
    """
    search_engine = SearchEngineFactory().create()
    if clear_index and index_name is None:
        Query(se=search_engine).delete(index="terms")
    # every resource graph except the system-settings graph
    graph_ids = (
        models.GraphModel.objects.filter(isresource=True)
        .exclude(graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
        .values_list("graphid", flat=True)
    )
    index_resources_by_type(graph_ids, clear_index=clear_index, index_name=index_name, batch_size=batch_size)
def delete_concept_values_index(concepts_to_delete):
    """
    Removes the indexed documents for each passed concept from the 'strings' index.

    Arguments:
    concepts_to_delete -- dict of concept objects keyed by id; each value's .id is
        used to match indexed documents by their 'conceptid' field

    NOTE(review): dict.itervalues() is Python 2 only -- confirm this module targets py2.
    """
    se = SearchEngineFactory().create()
    for concept in concepts_to_delete.itervalues():
        # delete-by-query: remove every 'concept' doc in 'strings' whose
        # conceptid matches this concept
        query = Query(se, start=0, limit=10000)
        term = Term(field='conceptid', term=concept.id)
        query.add_query(term)
        query.delete(index='strings', doc_type='concept')
def index_resources(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
    """
    Indexes all resources from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    quiet -- Silences the status bar output during certain operations, use in celery operations for example
    """
    # NOTE: relies on a module-level `se` search engine and the TERMS_INDEX constant
    if clear_index:
        Query(se=se).delete(index=TERMS_INDEX)
    # every resource graph except the system-settings graph
    graph_ids = (
        models.GraphModel.objects.filter(isresource=True)
        .exclude(graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
        .values_list("graphid", flat=True)
    )
    index_resources_by_type(graph_ids, clear_index=clear_index, batch_size=batch_size, quiet=quiet)
def delete_index(self):
    """Delete this object's label documents (matched by 'id') from the
    'concept_labels' index and drop its terms from the search engine."""
    engine = SearchEngineFactory().create()
    q = Query(engine, start=0, limit=10000)
    q.add_query(Term(field='id', term=self.id))
    q.delete(index='concept_labels')
    engine.delete_terms(self.id)
def reverse_func(apps, schema_editor):
    """
    Migration reverse: reloads the base ontology, strips ontology class/property
    URIs back to their final path segment, removes the base Arches concept from
    the 'strings' index, and drops the 'identifier' value type if present.
    """
    extensions = [os.path.join(settings.ONTOLOGY_PATH, x) for x in settings.ONTOLOGY_EXT]
    management.call_command('load_ontology', source=os.path.join(settings.ONTOLOGY_PATH, settings.ONTOLOGY_BASE),
        version=settings.ONTOLOGY_BASE_VERSION, ontology_name=settings.ONTOLOGY_BASE_NAME,
        id=settings.ONTOLOGY_BASE_ID, extensions=','.join(extensions), verbosity=0)
    Node = apps.get_model("models", "Node")
    Edge = apps.get_model("models", "Edge")
    for node in Node.objects.all():
        # keep only the fragment after the last '/' (strip the URI prefix)
        node.ontologyclass = str(node.ontologyclass).split('/')[-1]
        node.save()
    for edge in Edge.objects.all():
        edge.ontologyproperty = str(edge.ontologyproperty).split('/')[-1]
        edge.save()

    # remove index for base Arches concept
    se = SearchEngineFactory().create()
    query = Query(se, start=0, limit=10000)
    query.add_query(Term(field='conceptid', term='00000000-0000-0000-0000-000000000001'))
    query.delete(index='strings', doc_type='concept')

    try:
        DValueType = apps.get_model("models", "DValueType")
        DValueType.objects.get(valuetype='identifier').delete()
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; this best-effort cleanup should only ignore errors
        pass
def reverse_func(apps, schema_editor):
    """
    Migration reverse: trims ontology class/property URIs down to their final
    path segment, drops the base Arches concept from the 'concepts' index, and
    removes the 'identifier' value type if present.
    """
    Node = apps.get_model("models", "Node")
    Edge = apps.get_model("models", "Edge")
    for node in Node.objects.all():
        node.ontologyclass = str(node.ontologyclass).split("/")[-1]
        node.save()
    for edge in Edge.objects.all():
        edge.ontologyproperty = str(edge.ontologyproperty).split("/")[-1]
        edge.save()

    # remove index for base Arches concept
    engine = SearchEngineFactory().create()
    concept_query = Query(engine, start=0, limit=10000)
    base_concept = Term(field="conceptid", term="00000000-0000-0000-0000-000000000001")
    concept_query.add_query(base_concept)
    concept_query.delete(index="concepts")

    try:
        DValueType = apps.get_model("models", "DValueType")
        DValueType.objects.get(valuetype="identifier").delete()
    except Exception:
        # best effort -- the value type may already be gone
        pass
def delete_index(self):
    """Drop this concept's label documents from 'concept_labels' and remove
    its terms from the search engine."""
    engine = SearchEngineFactory().create()
    labels = Query(engine, start=0, limit=10000)
    labels.add_query(Match(field='conceptid', query=self.conceptid, type='phrase'))
    labels.delete(index='concept_labels')
    engine.delete_terms(self.id)
def test_bulk_delete(self):
    """
    Test bulk deleting of documents in Elasticsearch

    """
    se = SearchEngineFactory().create()
    # se.create_index(index='test')
    for i in range(10):
        # index a prefLabel doc and an altLabel doc per iteration
        for offset, label_type, label_value in (
            (0, 'prefLabel', 'test pref label'),
            (100, 'altLabel', 'test alt label'),
        ):
            doc = {
                'id': i + offset,
                'type': label_type,
                'value': label_value,
            }
            se.index_data(index='test', doc_type='test', body=doc, idfield='id', refresh=True)
    query = Query(se, start=0, limit=100)
    query.add_query(Match(field='type', query='altLabel'))
    query.delete(index='test', refresh=True)
    # the 10 prefLabel docs must remain after deleting the altLabel docs
    self.assertEqual(se.es.count(index='test', doc_type='test')['count'], 10)
def reverse_func(apps, schema_editor):
    """
    Migration reverse: reloads the base ontology, strips ontology class/property
    URIs back to their final path segment, removes the base Arches concept from
    the 'concepts' index, and drops the 'identifier' value type if present.
    """
    extensions = [os.path.join(settings.ONTOLOGY_PATH, x) for x in settings.ONTOLOGY_EXT]
    management.call_command('load_ontology', source=os.path.join(settings.ONTOLOGY_PATH, settings.ONTOLOGY_BASE),
        version=settings.ONTOLOGY_BASE_VERSION, ontology_name=settings.ONTOLOGY_BASE_NAME,
        id=settings.ONTOLOGY_BASE_ID, extensions=','.join(extensions), verbosity=0)
    Node = apps.get_model("models", "Node")
    Edge = apps.get_model("models", "Edge")
    for node in Node.objects.all():
        # keep only the fragment after the last '/' (strip the URI prefix)
        node.ontologyclass = str(node.ontologyclass).split('/')[-1]
        node.save()
    for edge in Edge.objects.all():
        edge.ontologyproperty = str(edge.ontologyproperty).split('/')[-1]
        edge.save()

    # remove index for base Arches concept
    se = SearchEngineFactory().create()
    query = Query(se, start=0, limit=10000)
    query.add_query(Term(field='conceptid', term='00000000-0000-0000-0000-000000000001'))
    query.delete(index='concepts')

    try:
        DValueType = apps.get_model("models", "DValueType")
        DValueType.objects.get(valuetype='identifier').delete()
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt; this best-effort cleanup should only ignore errors
        pass
def test_delete_by_query(self):
    """
    Test deleting documents by query in Elasticsearch

    """
    se = SearchEngineFactory().create()
    for i in range(10):
        pref = {
            'id': i,
            'type': 'prefLabel',
            'value': 'test pref label',
        }
        se.index_data(index='test', body=pref, idfield='id', refresh=True)
        alt = {
            'id': i + 100,
            'type': 'altLabel',
            'value': 'test alt label',
        }
        se.index_data(index='test', body=alt, idfield='id', refresh=True)
    time.sleep(1)  # give ES a moment to make the docs searchable
    deleter = Query(se, start=0, limit=100)
    deleter.add_query(Match(field='type', query='altLabel'))
    deleter.delete(index='test', refresh=True)
    # only the 10 prefLabel docs should survive
    self.assertEqual(se.count(index='test'), 10)
def delete_concept_values_index(concepts_to_delete):
    """
    Removes each passed concept's label documents from the 'concept_labels'
    index and each of its values' terms from the search engine.

    Arguments:
    concepts_to_delete -- dict of concept objects keyed by id; each concept's
        .id matches indexed docs via 'conceptid' and .values holds its value records

    NOTE(review): dict.itervalues() is Python 2 only -- confirm this module targets py2.
    """
    se = SearchEngineFactory().create()
    for concept in concepts_to_delete.itervalues():
        # delete-by-query: all label docs belonging to this concept
        query = Query(se, start=0, limit=10000)
        term = Term(field='conceptid', term=concept.id)
        query.add_query(term)
        query.delete(index='concept_labels')
        # also remove each of the concept's values from the terms index
        for conceptvalue in concept.values:
            se.delete_terms(conceptvalue.id)
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    start = datetime.now()
    print("Indexing resource to resource relations")
    cursor = connection.cursor()
    # NOTE: relies on a module-level `se` search engine and the
    # RESOURCE_RELATIONS_INDEX constant
    if clear_index:
        q = Query(se=se)
        q.delete(index=RESOURCE_RELATIONS_INDEX)
    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        # pull every relation row straight from the db and index it as one document
        sql = """
            SELECT resourcexid, notes, datestarted, dateended, relationshiptype,
            resourceinstanceidfrom, resourceinstancefrom_graphid, resourceinstanceidto,
            resourceinstanceto_graphid, modified, created, inverserelationshiptype,
            tileid, nodeid
            FROM public.resource_x_resource
        """
        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # positional columns match the SELECT order above
            doc = {
                "resourcexid": resource_relation[0],
                "notes": resource_relation[1],
                "datestarted": resource_relation[2],
                "dateended": resource_relation[3],
                "relationshiptype": resource_relation[4],
                "resourceinstanceidfrom": resource_relation[5],
                "resourceinstancefrom_graphid": resource_relation[6],
                "resourceinstanceidto": resource_relation[7],
                "resourceinstanceto_graphid": resource_relation[8],
                "modified": resource_relation[9],
                "created": resource_relation[10],
                "inverserelationshiptype": resource_relation[11],
                "tileid": resource_relation[12],
                "nodeid": resource_relation[13],
            }
            resource_relations_indexer.add(index=RESOURCE_RELATIONS_INDEX, id=doc["resourcexid"], data=doc)
    # compare db row count with indexed count to report success/failure
    index_count = se.count(index=RESOURCE_RELATIONS_INDEX)
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if cursor.rowcount == index_count else "Failed",
                 cursor.rowcount, index_count, (datetime.now() - start).seconds))
def index_resources_by_type(resource_types, clear_index=True, index_name=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    index_name -- only applies to custom indexes and if given will try and just refresh the data in that index
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    status = ''
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # nodeid -> datatype lookup used while serializing each resource's documents
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}
    status = ''
    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))
        if index_name is None:
            # default path: (re)index into the core 'resources'/'terms' indexes
            q = Query(se=se)
            term = Term(field='graph_id', term=str(resource_type))
            q.add_query(term)
            if clear_index:
                q.delete(index='resources', refresh=True)
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
                with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                    for resource in resources:
                        document, terms = resource.get_documents_to_index(fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
                        doc_indexer.add(index='resources', id=document['resourceinstanceid'], data=document)
                        for term in terms:
                            term_indexer.add(index='terms', id=term['_id'], data=term['_source'])
            # q still carries the graph_id term, so the count is scoped to this type
            result_summary = {'database': len(resources), 'indexed': se.count(index='resources', body=q.dsl)}
            status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
            print("Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(status, graph_name, result_summary['database'], result_summary['indexed'], (datetime.now()-start).seconds))
            # also refresh any registered custom indexes
            for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(index['module'])(index['name'])
                es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
        else:
            # named custom index only: just refresh that index's data
            es_index = get_index(index_name)
            es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
    return status
def delete_index(self, resourceinstanceid=None):
    """
    Deletes all references to a resource from all indexes

    Keyword Arguments:
    resourceinstanceid -- the resource instance id to delete from related indexes, if supplied will use this over self.resourceinstanceid
    """
    if resourceinstanceid is None:
        resourceinstanceid = self.resourceinstanceid
    resourceinstanceid = str(resourceinstanceid)

    # delete any related terms
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=TERMS_INDEX)

    # delete any related resource index entries
    # (a relation may point either direction, so match on both fields)
    query = Query(se)
    bool_query = Bool()
    bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
    bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=RESOURCE_RELATIONS_INDEX)

    # reindex any related resources
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
    query.add_query(bool_query)
    results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
    for result in results:
        try:
            res = Resource.objects.get(pk=result["_id"])
            res.load_tiles()
            res.index()
        except ObjectDoesNotExist:
            # the related resource exists only in the index; nothing to reindex
            pass

    # delete resource index
    se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)

    # delete resources from custom indexes
    for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
        es_index = import_class_from_string(index["module"])(index["name"])
        es_index.delete_resources(resources=self)
def bulk_index(self, resources=None, resource_type=None, graph_name=None, clear_index=True):
    """
    Indexes a list of documents in bulk to Elastic Search

    Arguments:
    None

    Keyword Arguments:
    resources -- the list of resource instances to index
    resource_type -- the type of resources being indexed
    graph_name -- the name of the graph model that represents the resources being indexed
    clear_index -- True(default) to remove all index records of type "resource_type" before indexing,
        assumes that a field called "graph_id" exists on the indexed documents

    Return: None
    """
    start = datetime.now()
    q = Query(se=self.se)
    if clear_index:
        term = Term(field="graph_id", term=str(resource_type))
        q.add_query(term)
        q.delete(index=self.index_name, refresh=True)
        # fresh, unfiltered query for the count below
        q = Query(se=self.se)
    count_before = self.se.count(index=self.index_name, body=q.dsl)
    result_summary = {"database": len(resources), "indexed": 0}
    with self.se.BulkIndexer(batch_size=settings.BULK_IMPORT_BATCH_SIZE, refresh=True) as indexer:
        for resource in resources:
            tiles = list(models.TileModel.objects.filter(resourceinstance=resource))
            document, doc_id = self.get_documents_to_index(resource, tiles)
            # BUGFIX: the original tested the builtin `id` (never None) instead
            # of the returned `doc_id`; check doc_id so documents without an id
            # are skipped rather than indexed with a bogus identifier.
            if document is not None and doc_id is not None:
                indexer.add(index=self.index_name, id=doc_id, data=document)
    result_summary["indexed"] = self.se.count(index=self.index_name, body=q.dsl) - count_before
    status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
    print("Custom Index - %s:" % self.index_name)
    print(
        " Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
        .format(status, graph_name, result_summary["database"], result_summary["indexed"],
                (datetime.now() - start).seconds))
def index_resources(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources from the database

    Keyword Arguments:
    clear_index -- set to True to wipe the term docs from the 'strings' index before reindexing
    batch_size -- number of records indexed per bulk request; larger needs more memory
    """
    search_engine = SearchEngineFactory().create()
    if clear_index:
        Query(se=search_engine).delete(index='strings', doc_type='term')
    # every resource graph except the system-settings graph
    graph_ids = models.GraphModel.objects.filter(isresource=True).exclude(
        graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID
    ).values_list('graphid', flat=True)
    index_resources_by_type(graph_ids, clear_index=clear_index, batch_size=batch_size)
def index_resources(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources from the database

    Keyword Arguments:
    clear_index -- set to True to wipe the 'terms' index before reindexing
    batch_size -- number of records indexed per bulk request; larger needs more memory
    """
    search_engine = SearchEngineFactory().create()
    if clear_index:
        Query(se=search_engine).delete(index='terms')
    # every resource graph except the system-settings graph
    graph_ids = models.GraphModel.objects.filter(isresource=True).exclude(
        graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID
    ).values_list('graphid', flat=True)
    index_resources_by_type(graph_ids, clear_index=clear_index, batch_size=batch_size)
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    start = datetime.now()
    print("Indexing resource to resource relations")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index="resource_relations")
    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        # pull every relation row straight from the db and index it as one document
        sql = """
            SELECT resourcexid, resourceinstanceidfrom, notes, relationshiptype, resourceinstanceidto
            FROM public.resource_x_resource;
        """
        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # positional columns match the SELECT order above
            doc = {
                "resourcexid": resource_relation[0],
                "resourceinstanceidfrom": resource_relation[1],
                "notes": resource_relation[2],
                "relationshiptype": resource_relation[3],
                "resourceinstanceidto": resource_relation[4],
            }
            resource_relations_indexer.add(index="resource_relations", id=doc["resourcexid"], data=doc)
    # compare db row count with indexed count to report success/failure
    index_count = se.count(index="resource_relations")
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if cursor.rowcount == index_count else "Failed",
                 cursor.rowcount, index_count, (datetime.now() - start).seconds))
def delete_resources(self, resources=None):
    """
    Deletes documents from an index based on the passed in list of resources
    Delete by query, so this is a single operation

    Keyword Arguments:
    resources -- a single resource instance or a list of resource instances
    """
    # normalize the argument to a list
    resourcelist = resources if isinstance(resources, list) else [resources]
    doomed_ids = [resource.pk for resource in resourcelist]
    query = Query(se=self.se)
    query.add_query(Ids(ids=doomed_ids))
    query.delete(index=self.index_name)
def clear_resources():
    """Removes all resource instances from your db and elasticsearch resource index"""
    se = SearchEngineFactory().create()
    wipe = Query(se)
    # match-all query (no filters added): clears each index entirely
    for index_to_clear in ("terms", "resources", "resource_relations"):
        wipe.delete(index=index_to_clear)

    protected = Resource.objects.exclude(resourceinstanceid=settings.RESOURCE_INSTANCE_ID)
    print("deleting", protected.count(), "resources")
    protected.delete()
    print(Resource.objects.exclude(resourceinstanceid=settings.RESOURCE_INSTANCE_ID).count(),
          "resources remaining")

    print("deleting", models.ResourceXResource.objects.count(), "resource relationships")
    cursor = connection.cursor()
    # TRUNCATE ... CASCADE clears the relations table and its dependents in one shot
    cursor.execute("TRUNCATE public.resource_x_resource CASCADE;")
    print(models.ResourceXResource.objects.count(), "resource relationships remaining")
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    NOTE(review): uses Python 2 print statements -- confirm this module targets py2.
    """
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # nodeid -> datatype lookup used while serializing documents
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}
    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print "Indexing resource type '{0}'".format(graph_name)
        result_summary = {'database':len(resources), 'indexed':0}
        if clear_index:
            # docs for each resource type live under their own doc_type
            q = Query(se=se)
            q.delete(index='resource', doc_type=str(resource_type))
        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                for resource in resources:
                    document, terms = resource.get_documents_to_index(fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
                    doc_indexer.add(index='resource', doc_type=document['graph_id'], id=document['resourceinstanceid'], data=document)
                    for term in terms:
                        term_indexer.add(index='strings', doc_type='term', id=term['_id'], data=term['_source'])
        # compare db count vs indexed count for this resource type
        result_summary['indexed'] = se.count(index='resource', doc_type=str(resource_type))
        status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
        print "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(status, graph_name, result_summary['database'], result_summary['indexed'], (datetime.now()-start).seconds)
def delete_index(self, resourceinstanceid=None):
    """
    Deletes all references to a resource from all indexes

    Keyword Arguments:
    resourceinstanceid -- the resource instance id to delete from related indexes, if supplied will use this over self.resourceinstanceid
    """
    if resourceinstanceid is None:
        resourceinstanceid = self.resourceinstanceid
    resourceinstanceid = str(resourceinstanceid)

    # delete any related terms
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=TERMS_INDEX)

    # delete any related resource index entries
    # (a relation may point either direction, so match on both fields)
    query = Query(se)
    bool_query = Bool()
    bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
    bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=RESOURCE_RELATIONS_INDEX)

    # reindex any related resources
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
    query.add_query(bool_query)
    results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
    for result in results:
        # ROBUSTNESS: the index may reference a resource already removed from
        # the db; skip those instead of raising DoesNotExist mid-delete
        try:
            res = Resource.objects.get(pk=result["_id"])
            res.load_tiles()
            res.index()
        except Resource.DoesNotExist:
            pass

    # delete resource index
    se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    NOTE(review): uses Python 2 print statements -- confirm this module targets py2.
    """
    start = datetime.now()
    print "Indexing resource to resource relations"
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='resource_relations')
    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        # pull every relation row straight from the db and index it as one document
        sql = """
            SELECT resourcexid, resourceinstanceidfrom, notes, relationshiptype, resourceinstanceidto
            FROM public.resource_x_resource;
        """
        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # positional columns match the SELECT order above
            doc = {
                'resourcexid': resource_relation[0],
                'resourceinstanceidfrom': resource_relation[1],
                'notes': resource_relation[2],
                'relationshiptype': resource_relation[3],
                'resourceinstanceidto': resource_relation[4]
            }
            resource_relations_indexer.add(index='resource_relations', doc_type='all', id=doc['resourcexid'], data=doc)
    # compare db row count with indexed count to report success/failure
    index_count = se.count(index='resource_relations')
    print "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format('Passed' if cursor.rowcount == index_count else 'Failed', cursor.rowcount, index_count, (datetime.now()-start).seconds)
def test_delete_by_query(self):
    """
    Test deleting documents by query in Elasticsearch

    """
    se = SearchEngineFactory().create()
    # index 10 prefLabel docs (ids 0-9) and 10 altLabel docs (ids 100-109)
    for i in range(10):
        for offset, label_type, label_value in (
            (0, "prefLabel", "test pref label"),
            (100, "altLabel", "test alt label"),
        ):
            doc = {"id": i + offset, "type": label_type, "value": label_value}
            se.index_data(index="test", body=doc, idfield="id", refresh=True)
    time.sleep(1)
    q = Query(se, start=0, limit=100)
    q.add_query(Match(field="type", query="altLabel"))
    q.delete(index="test", refresh=True)
    # deleting the altLabel docs leaves the 10 prefLabel docs behind
    self.assertEqual(se.count(index="test"), 10)
def delete(self, *args, **kwargs):
    """
    Deletes this tile (and its child tiles), removes its terms from the search
    index, and records a 'tile delete' edit; non-reviewer users who don't own
    the provisional edit get a provisional delete instead of a true delete.

    Keyword Arguments (popped from kwargs):
    request -- the originating HTTP request, used to resolve the acting user
    index -- whether to update the search indexes (default True)
    transaction_id -- transaction identifier recorded with the edit log entry
    provisional_edit_log_details -- extra details for the provisional edit log
    """
    se = SearchEngineFactory().create()
    request = kwargs.pop("request", None)
    index = kwargs.pop("index", True)
    # BUGFIX: this previously popped "index" a second time (always None after
    # the pop above), so transaction_id was silently dropped; pop the correct key.
    transaction_id = kwargs.pop("transaction_id", None)
    provisional_edit_log_details = kwargs.pop("provisional_edit_log_details", None)
    for tile in self.tiles:
        tile.delete(*args, request=request, **kwargs)
    try:
        user = request.user
        user_is_reviewer = user_is_resource_reviewer(user)
    except AttributeError:  # no user attached to the request
        user = None
        user_is_reviewer = True
    if user_is_reviewer is True or self.user_owns_provisional(user):
        if index:
            # remove this tile's terms from the terms index
            query = Query(se)
            bool_query = Bool()
            bool_query.filter(Terms(field="tileid", terms=[self.tileid]))
            query.add_query(bool_query)
            query.delete(index=TERMS_INDEX)
        self.__preDelete(request)
        self.save_edit(
            user=user,
            edit_type="tile delete",
            old_value=self.data,
            provisional_edit_log_details=provisional_edit_log_details,
            transaction_id=transaction_id,
        )
        try:
            super(Tile, self).delete(*args, **kwargs)
            # give each node's datatype a chance to clean up after the delete
            for nodeid in self.data.keys():
                node = models.Node.objects.get(nodeid=nodeid)
                datatype = self.datatype_factory.get_instance(node.datatype)
                datatype.post_tile_delete(self, nodeid, index=index)
            if index:
                self.index()
        except IntegrityError as e:
            logger.error(e)
    else:
        # provisional users don't truly delete; record a provisional delete edit
        self.apply_provisional_edit(user, data={}, action="delete")
        super(Tile, self).save(*args, **kwargs)
def clear_resources():
    """Removes all resource instances from your db and elasticsearch resource index

    NOTE(review): uses Python 2 print statements -- confirm this module targets py2.
    """
    se = SearchEngineFactory().create()
    # match-all query (no filters added): clears each index entirely
    match_all_query = Query(se)
    match_all_query.delete(index='strings', doc_type='term')
    match_all_query.delete(index='resource')
    match_all_query.delete(index='resource_relations')
    print 'deleting', Resource.objects.count(), 'resources'
    cursor = connection.cursor()
    # TRUNCATE ... CASCADE is faster than ORM deletes and clears dependents too
    cursor.execute("TRUNCATE public.resource_instances CASCADE;" )
    print Resource.objects.count(), 'resources remaining'
    print 'deleting', models.ResourceXResource.objects.count(), 'resource relationships'
    cursor.execute("TRUNCATE public.resource_x_resource CASCADE;" )
    print models.ResourceXResource.objects.count(), 'resource relationships remaining'
def clear_resources():
    """Removes all resource instances from your db and elasticsearch resource index

    NOTE(review): uses Python 2 print statements -- confirm this module targets py2.
    """
    se = SearchEngineFactory().create()
    # match-all query (no filters added): clears each index entirely
    match_all_query = Query(se)
    match_all_query.delete(index='strings', doc_type='term')
    match_all_query.delete(index='resource')
    match_all_query.delete(index='resource_relations')
    print 'deleting', Resource.objects.count(), 'resources'
    cursor = connection.cursor()
    # TRUNCATE ... CASCADE is faster than ORM deletes and clears dependents too
    cursor.execute("TRUNCATE public.resource_instances CASCADE;")
    print Resource.objects.count(), 'resources remaining'
    print 'deleting', models.ResourceXResource.objects.count(
    ), 'resource relationships'
    cursor.execute("TRUNCATE public.resource_x_resource CASCADE;")
    print models.ResourceXResource.objects.count(
    ), 'resource relationships remaining'
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indxes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    start = datetime.now()
    print("Indexing concepts")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index="concepts")
    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        indexed_values = []
        # 1) index labels of Collections and ConceptSchemes; each is its own top concept
        for conceptValue in models.Value.objects.filter(Q(concept__nodetype="Collection") | Q(concept__nodetype="ConceptScheme"), valuetype__category="label"):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index="concepts", id=doc["id"], data=doc)
            indexed_values.append(doc["id"])

        # build a quoted, comma-separated list of label value types for the raw SQL below
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(category="label").values_list("valuetype", flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # 2) recursively walk down from each top concept and index every descendant
        #    label, tagging each doc with its top_concept for scoped searches
        for conceptValue in models.Relation.objects.filter(relationtype="hasTopConcept"):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth       ---|NonRecursive Part
                    FROM relations d
                    JOIN values c ON(c.conceptid = d.conceptidto)
                    JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                    WHERE d.conceptidto = '{0}'
                    and c2.valuetype = 'prefLabel'
                    and c.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                        UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1          ---|RecursivePart
                    FROM relations d
                    JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                    JOIN values v ON(v.conceptid = d.conceptidto)
                    JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                    WHERE v2.valuetype = 'prefLabel'
                    and v.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                # positional columns: valueid, value, conceptid, languageid, valuetype
                doc = {
                    "category": "label",
                    "conceptid": conceptValue[2],
                    "language": conceptValue[3],
                    "value": conceptValue[1],
                    "type": conceptValue[4],
                    "id": conceptValue[0],
                    "top_concept": topConcept,
                }
                concept_indexer.add(index="concepts", id=doc["id"], data=doc)
                indexed_values.append(doc["id"])

        # we add this step to catch any concepts/values that are orphaned (have no parent concept)
        for conceptValue in models.Value.objects.filter(valuetype__category="label").exclude(valueid__in=indexed_values):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index="concepts", id=doc["id"], data=doc)

    # compare db label count with indexed count to report success/failure
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index="concepts")
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if concept_count_in_db == index_count else "Failed",
                 concept_count_in_db, index_count, (datetime.now() - start).seconds))
def delete_index(self):
    """Remove this object's 'concept' documents (matched on 'id') from the
    'strings' index."""
    engine = SearchEngineFactory().create()
    q = Query(engine, start=0, limit=10000)
    q.add_query(Term(field='id', term=self.id))
    q.delete(index='strings', doc_type='concept')
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # Map nodeid -> datatype once up front so get_documents_to_index does not
    # have to query the node table per resource.
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
    }

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        # print() call form (rather than the py2 print statement) for
        # consistency with the rest of the module and py3 compatibility.
        print("Indexing resource type '{0}'".format(graph_name))

        result_summary = {'database': len(resources), 'indexed': 0}
        if clear_index:
            q = Query(se=se)
            q.delete(index='resource', doc_type=str(resource_type))

        # One bulk indexer for the resource documents, one for their terms.
        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                for resource in resources:
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True,
                        datatype_factory=datatype_factory,
                        node_datatypes=node_datatypes)
                    doc_indexer.add(
                        index='resource',
                        doc_type=document['graph_id'],
                        id=document['resourceinstanceid'],
                        data=document)
                    for term in terms:
                        term_indexer.add(index='strings', doc_type='term',
                                         id=term['_id'], data=term['_source'])

        # Verify the index document count matches the database row count.
        result_summary['indexed'] = se.es.count(
            index='resource', doc_type=str(resource_type))['count']
        status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
        print("Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
            status, graph_name, result_summary['database'],
            result_summary['indexed'], (datetime.now() - start).seconds))
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    start = datetime.now()
    # print() call form for py3 compatibility and consistency with the module.
    print("Indexing concepts")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='concepts')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        # Pass 1: labels of Collections and ConceptSchemes; each is recorded
        # as its own top concept.
        for conceptValue in models.Value.objects.filter(
                Q(concept__nodetype='Collection') | Q(concept__nodetype='ConceptScheme'),
                valuetype__category='label'):
            doc = {
                'category': 'label',
                'conceptid': conceptValue.concept_id,
                'language': conceptValue.language_id,
                'value': conceptValue.value,
                'type': conceptValue.valuetype_id,
                'id': conceptValue.valueid,
                'top_concept': conceptValue.concept_id
            }
            concept_indexer.add(index='concepts', id=doc['id'], data=doc)

        # Quoted, comma-separated label value types for the SQL "in" clauses.
        # (The unused concept_strings and valueTypes2 accumulators were removed.)
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(
                category='label').values_list('valuetype', flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # Pass 2: for every top concept, recursively index the labels of all
        # narrower descendants, tagging each with the top concept id.
        for conceptValue in models.Relation.objects.filter(relationtype='hasTopConcept'):
            topConcept = conceptValue.conceptto_id
            # NOTE(review): SQL assembled with str.format; inputs come from the
            # database, not request data — keep it that way.
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                    FROM relations d
                    JOIN values c ON(c.conceptid = d.conceptidto)
                    JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                    WHERE d.conceptidto = '{0}'
                    and c2.valuetype = 'prefLabel'
                    and c.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                    UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                    FROM relations d
                    JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                    JOIN values v ON(v.conceptid = d.conceptidto)
                    JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                    WHERE v2.valuetype = 'prefLabel'
                    and v.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            # Row layout: (valueid, value, conceptid, languageid, valuetype).
            for conceptValue in cursor.fetchall():
                doc = {
                    'category': 'label',
                    'conceptid': conceptValue[2],
                    'language': conceptValue[3],
                    'value': conceptValue[1],
                    'type': conceptValue[4],
                    'id': conceptValue[0],
                    'top_concept': topConcept
                }
                concept_indexer.add(index='concepts', id=doc['id'], data=doc)

    # Sanity check: DB label count vs indexed document count.
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index='concepts')
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
        'Passed' if concept_count_in_db == index_count else 'Failed',
        concept_count_in_db, index_count, (datetime.now() - start).seconds))
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    quiet -- Silences the status bar output during certain operations, use in celery operations for example
    """

    # Returned to the caller: "Passed"/"Failed" for the last type indexed
    # (empty string if resource_types is empty).
    status = ""
    datatype_factory = DataTypeFactory()
    # Map nodeid -> datatype once so per-resource indexing needn't re-query nodes.
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
    }
    # Allow a single graph id to be passed as a bare string.
    if isinstance(resource_types, str):
        resource_types = [resource_types]
    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        # NOTE(review): `se`, RESOURCES_INDEX, TERMS_INDEX and pyprind are
        # module-level names not defined in this function — confirm they are
        # created/imported at module scope.
        # The query doubles as the delete filter (if clearing) and as the
        # count filter for the summary below.
        q = Query(se=se)
        term = Term(field="graph_id", term=str(resource_type))
        q.add_query(term)
        if clear_index:
            q.delete(index=RESOURCES_INDEX, refresh=True)

        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                # Progress bar only when not quiet and more than one resource.
                if quiet is False:
                    bar = pyprind.ProgBar(len(resources), bar_char="█") if len(resources) > 1 else None
                for resource in resources:
                    if quiet is False and bar is not None:
                        bar.update(item_id=resource)
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True,
                        datatype_factory=datatype_factory,
                        node_datatypes=node_datatypes)
                    doc_indexer.add(index=RESOURCES_INDEX,
                                    id=document["resourceinstanceid"],
                                    data=document)
                    for term in terms:
                        term_indexer.add(index=TERMS_INDEX, id=term["_id"], data=term["_source"])

        # Compare indexed docs matching this graph_id against the DB row count.
        result_summary = {
            "database": len(resources),
            "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)
        }
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print(
            "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
            .format(status, graph_name, result_summary["database"],
                    result_summary["indexed"], (datetime.now() - start).seconds))
    return status
def setUpClass(cls):
    """Build the shared fixtures for the search tests.

    Clears all search indexes, imports the 'Search Test Model' graph,
    creates a restricted user, posts a mock concept with min/max year
    notes, and saves four resource instances exercising different
    combinations of dates, cultural periods, names and geometry.
    """
    # Start from empty search indexes.
    se = SearchEngineFactory().create()
    q = Query(se=se)
    for indexname in [TERMS_INDEX, CONCEPTS_INDEX, RESOURCE_RELATIONS_INDEX, RESOURCES_INDEX]:
        q.delete(index=indexname, refresh=True)

    cls.client = Client()
    cls.client.login(username="******", password="******")
    models.ResourceInstance.objects.all().delete()

    # Import the test resource graph from its JSON fixture.
    with open(os.path.join("tests/fixtures/resource_graphs/Search Test Model.json"), "rU") as f:
        archesfile = JSONDeserializer().deserialize(f)
    ResourceGraphImporter(archesfile["graph"])

    # Well-known ids from the fixture graph, cached for use by the tests.
    cls.search_model_graphid = "d291a445-fa5f-11e6-afa8-14109fd34195"
    cls.search_model_cultural_period_nodeid = "7a182580-fa60-11e6-96d1-14109fd34195"
    cls.search_model_creation_date_nodeid = "1c1d05f5-fa60-11e6-887f-14109fd34195"
    cls.search_model_destruction_date_nodeid = "e771b8a1-65fe-11e7-9163-14109fd34195"
    cls.search_model_name_nodeid = "2fe14de3-fa61-11e6-897b-14109fd34195"
    cls.search_model_sensitive_info_nodeid = "57446fae-65ff-11e7-b63a-14109fd34195"
    cls.search_model_geom_nodeid = "3ebc6785-fa61-11e6-8c85-14109fd34195"

    # Unprivileged user denied access to the destruction-date nodegroup,
    # used by permission-filtering tests.
    cls.user = User.objects.create_user("unpriviliged_user", "*****@*****.**", "test")
    cls.user.groups.add(Group.objects.get(name="Guest"))
    nodegroup = models.NodeGroup.objects.get(pk=cls.search_model_destruction_date_nodeid)
    assign_perm("no_access_to_nodegroup", cls.user, nodegroup)

    # Add a concept that defines a min and max date
    concept = {
        "id": "00000000-0000-0000-0000-000000000001",
        "legacyoid": "ARCHES",
        "nodetype": "ConceptScheme",
        "values": [],
        "subconcepts": [{
            "values": [
                {
                    "value": "Mock concept",
                    "language": "en-US",
                    "category": "label",
                    "type": "prefLabel",
                    "id": "",
                    "conceptid": ""
                },
                {
                    "value": "1950",
                    "language": "en-US",
                    "category": "note",
                    "type": "min_year",
                    "id": "",
                    "conceptid": ""
                },
                {
                    "value": "1980",
                    "language": "en-US",
                    "category": "note",
                    "type": "max_year",
                    "id": "",
                    "conceptid": ""
                },
            ],
            "relationshiptype": "hasTopConcept",
            "nodetype": "Concept",
            "id": "",
            "legacyoid": "",
            "subconcepts": [],
            "parentconcepts": [],
            "relatedconcepts": [],
        }],
    }
    # Post the concept through the view so server-side ids are generated,
    # then capture the generated value and concept ids for the tests.
    post_data = JSONSerializer().serialize(concept)
    content_type = "application/x-www-form-urlencoded"
    response = cls.client.post(
        reverse("concept", kwargs={"conceptid": "00000000-0000-0000-0000-000000000001"}),
        post_data, content_type)
    response_json = json.loads(response.content)
    valueid = response_json["subconcepts"][0]["values"][0]["id"]
    cls.conceptid = response_json["subconcepts"][0]["id"]

    # add resource instance with only a cultural period defined
    cls.cultural_period_resource = Resource(graph_id=cls.search_model_graphid)
    tile = Tile(data={cls.search_model_cultural_period_nodeid: [valueid]},
                nodegroup_id=cls.search_model_cultural_period_nodeid)
    cls.cultural_period_resource.tiles.append(tile)
    cls.cultural_period_resource.save()

    # add resource instance with a creation and destruction date defined
    cls.date_resource = Resource(graph_id=cls.search_model_graphid)
    tile = Tile(data={cls.search_model_creation_date_nodeid: "1941-01-01"},
                nodegroup_id=cls.search_model_creation_date_nodeid)
    cls.date_resource.tiles.append(tile)
    tile = Tile(data={cls.search_model_destruction_date_nodeid: "1948-01-01"},
                nodegroup_id=cls.search_model_destruction_date_nodeid)
    cls.date_resource.tiles.append(tile)
    tile = Tile(data={cls.search_model_name_nodeid: "testing 123"},
                nodegroup_id=cls.search_model_name_nodeid)
    cls.date_resource.tiles.append(tile)
    cls.date_resource.save()

    # add resource instance with a creation date and a cultural period defined
    cls.date_and_cultural_period_resource = Resource(graph_id=cls.search_model_graphid)
    tile = Tile(data={cls.search_model_creation_date_nodeid: "1942-01-01"},
                nodegroup_id=cls.search_model_creation_date_nodeid)
    cls.date_and_cultural_period_resource.tiles.append(tile)
    tile = Tile(data={cls.search_model_cultural_period_nodeid: [valueid]},
                nodegroup_id=cls.search_model_cultural_period_nodeid)
    cls.date_and_cultural_period_resource.tiles.append(tile)
    cls.date_and_cultural_period_resource.save()

    # add resource instance with no dates or periods defined
    cls.name_resource = Resource(graph_id=cls.search_model_graphid)
    tile = Tile(data={cls.search_model_name_nodeid: "some test name"},
                nodegroup_id=cls.search_model_name_nodeid)
    cls.name_resource.tiles.append(tile)
    geom = {
        "type": "FeatureCollection",
        "features": [{
            "geometry": {
                "type": "Point",
                "coordinates": [0, 0]
            },
            "type": "Feature",
            "properties": {}
        }],
    }
    tile = Tile(data={cls.search_model_geom_nodeid: geom},
                nodegroup_id=cls.search_model_geom_nodeid)
    cls.name_resource.tiles.append(tile)
    cls.name_resource.save()

    # add delay to allow for indexes to be updated
    time.sleep(1)
def delete_index(self):
    """Delete all indexed 'concept' documents whose id matches self.id."""
    search_engine = SearchEngineFactory().create()
    id_term = Term(field='id', term=self.id)
    q = Query(search_engine, start=0, limit=10000)
    q.add_query(id_term)
    q.delete(index='strings', doc_type='concept')
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    start = datetime.now()
    # print() call form for py3 compatibility and consistency with the module.
    print("Indexing concepts")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='strings', doc_type='concept')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        # Pass 1: labels of Collections and ConceptSchemes; each is recorded
        # as its own top concept.
        for conceptValue in models.Value.objects.filter(
                Q(concept__nodetype='Collection') | Q(concept__nodetype='ConceptScheme'),
                valuetype__category='label'):
            doc = {
                'category': 'label',
                'conceptid': conceptValue.concept_id,
                'language': conceptValue.language_id,
                'value': conceptValue.value,
                'type': conceptValue.valuetype_id,
                'id': conceptValue.valueid,
                'top_concept': conceptValue.concept_id
            }
            concept_indexer.add(index='strings', doc_type='concept', id=doc['id'], data=doc)

        # Quoted, comma-separated label value types for the SQL "in" clauses.
        # (The unused concept_strings and valueTypes2 accumulators were removed.)
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(
                category='label').values_list('valuetype', flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # Pass 2: for every top concept, recursively index the labels of all
        # narrower descendants, tagging each with the top concept id.
        for conceptValue in models.Relation.objects.filter(relationtype='hasTopConcept'):
            topConcept = conceptValue.conceptto_id
            # NOTE(review): SQL assembled with str.format; inputs come from the
            # database, not request data — keep it that way.
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                    FROM relations d
                    JOIN values c ON(c.conceptid = d.conceptidto)
                    JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                    WHERE d.conceptidto = '{0}'
                    and c2.valuetype = 'prefLabel'
                    and c.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                    UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                    FROM relations d
                    JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                    JOIN values v ON(v.conceptid = d.conceptidto)
                    JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                    WHERE v2.valuetype = 'prefLabel'
                    and v.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            # Row layout: (valueid, value, conceptid, languageid, valuetype).
            for conceptValue in cursor.fetchall():
                doc = {
                    'category': 'label',
                    'conceptid': conceptValue[2],
                    'language': conceptValue[3],
                    'value': conceptValue[1],
                    'type': conceptValue[4],
                    'id': conceptValue[0],
                    'top_concept': topConcept
                }
                concept_indexer.add(index='strings', doc_type='concept', id=doc['id'], data=doc)

    # Sanity check: DB label count vs indexed document count.
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index='strings', doc_type='concept')
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
        'Passed' if concept_count_in_db == index_count else 'Failed',
        concept_count_in_db, index_count, (datetime.now() - start).seconds))