def index(self):
    """
    Indexes all the necessary items values of a resource to support search.

    Skipped entirely for the system-settings resource model, which is
    never exposed to search.
    """

    if str(self.graph_id) != str(settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
        datatype_factory = DataTypeFactory()
        # Map nodeid -> datatype for every node up front so document building
        # can resolve datatypes without a per-node query.
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
        }
        document, terms = self.get_documents_to_index(
            datatype_factory=datatype_factory, node_datatypes=node_datatypes
        )
        document["root_ontology_class"] = self.get_root_ontology()
        doc = JSONSerializer().serializeToPython(document)
        se.index_data(index=RESOURCES_INDEX, body=doc, id=self.pk)

        # each extracted term document goes into the terms index individually
        for term in terms:
            se.index_data("terms", body=term["_source"], id=term["_id"])

        # also push this resource into any custom indexes configured in settings
        for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
            es_index = import_class_from_string(index["module"])(index["name"])
            doc, doc_id = es_index.get_documents_to_index(self, document["tiles"])
            es_index.index_document(document=doc, id=doc_id)
def save(self):
    """
    Saves this resource-to-resource relation, stamping created/modified
    timestamps and mirroring a dict copy of the record into the
    resource-relations search index before persisting the row.
    """
    # Local imports sidestep a circular dependency with the search package.
    from arches.app.search.search_engine_factory import SearchEngineInstance as se
    from arches.app.search.mappings import RESOURCE_RELATIONS_INDEX

    if not self.created:
        self.created = datetime.datetime.now()
    self.modified = datetime.datetime.now()

    payload = model_to_dict(self)
    se.index_data(index=RESOURCE_RELATIONS_INDEX, body=payload, idfield="resourcexid")

    super(ResourceXResource, self).save()
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    start = datetime.now()
    print("Indexing resource to resource relations")
    cursor = connection.cursor()
    if clear_index:
        q = Query(se=se)
        q.delete(index=RESOURCE_RELATIONS_INDEX)

    # stream rows straight from the database and bulk-index them in batches
    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, notes, datestarted, dateended, relationshiptype,
                resourceinstanceidfrom, resourceinstancefrom_graphid, resourceinstanceidto,
                resourceinstanceto_graphid, modified, created, inverserelationshiptype, tileid, nodeid
            FROM public.resource_x_resource
        """
        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # tuple positions correspond to the column order of the SELECT above
            doc = {
                "resourcexid": resource_relation[0],
                "notes": resource_relation[1],
                "datestarted": resource_relation[2],
                "dateended": resource_relation[3],
                "relationshiptype": resource_relation[4],
                "resourceinstanceidfrom": resource_relation[5],
                "resourceinstancefrom_graphid": resource_relation[6],
                "resourceinstanceidto": resource_relation[7],
                "resourceinstanceto_graphid": resource_relation[8],
                "modified": resource_relation[9],
                "created": resource_relation[10],
                "inverserelationshiptype": resource_relation[11],
                "tileid": resource_relation[12],
                "nodeid": resource_relation[13],
            }
            resource_relations_indexer.add(index=RESOURCE_RELATIONS_INDEX, id=doc["resourcexid"], data=doc)

    # report db row count vs indexed count as a pass/fail summary
    index_count = se.count(index=RESOURCE_RELATIONS_INDEX)
    print(
        "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
            "Passed" if cursor.rowcount == index_count else "Failed",
            cursor.rowcount,
            index_count,
            (datetime.now() - start).seconds,
        )
    )
def delete_index(self, resourceinstanceid=None):
    """
    Deletes all references to a resource from all indexes

    Keyword Arguments:
    resourceinstanceid -- the resource instance id to delete from related indexes,
    if supplied will use this over self.resourceinstanceid
    """

    if resourceinstanceid is None:
        resourceinstanceid = self.resourceinstanceid
    resourceinstanceid = str(resourceinstanceid)

    # delete any related terms
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=TERMS_INDEX)

    # delete any related resource index entries (relations in either direction)
    query = Query(se)
    bool_query = Bool()
    bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
    bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=RESOURCE_RELATIONS_INDEX)

    # reindex any related resources so their documents stop referencing this id
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
    query.add_query(bool_query)
    results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
    for result in results:
        try:
            res = Resource.objects.get(pk=result["_id"])
            res.load_tiles()
            res.index()
        except ObjectDoesNotExist:
            # index hit is stale (resource already deleted); nothing to reindex
            pass

    # delete resource index
    se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)

    # delete resources from custom indexes
    for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
        es_index = import_class_from_string(index["module"])(index["name"])
        es_index.delete_resources(resources=self)
def iterate_all_documents(self, se, index, pagesize=250, scroll_timeout="1m"):
    """
    Helper to iterate ALL values from a single index. Yields
    (resourceinstanceid, graph_id) tuples for every document.
    https://techoverflow.net/2019/05/07/elasticsearch-how-to-iterate-scroll-through-all-documents-in-index/

    Arguments:
    se -- the Arches search engine instance
    index -- name of the index to scroll through

    Keyword Arguments:
    pagesize -- number of documents fetched per scroll request
    scroll_timeout -- how long Elasticsearch keeps the scroll context alive
        between requests
    """
    is_first = True
    scroll_id = None
    while True:
        # Scroll next
        if is_first:
            # Initialize scroll; honor the caller-supplied timeout
            # (previously hard-coded to "1m", ignoring scroll_timeout).
            result = se.search(index=index, scroll=scroll_timeout, body={"size": pagesize})
            is_first = False
        else:
            ## note: need to access the ElasticSearch() instance directly
            ## here, (.es), because the Arches se object doesn't inherit .scroll()
            result = se.es.scroll(body={"scroll_id": scroll_id, "scroll": scroll_timeout})
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from (
            (hit["_source"]["resourceinstanceid"], hit["_source"]["graph_id"])
            for hit in hits
        )
def delete(self, *args, **kwargs):
    """
    Deletes this relation from the database and from the resource-relations
    index, and scrubs any reference to a deleted resource out of the tile
    that created the relation.

    Keyword Arguments:
    deletedResourceId -- popped from kwargs; when given (and this relation is
        backed by a tile/node), tile entries whose "resourceId" matches it are
        removed from the tile data before the tile is re-saved
    """
    # Imported locally to avoid a circular import at module load time.
    from arches.app.search.search_engine_factory import SearchEngineInstance as se
    from arches.app.search.mappings import RESOURCE_RELATIONS_INDEX

    se.delete(index=RESOURCE_RELATIONS_INDEX, id=self.resourcexid)

    # update the resource-instance tile by removing any references to a deleted resource
    deletedResourceId = kwargs.pop("deletedResourceId", None)
    if deletedResourceId and self.tileid and self.nodeid:
        data = self.tileid.data[str(self.nodeid_id)]
        # a single related resource may be stored as a bare dict rather than
        # a one-element list; normalize before filtering
        if not isinstance(data, list):
            data = [data]
        newTileData = [
            relatedresourceItem
            for relatedresourceItem in data
            if relatedresourceItem["resourceId"] != str(deletedResourceId)
        ]
        self.tileid.data[str(self.nodeid_id)] = newTileData
        self.tileid.save()

    super(ResourceXResource, self).delete()
def delete_index(self, resourceinstanceid=None):
    """
    Deletes all references to a resource from all indexes

    Keyword Arguments:
    resourceinstanceid -- the resource instance id to delete from related indexes,
    if supplied will use this over self.resourceinstanceid
    """

    if resourceinstanceid is None:
        resourceinstanceid = self.resourceinstanceid
    resourceinstanceid = str(resourceinstanceid)

    # delete any related terms
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=TERMS_INDEX)

    # delete any related resource index entries (relations in either direction)
    query = Query(se)
    bool_query = Bool()
    bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
    bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=RESOURCE_RELATIONS_INDEX)

    # reindex any related resources so their documents stop referencing this id
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
    query.add_query(bool_query)
    results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
    for result in results:
        # The search hit may be stale (the related resource was itself just
        # deleted from the db); skip it instead of crashing the whole delete.
        # This matches the guard used by the sibling delete_index variant.
        try:
            res = Resource.objects.get(pk=result["_id"])
            res.load_tiles()
            res.index()
        except ObjectDoesNotExist:
            pass

    # delete resource index
    se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)
def delete(self, user={}, note=""):
    """
    Deletes a single resource and any related indexed data

    Keyword Arguments:
    user -- the django user ({} means an anonymous/system delete) requesting the
        delete; non-reviewers may only delete fully provisional resources.
        NOTE: the mutable {} default is safe here because it is never mutated,
        and is kept for backward compatibility with existing callers.
    note -- unused; kept for backward compatibility with existing callers

    Returns True when the delete was permitted (and performed), False otherwise.
    """

    permit_deletion = False
    graph = models.GraphModel.objects.get(graphid=self.graph_id)
    if graph.isactive is False:
        message = _("This model is not yet active; unable to delete.")
        raise ModelInactiveError(message)
    if user != {}:
        user_is_reviewer = user_is_resource_reviewer(user)
        if user_is_reviewer is False:
            tiles = list(models.TileModel.objects.filter(resourceinstance=self))
            # a resource whose tiles all carry empty data is purely provisional,
            # so a non-reviewer is still allowed to delete it
            resource_is_provisional = True if sum([len(t.data) for t in tiles]) == 0 else False
            if resource_is_provisional is True:
                permit_deletion = True
        else:
            permit_deletion = True
    else:
        permit_deletion = True

    if permit_deletion is True:
        related_resources = self.get_related_resources(lang="en-US", start=0, limit=1000, page=0)
        for rr in related_resources["resource_relationships"]:
            # delete any related resource entries, also reindex the resource that
            # references this resource that's being deleted
            try:
                resourceXresource = models.ResourceXResource.objects.get(pk=rr["resourcexid"])
                resource_to_reindex = (
                    resourceXresource.resourceinstanceidfrom_id
                    if resourceXresource.resourceinstanceidto_id == self.resourceinstanceid
                    else resourceXresource.resourceinstanceidto_id
                )
                resourceXresource.delete(deletedResourceId=self.resourceinstanceid)
                res = Resource.objects.get(pk=resource_to_reindex)
                res.load_tiles()
                res.index()
            except ObjectDoesNotExist:
                # relation row already gone from the db; just purge the stale index entry
                se.delete(index=RESOURCE_RELATIONS_INDEX, id=rr["resourcexid"])

        # remove every term document belonging to this resource
        query = Query(se)
        bool_query = Bool()
        bool_query.filter(Terms(field="resourceinstanceid", terms=[self.resourceinstanceid]))
        query.add_query(bool_query)
        results = query.search(index=TERMS_INDEX)["hits"]["hits"]
        for result in results:
            se.delete(index=TERMS_INDEX, id=result["_id"])
        se.delete(index=RESOURCES_INDEX, id=self.resourceinstanceid)

        # Best effort: an edit-log failure must not block the delete itself.
        # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt still propagate.
        try:
            self.save_edit(edit_type="delete", user=user, note=self.displayname)
        except Exception:
            pass
        super(Resource, self).delete()

    return permit_deletion
def bulk_save(resources):
    """
    Saves and indexes a list of resources

    Arguments:
    resources -- a list of resource models
    """

    datatype_factory = DataTypeFactory()
    # map nodeid -> datatype once so document building avoids per-node queries
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")}
    tiles = []
    documents = []
    term_list = []

    # flatten each resource's tile tree into one list for a single bulk insert
    for resource in resources:
        resource.tiles = resource.get_flattened_tiles()
        tiles.extend(resource.tiles)

    # need to save the models first before getting the documents for index
    start = time()
    Resource.objects.bulk_create(resources)
    TileModel.objects.bulk_create(tiles)
    print(f"Time to bulk create tiles and resources: {datetime.timedelta(seconds=time() - start)}")

    start = time()
    for resource in resources:
        resource.save_edit(edit_type="create")
    # one summary edit record covers the bulk tile creation
    resources[0].tiles[0].save_edit(note=f"Bulk created: {len(tiles)} for {len(resources)} resources.", edit_type="bulk_create")
    print("Time to save resource edits: %s" % datetime.timedelta(seconds=time() - start))

    # build the ES bulk payloads (tiles were already fetched above, so fetchTiles=False)
    for resource in resources:
        start = time()
        document, terms = resource.get_documents_to_index(
            fetchTiles=False, datatype_factory=datatype_factory, node_datatypes=node_datatypes
        )
        documents.append(se.create_bulk_item(index=RESOURCES_INDEX, id=document["resourceinstanceid"], data=document))
        for term in terms:
            term_list.append(se.create_bulk_item(index=TERMS_INDEX, id=term["_id"], data=term["_source"]))

    se.bulk_index(documents)
    se.bulk_index(term_list)
def get_related_resources(
    self,
    lang="en-US",
    limit=settings.RELATED_RESOURCES_EXPORT_LIMIT,
    start=0,
    page=0,
    user=None,
    resourceinstance_graphid=None,
):
    """
    Returns an object that lists the related resources, the relationship types,
    and a reference to the current resource

    Keyword Arguments:
    lang -- language code used to localize relationship-type labels
    limit -- max number of relations to return (replaced by the per-page size when page > 0)
    start -- offset into the relation results (recomputed when page > 0)
    page -- 1-based page number; 0 means "no paging", use start/limit as given
    user -- when supplied, relations touching instances restricted for this user are hidden
    resourceinstance_graphid -- when supplied, only relations joining this
        resource's graph with the given graph (in either direction) are returned
    """

    # display metadata (name/icon/color) for every active resource graph
    graphs = (
        models.GraphModel.objects.all()
        .exclude(pk=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
        .exclude(isresource=False)
        .exclude(isactive=False)
    )
    graph_lookup = {
        str(graph.graphid): {"name": graph.name, "iconclass": graph.iconclass, "fillColor": graph.color}
        for graph in graphs
    }
    ret = {
        "resource_instance": self,
        "resource_relationships": [],
        "related_resources": [],
        "node_config_lookup": graph_lookup,
    }

    # page > 0 switches from the export limit to paged results
    if page > 0:
        limit = settings.RELATED_RESOURCES_PER_PAGE
        start = limit * int(page - 1)

    def get_relations(resourceinstanceid, start, limit, resourceinstance_graphid=None):
        # relations where the given resource appears on either side
        query = Query(se, start=start, limit=limit)
        bool_filter = Bool()
        bool_filter.should(Terms(field="resourceinstanceidfrom", terms=resourceinstanceid))
        bool_filter.should(Terms(field="resourceinstanceidto", terms=resourceinstanceid))

        if resourceinstance_graphid:
            # additionally require the relation to join this resource's graph
            # with the requested graph, in either direction
            graph_filter = Bool()
            to_graph_id_filter = Bool()
            to_graph_id_filter.filter(Terms(field="resourceinstancefrom_graphid", terms=str(self.graph_id)))
            to_graph_id_filter.filter(Terms(field="resourceinstanceto_graphid", terms=resourceinstance_graphid))
            graph_filter.should(to_graph_id_filter)

            from_graph_id_filter = Bool()
            from_graph_id_filter.filter(Terms(field="resourceinstancefrom_graphid", terms=resourceinstance_graphid))
            from_graph_id_filter.filter(Terms(field="resourceinstanceto_graphid", terms=str(self.graph_id)))
            graph_filter.should(from_graph_id_filter)
            bool_filter.must(graph_filter)

        query.add_query(bool_filter)
        return query.search(index=RESOURCE_RELATIONS_INDEX)

    resource_relations = get_relations(
        resourceinstanceid=self.resourceinstanceid,
        start=start,
        limit=limit,
        resourceinstance_graphid=resourceinstance_graphid,
    )
    ret["total"] = resource_relations["hits"]["total"]

    instanceids = set()
    restricted_instances = get_restricted_instances(user, se) if user is not None else []
    for relation in resource_relations["hits"]["hits"]:
        try:
            preflabel = get_preflabel_from_valueid(relation["_source"]["relationshiptype"], lang)
            relation["_source"]["relationshiptype_label"] = preflabel["value"] or ""
        except:
            # relationshiptype is not a concept valueid; fall back to the raw value
            relation["_source"]["relationshiptype_label"] = relation["_source"]["relationshiptype"] or ""

        resourceid_to = relation["_source"]["resourceinstanceidto"]
        resourceid_from = relation["_source"]["resourceinstanceidfrom"]
        if resourceid_to not in restricted_instances and resourceid_from not in restricted_instances:
            ret["resource_relationships"].append(relation["_source"])
            instanceids.add(resourceid_to)
            instanceids.add(resourceid_from)
        else:
            # hide relations involving restricted resources and keep the total honest
            ret["total"]["value"] -= 1

    # don't fetch this resource itself as one of its own "related" documents
    if str(self.resourceinstanceid) in instanceids:
        instanceids.remove(str(self.resourceinstanceid))

    if len(instanceids) > 0:
        related_resources = se.search(index=RESOURCES_INDEX, id=list(instanceids))
        if related_resources:
            for resource in related_resources["docs"]:
                # limit=0: only the relation count is needed, not the hits
                relations = get_relations(
                    resourceinstanceid=resource["_id"],
                    start=0,
                    limit=0,
                )
                if resource["found"]:
                    resource["_source"]["total_relations"] = relations["hits"]["total"]
                    ret["related_resources"].append(resource["_source"])

    return ret
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    quiet -- Silences the status bar output during certain operations, use in celery operations for example
    """

    status = ""
    datatype_factory = DataTypeFactory()
    # map nodeid -> datatype once so document building avoids per-node queries
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
    }

    # allow a single graph id string to be passed in place of a list
    if isinstance(resource_types, str):
        resource_types = [resource_types]

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        # query matching every document of this graph; reused below for the count
        q = Query(se=se)
        term = Term(field="graph_id", term=str(resource_type))
        q.add_query(term)
        if clear_index:
            q.delete(index=RESOURCES_INDEX, refresh=True)

        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                if quiet is False:
                    # no progress bar when indexing a single resource
                    bar = pyprind.ProgBar(len(resources), bar_char="█") if len(resources) > 1 else None
                for resource in resources:
                    if quiet is False and bar is not None:
                        bar.update(item_id=resource)
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes
                    )
                    doc_indexer.add(index=RESOURCES_INDEX, id=document["resourceinstanceid"], data=document)
                    for term in terms:
                        term_indexer.add(index=TERMS_INDEX, id=term["_id"], data=term["_source"])

        # compare db count with the indexed count for this graph
        result_summary = {"database": len(resources), "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)}
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print(
            "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
                status, graph_name, result_summary["database"], result_summary["indexed"], (datetime.now() - start).seconds
            )
        )
    return status
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    start = datetime.now()
    print("Indexing concepts")
    cursor = connection.cursor()
    if clear_index:
        q = Query(se=se)
        q.delete(index=CONCEPTS_INDEX)

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        indexed_values = []
        # first index the labels of collections and concept schemes themselves;
        # each acts as its own top concept
        for conceptValue in models.Value.objects.filter(
            Q(concept__nodetype="Collection") | Q(concept__nodetype="ConceptScheme"), valuetype__category="label"
        ):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)
            indexed_values.append(doc["id"])

        # build a quoted, comma-separated list of label value types for the SQL below
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(category="label").values_list("valuetype", flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # walk down from every top concept and index each descendant label,
        # recording the top concept it hangs under.
        # NOTE(review): the SQL is built with str.format rather than bind
        # parameters; topConcept comes from the relations table (a stored id),
        # so it does not appear to be user-controlled input — verify before
        # reusing this pattern elsewhere.
        for conceptValue in models.Relation.objects.filter(relationtype="hasTopConcept"):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth ---|NonRecursive Part
                    FROM relations d
                    JOIN values c ON(c.conceptid = d.conceptidto)
                    JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                    WHERE d.conceptidto = '{0}'
                    and c2.valuetype = 'prefLabel'
                    and c.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                    UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1 ---|RecursivePart
                    FROM relations d
                    JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                    JOIN values v ON(v.conceptid = d.conceptidto)
                    JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                    WHERE v2.valuetype = 'prefLabel'
                    and v.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                )
                SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                doc = {
                    "category": "label",
                    "conceptid": conceptValue[2],
                    "language": conceptValue[3],
                    "value": conceptValue[1],
                    "type": conceptValue[4],
                    "id": conceptValue[0],
                    "top_concept": topConcept,
                }
                concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)
                indexed_values.append(doc["id"])

        # we add this step to catch any concepts/values that are orphaned (have no parent concept)
        for conceptValue in models.Value.objects.filter(valuetype__category="label").exclude(valueid__in=indexed_values):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)

    # compare the count of label values in the db with the index count
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index=CONCEPTS_INDEX)
    print(
        "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
            "Passed" if concept_count_in_db == index_count else "Failed",
            concept_count_in_db,
            index_count,
            (datetime.now() - start).seconds,
        )
    )
def get_concept_label_from_valueid(valueid):
    """
    Looks up a single concept label document by its value id.

    Returns the document's _source dict when the id exists in the
    concepts index, otherwise None.
    """
    hit = se.search(index=CONCEPTS_INDEX, id=valueid)
    return hit["_source"] if hit["found"] else None
def index_resources_by_type(resource_types, clear_index=True, index_name=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    index_name -- only applies to custom indexes and if given will try and just refresh the data in that index
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    status = ""
    datatype_factory = DataTypeFactory()
    # map nodeid -> datatype once so document building avoids per-node queries
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
    }
    status = ""
    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        if index_name is None:
            # reindex into the standard resources/terms indexes
            q = Query(se=se)
            term = Term(field="graph_id", term=str(resource_type))
            q.add_query(term)
            if clear_index:
                q.delete(index=RESOURCES_INDEX, refresh=True)

            with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
                with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                    for resource in resources:
                        document, terms = resource.get_documents_to_index(
                            fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes
                        )
                        doc_indexer.add(index=RESOURCES_INDEX, id=document["resourceinstanceid"], data=document)
                        for term in terms:
                            term_indexer.add(index=TERMS_INDEX, id=term["_id"], data=term["_source"])

            # compare db count with the indexed count for this graph
            result_summary = {"database": len(resources), "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)}
            status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
            print(
                "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
                    status, graph_name, result_summary["database"], result_summary["indexed"], (datetime.now() - start).seconds
                )
            )
            # refresh all configured custom indexes as well
            for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(index["module"])(index["name"])
                es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
        else:
            # only refresh the single named custom index
            es_index = get_index(index_name)
            es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
    return status