def index(self):
    """
    Indexes all the necessary items values of a resource to support search.

    Skipped entirely for the system-settings resource model, which is
    never exposed to search.
    """

    if str(self.graph_id) != str(settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
        datatype_factory = DataTypeFactory()
        # Map nodeid -> datatype for every node up front so document building
        # can resolve datatypes without a per-node query.
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
        }
        document, terms = self.get_documents_to_index(
            datatype_factory=datatype_factory, node_datatypes=node_datatypes
        )
        document["root_ontology_class"] = self.get_root_ontology()
        doc = JSONSerializer().serializeToPython(document)
        se.index_data(index=RESOURCES_INDEX, body=doc, id=self.pk)

        # each extracted term document goes into the terms index individually
        for term in terms:
            se.index_data("terms", body=term["_source"], id=term["_id"])

        # also push this resource into any custom indexes configured in settings
        for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
            es_index = import_class_from_string(index["module"])(index["name"])
            doc, doc_id = es_index.get_documents_to_index(self, document["tiles"])
            es_index.index_document(document=doc, id=doc_id)
def save(self):
    """
    Saves this resource-to-resource relation, stamping created/modified
    timestamps and mirroring a dict copy of the record into the
    resource-relations search index before persisting the row.
    """
    # Local imports sidestep a circular dependency with the search package.
    from arches.app.search.search_engine_factory import SearchEngineInstance as se
    from arches.app.search.mappings import RESOURCE_RELATIONS_INDEX

    if not self.created:
        self.created = datetime.datetime.now()
    self.modified = datetime.datetime.now()

    payload = model_to_dict(self)
    se.index_data(index=RESOURCE_RELATIONS_INDEX, body=payload, idfield="resourcexid")

    super(ResourceXResource, self).save()
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    start = datetime.now()
    print("Indexing resource to resource relations")
    cursor = connection.cursor()
    if clear_index:
        q = Query(se=se)
        q.delete(index=RESOURCE_RELATIONS_INDEX)

    # stream rows straight from the database and bulk-index them in batches
    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, notes, datestarted, dateended, relationshiptype,
                resourceinstanceidfrom, resourceinstancefrom_graphid, resourceinstanceidto,
                resourceinstanceto_graphid, modified, created, inverserelationshiptype, tileid, nodeid
            FROM public.resource_x_resource
        """
        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # tuple positions correspond to the column order of the SELECT above
            doc = {
                "resourcexid": resource_relation[0],
                "notes": resource_relation[1],
                "datestarted": resource_relation[2],
                "dateended": resource_relation[3],
                "relationshiptype": resource_relation[4],
                "resourceinstanceidfrom": resource_relation[5],
                "resourceinstancefrom_graphid": resource_relation[6],
                "resourceinstanceidto": resource_relation[7],
                "resourceinstanceto_graphid": resource_relation[8],
                "modified": resource_relation[9],
                "created": resource_relation[10],
                "inverserelationshiptype": resource_relation[11],
                "tileid": resource_relation[12],
                "nodeid": resource_relation[13],
            }
            resource_relations_indexer.add(index=RESOURCE_RELATIONS_INDEX, id=doc["resourcexid"], data=doc)

    # report db row count vs indexed count as a pass/fail summary
    index_count = se.count(index=RESOURCE_RELATIONS_INDEX)
    print(
        "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
            "Passed" if cursor.rowcount == index_count else "Failed",
            cursor.rowcount,
            index_count,
            (datetime.now() - start).seconds,
        )
    )
def delete_index(self, resourceinstanceid=None):
    """
    Deletes all references to a resource from all indexes

    Keyword Arguments:
    resourceinstanceid -- the resource instance id to delete from related indexes,
    if supplied will use this over self.resourceinstanceid
    """

    if resourceinstanceid is None:
        resourceinstanceid = self.resourceinstanceid
    resourceinstanceid = str(resourceinstanceid)

    # delete any related terms
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=TERMS_INDEX)

    # delete any related resource index entries (relations in either direction)
    query = Query(se)
    bool_query = Bool()
    bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
    bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=RESOURCE_RELATIONS_INDEX)

    # reindex any related resources so their documents stop referencing this id
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
    query.add_query(bool_query)
    results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
    for result in results:
        try:
            res = Resource.objects.get(pk=result["_id"])
            res.load_tiles()
            res.index()
        except ObjectDoesNotExist:
            # index hit is stale (resource already deleted); nothing to reindex
            pass

    # delete resource index
    se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)

    # delete resources from custom indexes
    for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
        es_index = import_class_from_string(index["module"])(index["name"])
        es_index.delete_resources(resources=self)
def iterate_all_documents(self, se, index, pagesize=250, scroll_timeout="1m"):
    """
    Helper to iterate ALL values from a single index. Yields
    (resourceinstanceid, graph_id) tuples for every document.
    https://techoverflow.net/2019/05/07/elasticsearch-how-to-iterate-scroll-through-all-documents-in-index/

    Arguments:
    se -- the Arches search engine instance
    index -- name of the index to scroll through

    Keyword Arguments:
    pagesize -- number of documents fetched per scroll request
    scroll_timeout -- how long Elasticsearch keeps the scroll context alive
        between requests
    """
    is_first = True
    scroll_id = None
    while True:
        # Scroll next
        if is_first:
            # Initialize scroll; honor the caller-supplied timeout
            # (previously hard-coded to "1m", ignoring scroll_timeout).
            result = se.search(index=index, scroll=scroll_timeout, body={"size": pagesize})
            is_first = False
        else:
            ## note: need to access the ElasticSearch() instance directly
            ## here, (.es), because the Arches se object doesn't inherit .scroll()
            result = se.es.scroll(body={"scroll_id": scroll_id, "scroll": scroll_timeout})
        scroll_id = result["_scroll_id"]
        hits = result["hits"]["hits"]
        # Stop after no more docs
        if not hits:
            break
        # Yield each entry
        yield from (
            (hit["_source"]["resourceinstanceid"], hit["_source"]["graph_id"])
            for hit in hits
        )
def delete(self, *args, **kwargs):
    """
    Deletes this relation from the database and from the resource-relations
    index, and scrubs any reference to a deleted resource out of the tile
    that created the relation.

    Keyword Arguments:
    deletedResourceId -- popped from kwargs; when given (and this relation is
        backed by a tile/node), tile entries whose "resourceId" matches it are
        removed from the tile data before the tile is re-saved
    """
    # Imported locally to avoid a circular import at module load time.
    from arches.app.search.search_engine_factory import SearchEngineInstance as se
    from arches.app.search.mappings import RESOURCE_RELATIONS_INDEX

    se.delete(index=RESOURCE_RELATIONS_INDEX, id=self.resourcexid)

    # update the resource-instance tile by removing any references to a deleted resource
    deletedResourceId = kwargs.pop("deletedResourceId", None)
    if deletedResourceId and self.tileid and self.nodeid:
        data = self.tileid.data[str(self.nodeid_id)]
        # a single related resource may be stored as a bare dict rather than
        # a one-element list; normalize before filtering
        if not isinstance(data, list):
            data = [data]
        newTileData = [
            relatedresourceItem
            for relatedresourceItem in data
            if relatedresourceItem["resourceId"] != str(deletedResourceId)
        ]
        self.tileid.data[str(self.nodeid_id)] = newTileData
        self.tileid.save()

    super(ResourceXResource, self).delete()
def delete_index(self, resourceinstanceid=None):
    """
    Deletes all references to a resource from all indexes

    Keyword Arguments:
    resourceinstanceid -- the resource instance id to delete from related indexes,
    if supplied will use this over self.resourceinstanceid
    """

    if resourceinstanceid is None:
        resourceinstanceid = self.resourceinstanceid
    resourceinstanceid = str(resourceinstanceid)

    # delete any related terms
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=TERMS_INDEX)

    # delete any related resource index entries (relations in either direction)
    query = Query(se)
    bool_query = Bool()
    bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
    bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
    query.add_query(bool_query)
    query.delete(index=RESOURCE_RELATIONS_INDEX)

    # reindex any related resources so their documents stop referencing this id
    query = Query(se)
    bool_query = Bool()
    bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
    query.add_query(bool_query)
    results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
    for result in results:
        # The search hit may be stale (the related resource was itself just
        # deleted from the db); skip it instead of crashing the whole delete.
        # This matches the guard used by the sibling delete_index variant.
        try:
            res = Resource.objects.get(pk=result["_id"])
            res.load_tiles()
            res.index()
        except ObjectDoesNotExist:
            pass

    # delete resource index
    se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)
def delete(self, user={}, note=""):
    """
    Deletes a single resource and any related indexed data

    Keyword Arguments:
    user -- the django user ({} means an anonymous/system delete) requesting the
        delete; non-reviewers may only delete fully provisional resources.
        NOTE: the mutable {} default is safe here because it is never mutated,
        and is kept for backward compatibility with existing callers.
    note -- unused; kept for backward compatibility with existing callers

    Returns True when the delete was permitted (and performed), False otherwise.
    """

    permit_deletion = False
    graph = models.GraphModel.objects.get(graphid=self.graph_id)
    if graph.isactive is False:
        message = _("This model is not yet active; unable to delete.")
        raise ModelInactiveError(message)
    if user != {}:
        user_is_reviewer = user_is_resource_reviewer(user)
        if user_is_reviewer is False:
            tiles = list(models.TileModel.objects.filter(resourceinstance=self))
            # a resource whose tiles all carry empty data is purely provisional,
            # so a non-reviewer is still allowed to delete it
            resource_is_provisional = True if sum([len(t.data) for t in tiles]) == 0 else False
            if resource_is_provisional is True:
                permit_deletion = True
        else:
            permit_deletion = True
    else:
        permit_deletion = True

    if permit_deletion is True:
        related_resources = self.get_related_resources(lang="en-US", start=0, limit=1000, page=0)
        for rr in related_resources["resource_relationships"]:
            # delete any related resource entries, also reindex the resource that
            # references this resource that's being deleted
            try:
                resourceXresource = models.ResourceXResource.objects.get(pk=rr["resourcexid"])
                resource_to_reindex = (
                    resourceXresource.resourceinstanceidfrom_id
                    if resourceXresource.resourceinstanceidto_id == self.resourceinstanceid
                    else resourceXresource.resourceinstanceidto_id
                )
                resourceXresource.delete(deletedResourceId=self.resourceinstanceid)
                res = Resource.objects.get(pk=resource_to_reindex)
                res.load_tiles()
                res.index()
            except ObjectDoesNotExist:
                # relation row already gone from the db; just purge the stale index entry
                se.delete(index=RESOURCE_RELATIONS_INDEX, id=rr["resourcexid"])

        # remove every term document belonging to this resource
        query = Query(se)
        bool_query = Bool()
        bool_query.filter(Terms(field="resourceinstanceid", terms=[self.resourceinstanceid]))
        query.add_query(bool_query)
        results = query.search(index=TERMS_INDEX)["hits"]["hits"]
        for result in results:
            se.delete(index=TERMS_INDEX, id=result["_id"])
        se.delete(index=RESOURCES_INDEX, id=self.resourceinstanceid)

        # Best effort: an edit-log failure must not block the delete itself.
        # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt still propagate.
        try:
            self.save_edit(edit_type="delete", user=user, note=self.displayname)
        except Exception:
            pass
        super(Resource, self).delete()

    return permit_deletion
def bulk_save(resources):
    """
    Saves and indexes a list of resources

    Arguments:
    resources -- a list of resource models
    """

    datatype_factory = DataTypeFactory()
    # map nodeid -> datatype once so document building avoids per-node queries
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")}
    tiles = []
    documents = []
    term_list = []

    # flatten each resource's tile tree into one list for a single bulk insert
    for resource in resources:
        resource.tiles = resource.get_flattened_tiles()
        tiles.extend(resource.tiles)

    # need to save the models first before getting the documents for index
    start = time()
    Resource.objects.bulk_create(resources)
    TileModel.objects.bulk_create(tiles)
    print(f"Time to bulk create tiles and resources: {datetime.timedelta(seconds=time() - start)}")

    start = time()
    for resource in resources:
        resource.save_edit(edit_type="create")
    # one summary edit record covers the bulk tile creation
    resources[0].tiles[0].save_edit(note=f"Bulk created: {len(tiles)} for {len(resources)} resources.", edit_type="bulk_create")
    print("Time to save resource edits: %s" % datetime.timedelta(seconds=time() - start))

    # build the ES bulk payloads (tiles were already fetched above, so fetchTiles=False)
    for resource in resources:
        start = time()
        document, terms = resource.get_documents_to_index(
            fetchTiles=False, datatype_factory=datatype_factory, node_datatypes=node_datatypes
        )
        documents.append(se.create_bulk_item(index=RESOURCES_INDEX, id=document["resourceinstanceid"], data=document))
        for term in terms:
            term_list.append(se.create_bulk_item(index=TERMS_INDEX, id=term["_id"], data=term["_source"]))

    se.bulk_index(documents)
    se.bulk_index(term_list)
def get_related_resources(
    self,
    lang="en-US",
    limit=settings.RELATED_RESOURCES_EXPORT_LIMIT,
    start=0,
    page=0,
    user=None,
    resourceinstance_graphid=None,
):
    """
    Returns an object that lists the related resources, the relationship types,
    and a reference to the current resource

    Keyword Arguments:
    lang -- language code used to localize relationship-type labels
    limit -- max number of relations to return (replaced by the per-page size when page > 0)
    start -- offset into the relation results (recomputed when page > 0)
    page -- 1-based page number; 0 means "no paging", use start/limit as given
    user -- when supplied, relations touching instances restricted for this user are hidden
    resourceinstance_graphid -- when supplied, only relations joining this
        resource's graph with the given graph (in either direction) are returned
    """

    # display metadata (name/icon/color) for every active resource graph
    graphs = (
        models.GraphModel.objects.all()
        .exclude(pk=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
        .exclude(isresource=False)
        .exclude(isactive=False)
    )
    graph_lookup = {
        str(graph.graphid): {"name": graph.name, "iconclass": graph.iconclass, "fillColor": graph.color}
        for graph in graphs
    }
    ret = {
        "resource_instance": self,
        "resource_relationships": [],
        "related_resources": [],
        "node_config_lookup": graph_lookup,
    }

    # page > 0 switches from the export limit to paged results
    if page > 0:
        limit = settings.RELATED_RESOURCES_PER_PAGE
        start = limit * int(page - 1)

    def get_relations(resourceinstanceid, start, limit, resourceinstance_graphid=None):
        # relations where the given resource appears on either side
        query = Query(se, start=start, limit=limit)
        bool_filter = Bool()
        bool_filter.should(Terms(field="resourceinstanceidfrom", terms=resourceinstanceid))
        bool_filter.should(Terms(field="resourceinstanceidto", terms=resourceinstanceid))

        if resourceinstance_graphid:
            # additionally require the relation to join this resource's graph
            # with the requested graph, in either direction
            graph_filter = Bool()
            to_graph_id_filter = Bool()
            to_graph_id_filter.filter(Terms(field="resourceinstancefrom_graphid", terms=str(self.graph_id)))
            to_graph_id_filter.filter(Terms(field="resourceinstanceto_graphid", terms=resourceinstance_graphid))
            graph_filter.should(to_graph_id_filter)

            from_graph_id_filter = Bool()
            from_graph_id_filter.filter(Terms(field="resourceinstancefrom_graphid", terms=resourceinstance_graphid))
            from_graph_id_filter.filter(Terms(field="resourceinstanceto_graphid", terms=str(self.graph_id)))
            graph_filter.should(from_graph_id_filter)
            bool_filter.must(graph_filter)

        query.add_query(bool_filter)
        return query.search(index=RESOURCE_RELATIONS_INDEX)

    resource_relations = get_relations(
        resourceinstanceid=self.resourceinstanceid,
        start=start,
        limit=limit,
        resourceinstance_graphid=resourceinstance_graphid,
    )
    ret["total"] = resource_relations["hits"]["total"]

    instanceids = set()
    restricted_instances = get_restricted_instances(user, se) if user is not None else []
    for relation in resource_relations["hits"]["hits"]:
        try:
            preflabel = get_preflabel_from_valueid(relation["_source"]["relationshiptype"], lang)
            relation["_source"]["relationshiptype_label"] = preflabel["value"] or ""
        except:
            # relationshiptype is not a concept valueid; fall back to the raw value
            relation["_source"]["relationshiptype_label"] = relation["_source"]["relationshiptype"] or ""

        resourceid_to = relation["_source"]["resourceinstanceidto"]
        resourceid_from = relation["_source"]["resourceinstanceidfrom"]
        if resourceid_to not in restricted_instances and resourceid_from not in restricted_instances:
            ret["resource_relationships"].append(relation["_source"])
            instanceids.add(resourceid_to)
            instanceids.add(resourceid_from)
        else:
            # hide relations involving restricted resources and keep the total honest
            ret["total"]["value"] -= 1

    # don't fetch this resource itself as one of its own "related" documents
    if str(self.resourceinstanceid) in instanceids:
        instanceids.remove(str(self.resourceinstanceid))

    if len(instanceids) > 0:
        related_resources = se.search(index=RESOURCES_INDEX, id=list(instanceids))
        if related_resources:
            for resource in related_resources["docs"]:
                # limit=0: only the relation count is needed, not the hits
                relations = get_relations(
                    resourceinstanceid=resource["_id"],
                    start=0,
                    limit=0,
                )
                if resource["found"]:
                    resource["_source"]["total_relations"] = relations["hits"]["total"]
                    ret["related_resources"].append(resource["_source"])

    return ret
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    quiet -- Silences the status bar output during certain operations, use in celery operations for example
    """

    status = ""
    datatype_factory = DataTypeFactory()
    # map nodeid -> datatype once so document building avoids per-node queries
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
    }

    # allow a single graph id string to be passed in place of a list
    if isinstance(resource_types, str):
        resource_types = [resource_types]

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        # query matching every document of this graph; reused below for the count
        q = Query(se=se)
        term = Term(field="graph_id", term=str(resource_type))
        q.add_query(term)
        if clear_index:
            q.delete(index=RESOURCES_INDEX, refresh=True)

        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                if quiet is False:
                    # no progress bar when indexing a single resource
                    bar = pyprind.ProgBar(len(resources), bar_char="█") if len(resources) > 1 else None
                for resource in resources:
                    if quiet is False and bar is not None:
                        bar.update(item_id=resource)
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes
                    )
                    doc_indexer.add(index=RESOURCES_INDEX, id=document["resourceinstanceid"], data=document)
                    for term in terms:
                        term_indexer.add(index=TERMS_INDEX, id=term["_id"], data=term["_source"])

        # compare db count with the indexed count for this graph
        result_summary = {"database": len(resources), "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)}
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print(
            "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
                status, graph_name, result_summary["database"], result_summary["indexed"], (datetime.now() - start).seconds
            )
        )
    return status
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    start = datetime.now()
    print("Indexing concepts")
    cursor = connection.cursor()
    if clear_index:
        q = Query(se=se)
        q.delete(index=CONCEPTS_INDEX)

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        indexed_values = []
        # first index the labels of collections and concept schemes themselves;
        # each acts as its own top concept
        for conceptValue in models.Value.objects.filter(
            Q(concept__nodetype="Collection") | Q(concept__nodetype="ConceptScheme"), valuetype__category="label"
        ):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)
            indexed_values.append(doc["id"])

        # build a quoted, comma-separated list of label value types for the SQL below
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(category="label").values_list("valuetype", flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # walk down from every top concept and index each descendant label,
        # recording the top concept it hangs under.
        # NOTE(review): the SQL is built with str.format rather than bind
        # parameters; topConcept comes from the relations table (a stored id),
        # so it does not appear to be user-controlled input — verify before
        # reusing this pattern elsewhere.
        for conceptValue in models.Relation.objects.filter(relationtype="hasTopConcept"):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth ---|NonRecursive Part
                    FROM relations d
                    JOIN values c ON(c.conceptid = d.conceptidto)
                    JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                    WHERE d.conceptidto = '{0}'
                    and c2.valuetype = 'prefLabel'
                    and c.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                    UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1 ---|RecursivePart
                    FROM relations d
                    JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                    JOIN values v ON(v.conceptid = d.conceptidto)
                    JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                    WHERE v2.valuetype = 'prefLabel'
                    and v.valuetype in ({1})
                    and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                )
                SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)
            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                doc = {
                    "category": "label",
                    "conceptid": conceptValue[2],
                    "language": conceptValue[3],
                    "value": conceptValue[1],
                    "type": conceptValue[4],
                    "id": conceptValue[0],
                    "top_concept": topConcept,
                }
                concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)
                indexed_values.append(doc["id"])

        # we add this step to catch any concepts/values that are orphaned (have no parent concept)
        for conceptValue in models.Value.objects.filter(valuetype__category="label").exclude(valueid__in=indexed_values):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)

    # compare the count of label values in the db with the index count
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index=CONCEPTS_INDEX)
    print(
        "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format(
            "Passed" if concept_count_in_db == index_count else "Failed",
            concept_count_in_db,
            index_count,
            (datetime.now() - start).seconds,
        )
    )
def get_concept_label_from_valueid(valueid):
    """
    Looks up a single concept label document by its value id.

    Returns the document's _source dict when the id exists in the
    concepts index, otherwise None.
    """
    hit = se.search(index=CONCEPTS_INDEX, id=valueid)
    return hit["_source"] if hit["found"] else None
def index_resources_by_type(resource_types, clear_index=True, index_name=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    index_name -- only applies to custom indexes and if given will try and just refresh the data in that index
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """

    status = ""
    datatype_factory = DataTypeFactory()
    # map nodeid -> datatype once so document building avoids per-node queries
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
    }
    status = ""
    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        if index_name is None:
            # reindex into the standard resources/terms indexes
            q = Query(se=se)
            term = Term(field="graph_id", term=str(resource_type))
            q.add_query(term)
            if clear_index:
                q.delete(index=RESOURCES_INDEX, refresh=True)

            with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
                with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                    for resource in resources:
                        document, terms = resource.get_documents_to_index(
                            fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes
                        )
                        doc_indexer.add(index=RESOURCES_INDEX, id=document["resourceinstanceid"], data=document)
                        for term in terms:
                            term_indexer.add(index=TERMS_INDEX, id=term["_id"], data=term["_source"])

            # compare db count with the indexed count for this graph
            result_summary = {"database": len(resources), "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)}
            status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
            print(
                "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
                    status, graph_name, result_summary["database"], result_summary["indexed"], (datetime.now() - start).seconds
                )
            )
            # refresh all configured custom indexes as well
            for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(index["module"])(index["name"])
                es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
        else:
            # only refresh the single named custom index
            es_index = get_index(index_name)
            es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)
    return status