Beispiel #1
0
def index_resource_relations(clear_index=True,
                             batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Reindex every row of the resource_x_resource table into the
    resource relations index and print a pass/fail summary.

    Keyword Arguments:
    clear_index -- set to True to clear the resource relations index before the reindexing operation
    batch_size -- the number of records handed to the bulk indexer as a group; larger values need more memory

    """

    started_at = datetime.now()
    print("Indexing resource to resource relations")

    cursor = connection.cursor()
    if clear_index:
        Query(se=se).delete(index=RESOURCE_RELATIONS_INDEX)

    # Document keys, in the same order as the columns selected below, so a
    # row can be zipped straight into a document.
    field_names = (
        "resourcexid",
        "notes",
        "datestarted",
        "dateended",
        "relationshiptype",
        "resourceinstanceidfrom",
        "resourceinstancefrom_graphid",
        "resourceinstanceidto",
        "resourceinstanceto_graphid",
        "modified",
        "created",
        "inverserelationshiptype",
        "tileid",
        "nodeid",
    )

    with se.BulkIndexer(batch_size=batch_size,
                        refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, notes, datestarted, dateended, relationshiptype, resourceinstanceidfrom, resourceinstancefrom_graphid,
            resourceinstanceidto, resourceinstanceto_graphid, modified, created, inverserelationshiptype, tileid, nodeid
            FROM public.resource_x_resource
        """

        cursor.execute(sql)
        for row in cursor.fetchall():
            doc = dict(zip(field_names, row))
            resource_relations_indexer.add(index=RESOURCE_RELATIONS_INDEX,
                                           id=doc["resourcexid"],
                                           data=doc)

    # Compare what the query returned with what the index now reports.
    index_count = se.count(index=RESOURCE_RELATIONS_INDEX)
    db_count = cursor.rowcount
    outcome = "Passed" if db_count == index_count else "Failed"
    elapsed = (datetime.now() - started_at).seconds
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format(outcome, db_count, index_count, elapsed))
Beispiel #2
0
def index_resources_by_type(resource_types,
                            clear_index=True,
                            batch_size=settings.BULK_IMPORT_BATCH_SIZE,
                            quiet=False):
    """
    Reindex all resources belonging to the given resource type(s).

    Arguments:
    resource_types -- array of graph ids that represent resource types (a single string is wrapped in a list)

    Keyword Arguments:
    clear_index -- set to True to remove the resources of each given type from the index before reindexing it
    batch_size -- the number of records handed to the bulk indexers as a group; larger values need more memory
    quiet -- anything other than False suppresses the progress bar (useful in celery operations for example)

    Returns the "Passed"/"Failed" status of the last resource type
    processed, or "" when no resource type was processed.

    """

    status = ""
    datatype_factory = DataTypeFactory()

    # Lookup of node id (as a string) -> datatype, built once and passed to
    # every resource's get_documents_to_index call.
    node_datatypes = {}
    for nodeid, datatype in models.Node.objects.values_list(
            "nodeid", "datatype"):
        node_datatypes[str(nodeid)] = datatype

    if isinstance(resource_types, str):
        resource_types = [resource_types]

    for resource_type in resource_types:
        started_at = datetime.now()
        graph_id = str(resource_type)
        resources = Resource.objects.filter(graph_id=graph_id)
        graph_name = models.GraphModel.objects.get(graphid=graph_id).name
        print("Indexing resource type '{0}'".format(graph_name))

        # The same graph_id query drives both the optional delete and the
        # final count of indexed documents.
        type_query = Query(se=se)
        type_query.add_query(Term(field="graph_id", term=graph_id))
        if clear_index:
            type_query.delete(index=RESOURCES_INDEX, refresh=True)

        with se.BulkIndexer(batch_size=batch_size,
                            refresh=True) as doc_indexer, \
                se.BulkIndexer(batch_size=batch_size,
                               refresh=True) as term_indexer:
            # A progress bar is only shown when not quiet and there is more
            # than one resource to report on.
            bar = None
            if quiet is False:
                resource_count = len(resources)
                if resource_count > 1:
                    bar = pyprind.ProgBar(resource_count, bar_char="█")

            for resource in resources:
                if bar is not None:
                    bar.update(item_id=resource)
                document, terms = resource.get_documents_to_index(
                    fetchTiles=True,
                    datatype_factory=datatype_factory,
                    node_datatypes=node_datatypes)
                doc_indexer.add(index=RESOURCES_INDEX,
                                id=document["resourceinstanceid"],
                                data=document)
                for term_doc in terms:
                    term_indexer.add(index=TERMS_INDEX,
                                     id=term_doc["_id"],
                                     data=term_doc["_source"])

        db_count = len(resources)
        indexed_count = se.count(index=RESOURCES_INDEX, body=type_query.dsl)
        status = "Passed" if db_count == indexed_count else "Failed"
        elapsed = (datetime.now() - started_at).seconds
        print(
            "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
            .format(status, graph_name, db_count, indexed_count, elapsed))
    return status
Beispiel #3
0
def index_concepts(clear_index=True,
                   batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concept labels from the database

    Labels are indexed in three passes: labels of Collection and
    ConceptScheme concepts, labels reachable from each "hasTopConcept"
    relation, and finally any remaining label values not indexed by the
    first two passes.

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records handed to the bulk indexer as a group; larger values need more memory

    """

    def label_doc(value_obj):
        # Document for a Value model instance whose top concept is its own concept.
        return {
            "category": "label",
            "conceptid": value_obj.concept_id,
            "language": value_obj.language_id,
            "value": value_obj.value,
            "type": value_obj.valuetype_id,
            "id": value_obj.valueid,
            "top_concept": value_obj.concept_id,
        }

    started_at = datetime.now()
    print("Indexing concepts")
    cursor = connection.cursor()
    if clear_index:
        Query(se=se).delete(index=CONCEPTS_INDEX)

    with se.BulkIndexer(batch_size=batch_size,
                        refresh=True) as concept_indexer:
        indexed_values = []

        # Pass 1: labels attached to Collection / ConceptScheme concepts.
        scheme_labels = models.Value.objects.filter(
            Q(concept__nodetype="Collection")
            | Q(concept__nodetype="ConceptScheme"),
            valuetype__category="label")
        for value_obj in scheme_labels:
            doc = label_doc(value_obj)
            concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)
            indexed_values.append(doc["id"])

        # Quoted, comma separated list of label value types for the raw SQL below.
        label_types_sql = ",".join(
            "'%s'" % valuetype
            for valuetype in models.DValueType.objects.filter(
                category="label").values_list("valuetype", flat=True))

        # Pass 2: walk down from every top concept and index the labels found.
        for relation in models.Relation.objects.filter(
                relationtype="hasTopConcept"):
            top_concept = relation.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                        FROM relations d
                        JOIN values c ON(c.conceptid = d.conceptidto)
                        JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                        WHERE d.conceptidto = '{0}'
                        and c2.valuetype = 'prefLabel'
                        and c.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                        UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                        FROM relations  d
                        JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                        JOIN values v ON(v.conceptid = d.conceptidto)
                        JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                        WHERE  v2.valuetype = 'prefLabel'
                        and v.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(top_concept, label_types_sql)

            cursor.execute(sql)
            # Columns arrive in the order of the final SELECT above.
            for value_id, value, concept_id, language_id, value_type in cursor.fetchall():
                doc = {
                    "category": "label",
                    "conceptid": concept_id,
                    "language": language_id,
                    "value": value,
                    "type": value_type,
                    "id": value_id,
                    "top_concept": top_concept,
                }
                concept_indexer.add(index=CONCEPTS_INDEX,
                                    id=doc["id"],
                                    data=doc)
                indexed_values.append(doc["id"])

        # Pass 3: we add this step to catch any concepts/values that are orphaned (have no parent concept)
        orphan_labels = models.Value.objects.filter(
            valuetype__category="label").exclude(valueid__in=indexed_values)
        for value_obj in orphan_labels:
            doc = label_doc(value_obj)
            concept_indexer.add(index=CONCEPTS_INDEX, id=doc["id"], data=doc)

    # Compare the number of label values in the database with the index count.
    cursor.execute(
        "SELECT count(*) from values WHERE valuetype in ({0})".format(
            label_types_sql))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index=CONCEPTS_INDEX)

    outcome = "Passed" if concept_count_in_db == index_count else "Failed"
    elapsed = (datetime.now() - started_at).seconds
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format(outcome, concept_count_in_db, index_count, elapsed))
Beispiel #4
0
def index_resources_by_type(resource_types,
                            clear_index=True,
                            index_name=None,
                            batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types; a single graph id given as a string is treated as a one-item array

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    index_name -- only applies to custom indexes and if given will try and just refresh the data in that index
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    Returns:
    "Passed" or "Failed" for the last resource type that was indexed into
    the resources index. The status is only computed when index_name is
    None, so "" is returned if no resource type went through that path.

    """

    status = ""
    datatype_factory = DataTypeFactory()
    # Lookup of node id (as a string) -> datatype, built once and reused for
    # every resource.
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list(
            "nodeid", "datatype")
    }

    # A bare string would otherwise be iterated character by character.
    if isinstance(resource_types, str):
        resource_types = [resource_types]

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(
            graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        if index_name is None:
            # The graph_id query is used for the optional delete and again
            # for counting what ended up in the index.
            q = Query(se=se)
            q.add_query(Term(field="graph_id", term=str(resource_type)))
            if clear_index:
                q.delete(index=RESOURCES_INDEX, refresh=True)

            with se.BulkIndexer(batch_size=batch_size,
                                refresh=True) as doc_indexer:
                with se.BulkIndexer(batch_size=batch_size,
                                    refresh=True) as term_indexer:
                    for resource in resources:
                        document, terms = resource.get_documents_to_index(
                            fetchTiles=True,
                            datatype_factory=datatype_factory,
                            node_datatypes=node_datatypes)
                        doc_indexer.add(index=RESOURCES_INDEX,
                                        id=document["resourceinstanceid"],
                                        data=document)
                        for term_doc in terms:
                            term_indexer.add(index=TERMS_INDEX,
                                             id=term_doc["_id"],
                                             data=term_doc["_source"])

            result_summary = {
                "database": len(resources),
                "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)
            }
            status = "Passed" if result_summary["database"] == result_summary[
                "indexed"] else "Failed"
            print(
                "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
                .format(status, graph_name, result_summary["database"],
                        result_summary["indexed"],
                        (datetime.now() - start).seconds))

            # Every configured custom index is refreshed as well.
            for custom_index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(custom_index["module"])(
                    custom_index["name"])
                es_index.bulk_index(resources=resources,
                                    resource_type=resource_type,
                                    graph_name=graph_name,
                                    clear_index=clear_index)

        else:
            # Only the named custom index is refreshed; the resources index
            # and the status are left untouched.
            es_index = get_index(index_name)
            es_index.bulk_index(resources=resources,
                                resource_type=resource_type,
                                graph_name=graph_name,
                                clear_index=clear_index)

    return status