Exemple #1
0
 def delete_concept_values_index(concepts_to_delete):
     """
     Remove the indexed 'concept' documents for every concept passed in.

     Arguments:
     concepts_to_delete -- dict mapping ids to Concept objects whose
         documents in the 'strings' index (doc_type 'concept') are removed
     """
     se = SearchEngineFactory().create()
     # .values() works on both Python 2 and 3; .itervalues() is Python 2 only
     for concept in concepts_to_delete.values():
         query = Query(se, start=0, limit=10000)
         term = Term(field='conceptid', term=concept.id)
         query.add_query(term)
         query.delete(index='strings', doc_type='concept')
Exemple #2
0
 def delete_index(self):
     """Remove this value's 'concept_labels' document and its term entries from the index."""
     search_engine = SearchEngineFactory().create()
     label_query = Query(search_engine, start=0, limit=10000)
     label_query.add_query(
         Match(field='conceptid', query=self.conceptid, type='phrase'))
     label_query.delete(index='concept_labels')
     search_engine.delete_terms(self.id)
Exemple #3
0
def index_resources(clear_index=True,
                    index_name=None,
                    batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Reindexes every resource in the database.

    Keyword Arguments:
    clear_index -- when True (and no index_name is given) wipe the 'terms'
        index before the reindexing operation
    index_name -- only applies to custom indexes; if given, only refresh
        the data in that index
    batch_size -- number of records indexed per bulk request; larger values
        require more memory
    """
    se = SearchEngineFactory().create()
    if clear_index and index_name is None:
        Query(se=se).delete(index="terms")

    graph_ids = (models.GraphModel.objects
                 .filter(isresource=True)
                 .exclude(graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
                 .values_list("graphid", flat=True))
    index_resources_by_type(graph_ids,
                            clear_index=clear_index,
                            index_name=index_name,
                            batch_size=batch_size)
Exemple #4
0
 def delete_concept_values_index(concepts_to_delete):
     """
     Remove the indexed 'concept' documents for every concept passed in.

     Arguments:
     concepts_to_delete -- dict mapping ids to Concept objects whose
         documents in the 'strings' index (doc_type 'concept') are removed
     """
     se = SearchEngineFactory().create()
     # .values() works on both Python 2 and 3; .itervalues() is Python 2 only
     for concept in concepts_to_delete.values():
         query = Query(se, start=0, limit=10000)
         term = Term(field='conceptid', term=concept.id)
         query.add_query(term)
         query.delete(index='strings', doc_type='concept')
Exemple #5
0
def index_resources(clear_index=True,
                    batch_size=settings.BULK_IMPORT_BATCH_SIZE,
                    quiet=False):
    """
    Reindexes every resource in the database.

    Keyword Arguments:
    clear_index -- when True, wipe the terms index before reindexing
    batch_size -- number of records indexed per bulk request; larger values
        require more memory
    quiet -- silences the status-bar output during certain operations,
        e.g. when running inside celery
    """
    # NOTE(review): relies on a module-level `se` search-engine instance —
    # confirm it is defined at import time in the enclosing module
    if clear_index:
        Query(se=se).delete(index=TERMS_INDEX)

    graph_ids = (models.GraphModel.objects
                 .filter(isresource=True)
                 .exclude(graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
                 .values_list("graphid", flat=True))
    index_resources_by_type(graph_ids,
                            clear_index=clear_index,
                            batch_size=batch_size,
                            quiet=quiet)
Exemple #6
0
 def delete_index(self):
     """Remove this value's 'concept_labels' document (matched by id) and its term entries."""
     search_engine = SearchEngineFactory().create()
     id_query = Query(search_engine, start=0, limit=10000)
     id_query.add_query(Term(field='id', term=self.id))
     id_query.delete(index='concept_labels')
     search_engine.delete_terms(self.id)
Exemple #7
0
def reverse_func(apps, schema_editor):
    """
    Migration rollback: reload the base ontology, strip namespace prefixes
    from node/edge ontology references, remove the base Arches concept from
    the search index, and (best-effort) delete the 'identifier' value type.
    """
    extensions = [os.path.join(settings.ONTOLOGY_PATH, x) for x in settings.ONTOLOGY_EXT]
    management.call_command('load_ontology', source=os.path.join(settings.ONTOLOGY_PATH, settings.ONTOLOGY_BASE),
        version=settings.ONTOLOGY_BASE_VERSION, ontology_name=settings.ONTOLOGY_BASE_NAME, id=settings.ONTOLOGY_BASE_ID, extensions=','.join(extensions), verbosity=0)

    Node = apps.get_model("models", "Node")
    Edge = apps.get_model("models", "Edge")

    # keep only the fragment after the last '/' (drops the namespace prefix)
    for node in Node.objects.all():
        node.ontologyclass = str(node.ontologyclass).split('/')[-1]
        node.save()

    for edge in Edge.objects.all():
        edge.ontologyproperty = str(edge.ontologyproperty).split('/')[-1]
        edge.save()

    # remove index for base Arches concept
    se = SearchEngineFactory().create()
    query = Query(se, start=0, limit=10000)
    query.add_query(Term(field='conceptid', term='00000000-0000-0000-0000-000000000001'))
    query.delete(index='strings', doc_type='concept')

    try:
        DValueType = apps.get_model("models", "DValueType")
        DValueType.objects.get(valuetype='identifier').delete()
    # narrowed from a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt; deletion remains best-effort
    except Exception:
        pass
Exemple #8
0
def reverse_func(apps, schema_editor):
    """
    Migration rollback: strip namespace prefixes from node and edge
    ontology references, remove the base Arches concept from the
    'concepts' index, and delete the 'identifier' value type if present.
    """
    Node = apps.get_model("models", "Node")
    Edge = apps.get_model("models", "Edge")

    # keep only the fragment after the last '/' (drops the namespace prefix)
    for record in Node.objects.all():
        record.ontologyclass = str(record.ontologyclass).split("/")[-1]
        record.save()

    for record in Edge.objects.all():
        record.ontologyproperty = str(record.ontologyproperty).split("/")[-1]
        record.save()

    # remove index for base Arches concept
    search_engine = SearchEngineFactory().create()
    concept_query = Query(search_engine, start=0, limit=10000)
    concept_query.add_query(
        Term(field="conceptid", term="00000000-0000-0000-0000-000000000001"))
    concept_query.delete(index="concepts")

    try:
        DValueType = apps.get_model("models", "DValueType")
        DValueType.objects.get(valuetype="identifier").delete()
    except Exception:
        pass
Exemple #9
0
 def delete_index(self):
     """Delete this value's label document from 'concept_labels' and its term entries."""
     engine = SearchEngineFactory().create()
     removal = Query(engine, start=0, limit=10000)
     removal.add_query(Match(field='conceptid', query=self.conceptid, type='phrase'))
     removal.delete(index='concept_labels')
     engine.delete_terms(self.id)
Exemple #10
0
    def test_bulk_delete(self):
        """
        Test bulk deleting of documents in Elasticsearch

        """
        se = SearchEngineFactory().create()

        # index ten prefLabel and ten altLabel documents
        for i in range(10):
            se.index_data(index='test', doc_type='test',
                          body={'id': i, 'type': 'prefLabel', 'value': 'test pref label'},
                          idfield='id', refresh=True)
            se.index_data(index='test', doc_type='test',
                          body={'id': i + 100, 'type': 'altLabel', 'value': 'test alt label'},
                          idfield='id', refresh=True)

        # delete-by-query should remove only the altLabel documents
        delete_query = Query(se, start=0, limit=100)
        delete_query.add_query(Match(field='type', query='altLabel'))
        delete_query.delete(index='test', refresh=True)

        self.assertEqual(se.es.count(index='test', doc_type='test')['count'], 10)
Exemple #11
0
def reverse_func(apps, schema_editor):
    """
    Migration rollback: reload the base ontology, strip namespace prefixes
    from node/edge ontology references, remove the base Arches concept from
    the 'concepts' index, and (best-effort) delete the 'identifier' value type.
    """
    extensions = [os.path.join(settings.ONTOLOGY_PATH, x) for x in settings.ONTOLOGY_EXT]
    management.call_command('load_ontology', source=os.path.join(settings.ONTOLOGY_PATH, settings.ONTOLOGY_BASE),
        version=settings.ONTOLOGY_BASE_VERSION, ontology_name=settings.ONTOLOGY_BASE_NAME, id=settings.ONTOLOGY_BASE_ID, extensions=','.join(extensions), verbosity=0)

    Node = apps.get_model("models", "Node")
    Edge = apps.get_model("models", "Edge")

    # keep only the fragment after the last '/' (drops the namespace prefix)
    for node in Node.objects.all():
        node.ontologyclass = str(node.ontologyclass).split('/')[-1]
        node.save()

    for edge in Edge.objects.all():
        edge.ontologyproperty = str(edge.ontologyproperty).split('/')[-1]
        edge.save()

    # remove index for base Arches concept
    se = SearchEngineFactory().create()
    query = Query(se, start=0, limit=10000)
    query.add_query(Term(field='conceptid', term='00000000-0000-0000-0000-000000000001'))
    query.delete(index='concepts')

    try:
        DValueType = apps.get_model("models", "DValueType")
        DValueType.objects.get(valuetype='identifier').delete()
    # narrowed from a bare `except:` which also swallowed SystemExit and
    # KeyboardInterrupt; deletion remains best-effort
    except Exception:
        pass
    def test_delete_by_query(self):
        """
        Test deleting documents by query in Elasticsearch

        """
        se = SearchEngineFactory().create()

        # index ten prefLabel and ten altLabel documents
        for i in range(10):
            se.index_data(index='test',
                          body={'id': i, 'type': 'prefLabel', 'value': 'test pref label'},
                          idfield='id', refresh=True)
            se.index_data(index='test',
                          body={'id': i + 100, 'type': 'altLabel', 'value': 'test alt label'},
                          idfield='id', refresh=True)

        time.sleep(1)  # let the index settle before querying

        # delete-by-query should remove only the altLabel documents
        removal = Query(se, start=0, limit=100)
        removal.add_query(Match(field='type', query='altLabel'))
        removal.delete(index='test', refresh=True)

        self.assertEqual(se.count(index='test'), 10)
Exemple #13
0
 def delete_concept_values_index(concepts_to_delete):
     """
     Remove the 'concept_labels' documents and term entries for every
     concept (and each of its values) passed in.

     Arguments:
     concepts_to_delete -- dict mapping ids to Concept objects
     """
     se = SearchEngineFactory().create()
     # .values() works on both Python 2 and 3; .itervalues() is Python 2 only
     for concept in concepts_to_delete.values():
         query = Query(se, start=0, limit=10000)
         term = Term(field='conceptid', term=concept.id)
         query.add_query(term)
         query.delete(index='concept_labels')
         for conceptvalue in concept.values:
             se.delete_terms(conceptvalue.id)
Exemple #14
0
def index_resource_relations(clear_index=True,
                             batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    """

    start = datetime.now()
    print("Indexing resource to resource relations")

    cursor = connection.cursor()
    # NOTE(review): relies on a module-level `se` search-engine instance —
    # confirm it is defined at import time in the enclosing module
    if clear_index:
        q = Query(se=se)
        q.delete(index=RESOURCE_RELATIONS_INDEX)

    # stream rows straight from the database into the bulk indexer
    with se.BulkIndexer(batch_size=batch_size,
                        refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, notes, datestarted, dateended, relationshiptype, resourceinstanceidfrom, resourceinstancefrom_graphid,
            resourceinstanceidto, resourceinstanceto_graphid, modified, created, inverserelationshiptype, tileid, nodeid
            FROM public.resource_x_resource
        """

        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # keys mirror the SELECT column order above — keep them in sync
            doc = {
                "resourcexid": resource_relation[0],
                "notes": resource_relation[1],
                "datestarted": resource_relation[2],
                "dateended": resource_relation[3],
                "relationshiptype": resource_relation[4],
                "resourceinstanceidfrom": resource_relation[5],
                "resourceinstancefrom_graphid": resource_relation[6],
                "resourceinstanceidto": resource_relation[7],
                "resourceinstanceto_graphid": resource_relation[8],
                "modified": resource_relation[9],
                "created": resource_relation[10],
                "inverserelationshiptype": resource_relation[11],
                "tileid": resource_relation[12],
                "nodeid": resource_relation[13],
            }
            resource_relations_indexer.add(index=RESOURCE_RELATIONS_INDEX,
                                           id=doc["resourcexid"],
                                           data=doc)

    # compare DB row count with what actually landed in the index
    index_count = se.count(index=RESOURCE_RELATIONS_INDEX)
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if cursor.rowcount == index_count else "Failed",
                 cursor.rowcount, index_count,
                 (datetime.now() - start).seconds))
Exemple #15
0
def index_resources_by_type(resource_types, clear_index=True, index_name=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    index_name -- only applies to custom indexes and if given will try and just refresh the data in that index
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    Return:
    status -- 'Passed'/'Failed' for the last default-index type processed, '' if none
    """
    # single initialization (the original redundantly assigned '' twice)
    status = ''
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        if index_name is None:
            # scope the delete/count query to this graph
            q = Query(se=se)
            term = Term(field='graph_id', term=str(resource_type))
            q.add_query(term)
            if clear_index:
                q.delete(index='resources', refresh=True)

            with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
                with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                    for resource in resources:
                        document, terms = resource.get_documents_to_index(fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
                        doc_indexer.add(index='resources', id=document['resourceinstanceid'], data=document)
                        for term in terms:
                            term_indexer.add(index='terms', id=term['_id'], data=term['_source'])

            result_summary = {'database': len(resources), 'indexed': se.count(index='resources', body=q.dsl)}
            status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
            print("Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(status, graph_name, result_summary['database'], result_summary['indexed'], (datetime.now()-start).seconds))

            # also refresh any configured custom indexes for this type
            for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(index['module'])(index['name'])
                es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)

        else:
            # refresh only the named custom index
            es_index = get_index(index_name)
            es_index.bulk_index(resources=resources, resource_type=resource_type, graph_name=graph_name, clear_index=clear_index)

    return status
Exemple #16
0
    def delete_index(self, resourceinstanceid=None):
        """
        Deletes all references to a resource from all indexes

        Keyword Arguments:
        resourceinstanceid -- the resource instance id to delete from related indexes, if supplied will use this over self.resourceinstanceid
        """

        if resourceinstanceid is None:
            resourceinstanceid = self.resourceinstanceid
        # normalize UUID (or anything else) to a string for the ES queries
        resourceinstanceid = str(resourceinstanceid)

        # NOTE(review): all queries below rely on a module-level `se`
        # search-engine instance — confirm it is defined at import time

        # delete any related terms
        query = Query(se)
        bool_query = Bool()
        bool_query.filter(
            Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
        query.add_query(bool_query)
        query.delete(index=TERMS_INDEX)

        # delete any related resource index entries
        # (match relations where this resource is on either end)
        query = Query(se)
        bool_query = Bool()
        bool_query.should(
            Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
        bool_query.should(
            Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
        query.add_query(bool_query)
        query.delete(index=RESOURCE_RELATIONS_INDEX)

        # reindex any related resources
        query = Query(se)
        bool_query = Bool()
        bool_query.filter(
            Nested(path="ids",
                   query=Terms(field="ids.id", terms=[resourceinstanceid])))
        query.add_query(bool_query)
        results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
        for result in results:
            try:
                res = Resource.objects.get(pk=result["_id"])
                res.load_tiles()
                res.index()
            # the related resource may have been deleted from the database
            # while its index entry still existed — skip it
            except ObjectDoesNotExist:
                pass

        # delete resource index
        se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)

        # delete resources from custom indexes
        for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
            es_index = import_class_from_string(index["module"])(index["name"])
            es_index.delete_resources(resources=self)
Exemple #17
0
    def bulk_index(self,
                   resources=None,
                   resource_type=None,
                   graph_name=None,
                   clear_index=True):
        """
        Indexes a list of documents in bulk to Elastic Search

        Arguments:
        None

        Keyword Arguments:
        resources -- the list of resource instances to index
        resource_type -- the type of resources being indexed
        graph_name -- the name of the graph model that represents the resources being indexed
        clear_index -- True(default) to remove all index records of type "resource_type" before indexing,
            assumes that a field called "graph_id" exists on the indexed documents

        Return: None
        """

        start = datetime.now()
        q = Query(se=self.se)
        if clear_index:
            term = Term(field="graph_id", term=str(resource_type))
            q.add_query(term)
            q.delete(index=self.index_name, refresh=True)

        q = Query(se=self.se)
        count_before = self.se.count(index=self.index_name, body=q.dsl)

        result_summary = {"database": len(resources), "indexed": 0}
        with self.se.BulkIndexer(batch_size=settings.BULK_IMPORT_BATCH_SIZE,
                                 refresh=True) as indexer:
            for resource in resources:
                tiles = list(
                    models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                # BUG FIX: the original checked the builtin `id` (never None)
                # instead of `doc_id`, so documents lacking an id were still
                # submitted to the indexer
                if document is not None and doc_id is not None:
                    indexer.add(index=self.index_name,
                                id=doc_id,
                                data=document)

        result_summary["indexed"] = self.se.count(index=self.index_name,
                                                  body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary[
            "indexed"] else "Failed"
        print("Custom Index - %s:" % self.index_name)
        print(
            "    Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
            .format(status, graph_name, result_summary["database"],
                    result_summary["indexed"],
                    (datetime.now() - start).seconds))
Exemple #18
0
def index_resources(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Reindexes every resource in the database.

    Keyword Arguments:
    clear_index -- when True, drop all 'term' documents from the 'strings'
        index before the reindexing operation
    batch_size -- number of records indexed per bulk request; larger values
        require more memory
    """
    se = SearchEngineFactory().create()
    if clear_index:
        Query(se=se).delete(index='strings', doc_type='term')

    graph_ids = (models.GraphModel.objects
                 .filter(isresource=True)
                 .exclude(graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
                 .values_list('graphid', flat=True))
    index_resources_by_type(graph_ids, clear_index=clear_index, batch_size=batch_size)
def index_resources(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Reindexes every resource in the database.

    Keyword Arguments:
    clear_index -- when True, wipe the 'terms' index before the
        reindexing operation
    batch_size -- number of records indexed per bulk request; larger values
        require more memory
    """
    se = SearchEngineFactory().create()
    if clear_index:
        Query(se=se).delete(index='terms')

    graph_ids = (models.GraphModel.objects
                 .filter(isresource=True)
                 .exclude(graphid=settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID)
                 .values_list('graphid', flat=True))
    index_resources_by_type(graph_ids, clear_index=clear_index, batch_size=batch_size)
Exemple #20
0
def index_resource_relations(clear_index=True,
                             batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes every resource-to-resource relation record.

    Keyword Arguments:
    clear_index -- when True, wipe the resource_relations index before
        reindexing
    batch_size -- number of records indexed per bulk request; larger
        values require more memory
    """
    start = datetime.now()
    print("Indexing resource to resource relations")

    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        Query(se=se).delete(index="resource_relations")

    # field names mirror the SELECT column order below — keep them in sync
    fields = ("resourcexid", "resourceinstanceidfrom", "notes",
              "relationshiptype", "resourceinstanceidto")
    with se.BulkIndexer(batch_size=batch_size,
                        refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, resourceinstanceidfrom, notes, relationshiptype, resourceinstanceidto
            FROM public.resource_x_resource;
        """

        cursor.execute(sql)
        for row in cursor.fetchall():
            doc = dict(zip(fields, row))
            resource_relations_indexer.add(index="resource_relations",
                                           id=doc["resourcexid"],
                                           data=doc)

    # compare DB row count with what actually landed in the index
    index_count = se.count(index="resource_relations")
    outcome = "Passed" if cursor.rowcount == index_count else "Failed"
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format(outcome, cursor.rowcount, index_count,
                 (datetime.now() - start).seconds))
Exemple #21
0
    def delete_resources(self, resources=None):
        """
        Deletes documents from an index based on the passed in list of resources
        Delete by query, so this is a single operation

        Keyword Arguments:
        resources -- a single resource instance or a list of resource instances
        """
        q = Query(se=self.se)
        # normalize the argument to a list so a single instance works too
        resourcelist = resources if isinstance(resources, list) else [resources]
        q.add_query(Ids(ids=[resource.pk for resource in resourcelist]))
        q.delete(index=self.index_name)
Exemple #22
0
def clear_resources():
    """Removes all resource instances from your db and elasticsearch resource index"""
    se = SearchEngineFactory().create()
    wipe_query = Query(se)
    # clear every resource-related index with the same match-all query
    for index_name in ("terms", "resources", "resource_relations"):
        wipe_query.delete(index=index_name)

    # delete every resource except the system-settings instance
    doomed = Resource.objects.exclude(
        resourceinstanceid=settings.RESOURCE_INSTANCE_ID)
    print("deleting", doomed.count(), "resources")
    doomed.delete()
    print(
        Resource.objects.exclude(
            resourceinstanceid=settings.RESOURCE_INSTANCE_ID).count(),
        "resources remaining")

    print("deleting", models.ResourceXResource.objects.count(),
          "resource relationships")
    cursor = connection.cursor()
    cursor.execute("TRUNCATE public.resource_x_resource CASCADE;")
    print(models.ResourceXResource.objects.count(),
          "resource relationships remaining")
Exemple #23
0
def index_resources_by_type(resource_types, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(graphid=str(resource_type)).name
        # print() with a single argument is valid on both Python 2 and 3
        # (the original used Python-2-only print statements)
        print("Indexing resource type '{0}'".format(graph_name))
        result_summary = {'database': len(resources), 'indexed': 0}

        if clear_index:
            q = Query(se=se)
            q.delete(index='resource', doc_type=str(resource_type))

        with se.BulkIndexer(batch_size=batch_size, refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size, refresh=True) as term_indexer:
                for resource in resources:
                    document, terms = resource.get_documents_to_index(fetchTiles=True, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
                    doc_indexer.add(index='resource', doc_type=document['graph_id'], id=document['resourceinstanceid'], data=document)
                    for term in terms:
                        term_indexer.add(index='strings', doc_type='term', id=term['_id'], data=term['_source'])

        result_summary['indexed'] = se.count(index='resource', doc_type=str(resource_type))

        status = 'Passed' if result_summary['database'] == result_summary['indexed'] else 'Failed'
        print("Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(status, graph_name, result_summary['database'], result_summary['indexed'], (datetime.now()-start).seconds))
Exemple #24
0
    def delete_index(self, resourceinstanceid=None):
        """
        Deletes all references to a resource from all indexes

        Keyword Arguments:
        resourceinstanceid -- the resource instance id to delete from related indexes, if supplied will use this over self.resourceinstanceid
        """

        if resourceinstanceid is None:
            resourceinstanceid = self.resourceinstanceid
        # normalize to a string for the ES queries
        resourceinstanceid = str(resourceinstanceid)

        # delete any related terms
        query = Query(se)
        bool_query = Bool()
        bool_query.filter(Terms(field="resourceinstanceid", terms=[resourceinstanceid]))
        query.add_query(bool_query)
        query.delete(index=TERMS_INDEX)

        # delete any related resource index entries
        # (match relations where this resource is on either end)
        query = Query(se)
        bool_query = Bool()
        bool_query.should(Terms(field="resourceinstanceidto", terms=[resourceinstanceid]))
        bool_query.should(Terms(field="resourceinstanceidfrom", terms=[resourceinstanceid]))
        query.add_query(bool_query)
        query.delete(index=RESOURCE_RELATIONS_INDEX)

        # reindex any related resources
        query = Query(se)
        bool_query = Bool()
        bool_query.filter(Nested(path="ids", query=Terms(field="ids.id", terms=[resourceinstanceid])))
        query.add_query(bool_query)
        results = query.search(index=RESOURCES_INDEX)["hits"]["hits"]
        for result in results:
            try:
                res = Resource.objects.get(pk=result["_id"])
                res.load_tiles()
                res.index()
            # ROBUSTNESS FIX: a stale index hit may reference a resource
            # already removed from the database; skip it instead of
            # aborting the whole cleanup (mirrors the sibling version of
            # this method that guards with ObjectDoesNotExist)
            except Resource.DoesNotExist:
                pass

        # delete resource index
        se.delete(index=RESOURCES_INDEX, id=resourceinstanceid)
Exemple #25
0
def index_resource_relations(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resource to resource relation records

    Keyword Arguments:
    clear_index -- set to True to remove all the resources from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    """
    start = datetime.now()
    # print() with a single argument is valid on both Python 2 and 3
    # (the original used Python-2-only print statements)
    print("Indexing resource to resource relations")

    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='resource_relations')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as resource_relations_indexer:
        sql = """
            SELECT resourcexid, resourceinstanceidfrom, notes, relationshiptype, resourceinstanceidto
            FROM public.resource_x_resource;
        """

        cursor.execute(sql)
        for resource_relation in cursor.fetchall():
            # keys mirror the SELECT column order above — keep them in sync
            doc = {
                'resourcexid': resource_relation[0],
                'resourceinstanceidfrom': resource_relation[1],
                'notes': resource_relation[2],
                'relationshiptype': resource_relation[3],
                'resourceinstanceidto': resource_relation[4]
            }
            resource_relations_indexer.add(index='resource_relations', doc_type='all', id=doc['resourcexid'], data=doc)

    # compare DB row count with what actually landed in the index
    index_count = se.count(index='resource_relations')
    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format('Passed' if cursor.rowcount == index_count else 'Failed', cursor.rowcount, index_count, (datetime.now()-start).seconds))
Exemple #26
0
    def test_delete_by_query(self):
        """
        Test deleting documents by query in Elasticsearch

        """
        se = SearchEngineFactory().create()

        # index ten prefLabel and ten altLabel documents
        for i in range(10):
            se.index_data(index="test",
                          body={"id": i, "type": "prefLabel", "value": "test pref label"},
                          idfield="id", refresh=True)
            se.index_data(index="test",
                          body={"id": i + 100, "type": "altLabel", "value": "test alt label"},
                          idfield="id", refresh=True)

        time.sleep(1)  # let the index settle before querying

        # delete-by-query should remove only the altLabel documents
        removal = Query(se, start=0, limit=100)
        removal.add_query(Match(field="type", query="altLabel"))
        removal.delete(index="test", refresh=True)

        self.assertEqual(se.count(index="test"), 10)
Exemple #27
0
    def delete(self, *args, **kwargs):
        """
        Deletes this tile (and its child tiles), removes its term-index
        entries, and records a 'tile delete' edit. If the user is not a
        reviewer and does not own the provisional edit, a provisional
        delete is applied instead of a real one.
        """
        se = SearchEngineFactory().create()
        request = kwargs.pop("request", None)
        index = kwargs.pop("index", True)
        # BUG FIX: previously popped "index" (already consumed above), so
        # transaction_id was always None and any "transaction_id" kwarg
        # leaked through **kwargs to the superclass call
        transaction_id = kwargs.pop("transaction_id", None)
        provisional_edit_log_details = kwargs.pop(
            "provisional_edit_log_details", None)
        for tile in self.tiles:
            tile.delete(*args, request=request, **kwargs)
        try:
            user = request.user
            user_is_reviewer = user_is_resource_reviewer(user)
        except AttributeError:  # no user
            user = None
            user_is_reviewer = True

        if user_is_reviewer is True or self.user_owns_provisional(user):
            if index:
                # remove any term-index documents tied to this tile
                query = Query(se)
                bool_query = Bool()
                bool_query.filter(Terms(field="tileid", terms=[self.tileid]))
                query.add_query(bool_query)
                query.delete(index=TERMS_INDEX)

            self.__preDelete(request)
            self.save_edit(
                user=user,
                edit_type="tile delete",
                old_value=self.data,
                provisional_edit_log_details=provisional_edit_log_details,
                transaction_id=transaction_id,
            )
            try:
                super(Tile, self).delete(*args, **kwargs)
                for nodeid in self.data.keys():
                    node = models.Node.objects.get(nodeid=nodeid)
                    datatype = self.datatype_factory.get_instance(
                        node.datatype)
                    datatype.post_tile_delete(self, nodeid, index=index)
                if index:
                    self.index()
            except IntegrityError as e:
                logger.error(e)

        else:
            self.apply_provisional_edit(user, data={}, action="delete")
            super(Tile, self).save(*args, **kwargs)
Exemple #28
0
def clear_resources():
    """Removes all resource instances from your db and elasticsearch resource index"""
    se = SearchEngineFactory().create()
    match_all_query = Query(se)
    match_all_query.delete(index='strings', doc_type='term')
    match_all_query.delete(index='resource')
    match_all_query.delete(index='resource_relations')

    print 'deleting', Resource.objects.count(), 'resources'
    cursor = connection.cursor()
    cursor.execute("TRUNCATE public.resource_instances CASCADE;" )
    print Resource.objects.count(), 'resources remaining'

    print 'deleting', models.ResourceXResource.objects.count(), 'resource relationships'
    cursor.execute("TRUNCATE public.resource_x_resource CASCADE;" )
    print models.ResourceXResource.objects.count(), 'resource relationships remaining'
Exemple #29
0
def clear_resources():
    """Removes all resource instances from your db and elasticsearch resource index"""
    se = SearchEngineFactory().create()
    match_all_query = Query(se)
    match_all_query.delete(index='strings', doc_type='term')
    match_all_query.delete(index='resource')
    match_all_query.delete(index='resource_relations')

    print 'deleting', Resource.objects.count(), 'resources'
    cursor = connection.cursor()
    cursor.execute("TRUNCATE public.resource_instances CASCADE;")
    print Resource.objects.count(), 'resources remaining'

    print 'deleting', models.ResourceXResource.objects.count(
    ), 'resource relationships'
    cursor.execute("TRUNCATE public.resource_x_resource CASCADE;")
    print models.ResourceXResource.objects.count(
    ), 'resource relationships remaining'
Exemple #30
0
def index_concepts(clear_index=True,
                   batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    """

    start = datetime.now()
    print("Indexing concepts")
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index="concepts")

    with se.BulkIndexer(batch_size=batch_size,
                        refresh=True) as concept_indexer:
        # Pass 1: index labels of Collections and ConceptSchemes directly;
        # each is its own "top_concept".  Track indexed valueids so the
        # orphan pass below can skip them.
        indexed_values = []
        for conceptValue in models.Value.objects.filter(
                Q(concept__nodetype="Collection")
                | Q(concept__nodetype="ConceptScheme"),
                valuetype__category="label"):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index="concepts", id=doc["id"], data=doc)
            indexed_values.append(doc["id"])

        # Build a quoted, comma-separated list of label value types
        # (e.g. "'prefLabel','altLabel'") for interpolation into the raw SQL.
        valueTypes = []
        for valuetype in models.DValueType.objects.filter(
                category="label").values_list("valuetype", flat=True):
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # Pass 2: for every top concept, recursively walk its narrower
        # descendants and index each label, tagged with that top concept.
        for conceptValue in models.Relation.objects.filter(
                relationtype="hasTopConcept"):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                        FROM relations d
                        JOIN values c ON(c.conceptid = d.conceptidto)
                        JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                        WHERE d.conceptidto = '{0}'
                        and c2.valuetype = 'prefLabel'
                        and c.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                        UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                        FROM relations  d
                        JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                        JOIN values v ON(v.conceptid = d.conceptidto)
                        JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                        WHERE  v2.valuetype = 'prefLabel'
                        and v.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)

            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                doc = {
                    "category": "label",
                    "conceptid": conceptValue[2],
                    "language": conceptValue[3],
                    "value": conceptValue[1],
                    "type": conceptValue[4],
                    "id": conceptValue[0],
                    "top_concept": topConcept,
                }
                concept_indexer.add(index="concepts", id=doc["id"], data=doc)
                indexed_values.append(doc["id"])

        # we add this step to catch any concepts/values that are orphaned (have no parent concept)
        for conceptValue in models.Value.objects.filter(
                valuetype__category="label").exclude(
                    valueid__in=indexed_values):
            doc = {
                "category": "label",
                "conceptid": conceptValue.concept_id,
                "language": conceptValue.language_id,
                "value": conceptValue.value,
                "type": conceptValue.valuetype_id,
                "id": conceptValue.valueid,
                "top_concept": conceptValue.concept_id,
            }
            concept_indexer.add(index="concepts", id=doc["id"], data=doc)

    # sanity check: compare label count in the db with the indexed count
    cursor.execute(
        "SELECT count(*) from values WHERE valuetype in ({0})".format(
            valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index="concepts")

    print("Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".
          format("Passed" if concept_count_in_db == index_count else "Failed",
                 concept_count_in_db, index_count,
                 (datetime.now() - start).seconds))
Exemple #31
0
 def delete_index(self):
     """Remove this object's concept documents from the 'strings' index."""
     search_engine = SearchEngineFactory().create()
     delete_query = Query(search_engine, start=0, limit=10000)
     delete_query.add_query(Term(field='id', term=self.id))
     delete_query.delete(index='strings', doc_type='concept')
Exemple #32
0
def index_resources_by_type(resource_types,
                            clear_index=True,
                            batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    """

    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # map of nodeid (str) -> datatype name, passed to get_documents_to_index
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list(
            'nodeid', 'datatype')
    }

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(
            graphid=str(resource_type)).name
        print "Indexing resource type '{0}'".format(graph_name)
        result_summary = {'database': len(resources), 'indexed': 0}

        if clear_index:
            # remove existing documents of this type before reindexing
            q = Query(se=se)
            q.delete(index='resource', doc_type=str(resource_type))

        # two bulk indexers: one for resource documents, one for term strings
        with se.BulkIndexer(batch_size=batch_size,
                            refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size,
                                refresh=True) as term_indexer:
                for resource in resources:
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True,
                        datatype_factory=datatype_factory,
                        node_datatypes=node_datatypes)
                    doc_indexer.add(index='resource',
                                    doc_type=document['graph_id'],
                                    id=document['resourceinstanceid'],
                                    data=document)
                    for term in terms:
                        term_indexer.add(index='strings',
                                         doc_type='term',
                                         id=term['_id'],
                                         data=term['_source'])

        # sanity check: db count vs indexed count for this resource type
        result_summary['indexed'] = se.es.count(
            index='resource', doc_type=str(resource_type))['count']

        status = 'Passed' if result_summary['database'] == result_summary[
            'indexed'] else 'Failed'
        print "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
            status, graph_name, result_summary['database'],
            result_summary['indexed'], (datetime.now() - start).seconds)
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    """

    start = datetime.now()
    print "Indexing concepts"
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='concepts')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        # NOTE(review): concept_strings is never used below
        concept_strings = []
        # Pass 1: index labels of Collections and ConceptSchemes directly;
        # each is its own "top_concept"
        for conceptValue in models.Value.objects.filter(Q(concept__nodetype='Collection') | Q(concept__nodetype='ConceptScheme'), valuetype__category='label'):
            doc = {
                'category': 'label',
                'conceptid': conceptValue.concept_id,
                'language': conceptValue.language_id,
                'value': conceptValue.value,
                'type': conceptValue.valuetype_id,
                'id': conceptValue.valueid,
                'top_concept': conceptValue.concept_id
            }
            concept_indexer.add(index='concepts', id=doc['id'], data=doc)

        # build a quoted, comma-separated list of label value types for the raw SQL
        valueTypes = []
        valueTypes2 = []  # NOTE(review): built but never used
        for valuetype in models.DValueType.objects.filter(category='label').values_list('valuetype', flat=True):
            valueTypes2.append("%s" % valuetype)
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # Pass 2: recursively walk each top concept's descendants and index
        # their labels, tagged with the top concept they belong to
        for conceptValue in models.Relation.objects.filter(relationtype='hasTopConcept'):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                        FROM relations d
                        JOIN values c ON(c.conceptid = d.conceptidto)
                        JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                        WHERE d.conceptidto = '{0}'
                        and c2.valuetype = 'prefLabel'
                        and c.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                        UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                        FROM relations  d
                        JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                        JOIN values v ON(v.conceptid = d.conceptidto)
                        JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                        WHERE  v2.valuetype = 'prefLabel'
                        and v.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)

            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                doc = {
                    'category': 'label',
                    'conceptid': conceptValue[2],
                    'language': conceptValue[3],
                    'value': conceptValue[1],
                    'type': conceptValue[4],
                    'id': conceptValue[0],
                    'top_concept': topConcept
                }
                concept_indexer.add(index='concepts', id=doc['id'], data=doc)

    # sanity check: compare label count in the db with the indexed count
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index='concepts')
    print "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format('Passed' if concept_count_in_db == index_count else 'Failed', concept_count_in_db, index_count, (datetime.now()-start).seconds)
Exemple #34
0
def index_resources_by_type(resource_types,
                            clear_index=True,
                            batch_size=settings.BULK_IMPORT_BATCH_SIZE,
                            quiet=False):
    """
    Indexes all resources of a given type(s)

    Arguments:
    resource_types -- array of graph ids that represent resource types

    Keyword Arguments:
    clear_index -- set to True to remove all the resources of the types passed in from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required
    quiet -- Silences the status bar output during certain operations, use in celery operations for example

    """

    status = ""
    datatype_factory = DataTypeFactory()
    # map of nodeid (str) -> datatype name, passed to get_documents_to_index
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list(
            "nodeid", "datatype")
    }
    # allow a single graph id string in place of a list
    if isinstance(resource_types, str):
        resource_types = [resource_types]

    for resource_type in resource_types:
        start = datetime.now()
        resources = Resource.objects.filter(graph_id=str(resource_type))
        graph_name = models.GraphModel.objects.get(
            graphid=str(resource_type)).name
        print("Indexing resource type '{0}'".format(graph_name))

        # NOTE(review): 'se' is not defined in this function; presumably a
        # module-level search engine instance -- confirm in the full module.
        q = Query(se=se)
        term = Term(field="graph_id", term=str(resource_type))
        q.add_query(term)
        if clear_index:
            q.delete(index=RESOURCES_INDEX, refresh=True)

        # two bulk indexers: one for resource documents, one for term strings
        with se.BulkIndexer(batch_size=batch_size,
                            refresh=True) as doc_indexer:
            with se.BulkIndexer(batch_size=batch_size,
                                refresh=True) as term_indexer:
                if quiet is False:
                    # progress bar only when there is more than one resource
                    bar = pyprind.ProgBar(
                        len(resources),
                        bar_char="█") if len(resources) > 1 else None
                for resource in resources:
                    if quiet is False and bar is not None:
                        bar.update(item_id=resource)
                    document, terms = resource.get_documents_to_index(
                        fetchTiles=True,
                        datatype_factory=datatype_factory,
                        node_datatypes=node_datatypes)
                    doc_indexer.add(index=RESOURCES_INDEX,
                                    id=document["resourceinstanceid"],
                                    data=document)
                    for term in terms:
                        term_indexer.add(index=TERMS_INDEX,
                                         id=term["_id"],
                                         data=term["_source"])

        # sanity check: db count vs indexed count (scoped by the graph_id query)
        result_summary = {
            "database": len(resources),
            "indexed": se.count(index=RESOURCES_INDEX, body=q.dsl)
        }
        status = "Passed" if result_summary["database"] == result_summary[
            "indexed"] else "Failed"
        print(
            "Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
            .format(status, graph_name, result_summary["database"],
                    result_summary["indexed"],
                    (datetime.now() - start).seconds))
    return status
Exemple #35
0
    def setUpClass(cls):
        """Build the shared search-test fixtures: clear all indexes, import the
        'Search Test Model' graph, create a restricted user, post a date-range
        concept, and save four resource instances with varying date/period/
        name/geometry data for the search tests to query against."""
        se = SearchEngineFactory().create()
        q = Query(se=se)
        # start from empty indexes so counts in tests are deterministic
        for indexname in [
                TERMS_INDEX, CONCEPTS_INDEX, RESOURCE_RELATIONS_INDEX,
                RESOURCES_INDEX
        ]:
            q.delete(index=indexname, refresh=True)

        cls.client = Client()
        cls.client.login(username="******", password="******")

        models.ResourceInstance.objects.all().delete()
        with open(
                os.path.join(
                    "tests/fixtures/resource_graphs/Search Test Model.json"),
                "rU") as f:
            archesfile = JSONDeserializer().deserialize(f)
        ResourceGraphImporter(archesfile["graph"])

        # node/graph ids from the fixture graph, used throughout the tests
        cls.search_model_graphid = "d291a445-fa5f-11e6-afa8-14109fd34195"
        cls.search_model_cultural_period_nodeid = "7a182580-fa60-11e6-96d1-14109fd34195"
        cls.search_model_creation_date_nodeid = "1c1d05f5-fa60-11e6-887f-14109fd34195"
        cls.search_model_destruction_date_nodeid = "e771b8a1-65fe-11e7-9163-14109fd34195"
        cls.search_model_name_nodeid = "2fe14de3-fa61-11e6-897b-14109fd34195"
        cls.search_model_sensitive_info_nodeid = "57446fae-65ff-11e7-b63a-14109fd34195"
        cls.search_model_geom_nodeid = "3ebc6785-fa61-11e6-8c85-14109fd34195"

        # a Guest user with no access to the destruction-date nodegroup, for
        # permission-filtering tests
        cls.user = User.objects.create_user(
            "unpriviliged_user", "*****@*****.**", "test")
        cls.user.groups.add(Group.objects.get(name="Guest"))

        nodegroup = models.NodeGroup.objects.get(
            pk=cls.search_model_destruction_date_nodeid)
        assign_perm("no_access_to_nodegroup", cls.user, nodegroup)

        # Add a concept that defines a min and max date
        concept = {
            "id":
            "00000000-0000-0000-0000-000000000001",
            "legacyoid":
            "ARCHES",
            "nodetype":
            "ConceptScheme",
            "values": [],
            "subconcepts": [{
                "values": [
                    {
                        "value": "Mock concept",
                        "language": "en-US",
                        "category": "label",
                        "type": "prefLabel",
                        "id": "",
                        "conceptid": ""
                    },
                    {
                        "value": "1950",
                        "language": "en-US",
                        "category": "note",
                        "type": "min_year",
                        "id": "",
                        "conceptid": ""
                    },
                    {
                        "value": "1980",
                        "language": "en-US",
                        "category": "note",
                        "type": "max_year",
                        "id": "",
                        "conceptid": ""
                    },
                ],
                "relationshiptype":
                "hasTopConcept",
                "nodetype":
                "Concept",
                "id":
                "",
                "legacyoid":
                "",
                "subconcepts": [],
                "parentconcepts": [],
                "relatedconcepts": [],
            }],
        }

        # post the concept and keep the server-assigned value/concept ids
        post_data = JSONSerializer().serialize(concept)
        content_type = "application/x-www-form-urlencoded"
        response = cls.client.post(
            reverse(
                "concept",
                kwargs={"conceptid": "00000000-0000-0000-0000-000000000001"}),
            post_data, content_type)
        response_json = json.loads(response.content)
        valueid = response_json["subconcepts"][0]["values"][0]["id"]
        cls.conceptid = response_json["subconcepts"][0]["id"]

        # add resource instance with only a cultural period defined
        cls.cultural_period_resource = Resource(
            graph_id=cls.search_model_graphid)
        tile = Tile(data={cls.search_model_cultural_period_nodeid: [valueid]},
                    nodegroup_id=cls.search_model_cultural_period_nodeid)
        cls.cultural_period_resource.tiles.append(tile)
        cls.cultural_period_resource.save()

        # add resource instance with a creation and destruction date defined
        cls.date_resource = Resource(graph_id=cls.search_model_graphid)
        tile = Tile(data={cls.search_model_creation_date_nodeid: "1941-01-01"},
                    nodegroup_id=cls.search_model_creation_date_nodeid)
        cls.date_resource.tiles.append(tile)
        tile = Tile(
            data={cls.search_model_destruction_date_nodeid: "1948-01-01"},
            nodegroup_id=cls.search_model_destruction_date_nodeid)
        cls.date_resource.tiles.append(tile)
        tile = Tile(data={cls.search_model_name_nodeid: "testing 123"},
                    nodegroup_id=cls.search_model_name_nodeid)
        cls.date_resource.tiles.append(tile)
        cls.date_resource.save()

        # add resource instance with a creation date and a cultural period defined
        cls.date_and_cultural_period_resource = Resource(
            graph_id=cls.search_model_graphid)
        tile = Tile(data={cls.search_model_creation_date_nodeid: "1942-01-01"},
                    nodegroup_id=cls.search_model_creation_date_nodeid)
        cls.date_and_cultural_period_resource.tiles.append(tile)
        tile = Tile(data={cls.search_model_cultural_period_nodeid: [valueid]},
                    nodegroup_id=cls.search_model_cultural_period_nodeid)
        cls.date_and_cultural_period_resource.tiles.append(tile)
        cls.date_and_cultural_period_resource.save()

        # add resource instance with with no dates or periods defined
        cls.name_resource = Resource(graph_id=cls.search_model_graphid)
        tile = Tile(data={cls.search_model_name_nodeid: "some test name"},
                    nodegroup_id=cls.search_model_name_nodeid)
        cls.name_resource.tiles.append(tile)
        geom = {
            "type":
            "FeatureCollection",
            "features": [{
                "geometry": {
                    "type": "Point",
                    "coordinates": [0, 0]
                },
                "type": "Feature",
                "properties": {}
            }],
        }
        tile = Tile(data={cls.search_model_geom_nodeid: geom},
                    nodegroup_id=cls.search_model_geom_nodeid)
        cls.name_resource.tiles.append(tile)
        cls.name_resource.save()

        # add delay to allow for indexes to be updated
        time.sleep(1)
Exemple #36
0
 def delete_index(self):
     """Delete this instance's concept term documents from the search index."""
     engine = SearchEngineFactory().create()
     q = Query(engine, start=0, limit=10000)
     match_id = Term(field='id', term=self.id)
     q.add_query(match_id)
     q.delete(index='strings', doc_type='concept')
Exemple #37
0
def index_concepts(clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE):
    """
    Indexes all concepts from the database

    Keyword Arguments:
    clear_index -- set to True to remove all the concepts from the index before the reindexing operation
    batch_size -- the number of records to index as a group, the larger the number to more memory required

    """

    start = datetime.now()
    print "Indexing concepts"
    cursor = connection.cursor()
    se = SearchEngineFactory().create()
    if clear_index:
        q = Query(se=se)
        q.delete(index='strings', doc_type='concept')

    with se.BulkIndexer(batch_size=batch_size, refresh=True) as concept_indexer:
        # NOTE(review): concept_strings is never used below
        concept_strings = []
        # Pass 1: index labels of Collections and ConceptSchemes directly;
        # each is its own "top_concept"
        for conceptValue in models.Value.objects.filter(Q(concept__nodetype='Collection') | Q(concept__nodetype='ConceptScheme'), valuetype__category ='label'):
            doc  = {
                'category': 'label',
                'conceptid': conceptValue.concept_id,
                'language': conceptValue.language_id,
                'value': conceptValue.value,
                'type': conceptValue.valuetype_id,
                'id': conceptValue.valueid,
                'top_concept': conceptValue.concept_id
            }
            concept_indexer.add(index='strings', doc_type='concept', id=doc['id'], data=doc)

        # build a quoted, comma-separated list of label value types for the raw SQL
        valueTypes = []
        valueTypes2=[]  # NOTE(review): built but never used
        for valuetype in models.DValueType.objects.filter(category='label').values_list('valuetype', flat=True):
            valueTypes2.append("%s" % valuetype)
            valueTypes.append("'%s'" % valuetype)
        valueTypes = ",".join(valueTypes)

        # Pass 2: recursively walk each top concept's descendants and index
        # their labels, tagged with the top concept they belong to
        for conceptValue in models.Relation.objects.filter(relationtype='hasTopConcept'):
            topConcept = conceptValue.conceptto_id
            sql = """
                WITH RECURSIVE children_inclusive AS (
                    SELECT d.conceptidfrom, d.conceptidto, c.*, 1 AS depth          ---|NonRecursive Part
                        FROM relations d
                        JOIN values c ON(c.conceptid = d.conceptidto)
                        JOIN values c2 ON(c2.conceptid = d.conceptidfrom)
                        WHERE d.conceptidto = '{0}'
                        and c2.valuetype = 'prefLabel'
                        and c.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                        UNION
                    SELECT d.conceptidfrom, d.conceptidto, v.*, depth+1             ---|RecursivePart
                        FROM relations  d
                        JOIN children_inclusive b ON(b.conceptidto = d.conceptidfrom)
                        JOIN values v ON(v.conceptid = d.conceptidto)
                        JOIN values v2 ON(v2.conceptid = d.conceptidfrom)
                        WHERE  v2.valuetype = 'prefLabel'
                        and v.valuetype in ({1})
                        and (d.relationtype = 'narrower' or d.relationtype = 'hasTopConcept')
                ) SELECT valueid, value, conceptid, languageid, valuetype FROM children_inclusive ORDER BY depth;
            """.format(topConcept, valueTypes)

            cursor.execute(sql)
            for conceptValue in cursor.fetchall():
                doc  = {
                    'category': 'label',
                    'conceptid': conceptValue[2],
                    'language': conceptValue[3],
                    'value': conceptValue[1],
                    'type': conceptValue[4],
                    'id': conceptValue[0],
                    'top_concept': topConcept
                }
                concept_indexer.add(index='strings', doc_type='concept', id=doc['id'], data=doc)

    # sanity check: compare label count in the db with the indexed count
    cursor.execute("SELECT count(*) from values WHERE valuetype in ({0})".format(valueTypes))
    concept_count_in_db = cursor.fetchone()[0]
    index_count = se.count(index='strings', doc_type='concept')
    print "Status: {0}, In Database: {1}, Indexed: {2}, Took: {3} seconds".format('Passed' if concept_count_in_db == index_count else 'Failed', concept_count_in_db, index_count, (datetime.now()-start).seconds)