Example #1
    def test_bulk_delete(self):
        """
        Test bulk deleting of documents in Elasticsearch

        """

        se = SearchEngineFactory().create()
        # se.create_index(index='test')

        for i in range(10):
            x = {
                'id': i,
                'type': 'prefLabel',
                'value': 'test pref label',
            }
            se.index_data(index='test', doc_type='test', body=x, idfield='id', refresh=True)
            y = {
                'id': i + 100,
                'type': 'altLabel',
                'value': 'test alt label',
            }
            se.index_data(index='test', doc_type='test', body=y, idfield='id', refresh=True)


        query = Query(se, start=0, limit=100)
        match = Match(field='type', query='altLabel')
        query.add_query(match)

        query.delete(index='test', refresh=True)

        self.assertEqual(se.es.count(index='test', doc_type='test')['count'], 10)
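Distilling the pattern above into a standalone sketch: index documents keyed by an id field, then delete by a Match query. This assumes a running Elasticsearch instance and an Arches install; the import paths follow recent Arches releases (confirm against your version), and the index and field names are illustrative only:

    from arches.app.search.search_engine_factory import SearchEngineFactory
    from arches.app.search.elasticsearch_dsl_builder import Query, Match

    se = SearchEngineFactory().create()
    # index a document, using its 'id' value as the Elasticsearch document id
    se.index_data(index='test', body={'id': 1, 'type': 'altLabel', 'value': 'example'},
                  idfield='id', refresh=True)

    # delete every document whose 'type' field matches 'altLabel'
    query = Query(se, start=0, limit=100)
    query.add_query(Match(field='type', query='altLabel'))
    query.delete(index='test', refresh=True)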
Example #2
    def index(self):
        """
        Indexes all the necessary values of a resource to support search

        """
        if str(self.graph_id) != str(
                settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
            se = SearchEngineFactory().create()
            datatype_factory = DataTypeFactory()
            node_datatypes = {
                str(nodeid): datatype
                for nodeid, datatype in models.Node.objects.values_list(
                    "nodeid", "datatype")
            }
            document, terms = self.get_documents_to_index(
                datatype_factory=datatype_factory,
                node_datatypes=node_datatypes)
            document["root_ontology_class"] = self.get_root_ontology()
            doc = JSONSerializer().serializeToPython(document)
            se.index_data(index="resources", body=doc, id=self.pk)
            for term in terms:
                se.index_data("terms", body=term["_source"], id=term["_id"])

            for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
                es_index = import_class_from_string(index["module"])(
                    index["name"])
                document, doc_id = es_index.get_documents_to_index(
                    self, document["tiles"])
                es_index.index_document(document=document, id=doc_id)
Example #3
    def test_delete_by_query(self):
        """
        Test deleting documents by query in Elasticsearch

        """

        se = SearchEngineFactory().create()

        for i in range(10):
            x = {
                'id': i,
                'type': 'prefLabel',
                'value': 'test pref label',
            }
            se.index_data(index='test', body=x, idfield='id', refresh=True)
            y = {
                'id': i + 100,
                'type': 'altLabel',
                'value': 'test alt label',
            }
            se.index_data(index='test', body=y, idfield='id', refresh=True)

        time.sleep(1)

        query = Query(se, start=0, limit=100)
        match = Match(field='type', query='altLabel')
        query.add_query(match)

        query.delete(index='test', refresh=True)

        self.assertEqual(se.count(index='test'), 10)
Example #4
def index_resources_by_type(resource_types, result_summary):
    """
    Collects and indexes all resources

    """

    for resource_type in resource_types:
        resources = archesmodels.Entities.objects.filter(entitytypeid = resource_type)
        print "Indexing {0} {1} resources".format(len(resources), resource_type[0])
        result_summary[resource_type[0]] = {'database':len(resources), 'indexed':0}
        errors = []
        for resource in resources:
            try:
                resource = Resource().get(resource.entityid)
                resource.index()
            except Exception as e:
                # compare messages; distinct exception objects never compare equal
                if str(e) not in [str(err) for err in errors]:
                    errors.append(e)
        if len(errors) > 0:
            print errors[0], ':', len(errors)

        se = SearchEngineFactory().create()
        related_resource_records = archesmodels.RelatedResource.objects.all()
        for related_resource_record in related_resource_records:
            se.index_data(index='resource_relations', doc_type='all', body=model_to_dict(related_resource_record), idfield='resourcexid')

    return result_summary
Example #5
def index_resources_by_type(resource_types, result_summary):
    """
    Collects and indexes all resources

    """

    for resource_type in resource_types:
        resources = archesmodels.Entities.objects.filter(entitytypeid = resource_type)
        print "Indexing {0} {1} resources".format(len(resources), resource_type[0])
        result_summary[resource_type[0]] = {'database':len(resources), 'indexed':0}
        errors = []
        for resource in resources:
            try:
                resource = Resource().get(resource.entityid)
                resource.index()
            except Exception as e:
                # compare messages; distinct exception objects never compare equal
                if str(e) not in [str(err) for err in errors]:
                    errors.append(e)
        if len(errors) > 0:
            print errors[0], ':', len(errors)

        se = SearchEngineFactory().create()
        related_resource_records = archesmodels.RelatedResource.objects.all()
        for related_resource_record in related_resource_records:
            se.index_data(index='resource_relations', doc_type='all', body=model_to_dict(related_resource_record), idfield='resourcexid')

    return result_summary
Example #6
    def index(self):
        """
        Indexes all the necessary values of a resource to support search

        """
        if unicode(self.graph_id) != unicode(
                settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
            se = SearchEngineFactory().create()
            datatype_factory = DataTypeFactory()
            node_datatypes = {
                str(nodeid): datatype
                for nodeid, datatype in models.Node.objects.values_list(
                    'nodeid', 'datatype')
            }
            document, terms = self.get_documents_to_index(
                datatype_factory=datatype_factory,
                node_datatypes=node_datatypes)
            document['root_ontology_class'] = self.get_root_ontology()
            se.index_data('resource',
                          self.graph_id,
                          JSONSerializer().serializeToPython(document),
                          id=self.pk)
            for term in terms:
                se.index_data('strings',
                              'term',
                              term['_source'],
                              id=term['_id'])
Example #7
 def save(self):
     se = SearchEngineFactory().create()
     document = model_to_dict(self)
     se.index_data(index='resource_relations',
                   doc_type='all',
                   body=document,
                   idfield='resourcexid')
     super(ResourceXResource, self).save()
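With this override in place, persisting a relation also pushes its dict form into the resource_relations index; a hedged usage sketch (field names follow the ResourceXResource usage in Example #19 below, and the values are illustrative):

    rr = ResourceXResource(
        resourceinstanceidfrom=resource_a,    # a Resource instance (assumed)
        resourceinstanceidto=resource_b,      # a Resource instance (assumed)
        relationshiptype=relationship_value,  # a models.Value (assumed)
    )
    rr.save()  # indexes the record under resource_relations/all, then saves the row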
Example #8
 def save(self):
     from arches.app.search.search_engine_factory import SearchEngineFactory
     se = SearchEngineFactory().create()
     if not self.created:
         self.created = datetime.datetime.now()
     self.modified = datetime.datetime.now()
     document = model_to_dict(self)
     se.index_data(index='resource_relations', doc_type='all', body=document, idfield='resourcexid')
     super(ResourceXResource, self).save()
Example #9
    def index(self, scheme=None):
        if self.category == 'label':
            se = SearchEngineFactory().create()
            data = JSONSerializer().serializeToPython(self)
            if scheme == None:
                scheme = self.get_scheme_id()
            if scheme == None:
                raise Exception('Index of label failed.  Index type (scheme id) could not be derived from the label.')

            data['top_concept'] = scheme.id
            se.index_data('strings', 'concept', data, 'id')
Example #10
    def index(self, scheme=None):
        if self.category == 'label':
            se = SearchEngineFactory().create()
            data = JSONSerializer().serializeToPython(self)
            if scheme == None:
                scheme = self.get_scheme_id()
            if scheme == None:
                raise Exception(
                    'Index of label failed.  Index type (scheme id) could not be derived from the label.'
                )

            data['top_concept'] = scheme.id
            se.index_data('strings', 'concept', data, 'id')
Example #11
    def index(self, scheme=None):
        if self.category == 'label':
            se = SearchEngineFactory().create()
            data = JSONSerializer().serializeToPython(self)            
            if scheme == None:
                scheme = self.get_scheme_id()
            if scheme == None:
                raise Exception('Index of label failed.  Index type (scheme id) could not be derived from the label.')

            se.create_mapping('concept_labels', scheme.id, fieldname='conceptid', fieldtype='string', fieldindex='not_analyzed')
            se.index_data('concept_labels', scheme.id, data, 'id')
            # don't create terms for entity type concepts
            if not(scheme.id == '00000000-0000-0000-0000-000000000003' or scheme.id == '00000000-0000-0000-0000-000000000004'):
                se.index_term(self.value, self.id, scheme.id, {'conceptid': self.conceptid})
Example #12
    def index(self, scheme=None):
        if self.category == 'label':
            se = SearchEngineFactory().create()
            data = JSONSerializer().serializeToPython(self)            
            if scheme == None:
                scheme = self.get_scheme_id()
            if scheme == None:
                raise Exception('Index of label failed.  Index type (scheme id) could not be derived from the label.')

            se.create_mapping('concept_labels', scheme.id, fieldname='conceptid', fieldtype='string', fieldindex='not_analyzed')
            se.index_data('concept_labels', scheme.id, data, 'id')
            # don't create terms for entity type concepts
            if not(scheme.id == '00000000-0000-0000-0000-000000000003' or scheme.id == '00000000-0000-0000-0000-000000000004'):
                se.index_term(self.value, self.id, scheme.id, {'conceptid': self.conceptid})
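These Label.index() variants are invoked after a label value is saved; a hedged sketch of the call site (the label object is assumed, and scheme can be any object exposing an .id):

    label.index()                     # derives the scheme id from the label itself
    label.index(scheme=known_scheme)  # or pass the scheme explicitly to skip the lookup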
Example #13
    def index(self):
        """
        Indexes all the necessary values of a resource to support search

        """
        if unicode(self.graph_id) != unicode(settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
            se = SearchEngineFactory().create()
            datatype_factory = DataTypeFactory()
            node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}
            document, terms = self.get_documents_to_index(datatype_factory=datatype_factory, node_datatypes=node_datatypes)
            document['root_ontology_class'] = self.get_root_ontology()
            se.index_data('resource', self.graph_id, JSONSerializer().serializeToPython(document), id=self.pk)

            for term in terms:
                se.index_data('strings', 'term', term['_source'], id=term['_id'])
Example #14
    def index(self):
        """
        Indexes all the necessary values of a resource to support search

        """

        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}

        document, terms = self.get_documents_to_index(datatype_factory=datatype_factory, node_datatypes=node_datatypes)
        se.index_data('resource', self.graph_id, JSONSerializer().serializeToPython(document), id=self.pk)

        for term in terms:
            se.index_data('strings', 'term', term['_source'], id=term['_id'])
Example #15
    def index(self, scheme=None):
        if self.category == 'label':
            se = SearchEngineFactory().create()
            data = JSONSerializer().serializeToPython(self)            
            if scheme == None:
                scheme = self.get_scheme_id()
            if scheme == None:
                raise Exception('Index of label failed.  Index type (scheme id) could not be derived from the label.')

            se.create_mapping('concept_labels', scheme.id, fieldname='conceptid', fieldtype='string', fieldindex='not_analyzed')
            se.index_data('concept_labels', scheme.id, data, 'id')
            # Looks up whether the label is actually a dropdown label or an entity label and, if so, excludes it from the term search index.
            entity_or_dropdown = archesmodels.ConceptRelations.objects.filter(
                Q(relationtype='hasCollection') | Q(relationtype='hasEntity'),
                conceptidto=scheme.id)
            is_entity_or_dropdown = entity_or_dropdown.count() > 0
            # don't create terms for entity type concepts
            if not (scheme.id == '00000000-0000-0000-0000-000000000003' or scheme.id == '00000000-0000-0000-0000-000000000004') and not is_entity_or_dropdown:
                se.index_term(self.value, self.id, scheme.id, {'conceptid': self.conceptid})
Example #16
    def index(self, scheme=None):
        if self.category == 'label':
            se = SearchEngineFactory().create()
            data = JSONSerializer().serializeToPython(self)            
            if scheme == None:
                scheme = self.get_scheme_id()
            if scheme == None:
                raise Exception('Index of label failed.  Index type (scheme id) could not be derived from the label.')

            se.create_mapping('concept_labels', scheme.id, fieldname='conceptid', fieldtype='string', fieldindex='not_analyzed')
            se.index_data('concept_labels', scheme.id, data, 'id')
            # Looks up whether the label is actually a dropdown label or an entity label and, if so, excludes it from the term search index.
            entity_or_dropdown = archesmodels.ConceptRelations.objects.filter(
                Q(relationtype='hasCollection') | Q(relationtype='hasEntity'),
                conceptidto=scheme.id)
            is_entity_or_dropdown = entity_or_dropdown.count() > 0
            # don't create terms for entity type concepts
            if not (scheme.id == '00000000-0000-0000-0000-000000000003' or scheme.id == '00000000-0000-0000-0000-000000000004') and not is_entity_or_dropdown:
                se.index_term(self.value, self.id, scheme.id, {'conceptid': self.conceptid})
Example #17
    def index(self,
              documents,
              index,
              type,
              idfield,
              processdoc=None,
              getid=None,
              bulk=False):
        detail = ''
        bulkitems = []
        errorlist = []
        se = SearchEngineFactory().create()
        if not isinstance(documents, list):
            documents = [documents]
        for document in documents:
            #print "inserting document: %s" % (document)
            sys.stdout.write('.')
            if processdoc == None:
                data = document
            else:
                data = processdoc(document)
            id = None
            if getid != None:
                id = getid(document, data)
            try:
                if bulk:
                    bulkitem = se.create_bulk_item(index, type, id, data)
                    bulkitems.append(bulkitem[0])
                    bulkitems.append(bulkitem[1])
                else:
                    se.index_data(index, type, data, idfield=idfield, id=id)
            except Exception as detail:
                errorlist.append(id)
        if bulk:
            try:
                se.bulk_index(index, type, bulkitems)
            except Exception as detail:
                errorlist = bulkitems
                print 'bulk insert failed'

        if detail != '':
            print "\n\nException detail: %s " % (detail)
            print "There was a problem indexing the following items:"
            print errorlist
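A hedged sketch of driving this helper in bulk mode (the indexer object, document list, and getid callback are illustrative; in bulk mode items are queued with create_bulk_item and flushed by a single bulk_index call at the end):

    docs = [{'id': 'a1', 'label': 'first'}, {'id': 'a2', 'label': 'second'}]
    indexer.index(docs,
                  index='concept_labels',
                  type='label',
                  idfield='id',
                  getid=lambda doc, data: data['id'],
                  bulk=True)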
Example #18
def index_resources_by_type(resource_types, result_summary):
    """
    Collects and indexes all resources

    """

    errors = []
    for resource_type in resource_types:
        resources = archesmodels.Entities.objects.filter(
            entitytypeid=resource_type)
        print "Indexing {0} {1} resources".format(len(resources),
                                                  resource_type[0])
        result_summary[resource_type[0]] = {
            'database': len(resources),
            'indexed': 0
        }

        for resource in resources:
            try:
                resource = Resource().get(resource.entityid)
                resource.index()
            except Exception as e:
                msg = 'Could not index resource {}.\nERROR: {}'.format(
                    resource.entityid, e)
                print msg
                errors.append(e)

        se = SearchEngineFactory().create()
        related_resource_records = archesmodels.RelatedResource.objects.all()
        for related_resource_record in related_resource_records:
            se.index_data(index='resource_relations',
                          doc_type='all',
                          body=model_to_dict(related_resource_record),
                          idfield='resourcexid')

    if len(errors) > 0:
        print "Number of errors:", len(errors)
        log_file = os.path.join(settings.PACKAGE_ROOT, 'logs',
                                'indexing_errors.txt')
        utils.write_to_file(log_file, '\n'.join(str(e) for e in errors), mode="wb")
        print "  -- errors written to:", log_file

    return result_summary
Example #19
    def post(self, request, resourceid=None):
        es = Elasticsearch()
        se = SearchEngineFactory().create()
        res = dict(request.POST)
        relationship_type = res['relationship_properties[relationship_type]'][0]
        datefrom = res['relationship_properties[datefrom]'][0]
        dateto = res['relationship_properties[dateto]'][0]
        dateto = None if dateto == '' else dateto
        datefrom = None if datefrom == '' else datefrom
        notes = res['relationship_properties[notes]'][0]
        root_resourceinstanceid = res['root_resourceinstanceid']
        instances_to_relate = []
        relationships_to_update = []
        if 'instances_to_relate[]' in res:
            instances_to_relate = res['instances_to_relate[]']
        if 'relationship_ids[]' in res:
            relationships_to_update = res['relationship_ids[]']

        for instanceid in instances_to_relate:
            rr = models.ResourceXResource.objects.create(
                resourceinstanceidfrom = Resource(root_resourceinstanceid[0]),
                resourceinstanceidto = Resource(instanceid),
                notes = notes,
                relationshiptype = models.Value(relationship_type),
                datestarted = datefrom,
                dateended = dateto
            )
            document = model_to_dict(rr)
            se.index_data(index='resource_relations', doc_type='all', body=document, idfield='resourcexid')

        for relationshipid in relationships_to_update:
            rr = models.ResourceXResource.objects.get(pk=relationshipid)
            rr.notes = notes
            rr.relationshiptype = models.Value(relationship_type)
            rr.datestarted = datefrom
            rr.dateended = dateto
            rr.save()
            document = model_to_dict(rr)
            se.index_data(index='resource_relations', doc_type='all', body=document, idfield='resourcexid')
        start = request.GET.get('start', 0)
        es.indices.refresh(index="resource_relations")
        return JSONResponse(self.get_related_resources(root_resourceinstanceid[0], lang="en-us", start=start, limit=15), indent=4)
Example #20
def add_resource_relation(entityid1, entityid2, relationship_type_string):
    # find the relationship type
    se = SearchEngineFactory().create()
    try:

        logging.warning("finding relationship: %s", relationship_type_string)
        value = models.Values.objects.get(
            value__icontains=relationship_type_string)
        relationship = models.RelatedResource(entityid1=entityid1,
                                              entityid2=entityid2,
                                              relationshiptype=value.pk)
        relationship.save()
        se.index_data(index='resource_relations',
                      doc_type='all',
                      body=model_to_dict(relationship),
                      idfield='resourcexid')
        logging.warning("Added relationship")
    except Exception as e:
        logging.warning("Unable to create relation %s to %s. %s", entityid1,
                        entityid2, e)
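A hedged usage sketch of the helper above (the entity ids and relationship label are illustrative; note that the value__icontains lookup with get() must match exactly one Values row, so an exact, unambiguous label is safest):

    add_resource_relation('11111111-1111-1111-1111-111111111111',
                          '22222222-2222-2222-2222-222222222222',
                          'is related to')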
Example #21
    def index(self):
        """
        Indexes all the necessary documents related to resources to support the map, search, and reports

        """

        se = SearchEngineFactory().create()

        search_documents = self.prepare_documents_for_search_index()
        for document in search_documents:
            se.index_data('resource',
                          self.resourceinstance.graph_id,
                          document,
                          id=self.resourceinstance_id)

        for term in self.prepare_terms_for_search_index():
            term_id = '%s_%s' % (str(self.tileid), str(term['nodeid']))
            se.delete_terms(term_id)
            se.index_term(term['term'], term_id, term['context'],
                          term['options'])
Example #22
    def index(self):
        """
        Indexes all the necessary documents related to resources to support the map, search, and reports

        """

        se = SearchEngineFactory().create()

        search_documents = self.prepare_documents_for_search_index()
        for document in search_documents:
            se.index_data('entity',
                          self.entitytypeid,
                          document,
                          id=self.entityid)

            report_documents = self.prepare_documents_for_report_index(
                geom_entities=document['geometries'])
            for report_document in report_documents:
                se.index_data('resource',
                              self.entitytypeid,
                              report_document,
                              id=self.entityid)

            geojson_documents = self.prepare_documents_for_map_index(
                geom_entities=document['geometries'])
            for geojson in geojson_documents:
                se.index_data('maplayers',
                              self.entitytypeid,
                              geojson,
                              idfield='id')

        for term in self.prepare_terms_for_search_index():
            se.index_term(term['term'], term['entityid'], term['context'],
                          term['ewstatus'], term['options'])
Example #23
    def update(self, data, files):
        se = SearchEngineFactory().create()
        related_resources_data = data.get('related-resources', [])
        original_relations = self.resource.get_related_resources()
        if self.resource.entityid == '':
            self.resource.save()
        relationship_ids = []

        for related_resource in related_resources_data:
            relationship_id = related_resource['relationship']['resourcexid']
            relationship_ids.append(relationship_id)
            resource_id = related_resource['relatedresourceid']
            relationship_type_id = related_resource['relationship']['relationshiptype']
            if isinstance(relationship_type_id, dict):
                relationship_type_id = relationship_type_id['value']
            notes = related_resource['relationship']['notes']
            date_started = related_resource['relationship']['datestarted']
            date_ended = related_resource['relationship']['dateended']
            if not relationship_id:
                relationship = self.resource.create_resource_relationship(resource_id,
                    relationship_type_id=relationship_type_id,
                    notes=notes,
                    date_started=date_started,
                    date_ended=date_ended
                    )
                    
            else:
                relationship = RelatedResource.objects.get(pk=relationship_id)
                relationship.relationshiptype = relationship_type_id
                relationship.notes = notes
                relationship.datestarted = date_started
                relationship.dateended = date_ended
                relationship.save()
                se.delete(index='resource_relations', doc_type='all', id=relationship_id)
            se.index_data(index='resource_relations', doc_type='all', body=model_to_dict(relationship), idfield='resourcexid')

        for relatedentity in original_relations:
            if relatedentity['relationship'].resourcexid not in relationship_ids:
                se.delete(index='resource_relations', doc_type='all', id=relatedentity['relationship'].resourcexid)
                relatedentity['relationship'].delete()
Example #24
    def test_delete_by_query(self):
        """
        Test deleting documents by query in Elasticsearch

        """

        se = SearchEngineFactory().create()

        for i in range(10):
            x = {"id": i, "type": "prefLabel", "value": "test pref label"}
            se.index_data(index="test", body=x, idfield="id", refresh=True)
            y = {"id": i + 100, "type": "altLabel", "value": "test alt label"}
            se.index_data(index="test", body=y, idfield="id", refresh=True)

        time.sleep(1)

        query = Query(se, start=0, limit=100)
        match = Match(field="type", query="altLabel")
        query.add_query(match)

        query.delete(index="test", refresh=True)

        self.assertEqual(se.count(index="test"), 10)
Example #25
    def index(self):
        """
        Indexes all the necessary values of a resource to support search

        """

        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list(
                'nodeid', 'datatype')
        }

        document, terms = self.get_documents_to_index(
            datatype_factory=datatype_factory, node_datatypes=node_datatypes)
        se.index_data('resource',
                      self.graph_id,
                      JSONSerializer().serializeToPython(document),
                      id=self.pk)

        for term in terms:
            se.index_data('strings', 'term', term['_source'], id=term['_id'])
Example #26
    def index(self, documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
        detail = ''
        bulkitems = []
        errorlist = []
        se = SearchEngineFactory().create()
        if not isinstance(documents, list):
            documents = [documents]
        for document in documents:
            #print "inserting document: %s" % (document)
            sys.stdout.write('.')
            if processdoc == None:
                data = document
            else:
                data = processdoc(document)
            id = None
            if getid != None:
                id = getid(document, data)            
            try:
                if bulk:
                    bulkitem = se.create_bulk_item(index, type, id, data)
                    bulkitems.append(bulkitem[0])
                    bulkitems.append(bulkitem[1])        
                else:
                    se.index_data(index, type, data, idfield=idfield, id=id)
            except Exception as detail:
                errorlist.append(id)
        if bulk:
            try:
                se.bulk_index(index, type, bulkitems)
            except Exception as detail:
                errorlist = bulkitems
                print 'bulk insert failed'

        if detail != '':
            print "\n\nException detail: %s " % (detail)
            print "There was a problem indexing the following items:"
            print errorlist
Example #27
    def index(self):
        """
        Indexes all the necessary documents related to resources to support the map, search, and reports

        """

        se = SearchEngineFactory().create()

        search_documents = self.prepare_documents_for_search_index()
        for document in search_documents:
            se.index_data('entity', self.entitytypeid, document, id=self.entityid)

            report_documents = self.prepare_documents_for_report_index(geom_entities=document['geometries'])
            for report_document in report_documents:
                se.index_data('resource', self.entitytypeid, report_document, id=self.entityid)

            geojson_documents = self.prepare_documents_for_map_index(geom_entities=document['geometries'])
            for geojson in geojson_documents:
                se.index_data('maplayers', self.entitytypeid, geojson, idfield='id')

        for term in self.prepare_terms_for_search_index():
            se.index_term(term['term'], term['entityid'], term['context'], term['options'])
Example #28
    def index(documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
        print 'index_concepts.index'
        detail = ''
        bulkitems = []
        errorlist = []
        se = SearchEngineFactory().create()
        if not isinstance(documents, list):
            documents = [documents]
        for document in documents:
            sys.stdout.write('.')
            if processdoc == None:
                data = document
            else:
                data = processdoc(document)
            id = None
            if getid != None:
                id = getid(document, data)            
            try:
                if bulk:
                    bulkitem = se.create_bulk_item(index, type, id, data)
                    bulkitems.append(bulkitem[0])
                    bulkitems.append(bulkitem[1])        
                else:
                    se.index_data(index, type, data, idfield=idfield, id=id)
                    #se.index_data('concept_labels', '00000000-0000-0000-0000-000000000005', data, 'id')
                    for concept in data['labels']:
                        #se.index_term(concept['label'], concept['labelid'], '00000000-0000-0000-0000-000000000005', settings.PUBLISHED_LABEL, {'conceptid': data['conceptid']})
                        if concept['label'].strip(' \t\n\r') != '':
                            already_indexed = False
                            count = 1
                            ids = [id]
                        try:
                            _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(concept['label']), hash(data['conceptid'])))
                            result = se.es.get(index='term', doc_type='value', id=_id, ignore=404)

                            #print 'result: %s' % result
                            if result['found'] == True:
                                ids = result['_source']['ids']
                                if id not in ids:
                                    ids.append(id)
                            else:
                                ids = [id]                             
                            if data['context'] != '00000000-0000-0000-0000-000000000003' and data['context'] != '00000000-0000-0000-0000-000000000004':
                                se.index_data('term', 'value', {'term': concept['label'], 'context': data['context'], 'ewstatus': settings.PUBLISHED_LABEL, 'options': {'conceptid': data['conceptid']}, 'count': len(ids), 'ids': ids}, id=_id)
                            
                        except Exception as detail:
                            raise detail   
            except Exception as detail:
                print detail
                errorlist.append(id)
        if bulk:
            try:
                se.bulk_index(index, type, bulkitems)
            except Exception as detail:
                errorlist = bulkitems
                print 'bulk insert failed'

        if detail != '':
            print "\n\nException detail: %s " % (detail)
            print "There was a problem indexing the following items:"
            print errorlist
Example #29
def main():

    sql = """ALTER TABLE concepts.concepts
            ALTER COLUMN conceptid DROP DEFAULT;

    ALTER TABLE concepts.concepts
            ALTER COLUMN legacyoid SET NOT NULL;

    ALTER TABLE concepts.concepts
            DROP CONSTRAINT IF EXISTS unique_concepts_legacyoid;

    ALTER TABLE concepts.concepts
            ADD CONSTRAINT unique_concepts_legacyoid UNIQUE (legacyoid);


    CREATE OR REPLACE VIEW ontology.vw_export_nodes AS 
            SELECT foo.assettype,
                    foo.node AS label,
                    (foo.assettype || ':'::text) || foo.node AS id,
                    foo.mergenodeid AS mergenode,
                    foo.businesstable AS businesstablename
            FROM ( SELECT m.entitytypeidfrom AS assettype,
                            r.entitytypedomain AS node,
                            m.mergenodeid,
                            ( SELECT entity_types.businesstablename
                                         FROM data.entity_types
                                        WHERE entity_types.entitytypeid = r.entitytypedomain) AS businesstable
                         FROM ontology.mapping_steps ms
                             JOIN ontology.mappings m ON m.mappingid = ms.mappingid
                             JOIN ontology.rules r ON r.ruleid = ms.ruleid
                    UNION
                     SELECT m.entitytypeidfrom,
                            r.entitytyperange AS node,
                            m.mergenodeid,
                            ( SELECT entity_types.businesstablename
                                         FROM data.entity_types
                                        WHERE entity_types.entitytypeid = r.entitytyperange) AS businesstable
                         FROM ontology.mapping_steps ms
                             JOIN ontology.mappings m ON m.mappingid = ms.mappingid
                             JOIN ontology.rules r ON r.ruleid = ms.ruleid) foo
            WHERE (foo.node <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text])) AND foo.node <> foo.assettype
            ORDER BY foo.assettype, foo.node;

            ALTER TABLE ontology.vw_export_nodes
                OWNER TO postgres;


    CREATE OR REPLACE VIEW ontology.vw_export_edges AS 
            SELECT m.entitytypeidfrom AS assettype,
                    (m.entitytypeidfrom || ':'::text) || r.entitytypedomain AS source,
                    (m.entitytypeidfrom || ':'::text) || r.entitytyperange AS target,
                    r.propertyid AS label
            FROM ontology.mapping_steps ms
                    JOIN ontology.mappings m ON m.mappingid = ms.mappingid
                    JOIN ontology.rules r ON r.ruleid = ms.ruleid
            WHERE (m.entitytypeidfrom <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text])) AND (r.entitytypedomain <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text])) AND (r.entitytyperange <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text])) AND m.entitytypeidto = r.entitytyperange
            ORDER BY m.entitytypeidfrom;

    ALTER TABLE ontology.vw_export_edges
        OWNER TO postgres;

    INSERT INTO concepts.d_valuetypes SELECT 'sortorder', 'undefined', null, 'arches', 'text'
            WHERE NOT EXISTS (SELECT 1 FROM concepts.d_valuetypes WHERE valuetype = 'sortorder'); 


    CREATE OR REPLACE FUNCTION concepts.concpets_ins()
            RETURNS trigger AS
            $BODY$
            DECLARE
             v_uuid uuid = public.uuid_generate_v1mc();

            BEGIN
            --Provides CONCEPTID for RDM inserts and cases where ETL conceptid is not a UUID
                IF NEW.CONCEPTID IS NULL THEN
                     NEW.CONCEPTID := v_uuid;
                END IF;

             -- Supports RDM where no conceptid or legacyoid is fed in
                IF NEW.CONCEPTID IS NULL AND (NEW.LEGACYOID IS NULL OR NEW.LEGACYOID = '') THEN
                     NEW.LEGACYOID = v_uuid::text;
                END IF;   


            -- I would assume that two cases below are handled in python code by being explicit about insert values for both columns... just coding defensively here. ABL.
            -- Supports where ETL provided conceptid is a UUID and will be kept, but no LEGACYOID provided.
                IF NEW.CONCEPTID IS NOT NULL and (NEW.LEGACYOID is null or NEW.LEGACYOID = '') THEN
                     NEW.LEGACYOID = NEW.CONCEPTID::text;     
                END IF;   

            -- Supports where ETL'ed conceptid is not a UUID.  Populates original "conceptid" as LEGACYOID.
                IF NEW.LEGACYOID IS NOT NULL OR NEW.LEGACYOID != '' then
                     NEW.LEGACYOID = NEW.LEGACYOID;     
                END IF;   

            RETURN NEW;
            END$BODY$
                LANGUAGE plpgsql VOLATILE
                COST 100;

    ALTER FUNCTION concepts.concpets_ins()
        OWNER TO postgres;

        
        
    -- Trigger: concepts_ins_tgr on concepts.concepts

    DROP TRIGGER IF EXISTS concepts_ins_tgr ON concepts.concepts;

    CREATE TRIGGER concepts_ins_tgr
        BEFORE INSERT
        ON concepts.concepts
        FOR EACH ROW
        EXECUTE PROCEDURE concepts.concpets_ins();"""


    with transaction.atomic():
        #import arches.management.patches.upgrade_to_v3_0_4

        cursor = connection.cursor()
        cursor.execute(sql)

        anonymous_user, created = User.objects.get_or_create(username='******')
        if created:
                anonymous_user.set_password('')

        read_group, created = Group.objects.get_or_create(name='read')
        anonymous_user.groups.add(read_group)

        edit_group, created = Group.objects.get_or_create(name='edit')
        admin_user = User.objects.get(username='******')
        admin_user.groups.add(edit_group)
        admin_user.groups.add(read_group)

        print '\nINSTALLING PYSHP MODULE'
        print '-----------------------'
        pip.main(['install', 'pyshp'])


        print '\nUPDATING ENTITY INDEX'
        print '---------------------'

        # Add numbers array to resources that do not have them. Move numbers data from child_entities to numbers array in index.
        resourceid_sql = "SELECT entityid FROM data.entities WHERE entitytypeid IN (SELECT distinct(entitytypeid) FROM data.entity_types WHERE isresource =True);"
        cursor.execute(resourceid_sql)
        resourceids = []
        for val in cursor.fetchall():
            resourceids.append(val[0])
        
        start = time.time()
        records = 0
        se = SearchEngineFactory().create()
        for resourceid in resourceids:
            indexed_resource = se.search(index='entity', id=resourceid)        

            if 'numbers' not in indexed_resource['_source']:
                indexed_resource['_source']['numbers'] = []
            else:
                pass
                
            # iterate over a copy so removing items doesn't skip elements
            for child_entity in list(indexed_resource['_source']['child_entities']):
                if child_entity['businesstablename'] == 'numbers':
                    indexed_resource['_source']['numbers'].append(child_entity)
                    indexed_resource['_source']['child_entities'].remove(child_entity)
        
            ## Reindex resource here.
            se.index_data(index='entity',doc_type=indexed_resource['_type'], body=indexed_resource['_source'], id=indexed_resource['_id'])
            records+=1
            # if records%500 == 0:
            #     print '%s records processed'%str(records)

        print '%s records updated' % str(records)

        # print 'Patch took %s seconds to run.'%str(time.time() - start)

        print "\npatch '%s' successfully applied." % __name__
Example #30
    def index(documents,
              index,
              type,
              idfield,
              processdoc=None,
              getid=None,
              bulk=False):
        print 'index_concepts.index'
        detail = ''
        bulkitems = []
        errorlist = []
        se = SearchEngineFactory().create()
        if not isinstance(documents, list):
            documents = [documents]
        for document in documents:
            sys.stdout.write('.')
            if processdoc == None:
                data = document
            else:
                data = processdoc(document)
            id = None
            if getid != None:
                id = getid(document, data)
            try:
                if bulk:
                    bulkitem = se.create_bulk_item(index, type, id, data)
                    bulkitems.append(bulkitem[0])
                    bulkitems.append(bulkitem[1])
                else:
                    se.index_data(index, type, data, idfield=idfield, id=id)
                    #se.index_data('concept_labels', '00000000-0000-0000-0000-000000000005', data, 'id')
                    for concept in data['labels']:
                        #se.index_term(concept['label'], concept['labelid'], '00000000-0000-0000-0000-000000000005', settings.PUBLISHED_LABEL, {'conceptid': data['conceptid']})
                        if concept['label'].strip(' \t\n\r') != '':
                            already_indexed = False
                            count = 1
                            ids = [id]
                        try:
                            _id = uuid.uuid3(
                                uuid.NAMESPACE_DNS,
                                '%s%s' % (hash(concept['label']),
                                          hash(data['conceptid'])))
                            result = se.es.get(index='term',
                                               doc_type='value',
                                               id=_id,
                                               ignore=404)

                            #print 'result: %s' % result
                            if result['found'] == True:
                                ids = result['_source']['ids']
                                if id not in ids:
                                    ids.append(id)
                            else:
                                ids = [id]
                            if data['context'] != '00000000-0000-0000-0000-000000000003' and data[
                                    'context'] != '00000000-0000-0000-0000-000000000004':
                                se.index_data(
                                    'term',
                                    'value', {
                                        'term': concept['label'],
                                        'context': data['context'],
                                        'ewstatus': settings.PUBLISHED_LABEL,
                                        'options': {
                                            'conceptid': data['conceptid']
                                        },
                                        'count': len(ids),
                                        'ids': ids
                                    },
                                    id=_id)

                        except Exception as detail:
                            raise detail
            except Exception as detail:
                print detail
                errorlist.append(id)
        if bulk:
            try:
                se.bulk_index(index, type, bulkitems)
            except Exception as detail:
                errorlist = bulkitems
                print 'bulk insert failed'

        if detail != '':
            print "\n\nException detail: %s " % (detail)
            print "There was a problem indexing the following items:"
            print errorlist
Example #31
class ResourceLoader(object):

    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option('--source',
            action='store',
            dest='source',
            default='',
            help='.arches file containing resource records'),
         make_option('--format',
            action='store_true',
            default='arches',
            help='format extension that you would like to load: arches or shp'),
        )

    def load(self, source):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == '.shp':
            reader = ShapeReader()
        elif file_format == '.arches':
            reader = ArchesReader()
            print '\nVALIDATING ARCHES FILE ({0})'.format(source)
            reader.validate_file(source)
        elif file_format == '.json':
            archesjson = True
            reader = JsonReader()

        start = time()
        resources = reader.load_file(source)

        print '\nLOADING RESOURCES ({0})'.format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + '.relations'
        elapsed = (time() - start)
        print 'time to parse {0} resources = {1}'.format(file_name, elapsed)
        results = self.resource_list_to_entities(resources, archesjson)
        if os.path.exists(relationships_file):
            relationships = csv.DictReader(open(relationships_file, 'r'), delimiter='|')
            for relationship in relationships:
                related_resource_records.append(self.relate_resources(relationship, results['legacyid_to_entityid'], archesjson))
        else:
            print 'No relationship file'

        #self.se.bulk_index(self.resources)


    def resource_list_to_entities(self, resource_list, archesjson=False):
        '''Takes a collection of imported resource records and saves them as arches entities'''

        start = time()
        d = datetime.datetime.now()
        load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(d.year, d.month, d.day, d.hour, d.minute, d.microsecond) #Should we append the timestamp to the exported filename?

        ret = {'successfully_saved':0, 'failed_to_save':[]}
        schema = None
        current_entitiy_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250
        for count, resource in enumerate(resource_list):

            if count >= progress_interval and count % progress_interval == 0:
                print count, 'of', len(resource_list), 'loaded'


            if archesjson == False:
                masterGraph = None
                if current_entitiy_type != resource.entitytypeid:
                    schema = Resource.get_mapping_schema(resource.entitytypeid)
                    current_entitiy_type = resource.entitytypeid

                master_graph = self.build_master_graph(resource, schema)
                self.pre_save(master_graph)

                try:
                    uuid.UUID(resource.resource_id)
                    entityid = resource.resource_id
                except ValueError:
                    entityid = ''
                    
                master_graph.save(user=self.user, note=load_id, resource_uuid=entityid)
                master_graph.index()
                resource.entityid = master_graph.entityid
                legacyid_to_entityid[resource.resource_id] = master_graph.entityid
            
            else:
                new_resource = Resource(resource)
                new_resource.save(user=self.user, note=load_id, resource_uuid=new_resource.entityid)
                try:
                    new_resource.index()
                except Exception:
                    print 'Could not index resource. This may be because the valueid of a concept is not in the database.'
                legacyid_to_entityid[new_resource.entityid] = new_resource.entityid

            ret['successfully_saved'] += 1


        ret['legacyid_to_entityid'] = legacyid_to_entityid
        elapsed = (time() - start)
        print len(resource_list), 'resources loaded'
        if len(resource_list) > 0:
            print 'total time to etl = %s' % (elapsed)
            print 'average time per entity = %s' % (elapsed/len(resource_list))
            print 'Load Identifier =', load_id
            print '***You can reverse this load with the following command:'
            print 'python manage.py packages -o remove_resources --load_id', load_id
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None
        entity_data = []

        if len(entity_data) > 0:
            master_graph = entity_data[0]
            for mapping in entity_data[1:]:
                master_graph.merge(mapping)

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype, schema[row.attributename]['steps'], row.attributename, row.attributevalue)
                entity_data2.append(entity)  

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph == None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[row.attributename]['mergenodeid']
                master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship['START_DATE'] in ('', 'None') else relationship['START_DATE']
        end_date = None if relationship['END_DATE'] in ('', 'None') else relationship['END_DATE']

        if archesjson == False:
            relationshiptype_concept = Concepts.objects.get(legacyoid = relationship['RELATION_TYPE'])
            concept_value = Values.objects.filter(conceptid = relationshiptype_concept.conceptid).filter(valuetype = 'prefLabel')
            entityid1 = legacyid_to_entityid[relationship['RESOURCEID_FROM']]
            entityid2 = legacyid_to_entityid[relationship['RESOURCEID_TO']]

        else:
            concept_value = Values.objects.filter(valueid = relationship['RELATION_TYPE'])
            entityid1 = relationship['RESOURCEID_FROM']
            entityid2 = relationship['RESOURCEID_TO']

        related_resource_record = ResourceXResource(
            entityid1 = entityid1,
            entityid2 = entityid2,
            notes = relationship['NOTES'],
            relationshiptype = concept_value[0].valueid,
            datestarted = start_date,
            dateended = end_date,
            )

        related_resource_record.save()
        self.se.index_data(index='resource_relations', doc_type='all', body=model_to_dict(related_resource_record), idfield='resourcexid')
Example #32
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option('--source',
                    action='store',
                    dest='source',
                    default='',
                    help='.arches file containing resource records'),
        make_option(
            '--format',
            action='store_true',
            default='arches',
            help='format extension that you would like to load: arches or shp'
        ),
    )

    def load(self, source, appending=False):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == '.shp':
            reader = ShapeReader()
        elif file_format == '.arches':
            reader = ArchesReader()
            print '\nVALIDATING ARCHES FILE ({0})'.format(source)
            # reader.validate_file(source)
        elif file_format == '.json':
            archesjson = True
            reader = JsonReader()
            print '\nVALIDATING JSON FILE ({0})'.format(source)
            reader.validate_file(source)
        elif file_format == '.jsonl':
            archesjson = True
            reader = JsonReader()
            print '\nNO VALIDATION USED ON JSONL FILE ({0})'.format(source)
            d = datetime.datetime.now()
            load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
                d.year, d.month, d.day, d.hour, d.minute, d.microsecond)
            loaded_ct = 0
            with open(source, "rb") as openf:
                lines = openf.readlines()
                for line in lines:
                    resource = json.loads(line)
                    result = self.resource_list_to_entities(
                        [resource],
                        True,
                        False,
                        filename=os.path.basename(source),
                        load_id=load_id)
                    loaded_ct += 1
            return {"count": loaded_ct}

        start = time()
        resources = reader.load_file(source)

        print '\nLOADING RESOURCES ({0})'.format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + '.relations'
        elapsed = (time() - start)
        print 'time to parse {0} resources = {1}'.format(file_name, elapsed)
        results = self.resource_list_to_entities(
            resources,
            archesjson,
            appending,
            filename=os.path.basename(source))
        if os.path.exists(relationships_file):
            with open(relationships_file, "rb") as openf:
                lines = openf.readlines()
                if "," in lines[0]:
                    delim = ","
                elif "|" in lines[0]:
                    delim = "|"
                else:
                    delim = ","
            relationships = csv.DictReader(open(relationships_file, 'r'),
                                           delimiter=delim)
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship,
                                          results['legacyid_to_entityid'],
                                          archesjson))
        else:
            print 'No relationship file'

        return results

        #self.se.bulk_index(self.resources)

    # def resource_list_chunk_to_entities():

    def resource_list_to_entities(self,
                                  resource_list,
                                  archesjson=False,
                                  append=False,
                                  filename='',
                                  load_id=None):
        '''Takes a collection of imported resource records and saves them as arches entities'''
        start = time()
        d = datetime.datetime.now()

        if load_id is None:
            load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
                d.year, d.month, d.day, d.hour, d.minute, d.microsecond
            )  #Should we append the timestamp to the exported filename?

        ret = {
            'successfully_saved': 0,
            'failed_to_save': [],
            'load_id': load_id
        }
        schema = None
        current_entitiy_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250

        def chunks(l, n):
            """Yield successive n-sized chunks from l. Thanks to:
            https://stackoverflow.com/a/312464/3873885"""
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        elapsed = 0
        chunktimes = list()
        for m, resource_list_chunk in enumerate(
                chunks(resource_list, progress_interval)):
            startchunk = time()
            multiplier = m + 1
            with transaction.atomic():
                for count, resource in enumerate(resource_list_chunk):
                    real_ct = count + 1
                    if not archesjson:
                        if current_entity_type != resource.entitytypeid:
                            schema = Resource.get_mapping_schema(
                                resource.entitytypeid)
                            current_entity_type = resource.entitytypeid

                        master_graph = self.build_master_graph(
                            resource, schema)
                        self.pre_save(master_graph)

                        try:
                            uuid.UUID(resource.resource_id)
                            entityid = resource.resource_id
                        except ValueError:
                            entityid = ''

                        if append:
                            try:
                                resource_to_delete = Resource(entityid)
                                resource_to_delete.delete_index()
                            except ObjectDoesNotExist:
                                print 'Entity ', entityid, ' does not exist. Nothing to delete'

                        try:
                            master_graph.save(user=self.user,
                                              note=load_id,
                                              resource_uuid=entityid)
                        except Exception as e:
                            print 'Could not save resource {}.\nERROR: {}'.format(
                                master_graph.entityid, e)
                            ret['failed_to_save'].append(master_graph.entityid)
                            continue
                        resource.entityid = master_graph.entityid
                        #new_resource = Resource().get(resource.entityid)
                        #assert new_resource == master_graph
                        try:
                            master_graph.index()
                        except Exception as e:
                            print 'Could not index resource {}.\nERROR: {}'.format(
                                resource.entityid, e)
                        legacyid_to_entityid[
                            resource.resource_id] = master_graph.entityid
                    else:
                        new_resource = Resource(resource)
                        try:
                            new_resource.save(
                                user=self.user,
                                note=load_id,
                                resource_uuid=new_resource.entityid)
                        except Exception as e:
                            print 'Could not save resource {}.\nERROR: {}'.format(
                                resource['entityid'], e)
                            ret['failed_to_save'].append(resource['entityid'])
                            # with open(resource['entityid'] + ".json", "wb") as f:
                            #     json.dump(resource, f, indent=1)
                            continue
                        new_resource = Resource().get(new_resource.entityid)
                        try:
                            new_resource.index()
                        except Exception as e:
                            print 'Could not index resource {}.\nERROR: {}'.format(
                                resource.entityid, e)
                        legacyid_to_entityid[
                            new_resource.entityid] = new_resource.entityid

                    ret['successfully_saved'] += 1
            endchunk = time() - startchunk

            chunktimes.append(endchunk)
            chunktime_avg = sum(chunktimes) / len(chunktimes)
            remtime = ((len(resource_list) -
                        (multiplier * progress_interval)) * chunktime_avg /
                       progress_interval) / 60
            if real_ct == progress_interval:
                print "{} of {} loaded in {}m. remaining time estimate: {}m".format(
                    progress_interval * multiplier, len(resource_list),
                    round(sum(chunktimes) / 60, 2), round(remtime, 2))

            else:
                # final partial chunk: rows from completed chunks plus this chunk's rows
                print progress_interval * m + real_ct

        ret['legacyid_to_entityid'] = legacyid_to_entityid
        elapsed = (time() - start)
        print len(resource_list), 'resources loaded'
        if len(resource_list) > 0:
            print 'total time to etl = %s' % (elapsed)
            print 'average time per entity = %s' % (elapsed /
                                                    len(resource_list))
            print 'Load Identifier =', load_id
            print '***You can reverse this load with the following command:'
            print 'python manage.py packages -o remove_resources --load_id', load_id
            log_msg = "\n~~~~~\n{}\nfile: {}\nresources: {}\nloadid: {}".format(
                d.strftime("%d/%m/%Y - %H:%M"), filename, len(resource_list),
                load_id)
            with open(settings.BULK_UPLOAD_LOG_FILE, "a") as loadlog:
                loadlog.write(log_msg)
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype,
                                           schema[row.attributename]['steps'],
                                           row.attributename,
                                           row.attributevalue)
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph is None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[
                    row.attributename]['mergenodeid']
                has_merge_in_path = 0
                new_merge_node = None
                for ent in entity_data2:
                    for step in ent.flatten():
                        if step.entitytypeid == node_type_to_merge_at:
                            has_merge_in_path += 1
                            break
                for ent in mapping_graph.flatten():
                    if ent.entitytypeid == node_type_to_merge_at and ent.value != '':
                        new_merge_node = schema[node_type_to_merge_at][
                            'mergenodeid']
                if has_merge_in_path != len(entity_data2):
                    # Merge node is not in path of each node - so will merge in at root.
                    master_graph.merge_at(mapping_graph,
                                          mapping_graph.entitytypeid)
                elif new_merge_node:
                    # Merge node is a value node - so will merge one node up
                    master_graph.merge_at(mapping_graph, new_merge_node)
                else:
                    master_graph.merge_at(mapping_graph, node_type_to_merge_at)
        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship['START_DATE'] in (
            '', 'None') else relationship['START_DATE']
        end_date = None if relationship['END_DATE'] in (
            '', 'None') else relationship['END_DATE']

        if not archesjson:
            relationshiptype_concept = Concepts.objects.get(
                legacyoid=relationship['RELATION_TYPE'])
            concept_value = Values.objects.filter(
                conceptid=relationshiptype_concept.conceptid).filter(
                    valuetype='prefLabel')
            entityid1 = legacyid_to_entityid[relationship['RESOURCEID_FROM']]
            if relationship['RESOURCEID_TO'] in legacyid_to_entityid:
                entityid2 = legacyid_to_entityid[relationship['RESOURCEID_TO']]
            else:
                # If the id is not in the dictionary, it is likely a UUID for a previously existing resource
                entityid2 = relationship['RESOURCEID_TO']

        else:
            concept_value = Values.objects.filter(
                valueid=relationship['RELATION_TYPE'])
            entityid1 = relationship['RESOURCEID_FROM']
            entityid2 = relationship['RESOURCEID_TO']

        if len(concept_value) == 0:
            concept = Concepts.objects.get(
                conceptid=relationship['RELATION_TYPE'])
            concept_value = Values.objects.filter(conceptid=concept)

        related_resource_record = RelatedResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship['NOTES'],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date)

        related_resource_record.save()
        self.se.index_data(index='resource_relations',
                           doc_type='all',
                           body=model_to_dict(related_resource_record),
                           idfield='resourcexid')
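
The loader above batches resources into fixed-size chunks, wraps each chunk in
transaction.atomic() so a failure rolls back at most one chunk, and uses the
running average chunk time to estimate the time remaining. A minimal,
standalone sketch of that pattern follows; save_row and the integer data are
placeholders, not part of the loader:

from time import time

def chunks(l, n):
    """Yield successive n-sized chunks from l."""
    for i in range(0, len(l), n):
        yield l[i:i + n]

def load_in_chunks(rows, chunk_size=250, save_row=lambda r: None):
    chunk_times = []
    for m, chunk in enumerate(chunks(rows, chunk_size)):
        start = time()
        for row in chunk:
            save_row(row)  # stand-in for the per-resource save/index work
        chunk_times.append(time() - start)
        avg = sum(chunk_times) / len(chunk_times)
        loaded = min((m + 1) * chunk_size, len(rows))
        remaining_rows = len(rows) - loaded
        print("{0} of {1} loaded, ~{2:.2f}m remaining".format(
            loaded, len(rows), remaining_rows * avg / chunk_size / 60))

load_in_chunks(list(range(1000)))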
Exemple #33
0
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option('--source',
                    action='store',
                    dest='source',
                    default='',
                    help='.arches file containing resource records'),
        make_option(
            '--format',
            action='store_true',
            default='arches',
            help='format extension that you would like to load: arches or shp'
        ),
    )

    def load(self, source):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == '.shp':
            reader = ShapeReader()
        elif file_format == '.arches':
            reader = ArchesReader()
            print '\nVALIDATING ARCHES FILE ({0})'.format(source)
            reader.validate_file(source)
        elif file_format == '.json':
            archesjson = True
            reader = JsonReader()
        else:
            raise Exception('Unsupported file format: {0}'.format(file_format))

        start = time()
        resources = reader.load_file(source)

        print '\nLOADING RESOURCES ({0})'.format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + '.relations'
        elapsed = (time() - start)
        print 'time to parse {0} = {1}s'.format(file_name, elapsed)
        results = self.resource_list_to_entities(resources, archesjson)
        if os.path.exists(relationships_file):
            relationships = csv.DictReader(open(relationships_file, 'r'),
                                           delimiter='|')
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship,
                                          results['legacyid_to_entityid'],
                                          archesjson))
        else:
            print 'No relationship file'

        #self.se.bulk_index(self.resources)

    def resource_list_to_entities(self, resource_list, archesjson=False):
        '''Takes a collection of imported resource records and saves them as arches entities'''

        start = time()
        d = datetime.datetime.now()
        load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
            d.year, d.month, d.day, d.hour, d.minute, d.microsecond
        )  #Should we append the timestamp to the exported filename?

        ret = {'successfully_saved': 0, 'failed_to_save': []}
        schema = None
        current_entity_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250
        for count, resource in enumerate(resource_list):

            if count >= progress_interval and count % progress_interval == 0:
                print count, 'of', len(resource_list), 'loaded'

            if not archesjson:
                if current_entity_type != resource.entitytypeid:
                    schema = Resource.get_mapping_schema(resource.entitytypeid)
                    current_entity_type = resource.entitytypeid

                master_graph = self.build_master_graph(resource, schema)
                self.pre_save(master_graph)

                try:
                    uuid.UUID(resource.resource_id)
                    entityid = resource.resource_id
                except ValueError:
                    entityid = ''

                master_graph.save(user=self.user,
                                  note=load_id,
                                  resource_uuid=entityid)
                master_graph.index()
                resource.entityid = master_graph.entityid
                legacyid_to_entityid[
                    resource.resource_id] = master_graph.entityid

            else:
                new_resource = Resource(resource)
                new_resource.save(user=self.user,
                                  note=load_id,
                                  resource_uuid=new_resource.entityid)
                try:
                    new_resource.index()
                except Exception as e:
                    print 'Could not index resource {}. This may be because the valueid of a concept is not in the database.\nERROR: {}'.format(new_resource.entityid, e)
                legacyid_to_entityid[
                    new_resource.entityid] = new_resource.entityid

            ret['successfully_saved'] += 1

        ret['legacyid_to_entityid'] = legacyid_to_entityid
        elapsed = (time() - start)
        print len(resource_list), 'resources loaded'
        if len(resource_list) > 0:
            print 'total time to etl = %s' % (elapsed)
            print 'average time per entity = %s' % (elapsed /
                                                    len(resource_list))
            print 'Load Identifier =', load_id
            print '***You can reverse this load with the following command:'
            print 'python manage.py packages -o remove_resources --load_id', load_id
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype,
                                           schema[row.attributename]['steps'],
                                           row.attributename,
                                           row.attributevalue)
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph is None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[
                    row.attributename]['mergenodeid']
                master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship['START_DATE'] in (
            '', 'None') else relationship['START_DATE']
        end_date = None if relationship['END_DATE'] in (
            '', 'None') else relationship['END_DATE']

        if not archesjson:
            relationshiptype_concept = Concept.objects.get(
                legacyoid=relationship['RELATION_TYPE'])
            concept_value = Value.objects.filter(
                concept=relationshiptype_concept.conceptid).filter(
                    valuetype='prefLabel')
            entityid1 = legacyid_to_entityid[relationship['RESOURCEID_FROM']]
            entityid2 = legacyid_to_entityid[relationship['RESOURCEID_TO']]

        else:
            concept_value = Value.objects.filter(
                valueid=relationship['RELATION_TYPE'])
            entityid1 = relationship['RESOURCEID_FROM']
            entityid2 = relationship['RESOURCEID_TO']

        related_resource_record = ResourceXResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship['NOTES'],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date,
        )

        related_resource_record.save()
        self.se.index_data(index='resource_relations',
                           doc_type='all',
                           body=model_to_dict(related_resource_record),
                           idfield='resourcexid')
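
relate_resources() in this example reads a '.relations' file delimited by '|'
with the columns RESOURCEID_FROM, RESOURCEID_TO, RELATION_TYPE, START_DATE,
END_DATE and NOTES. A small sketch of producing and reading such a file; every
value below is made up for illustration:

import csv

fieldnames = ['RESOURCEID_FROM', 'RESOURCEID_TO', 'RELATION_TYPE',
              'START_DATE', 'END_DATE', 'NOTES']

# write a one-row example file
with open('example.relations', 'w') as relfile:
    writer = csv.DictWriter(relfile, fieldnames=fieldnames, delimiter='|')
    writer.writeheader()
    writer.writerow({
        'RESOURCEID_FROM': 'LEGACY-1',
        'RESOURCEID_TO': 'LEGACY-2',
        'RELATION_TYPE': 'is related to',
        'START_DATE': '',      # empty dates are treated as None by the loader
        'END_DATE': 'None',
        'NOTES': 'example relationship',
    })

# read it back the way the loader does
with open('example.relations', 'r') as relfile:
    for row in csv.DictReader(relfile, delimiter='|'):
        print("{0} -> {1} ({2})".format(
            row['RESOURCEID_FROM'], row['RESOURCEID_TO'], row['RELATION_TYPE']))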
Exemple #34
0
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option(
            "--source", action="store", dest="source", default="", help=".arches file containing resource records"
        ),
        make_option(
            "--format",
            action="store_true",
            default="arches",
            help="format extension that you would like to load: arches or shp",
        ),
    )

    def load(self, source):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == ".shp":
            reader = ShapeReader()
        elif file_format == ".arches":
            reader = ArchesReader()
            print "\nVALIDATING ARCHES FILE ({0})".format(source)
            reader.validate_file(source)
        elif file_format == ".json":
            archesjson = True
            reader = JsonReader()

        start = time()
        resources = reader.load_file(source)

        print "\nLOADING RESOURCES ({0})".format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + ".relations"
        elapsed = time() - start
        print "time to parse {0} resources = {1}".format(file_name, elapsed)
        results = self.resource_list_to_entities(resources, archesjson)
        if os.path.exists(relationships_file):
            relationships = csv.DictReader(open(relationships_file, "r"), delimiter="|")
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship, results["legacyid_to_entityid"], archesjson)
                )
        else:
            print "No relationship file"

        # self.se.bulk_index(self.resources)

    def resource_list_to_entities(self, resource_list, archesjson=False):
        """Takes a collection of imported resource records and saves them as arches entities"""

        start = time()
        d = datetime.datetime.now()
        load_id = "LOADID:{0}-{1}-{2}-{3}-{4}-{5}".format(
            d.year, d.month, d.day, d.hour, d.minute, d.microsecond
        )  # Should we append the timestamp to the exported filename?

        ret = {"successfully_saved": 0, "failed_to_save": []}
        schema = None
        current_entity_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250
        for count, resource in enumerate(resource_list):

            if count >= progress_interval and count % progress_interval == 0:
                print count, "of", len(resource_list), "loaded"

            if not archesjson:
                if current_entity_type != resource.entitytypeid:
                    schema = Resource.get_mapping_schema(resource.entitytypeid)
                    current_entity_type = resource.entitytypeid

                master_graph = self.build_master_graph(resource, schema)
                self.pre_save(master_graph)

                try:
                    uuid.UUID(resource.resource_id)
                    entityid = resource.resource_id
                except ValueError:
                    entityid = ""

                master_graph.save(user=self.user, note=load_id, resource_uuid=entityid)
                master_graph.index()
                resource.entityid = master_graph.entityid
                legacyid_to_entityid[resource.resource_id] = master_graph.entityid

            else:
                new_resource = Resource(resource)
                new_resource.save(user=self.user, note=load_id, resource_uuid=new_resource.entityid)
                try:
                    new_resource.index()
                except Exception as e:
                    print "Could not index resource {}. This may be because the valueid of a concept is not in the database.\nERROR: {}".format(new_resource.entityid, e)
                legacyid_to_entityid[new_resource.entityid] = new_resource.entityid

            ret["successfully_saved"] += 1

        ret["legacyid_to_entityid"] = legacyid_to_entityid
        elapsed = time() - start
        print len(resource_list), "resources loaded"
        if len(resource_list) > 0:
            print "total time to etl = %s" % (elapsed)
            print "average time per entity = %s" % (elapsed / len(resource_list))
            print "Load Identifier =", load_id
            print "***You can reverse this load with the following command:"
            print "python manage.py packages -o remove_resources --load_id", load_id
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(
                    row.resourcetype, schema[row.attributename]["steps"], row.attributename, row.attributevalue
                )
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph is None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[row.attributename]["mergenodeid"]
                master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship["START_DATE"] in ("", "None") else relationship["START_DATE"]
        end_date = None if relationship["END_DATE"] in ("", "None") else relationship["END_DATE"]

        if not archesjson:
            relationshiptype_concept = Concept.objects.get(legacyoid=relationship["RELATION_TYPE"])
            concept_value = Value.objects.filter(concept=relationshiptype_concept.conceptid).filter(
                valuetype="prefLabel"
            )
            entityid1 = legacyid_to_entityid[relationship["RESOURCEID_FROM"]]
            entityid2 = legacyid_to_entityid[relationship["RESOURCEID_TO"]]

        else:
            concept_value = Value.objects.filter(valueid=relationship["RELATION_TYPE"])
            entityid1 = relationship["RESOURCEID_FROM"]
            entityid2 = relationship["RESOURCEID_TO"]

        related_resource_record = ResourceXResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship["NOTES"],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date,
        )

        related_resource_record.save()
        self.se.index_data(
            index="resource_relations",
            doc_type="all",
            body=model_to_dict(related_resource_record),
            idfield="resourcexid",
        )
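
Assuming a configured Arches project (settings, database and search engine
available), the loader in this example would be driven roughly like this; the
path is illustrative:

loader = ResourceLoader()
loader.load('data/resources.arches')
# a sibling file 'data/resources.relations' is picked up automatically,
# if present, and used to create resource relationships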
Exemple #35
0
    def save(self):
        # index the cross-resource record in Elasticsearch, then persist it via the parent save
        se = SearchEngineFactory().create()
        document = model_to_dict(self)
        se.index_data(index='resource_relations', doc_type='all', body=document, idfield='resourcexid')
        super(ResourceXResource, self).save()
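
Note that this override indexes the document before calling the parent save().
A variant that persists first, so the index never holds a record that failed
to save, might look like this (a sketch, not the library's code):

    def save(self):
        super(ResourceXResource, self).save()
        se = SearchEngineFactory().create()
        se.index_data(index='resource_relations', doc_type='all',
                      body=model_to_dict(self), idfield='resourcexid')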
Exemple #36
0
class BaseIndex(object):
    def __init__(self, index_name=None):
        if index_name is None or index_name == "":
            raise SearchIndexError("Index name is not defined")

        self.se = SearchEngineFactory().create()
        self.index_metadata = None
        self.index_name = index_name

    def prepare_index(self):
        """
        Defines the Elastic Search mapping and settings for an index

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        if self.index_metadata is not None:
            self.se.create_index(index=self.index_name,
                                 body=self.index_metadata)
        else:
            raise SearchIndexError("No index metadata defined.")

    def get_documents_to_index(self, resourceinstance, tiles):
        """
        Gets a document to index into Elastic Search

        Arguments:
        resourceinstance -- resource instance object
        tiles -- list of tiles that make up the resource instance

        Keyword Arguments:
        None

        Return: tuple of (document, document id)
        """

        raise NotImplementedError

    def index_document(self, document=None, id=None):
        """
        Indexes a document into Elastic Search

        Arguments:
        None

        Keyword Arguments:
        document -- the document to index
        id -- the id of the document

        Return: None
        """

        if document is not None and id is not None:
            self.se.index_data(index=self.index_name, body=document, id=id)

    def bulk_index(self,
                   resources=None,
                   resource_type=None,
                   graph_name=None,
                   clear_index=True):
        """
        Indexes a list of documents in bulk to Elastic Search

        Arguments:
        None

        Keyword Arguments:
        resources -- the list of resource instances to index
        resource_type -- the type of resources being indexed
        graph_name -- the name of the graph model that represents the resources being indexed
        clear_index -- True(default) to remove all index records of type "resource_type" before indexing, 
            assumes that a field called "graph_id" exists on the indexed documents

        Return: None
        """

        start = datetime.now()
        q = Query(se=self.se)
        if clear_index:
            term = Term(field="graph_id", term=str(resource_type))
            q.add_query(term)
            q.delete(index=self.index_name, refresh=True)

        q = Query(se=self.se)
        count_before = self.se.count(index=self.index_name, body=q.dsl)

        result_summary = {"database": len(resources), "indexed": 0}
        with self.se.BulkIndexer(batch_size=settings.BULK_IMPORT_BATCH_SIZE,
                                 refresh=True) as indexer:
            for resource in resources:
                tiles = list(
                    models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                if document is not None and doc_id is not None:
                    indexer.add(index=self.index_name,
                                id=doc_id,
                                data=document)

        result_summary["indexed"] = self.se.count(index=self.index_name,
                                                  body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary[
            "indexed"] else "Failed"
        print("Custom Index - %s:" % self.index_name)
        print(
            "    Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds"
            .format(status, graph_name, result_summary["database"],
                    result_summary["indexed"],
                    (datetime.now() - start).seconds))

    def delete_index(self):
        """
        Deletes this index from Elastic Search

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        self.se.delete_index(index=self.index_name)
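
get_documents_to_index() raises NotImplementedError, so BaseIndex is only
useful subclassed. A minimal sketch of a subclass; the class name, mapping
body and field names are illustrative, not part of the example above:

class SampleIndex(BaseIndex):
    def prepare_index(self):
        # define a tiny mapping before delegating to BaseIndex.prepare_index
        self.index_metadata = {"mappings": {"properties": {"tile_count": {"type": "integer"}}}}
        super(SampleIndex, self).prepare_index()

    def get_documents_to_index(self, resourceinstance, tiles):
        # one small document per resource instance, keyed by its primary key
        return ({"tile_count": len(tiles)}, str(resourceinstance.pk))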
Exemple #37
0
class BaseIndex(object):
    def __init__(self, index_name=None):
        if index_name is None or index_name == "":
            raise SearchIndexError("Index name is not defined")

        self.se = SearchEngineFactory().create()
        self.index_metadata = None
        self.index_name = index_name

    def prepare_index(self):
        """
        Defines the Elastic Search mapping and settings for an index

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        if self.index_metadata is not None:
            self.se.create_index(index=self.index_name, body=self.index_metadata)
        else:
            raise SearchIndexError("No index metadata defined.")

    def get_documents_to_index(self, resourceinstance, tiles):
        """
        Gets a document to index into Elastic Search

        Arguments:
        resourceinstance -- resource instance object
        tiles -- list of tiles that make up the resource instance

        Keyword Arguments:
        None

        Return: tuple of (document, document id)
        """

        raise NotImplementedError

    def index_document(self, document=None, id=None):
        """
        Indexes a document into Elastic Search

        Arguments:
        None

        Keyword Arguments:
        document -- the document to index
        id -- the id of the document

        Return: None
        """

        if document is not None and id is not None:
            self.se.index_data(index=self.index_name, body=document, id=id)

    def index_resources(self, resources=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        """
        Indexes a list of resources in bulk to Elastic Search

        Keyword Arguments:
        resources -- the list of resource instances to index
        batch_size -- the number of records to index as a group; the larger the number, the more memory required
        quiet -- Silences the status bar output during certain operations, use in celery operations for example

        Return: None
        """

        start = datetime.now()
        q = Query(se=self.se)
        self.se.refresh(index=self.index_name)
        count_before = self.se.count(index=self.index_name, body=q.dsl)
        result_summary = {"database": len(resources), "indexed": 0}
        if quiet is False:
            bar = pyprind.ProgBar(len(resources), bar_char="█") if len(resources) > 1 else None
        with self.se.BulkIndexer(batch_size=batch_size, refresh=True) as indexer:
            for resource in resources:
                if quiet is False and bar is not None:
                    bar.update(item_id=resource)
                tiles = list(models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                if document is not None and doc_id is not None:
                    indexer.add(index=self.index_name, id=doc_id, data=document)

        self.se.refresh(index=self.index_name)
        result_summary["indexed"] = self.se.count(index=self.index_name, body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print(f"Custom Index - {settings.ELASTICSEARCH_PREFIX}_{self.index_name}")
        print(
            f"    Status: {status}, In Database: {result_summary['database']}, Indexed: {result_summary['indexed']}, Took: {(datetime.now() - start).seconds} seconds"
        )

    def delete_resources(self, resources=None):
        """
        Deletes documents from an index based on the passed in list of resources
        Delete by query, so this is a single operation

        Keyword Arguments:
        resources -- a single resource instance or a list of resource instances
        """

        q = Query(se=self.se)
        if not isinstance(resources, list):
            resourcelist = [resources]
        else:
            resourcelist = resources
        list_of_ids_to_delete = []
        for resource in resourcelist:
            list_of_ids_to_delete.append(resource.pk)
        ids_query = Ids(ids=list_of_ids_to_delete)
        q.add_query(ids_query)
        q.delete(index=self.index_name)

    def delete_index(self):
        """
        Deletes this index from Elastic Search

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        self.se.delete_index(index=self.index_name)

    def reindex(self, graphids=None, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        """
        Reindexes the index. By default this does nothing; it needs to be implemented in a subclass.
        You can pass in a list of graph ids to trigger the reindex; this will loop through all resource instances of each graph type.

            Example subclass command:
            def reindex(self, clear_index=True):
                PARCEL_GRAPHID = "e3c35dca-5e72-11ea-a2d3-dca90488358a"
                super(CustomIndexName, self).reindex(graphids=[PARCEL_GRAPHID], clear_index=clear_index)

        Keyword Arguments:
        graphids -- list of graph ids to trigger the reindex on; all resource instances of each graph id supplied will be indexed
        clear_index -- True(default) to clear all documents out of the index before reindexing begins
        batch_size -- the number of records to index as a group; the larger the number, the more memory required

        Return: None
        """

        if graphids is not None:
            if clear_index:
                self.delete_index()
                self.prepare_index()

            for graphid in graphids:
                resources = Resource.objects.filter(graph_id=graphid)
                self.index_resources(resources=resources, batch_size=batch_size, quiet=quiet)
        else:
            raise NotImplementedError
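
Subclasses of BaseIndex are typically wired up through project settings so
they are created and populated alongside the core indexes. A sketch of such
an entry, with a placeholder module path and index name:

# settings.py (illustrative)
ELASTICSEARCH_CUSTOM_INDEXES = [
    {
        "module": "my_project.search_indexes.sample_index.SampleIndex",
        "name": "my_custom_index",
    }
]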