def test_bulk_delete(self):
    """ Test bulk deleting of documents in Elasticsearch """

    se = SearchEngineFactory().create()
    # se.create_index(index='test')

    for i in range(10):
        x = {
            'id': i,
            'type': 'prefLabel',
            'value': 'test pref label',
        }
        se.index_data(index='test', doc_type='test', body=x, idfield='id', refresh=True)
        y = {
            'id': i + 100,
            'type': 'altLabel',
            'value': 'test alt label',
        }
        se.index_data(index='test', doc_type='test', body=y, idfield='id', refresh=True)

    query = Query(se, start=0, limit=100)
    match = Match(field='type', query='altLabel')
    query.add_query(match)
    query.delete(index='test', refresh=True)

    self.assertEqual(se.es.count(index='test', doc_type='test')['count'], 10)
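# For reference: Query.delete with a Match clause, as used above, presumably wraps
# Elasticsearch's _delete_by_query API. A minimal sketch of the equivalent raw call,
# assuming a stock elasticsearch-py client rather than the Arches SearchEngine wrapper:
from elasticsearch import Elasticsearch

es = Elasticsearch()
es.delete_by_query(
    index='test',
    body={'query': {'match': {'type': 'altLabel'}}},  # the same clause Match(field='type', query='altLabel') builds
    refresh=True,
)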
def index(self):
    """
    Indexes all the necessary items/values of a resource to support search
    """

    if str(self.graph_id) != str(settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list("nodeid", "datatype")
        }
        document, terms = self.get_documents_to_index(
            datatype_factory=datatype_factory, node_datatypes=node_datatypes)
        document["root_ontology_class"] = self.get_root_ontology()
        doc = JSONSerializer().serializeToPython(document)
        se.index_data(index="resources", body=doc, id=self.pk)
        for term in terms:
            se.index_data("terms", body=term["_source"], id=term["_id"])

        for index in settings.ELASTICSEARCH_CUSTOM_INDEXES:
            es_index = import_class_from_string(index["module"])(index["name"])
            document, doc_id = es_index.get_documents_to_index(self, document["tiles"])
            es_index.index_document(document=document, id=doc_id)
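# The custom-index loop above expects each entry in settings.ELASTICSEARCH_CUSTOM_INDEXES
# to carry a dotted path to an index class plus an index name. A minimal sketch of such a
# settings entry -- the dotted path and index name below are hypothetical, not Arches defaults:
ELASTICSEARCH_CUSTOM_INDEXES = [
    {
        "module": "my_project.search_indexes.sample_index.SampleIndex",  # hypothetical class path
        "name": "my-sample-index",                                       # hypothetical index name
    },
]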
def test_delete_by_query(self):
    """ Test deleting documents by query in Elasticsearch """

    se = SearchEngineFactory().create()

    for i in range(10):
        x = {
            'id': i,
            'type': 'prefLabel',
            'value': 'test pref label',
        }
        se.index_data(index='test', body=x, idfield='id', refresh=True)
        y = {
            'id': i + 100,
            'type': 'altLabel',
            'value': 'test alt label',
        }
        se.index_data(index='test', body=y, idfield='id', refresh=True)

    time.sleep(1)

    query = Query(se, start=0, limit=100)
    match = Match(field='type', query='altLabel')
    query.add_query(match)
    query.delete(index='test', refresh=True)

    self.assertEqual(se.count(index='test'), 10)
def index_resources_by_type(resource_types, result_summary):
    """
    Collects and indexes all resources
    """

    for resource_type in resource_types:
        resources = archesmodels.Entities.objects.filter(entitytypeid=resource_type)
        print "Indexing {0} {1} resources".format(len(resources), resource_type[0])
        result_summary[resource_type[0]] = {'database': len(resources), 'indexed': 0}
        errors = []
        for resource in resources:
            try:
                resource = Resource().get(resource.entityid)
                resource.index()
            except Exception as e:
                if e not in errors:
                    errors.append(e)
        if len(errors) > 0:
            print errors[0], ':', len(errors)

    se = SearchEngineFactory().create()
    related_resource_records = archesmodels.RelatedResource.objects.all()
    for related_resource_record in related_resource_records:
        se.index_data(index='resource_relations', doc_type='all',
                      body=model_to_dict(related_resource_record), idfield='resourcexid')

    return result_summary
def index(self):
    """
    Indexes all the necessary items/values of a resource to support search
    """

    if unicode(self.graph_id) != unicode(settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
        }
        document, terms = self.get_documents_to_index(
            datatype_factory=datatype_factory, node_datatypes=node_datatypes)
        document['root_ontology_class'] = self.get_root_ontology()
        se.index_data('resource', self.graph_id,
                      JSONSerializer().serializeToPython(document), id=self.pk)
        for term in terms:
            se.index_data('strings', 'term', term['_source'], id=term['_id'])
def save(self):
    se = SearchEngineFactory().create()
    document = model_to_dict(self)
    se.index_data(index='resource_relations', doc_type='all',
                  body=document, idfield='resourcexid')
    super(ResourceXResource, self).save()
def save(self):
    from arches.app.search.search_engine_factory import SearchEngineFactory
    se = SearchEngineFactory().create()
    if not self.created:
        self.created = datetime.datetime.now()
    self.modified = datetime.datetime.now()
    document = model_to_dict(self)
    se.index_data(index='resource_relations', doc_type='all',
                  body=document, idfield='resourcexid')
    super(ResourceXResource, self).save()
def index(self, scheme=None):
    if self.category == 'label':
        se = SearchEngineFactory().create()
        data = JSONSerializer().serializeToPython(self)
        if scheme == None:
            scheme = self.get_scheme_id()
        if scheme == None:
            raise Exception('Index of label failed. Index type (scheme id) could not be derived from the label.')
        data['top_concept'] = scheme.id
        se.index_data('strings', 'concept', data, 'id')
def index(self, scheme=None):
    if self.category == 'label':
        se = SearchEngineFactory().create()
        data = JSONSerializer().serializeToPython(self)
        if scheme == None:
            scheme = self.get_scheme_id()
        if scheme == None:
            raise Exception('Index of label failed. Index type (scheme id) could not be derived from the label.')
        data['top_concept'] = scheme.id
        se.index_data('strings', 'concept', data, 'id')
def index(self, scheme=None):
    if self.category == 'label':
        se = SearchEngineFactory().create()
        data = JSONSerializer().serializeToPython(self)
        if scheme == None:
            scheme = self.get_scheme_id()
        if scheme == None:
            raise Exception('Index of label failed. Index type (scheme id) could not be derived from the label.')
        se.create_mapping('concept_labels', scheme.id, fieldname='conceptid',
                          fieldtype='string', fieldindex='not_analyzed')
        se.index_data('concept_labels', scheme.id, data, 'id')
        # don't create terms for entity type concepts
        if not (scheme.id == '00000000-0000-0000-0000-000000000003' or
                scheme.id == '00000000-0000-0000-0000-000000000004'):
            se.index_term(self.value, self.id, scheme.id, {'conceptid': self.conceptid})
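# create_mapping(..., fieldtype='string', fieldindex='not_analyzed') presumably issues a
# legacy (pre-5.x) Elasticsearch mapping so conceptid is stored for exact matching. A sketch
# of the mapping body such a call might send -- the doc_type key and syntax here are
# assumptions based on ES 1.x/2.x conventions, not taken from the wrapper itself:
scheme_id = '00000000-0000-0000-0000-000000000005'  # example scheme id used as the doc_type
mapping = {
    scheme_id: {
        'properties': {
            'conceptid': {
                'type': 'string',        # legacy string type (ES < 5)
                'index': 'not_analyzed'  # exact-value matching, no tokenization
            }
        }
    }
}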
def index(self):
    """
    Indexes all the necessary items/values of a resource to support search
    """

    if unicode(self.graph_id) != unicode(settings.SYSTEM_SETTINGS_RESOURCE_MODEL_ID):
        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
        }
        document, terms = self.get_documents_to_index(
            datatype_factory=datatype_factory, node_datatypes=node_datatypes)
        document['root_ontology_class'] = self.get_root_ontology()
        se.index_data('resource', self.graph_id,
                      JSONSerializer().serializeToPython(document), id=self.pk)
        for term in terms:
            se.index_data('strings', 'term', term['_source'], id=term['_id'])
def index(self):
    """
    Indexes all the necessary items/values of a resource to support search
    """

    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
    }
    document, terms = self.get_documents_to_index(
        datatype_factory=datatype_factory, node_datatypes=node_datatypes)
    se.index_data('resource', self.graph_id,
                  JSONSerializer().serializeToPython(document), id=self.pk)
    for term in terms:
        se.index_data('strings', 'term', term['_source'], id=term['_id'])
def index(self, scheme=None):
    if self.category == 'label':
        se = SearchEngineFactory().create()
        data = JSONSerializer().serializeToPython(self)
        if scheme == None:
            scheme = self.get_scheme_id()
        if scheme == None:
            raise Exception('Index of label failed. Index type (scheme id) could not be derived from the label.')
        se.create_mapping('concept_labels', scheme.id, fieldname='conceptid',
                          fieldtype='string', fieldindex='not_analyzed')
        se.index_data('concept_labels', scheme.id, data, 'id')

        # Looks up whether the label is actually a dropdown label or an entity label
        # and, if so, excludes it from the term search index.
        entity_or_dropdown = archesmodels.ConceptRelations.objects.filter(
            Q(relationtype='hasCollection') | Q(relationtype='hasEntity'),
            conceptidto=scheme.id)
        is_entity_or_dropdown = entity_or_dropdown.count() > 0

        # don't create terms for entity type concepts
        if not (scheme.id == '00000000-0000-0000-0000-000000000003' or
                scheme.id == '00000000-0000-0000-0000-000000000004') and is_entity_or_dropdown == False:
            se.index_term(self.value, self.id, scheme.id, {'conceptid': self.conceptid})
def index(self, documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
    detail = ''
    bulkitems = []
    errorlist = []
    se = SearchEngineFactory().create()
    if not isinstance(documents, list):
        documents = [documents]
    for document in documents:
        #print "inserting document: %s" % (document)
        sys.stdout.write('.')
        if processdoc == None:
            data = document
        else:
            data = processdoc(document)
        id = None
        if getid != None:
            id = getid(document, data)
        try:
            if bulk:
                bulkitem = se.create_bulk_item(index, type, id, data)
                bulkitems.append(bulkitem[0])
                bulkitems.append(bulkitem[1])
            else:
                se.index_data(index, type, data, idfield=idfield, id=id)
        except Exception as detail:
            errorlist.append(id)

    if bulk:
        try:
            se.bulk_index(index, type, bulkitems)
        except Exception as detail:
            errorlist = bulkitems
            print 'bulk insert failed'

    if detail != '':
        print "\n\nException detail: %s " % (detail)
        print "There was a problem indexing the following items:"
        print errorlist
def index_resources_by_type(resource_types, result_summary):
    """
    Collects and indexes all resources
    """

    errors = []
    for resource_type in resource_types:
        resources = archesmodels.Entities.objects.filter(entitytypeid=resource_type)
        print "Indexing {0} {1} resources".format(len(resources), resource_type[0])
        result_summary[resource_type[0]] = {'database': len(resources), 'indexed': 0}

        for resource in resources:
            try:
                resource = Resource().get(resource.entityid)
                resource.index()
            except Exception as e:
                msg = 'Could not index resource {}.\nERROR: {}'.format(resource.entityid, e)
                print msg
                errors.append(msg)  # store the message, not the exception, so the log join below works

    se = SearchEngineFactory().create()
    related_resource_records = archesmodels.RelatedResource.objects.all()
    for related_resource_record in related_resource_records:
        se.index_data(index='resource_relations', doc_type='all',
                      body=model_to_dict(related_resource_record), idfield='resourcexid')

    if len(errors) > 0:
        print "Number of errors:", len(errors)
        log_file = os.path.join(settings.PACKAGE_ROOT, 'logs', 'indexing_errors.txt')
        utils.write_to_file(log_file, '\n'.join(errors), mode="wb")
        print " -- errors written to:", log_file

    return result_summary
def post(self, request, resourceid=None):
    es = Elasticsearch()
    se = SearchEngineFactory().create()
    res = dict(request.POST)
    relationship_type = res['relationship_properties[relationship_type]'][0]
    datefrom = res['relationship_properties[datefrom]'][0]
    dateto = res['relationship_properties[dateto]'][0]
    dateto = None if dateto == '' else dateto
    datefrom = None if datefrom == '' else datefrom
    notes = res['relationship_properties[notes]'][0]
    root_resourceinstanceid = res['root_resourceinstanceid']
    instances_to_relate = []
    relationships_to_update = []
    if 'instances_to_relate[]' in res:
        instances_to_relate = res['instances_to_relate[]']
    if 'relationship_ids[]' in res:
        relationships_to_update = res['relationship_ids[]']

    for instanceid in instances_to_relate:
        rr = models.ResourceXResource.objects.create(
            resourceinstanceidfrom=Resource(root_resourceinstanceid[0]),
            resourceinstanceidto=Resource(instanceid),
            notes=notes,
            relationshiptype=models.Value(relationship_type),
            datestarted=datefrom,
            dateended=dateto,
        )
        document = model_to_dict(rr)
        se.index_data(index='resource_relations', doc_type='all',
                      body=document, idfield='resourcexid')

    for relationshipid in relationships_to_update:
        rr = models.ResourceXResource.objects.get(pk=relationshipid)
        rr.notes = notes
        rr.relationshiptype = models.Value(relationship_type)
        rr.datestarted = datefrom
        rr.dateended = dateto
        rr.save()
        document = model_to_dict(rr)
        se.index_data(index='resource_relations', doc_type='all',
                      body=document, idfield='resourcexid')

    start = request.GET.get('start', 0)
    es.indices.refresh(index="resource_relations")
    return JSONResponse(
        self.get_related_resources(root_resourceinstanceid[0], lang="en-us", start=start, limit=15),
        indent=4)
def add_resource_relation(entityid1, entityid2, relationship_type_string):
    # find the relationship type
    se = SearchEngineFactory().create()
    try:
        logging.warning("finding relationship: %s", relationship_type_string)
        value = models.Values.objects.get(value__icontains=relationship_type_string)
        relationship = models.RelatedResource(entityid1=entityid1,
                                              entityid2=entityid2,
                                              relationshiptype=value.pk)
        relationship.save()
        se.index_data(index='resource_relations', doc_type='all',
                      body=model_to_dict(relationship), idfield='resourcexid')
        logging.warning("Added relationship")
    except Exception as e:
        logging.warning("Unable to create relation %s to %s. %s", entityid1, entityid2, e)
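# A minimal usage sketch for add_resource_relation, assuming two already-saved entities and
# a relationship-type label that matches a Values row. The UUIDs and label are illustrative only:
from_id = '11111111-1111-1111-1111-111111111111'  # hypothetical entity id
to_id = '22222222-2222-2222-2222-222222222222'    # hypothetical entity id
add_resource_relation(from_id, to_id, 'is related to')  # saves and indexes a RelatedResource row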
def index(self):
    """
    Indexes all the necessary documents related to resources to support the map, search, and reports
    """

    se = SearchEngineFactory().create()
    search_documents = self.prepare_documents_for_search_index()
    for document in search_documents:
        se.index_data('resource', self.resourceinstance.graph_id,
                      document, id=self.resourceinstance_id)
        for term in self.prepare_terms_for_search_index():
            term_id = '%s_%s' % (str(self.tileid), str(term['nodeid']))
            se.delete_terms(term_id)
            se.index_term(term['term'], term_id, term['context'], term['options'])
def index(self):
    """
    Indexes all the necessary documents related to resources to support the map, search, and reports
    """

    se = SearchEngineFactory().create()

    search_documents = self.prepare_documents_for_search_index()
    for document in search_documents:
        se.index_data('entity', self.entitytypeid, document, id=self.entityid)

        report_documents = self.prepare_documents_for_report_index(geom_entities=document['geometries'])
        for report_document in report_documents:
            se.index_data('resource', self.entitytypeid, report_document, id=self.entityid)

        geojson_documents = self.prepare_documents_for_map_index(geom_entities=document['geometries'])
        for geojson in geojson_documents:
            se.index_data('maplayers', self.entitytypeid, geojson, idfield='id')

    for term in self.prepare_terms_for_search_index():
        se.index_term(term['term'], term['entityid'], term['context'], term['ewstatus'], term['options'])
def update(self, data, files):
    se = SearchEngineFactory().create()
    related_resources_data = data.get('related-resources', [])
    original_relations = self.resource.get_related_resources()

    if self.resource.entityid == '':
        self.resource.save()

    relationship_ids = []
    for related_resource in related_resources_data:
        relationship_id = related_resource['relationship']['resourcexid']
        relationship_ids.append(relationship_id)
        resource_id = related_resource['relatedresourceid']
        relationship_type_id = related_resource['relationship']['relationshiptype']
        if isinstance(relationship_type_id, dict):
            relationship_type_id = relationship_type_id['value']
        notes = related_resource['relationship']['notes']
        date_started = related_resource['relationship']['datestarted']
        date_ended = related_resource['relationship']['dateended']
        if not relationship_id:
            relationship = self.resource.create_resource_relationship(
                resource_id,
                relationship_type_id=relationship_type_id,
                notes=notes,
                date_started=date_started,
                date_ended=date_ended)
        else:
            relationship = RelatedResource.objects.get(pk=relationship_id)
            relationship.relationshiptype = relationship_type_id
            relationship.notes = notes
            relationship.datestarted = date_started
            relationship.dateended = date_ended
            relationship.save()
            se.delete(index='resource_relations', doc_type='all', id=relationship_id)
            se.index_data(index='resource_relations', doc_type='all',
                          body=model_to_dict(relationship), idfield='resourcexid')

    for relatedentity in original_relations:
        if relatedentity['relationship'].resourcexid not in relationship_ids:
            se.delete(index='resource_relations', doc_type='all',
                      id=relatedentity['relationship'].resourcexid)
            relatedentity['relationship'].delete()
def test_delete_by_query(self):
    """ Test deleting documents by query in Elasticsearch """

    se = SearchEngineFactory().create()

    for i in range(10):
        x = {"id": i, "type": "prefLabel", "value": "test pref label"}
        se.index_data(index="test", body=x, idfield="id", refresh=True)
        y = {"id": i + 100, "type": "altLabel", "value": "test alt label"}
        se.index_data(index="test", body=y, idfield="id", refresh=True)

    time.sleep(1)

    query = Query(se, start=0, limit=100)
    match = Match(field="type", query="altLabel")
    query.add_query(match)
    query.delete(index="test", refresh=True)

    self.assertEqual(se.count(index="test"), 10)
def index(self):
    """
    Indexes all the necessary items/values of a resource to support search
    """

    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
    }
    document, terms = self.get_documents_to_index(
        datatype_factory=datatype_factory, node_datatypes=node_datatypes)
    se.index_data('resource', self.graph_id,
                  JSONSerializer().serializeToPython(document), id=self.pk)
    for term in terms:
        se.index_data('strings', 'term', term['_source'], id=term['_id'])
def index(self):
    """
    Indexes all the necessary documents related to resources to support the map, search, and reports
    """

    se = SearchEngineFactory().create()

    search_documents = self.prepare_documents_for_search_index()
    for document in search_documents:
        se.index_data('entity', self.entitytypeid, document, id=self.entityid)

        report_documents = self.prepare_documents_for_report_index(geom_entities=document['geometries'])
        for report_document in report_documents:
            se.index_data('resource', self.entitytypeid, report_document, id=self.entityid)

        geojson_documents = self.prepare_documents_for_map_index(geom_entities=document['geometries'])
        for geojson in geojson_documents:
            se.index_data('maplayers', self.entitytypeid, geojson, idfield='id')

    for term in self.prepare_terms_for_search_index():
        se.index_term(term['term'], term['entityid'], term['context'], term['options'])
def index(documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
    print 'index_concepts.index'
    detail = ''
    bulkitems = []
    errorlist = []
    se = SearchEngineFactory().create()
    if not isinstance(documents, list):
        documents = [documents]
    for document in documents:
        sys.stdout.write('.')
        if processdoc == None:
            data = document
        else:
            data = processdoc(document)
        id = None
        if getid != None:
            id = getid(document, data)
        try:
            if bulk:
                bulkitem = se.create_bulk_item(index, type, id, data)
                bulkitems.append(bulkitem[0])
                bulkitems.append(bulkitem[1])
            else:
                se.index_data(index, type, data, idfield=idfield, id=id)
                #se.index_data('concept_labels', '00000000-0000-0000-0000-000000000005', data, 'id')
                for concept in data['labels']:
                    #se.index_term(concept['label'], concept['labelid'], '00000000-0000-0000-0000-000000000005', settings.PUBLISHED_LABEL, {'conceptid': data['conceptid']})
                    if concept['label'].strip(' \t\n\r') != '':
                        already_indexed = False
                        count = 1
                        ids = [id]
                        try:
                            # Derive a deterministic term-document id from the label/concept pair.
                            _id = uuid.uuid3(uuid.NAMESPACE_DNS,
                                             '%s%s' % (hash(concept['label']), hash(data['conceptid'])))
                            result = se.es.get(index='term', doc_type='value', id=_id, ignore=404)
                            #print 'result: %s' % result
                            if result['found'] == True:
                                ids = result['_source']['ids']
                                if id not in ids:
                                    ids.append(id)
                            else:
                                ids = [id]
                            if data['context'] != '00000000-0000-0000-0000-000000000003' and data['context'] != '00000000-0000-0000-0000-000000000004':
                                se.index_data('term', 'value',
                                              {'term': concept['label'],
                                               'context': data['context'],
                                               'ewstatus': settings.PUBLISHED_LABEL,
                                               'options': {'conceptid': data['conceptid']},
                                               'count': len(ids),
                                               'ids': ids},
                                              id=_id)
                        except Exception as detail:
                            raise detail
        except Exception as detail:
            print detail
            errorlist.append(id)

    if bulk:
        try:
            se.bulk_index(index, type, bulkitems)
        except Exception as detail:
            errorlist = bulkitems
            print 'bulk insert failed'

    if detail != '':
        print "\n\nException detail: %s " % (detail)
        print "There was a problem indexing the following items:"
        print errorlist
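# The uuid.uuid3 call above derives a stable term-document id from the (label, conceptid)
# pair, so re-running a load updates the same Elasticsearch document and grows its 'ids'
# list instead of duplicating terms. A standalone sketch of that scheme (the function
# name is illustrative):
import uuid

def term_doc_id(label, conceptid):
    # Name-based UUID from the pair's hashes: identical input, identical _id.
    # Caveat: hash() is only stable within one process under Python 3's hash
    # randomization, so this derivation is reliable only on Python 2 or with
    # PYTHONHASHSEED fixed.
    return uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(label), hash(conceptid)))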
def main():
    sql = """
        ALTER TABLE concepts.concepts ALTER COLUMN conceptid DROP DEFAULT;
        ALTER TABLE concepts.concepts ALTER COLUMN legacyoid SET NOT NULL;
        ALTER TABLE concepts.concepts DROP CONSTRAINT IF EXISTS unique_concepts_legacyoid;
        ALTER TABLE concepts.concepts ADD CONSTRAINT unique_concepts_legacyoid UNIQUE (legacyoid);

        CREATE OR REPLACE VIEW ontology.vw_export_nodes AS
        SELECT foo.assettype,
            foo.node AS label,
            (foo.assettype || ':'::text) || foo.node AS id,
            foo.mergenodeid AS mergenode,
            foo.businesstable AS businesstablename
        FROM (
            SELECT m.entitytypeidfrom AS assettype,
                r.entitytypedomain AS node,
                m.mergenodeid,
                (SELECT entity_types.businesstablename
                 FROM data.entity_types
                 WHERE entity_types.entitytypeid = r.entitytypedomain) AS businesstable
            FROM ontology.mapping_steps ms
            JOIN ontology.mappings m ON m.mappingid = ms.mappingid
            JOIN ontology.rules r ON r.ruleid = ms.ruleid
            UNION
            SELECT m.entitytypeidfrom,
                r.entitytyperange AS node,
                m.mergenodeid,
                (SELECT entity_types.businesstablename
                 FROM data.entity_types
                 WHERE entity_types.entitytypeid = r.entitytyperange) AS businesstable
            FROM ontology.mapping_steps ms
            JOIN ontology.mappings m ON m.mappingid = ms.mappingid
            JOIN ontology.rules r ON r.ruleid = ms.ruleid) foo
        WHERE (foo.node <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text]))
            AND foo.node <> foo.assettype
        ORDER BY foo.assettype, foo.node;

        ALTER TABLE ontology.vw_export_nodes OWNER TO postgres;

        CREATE OR REPLACE VIEW ontology.vw_export_edges AS
        SELECT m.entitytypeidfrom AS assettype,
            (m.entitytypeidfrom || ':'::text) || r.entitytypedomain AS source,
            (m.entitytypeidfrom || ':'::text) || r.entitytyperange AS target,
            r.propertyid AS label
        FROM ontology.mapping_steps ms
        JOIN ontology.mappings m ON m.mappingid = ms.mappingid
        JOIN ontology.rules r ON r.ruleid = ms.ruleid
        WHERE (m.entitytypeidfrom <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text]))
            AND (r.entitytypedomain <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text]))
            AND (r.entitytyperange <> ALL (ARRAY['ARCHES_RECORD.E31'::text, 'CREATION_EVENT.E65'::text, 'UPDATE_EVENT.E65'::text, 'COMPILER.E82'::text, 'COMPILER_PERSON.E21'::text, 'REFERENCE_NUMBER_(INTERNAL).E42'::text, 'TIME-SPAN_UPDATE_EVENT.E52'::text, 'TIME-SPAN_CREATION_EVENT.E52'::text, 'DATE_OF_COMPILATION.E50'::text, 'DATE_OF_LAST_UPDATE.E50'::text]))
            AND m.entitytypeidto = r.entitytyperange
        ORDER BY m.entitytypeidfrom;

        ALTER TABLE ontology.vw_export_edges OWNER TO postgres;

        INSERT INTO concepts.d_valuetypes
        SELECT 'sortorder', 'undefined', null, 'arches', 'text'
        WHERE NOT EXISTS (SELECT 1 FROM concepts.d_valuetypes WHERE valuetype = 'sortorder');

        CREATE OR REPLACE FUNCTION concepts.concpets_ins()
          RETURNS trigger AS
        $BODY$
        DECLARE
            v_uuid uuid = public.uuid_generate_v1mc();
        BEGIN
            -- Provides CONCEPTID for RDM inserts and cases where ETL conceptid is not a UUID
            IF NEW.CONCEPTID IS NULL THEN
                NEW.CONCEPTID := v_uuid;
            END IF;

            -- Supports RDM where no conceptid or legacyoid is fed in
            IF NEW.CONCEPTID IS NULL AND (NEW.LEGACYOID IS NULL OR NEW.LEGACYOID = '') THEN
                NEW.LEGACYOID = v_uuid::text;
            END IF;

            -- I would assume that the two cases below are handled in python code by being explicit
            -- about insert values for both columns... just coding defensively here. ABL.
            -- Supports where ETL-provided conceptid is a UUID and will be kept, but no LEGACYOID provided.
            IF NEW.CONCEPTID IS NOT NULL and (NEW.LEGACYOID is null or NEW.LEGACYOID = '') THEN
                NEW.LEGACYOID = NEW.CONCEPTID::text;
            END IF;

            -- Supports where ETL'ed conceptid is not a UUID. Populates original conceptid as LEGACYOID.
            IF NEW.LEGACYOID IS NOT NULL OR NEW.LEGACYOID != '' then
                NEW.LEGACYOID = NEW.LEGACYOID;
            END IF;

            RETURN NEW;
        END$BODY$
          LANGUAGE plpgsql VOLATILE
          COST 100;
        ALTER FUNCTION concepts.concpets_ins() OWNER TO postgres;

        -- Trigger: concepts_ins_tgr on concepts.concepts
        DROP TRIGGER IF EXISTS concepts_ins_tgr ON concepts.concepts;
        CREATE TRIGGER concepts_ins_tgr
          BEFORE INSERT
          ON concepts.concepts
          FOR EACH ROW
          EXECUTE PROCEDURE concepts.concpets_ins();"""

    with transaction.atomic():
        #import arches.management.patches.upgrade_to_v3_0_4
        cursor = connection.cursor()
        cursor.execute(sql)

        anonymous_user, created = User.objects.get_or_create(username='******')
        if created:
            anonymous_user.set_password('')
        read_group, created = Group.objects.get_or_create(name='read')
        anonymous_user.groups.add(read_group)

        edit_group, created = Group.objects.get_or_create(name='edit')
        admin_user = User.objects.get(username='******')
        admin_user.groups.add(edit_group)
        admin_user.groups.add(read_group)

        print '\nINSTALLING PYSHP MODULE'
        print '-----------------------'
        pip.main(['install', 'pyshp'])

        print '\nUPDATING ENTITY INDEX'
        print '---------------------'
        # Add numbers array to resources that do not have them.
        # Move numbers data from child_entities to the numbers array in the index.
        resourceid_sql = "SELECT entityid FROM data.entities WHERE entitytypeid IN (SELECT distinct(entitytypeid) FROM data.entity_types WHERE isresource = True);"
        cursor.execute(resourceid_sql)
        resourceids = []
        for val in cursor.fetchall():
            resourceids.append(val[0])

        start = time.time()
        records = 0
        se = SearchEngineFactory().create()
        for resourceid in resourceids:
            indexed_resource = se.search(index='entity', id=resourceid)
            if 'numbers' not in indexed_resource['_source']:
                indexed_resource['_source']['numbers'] = []

            # Iterate over a copy so items can be removed from the original list safely.
            for child_entity in list(indexed_resource['_source']['child_entities']):
                if child_entity['businesstablename'] == 'numbers':
                    indexed_resource['_source']['numbers'].append(child_entity)
                    indexed_resource['_source']['child_entities'].remove(child_entity)

            ## Reindex resource here.
            se.index_data(index='entity', doc_type=indexed_resource['_type'],
                          body=indexed_resource['_source'], id=indexed_resource['_id'])
            records += 1
            # if records % 500 == 0:
            #     print '%s records processed' % str(records)

        print '%s records updated' % str(records)
        # print 'Patch took %s seconds to run.' % str(time.time() - start)

    print "\npatch '%s' successfully applied." % __name__
def index(documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
    print 'index_concepts.index'
    detail = ''
    bulkitems = []
    errorlist = []
    se = SearchEngineFactory().create()
    if not isinstance(documents, list):
        documents = [documents]
    for document in documents:
        sys.stdout.write('.')
        if processdoc == None:
            data = document
        else:
            data = processdoc(document)
        id = None
        if getid != None:
            id = getid(document, data)
        try:
            if bulk:
                bulkitem = se.create_bulk_item(index, type, id, data)
                bulkitems.append(bulkitem[0])
                bulkitems.append(bulkitem[1])
            else:
                se.index_data(index, type, data, idfield=idfield, id=id)
                #se.index_data('concept_labels', '00000000-0000-0000-0000-000000000005', data, 'id')
                for concept in data['labels']:
                    #se.index_term(concept['label'], concept['labelid'], '00000000-0000-0000-0000-000000000005', settings.PUBLISHED_LABEL, {'conceptid': data['conceptid']})
                    if concept['label'].strip(' \t\n\r') != '':
                        already_indexed = False
                        count = 1
                        ids = [id]
                        try:
                            # Derive a deterministic term-document id from the label/concept pair.
                            _id = uuid.uuid3(uuid.NAMESPACE_DNS,
                                             '%s%s' % (hash(concept['label']), hash(data['conceptid'])))
                            result = se.es.get(index='term', doc_type='value', id=_id, ignore=404)
                            #print 'result: %s' % result
                            if result['found'] == True:
                                ids = result['_source']['ids']
                                if id not in ids:
                                    ids.append(id)
                            else:
                                ids = [id]
                            if data['context'] != '00000000-0000-0000-0000-000000000003' and data['context'] != '00000000-0000-0000-0000-000000000004':
                                se.index_data('term', 'value',
                                              {'term': concept['label'],
                                               'context': data['context'],
                                               'ewstatus': settings.PUBLISHED_LABEL,
                                               'options': {'conceptid': data['conceptid']},
                                               'count': len(ids),
                                               'ids': ids},
                                              id=_id)
                        except Exception as detail:
                            raise detail
        except Exception as detail:
            print detail
            errorlist.append(id)

    if bulk:
        try:
            se.bulk_index(index, type, bulkitems)
        except Exception as detail:
            errorlist = bulkitems
            print 'bulk insert failed'

    if detail != '':
        print "\n\nException detail: %s " % (detail)
        print "There was a problem indexing the following items:"
        print errorlist
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option('--source', action='store', dest='source', default='',
                    help='.arches file containing resource records'),
        make_option('--format', action='store_true', default='arches',
                    help='format extension that you would like to load: arches or shp'),
    )

    def load(self, source):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == '.shp':
            reader = ShapeReader()
        elif file_format == '.arches':
            reader = ArchesReader()
            print '\nVALIDATING ARCHES FILE ({0})'.format(source)
            reader.validate_file(source)
        elif file_format == '.json':
            archesjson = True
            reader = JsonReader()

        start = time()
        resources = reader.load_file(source)
        print '\nLOADING RESOURCES ({0})'.format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + '.relations'
        elapsed = (time() - start)
        print 'time to parse {0} resources = {1}'.format(file_name, elapsed)
        results = self.resource_list_to_entities(resources, archesjson)
        if os.path.exists(relationships_file):
            relationships = csv.DictReader(open(relationships_file, 'r'), delimiter='|')
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship, results['legacyid_to_entityid'], archesjson))
        else:
            print 'No relationship file'
        #self.se.bulk_index(self.resources)

    def resource_list_to_entities(self, resource_list, archesjson=False):
        '''Takes a collection of imported resource records and saves them as arches entities'''

        start = time()
        d = datetime.datetime.now()
        load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
            d.year, d.month, d.day, d.hour, d.minute, d.microsecond
        )  # Should we append the timestamp to the exported filename?

        ret = {'successfully_saved': 0, 'failed_to_save': []}
        schema = None
        current_entitiy_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250
        for count, resource in enumerate(resource_list):
            if count >= progress_interval and count % progress_interval == 0:
                print count, 'of', len(resource_list), 'loaded'

            if archesjson == False:
                masterGraph = None
                if current_entitiy_type != resource.entitytypeid:
                    schema = Resource.get_mapping_schema(resource.entitytypeid)
                    current_entitiy_type = resource.entitytypeid  # cache the schema per entity type

                master_graph = self.build_master_graph(resource, schema)
                self.pre_save(master_graph)

                try:
                    uuid.UUID(resource.resource_id)
                    entityid = resource.resource_id
                except (ValueError):
                    entityid = ''

                master_graph.save(user=self.user, note=load_id, resource_uuid=entityid)
                master_graph.index()
                resource.entityid = master_graph.entityid
                legacyid_to_entityid[resource.resource_id] = master_graph.entityid
            else:
                new_resource = Resource(resource)
                new_resource.save(user=self.user, note=load_id, resource_uuid=new_resource.entityid)
                try:
                    new_resource.index()
                except:
                    print 'Could not index resource. This may be because the valueid of a concept is not in the database.'
                legacyid_to_entityid[new_resource.entityid] = new_resource.entityid

            ret['successfully_saved'] += 1

        ret['legacyid_to_entityid'] = legacyid_to_entityid
        elapsed = (time() - start)
        print len(resource_list), 'resources loaded'
        if len(resource_list) > 0:
            print 'total time to etl = %s' % (elapsed)
            print 'average time per entity = %s' % (elapsed / len(resource_list))
        print 'Load Identifier =', load_id
        print '***You can reverse this load with the following command:'
        print 'python manage.py packages -o remove_resources --load_id', load_id
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None
        entity_data = []
        if len(entity_data) > 0:
            master_graph = entity_data[0]
            for mapping in entity_data[1:]:
                master_graph.merge(mapping)

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype, schema[row.attributename]['steps'],
                                           row.attributename, row.attributevalue)
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph == None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[row.attributename]['mergenodeid']
                master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship['START_DATE'] in ('', 'None') else relationship['START_DATE']
        end_date = None if relationship['END_DATE'] in ('', 'None') else relationship['END_DATE']

        if archesjson == False:
            relationshiptype_concept = Concepts.objects.get(legacyoid=relationship['RELATION_TYPE'])
            concept_value = Values.objects.filter(conceptid=relationshiptype_concept.conceptid).filter(valuetype='prefLabel')
            entityid1 = legacyid_to_entityid[relationship['RESOURCEID_FROM']]
            entityid2 = legacyid_to_entityid[relationship['RESOURCEID_TO']]
        else:
            concept_value = Values.objects.filter(valueid=relationship['RELATION_TYPE'])
            entityid1 = relationship['RESOURCEID_FROM']
            entityid2 = relationship['RESOURCEID_TO']

        related_resource_record = ResourceXResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship['NOTES'],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date,
        )
        related_resource_record.save()
        self.se.index_data(index='resource_relations', doc_type='all',
                           body=model_to_dict(related_resource_record), idfield='resourcexid')
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option('--source', action='store', dest='source', default='',
                    help='.arches file containing resource records'),
        make_option('--format', action='store_true', default='arches',
                    help='format extension that you would like to load: arches or shp'),
    )

    def load(self, source, appending=False):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == '.shp':
            reader = ShapeReader()
        elif file_format == '.arches':
            reader = ArchesReader()
            print '\nVALIDATING ARCHES FILE ({0})'.format(source)
            # reader.validate_file(source)
        elif file_format == '.json':
            archesjson = True
            reader = JsonReader()
            print '\nVALIDATING JSON FILE ({0})'.format(source)
            reader.validate_file(source)
        elif file_format == '.jsonl':
            archesjson = True
            reader = JsonReader()
            print '\nNO VALIDATION USED ON JSONL FILE ({0})'.format(source)

            d = datetime.datetime.now()
            load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
                d.year, d.month, d.day, d.hour, d.minute, d.microsecond)
            loaded_ct = 0
            with open(source, "rb") as openf:
                lines = openf.readlines()
                for line in lines:
                    resource = json.loads(line)
                    result = self.resource_list_to_entities(
                        [resource], True, False,
                        filename=os.path.basename(source), load_id=load_id)
                    loaded_ct += 1
            return {"count": loaded_ct}

        start = time()
        resources = reader.load_file(source)
        print '\nLOADING RESOURCES ({0})'.format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + '.relations'
        elapsed = (time() - start)
        print 'time to parse {0} resources = {1}'.format(file_name, elapsed)
        results = self.resource_list_to_entities(
            resources, archesjson, appending, filename=os.path.basename(source))
        if os.path.exists(relationships_file):
            with open(relationships_file, "rb") as openf:
                lines = openf.readlines()
                if "," in lines[0]:
                    delim = ","
                elif "|" in lines[0]:
                    delim = "|"
                else:
                    delim = ","
            relationships = csv.DictReader(open(relationships_file, 'r'), delimiter=delim)
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship, results['legacyid_to_entityid'], archesjson))
        else:
            print 'No relationship file'
        return results
        #self.se.bulk_index(self.resources)

    # def resource_list_chunk_to_entities():

    def resource_list_to_entities(self, resource_list, archesjson=False, append=False,
                                  filename='', load_id=None):
        '''Takes a collection of imported resource records and saves them as arches entities'''

        start = time()
        d = datetime.datetime.now()
        if load_id is None:
            load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
                d.year, d.month, d.day, d.hour, d.minute, d.microsecond
            )  # Should we append the timestamp to the exported filename?

        ret = {'successfully_saved': 0, 'failed_to_save': [], 'load_id': load_id}
        schema = None
        current_entitiy_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250

        def chunks(l, n):
            """Yield successive n-sized chunks from l.

            Thanks to: https://stackoverflow.com/a/312464/3873885"""
            for i in xrange(0, len(l), n):
                yield l[i:i + n]

        elapsed = 0
        chunktimes = list()
        for m, resource_list_chunk in enumerate(chunks(resource_list, progress_interval)):
            startchunk = time()
            multiplier = m + 1
            with transaction.atomic():
                for count, resource in enumerate(resource_list_chunk):
                    real_ct = count + 1
                    if archesjson == False:
                        masterGraph = None
                        if current_entitiy_type != resource.entitytypeid:
                            schema = Resource.get_mapping_schema(resource.entitytypeid)
                            current_entitiy_type = resource.entitytypeid

                        master_graph = self.build_master_graph(resource, schema)
                        self.pre_save(master_graph)

                        try:
                            uuid.UUID(resource.resource_id)
                            entityid = resource.resource_id
                        except ValueError:
                            entityid = ''

                        if append:
                            try:
                                resource_to_delete = Resource(entityid)
                                resource_to_delete.delete_index()
                            except ObjectDoesNotExist:
                                print 'Entity ', entityid, ' does not exist. Nothing to delete'

                        try:
                            master_graph.save(user=self.user, note=load_id, resource_uuid=entityid)
                        except Exception as e:
                            print 'Could not save resource {}.\nERROR: {}'.format(master_graph.entityid, e)

                        resource.entityid = master_graph.entityid
                        #new_resource = Resource().get(resource.entityid)
                        #assert new_resource == master_graph
                        try:
                            master_graph.index()
                        except Exception as e:
                            print 'Could not index resource {}.\nERROR: {}'.format(resource.entityid, e)
                        legacyid_to_entityid[resource.resource_id] = master_graph.entityid
                    else:
                        new_resource = Resource(resource)
                        try:
                            new_resource.save(user=self.user, note=load_id,
                                              resource_uuid=new_resource.entityid)
                        except Exception as e:
                            print 'Could not save resource {}.\nERROR: {}'.format(resource['entityid'], e)
                            # with open(resource['entityid'] + ".json", "wb") as f:
                            #     json.dump(resource, f, indent=1)
                            continue
                        new_resource = Resource().get(new_resource.entityid)
                        try:
                            new_resource.index()
                        except Exception as e:
                            print 'Could not index resource {}.\nERROR: {}'.format(resource.entityid, e)
                        legacyid_to_entityid[new_resource.entityid] = new_resource.entityid

                    ret['successfully_saved'] += 1

            endchunk = time() - startchunk
            chunktimes.append(endchunk)
            chunktime_avg = sum(chunktimes) / len(chunktimes)
            remtime = ((len(resource_list) - (multiplier * progress_interval)) * chunktime_avg / progress_interval) / 60
            if real_ct == progress_interval:
                print "{} of {} loaded in {}m. remaining time estimate: {}m".format(
                    progress_interval * multiplier, len(resource_list),
                    round(sum(chunktimes) / 60, 2), round(remtime, 2))
            else:
                print progress_interval * multiplier + real_ct

        ret['legacyid_to_entityid'] = legacyid_to_entityid
        elapsed = (time() - start)
        print len(resource_list), 'resources loaded'
        if len(resource_list) > 0:
            print 'total time to etl = %s' % (elapsed)
            print 'average time per entity = %s' % (elapsed / len(resource_list))
        print 'Load Identifier =', load_id
        print '***You can reverse this load with the following command:'
        print 'python manage.py packages -o remove_resources --load_id', load_id

        log_msg = "\n~~~~~\n{}\nfile: {}\nresources: {}\nloadid: {}".format(
            d.strftime("%d/%m/%Y - %H:%M"), filename, len(resource_list), load_id)
        with open(settings.BULK_UPLOAD_LOG_FILE, "a") as loadlog:
            loadlog.write(log_msg)

        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None
        entity_data = []
        if len(entity_data) > 0:
            master_graph = entity_data[0]
            for mapping in entity_data[1:]:
                master_graph.merge(mapping)

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype, schema[row.attributename]['steps'],
                                           row.attributename, row.attributevalue)
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph == None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[row.attributename]['mergenodeid']
                has_merge_in_path = 0
                new_merge_node = None
                for ent in entity_data2:
                    for step in ent.flatten():
                        if step.entitytypeid == node_type_to_merge_at:
                            has_merge_in_path += 1
                            break
                for ent in mapping_graph.flatten():
                    if ent.entitytypeid == node_type_to_merge_at and ent.value != '':
                        new_merge_node = schema[node_type_to_merge_at]['mergenodeid']

                if has_merge_in_path != len(entity_data2):
                    # Merge node is not in path of each node - so will merge in at root.
                    master_graph.merge_at(mapping_graph, mapping_graph.entitytypeid)
                elif new_merge_node:
                    # Merge node is a value node - so will merge one node up
                    master_graph.merge_at(mapping_graph, new_merge_node)
                else:
                    master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship['START_DATE'] in ('', 'None') else relationship['START_DATE']
        end_date = None if relationship['END_DATE'] in ('', 'None') else relationship['END_DATE']

        if archesjson == False:
            relationshiptype_concept = Concepts.objects.get(legacyoid=relationship['RELATION_TYPE'])
            concept_value = Values.objects.filter(conceptid=relationshiptype_concept.conceptid).filter(valuetype='prefLabel')
            entityid1 = legacyid_to_entityid[relationship['RESOURCEID_FROM']]
            if relationship['RESOURCEID_TO'] in legacyid_to_entityid.keys():
                entityid2 = legacyid_to_entityid[relationship['RESOURCEID_TO']]
            else:
                # If entityid is not in dictionary, likely is a uuid to previously existing resource
                entityid2 = relationship['RESOURCEID_TO']
        else:
            concept_value = Values.objects.filter(valueid=relationship['RELATION_TYPE'])
            entityid1 = relationship['RESOURCEID_FROM']
            entityid2 = relationship['RESOURCEID_TO']
            if len(concept_value) == 0:
                concept = Concepts.objects.get(conceptid=relationship['RELATION_TYPE'])
                concept_value = Values.objects.filter(conceptid=concept)

        related_resource_record = RelatedResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship['NOTES'],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date)
        related_resource_record.save()
        self.se.index_data(index='resource_relations', doc_type='all',
                           body=model_to_dict(related_resource_record), idfield='resourcexid')
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option('--source', action='store', dest='source', default='',
                    help='.arches file containing resource records'),
        make_option('--format', action='store_true', default='arches',
                    help='format extension that you would like to load: arches or shp'),
    )

    def load(self, source):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == '.shp':
            reader = ShapeReader()
        elif file_format == '.arches':
            reader = ArchesReader()
            print '\nVALIDATING ARCHES FILE ({0})'.format(source)
            reader.validate_file(source)
        elif file_format == '.json':
            archesjson = True
            reader = JsonReader()

        start = time()
        resources = reader.load_file(source)
        print '\nLOADING RESOURCES ({0})'.format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + '.relations'
        elapsed = (time() - start)
        print 'time to parse {0} resources = {1}'.format(file_name, elapsed)
        results = self.resource_list_to_entities(resources, archesjson)
        if os.path.exists(relationships_file):
            relationships = csv.DictReader(open(relationships_file, 'r'), delimiter='|')
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship, results['legacyid_to_entityid'], archesjson))
        else:
            print 'No relationship file'
        #self.se.bulk_index(self.resources)

    def resource_list_to_entities(self, resource_list, archesjson=False):
        '''Takes a collection of imported resource records and saves them as arches entities'''

        start = time()
        d = datetime.datetime.now()
        load_id = 'LOADID:{0}-{1}-{2}-{3}-{4}-{5}'.format(
            d.year, d.month, d.day, d.hour, d.minute, d.microsecond
        )  # Should we append the timestamp to the exported filename?

        ret = {'successfully_saved': 0, 'failed_to_save': []}
        schema = None
        current_entitiy_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250
        for count, resource in enumerate(resource_list):
            if count >= progress_interval and count % progress_interval == 0:
                print count, 'of', len(resource_list), 'loaded'

            if archesjson == False:
                masterGraph = None
                if current_entitiy_type != resource.entitytypeid:
                    schema = Resource.get_mapping_schema(resource.entitytypeid)
                    current_entitiy_type = resource.entitytypeid  # cache the schema per entity type

                master_graph = self.build_master_graph(resource, schema)
                self.pre_save(master_graph)

                try:
                    uuid.UUID(resource.resource_id)
                    entityid = resource.resource_id
                except (ValueError):
                    entityid = ''

                master_graph.save(user=self.user, note=load_id, resource_uuid=entityid)
                master_graph.index()
                resource.entityid = master_graph.entityid
                legacyid_to_entityid[resource.resource_id] = master_graph.entityid
            else:
                new_resource = Resource(resource)
                new_resource.save(user=self.user, note=load_id, resource_uuid=new_resource.entityid)
                try:
                    new_resource.index()
                except:
                    print 'Could not index resource. This may be because the valueid of a concept is not in the database.'
                legacyid_to_entityid[new_resource.entityid] = new_resource.entityid

            ret['successfully_saved'] += 1

        ret['legacyid_to_entityid'] = legacyid_to_entityid
        elapsed = (time() - start)
        print len(resource_list), 'resources loaded'
        if len(resource_list) > 0:
            print 'total time to etl = %s' % (elapsed)
            print 'average time per entity = %s' % (elapsed / len(resource_list))
        print 'Load Identifier =', load_id
        print '***You can reverse this load with the following command:'
        print 'python manage.py packages -o remove_resources --load_id', load_id
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None
        entity_data = []
        if len(entity_data) > 0:
            master_graph = entity_data[0]
            for mapping in entity_data[1:]:
                master_graph.merge(mapping)

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype, schema[row.attributename]['steps'],
                                           row.attributename, row.attributevalue)
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph == None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[row.attributename]['mergenodeid']
                master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship['START_DATE'] in ('', 'None') else relationship['START_DATE']
        end_date = None if relationship['END_DATE'] in ('', 'None') else relationship['END_DATE']

        if archesjson == False:
            relationshiptype_concept = Concept.objects.get(legacyoid=relationship['RELATION_TYPE'])
            concept_value = Value.objects.filter(concept=relationshiptype_concept.conceptid).filter(valuetype='prefLabel')
            entityid1 = legacyid_to_entityid[relationship['RESOURCEID_FROM']]
            entityid2 = legacyid_to_entityid[relationship['RESOURCEID_TO']]
        else:
            concept_value = Value.objects.filter(valueid=relationship['RELATION_TYPE'])
            entityid1 = relationship['RESOURCEID_FROM']
            entityid2 = relationship['RESOURCEID_TO']

        related_resource_record = ResourceXResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship['NOTES'],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date,
        )
        related_resource_record.save()
        self.se.index_data(index='resource_relations', doc_type='all',
                           body=model_to_dict(related_resource_record), idfield='resourcexid')
class ResourceLoader(object):
    def __init__(self):
        self.user = User()
        self.user.first_name = settings.ETL_USERNAME
        self.resources = []
        self.se = SearchEngineFactory().create()

    option_list = BaseCommand.option_list + (
        make_option("--source", action="store", dest="source", default="",
                    help=".arches file containing resource records"),
        make_option("--format", action="store_true", default="arches",
                    help="format extension that you would like to load: arches or shp"),
    )

    def load(self, source):
        file_name, file_format = os.path.splitext(source)
        archesjson = False
        if file_format == ".shp":
            reader = ShapeReader()
        elif file_format == ".arches":
            reader = ArchesReader()
            print "\nVALIDATING ARCHES FILE ({0})".format(source)
            reader.validate_file(source)
        elif file_format == ".json":
            archesjson = True
            reader = JsonReader()

        start = time()
        resources = reader.load_file(source)
        print "\nLOADING RESOURCES ({0})".format(source)
        relationships = None
        related_resource_records = []
        relationships_file = file_name + ".relations"
        elapsed = time() - start
        print "time to parse {0} resources = {1}".format(file_name, elapsed)
        results = self.resource_list_to_entities(resources, archesjson)
        if os.path.exists(relationships_file):
            relationships = csv.DictReader(open(relationships_file, "r"), delimiter="|")
            for relationship in relationships:
                related_resource_records.append(
                    self.relate_resources(relationship, results["legacyid_to_entityid"], archesjson))
        else:
            print "No relationship file"
        # self.se.bulk_index(self.resources)

    def resource_list_to_entities(self, resource_list, archesjson=False):
        """Takes a collection of imported resource records and saves them as arches entities"""

        start = time()
        d = datetime.datetime.now()
        load_id = "LOADID:{0}-{1}-{2}-{3}-{4}-{5}".format(
            d.year, d.month, d.day, d.hour, d.minute, d.microsecond
        )  # Should we append the timestamp to the exported filename?

        ret = {"successfully_saved": 0, "failed_to_save": []}
        schema = None
        current_entitiy_type = None
        legacyid_to_entityid = {}
        errors = []
        progress_interval = 250
        for count, resource in enumerate(resource_list):
            if count >= progress_interval and count % progress_interval == 0:
                print count, "of", len(resource_list), "loaded"

            if archesjson == False:
                masterGraph = None
                if current_entitiy_type != resource.entitytypeid:
                    schema = Resource.get_mapping_schema(resource.entitytypeid)
                    current_entitiy_type = resource.entitytypeid  # cache the schema per entity type

                master_graph = self.build_master_graph(resource, schema)
                self.pre_save(master_graph)

                try:
                    uuid.UUID(resource.resource_id)
                    entityid = resource.resource_id
                except (ValueError):
                    entityid = ""

                master_graph.save(user=self.user, note=load_id, resource_uuid=entityid)
                master_graph.index()
                resource.entityid = master_graph.entityid
                legacyid_to_entityid[resource.resource_id] = master_graph.entityid
            else:
                new_resource = Resource(resource)
                new_resource.save(user=self.user, note=load_id, resource_uuid=new_resource.entityid)
                try:
                    new_resource.index()
                except:
                    print "Could not index resource. This may be because the valueid of a concept is not in the database."
                legacyid_to_entityid[new_resource.entityid] = new_resource.entityid

            ret["successfully_saved"] += 1

        ret["legacyid_to_entityid"] = legacyid_to_entityid
        elapsed = time() - start
        print len(resource_list), "resources loaded"
        if len(resource_list) > 0:
            print "total time to etl = %s" % (elapsed)
            print "average time per entity = %s" % (elapsed / len(resource_list))
        print "Load Identifier =", load_id
        print "***You can reverse this load with the following command:"
        print "python manage.py packages -o remove_resources --load_id", load_id
        return ret

    def build_master_graph(self, resource, schema):
        master_graph = None
        entity_data = []
        if len(entity_data) > 0:
            master_graph = entity_data[0]
            for mapping in entity_data[1:]:
                master_graph.merge(mapping)

        for group in resource.groups:
            entity_data2 = []
            for row in group.rows:
                entity = Resource()
                entity.create_from_mapping(row.resourcetype, schema[row.attributename]["steps"],
                                           row.attributename, row.attributevalue)
                entity_data2.append(entity)

            mapping_graph = entity_data2[0]
            for mapping in entity_data2[1:]:
                mapping_graph.merge(mapping)

            if master_graph == None:
                master_graph = mapping_graph
            else:
                node_type_to_merge_at = schema[row.attributename]["mergenodeid"]
                master_graph.merge_at(mapping_graph, node_type_to_merge_at)

        return master_graph

    def pre_save(self, master_graph):
        pass

    def relate_resources(self, relationship, legacyid_to_entityid, archesjson):
        start_date = None if relationship["START_DATE"] in ("", "None") else relationship["START_DATE"]
        end_date = None if relationship["END_DATE"] in ("", "None") else relationship["END_DATE"]

        if archesjson == False:
            relationshiptype_concept = Concept.objects.get(legacyoid=relationship["RELATION_TYPE"])
            concept_value = Value.objects.filter(concept=relationshiptype_concept.conceptid).filter(
                valuetype="prefLabel")
            entityid1 = legacyid_to_entityid[relationship["RESOURCEID_FROM"]]
            entityid2 = legacyid_to_entityid[relationship["RESOURCEID_TO"]]
        else:
            concept_value = Value.objects.filter(valueid=relationship["RELATION_TYPE"])
            entityid1 = relationship["RESOURCEID_FROM"]
            entityid2 = relationship["RESOURCEID_TO"]

        related_resource_record = ResourceXResource(
            entityid1=entityid1,
            entityid2=entityid2,
            notes=relationship["NOTES"],
            relationshiptype=concept_value[0].valueid,
            datestarted=start_date,
            dateended=end_date,
        )
        related_resource_record.save()
        self.se.index_data(
            index="resource_relations",
            doc_type="all",
            body=model_to_dict(related_resource_record),
            idfield="resourcexid",
        )
class BaseIndex(object):
    def __init__(self, index_name=None):
        if index_name is None or index_name == "":
            raise SearchIndexError("Index name is not defined")

        self.se = SearchEngineFactory().create()
        self.index_metadata = None
        self.index_name = index_name

    def prepare_index(self):
        """
        Defines the Elastic Search mapping and settings for an index

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        if self.index_metadata is not None:
            self.se.create_index(index=self.index_name, body=self.index_metadata)
        else:
            raise SearchIndexError("No index metadata defined.")

    def get_documents_to_index(self, resourceinstance, tiles):
        """
        Gets a document to index into Elastic Search

        Arguments:
        resourceinstance -- resource instance object
        tiles -- list of tiles that make up the resource instance

        Keyword Arguments:
        None

        Return: tuple of (document, document id)
        """

        raise NotImplementedError

    def index_document(self, document=None, id=None):
        """
        Indexes a document into Elastic Search

        Arguments:
        None

        Keyword Arguments:
        document -- the document to index
        id -- the id of the document

        Return: None
        """

        if document is not None and id is not None:
            self.se.index_data(index=self.index_name, body=document, id=id)

    def bulk_index(self, resources=None, resource_type=None, graph_name=None, clear_index=True):
        """
        Indexes a list of documents in bulk to Elastic Search

        Arguments:
        None

        Keyword Arguments:
        resources -- the list of resource instances to index
        resource_type -- the type of resources being indexed
        graph_name -- the name of the graph model that represents the resources being indexed
        clear_index -- True(default) to remove all index records of type "resource_type" before indexing;
            assumes that a field called "graph_id" exists on the indexed documents

        Return: None
        """

        start = datetime.now()
        q = Query(se=self.se)
        if clear_index:
            term = Term(field="graph_id", term=str(resource_type))
            q.add_query(term)
            q.delete(index=self.index_name, refresh=True)

        q = Query(se=self.se)
        count_before = self.se.count(index=self.index_name, body=q.dsl)

        result_summary = {"database": len(resources), "indexed": 0}
        with self.se.BulkIndexer(batch_size=settings.BULK_IMPORT_BATCH_SIZE, refresh=True) as indexer:
            for resource in resources:
                tiles = list(models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                if document is not None and doc_id is not None:  # check doc_id, not the builtin id
                    indexer.add(index=self.index_name, id=doc_id, data=document)

        result_summary["indexed"] = self.se.count(index=self.index_name, body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print("Custom Index - %s:" % self.index_name)
        print(
            "    Status: {0}, Resource Type: {1}, In Database: {2}, Indexed: {3}, Took: {4} seconds".format(
                status, graph_name, result_summary["database"], result_summary["indexed"],
                (datetime.now() - start).seconds))

    def delete_index(self):
        """
        Deletes this index from Elastic Search

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        self.se.delete_index(index=self.index_name)
class BaseIndex(object):
    def __init__(self, index_name=None):
        if index_name is None or index_name == "":
            raise SearchIndexError("Index name is not defined")

        self.se = SearchEngineFactory().create()
        self.index_metadata = None
        self.index_name = index_name

    def prepare_index(self):
        """
        Defines the Elastic Search mapping and settings for an index

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        if self.index_metadata is not None:
            self.se.create_index(index=self.index_name, body=self.index_metadata)
        else:
            raise SearchIndexError("No index metadata defined.")

    def get_documents_to_index(self, resourceinstance, tiles):
        """
        Gets a document to index into Elastic Search

        Arguments:
        resourceinstance -- resource instance object
        tiles -- list of tiles that make up the resource instance

        Keyword Arguments:
        None

        Return: tuple of (document, document id)
        """

        raise NotImplementedError

    def index_document(self, document=None, id=None):
        """
        Indexes a document into Elastic Search

        Arguments:
        None

        Keyword Arguments:
        document -- the document to index
        id -- the id of the document

        Return: None
        """

        if document is not None and id is not None:
            self.se.index_data(index=self.index_name, body=document, id=id)

    def index_resources(self, resources=None, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        """
        Indexes a list of resources in bulk to Elastic Search

        Keyword Arguments:
        resources -- the list of resource instances to index
        batch_size -- the number of records to index as a group; the larger the number the more memory required
        quiet -- silences the status bar output during certain operations, use in celery operations for example

        Return: None
        """

        start = datetime.now()
        q = Query(se=self.se)
        self.se.refresh(index=self.index_name)
        count_before = self.se.count(index=self.index_name, body=q.dsl)
        result_summary = {"database": len(resources), "indexed": 0}
        if quiet is False:
            bar = pyprind.ProgBar(len(resources), bar_char="█") if len(resources) > 1 else None
        with self.se.BulkIndexer(batch_size=batch_size, refresh=True) as indexer:
            for resource in resources:
                if quiet is False and bar is not None:
                    bar.update(item_id=resource)
                tiles = list(models.TileModel.objects.filter(resourceinstance=resource))
                document, doc_id = self.get_documents_to_index(resource, tiles)
                if document is not None and doc_id is not None:  # check doc_id, not the builtin id
                    indexer.add(index=self.index_name, id=doc_id, data=document)

        self.se.refresh(index=self.index_name)
        result_summary["indexed"] = self.se.count(index=self.index_name, body=q.dsl) - count_before
        status = "Passed" if result_summary["database"] == result_summary["indexed"] else "Failed"
        print(f"Custom Index - {settings.ELASTICSEARCH_PREFIX}_{self.index_name}")
        print(
            f"    Status: {status}, In Database: {result_summary['database']}, Indexed: {result_summary['indexed']}, Took: {(datetime.now() - start).seconds} seconds"
        )

    def delete_resources(self, resources=None):
        """
        Deletes documents from an index based on the passed in list of resources
        Delete by query, so this is a single operation

        Keyword Arguments:
        resources -- a single resource instance or a list of resource instances
        """

        q = Query(se=self.se)
        if not isinstance(resources, list):
            resourcelist = [resources]
        else:
            resourcelist = resources
        list_of_ids_to_delete = []
        for resource in resourcelist:
            list_of_ids_to_delete.append(resource.pk)
        ids_query = Ids(ids=list_of_ids_to_delete)
        q.add_query(ids_query)
        q.delete(index=self.index_name)

    def delete_index(self):
        """
        Deletes this index from Elastic Search

        Arguments:
        None

        Keyword Arguments:
        None

        Return: None
        """

        self.se.delete_index(index=self.index_name)

    def reindex(self, graphids=None, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        """
        Reindexes the index. By default this does nothing; it needs to be implemented in a subclass.
        You can pass in a list of graph ids to trigger the reindex, which will loop through
        all resource instances of each graph type.

        Example subclass command:
        def reindex(self, clear_index=True):
            PARCEL_GRAPHID = "e3c35dca-5e72-11ea-a2d3-dca90488358a"
            super(CustomIndexName, self).reindex(graphids=[PARCEL_GRAPHID], clear_index=clear_index)

        Keyword Arguments:
        graphids -- list of graph ids to trigger the reindex on; will get all resource instances of each graph id supplied
        clear_index -- True(default) to clear all documents out of the index before reindexing begins
        batch_size -- the number of records to index as a group; the larger the number the more memory required

        Return: None
        """

        if graphids is not None:
            if clear_index:
                self.delete_index()
                self.prepare_index()
            for graphid in graphids:
                resources = Resource.objects.filter(graph_id=graphid)
                self.index_resources(resources=resources, batch_size=batch_size, quiet=quiet)
        else:
            raise NotImplementedError
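# A minimal subclass sketch against the BaseIndex contract above (index_metadata consumed
# by prepare_index, get_documents_to_index returning a (document, doc_id) tuple). The class
# name, mapping body, and graph id below are illustrative assumptions, not Arches defaults:
class SampleIndex(BaseIndex):
    def __init__(self, index_name=None):
        super(SampleIndex, self).__init__(index_name=index_name)
        # Body handed to se.create_index by prepare_index(); an ES 7-style mapping is assumed.
        self.index_metadata = {"mappings": {"properties": {"graph_id": {"type": "keyword"}}}}

    def get_documents_to_index(self, resourceinstance, tiles):
        # Return the (document, doc_id) tuple the bulk indexers expect.
        document = {"graph_id": str(resourceinstance.graph_id), "tile_count": len(tiles)}
        return document, str(resourceinstance.pk)

    def reindex(self, clear_index=True, batch_size=settings.BULK_IMPORT_BATCH_SIZE, quiet=False):
        SAMPLE_GRAPHID = "e3c35dca-5e72-11ea-a2d3-dca90488358a"  # illustrative graph id
        super(SampleIndex, self).reindex(
            graphids=[SAMPLE_GRAPHID], clear_index=clear_index, batch_size=batch_size, quiet=quiet)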