def bulk_save(resources):
    """
    Saves and indexes a list of resources

    Arguments:
    resources -- a list of resource models

    """
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # nodeid (as string) -> datatype name; passed down so the document
    # builder doesn't query the Node table once per node
    node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}
    tiles = []
    documents = []
    term_list = []

    # flatten out the nested tiles into a single array
    # NOTE(review): `itervalues()` is Python 2 only. Extending resource.tiles
    # while the outer loop iterates it means newly-appended child tiles are
    # themselves visited as parents — presumably intentional so arbitrarily
    # deep nesting gets flattened; confirm. Indentation of the block below is
    # reconstructed from a whitespace-mangled source — verify against history.
    for resource in resources:
        for parent_tile in resource.tiles:
            for child_tile in parent_tile.tiles.itervalues():
                if len(child_tile) > 0:
                    resource.tiles.extend(child_tile)
                    parent_tile.tiles = {}
        tiles.extend(resource.tiles)

    # need to save the models first before getting the documents for index
    Resource.objects.bulk_create(resources)
    TileModel.objects.bulk_create(tiles)

    for resource in resources:
        resource.save_edit(edit_type='create')
        # fetchTiles=False because the flattened tiles are already on the model
        document, terms = resource.get_documents_to_index(fetchTiles=False, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
        document['root_ontology_class'] = resource.get_root_ontology()
        documents.append(se.create_bulk_item(index='resource', doc_type=document['graph_id'], id=document['resourceinstanceid'], data=document))
        for term in terms:
            term_list.append(se.create_bulk_item(index='strings', doc_type='term', id=term['_id'], data=term['_source']))

    # record an edit-log entry per tile
    for tile in tiles:
        tile.save_edit(edit_type='tile create', new_value=tile.data)

    # bulk index the resources, tiles and terms
    se.bulk_index(documents)
    se.bulk_index(term_list)
def index(self, documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
    """
    Index one document or a list of documents into the given index/type.
    (Python 2 code — uses print statements.)

    Arguments:
    documents -- a single document or a list of documents to index
    index -- name of the target index
    type -- document type within the index
    idfield -- field of the document used as id when no getid is supplied
    processdoc -- optional callable(document) -> data transform
    getid -- optional callable(document, data) -> document id
    bulk -- when True, queue items and submit one bulk_index call at the end
    """
    detail = ''
    bulkitems = []
    errorlist = []
    se = SearchEngineFactory().create()
    # allow a single document to be passed in
    if not isinstance(documents, list):
        documents = [documents]
    for document in documents:
        #print "inserting document: %s" % (document)
        sys.stdout.write('.')  # progress indicator, one dot per document
        if processdoc == None:
            data = document
        else:
            data = processdoc(document)
        id = None
        if getid != None:
            id = getid(document, data)
        try:
            if bulk:
                # create_bulk_item returns an (action, source) pair; queue both halves
                bulkitem = se.create_bulk_item(index, type, id, data)
                bulkitems.append(bulkitem[0])
                bulkitems.append(bulkitem[1])
            else:
                se.index_data(index, type, data, idfield=idfield, id=id)
        except Exception as detail:
            # remember the failed id; `detail` doubles as the error flag below
            # (this relies on Python 2 keeping the except variable in scope)
            errorlist.append(id)
    if bulk:
        try:
            se.bulk_index(index, type, bulkitems)
        except Exception as detail:
            errorlist = bulkitems
            print 'bulk inset failed'
    # `detail` is only non-empty when an exception was caught above
    if detail != '':
        print "\n\nException detail: %s " % (detail)
        print "There was a problem indexing the following items:"
        print errorlist
def bulk_save(resources):
    """
    Persist a list of resource models in bulk and push them into the search index.

    Arguments:
    resources -- a list of resource models

    """
    search_engine = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # nodeid (string) -> datatype lookup shared by every document build
    node_datatypes = dict(
        (str(nodeid), datatype)
        for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')
    )

    all_tiles = []
    resource_docs = []
    term_docs = []

    # flatten each resource's tile tree onto the model, collecting every tile
    for resource in resources:
        resource.tiles = resource.get_flattened_tiles()
        all_tiles.extend(resource.tiles)

    # the models must exist in the db before index documents can be built
    Resource.objects.bulk_create(resources)
    TileModel.objects.bulk_create(all_tiles)

    for resource in resources:
        resource.save_edit(edit_type='create')
        document, terms = resource.get_documents_to_index(
            fetchTiles=False,
            datatype_factory=datatype_factory,
            node_datatypes=node_datatypes,
        )
        document['root_ontology_class'] = resource.get_root_ontology()
        resource_docs.append(
            search_engine.create_bulk_item(
                index='resource',
                doc_type=document['graph_id'],
                id=document['resourceinstanceid'],
                data=document,
            )
        )
        term_docs.extend(
            search_engine.create_bulk_item(index='strings', doc_type='term', id=term['_id'], data=term['_source'])
            for term in terms
        )

    # record an edit-log entry per tile
    for tile in all_tiles:
        tile.save_edit(edit_type='tile create', new_value=tile.data)

    # submit the resource and term documents in two bulk requests
    search_engine.bulk_index(resource_docs)
    search_engine.bulk_index(term_docs)
def test_bulk_add_documents(self):
    """ Test adding documents to Elasticsearch in bulk """
    engine = SearchEngineFactory().create()
    engine.create_index(index="test")
    documents = []
    count_before = engine.count(index="test")
    # build ten small docs and wrap each in a bulk "index" action
    payloads = [
        {"id": n, "type": "prefLabel", "value": "test pref label"}
        for n in range(10)
    ]
    for payload in payloads:
        documents.append(
            engine.create_bulk_item(op_type="index", index="test", id=payload["id"], data=payload)
        )
    ret = engine.bulk_index(documents, refresh=True)
    count_after = engine.count(index="test")
    # all ten documents should now be countable in the index
    self.assertEqual(count_after - count_before, 10)
def index(documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
    """
    Index concept documents — and each concept's labels — into the search engine.
    (Python 2 code — uses print statements.)

    Arguments:
    documents -- a single document or a list of documents to index
    index -- name of the target index
    type -- document type within the index
    idfield -- field of the document used as id when no getid is supplied
    processdoc -- optional callable(document) -> data transform
    getid -- optional callable(document, data) -> document id
    bulk -- when True, queue items and submit one bulk_index call at the end
    """
    print 'index_concepts.index'
    detail = ''
    bulkitems = []
    errorlist = []
    se = SearchEngineFactory().create()
    # allow a single document to be passed in
    if not isinstance(documents, list):
        documents = [documents]
    for document in documents:
        sys.stdout.write('.')  # progress indicator, one dot per document
        if processdoc == None:
            data = document
        else:
            data = processdoc(document)
        id = None
        if getid != None:
            id = getid(document, data)
        try:
            if bulk:
                # create_bulk_item returns an (action, source) pair; queue both halves
                bulkitem = se.create_bulk_item(index, type, id, data)
                bulkitems.append(bulkitem[0])
                bulkitems.append(bulkitem[1])
            else:
                se.index_data(index, type, data, idfield=idfield, id=id)
                #se.index_data('concept_labels', '00000000-0000-0000-0000-000000000005', data, 'id')
                # also index each non-blank label into the 'term' index
                # NOTE(review): nesting reconstructed from a whitespace-mangled
                # source — this label loop is assumed to run only on the
                # non-bulk path; verify against history
                for concept in data['labels']:
                    #se.index_term(concept['label'], concept['labelid'], '00000000-0000-0000-0000-000000000005', settings.PUBLISHED_LABEL, {'conceptid': data['conceptid']})
                    if concept['label'].strip(' \t\n\r') != '':
                        already_indexed = False
                        count = 1
                        ids = [id]
                        try:
                            # deterministic term id derived from the label + conceptid hashes
                            _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(concept['label']), hash(data['conceptid'])))
                            result = se.es.get(index='term', doc_type='value', id=_id, ignore=404)
                            #print 'result: %s' % result
                            if result['found'] == True:
                                # merge this concept id into the existing term doc
                                ids = result['_source']['ids']
                                if id not in ids:
                                    ids.append(id)
                            else:
                                ids = [id]
                            # NOTE(review): the meaning of these two hard-coded
                            # context uuids (skipped here) is not visible in this
                            # file — confirm before relying on it
                            if data['context'] != '00000000-0000-0000-0000-000000000003' and data['context'] != '00000000-0000-0000-0000-000000000004':
                                se.index_data('term', 'value', {'term': concept['label'], 'context': data['context'], 'ewstatus': settings.PUBLISHED_LABEL, 'options': {'conceptid': data['conceptid']}, 'count': len(ids), 'ids': ids}, id=_id)
                        except Exception as detail:
                            raise detail
        except Exception as detail:
            print detail
            errorlist.append(id)
    if bulk:
        try:
            se.bulk_index(index, type, bulkitems)
        except Exception as detail:
            errorlist = bulkitems
            print 'bulk inset failed'
    # `detail` is only non-empty when an exception was caught above
    if detail != '':
        print "\n\nException detail: %s " % (detail)
        print "There was a problem indexing the following items:"
        print errorlist
def bulk_save(resources, primaryDescriptorsFunctionConfig, graph_nodes):
    """
    Saves and indexes a list of resources

    Arguments:
    resources -- a list of resource models
    primaryDescriptorsFunctionConfig -- config passed through to
        get_documents_to_index (primary-descriptor function settings)
    graph_nodes -- node lookup passed through to get_documents_to_index

    """
    start = time()
    print("saving resource to db")
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # nodeid (string) -> datatype name, shared by every document build
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list(
            "nodeid", "datatype"
        )
    }
    tiles = []
    documents = []
    term_list = []
    start = time()
    # flatten each resource's tile tree onto the model, collecting every tile
    for resource in resources:
        resource.tiles = resource.get_flattened_tiles()
        tiles.extend(resource.tiles)
    print("time to extend tiles: %s" % datetime.timedelta(seconds=time() - start))
    start = time()
    # need to save the models first before getting the documents for index
    Resource.objects.bulk_create(resources)
    TileModel.objects.bulk_create(tiles)
    print(
        "time to bulk create tiles and resources: %s"
        % datetime.timedelta(seconds=time() - start)
    )
    start = time()
    for resource in resources:
        resource.save_edit(edit_type="create")
    # single edit-log entry summarising the whole bulk create
    resources[0].tiles[0].save_edit(
        note=f"bulk created: {len(tiles)} for {len(resources)} resources.", edit_type="bulk_create"
    )
    print(
        "time to save resource edits: %s" % datetime.timedelta(seconds=time() - start)
    )
    start = time()
    # instrumentation accumulators (reported only by the commented prints below)
    time_to_get_docs = 0
    time_to_get_root_ontology = 0
    time_to_create_bulk_docs = 0
    time_to_create_bulk_term_docs = 0
    timers = {"timer": 0, "timer1": 0, "timer2": 0, "timer3": 0, "timer4": 0}
    for resource in resources:
        s = time()
        document, terms = resource.get_documents_to_index(
            fetchTiles=False,
            datatype_factory=datatype_factory,
            node_datatypes=node_datatypes,
            config=primaryDescriptorsFunctionConfig,
            graph_nodes=graph_nodes,
        )
        time_to_get_docs = time_to_get_docs + (time() - s)
        # s = time()
        # #document['root_ontology_class'] = resource.get_root_ontology()
        # time_to_get_root_ontology = time_to_get_root_ontology + (time()-s)
        s = time()
        documents.append(
            se.create_bulk_item(
                index="resources", id=document["resourceinstanceid"], data=document
            )
        )
        time_to_create_bulk_docs = time_to_create_bulk_docs + (time() - s)
        s = time()
        for term in terms:
            term_list.append(
                se.create_bulk_item(
                    index="terms", id=term["_id"], data=term["_source"]
                )
            )
        time_to_create_bulk_term_docs = time_to_create_bulk_term_docs + (time() - s)
    # NOTE(review): the commented-out prints below are missing their closing
    # parentheses — fix before re-enabling any of them
    # print("timer: %s" % datetime.timedelta(seconds=timers['timer'])
    # print("timer1: %s" % datetime.timedelta(seconds=timers['timer1'])
    # print("timer2: %s" % datetime.timedelta(seconds=timers['timer2'])
    # print("timer3: %s" % datetime.timedelta(seconds=timers['timer3'])
    # print("timer4: %s" % datetime.timedelta(seconds=timers['timer4'])
    # print("time to get documents to index: %s" % datetime.timedelta(seconds=time_to_get_docs)
    # print("time to get root ontology: %s" % datetime.timedelta(seconds=time_to_get_root_ontology)
    # print("time to create bulk docs: %s" % datetime.timedelta(seconds=time_to_create_bulk_docs)
    # print("time to create bulk term docs: %s" % datetime.timedelta(seconds=time_to_create_bulk_term_docs)
    start = time()
    # per-tile edit-log entries are skipped when imports are streamlined
    if not settings.STREAMLINE_IMPORT:
        for tile in tiles:
            tile.save_edit(edit_type="tile create", new_value=tile.data)
    # print("time to save tile edits: %s" % datetime.timedelta(seconds=time() - start)
    start = time()
    # print("time to save resources to db:%s" % datetime.timedelta(seconds=time() - start)
    start = time()
    # bulk index the resources, tiles and terms
    # print(documents[0]
    se.bulk_index(documents)
    se.bulk_index(term_list)
def bulk_save(resources):
    """
    Saves and indexes a list of resources

    Arguments:
    resources -- a list of resource models

    """
    se = SearchEngineFactory().create()
    datatype_factory = DataTypeFactory()
    # nodeid (string) -> datatype name, shared by every document build
    node_datatypes = {
        str(nodeid): datatype
        for nodeid, datatype in models.Node.objects.values_list(
            "nodeid", "datatype")
    }
    tiles = []
    documents = []
    term_list = []

    # flatten each resource's tile tree onto the model, collecting every tile
    for resource in resources:
        resource.tiles = resource.get_flattened_tiles()
        tiles.extend(resource.tiles)

    # need to save the models first before getting the documents for index
    start = time()
    Resource.objects.bulk_create(resources)
    TileModel.objects.bulk_create(tiles)
    print(
        f"Time to bulk create tiles and resources: {datetime.timedelta(seconds=time() - start)}"
    )
    start = time()
    for resource in resources:
        resource.save_edit(edit_type="create")
    # single edit-log entry summarising the whole bulk create
    # NOTE(review): placement after the loop reconstructed from a
    # whitespace-mangled source — verify against history
    resources[0].tiles[0].save_edit(
        note=f"Bulk created: {len(tiles)} for {len(resources)} resources.",
        edit_type="bulk_create")
    print("Time to save resource edits: %s" %
          datetime.timedelta(seconds=time() - start))

    for resource in resources:
        start = time()  # NOTE(review): assigned but never read in this loop
        document, terms = resource.get_documents_to_index(
            fetchTiles=False,
            datatype_factory=datatype_factory,
            node_datatypes=node_datatypes)
        documents.append(
            se.create_bulk_item(index="resources",
                                id=document["resourceinstanceid"],
                                data=document))
        for term in terms:
            term_list.append(
                se.create_bulk_item(index="terms",
                                    id=term["_id"],
                                    data=term["_source"]))

    # submit resource and term documents in two bulk requests
    se.bulk_index(documents)
    se.bulk_index(term_list)
def index(documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
    """
    Index concept documents — and each concept's labels — into the search engine.
    (Python 2 code — uses print statements.)

    Arguments:
    documents -- a single document or a list of documents to index
    index -- name of the target index
    type -- document type within the index
    idfield -- field of the document used as id when no getid is supplied
    processdoc -- optional callable(document) -> data transform
    getid -- optional callable(document, data) -> document id
    bulk -- when True, queue items and submit one bulk_index call at the end
    """
    print 'index_concepts.index'
    detail = ''
    bulkitems = []
    errorlist = []
    se = SearchEngineFactory().create()
    # allow a single document to be passed in
    if not isinstance(documents, list):
        documents = [documents]
    for document in documents:
        sys.stdout.write('.')  # progress indicator, one dot per document
        if processdoc == None:
            data = document
        else:
            data = processdoc(document)
        id = None
        if getid != None:
            id = getid(document, data)
        try:
            if bulk:
                # create_bulk_item returns an (action, source) pair; queue both halves
                bulkitem = se.create_bulk_item(index, type, id, data)
                bulkitems.append(bulkitem[0])
                bulkitems.append(bulkitem[1])
            else:
                se.index_data(index, type, data, idfield=idfield, id=id)
                #se.index_data('concept_labels', '00000000-0000-0000-0000-000000000005', data, 'id')
                # also index each non-blank label into the 'term' index
                # NOTE(review): nesting reconstructed from a whitespace-mangled
                # source — this label loop is assumed to run only on the
                # non-bulk path; verify against history
                for concept in data['labels']:
                    #se.index_term(concept['label'], concept['labelid'], '00000000-0000-0000-0000-000000000005', settings.PUBLISHED_LABEL, {'conceptid': data['conceptid']})
                    if concept['label'].strip(' \t\n\r') != '':
                        already_indexed = False
                        count = 1
                        ids = [id]
                        try:
                            # deterministic term id derived from the label + conceptid hashes
                            _id = uuid.uuid3(
                                uuid.NAMESPACE_DNS,
                                '%s%s' % (hash(concept['label']), hash(data['conceptid'])))
                            result = se.es.get(index='term', doc_type='value', id=_id, ignore=404)
                            #print 'result: %s' % result
                            if result['found'] == True:
                                # merge this concept id into the existing term doc
                                ids = result['_source']['ids']
                                if id not in ids:
                                    ids.append(id)
                            else:
                                ids = [id]
                            # NOTE(review): the meaning of these two hard-coded
                            # context uuids (skipped here) is not visible in this
                            # file — confirm before relying on it
                            if data['context'] != '00000000-0000-0000-0000-000000000003' and data[
                                    'context'] != '00000000-0000-0000-0000-000000000004':
                                se.index_data(
                                    'term', 'value', {
                                        'term': concept['label'],
                                        'context': data['context'],
                                        'ewstatus': settings.PUBLISHED_LABEL,
                                        'options': {
                                            'conceptid': data['conceptid']
                                        },
                                        'count': len(ids),
                                        'ids': ids
                                    },
                                    id=_id)
                        except Exception as detail:
                            raise detail
        except Exception as detail:
            print detail
            errorlist.append(id)
    if bulk:
        try:
            se.bulk_index(index, type, bulkitems)
        except Exception as detail:
            errorlist = bulkitems
            print 'bulk inset failed'
    # `detail` is only non-empty when an exception was caught above
    if detail != '':
        print "\n\nException detail: %s " % (detail)
        print "There was a problem indexing the following items:"
        print errorlist