Example #1
    def bulk_save(resources):
        """
        Saves and indexes a list of resources

        Arguments:
        resources -- a list of resource models

        """

        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}
        tiles = []
        documents = []
        term_list = []

        # flatten the nested tiles into a single list; extending resource.tiles
        # while looping over it acts as a work queue, so child tiles at any
        # depth get visited in turn
        for resource in resources:
            for parent_tile in resource.tiles:
                for child_tiles in parent_tile.tiles.values():
                    if len(child_tiles) > 0:
                        resource.tiles.extend(child_tiles)
                parent_tile.tiles = {}

            tiles.extend(resource.tiles)

        # need to save the models first before getting the documents for index
        Resource.objects.bulk_create(resources)
        TileModel.objects.bulk_create(tiles)

        for resource in resources:
            resource.save_edit(edit_type='create')
            document, terms = resource.get_documents_to_index(fetchTiles=False, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
            document['root_ontology_class'] = resource.get_root_ontology()
            documents.append(se.create_bulk_item(index='resource', doc_type=document['graph_id'], id=document['resourceinstanceid'], data=document))
            for term in terms:
                term_list.append(se.create_bulk_item(index='strings', doc_type='term', id=term['_id'], data=term['_source']))

        for tile in tiles:
            tile.save_edit(edit_type='tile create', new_value=tile.data)
        # bulk index the resources, tiles and terms
        se.bulk_index(documents)
        se.bulk_index(term_list)
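
The flattening loop above leans on a useful list property: a `for` loop over a Python list also visits items appended during iteration, so the list doubles as a work queue. A minimal standalone sketch of the pattern, assuming only that each tile exposes a `tiles` dict mapping nodegroup ids to lists of child tiles (the names mirror the example, not any particular Arches version):

    def flatten_tiles(top_tiles):
        flat = list(top_tiles)
        for tile in flat:  # flat grows while we iterate, acting as a queue
            for child_tiles in tile.tiles.values():
                if len(child_tiles) > 0:
                    flat.extend(child_tiles)
            tile.tiles = {}  # detach children once they are queued
        return flat
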
Example #2
    def index(self, documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
        detail = ''
        bulkitems = []
        errorlist = []
        se = SearchEngineFactory().create()
        if not isinstance(documents, list):
            documents = [documents]
        for document in documents:
            sys.stdout.write('.')  # progress indicator, one dot per document
            data = document if processdoc is None else processdoc(document)
            id = None
            if getid is not None:
                id = getid(document, data)
            try:
                if bulk:
                    # the older API returns an (action, source) pair; queue both
                    bulkitem = se.create_bulk_item(index, type, id, data)
                    bulkitems.append(bulkitem[0])
                    bulkitems.append(bulkitem[1])
                else:
                    se.index_data(index, type, data, idfield=idfield, id=id)
            except Exception as e:
                # bind the exception to a separate name; the "as" variable is
                # deleted when the except block ends in Python 3
                detail = e
                errorlist.append(id)
        if bulk:
            try:
                se.bulk_index(index, type, bulkitems)
            except Exception as e:
                detail = e
                errorlist = bulkitems
                print('bulk insert failed')

        if detail != '':
            print("\n\nException detail: %s " % detail)
            print("There was a problem indexing the following items:")
            print(errorlist)
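
The paired `bulkitem[0]` / `bulkitem[1]` appends suggest this older `create_bulk_item` returned the raw two-entry Elasticsearch bulk format: an action line followed by a source line. A minimal sketch of that assumed shape (an illustration, not the actual Arches implementation):

    def create_bulk_item(index, type, id, data):
        # hypothetical: one action/metadata line plus one document line,
        # matching the pre-helpers Elasticsearch bulk wire format
        return [
            {'index': {'_index': index, '_type': type, '_id': id}},
            data,
        ]
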
Example #3
    def bulk_save(resources):
        """
        Saves and indexes a list of resources

        Arguments:
        resources -- a list of resource models

        """

        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {str(nodeid): datatype for nodeid, datatype in models.Node.objects.values_list('nodeid', 'datatype')}
        tiles = []
        documents = []
        term_list = []

        for resource in resources:
            resource.tiles = resource.get_flattened_tiles()
            tiles.extend(resource.tiles)

        # need to save the models first before getting the documents for index
        Resource.objects.bulk_create(resources)
        TileModel.objects.bulk_create(tiles)

        for resource in resources:
            resource.save_edit(edit_type='create')
            document, terms = resource.get_documents_to_index(fetchTiles=False, datatype_factory=datatype_factory, node_datatypes=node_datatypes)
            document['root_ontology_class'] = resource.get_root_ontology()
            documents.append(se.create_bulk_item(index='resource', doc_type=document['graph_id'], id=document['resourceinstanceid'], data=document))
            for term in terms:
                term_list.append(se.create_bulk_item(index='strings', doc_type='term', id=term['_id'], data=term['_source']))

        for tile in tiles:
            tile.save_edit(edit_type='tile create', new_value=tile.data)
        # bulk index the resources, tiles and terms
        se.bulk_index(documents)
        se.bulk_index(term_list)
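
One detail these `bulk_save` variants all depend on: Django's `bulk_create` issues batched `INSERT`s and deliberately skips each instance's `save()` method and the `pre_save`/`post_save` signals. That is presumably why the examples follow it with explicit `save_edit(edit_type='create')` calls, restoring the edit-log bookkeeping that a per-instance save would otherwise trigger.
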
Example #4
    def test_bulk_add_documents(self):
        """
        Test adding documents to Elasticsearch in bulk

        """

        se = SearchEngineFactory().create()
        se.create_index(index="test")

        documents = []
        count_before = se.count(index="test")
        for i in range(10):
            doc = {
                "id": i,
                "type": "prefLabel",
                "value": "test pref label",
            }
            documents.append(se.create_bulk_item(op_type="index", index="test", id=doc["id"], data=doc))

        se.bulk_index(documents, refresh=True)
        count_after = se.count(index="test")
        self.assertEqual(count_after - count_before, 10)
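
The keyword signature exercised by this test (`op_type`, `index`, `id`, `data`) suggests the newer `create_bulk_item` builds a single action dict of the kind consumed by `elasticsearch.helpers.bulk`. A minimal sketch of that assumed shape (again an illustration, not the actual Arches source):

    def create_bulk_item(op_type='index', index=None, id=None, data=None):
        # hypothetical: the action format the elasticsearch bulk helpers accept
        return {
            '_op_type': op_type,  # 'index', 'create', 'update', or 'delete'
            '_index': index,      # target index name
            '_id': id,            # document id
            '_source': data,      # document body
        }
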
Example #5
    def index(documents, index, type, idfield, processdoc=None, getid=None, bulk=False):
        print('index_concepts.index')
        detail = ''
        bulkitems = []
        errorlist = []
        se = SearchEngineFactory().create()
        if not isinstance(documents, list):
            documents = [documents]
        for document in documents:
            sys.stdout.write('.')  # progress indicator, one dot per document
            data = document if processdoc is None else processdoc(document)
            id = None
            if getid is not None:
                id = getid(document, data)
            try:
                if bulk:
                    # the older API returns an (action, source) pair; queue both
                    bulkitem = se.create_bulk_item(index, type, id, data)
                    bulkitems.append(bulkitem[0])
                    bulkitems.append(bulkitem[1])
                else:
                    se.index_data(index, type, data, idfield=idfield, id=id)
                    for concept in data['labels']:
                        # skip empty labels
                        if concept['label'].strip(' \t\n\r') == '':
                            continue
                        # derive a repeatable id from the label/concept pair so
                        # re-indexing the same label updates one 'term' document
                        # instead of creating a duplicate
                        _id = uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (hash(concept['label']), hash(data['conceptid'])))
                        result = se.es.get(index='term', doc_type='value', id=_id, ignore=404)
                        if result['found']:
                            ids = result['_source']['ids']
                            if id not in ids:
                                ids.append(id)
                        else:
                            ids = [id]
                        # two reserved contexts are excluded from the term index
                        if data['context'] != '00000000-0000-0000-0000-000000000003' and data['context'] != '00000000-0000-0000-0000-000000000004':
                            se.index_data('term', 'value', {'term': concept['label'], 'context': data['context'], 'ewstatus': settings.PUBLISHED_LABEL, 'options': {'conceptid': data['conceptid']}, 'count': len(ids), 'ids': ids}, id=_id)
            except Exception as e:
                # bind the exception to a separate name; the "as" variable is
                # deleted when the except block ends in Python 3
                detail = e
                print(e)
                errorlist.append(id)
        if bulk:
            try:
                se.bulk_index(index, type, bulkitems)
            except Exception as e:
                detail = e
                errorlist = bulkitems
                print('bulk insert failed')

        if detail != '':
            print("\n\nException detail: %s " % detail)
            print("There was a problem indexing the following items:")
            print(errorlist)
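
A caveat about the deduplication scheme above: it feeds Python's built-in `hash()` into `uuid.uuid3`, and string hashes are salted per interpreter run in Python 3 (see `PYTHONHASHSEED`), so the derived id is only repeatable across runs if hash randomization is disabled. Hashing the raw strings themselves avoids that; a minimal sketch:

    import uuid

    def term_doc_id(label, conceptid):
        # build the id straight from the strings rather than from hash(),
        # which varies between Python 3 interpreter runs
        return str(uuid.uuid3(uuid.NAMESPACE_DNS, '%s%s' % (label, conceptid)))
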
Example #6
    def bulk_save(resources, primaryDescriptorsFunctionConfig, graph_nodes):
        """
        Saves and indexes a list of resources

        Arguments:
        resources -- a list of resource models

        """
        print("saving resources to db")

        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list(
                "nodeid", "datatype"
            )
        }
        tiles = []
        documents = []
        term_list = []

        start = time()

        for resource in resources:
            resource.tiles = resource.get_flattened_tiles()
            tiles.extend(resource.tiles)

        print("time to extend tiles: %s" % datetime.timedelta(seconds=time() - start))
        start = time()

        # need to save the models first before getting the documents for index
        Resource.objects.bulk_create(resources)
        TileModel.objects.bulk_create(tiles)

        print(
            "time to bulk create tiles and resources: %s"
            % datetime.timedelta(seconds=time() - start)
        )
        start = time()

        for resource in resources:
            resource.save_edit(edit_type="create")

        resources[0].tiles[0].save_edit(
            note=f"bulk created: {len(tiles)} tiles for {len(resources)} resources.", edit_type="bulk_create"
        )

        print(
            "time to save resource edits: %s"
            % datetime.timedelta(seconds=time() - start)
        )

        time_to_get_docs = 0
        time_to_create_bulk_docs = 0
        time_to_create_bulk_term_docs = 0
        for resource in resources:
            s = time()
            document, terms = resource.get_documents_to_index(
                fetchTiles=False,
                datatype_factory=datatype_factory,
                node_datatypes=node_datatypes,
                config=primaryDescriptorsFunctionConfig,
                graph_nodes=graph_nodes,
            )
            time_to_get_docs += time() - s
            # document['root_ontology_class'] = resource.get_root_ontology()

            s = time()
            documents.append(
                se.create_bulk_item(
                    index="resources", id=document["resourceinstanceid"], data=document
                )
            )
            time_to_create_bulk_docs += time() - s

            s = time()
            for term in terms:
                term_list.append(
                    se.create_bulk_item(
                        index="terms", id=term["_id"], data=term["_source"]
                    )
                )
            time_to_create_bulk_term_docs += time() - s

        print(
            "time to get documents to index: %s"
            % datetime.timedelta(seconds=time_to_get_docs)
        )
        print(
            "time to create bulk docs: %s"
            % datetime.timedelta(seconds=time_to_create_bulk_docs)
        )
        print(
            "time to create bulk term docs: %s"
            % datetime.timedelta(seconds=time_to_create_bulk_term_docs)
        )

        if not settings.STREAMLINE_IMPORT:
            start = time()
            for tile in tiles:
                tile.save_edit(edit_type="tile create", new_value=tile.data)
            print(
                "time to save tile edits: %s"
                % datetime.timedelta(seconds=time() - start)
            )

        # bulk index the resources, tiles and terms
        se.bulk_index(documents)
        se.bulk_index(term_list)
Example #7
    def bulk_save(resources):
        """
        Saves and indexes a list of resources

        Arguments:
        resources -- a list of resource models

        """

        se = SearchEngineFactory().create()
        datatype_factory = DataTypeFactory()
        node_datatypes = {
            str(nodeid): datatype
            for nodeid, datatype in models.Node.objects.values_list(
                "nodeid", "datatype")
        }
        tiles = []
        documents = []
        term_list = []

        for resource in resources:
            resource.tiles = resource.get_flattened_tiles()
            tiles.extend(resource.tiles)

        # need to save the models first before getting the documents for index
        start = time()
        Resource.objects.bulk_create(resources)
        TileModel.objects.bulk_create(tiles)

        print(
            f"Time to bulk create tiles and resources: {datetime.timedelta(seconds=time() - start)}"
        )

        start = time()
        for resource in resources:
            resource.save_edit(edit_type="create")

        resources[0].tiles[0].save_edit(
            note=f"Bulk created: {len(tiles)} tiles for {len(resources)} resources.",
            edit_type="bulk_create")

        print("Time to save resource edits: %s" %
              datetime.timedelta(seconds=time() - start))

        for resource in resources:
            document, terms = resource.get_documents_to_index(
                fetchTiles=False,
                datatype_factory=datatype_factory,
                node_datatypes=node_datatypes)

            documents.append(
                se.create_bulk_item(index="resources",
                                    id=document["resourceinstanceid"],
                                    data=document))

            for term in terms:
                term_list.append(
                    se.create_bulk_item(index="terms",
                                        id=term["_id"],
                                        data=term["_source"]))

        se.bulk_index(documents)
        se.bulk_index(term_list)