Example #1
0
def save_multiple_batch(multiple_batch, creator_user, session_key):

    from cbh_chem_api.compounds  import CBHCompoundUploadResource
    cbr_instance = CBHCompoundUploadResource()

    limit = 100
    offset = 0
    batches = []
    hasMoreData = True

    datasets = []
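    # Split the upload into pages of `limit` rows; each tuple in datasets holds
    # the arguments for one process_batch_list call at a given offset.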
    for run in range(0, int(math.ceil(float(multiple_batch.batch_count) / 100.0))):
        datasets.append(
            (multiple_batch, creator_user, session_key, limit, offset))
        offset += limit

    lists_of_batches = [process_batch_list(*ds) for ds in datasets]
    batches = [inner for outer in lists_of_batches for inner in outer]
    if multiple_batch.uploaded_file:
        cbr_instance.alter_batch_data_after_save(
            batches, multiple_batch.uploaded_file.file, multiple_batch)
    index_batches_in_new_index(batches)
    elasticsearch_client.delete_index(
        elasticsearch_client.get_temp_index_name(session_key, multiple_batch.id))
    cbr_instance.after_save_and_index_hook(multiple_batch.id, multiple_batch.project_id)
    return True
Example #2
0
    def get_part_processed_multiple_batch(self, request, **kwargs):
        """
        Get the part processed data from elasticsearch and the stats about the
        multiple batch
        """
        # TODO: Uncached for now. Invalidation that works for everyone may be
        #       impossible.
        bundle = self.build_bundle(request=request)
        session_key = request.COOKIES[settings.SESSION_COOKIE_NAME]
        # self.authorized_create_detail(self.get_object_list(bundle.request), bundle)
        if (kwargs.get("multi_batch", None)):
            mb = kwargs.get("multi_batch")
            id = mb.id
        else:
            id = request.GET.get("current_batch")
            mb = CBHCompoundMultipleBatch.objects.get(pk=id)

        task_id = request.session.get("mb_inprogress_%d" % mb.id, None)

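        # If a background task is still processing this upload, wait briefly for
        # its result; a string result indicates an error, so raise it.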
        if task_id:
            res = result(task_id, wait=10)
            if isinstance(res, basestring):
                raise Exception(res)
        if not mb.uploaded_data:
            #The uploaded data field will be set once the data is fully processed
            return self.create_response(request, {},
                                        response_class=http.HttpAccepted)

        to_be_serialized = mb.uploaded_data
        to_be_serialized = self.get_cached_temporary_batch_data(
            id, request.GET, session_key, bundledata=to_be_serialized)
        index_name = elasticsearch_client.get_temp_index_name(session_key, id)
        elasticsearch_client.get_action_totals(index_name, to_be_serialized)
        return self.create_response(request, to_be_serialized)
Example #3
0
 def set_cached_temporary_batches(self, batches, multi_batch_id,
                                  session_key):
     """Index the new data when a new bulk upload is done"""
     batch_dicts = self.batches_to_es_ready(batches)
     index_name = elasticsearch_client.get_temp_index_name(
         session_key, multi_batch_id)
     elasticsearch_client.create_temporary_index(batch_dicts, index_name)
Example #4
0
    def update_temp_batches(self, request, **kwargs):
        '''Update a set of molecules in Elasticsearch (used in ChemBio Hub to set the action field to "Ignore" or "New Batch")'''
        deserialized = self.deserialize(request,
                                        request.body,
                                        format=request.META.get(
                                            'CONTENT_TYPE',
                                            'application/json'))

        deserialized = self.alter_deserialized_detail_data(
            request, deserialized)
        bundle = self.build_bundle(data=dict_strip_unicode_keys(deserialized),
                                   request=request)
        if bundle.obj.pk:
            self.authorized_update_detail(self.get_object_list(bundle.request),
                                          bundle)
        else:
            self.authorized_create_detail(self.get_object_list(bundle.request),
                                          bundle)
        multi_batch_id = bundle.data["multiplebatch"]
        es_ready_updates = bundle.data["objects"]

        index_name = elasticsearch_client.get_temp_index_name(
            request.COOKIES[settings.SESSION_COOKIE_NAME], multi_batch_id)

        elasticsearch_client.create_temporary_index(es_ready_updates,
                                                    index_name)
        elasticsearch_client.get_action_totals(index_name, bundle.data)
        return self.create_response(request,
                                    bundle,
                                    response_class=http.HttpAccepted)
Example #5
0
    def multi_batch_custom_fields(self, request, **kwargs):
        '''Change the structure column for an Excel file'''
        deserialized = self.deserialize(request,
                                        request.body,
                                        format=request.META.get(
                                            'CONTENT_TYPE',
                                            'application/json'))

        deserialized = self.alter_deserialized_detail_data(
            request, deserialized)
        bundle = self.build_bundle(data=dict_strip_unicode_keys(deserialized),
                                   request=request)
        if bundle.obj.pk:
            self.authorized_update_detail(self.get_object_list(bundle.request),
                                          bundle)
        else:
            self.authorized_create_detail(self.get_object_list(bundle.request),
                                          bundle)
        id = bundle.data["multiplebatch"]
        headers = bundle.data["headers"]
        # structure_col = bundle.data.get("structure_col", None)
        mb = CBHCompoundMultipleBatch.objects.get(pk=id)
        processSmiles = False
        # if structure_col and structure_col != mb.uploaded_data.get("structure_col", ""):
        #     processSmiles =  True
        index_name = elasticsearch_client.get_temp_index_name(
            request.COOKIES[settings.SESSION_COOKIE_NAME], mb.id)
        elasticsearch_client.get_action_totals(index_name, bundle.data)
        mb.uploaded_data = bundle.data
        mb.save()
        return self.create_response(request,
                                    bundle,
                                    response_class=http.HttpAccepted)
Example #6
0
    def get_part_processed_multiple_batch(self, request, **kwargs):
        """
        Get the part processed data from elasticsearch and the stats about the
        multiple batch
        """
        # TODO: Uncached for now. Invalidation that works for everyone may be
        #       impossible.
        bundle = self.build_bundle(request=request)
        session_key = request.COOKIES[settings.SESSION_COOKIE_NAME]
        # self.authorized_create_detail(self.get_object_list(bundle.request), bundle)
        if kwargs.get("multi_batch", None):
            mb = kwargs.get("multi_batch")
            id = mb.id
        else:
            id = request.GET.get("current_batch")
            mb = CBHCompoundMultipleBatch.objects.get(pk=id)

        task_id = request.session.get("mb_inprogress_%d" % mb.id, None)

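        # If a background task is still processing this upload, wait briefly for
        # its result; a string result indicates an error, so raise it.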
        if task_id:
            res = result(task_id, wait=10)
            if isinstance(res, basestring):
                raise Exception(res)
        if not mb.uploaded_data:
            #The uploaded data field will be set once the data is fully processed
            return self.create_response(request, {}, response_class=http.HttpAccepted)


        to_be_serialized = mb.uploaded_data
        to_be_serialized = self.get_cached_temporary_batch_data(
            id, request.GET, session_key, bundledata=to_be_serialized)
        index_name = elasticsearch_client.get_temp_index_name(session_key, id)
        elasticsearch_client.get_action_totals(index_name, to_be_serialized)
        return self.create_response(request, to_be_serialized)
Example #7
0
    def delete_index(self, request, **kwargs):
        """Delete the index that was created for a multiple batch"""
        deserialized = self.deserialize(request,
                                        request.body,
                                        format=request.META.get(
                                            'CONTENT_TYPE',
                                            'application/json'))

        deserialized = self.alter_deserialized_detail_data(
            request, deserialized)
        session_key = request.COOKIES[settings.SESSION_COOKIE_NAME]
        bundle = self.build_bundle(data=dict_strip_unicode_keys(deserialized),
                                   request=request)
        if bundle.obj.pk:
            self.authorized_update_detail(self.get_object_list(bundle.request),
                                          bundle)
        else:
            self.authorized_create_detail(self.get_object_list(bundle.request),
                                          bundle)
        id = bundle.data["multiplebatch"]
        mb = CBHCompoundMultipleBatch.objects.get(pk=id)
        elasticsearch_client.delete_index(
            elasticsearch_client.get_temp_index_name(session_key, mb.id))
        return self.create_response(request,
                                    bundle,
                                    response_class=http.HttpAccepted)
Example #8
0
 def set_cached_temporary_batches(self, batches, multi_batch_id, session_key):
     """Index the new data when a new bulk upload is done"""
     batch_dicts = self.batches_to_es_ready(batches)
     index_name = elasticsearch_client.get_temp_index_name(
         session_key, multi_batch_id)
     elasticsearch_client.create_temporary_index(batch_dicts, index_name)
Example #9
0
    def multi_batch_custom_fields(self, request, **kwargs):
        '''Change the structure column for an Excel file'''
        deserialized = self.deserialize(request, request.body, format=request.META.get(
            'CONTENT_TYPE', 'application/json'))

        deserialized = self.alter_deserialized_detail_data(
            request, deserialized)
        bundle = self.build_bundle(
            data=dict_strip_unicode_keys(deserialized), request=request)
        if bundle.obj.pk:
            self.authorized_update_detail(
                self.get_object_list(bundle.request), bundle)
        else:
            self.authorized_create_detail(
                self.get_object_list(bundle.request), bundle)
        id = bundle.data["multiplebatch"]
        headers = bundle.data["headers"]
        # structure_col = bundle.data.get("structure_col", None)
        mb = CBHCompoundMultipleBatch.objects.get(pk=id)
        processSmiles = False
        # if structure_col and structure_col != mb.uploaded_data.get("structure_col", ""):
        #     processSmiles =  True
        index_name = elasticsearch_client.get_temp_index_name(request.COOKIES[settings.SESSION_COOKIE_NAME], mb.id)
        elasticsearch_client.get_action_totals(index_name, bundle.data)
        mb.uploaded_data = bundle.data
        mb.save()
        return self.create_response(request, bundle, response_class=http.HttpAccepted)
Example #10
0
    def update_temp_batches(self, request, **kwargs):
        '''Update a set of molecules in Elasticsearch (used in ChemBio Hub to set the action field to "Ignore" or "New Batch")'''
        deserialized = self.deserialize(request, request.body, format=request.META.get(
            'CONTENT_TYPE', 'application/json'))

        deserialized = self.alter_deserialized_detail_data(
            request, deserialized)
        bundle = self.build_bundle(
            data=dict_strip_unicode_keys(deserialized), request=request)
        if bundle.obj.pk:
            self.authorized_update_detail(
                self.get_object_list(bundle.request), bundle)
        else:
            self.authorized_create_detail(
                self.get_object_list(bundle.request), bundle)
        multi_batch_id = bundle.data["multiplebatch"]
        es_ready_updates = bundle.data["objects"]

        index_name = elasticsearch_client.get_temp_index_name(
            request.COOKIES[settings.SESSION_COOKIE_NAME], multi_batch_id)

        elasticsearch_client.create_temporary_index(
            es_ready_updates, index_name)
        elasticsearch_client.get_action_totals(index_name, bundle.data)
        return self.create_response(request, bundle, response_class=http.HttpAccepted)
Example #11
0
def save_multiple_batch(multiple_batch, creator_user, session_key):

    from cbh_chem_api.compounds import CBHCompoundUploadResource
    cbr_instance = CBHCompoundUploadResource()

    limit = 100
    offset = 0
    batches = []
    hasMoreData = True

    datasets = []
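    # Split the upload into pages of `limit` rows; each tuple in datasets holds
    # the arguments for one process_batch_list call at a given offset.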
    for run in range(
            0, int(math.ceil(float(multiple_batch.batch_count) / 100.0))):
        datasets.append(
            (multiple_batch, creator_user, session_key, limit, offset))
        offset += limit

    lists_of_batches = [process_batch_list(*ds) for ds in datasets]
    batches = [inner for outer in lists_of_batches for inner in outer]
    if multiple_batch.uploaded_file:
        cbr_instance.alter_batch_data_after_save(
            batches, multiple_batch.uploaded_file.file, multiple_batch)
    index_batches_in_new_index(batches)
    elasticsearch_client.delete_index(
        elasticsearch_client.get_temp_index_name(session_key,
                                                 multiple_batch.id))
    cbr_instance.after_save_and_index_hook(multiple_batch.id,
                                           multiple_batch.project_id)
    return True
Example #12
0
 def get_cached_temporary_batch_data(self, multi_batch_id, get_data, session_key, bundledata={}):
     """make the batch data into models so it can be serialized properly"""
     es_request = {
         "from": get_data.get("offset", 0),
         "size": get_data.get("limit", 50),
         "filter": json.loads(get_data.get("query", '{ "match_all" : {}}')),
         "sort": json.loads(get_data.get("sorts", '[{"id": {"order": "asc"}}]'))
     }
     index = elasticsearch_client.get_temp_index_name(
         session_key, multi_batch_id)
     bundledata = elasticsearch_client.get_from_temp_index(
         index, es_request, bundledata)

     return bundledata
Example #13
0
    def get_cached_temporary_batch_data(self,
                                        multi_batch_id,
                                        get_data,
                                        session_key,
                                        bundledata={}):
        """make the batch data into models so it can be serialized properly"""
        es_request = {
            "from": get_data.get("offset", 0),
            "size": get_data.get("limit", 50),
            "filter": json.loads(get_data.get("query", '{ "match_all" : {}}')),
            "sort":
            json.loads(get_data.get("sorts", '[{"id": {"order": "asc"}}]'))
        }
        index = elasticsearch_client.get_temp_index_name(
            session_key, multi_batch_id)
        bundledata = elasticsearch_client.get_from_temp_index(
            index, es_request, bundledata)

        return bundledata
Example #14
0
    def delete_index(self, request, **kwargs):
        """Delete the index that was created for a multiple batch"""
        deserialized = self.deserialize(request, request.body, format=request.META.get(
            'CONTENT_TYPE', 'application/json'))

        deserialized = self.alter_deserialized_detail_data(
            request, deserialized)
        session_key = request.COOKIES[settings.SESSION_COOKIE_NAME]
        bundle = self.build_bundle(
            data=dict_strip_unicode_keys(deserialized), request=request)
        if bundle.obj.pk:
            self.authorized_update_detail(
                self.get_object_list(bundle.request), bundle)
        else:
            self.authorized_create_detail(
                self.get_object_list(bundle.request), bundle)
        id = bundle.data["multiplebatch"]
        mb = CBHCompoundMultipleBatch.objects.get(pk=id)
        elasticsearch_client.delete_index(
            elasticsearch_client.get_temp_index_name(session_key, mb.id))
        return self.create_response(request, bundle, response_class=http.HttpAccepted)
Example #15
0
def validate_multi_batch(cbr_instance, multiple_batch, bundledata, session_key, batches):
    """Generate a set of staticstics about a set of data that has been uploaded"""
    batches_not_errors = [batch for batch in batches if batch and not batch.warnings.get(
        "parseerror", None) and not batch.warnings.get("smilesParseError", None)]


    for b in batches_not_errors:
        b.properties["action"] = "New Batch"
    batches_with_structures = [
        batch for batch in batches_not_errors if batch.ctab]
    blinded_data = [
        batch for batch in batches_not_errors if not batch.ctab]
    sdfstrings = [batch.ctab for batch in batches_with_structures]
    sdf = "\n".join(sdfstrings)

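    # Write every structure to a temporary SDF file and run the InChI binary on
    # it in one pass, capturing stdout (the InChIs) and stderr (any errors).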
    filename = "/tmp/" + shortuuid.ShortUUID().random()
    text_file = open(filename, "w")
    text_file.write(sdf)
    text_file.close()
    from subprocess import PIPE, Popen
    p = Popen([settings.INCHI_BINARIES_LOCATION['1.02'],
               "-STDIO",  filename], stdout=PIPE, stderr=PIPE)

    a = p.communicate()
    inchis = {}

    # PB - there is an assumption here that everything that has a structure will generate an InChI without issue. This is not the case.
    # Where a molecule does not generate an InChI, there will be a key error looking up the InChI in inchiparts, as anything that cannot
    # generate an InChI will be missing from inchiparts, i.e. 50 structures with 1 error will have 49 entries in inchiparts, and this
    # will in turn bin the whole file - not great when we can handle erroring structures elsewhere.

    error_locs = []

    #a[0] holds the generated inchis. a[1] holds all of the error and warning information (if any)
    errorparts = a[1].split("\nError")
    if len(errorparts) > 1:
        for i, errorp in enumerate(errorparts):
            #split on 'structure #', then get the number given
            if i > 0:
                splits = errorp.split('structure #')

                error_loc = splits[1].split('.')[0]
                #convert to number, put this number in an errors list
                error_locs.append(error_loc)

    err_batches = []
    #for the errors found, remove from non-error lists and flag as erroring
    for error_no in error_locs:
        error_no_int = int(float(error_no)) - 1

        #find structures at the position indicated - 1 (for 0-indexed list)
        err_batch = batches_with_structures[error_no_int]
        err_batches.append(err_batch)

    #we can't remove these while looping through err_locs as it messes up the list order and gives arrayindex exceptions
    for err_batch in err_batches:

        #remove from batches_with_structures and batches_not_errors
        batches_with_structures.remove(err_batch)
        batches_not_errors.remove(err_batch)

        #flag this batch as erroring due to inability to generate anything for the standard_inchi_key field
        batches_index = batches.index(err_batch)
        batches[batches_index].warnings["inchicreationerror"] = "true"
        batches[batches_index].properties["action"] = "Ignore"


    inchiparts = a[0].split("\nStructure:")

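    # Each chunk of the split stdout starts with the structure number; extract
    # those digits and map the number to the InChI line that follows it.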
    for i, inch in enumerate(inchiparts):
        parts = inch.split("\n")
        if len(parts) == 1:
            continue
        ints = [s for s in parts[0].split() if s.isdigit()]
        part = "".join(ints)
        inchis[part] = parts[1]
    if not bundledata.get("fileerrors"):
        bundledata["fileerrors"] = []
    new_uploaded_data = []
    already_found = set([])
    duplicates = set([])
    for i, batch in enumerate(batches_with_structures):
        if (str(i+1) in error_locs):
            batch.standard_inchi = None
        else: 
            batch.standard_inchi = inchis[str(i+1)]
        batch.validate(temp_props=False)
        if batch.standard_inchi_key in already_found:
            # setting this in case we change it later
            duplicates.add(batch.standard_inchi_key)
        else:
            already_found.add(batch.standard_inchi_key)

        new_uploaded_data.append(batch)
    already_in_db = MoleculeDictionary.objects.filter(
        project=bundledata["project"], structure_type="MOL",
        structure_key__in=already_found).values_list("structure_key", flat=True)
    already_in_db = set(already_in_db)

    bundledata["new"] = 0
    new_data = set([])
    duplicate_overlaps = set([])
    duplicate_new = set([])
    for batch in batches_with_structures:
        if batch.standard_inchi_key in duplicates:
            batch.warnings["duplicate"] = True
        if batch.standard_inchi_key in already_in_db:
            batch.warnings["overlap"] = True
            if batch.standard_inchi_key in duplicates:
                batch.warnings["duplicate"] = True
                duplicate_overlaps.add(batch.standard_inchi_key)
        else:
            batch.warnings["new"] = True

            new_data.add(batch.standard_inchi_key)
            if batch.standard_inchi_key in duplicates:
                batch.warnings["duplicate"] = True
                duplicate_new.add(batch.standard_inchi_key)

    for batch in batches_with_structures:
        if batch.warnings.get("withoutstructure") == True:
            del batch.warnings["withoutstructure"]
    for batch in blinded_data:
        batch.warnings["withoutstructure"] = True
    bundledata["batchstats"] = {}
    bundledata["batchstats"]["withstructure"] = len(
        batches_with_structures)
    bundledata["batchstats"]["parseerrors"] = len(batches) - len(batches_not_errors) + len(
        [b for b in batches_not_errors if b.warnings.get("parseerror", False) == "true"])
    bundledata["batchstats"]["withoutstructure"] = len(blinded_data)
    bundledata["batchstats"]["total"] = len(batches)
    bundledata["compoundstats"] = {}
    bundledata["compoundstats"]["total"] = len(
        already_in_db) + len(new_data)
    bundledata["compoundstats"]["overlaps"] = len(already_in_db)
    bundledata["compoundstats"]["new"] = len(new_data)
    bundledata["compoundstats"][
        "duplicateoverlaps"] = len(duplicate_overlaps)
    bundledata["compoundstats"]["duplicatenew"] = len(duplicate_new)
    bundledata["multiplebatch"] = multiple_batch.pk


    cbr_instance.set_cached_temporary_batches(
        batches, multiple_batch.id, session_key)
    
    #bundledata["objects"] = fifty_batches_for_first_page
    index_name = elasticsearch_client.get_temp_index_name(
        session_key, multiple_batch.id)
    elasticsearch_client.get_action_totals(index_name, bundledata)
    multiple_batch.uploaded_data = bundledata
    multiple_batch.save()
Example #16
0
def validate_multi_batch(cbr_instance, multiple_batch, bundledata, session_key,
                         batches):
    """Generate a set of staticstics about a set of data that has been uploaded"""
    batches_not_errors = [
        batch for batch in batches
        if batch and not batch.warnings.get("parseerror", None)
        and not batch.warnings.get("smilesParseError", None)
    ]

    for b in batches_not_errors:
        b.properties["action"] = "New Batch"
    batches_with_structures = [
        batch for batch in batches_not_errors if batch.ctab
    ]
    blinded_data = [batch for batch in batches_not_errors if not batch.ctab]
    sdfstrings = [batch.ctab for batch in batches_with_structures]
    sdf = "\n".join(sdfstrings)

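    # Write every structure to a temporary SDF file and run the InChI binary on
    # it in one pass, capturing stdout (the InChIs) and stderr (any errors).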
    filename = "/tmp/" + shortuuid.ShortUUID().random()
    text_file = open(filename, "w")
    text_file.write(sdf)
    text_file.close()
    from subprocess import PIPE, Popen
    p = Popen([settings.INCHI_BINARIES_LOCATION['1.02'], "-STDIO", filename],
              stdout=PIPE,
              stderr=PIPE)

    a = p.communicate()
    inchis = {}

    # PB - there is an assumption here that everything that has a structure will generate an InChI without issue. This is not the case.
    # Where a molecule does not generate an InChI, there will be a key error looking up the InChI in inchiparts, as anything that cannot
    # generate an InChI will be missing from inchiparts, i.e. 50 structures with 1 error will have 49 entries in inchiparts, and this
    # will in turn bin the whole file - not great when we can handle erroring structures elsewhere.

    error_locs = []

    #a[0] holds the generated inchis. a[1] holds all of the error and warning information (if any)
    errorparts = a[1].split("\nError")
    if (len(errorparts) > 1):
        for i, errorp in enumerate(errorparts):
            #split on 'structure #', then get the number given
            if (i > 0):
                splits = errorp.split('structure #')

                error_loc = splits[1].split('.')[0]
                #convert to number, put this number in an errors list
                error_locs.append(error_loc)

    err_batches = []
    #for the errors found, remove from non-error lists and flag as erroring
    for error_no in error_locs:
        error_no_int = int(float(error_no)) - 1

        #find structures at the position indicated - 1 (for 0-indexed list)
        err_batch = batches_with_structures[error_no_int]
        err_batches.append(err_batch)

    #we can't remove these while looping through err_locs as it messes up the list order and gives arrayindex exceptions
    for err_batch in err_batches:

        #remove from batches_with_structures and batches_not_errors
        batches_with_structures.remove(err_batch)
        batches_not_errors.remove(err_batch)

        #flag this batch as erroring due to inability to generate anything for the standard_inchi_key field
        batches_index = batches.index(err_batch)
        batches[batches_index].warnings["inchicreationerror"] = "true"
        batches[batches_index].properties["action"] = "Ignore"

    inchiparts = a[0].split("\nStructure:")

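    # Each chunk of the split stdout starts with the structure number; extract
    # those digits and map the number to the InChI line that follows it.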
    for i, inch in enumerate(inchiparts):
        parts = inch.split("\n")
        if len(parts) == 1:
            continue
        ints = [s for s in parts[0].split() if s.isdigit()]
        part = "".join(ints)
        inchis[part] = parts[1]
    if not bundledata.get("fileerrors"):
        bundledata["fileerrors"] = []
    new_uploaded_data = []
    already_found = set([])
    duplicates = set([])
    for i, batch in enumerate(batches_with_structures):
        if (str(i + 1) in error_locs):
            batch.standard_inchi = None
        else:
            batch.standard_inchi = inchis[str(i + 1)]
        batch.validate(temp_props=False)
        if batch.standard_inchi_key in already_found:
            # setting this in case we change it later
            duplicates.add(batch.standard_inchi_key)
        else:
            already_found.add(batch.standard_inchi_key)

        new_uploaded_data.append(batch)
    already_in_db = MoleculeDictionary.objects.filter(
        project=bundledata["project"],
        structure_type="MOL",
        structure_key__in=already_found).values_list("structure_key",
                                                     flat=True)
    already_in_db = set(already_in_db)

    bundledata["new"] = 0
    new_data = set([])
    duplicate_overlaps = set([])
    duplicate_new = set([])
    for batch in batches_with_structures:
        if batch.standard_inchi_key in duplicates:
            batch.warnings["duplicate"] = True
        if batch.standard_inchi_key in already_in_db:
            batch.warnings["overlap"] = True
            if batch.standard_inchi_key in duplicates:
                batch.warnings["duplicate"] = True
                duplicate_overlaps.add(batch.standard_inchi_key)
        else:
            batch.warnings["new"] = True

            new_data.add(batch.standard_inchi_key)
            if batch.standard_inchi_key in duplicates:
                batch.warnings["duplicate"] = True
                duplicate_new.add(batch.standard_inchi_key)

    for batch in batches_with_structures:
        if batch.warnings.get("withoutstructure") == True:
            del batch.warnings["withoutstructure"]
    for batch in blinded_data:
        batch.warnings["withoutstructure"] = True
    bundledata["batchstats"] = {}
    bundledata["batchstats"]["withstructure"] = len(batches_with_structures)
    bundledata["batchstats"]["parseerrors"] = len(batches) - len(
        batches_not_errors) + len([
            b for b in batches_not_errors
            if b.warnings.get("parseerror", False) == "true"
        ])
    bundledata["batchstats"]["withoutstructure"] = len(blinded_data)
    bundledata["batchstats"]["total"] = len(batches)
    bundledata["compoundstats"] = {}
    bundledata["compoundstats"]["total"] = len(already_in_db) + len(new_data)
    bundledata["compoundstats"]["overlaps"] = len(already_in_db)
    bundledata["compoundstats"]["new"] = len(new_data)
    bundledata["compoundstats"]["duplicateoverlaps"] = len(duplicate_overlaps)
    bundledata["compoundstats"]["duplicatenew"] = len(duplicate_new)
    bundledata["multiplebatch"] = multiple_batch.pk

    cbr_instance.set_cached_temporary_batches(batches, multiple_batch.id,
                                              session_key)

    #bundledata["objects"] = fifty_batches_for_first_page
    index_name = elasticsearch_client.get_temp_index_name(
        session_key, multiple_batch.id)
    elasticsearch_client.get_action_totals(index_name, bundledata)
    multiple_batch.uploaded_data = bundledata
    multiple_batch.save()