def save_multiple_batch(multiple_batch, creator_user, session_key):
    """Process an uploaded multiple batch in pages of 100 records, index the
    saved batches and clean up the temporary Elasticsearch index."""
    from cbh_chem_api.compounds import CBHCompoundUploadResource
    cbr_instance = CBHCompoundUploadResource()
    limit = 100
    offset = 0
    datasets = []
    # Build one (limit, offset) page per 100 records in the multiple batch.
    for run in range(0, int(math.ceil(float(multiple_batch.batch_count) / 100.0))):
        datasets.append(
            (multiple_batch, creator_user, session_key, limit, offset))
        offset += limit
    lists_of_batches = [process_batch_list(*ds) for ds in datasets]
    batches = [inner for outer in lists_of_batches for inner in outer]
    if multiple_batch.uploaded_file:
        cbr_instance.alter_batch_data_after_save(
            batches, multiple_batch.uploaded_file.file, multiple_batch)
    index_batches_in_new_index(batches)
    elasticsearch_client.delete_index(
        elasticsearch_client.get_temp_index_name(session_key, multiple_batch.id))
    cbr_instance.after_save_and_index_hook(multiple_batch.id,
                                           multiple_batch.project_id)
    return True

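# Illustrative only (not part of the original module): save_multiple_batch pages
# through the upload in chunks of 100. For example, with batch_count = 250 the
# loop above builds three (limit, offset) pages: (100, 0), (100, 100), (100, 200),
# since ceil(250 / 100.0) == 3, and process_batch_list is called once per page.
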
def get_part_processed_multiple_batch(self, request, **kwargs):
    """Get the part-processed data from Elasticsearch and the stats about the
    multiple batch."""
    # TODO: Uncached for now. Invalidation that works for everyone may be
    # impossible.
    bundle = self.build_bundle(request=request)
    session_key = request.COOKIES[settings.SESSION_COOKIE_NAME]
    # self.authorized_create_detail(self.get_object_list(bundle.request), bundle)
    if kwargs.get("multi_batch", None):
        mb = kwargs.get("multi_batch")
        id = mb.id
    else:
        id = request.GET.get("current_batch")
        mb = CBHCompoundMultipleBatch.objects.get(pk=id)
    task_id = request.session.get("mb_inprogress_%d" % mb.id, None)
    if task_id:
        res = result(task_id, wait=10)
        if isinstance(res, basestring):
            raise Exception(res)
    if not mb.uploaded_data:
        # The uploaded_data field will be set once the data is fully processed.
        return self.create_response(request, {},
                                    response_class=http.HttpAccepted)
    to_be_serialized = mb.uploaded_data
    to_be_serialized = self.get_cached_temporary_batch_data(
        id, request.GET, session_key, bundledata=to_be_serialized)
    index_name = elasticsearch_client.get_temp_index_name(session_key, id)
    elasticsearch_client.get_action_totals(index_name, to_be_serialized)
    return self.create_response(request, to_be_serialized)

def set_cached_temporary_batches(self, batches, multi_batch_id, session_key):
    """Index the new data when a new bulk upload is done."""
    batch_dicts = self.batches_to_es_ready(batches)
    index_name = elasticsearch_client.get_temp_index_name(
        session_key, multi_batch_id)
    elasticsearch_client.create_temporary_index(batch_dicts, index_name)

def update_temp_batches(self, request, **kwargs):
    """Update a set of molecules in Elasticsearch (used in ChemBio Hub to set
    the action field to "Ignore" or "New Batch")."""
    deserialized = self.deserialize(
        request, request.body,
        format=request.META.get('CONTENT_TYPE', 'application/json'))
    deserialized = self.alter_deserialized_detail_data(request, deserialized)
    bundle = self.build_bundle(data=dict_strip_unicode_keys(deserialized),
                               request=request)
    if bundle.obj.pk:
        self.authorized_update_detail(self.get_object_list(bundle.request),
                                      bundle)
    else:
        self.authorized_create_detail(self.get_object_list(bundle.request),
                                      bundle)
    multi_batch_id = bundle.data["multiplebatch"]
    es_ready_updates = bundle.data["objects"]
    index_name = elasticsearch_client.get_temp_index_name(
        request.COOKIES[settings.SESSION_COOKIE_NAME], multi_batch_id)
    elasticsearch_client.create_temporary_index(es_ready_updates, index_name)
    elasticsearch_client.get_action_totals(index_name, bundle.data)
    return self.create_response(request, bundle,
                                response_class=http.HttpAccepted)

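# Illustrative request body for update_temp_batches, inferred from the key
# lookups above ("multiplebatch" and "objects"); the field contents shown are
# assumptions rather than a documented contract:
# {
#     "multiplebatch": 42,
#     "objects": [
#         {"id": 1, "properties": {"action": "Ignore"}},
#         {"id": 2, "properties": {"action": "New Batch"}}
#     ]
# }
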
def multi_batch_custom_fields(self, request, **kwargs):
    """Change the structure column for an Excel file."""
    deserialized = self.deserialize(
        request, request.body,
        format=request.META.get('CONTENT_TYPE', 'application/json'))
    deserialized = self.alter_deserialized_detail_data(request, deserialized)
    bundle = self.build_bundle(data=dict_strip_unicode_keys(deserialized),
                               request=request)
    if bundle.obj.pk:
        self.authorized_update_detail(self.get_object_list(bundle.request),
                                      bundle)
    else:
        self.authorized_create_detail(self.get_object_list(bundle.request),
                                      bundle)
    id = bundle.data["multiplebatch"]
    headers = bundle.data["headers"]
    # structure_col = bundle.data.get("structure_col", None)
    mb = CBHCompoundMultipleBatch.objects.get(pk=id)
    processSmiles = False
    # if structure_col and structure_col != mb.uploaded_data.get("structure_col", ""):
    #     processSmiles = True
    index_name = elasticsearch_client.get_temp_index_name(
        request.COOKIES[settings.SESSION_COOKIE_NAME], mb.id)
    elasticsearch_client.get_action_totals(index_name, bundle.data)
    mb.uploaded_data = bundle.data
    mb.save()
    return self.create_response(request, bundle,
                                response_class=http.HttpAccepted)

def delete_index(self, request, **kwargs):
    """Delete the index that was created for a multiple batch."""
    deserialized = self.deserialize(
        request, request.body,
        format=request.META.get('CONTENT_TYPE', 'application/json'))
    deserialized = self.alter_deserialized_detail_data(request, deserialized)
    session_key = request.COOKIES[settings.SESSION_COOKIE_NAME]
    bundle = self.build_bundle(data=dict_strip_unicode_keys(deserialized),
                               request=request)
    if bundle.obj.pk:
        self.authorized_update_detail(self.get_object_list(bundle.request),
                                      bundle)
    else:
        self.authorized_create_detail(self.get_object_list(bundle.request),
                                      bundle)
    id = bundle.data["multiplebatch"]
    mb = CBHCompoundMultipleBatch.objects.get(pk=id)
    elasticsearch_client.delete_index(
        elasticsearch_client.get_temp_index_name(session_key, mb.id))
    return self.create_response(request, bundle,
                                response_class=http.HttpAccepted)

def get_cached_temporary_batch_data(self, multi_batch_id, get_data,
                                    session_key, bundledata={}):
    """Make the batch data into models so it can be serialized properly."""
    es_request = {
        "from": get_data.get("offset", 0),
        "size": get_data.get("limit", 50),
        "filter": json.loads(get_data.get("query", '{ "match_all" : {}}')),
        "sort": json.loads(get_data.get("sorts", '[{"id": {"order": "asc"}}]'))
    }
    index = elasticsearch_client.get_temp_index_name(
        session_key, multi_batch_id)
    bundledata = elasticsearch_client.get_from_temp_index(index, es_request,
                                                          bundledata)
    return bundledata

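# Illustrative only: get_cached_temporary_batch_data maps querystring parameters
# straight into an Elasticsearch request body. A GET such as
#   ?offset=0&limit=50&query={"term": {"properties.action": "New Batch"}}
# would, assuming that field path exists in the temporary index, produce roughly:
#   {"from": 0, "size": 50,
#    "filter": {"term": {"properties.action": "New Batch"}},
#    "sort": [{"id": {"order": "asc"}}]}
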
def validate_multi_batch(cbr_instance, multiple_batch, bundledata, session_key, batches):
    """Generate a set of statistics about a set of data that has been uploaded."""
    batches_not_errors = [
        batch for batch in batches
        if batch and not batch.warnings.get("parseerror", None)
        and not batch.warnings.get("smilesParseError", None)
    ]
    for b in batches_not_errors:
        b.properties["action"] = "New Batch"
    batches_with_structures = [
        batch for batch in batches_not_errors if batch.ctab]
    blinded_data = [batch for batch in batches_not_errors if not batch.ctab]
    sdfstrings = [batch.ctab for batch in batches_with_structures]
    sdf = "\n".join(sdfstrings)
    filename = "/tmp/" + shortuuid.ShortUUID().random()
    with open(filename, "w") as text_file:
        text_file.write(sdf)
    from subprocess import PIPE, Popen
    p = Popen([settings.INCHI_BINARIES_LOCATION['1.02'], "-STDIO", filename],
              stdout=PIPE, stderr=PIPE)
    a = p.communicate()
    inchis = {}
    # PB - there is an assumption here that everything that has a structure will
    # generate an InChI without issue. This is not the case. Where a molecule does
    # not generate an InChI, there will be a KeyError looking up the InChI in
    # inchiparts, as anything that cannot generate an InChI will be missing from
    # inchiparts, i.e. 50 structures with 1 error will have 49 entries in
    # inchiparts, and this will in turn bin the whole file - not great when we can
    # handle erroring structures elsewhere.
    error_locs = []
    # a[0] holds the generated InChIs; a[1] holds all of the error and warning
    # information (if any).
    errorparts = a[1].split("\nError")
    if len(errorparts) > 1:
        for i, errorp in enumerate(errorparts):
            # Split on 'structure #', then get the number given.
            if i > 0:
                splits = errorp.split('structure #')
                error_loc = splits[1].split('.')[0]
                # Convert to a number and put this number in an errors list.
                error_locs.append(error_loc)
    err_batches = []
    # For the errors found, remove from the non-error lists and flag as erroring.
    for error_no in error_locs:
        error_no_int = int(float(error_no)) - 1
        # Find the structure at the position indicated - 1 (for a 0-indexed list).
        err_batch = batches_with_structures[error_no_int]
        err_batches.append(err_batch)
    # We can't remove these while looping through error_locs as it messes up the
    # list order and gives index exceptions.
    for err_batch in err_batches:
        # Remove from batches_with_structures and batches_not_errors.
        batches_with_structures.remove(err_batch)
        batches_not_errors.remove(err_batch)
        # Flag this batch as erroring due to the inability to generate anything
        # for the standard_inchi_key field.
        batches_index = batches.index(err_batch)
        batches[batches_index].warnings["inchicreationerror"] = "true"
        batches[batches_index].properties["action"] = "Ignore"
    inchiparts = a[0].split("\nStructure:")
    for i, inch in enumerate(inchiparts):
        parts = inch.split("\n")
        if len(parts) == 1:
            continue
        ints = [s for s in parts[0].split() if s.isdigit()]
        part = "".join(ints)
        inchis[part] = parts[1]
    if not bundledata.get("fileerrors"):
        bundledata["fileerrors"] = []
    new_uploaded_data = []
    already_found = set([])
    duplicates = set([])
    for i, batch in enumerate(batches_with_structures):
        if str(i + 1) in error_locs:
            batch.standard_inchi = None
        else:
            batch.standard_inchi = inchis[str(i + 1)]
        batch.validate(temp_props=False)
        if batch.standard_inchi_key in already_found:
            # Setting this in case we change it later.
            duplicates.add(batch.standard_inchi_key)
        else:
            already_found.add(batch.standard_inchi_key)
        new_uploaded_data.append(batch)
    already_in_db = MoleculeDictionary.objects.filter(
        project=bundledata["project"],
        structure_type="MOL",
        structure_key__in=already_found).values_list("structure_key", flat=True)
    already_in_db = set(already_in_db)
    bundledata["new"] = 0
    new_data = set([])
    duplicate_overlaps = set([])
    duplicate_new = set([])
    for batch in batches_with_structures:
        if batch.standard_inchi_key in duplicates:
            batch.warnings["duplicate"] = True
        if batch.standard_inchi_key in already_in_db:
            batch.warnings["overlap"] = True
            if batch.standard_inchi_key in duplicates:
                batch.warnings["duplicate"] = True
                duplicate_overlaps.add(batch.standard_inchi_key)
        else:
            batch.warnings["new"] = True
            new_data.add(batch.standard_inchi_key)
            if batch.standard_inchi_key in duplicates:
                batch.warnings["duplicate"] = True
                duplicate_new.add(batch.standard_inchi_key)
    for batch in batches_with_structures:
        if batch.warnings.get("withoutstructure") == True:
            del batch.warnings["withoutstructure"]
    for batch in blinded_data:
        batch.warnings["withoutstructure"] = True
    bundledata["batchstats"] = {}
    bundledata["batchstats"]["withstructure"] = len(batches_with_structures)
    bundledata["batchstats"]["parseerrors"] = len(batches) - len(
        batches_not_errors) + len([
            b for b in batches_not_errors
            if b.warnings.get("parseerror", False) == "true"
        ])
    bundledata["batchstats"]["withoutstructure"] = len(blinded_data)
    bundledata["batchstats"]["total"] = len(batches)
    bundledata["compoundstats"] = {}
    bundledata["compoundstats"]["total"] = len(already_in_db) + len(new_data)
    bundledata["compoundstats"]["overlaps"] = len(already_in_db)
    bundledata["compoundstats"]["new"] = len(new_data)
    bundledata["compoundstats"]["duplicateoverlaps"] = len(duplicate_overlaps)
    bundledata["compoundstats"]["duplicatenew"] = len(duplicate_new)
    bundledata["multiplebatch"] = multiple_batch.pk
    cbr_instance.set_cached_temporary_batches(batches, multiple_batch.id,
                                              session_key)
    # bundledata["objects"] = fifty_batches_for_first_page
    index_name = elasticsearch_client.get_temp_index_name(
        session_key, multiple_batch.id)
    elasticsearch_client.get_action_totals(index_name, bundledata)
    multiple_batch.uploaded_data = bundledata
    multiple_batch.save()
