class Predict: def __init__(self, model, version=0, output_format=None, label=None): LOG.debug('Starting predict...') self.model = model self.version = version self.param = Parameters() self.conveyor = Conveyor() # identify the workflow type self.conveyor.setOrigin('apply') # load modelID success, result = utils.getModelID(model, version, 'model') if not success: LOG.critical(f'{result}. Aborting...') sys.exit() self.conveyor.addMeta('modelID', result) LOG.debug (f'Loaded model with modelID: {result}') # assign prediction label self.conveyor.addVal(label, 'prediction_label', 'prediction label', 'method', 'single', 'Label used to identify the prediction') success, results = self.param.loadYaml(model, version) if not success: LOG.critical(f'Unable to load model parameters. {results}. Aborting...') sys.exit() # add additional output formats included in the constructor # this is requiered to add JSON format as output when the object is # instantiated from a web service call, requiring this output if output_format != None: if output_format not in self.param.getVal('output_format'): self.param.appVal('output_format',output_format) if 'ghost' in output_format: self.param.setVal('output_similar', False) return def get_ensemble(self): ''' Returns a Boolean indicating if the model uses external input sources and a list with these sources ''' return self.param.getEnsemble() def set_single_CPU(self) -> None: ''' Forces the use of a single CPU ''' LOG.debug('parameter "numCPUs" forced to be 1') self.param.setVal('numCPUs',1) def run(self, input_source): ''' Executes a default predicton workflow ''' # path to endpoint endpoint = utils.model_path(self.model, self.version) # if not os.path.isdir(endpoint): # self.conveyor.setError(f'Unable to find model {self.model}, version {self.version}') # #LOG.error(f'Unable to find model {self.model}') # if not self.conveyor.getError(): # uses the child classes within the 'model' folder, # to allow customization of # the processing applied to each model modpath = utils.module_path(self.model, self.version) idata_child = importlib.import_module(modpath+".idata_child") apply_child = importlib.import_module(modpath+".apply_child") odata_child = importlib.import_module(modpath+".odata_child") # run idata object, in charge of generate model data from input try: idata = idata_child.IdataChild(self.param, self.conveyor, input_source) except: LOG.warning ('Idata child architecture mismatch, defaulting to Idata parent') idata = Idata(self.param, self.conveyor, input_source) idata.run() LOG.debug(f'idata child {type(idata).__name__} completed `run()`') if not self.conveyor.getError(): success, results = idata.preprocess_apply() if not success: self.conveyor.setError(results) if not self.conveyor.getError(): # make sure there is X data if not self.conveyor.isKey('xmatrix'): LOG.debug(f'Failed to compute MDs') self.conveyor.setError(f'Failed to compute MDs') # for secret models avoid searching similar compounds space_pkl = os.path.join(endpoint,'space.pkl') if not os.path.isfile(space_pkl): self.param.setVal('output_similar', False) if not self.conveyor.getError(): if self.param.getVal('output_similar') is True: from flame.sapply import Sapply metric = self.param.getVal('similarity_metric') numsel = self.param.getVal('similarity_cutoff_num') cutoff = self.param.getVal('similarity_cutoff_distance') # sapply = Sapply(self.param, self.conveyor) sapply_child = importlib.import_module(modpath+".sapply_child") # run apply object, in charge of generate a prediction from idata try: 
sapply = sapply_child.SapplyChild(self.param, self.conveyor) except: LOG.warning ('Sapply child architecture mismatch, defaulting to Sapply parent') sapply = Sapply(self.param, self.conveyor) sapply.run(cutoff, numsel, metric) LOG.debug(f'sapply child {type(sapply).__name__} completed `run()`') if not self.conveyor.getError(): # run apply object, in charge of generate a prediction from idata try: apply = apply_child.ApplyChild(self.param, self.conveyor) except: LOG.warning ('Apply child architecture mismatch, defaulting to Apply parent') apply = Apply(self.param, self.conveyor) apply.run() LOG.debug(f'apply child {type(apply).__name__} completed `run()`') # run odata object, in charge of formatting the prediction results # note that if any of the above steps failed, an error has been inserted in the # conveyor and odata will take case of showing an error message try: odata = odata_child.OdataChild(self.param, self.conveyor) except: LOG.warning ('Odata child architecture mismatch, defaulting to Odata parent') odata = Odata(self.param, self.conveyor) return odata.run()
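# --- Illustrative usage sketch (not part of the original module) ---------
# A minimal example of how the Predict workflow above is typically driven.
# The model name, version and input file are hypothetical placeholders and
# assume a model already built and stored in the local model repository.
def _example_predict_usage():
    predictor = Predict('MyModel', version=1, output_format='JSON')
    predictor.set_single_CPU()          # optional: avoid multiprocessing
    # run() delegates to idata -> (sapply) -> apply -> odata and returns
    # whatever odata.run() produces for the requested output formats
    return predictor.run('query_compounds.sdf')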
class Search: def __init__(self, space, version, output_format=None, label=None): LOG.debug('Starting search...') self.space = space self.version = version self.label = label self.param = Parameters() self.conveyor = Conveyor() # identify the workflow type self.conveyor.setOrigin('sapply') # load modelID path = utils.space_path(space, version) meta = os.path.join(path,'space-meta.pkl') try: with open(meta, 'rb') as handle: modelID = pickle.load(handle) except: LOG.critical(f'Unable to load modelID from {meta}. Aborting...') sys.exit() self.conveyor.addMeta('modelID', modelID) LOG.debug (f'Loaded space with modelID: {modelID}') # assign prediction (search) label self.conveyor.addVal(label, 'prediction_label', 'prediction label', 'method', 'single', 'Label used to identify the prediction') success, results = self.param.loadYaml(space, version, isSpace=True) if not success: LOG.critical(f'Unable to load space parameters. {results}. Aborting...') sys.exit() # add additional output formats included in the constructor # this is requiered to add JSON format as output when the object is # instantiated from a web service call, requiring this output if output_format != None: if output_format not in self.param.getVal('output_format'): self.param.appVal('output_format',output_format) return def set_single_CPU(self) -> None: ''' Forces the use of a single CPU ''' LOG.debug('parameter "numCPUs" forced to be 1') self.param.setVal('numCPUs',1) def getVal (self, idict, ikey): if not ikey in idict: return None return idict[ikey] # def run(self, input_source, runtime_param=None, metric=None, numsel=None, cutoff=None): def run(self, param_dict): ''' Executes a default predicton workflow ''' metric = None numsel = None cutoff = None # path to endpoint epd = utils.space_path(self.space, self.version) if not os.path.isdir(epd): LOG.error(f'Unable to find space {self.space}') self.conveyor.setError(f'Unable to find space {self.space}, version {self.version}') if self.getVal(param_dict,'smarts') is not None: input_source = param_dict['smarts'] self.param.setVal('input_type', 'smarts') elif self.getVal(param_dict,'infile') is not None: input_source = param_dict['infile'] else: LOG.error(f'Unable to find input_file') self.conveyor.setError('wrong format in the runtime similarity parameters') if 'runtime_param' in param_dict: runtime_param = self.getVal(param_dict, 'runtime_param') if runtime_param is not None: LOG.info (f'runtime parameters: {str(runtime_param)}') try: with open(runtime_param, 'r') as pfile: rtparam = yaml.safe_load(pfile) try: metric = rtparam['similarity_metric'] numsel = rtparam['similarity_cutoff_num'] cutoff = rtparam['similarity_cutoff_distance'] except: LOG.error('wrong format in the runtime similarity parameters') self.conveyor.setError('wrong format in the runtime similarity parameters') except: LOG.error('runtime similarity parameter file not found') self.conveyor.setError('runtime similarity parameter file not found') else: try: metric = param_dict['metric'] numsel = param_dict['numsel'] cutoff = param_dict['cutoff'] except: LOG.error('wrong format in the runtime similarity parameters') self.conveyor.setError('wrong format in the runtime similarity parameters') md = self.param.getVal('computeMD_method') if utils.isFingerprint(md) and len(md) > 1: LOG.warning(f'When using fingerprints, only a single type of MD can be used to build spaces. Selecting {md[0]}') self.conveyor.setWarning(f'When using fingerprints, only a single type of MD can be used to build spaces. 
Selecting {md[0]}') self.param.setVal('computeMD_method',[md[0]]) if not self.conveyor.getError(): # uses the child classes within the 'space' folder, # to allow customization of # the processing applied to each space modpath = utils.smodule_path(self.space, self.version) idata_child = importlib.import_module(modpath+".idata_child") sapply_child = importlib.import_module(modpath+".sapply_child") odata_child = importlib.import_module(modpath+".odata_child") # run idata object, in charge of generate space data from input try: idata = idata_child.IdataChild(self.param, self.conveyor, input_source) except: LOG.warning ('Idata child architecture mismatch, defaulting to Idata parent') idata = Idata(self.param, self.conveyor, input_source) idata.run() LOG.debug(f'idata child {type(idata).__name__} completed `run()`') if not self.conveyor.getError(): # make sure there is X data if not self.conveyor.isKey('xmatrix'): if not self.conveyor.isKey ('SMARTS'): LOG.debug(f'Failed to compute MDs') self.conveyor.setError(f'Failed to compute MDs') if not self.conveyor.getError(): # run apply object, in charge of generate a prediction from idata try: sapply = sapply_child.SapplyChild(self.param, self.conveyor) except: LOG.warning ('Sapply child architecture mismatch, defaulting to Sapply parent') sapply = Sapply(self.param, self.conveyor) sapply.run(cutoff, numsel, metric) LOG.debug(f'sapply child {type(sapply).__name__} completed `run()`') # run odata object, in charge of formatting the prediction results # note that if any of the above steps failed, an error has been inserted in the # conveyor and odata will take case of showing an error message try: odata = odata_child.OdataChild(self.param, self.conveyor) except: LOG.warning ('Odata child architecture mismatch, defaulting to Odata parent') odata = Odata(self.param, self.conveyor) return odata.run()
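# --- Illustrative usage sketch (not part of the original module) ---------
# Search.run() expects a dictionary rather than positional arguments. A
# minimal sketch, assuming a chemical space named 'MySpace' exists; the
# input file, metric name and cutoff values are hypothetical placeholders.
def _example_search_usage():
    searcher = Search('MySpace', version=1, output_format='JSON')
    query = {
        'infile': 'query_compounds.sdf',   # alternatively, a 'smarts' key
        'metric': 'euclidean',             # similarity metric
        'numsel': 10,                      # maximum number of hits
        'cutoff': None,                    # distance cutoff (None = no cutoff)
    }
    return searcher.run(query)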
class Documentation: ''' Class storing the information needed to documentate models Fields are loaded from a YAML file (documentation.yaml) ... Attributes ---------- fields : dict fields in the documentation version : int documentation version Methods ------- load_parameters() Accesses to param file to retrieve all information needed to document the model. load_results() Accesses to build results to retrieve all information needed to document the model. assign_parameters() Fill documentation values corresponding to model parameter values assign_results() Assign result values to documentation fields get_upf_template() creates a spreedsheet QMRF-like get_prediction_template() Creates a reporting document for predictions ''' def __init__(self, model, version=0, context='model'): ''' Load the fields from the documentation file''' self.model = model self.version = version self.fields = None self.parameters = Parameters() self.conveyor = None # obtain the path and the default name of the model documents documentation_file_path = utils.model_path(self.model, self.version) documentation_file_name = os.path.join(documentation_file_path, 'documentation.yaml') # load the main class dictionary (p) from this yaml file if not os.path.isfile(documentation_file_name): raise Exception('Documentation file not found') try: with open(documentation_file_name, 'r') as documentation_file: self.fields = yaml.safe_load(documentation_file) except Exception as e: # LOG.error(f'Error loading documentation file with exception: {e}') raise e success, message = self.parameters.loadYaml(model, version) if not success: print( f'Parameters could not be loaded. {message}. Please make sure endpoint and version are correct' ) return # Remove this after acc #self.load_parameters() if context == 'model': self.load_results() self.assign_parameters() self.assign_results() self.autocomplete_documentation() self.setVal('md5', self.idataHash()) def safe_copy(inputfile): ''' this function makes sure that the input file contains only printable chars ''' def delta(self, model, version, doc, iformat='YAML', isSpace=False): ''' load a set of parameters from the configuration file present at the model directory also, inserts the keys present in the param_file provided, assuming that it contains a YAML-compatible format, like the one generated by manage adds some parameters identifying the model and the hash of the configuration file ''' # input is a string, either in JSON or YAML format # this is the typical input sent by if iformat not in ['JSON', 'JSONS', 'YAML', 'YAMLS']: return False, 'input format not recognized' if iformat == 'JSONS': try: newp = json.loads(doc) except Exception as e: return False, str(e) elif iformat == 'YAMLS': try: newp = yaml.load(doc) except Exception as e: return False, str(e) # input is a file, either in YAML or JSON format else: try: with open(doc, 'r') as pfile: if iformat == 'YAML': newp = yaml.safe_load(pfile) elif iformat == 'JSON': newp = json.load(pfile) except Exception as e: return False, str(e) # update interna dict with keys in the input file (delta) black_list = [] for key in newp: if key not in black_list: val = newp[key] # YAML define null values as 'None, which are interpreted # as strings if val == 'None': val = None if isinstance(val, dict): for inner_key in val: inner_val = val[inner_key] if inner_val == 'None': inner_val = None self.setInnerVal(key, inner_key, inner_val) #print ('@delta: adding',key, inner_key, inner_val) else: self.setVal(key, val) #print ('@delta: 
adding',key,val,type(val)) # dump internal dict to the parameters file if isSpace: parameters_file_path = utils.space_path(model, version) else: parameters_file_path = utils.model_path(model, version) parameters_file_name = os.path.join(parameters_file_path, 'documentation.yaml') try: with open(parameters_file_name, 'w') as pfile: yaml.dump(self.fields, pfile) except Exception as e: return False, 'unable to write parameters' self.setVal('md5', self.idataHash()) return True, 'OK' def load_results(self): ''' Load results pickle with model information ''' # obtain the path and the default name of the results file results_file_path = utils.model_path(self.model, self.version) results_file_name = os.path.join(results_file_path, 'model-results.pkl') self.conveyor = Conveyor() # load the main class dictionary (p) from this yaml file if not os.path.isfile(results_file_name): raise Exception('Results file not found') try: with open(results_file_name, "rb") as input_file: self.conveyor.load(input_file) except Exception as e: # LOG.error(f'No valid results pickle found at: # {results_file_name}') raise e def getVal(self, key): ''' Return the value of the key parameter or None if it is not found in the parameters dictionary ''' if not key in self.fields: return None if 'value' in self.fields[key]: return self.fields[key]['value'] return None def getDict(self, key): ''' Return the value of the key parameter or None if it ises. not found in the parameters dictionary ''' d = {} if not key in self.fields: return d element = self.fields[key]['value'] if isinstance(element, dict): # iterate keys and copy to the temp dictionary # the key and the content of 'value' for k, v in element.items(): if 'value' in v: d[k] = v['value'] return d def setVal(self, key, value): ''' Sets the parameter defined by key to the given value ''' # for existing keys, replace the contents of 'value' if key in self.fields: if "value" in self.fields[key]: if not isinstance(self.fields[key]['value'], dict): self.fields[key]["value"] = value # this should never happen, since value is never a dictionary # else: # for k in value.keys(): # self.fields[key][k] = value[k] # this behaviour is deprecated, do not add new keys # # for new keys, create a new element with 'value' key # else: # self.fields[key] = {'value': value} def setInnerVal(self, okey, ikey, value): ''' Sets a parameter within an internal dictionary. The entry is defined by a key of the outer dictionary (okey) and a second key in the inner dicctionary (ikey). The paramenter will be set to the given value This function test the existence of all the keys and dictionaries to prevent crashes and returns without setting the value if any error is found ''' if not okey in self.fields: return if not "value" in self.fields[okey]: return odict = self.fields[okey]['value'] if not isinstance(odict, dict): return # now we are sure that odict is the right inner dictionary if not ikey in odict: return # algorithm parameters not present in the template if not isinstance(odict[ikey], dict): odict['value'] = value return # keys present in the template if "value" in odict[ikey]: odict[ikey]["value"] = value def appVal(self, key, value): ''' Appends value to the end of existing key list ''' if not key in self.fields: return if "value" in self.fields[key]: vt = self.fields[key]['value'] # if the key is already a list, append the new value at the end if isinstance(vt, list): self.fields[key]['value'].append(value) # ... 
otherwyse, create a list with the previous content and the # new value else: self.fields[key]['value'] = [vt, value] def dumpJSON(self): return json.dumps(self.fields, allow_nan=True) def dumpYAML(self): yaml_out = [] order = [ 'ID', 'Version', 'Model_title', 'Model_description', 'Keywords', 'Contact', 'Institution', 'Date', 'Endpoint', 'Endpoint_units', 'Interpretation', 'Dependent_variable', 'Species', 'Limits_applicability', 'Experimental_protocol', 'Model_availability', 'Data_info', 'Algorithm', 'Software', 'Descriptors', 'Algorithm_settings', 'AD_method', 'AD_parameters', 'Goodness_of_fit_statistics', 'Internal_validation_1', 'Internal_validation_2', 'External_validation', 'Comments', 'Other_related_models', 'Date_of_QMRF', 'Date_of_QMRF_updates', 'QMRF_updates', 'References', 'QMRF_same_models', 'Mechanistic_basis', 'Mechanistic_references', 'Supporting_information', 'Comment_on_the_endpoint', 'Endpoint_data_quality_and_variability', 'Descriptor_selection' ] for ik in order: if ik in self.fields: k = ik v = self.fields[k] ivalue = '' idescr = '' ioptio = '' ## newest parameter formats are extended and contain ## rich metainformation for each entry if 'value' in v: if not isinstance(v['value'], dict): ivalue = v['value'] else: # print header of dictionary yaml_out.append(f'{k} :') # iterate keys assuming existence of value and description for intk in v['value']: intv = v['value'][intk] if not isinstance(intv, dict): yaml_out.append( f' {intk:27} : {str(intv):30}' ) #{iioptio} {iidescr}') else: #print(intk) intv = v['value'][intk] iivalue = '' if "value" in intv: iivalue = intv["value"] # else: # iivalue = intv iidescr = '' if "description" in intv and intv[ "description"] is not None: iidescr = intv["description"] iioptio = '' if 'options' in intv: toptio = intv['options'] if isinstance(toptio, list): if toptio != [None]: iioptio = f' {toptio}' if isinstance(iivalue, float): iivalue = f'{iivalue:f}' elif iivalue is None: iivalue = '' yaml_out.append( f' {intk:27} : {str(iivalue):30} #{iioptio} {iidescr}' ) continue if 'description' in v: idescr = v['description'] if 'options' in v: toptio = v['options'] if isinstance(toptio, list): ioptio = f' {toptio}' yaml_out.append( f'{k:30} : {str(ivalue):30} #{ioptio} {idescr}') return (yaml_out) def dumpExcel(self, oname): # openpyxl should be installed in the environment # pip install openpyxl from openpyxl import Workbook from openpyxl.styles import Font, NamedStyle, Alignment # from openpyxl.comments import Comment wb = Workbook() ws = wb.active ws.title = f"Model {self.model} documentation" alignment_style = Alignment(vertical='top', wrapText=True) # Label Style Label = NamedStyle(name="Label") Label.font = Font(name='Calibri', size=11, bold=True) Label.alignment = alignment_style ws.column_dimensions['A'].width = 25.10 ws.column_dimensions['B'].width = 28.00 ws.column_dimensions['C'].width = 60.00 ws.column_dimensions['D'].width = 60.00 # sections of the document, specifying the document keys which will be listed sections = [ ('General model information', [ 'ID', 'Version', 'Model_title', 'Model_description', 'Keywords', 'Contact', 'Institution', 'Date', 'Endpoint', 'Endpoint_units', 'Interpretation', 'Dependent_variable', 'Species', 'Limits_applicability', 'Experimental_protocol', 'Model_availability', 'Data_info' ]), ('Algorithm and software', [ 'Algorithm', 'Software', 'Descriptors', 'Algorithm_settings', 'AD_method', 'AD_parameters', 'Goodness_of_fit_statistics', 'Internal_validation_1', 'Internal_validation_2', 'External_validation', 
'Comments' ]), ('Other information', [ 'Other_related_models', 'Date_of_QMRF', 'Date_of_QMRF_updates', 'QMRF_updates', 'References', 'QMRF_same_models', 'Mechanistic_basis', 'Mechanistic_references', 'Supporting_information', 'Comment_on_the_endpoint', 'Endpoint_data_quality_and_variability', 'Descriptor_selection' ]) ] #Save the position and name of the label for the first and last section position = [] name = [sections[0][1][0], 'Other Comments'] count = 1 for isection in sections: for ik in isection[1]: label_k = ik.replace('_', ' ') if label_k == 'Internal validation 2' or label_k == 'External validation': ws[f"A{count}"] = label_k ws[f'A{count}'].style = Label else: ws[f"B{count}"] = label_k ws[f"B{count}"].style = Label if ik in self.fields: # set defaults for value ivalue = '' #v is the selected entry in the documentation dictionary v = self.fields[ik] ## newest parameter formats are extended and contain ## rich metainformation for each entry if 'value' in v: ivalue = v['value'] if isinstance(ivalue, dict): ws[f"A{count}"] = label_k ws[f"A{count}"].style = Label end = (count) + (len(ivalue) - 1) for intk in ivalue: label_ik = intk.replace('_', ' ') # label_ik = intk.replace('_f', '').replace('_', ' ') ws[f'B{count}'] = label_ik ws[f'B{count}'].style = Label intv = ivalue[intk] if not isinstance(intv, dict): iivalue = intv if iivalue is None: iivalue = " " else: intv = ivalue[intk] iivalue = '' if 'value' in intv: iivalue = intv["value"] if iivalue is None: iivalue = '' ws[f'D{count}'] = intv['description'] ws[f'D{count}'].alignment = alignment_style ws[f'C{count}'] = f'{str(iivalue)}' ws[f'C{count}'].font = Font(name='Calibri', size=11, color='3465a4') ws[f'C{count}'].alignment = alignment_style ws.merge_cells(f'A{count}:A{end}') count += 1 else: ws[f'D{count}'] = v['description'] ws[f'D{count}'].alignment = alignment_style if label_k == 'Experimental protocol' or label_k == 'Comments': position.append(count) if ivalue is None: ivalue = '' ws[f'C{count}'] = f'{str(ivalue)}' ws[f'C{count}'].font = Font(name='Calibri', size=11, color='3465a4') ws[f'C{count}'].alignment = alignment_style count += 1 itr = 0 for i in position: if itr == 0: ws[f'A{1}'] = name[itr] ws[f"A{1}"].style = Label ws.merge_cells(f'A{1}:A{i}') else: ws[f'A{i}'] = name[itr] ws[f"A{i}"].style = Label ws.merge_cells(f'A{i}:A{count-1}') itr += 1 try: wb.save(oname) except: return False, f'error saving document as {oname}' return True, 'OK' def dumpWORD(self, oname): # python-docx should be installed in the environment # pip install python-docx from docx import Document from docx.shared import Pt from docx.shared import RGBColor # most of the formatting is included in this template, where we # redefined default styles for Normal, 'heading 1' and 'Table Grid' # # note that this template can be easily customized with a company # or project logo path = os.path.dirname(os.path.abspath(__file__)) path = os.path.join(path, 'children') path = os.path.join(path, 'documentation_template.docx') document = Document(path) # define style for normal and heading 1 # normal_style = document.styles['Normal'] # normal_font = normal_style.font # normal_font.name = 'Calibri' # normal_font.size = Pt(10) # heading_style = document.styles['heading 1'] # heading_font = heading_style.font # heading_font.name = 'Calibri' # heading_font.color.rgb = RGBColor(0x00, 0x00, 0x00) # heading_font.size = Pt(12) # withd of column 1 and 2 wcol1 = 1400000 wcol2 = 4200000 # withd of internal columns i and 2 wicol1 = 1200000 wicol2 = 2900000 # sections of 
the document, specifying the document keys which will be listed sections = [ ('General model information', [ 'ID', 'Version', 'Model_title', 'Model_description', 'Keywords', 'Contact', 'Institution', 'Date', 'Endpoint', 'Endpoint_units', 'Interpretation', 'Dependent_variable', 'Species', 'Limits_applicability', 'Experimental_protocol', 'Model_availability', 'Data_info' ]), ('Algorithm and software', [ 'Algorithm', 'Software', 'Descriptors', 'Algorithm_settings', 'AD_method', 'AD_parameters', 'Goodness_of_fit_statistics', 'Internal_validation_1', 'Internal_validation_2', 'External_validation', 'Comments' ]), ('Other information', [ 'Other_related_models', 'Date_of_QMRF', 'Date_of_QMRF_updates', 'QMRF_updates', 'References', 'QMRF_same_models', 'Mechanistic_basis', 'Mechanistic_references', 'Supporting_information', 'Comment_on_the_endpoint', 'Endpoint_data_quality_and_variability', 'Descriptor_selection' ]) ] for isection in sections: # heading with the section name document.add_heading(isection[0], level=1) # table with one row per key table = document.add_table(rows=len(isection[1]), cols=2) table.style = 'Table Grid' table.autofit = False count = 0 for ik in isection[1]: # add a row and format two columns row = table.rows[count] row.cells[0].width = wcol1 row.cells[1].width = wcol2 label_k = ik.replace('_', ' ') row.cells[0].text = f'{label_k}' count = count + 1 # define value if ik in self.fields: # set defaults for value ivalue = '' # v is the selected entry in the documentation dictionary v = self.fields[ik] ## newest parameter formats are extended and contain ## rich metainformation for each entry if 'value' in v: ivalue = v['value'] # if ivalue is a dictionary create a nested table and iterate # to represent the keys within if isinstance(ivalue, dict): row.cells[0].text = f'{label_k}' itable = row.cells[1].add_table(rows=len(ivalue), cols=2) itable.style = 'Table Grid' itable.autofit = False icount = 0 # iterate keys assuming existence of value and description for intk in ivalue: label_ik = intk.replace('_', ' ') # label_ik = intk.replace('_f', '').replace('_', ' ') irow = itable.rows[icount] irow.cells[0].width = wicol1 irow.cells[1].width = wicol2 icount = icount + 1 intv = ivalue[intk] if not isinstance(intv, dict): iivalue = intv else: intv = ivalue[intk] iivalue = '' if "value" in intv: iivalue = intv["value"] if isinstance(iivalue, float): iivalue = f'{iivalue:f}' elif iivalue is None: iivalue = '' irow.cells[0].text = f'{label_ik}' irow.cells[1].text = f'{str(iivalue)}' # if the key is not a dictionary just insert the value inside else: if ivalue is None: ivalue = '' row.cells[1].text = f'{str(ivalue)}' try: document.save(oname) except: return False, f'error saving document as {oname}' return True, 'OK' def assign_parameters(self): ''' Fill documentation values corresponding to model parameter values ''' if not self.parameters: raise ('Parameters were not loaded') # self.fields['Algorithm']['subfields']['algorithm']['value'] = \ # self.parameters.getVal('model') self.setInnerVal('Algorithm', 'algorithm', self.parameters.getVal('model')) if self.parameters.getVal('input_type') == 'molecule': self.setInnerVal('Algorithm', 'descriptors', self.parameters.getVal('computeMD_method')) cv_method = f'{self.parameters.getVal("ModelValidationCV")} ({str(self.parameters.getVal("ModelValidationN"))})' self.setInnerVal('Algorithm', 'cross-validation', cv_method) features = self.parameters.getVal("feature_selection") if features is not None: features += f' 
({self.parameters.getVal("feature_number")})' self.setInnerVal('Descriptors', 'descriptors', self.parameters.getVal('computeMD_method')) self.setInnerVal('Descriptors', 'scaling', self.parameters.getVal('modelAutoscaling')) self.setInnerVal('Descriptors', 'selection_method', features) elif self.parameters.getVal('input_type') == 'model_ensemble': self.setInnerVal('Descriptors', 'descriptors', 'ensemble models') if self.parameters.getVal('conformal'): self.setInnerVal('AD_method', 'name', 'conformal prediction') # self.setInnerVal('AD_parameters', 'confidence', f'{self.parameters.getVal("conformalConfidence")}') conformal_settings_dict = {} conformal_settings_dict['confidence'] = self.parameters.getVal( "conformalConfidence") conformal_settings = self.parameters.getVal('conformal_settings') if conformal_settings is not None: for key in conformal_settings: conformal_settings_dict[key] = conformal_settings[key][ "value"] self.fields['AD_parameters']['value'] = conformal_settings_dict def assign_results(self): ''' Assign result values to documentation fields ''' # Accepted validation keys # allowed = ['Conformal_accuracy', 'Conformal_mean_interval', # 'Conformal_coverage', 'Conformal_accuracy', # 'Q2', 'SDEP', # 'SensitivityPred', 'SpecificityPred', 'MCCpred'] # gof_allowed = ['R2', 'SDEC', 'scoringR' # 'Sensitivity', 'Specificity', 'MCC'] allowed = [ 'Conformal_accuracy', 'Conformal_mean_interval', 'Conformal_coverage', 'Q2', 'SDEP', 'scoringP', 'Sensitivity', 'Specificity', 'MCC' ] gof_allowed = [ 'Conformal_accuracy_f', 'Conformal_mean_interval_f', 'Conformal_coverage_f', 'R2', 'SDEC', 'scoringR', 'Sensitivity_f', 'Specificity_f', 'MCC_f' ] model_info = self.conveyor.getVal('model_build_info') validation = self.conveyor.getVal('model_valid_info') # print(model_info) # The code below to filter the hyperparameters to be # reported. 
# Get parameter keys for the used estimator #param_key = self.parameters.getVal('model') + '_parameters' # Get parameter dictionary #estimator_params = self.parameters.getDict(param_key) self.fields['Algorithm_settings']['value'] = \ (self.conveyor.getVal('estimator_parameters')) # print (self.conveyor.getVal('estimator_parameters')) # Horrendous patch to solve backcompatibility problem if 'subfields' in self.fields['Data_info']: sub_label = 'subfields' else: sub_label = 'value' self.fields['Data_info'][sub_label]['training_set_size']['value'] = \ model_info[0][2] self.fields['Descriptors'][sub_label]['final_number']['value'] = \ model_info[1][2] self.fields['Descriptors'][sub_label]['ratio']['value'] = \ '{:0.2f}'.format(model_info[1][2]/model_info[0][2]) internal_val = dict() for stat in validation: if stat[0] in allowed: internal_val[stat[0]] = float("{0:.2f}".format(stat[2])) if internal_val: self.fields['Internal_validation_1']\ ['value'] = internal_val gof = dict() for stat in validation: if stat[0] in gof_allowed: gof[stat[0]] = float("{0:.2f}".format(stat[2])) if gof: self.fields['Goodness_of_fit_statistics']\ ['value'] = gof def get_string(self, dictionary): ''' Convert a dictionary (from documentation.yaml) to string format for the model template ''' text = '' for key, val in dictionary.items(): text += f'{key} : {val["value"]}\n' return text def get_string2(self, dictionary): ''' Convert a dictionary (from parameter file) to string format for the model template ''' text = '' for key, val in dictionary.items(): try: if isinstance(str(val), str): text += f'{key} : {val}\n' except: continue return text def get_upf_template(self): ''' This function creates a tabular model template based on the QMRF document type ''' template = pd.DataFrame() template['ID'] = [''] template['Version'] = [''] template['Description'] = [''] template['Contact'] = [''] template['Institution'] = [''] template['Date'] = [''] template['Endpoint'] = [''] template['Endpoint_units'] = [''] template['Dependent_variable'] = [''] template['Species'] = [''] template['Limits_applicability'] = [''] template['Experimental_protocol'] = [''] template['Data_info'] = [ self.get_string(self.fields['Data_info']['subfields']) ] template['Model_availability'] = [\ self.get_string(self.fields['Model_availability'] ['subfields'])] template['Algorithm'] = [ self.get_string(self.fields['Algorithm']['subfields']) ] template['Software'] = [ self.get_string(self.fields['Software']['subfields']) ] template['Descriptors'] = [ self.get_string(self.fields['Descriptors']['subfields']) ] template['Algorithm_settings'] = [ self.get_string(self.fields['Algorithm_settings']['subfields']) ] template['AD_method'] = [ self.get_string(self.fields['AD_method']['subfields']) ] template['AD_parameters'] = [self.fields['AD_parameters']['value']] template['Goodness_of_fit_statistics'] = [self.fields\ ['Goodness_of_fit_statistics']['value']] template['Internal_validation_1'] = [ self.fields['Internal_validation_1']['value'] ] template.to_csv('QMRF_template.tsv', sep='\t') def get_upf_template2(self): ''' This function creates a tabular model template based on the QMRF document type ''' fields = ['ID', 'Version', 'Contact', 'Institution',\ 'Date', 'Endpoint', 'Endpoint_units', 'Dependent_variable', 'Species',\ 'Limits_applicability', 'Experimental_protocol', 'Data_info',\ 'Model_availability', 'Algorithm', 'Software', 'Descriptors',\ 'Algorithm_settings', 'AD_method', 'AD_parameters',\ 'Goodness_of_fit_statistics', 'Internal_validation_1' ] template = 
pd.DataFrame( columns=['Field', 'Parameter name', 'Parameter value']) for field in fields: try: subfields = self.fields[field]['subfields'] except: subfields = self.fields[field]['value'] if subfields is not None: for index, subfield in enumerate(subfields): field2 = '' if index == 0: field2 = field else: field2 = "" value = str(subfields[subfield]['value']) # None types are retrieved as str from yaml?? if value == "None": value = "" row = dict(zip(['Field', 'Parameter name', 'Parameter value'],\ [field2, subfield, value])) template = template.append(row, ignore_index=True) else: value = str(self.fields[field]['value']) if value == 'None': value = "" row = dict(zip(['Field', 'Parameter name', 'Parameter value'],\ [field, "", value])) template = template.append(row, ignore_index=True) template.to_csv('QMRF_template3.tsv', sep='\t', index=False) def get_prediction_template(self): ''' This function creates a tabular model template based on the QMRF document type ''' # obtain the path and the default name of the results file results_file_path = utils.model_path(self.model, self.version) results_file_name = os.path.join(results_file_path, 'prediction-results.pkl') conveyor = Conveyor() # load the main class dictionary (p) from this yaml file if not os.path.isfile(results_file_name): raise Exception('Results file not found') try: with open(results_file_name, "rb") as input_file: conveyor.load(input_file) except Exception as e: # LOG.error(f'No valid results pickle found at: {results_file_name}') raise e # First get Name, Inchi and InChIkey names = conveyor.getVal('obj_nam') smiles = conveyor.getVal('SMILES') inchi = [AllChem.MolToInchi(AllChem.MolFromSmiles(m)) for m in smiles] inchikeys = [ AllChem.InchiToInchiKey( AllChem.MolToInchi(AllChem.MolFromSmiles(m))) for m in smiles ] predictions = [] applicability = [] if self.parameters['quantitative']['value']: raise ('Prediction template for quantitative endpoints' ' not implemented yet') if not self.parameters['conformal']['value']: predictions = conveyor.getVal('values') else: c0 = np.asarray(conveyor.getVal('c0')) c1 = np.asarray(conveyor.getVal('c1')) predictions = [] for i, j in zip(c0, c1): prediction = '' if i == j: prediction = 'out of AD' applicability.append('out') if i != j: if i == True: prediction = 'Inactive' else: prediction = 'Active' applicability.append('in') predictions.append(prediction) # Now create the spreedsheats for prediction # First write summary summary = ("Study name\n" + "Endpoint\n" + "QMRF-ID\n" + "(Target)Compounds\n" + "Compounds[compounds]\tName\tInChiKey\n") for name, inch in zip(names, inchikeys): summary += f'\t{name}\t{inch}\n' summary += ("\nFile\n" + "Author name\n" + "E-mail\n" + "Role\n" + "Affiliation\n" + "Date\n") with open('summary_document.tsv', 'w') as out: out.write(summary) # Now prediction details # Pandas is used to ease the table creation. 
reporting = pd.DataFrame() reporting['InChI'] = inchi reporting['CAS-RN'] = '-' reporting['SMILES'] = smiles reporting['prediction'] = predictions reporting['Applicability_domain'] = applicability reporting['reliability'] = '-' reporting['Structural_analogue_1_CAS'] = '-' reporting['Structural_analogue_1_smiles'] = '-' reporting['Structural_analogue_1_source'] = '-' reporting['Structural_analogue_1_experimental_value'] = '-' reporting['Structural_analogue_2_CAS'] = '-' reporting['Structural_analogue_2_smiles'] = '-' reporting['Structural_analogue_2_source'] = '-' reporting['Structural_analogue_2_experimental_value'] = '-' reporting['Structural_analogue_3_CAS'] = '-' reporting['Structural_analogue_3_smiles'] = '-' reporting['Structural_analogue_3_source'] = '-' reporting['Structural_analogue_3_experimental_value'] = '-' reporting.to_csv('prediction_report.tsv', sep='\t', index=False) def idataHash(self): ''' Create a md5 hash for a number of keys describing parameters relevant for idata This hash is compared between runs, to check wether idata must recompute or not the MD ''' # update with any new idata relevant parameter keylist = [ 'SDFile_name', 'SDFile_activity', 'SDFile_experimental', 'normalize_method', 'ionize_method', 'convert3D_method', 'computeMD_method', 'TSV_objnames', 'TSV_activity', 'input_type' ] idata_params = [] for i in keylist: idata_params.append(self.getVal(i)) # MD_settings is a dictionary, obtain and sort the keys+values md_params = self.getDict('MD_settings') md_list = [] for key in md_params: # combine key + value in a single string md_list.append(key + str(md_params[key])) idata_params.append(md_list.sort()) # use picke as a buffered object, neccesary to generate the hexdigest p = pickle.dumps(idata_params) return hashlib.md5(p).hexdigest() def empty_fields(self): ''' This function checks which fields do not contain values ''' emptyfields = [] for ik in self.fields: v = self.fields[ik] if 'value' in v: ivalue = v['value'] if isinstance(ivalue, dict): for intk in ivalue: intv = ivalue[intk] if not isinstance(intv, dict): iivalue = intv if iivalue is None or len(str(iivalue)) is 0: emptyfields.append(intk) else: intv = ivalue[intk] iivalue = '' if intv["value"] is None or len(str( intv["value"])) is 0: emptyfields.append(intk) else: if ivalue is None or len(str(ivalue)) is 0: emptyfields.append(ik) return emptyfields def get_mols(self): return dict( zip(self.conveyor.getVal("obj_nam"), self.conveyor.getVal("SMILES"))) def autocomplete_documentation(self): """ Auto complete fields in model documentation """ #ID, Model identifier. self.fields['ID']['value'] = utils.getModelID(self.model, self.version, 'model')[1] #Version self.fields['Version']['value'] = str(self.version) #Date, Date of model development and Date of QMRF. today = date.today().strftime("%B %d, %Y") self.fields['Date']['value'] = today self.fields['Date_of_QMRF']['value'] = today #format, Format used(SDF,TSV) if self.parameters.getVal('input_type') == 'data': self.fields['Data_info']['value']['format']['value'] = 'TSV' else: self.fields['Data_info']['value']['format']['value'] = 'SDF' #Algorithm, type: QSAR. self.fields['Algorithm']['value']['type']['value'] = 'QSAR' #Model, Main modelling program, version, description and license. 
software = "Flame, 1.0rc3" fieldsapplysoftware = ['model', 'descriptors', 'applicability_domain'] for field in fieldsapplysoftware: if field == 'applicability_domain': if self.parameters.getVal('conformal'): self.fields['Software']['value'][field]['value'] = software else: self.fields['Software']['value'][field]['value'] = software
class Build:

    def __init__(self, model, param_file=None, param_string=None, output_format=None):

        LOG.debug('Starting build...')
        self.model = model
        self.param = Parameters()
        self.conveyor = Conveyor()

        # load parameters
        if param_file is not None:
            # use the param_file to update existing parameters at the model
            # directory and save changes to make them persistent
            success, message = self.param.delta(model, 0, param_file, iformat='YAML')

        elif param_string is not None:
            success, message = self.param.delta(model, 0, param_string, iformat='JSONS')

        else:
            # load parameter file at the model directory
            success, message = self.param.loadYaml(model, 0)

        # being unable to load parameters is a critical error
        if not success:
            LOG.critical(f'Unable to load model parameters. "{message}" Aborting...')
            sys.exit(1)

        # add additional output formats included in the constructor
        # this is required to add JSON format as output when the object is
        # instantiated from a web service call, requiring this output
        if output_format is not None:
            if output_format not in self.param.getVal('output_format'):
                self.param.appVal('output_format', output_format)

    def get_ensemble(self):
        ''' Returns a Boolean indicating if the model uses external input
            sources and a list with these sources '''
        return self.param.getEnsemble()

    def set_single_CPU(self) -> None:
        ''' Forces the use of a single CPU '''
        LOG.debug('parameter "numCPUs" forced to be 1')
        self.param.setVal('numCPUs', 1)

    def run(self, input_source):
        ''' Executes a default model building workflow '''

        # path to endpoint
        epd = utils.model_path(self.model, 0)
        if not os.path.isdir(epd):
            self.conveyor.setError(f'Unable to find model {self.model}')
            #LOG.error(f'Unable to find model {self.model}')

        # import child classes
        if not self.conveyor.getError():
            # uses the child classes within the 'model' folder,
            # to allow customization of the processing applied to each model
            modpath = utils.module_path(self.model, 0)

            idata_child = importlib.import_module(modpath + ".idata_child")
            learn_child = importlib.import_module(modpath + ".learn_child")
            odata_child = importlib.import_module(modpath + ".odata_child")

            # run idata object, in charge of generating model data from input
            try:
                idata = idata_child.IdataChild(self.param, self.conveyor, input_source)
            except:
                LOG.warning('Idata child architecture mismatch, defaulting to Idata parent')
                idata = Idata(self.param, self.conveyor, input_source)

            idata.run()
            LOG.debug(f'idata child {type(idata).__name__} completed `run()`')

        if not self.conveyor.getError():
            # check there is a suitable X and Y
            if not self.conveyor.isKey('xmatrix'):
                self.conveyor.setError(f'Failed to compute MDs')

            if not self.conveyor.isKey('ymatrix'):
                self.conveyor.setError(f'No activity data (Y) found in training series')

        if not self.conveyor.getError():
            # instantiate learn (build a model from idata) and run it
            try:
                learn = learn_child.LearnChild(self.param, self.conveyor)
            except:
                LOG.warning('Learn child architecture mismatch, defaulting to Learn parent')
                learn = Learn(self.param, self.conveyor)

            learn.run()
            LOG.debug(f'learn child {type(learn).__name__} completed `run()`')

        # run odata object, in charge of formatting the prediction results
        # note that if any of the above steps failed, an error has been inserted in the
        # conveyor and odata will take care of showing an error message
        try:
            odata = odata_child.OdataChild(self.param, self.conveyor)
        except:
            LOG.warning('Odata child architecture mismatch, defaulting to Odata parent')
            odata = Odata(self.param, self.conveyor)

        return odata.run()
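# --- Illustrative usage sketch (not part of the original module) ---------
# A minimal sketch of a model building run, assuming a model directory
# named 'MyModel' already exists in the local model repository. The
# training file name is a hypothetical placeholder.
def _example_build_usage():
    builder = Build('MyModel', param_file=None, output_format='JSON')
    return builder.run('training_series.sdf')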
class Predict:

    def __init__(self, model, version=0, output_format=None, label=None):

        LOG.debug('Starting predict...')
        self.model = model
        self.version = version
        self.param = Parameters()
        self.conveyor = Conveyor()

        self.conveyor.addVal(label, 'prediction_label', 'prediction label',
                             'method', 'single',
                             'Label used to identify the prediction')

        success, results = self.param.loadYaml(model, version)
        if not success:
            LOG.critical('Unable to load model parameters. Aborting...')
            sys.exit()

        # add additional output formats included in the constructor
        # this is required to add JSON format as output when the object is
        # instantiated from a web service call, requiring this output
        if output_format is not None:
            if output_format not in self.param.getVal('output_format'):
                self.param.appVal('output_format', output_format)

        return

    def get_ensemble(self):
        ''' Returns a Boolean indicating if the model uses external input
            sources and a list with these sources '''
        return self.param.getEnsemble()

    def set_single_CPU(self) -> None:
        ''' Forces the use of a single CPU '''
        LOG.debug('parameter "numCPUs" forced to be 1')
        self.param.setVal('numCPUs', 1)

    def run(self, input_source):
        ''' Executes a default prediction workflow '''

        # path to endpoint
        endpoint = utils.model_path(self.model, self.version)
        if not os.path.isdir(endpoint):
            self.conveyor.setError(f'Unable to find model {self.model}, version {self.version}')
            #LOG.error(f'Unable to find model {self.model}')

        if not self.conveyor.getError():
            # uses the child classes within the 'model' folder,
            # to allow customization of the processing applied to each model
            modpath = utils.module_path(self.model, self.version)

            idata_child = importlib.import_module(modpath + ".idata_child")
            apply_child = importlib.import_module(modpath + ".apply_child")
            odata_child = importlib.import_module(modpath + ".odata_child")

            # run idata object, in charge of generating model data from input
            try:
                idata = idata_child.IdataChild(self.param, self.conveyor, input_source)
            except:
                LOG.warning('Idata child architecture mismatch, defaulting to Idata parent')
                idata = Idata(self.param, self.conveyor, input_source)

            idata.run()
            LOG.debug(f'idata child {type(idata).__name__} completed `run()`')

        if not self.conveyor.getError():
            # make sure there is X data
            if not self.conveyor.isKey('xmatrix'):
                LOG.debug(f'Failed to compute MDs')
                self.conveyor.setError(f'Failed to compute MDs')

        if not self.conveyor.getError():
            # run apply object, in charge of generating a prediction from idata
            try:
                apply = apply_child.ApplyChild(self.param, self.conveyor)
            except:
                LOG.warning('Apply child architecture mismatch, defaulting to Apply parent')
                apply = Apply(self.param, self.conveyor)

            apply.run()
            LOG.debug(f'apply child {type(apply).__name__} completed `run()`')

        # run odata object, in charge of formatting the prediction results
        # note that if any of the above steps failed, an error has been inserted in the
        # conveyor and odata will take care of showing an error message
        try:
            odata = odata_child.OdataChild(self.param, self.conveyor)
        except:
            LOG.warning('Odata child architecture mismatch, defaulting to Odata parent')
            odata = Odata(self.param, self.conveyor)

        return odata.run()
class Sbuild: def __init__(self, space, param_file=None, param_string=None, output_format=None): LOG.debug('Starting sbuild...') self.space = space self.param = Parameters() self.conveyor = Conveyor() # identify the workflow type self.conveyor.setOrigin('slearn') # generate a unique modelID self.conveyor.addMeta('modelID', utils.id_generator()) LOG.debug( f'Generated new space with modelID: {self.conveyor.getMeta("modelID")}' ) # load parameters if param_file is not None: # use the param_file to update existing parameters at the space # directory and save changes to make them persistent success, message = self.param.delta(space, 0, param_file, iformat='YAML', isSpace=True) elif param_string is not None: success, message = self.param.delta(space, 0, param_string, iformat='JSONS', isSpace=True) else: # load parameter file at the space directory success, message = self.param.loadYaml(space, 0, isSpace=True) # being unable to load parameters is a critical error if not success: LOG.critical( f'Unable to load space parameters. {message}. Aborting...') sys.exit(1) md = self.param.getVal('computeMD_method') if utils.isFingerprint(md) and len(md) > 1: LOG.warning( f'When using fingerprints, only a single type of MD can be used to build spaces. Selecting {md[0]}' ) self.conveyor.setWarning( f'When using fingerprints, only a single type of MD can be used to build spaces. Selecting {md[0]}' ) self.param.setVal('computeMD_method', [md[0]]) # add additional output formats included in the constructor # this is requiered to add JSON format as output when the object is # instantiated from a web service call, requiring this output if output_format is not None: if output_format not in self.param.getVal('output_format'): self.param.appVal('output_format', output_format) def set_single_CPU(self) -> None: ''' Forces the use of a single CPU ''' LOG.debug('parameter "numCPUs" forced to be 1') self.param.setVal('numCPUs', 1) def run(self, input_source): ''' Executes a default chemical space building workflow ''' # path to endpoint epd = utils.space_path(self.space, 0) if not os.path.isdir(epd): self.conveyor.setError(f'Unable to find space {self.space}') #LOG.error(f'Unable to find space {self.space}') # import ichild classes if not self.conveyor.getError(): # uses the child classes within the 'space' folder, # to allow customization of the processing applied to each space modpath = utils.smodule_path(self.space, 0) idata_child = importlib.import_module(modpath + ".idata_child") slearn_child = importlib.import_module(modpath + ".slearn_child") odata_child = importlib.import_module(modpath + ".odata_child") # run idata object, in charge of generate space data from input try: idata = idata_child.IdataChild(self.param, self.conveyor, input_source) except: LOG.warning( 'Idata child architecture mismatch, defaulting to Idata parent' ) idata = Idata(self.param, self.conveyor, input_source) idata.run() LOG.debug(f'idata child {type(idata).__name__} completed `run()`') if not self.conveyor.getError(): success, results = idata.preprocess_create() if not success: self.conveyor.setError(results) if not self.conveyor.getError(): # check there is a suitable X and Y if not self.conveyor.isKey('xmatrix'): self.conveyor.setError(f'Failed to compute MDs') if not self.conveyor.getError(): # instantiate learn (build a space from idata) and run it try: slearn = slearn_child.SlearnChild(self.param, self.conveyor) except: LOG.warning( 'Slearn child architecture mismatch, defaulting to Learn parent' ) slearn = Slearn(self.param, 
self.conveyor) slearn.run() LOG.debug( f'slearn child {type(slearn).__name__} completed `run()`') # run odata object, in charge of formatting the prediction results # note that if any of the above steps failed, an error has been inserted in the # conveyor and odata will take case of showing an error message try: odata = odata_child.OdataChild(self.param, self.conveyor) except: LOG.warning( 'Odata child architecture mismatch, defaulting to Odata parent' ) odata = Odata(self.param, self.conveyor) return odata.run()
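# --- Illustrative usage sketch (not part of the original module) ---------
# A minimal sketch of building a chemical space with Sbuild, assuming a
# space named 'MySpace' already exists in the repository; the input file
# name is a hypothetical placeholder.
def _example_sbuild_usage():
    sbuilder = Sbuild('MySpace', param_file=None)
    return sbuilder.run('space_compounds.sdf')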
def action_refresh(model=None, version=None, GUI=False):
    '''
    Rebuild one or many models making use of existing parameter files and
    locally stored training series.
    '''
    import flame.context as context
    from flame.parameters import Parameters
    # from flame.documentation import Documentation
    import logging

    if GUI:
        token_file = os.path.join(tempfile.gettempdir(), 'refreshing_' + model)
        # update token file with content 'working'
        with open(token_file, 'w') as f:
            f.write('Analyzing and sorting models...')

    # list endpoints relevant for the arguments
    if model is not None:
        model_list = [model]
    else:
        model_root = pathlib.Path(utils.model_repository_path())
        model_list = [x.stem for x in model_root.iterdir() if x.is_dir()]

    # list versions relevant for the arguments
    task_list = []
    for imodel in model_list:
        if version is not None:
            task_list.append((imodel, version))
        else:
            model_root = pathlib.Path(utils.model_tree_path(imodel))
            itask_list = [(imodel, utils.modeldir2ver(x.stem)) for x in model_root.iterdir() if x.is_dir()]
            task_list += itask_list   # use "+=" and not "append" to merge the new list with the old one

    # analyze task_list and add the ensemble models at the end
    # this is needed to have low models refreshed BEFORE refreshing the high models,
    # eliminating the need to refresh them recursively
    LOG.info("Analyzing and sorting models...")

    # make sure the lower models are in task_list and, if not, force their inclusion
    for itask in task_list:
        param = Parameters()
        success, results = param.loadYaml(itask[0], itask[1])
        if not success:
            continue

        if param.getVal('input_type') == 'model_ensemble':
            ens_nams = param.getVal('ensemble_names')
            ens_vers = param.getVal('ensemble_versions')
            for i in range(len(ens_nams)):
                iver = 0
                inam = ens_nams[i]
                if (i < len(ens_vers)):
                    iver = ens_vers[i]
                if ((inam, iver)) not in task_list:
                    task_list.append((inam, iver))

    # create separate lists for regular and ensemble models
    # and add ensemble models at the end
    # this needs to be carried out after the previous step because
    # some of the lower level models could be an ensemble model itself
    mol_list = []
    ens_list = []
    for itask in task_list:
        param = Parameters()
        success, results = param.loadYaml(itask[0], itask[1])
        if not success:
            mol_list.append(itask)
            continue

        if param.getVal('input_type') == 'model_ensemble':
            ens_list.append(itask)
        else:
            mol_list.append(itask)

    task_list = mol_list + ens_list

    # show all models before starting
    LOG.info("Starting model refreshing task for the following models and versions")
    for itask in task_list:
        LOG.info(f'   model: {itask[0]} version: {itask[1]}')
    LOG.info("This can take some time, please be patient...")

    source_dir = os.path.dirname(os.path.abspath(__file__))
    children_dir = os.path.join(source_dir, 'children')
    master_parameters = os.path.join(children_dir, 'parameters.yaml')
    master_documentation = os.path.join(children_dir, 'documentation.yaml')

    # now send the build command for each task
    for itask in task_list:

        destinat_path = utils.model_path(itask[0], 0)   # dev

        if itask[1] != 0:
            # move version to /dev for building
            original_path = utils.model_path(itask[0], itask[1])   # veri
            security_path = destinat_path + '_security'            # dev_sec
            shutil.move(destinat_path, security_path)   # dev --> dev_sec
            shutil.move(original_path, destinat_path)   # veri --> dev

        LOG.info(f'   refreshing model: {itask[0]} version: {itask[1]} '
                 f'({task_list.index(itask)+1} of {len(task_list)})...')

        if GUI:
            with open(token_file, 'w') as f:
                f.write(f'model: {itask[0]} version: {itask[1]} '
                        f'({task_list.index(itask)+1} of {len(task_list)})')

        # disable LOG output
        logging.disable(logging.ERROR)

        # update parameters
        dump_parameters = os.path.join(destinat_path, 'parameters_dump.yaml')
        success, param = action_parameters(itask[0], 0, oformat='bin')
        if success:
            param_yaml = param.dumpYAML()
            with open(dump_parameters, 'w') as f:
                for line in param_yaml:
                    f.write(line + '\n')
        else:
            LOG.info(f'   ERROR: unable to merge parameters for model: {itask[0]} version: {itask[1]}')
            dump_parameters = None

        original_parameters = os.path.join(destinat_path, 'parameters.yaml')
        shutil.copy(master_parameters, original_parameters)

        # update documentation
        dump_documentation = os.path.join(destinat_path, 'documentation_dump.yaml')
        success, documentation = action_documentation(itask[0], 0, doc_file=None, oformat='bin')

        original_documentation = os.path.join(destinat_path, 'documentation.yaml')
        shutil.copy(master_documentation, original_documentation)

        if success:
            documentation_yaml = documentation.dumpYAML()
            with open(dump_documentation, 'w') as f:
                for line in documentation_yaml:
                    line = line.encode("ascii", "ignore")
                    line = line.decode("ascii", "ignore")
                    f.write(line + '\n')
            s2, documentation = action_documentation(itask[0], 0, doc_file=None, oformat='bin')
            s3, r3 = documentation.delta(itask[0], 0, dump_documentation)
        else:
            LOG.info(f'   ERROR: unable to merge documentation for model: {itask[0]} version: {itask[1]}')

        # rebuild the model
        command_build = {
            'endpoint': itask[0],
            'infile': None,
            'param_file': dump_parameters,
            'incremental': False
        }
        success, results = context.build_cmd(command_build)

        # enable LOG output
        logging.disable(logging.NOTSET)

        if itask[1] != 0:
            shutil.move(destinat_path, original_path)   # dev --> veri
            shutil.move(security_path, destinat_path)   # dev_sec --> dev

        if not success:
            LOG.error(results)

    LOG.info("Model refreshing task finished")

    if GUI:
        # update token file with status 'ready'
        with open(token_file, 'w') as f:
            f.write('ready')

    return True, 'OK'
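# --- Illustrative usage sketch (not part of the original module) ---------
# Refreshing (rebuilding) a single model version; with model=None every
# model found in the repository would be queued instead. The model name
# and version below are hypothetical placeholders.
def _example_refresh_usage():
    success, message = action_refresh(model='MyModel', version=1, GUI=False)
    return success, message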
class Documentation:
    ''' Class storing the information needed to document models

    Fields are loaded from a YAML file (documentation.yaml)

    ...

    Attributes
    ----------
    fields : dict
        fields in the documentation
    version : int
        documentation version

    Methods
    -------
    load_parameters()
        Accesses the parameter file to retrieve all information needed
        to document the model.
    load_results()
        Accesses the build results to retrieve all information needed
        to document the model.
    assign_parameters()
        Fills documentation values corresponding to model parameter values
    assign_results()
        Assigns result values to documentation fields
    get_upf_template()
        Creates a QMRF-like spreadsheet
    get_prediction_template()
        Creates a reporting document for predictions
    '''

    def __init__(self, model, version=0, context='model'):
        ''' Load the fields from the documentation file '''

        self.model = model
        self.version = version
        self.fields = None
        self.parameters = Parameters()
        self.conveyor = None

        # obtain the path and the default name of the model documents
        documentation_file_path = utils.model_path(self.model, self.version)
        documentation_file_name = os.path.join(documentation_file_path,
                                               'documentation.yaml')

        # load the main class dictionary (p) from this yaml file
        if not os.path.isfile(documentation_file_name):
            raise Exception('Documentation file not found')

        try:
            with open(documentation_file_name, 'r') as documentation_file:
                self.fields = yaml.safe_load(documentation_file)
        except Exception as e:
            # LOG.error(f'Error loading documentation file with exception: {e}')
            raise e

        success, message = self.parameters.loadYaml(model, 0)
        if not success:
            print('Parameters could not be loaded. Please make sure the endpoint is correct')
            return

        # Remove this after acc
        # self.load_parameters()
        if context == 'model':
            self.load_results()
            self.assign_parameters()
            self.assign_results()
            self.setVal('md5', self.idataHash())

    def delta(self, model, version, doc, iformat='YAML', isSpace=False):
        ''' Load a set of parameters from the configuration file present
            at the model directory.

            Also inserts the keys present in the file provided (doc),
            assuming it contains a YAML-compatible format, like the one
            generated by manage.

            Adds some parameters identifying the model and the hash of
            the configuration file.
        '''

        # if not self.loadYaml(model, version, isSpace):
        #     return False, 'file not found'

        # parse parameter file assuming it will be in
        # a YAML-compatible format
        if iformat == 'JSONS':
            try:
                newp = json.loads(doc)
            except Exception as e:
                return False, e
        else:
            try:
                with open(doc, 'r') as pfile:
                    if iformat == 'YAML':
                        newp = yaml.safe_load(pfile)
                    elif iformat == 'JSON':
                        newp = json.load(pfile)
            except Exception as e:
                return False, e

        # update internal dict with keys in the input file (delta)
        black_list = []
        for key in newp:
            if key not in black_list:

                val = newp[key]

                # YAML defines null values as 'None', which are interpreted
                # as strings
                if val == 'None':
                    val = None

                if isinstance(val, dict):
                    for inner_key in val:
                        inner_val = val[inner_key]

                        if inner_val == 'None':
                            inner_val = None

                        self.setInnerVal(key, inner_key, inner_val)
                        # print ('@delta: adding', key, inner_key, inner_val)
                else:
                    self.setVal(key, val)
                    # print ('@delta: adding', key, val, type(val))

        # dump internal dict to the documentation file
        if isSpace:
            parameters_file_path = utils.space_path(model, version)
        else:
            parameters_file_path = utils.model_path(model, version)

        parameters_file_name = os.path.join(parameters_file_path,
                                            'documentation.yaml')
        try:
            with open(parameters_file_name, 'w') as pfile:
                yaml.dump(self.fields, pfile)
        except Exception as e:
            return False, 'unable to write parameters'

        self.setVal('md5', self.idataHash())

        return True, 'OK'

    def load_results(self):
        ''' Load the results pickle with the model information '''

        # obtain the path and the default name of the results file
        results_file_path = utils.model_path(self.model, self.version)
        results_file_name = os.path.join(results_file_path, 'results.pkl')

        self.conveyor = Conveyor()

        # load the conveyor from the results pickle
        if not os.path.isfile(results_file_name):
            raise Exception('Results file not found')

        try:
            with open(results_file_name, "rb") as input_file:
                self.conveyor.load(input_file)
        except Exception as e:
            # LOG.error(f'No valid results pickle found at: {results_file_name}')
            raise e

    def getVal(self, key):
        ''' Return the value of the key parameter or None if it is
            not found in the parameters dictionary
        '''
        if not key in self.fields:
            return None
        if 'value' in self.fields[key]:
            return self.fields[key]['value']
        return None

    def getDict(self, key):
        ''' Return the value of the key parameter as a dictionary, or an
            empty dictionary if it is not found in the parameters dictionary
        '''
        d = {}
        if not key in self.fields:
            return d

        element = self.fields[key]['value']
        if isinstance(element, dict):
            # iterate keys and copy to the temp dictionary
            # the key and the content of 'value'
            for k, v in element.items():
                if 'value' in v:
                    d[k] = v['value']
        return d

    def setVal(self, key, value):
        ''' Sets the parameter defined by key to the given value '''

        # for existing keys, replace the contents of 'value'
        if key in self.fields:
            if "value" in self.fields[key]:
                if not isinstance(self.fields[key]['value'], dict):
                    self.fields[key]["value"] = value
                else:
                    # print(key)
                    for k in value.keys():
                        self.fields[key][k] = value[k]
        # for new keys, create a new element with 'value' key
        else:
            self.fields[key] = {'value': value}

    def setInnerVal(self, okey, ikey, value):
        ''' Sets a parameter within an inner dictionary. The entry is defined
            by a key of the outer dictionary (okey) and a second key in the
            inner dictionary (ikey). The parameter will be set to the given
            value.

            This function tests the existence of all the keys and dictionaries
            to prevent crashes, and returns without setting the value if any
            error is found
        '''
        if not okey in self.fields:
            return
        if not "value" in self.fields[okey]:
            return

        odict = self.fields[okey]['value']
        if not isinstance(odict, dict):
            return
        if not ikey in odict:
            return
        if not isinstance(odict[ikey], dict):
            odict['value'] = value
            return
        if "value" in odict[ikey]:
            odict[ikey]["value"] = value
        else:
            odict[ikey] = {'value': value}

    def appVal(self, key, value):
        ''' Appends value to the end of an existing key list '''

        if not key in self.fields:
            return

        if "value" in self.fields[key]:
            vt = self.fields[key]['value']

            # if the key is already a list, append the new value at the end
            if isinstance(vt, list):
                self.fields[key]['value'].append(value)
            # ... otherwise, create a list with the previous content and the
            # new value
            else:
                self.fields[key]['value'] = [vt, value]

    def dumpJSON(self):
        return json.dumps(self.fields)

    def assign_parameters(self):
        ''' Fill documentation values corresponding to model parameter values '''

        if not self.parameters:
            raise Exception('Parameters were not loaded')

        # self.fields['Algorithm']['subfields']['algorithm']['value'] = \
        #     self.parameters.getVal('model')
        self.setInnerVal('Algorithm', 'algorithm',
                         self.parameters.getVal('model'))
        self.setInnerVal('Algorithm', 'descriptors',
                         self.parameters.getVal('computeMD_method'))
        if self.parameters.getVal('conformal'):
            self.setInnerVal('AD_method', 'name', 'conformal prediction')
            self.setVal(
                'AD_parameters',
                f'Conformal Significance '
                f'{self.parameters.getVal("conformalSignificance")}')

    def assign_results(self):
        ''' Assign result values to documentation fields '''

        # Accepted validation keys
        allowed = [
            'Conformal_accuracy', 'Conformal_mean_interval', 'Sensitivity',
            'Specificity', 'MCC', 'Conformal_coverage', 'Q2', 'SDEP',
            'SensitivityPred', 'SpecificityPred', 'MCCpred', 'scoringR',
            'R2', 'SDEC'
        ]

        model_info = self.conveyor.getVal('model_build_info')
        validation = self.conveyor.getVal('model_valid_info')

        # The commented code below filters the hyperparameters to be reported.
        # Get parameter keys for the used estimator
        # param_key = self.parameters.getVal('model') + '_parameters'
        # Get parameter dictionary
        # estimator_params = self.parameters.getDict(param_key)

        self.fields['Algorithm_settings']['value'] = \
            (self.conveyor.getVal('estimator_parameters'))

        # Horrendous patch to solve a backwards-compatibility problem
        if 'subfields' in self.fields['Data_info']:
            sub_label = 'subfields'
        else:
            sub_label = 'value'

        self.fields['Data_info']\
            [sub_label]['training_set_size']['value'] = \
            model_info[0][2]
        self.fields['Descriptors']\
            [sub_label]['final_number']['value'] = \
            model_info[1][2]
        self.fields['Descriptors']\
            [sub_label]['ratio']['value'] = \
            '{:0.2f}'.format(model_info[1][2]/model_info[0][2])

        internal_val = dict()
        for stat in validation:
            if stat[0] in allowed:
                internal_val[stat[0]] = float("{0:.2f}".format(stat[2]))
        if internal_val:
            self.fields['Internal_validation_1']['value'] = internal_val

    def get_string(self, dictionary):
        ''' Convert a dictionary (from documentation.yaml) to string format
            for the model template
        '''
        text = ''
        for key, val in dictionary.items():
            text += f'{key} : {val["value"]}\n'
        return text

    def get_string2(self, dictionary):
        ''' Convert a dictionary (from the parameter file) to string format
            for the model template
        '''
        text = ''
        for key, val in dictionary.items():
            try:
                if isinstance(str(val), str):
                    text += f'{key} : {val}\n'
            except:
                continue
        return text

    def get_upf_template(self):
        ''' This function creates a tabular model template based on
            the QMRF document type
        '''
        template = pd.DataFrame()
        template['ID'] = ['']
        template['Version'] = ['']
        template['Description'] = ['']
        template['Contact'] = ['']
        template['Institution'] = ['']
        template['Date'] = ['']
        template['Endpoint'] = ['']
        template['Endpoint_units'] = ['']
        template['Dependent_variable'] = ['']
        template['Species'] = ['']
        template['Limits_applicability'] = ['']
        template['Experimental_protocol'] = ['']
        template['Data_info'] = [
            self.get_string(self.fields['Data_info']['subfields'])
        ]
        template['Model_availability'] = [
            self.get_string(self.fields['Model_availability']['subfields'])
        ]
        template['Algorithm'] = [
            self.get_string(self.fields['Algorithm']['subfields'])
        ]
        template['Software'] = [
            self.get_string(self.fields['Software']['subfields'])
        ]
        template['Descriptors'] = [
            self.get_string(self.fields['Descriptors']['subfields'])
        ]
        template['Algorithm_settings'] = [
            self.get_string(self.fields['Algorithm_settings']['subfields'])
        ]
        template['AD_method'] = [
            self.get_string(self.fields['AD_method']['subfields'])
        ]
        template['AD_parameters'] = [self.fields['AD_parameters']['value']]
        template['Goodness_of_fit_statistics'] = [
            self.fields['Goodness_of_fit_statistics']['value']
        ]
        template['Internal_validation_1'] = [
            self.fields['Internal_validation_1']['value']
        ]
        template.to_csv('QMRF_template.tsv', sep='\t')

    def get_upf_template2(self):
        ''' This function creates a tabular model template based on
            the QMRF document type
        '''
        fields = ['ID', 'Version', 'Contact', 'Institution',
                  'Date', 'Endpoint', 'Endpoint_units', 'Dependent_variable',
                  'Species', 'Limits_applicability', 'Experimental_protocol',
                  'Data_info', 'Model_availability', 'Algorithm', 'Software',
                  'Descriptors', 'Algorithm_settings', 'AD_method',
                  'AD_parameters', 'Goodness_of_fit_statistics',
                  'Internal_validation_1']

        template = pd.DataFrame(
            columns=['Field', 'Parameter name', 'Parameter value'])

        for field in fields:
            try:
                subfields = self.fields[field]['subfields']
            except:
                subfields = self.fields[field]['value']

            if subfields is not None:
                for index, subfield in enumerate(subfields):
                    field2 = ''
                    if index == 0:
                        field2 = field
                    else:
                        field2 = ""
                    value = str(subfields[subfield]['value'])
                    # None types are retrieved as str from yaml??
                    if value == "None":
                        value = ""
                    row = dict(zip(['Field', 'Parameter name', 'Parameter value'],
                                   [field2, subfield, value]))
                    template = template.append(row, ignore_index=True)
            else:
                value = str(self.fields[field]['value'])
                if value == 'None':
                    value = ""
                row = dict(zip(['Field', 'Parameter name', 'Parameter value'],
                               [field, "", value]))
                template = template.append(row, ignore_index=True)

        template.to_csv('QMRF_template3.tsv', sep='\t', index=False)

    def get_prediction_template(self):
        ''' This function creates a tabular reporting document for
            predictions
        '''

        # obtain the path and the default name of the results file
        results_file_path = utils.model_path(self.model, self.version)
        results_file_name = os.path.join(results_file_path,
                                         'prediction-results.pkl')
        conveyor = Conveyor()

        # load the conveyor from the prediction results pickle
        if not os.path.isfile(results_file_name):
            raise Exception('Results file not found')
        try:
            with open(results_file_name, "rb") as input_file:
                conveyor.load(input_file)
        except Exception as e:
            # LOG.error(f'No valid results pickle found at: {results_file_name}')
            raise e

        # First get name, InChI and InChIKey
        names = conveyor.getVal('obj_nam')
        smiles = conveyor.getVal('SMILES')
        inchi = [AllChem.MolToInchi(AllChem.MolFromSmiles(m)) for m in smiles]
        inchikeys = [
            AllChem.InchiToInchiKey(
                AllChem.MolToInchi(AllChem.MolFromSmiles(m))) for m in smiles
        ]

        predictions = []
        applicability = []

        if self.parameters.getVal('quantitative'):
            raise Exception('Prediction template for quantitative endpoints'
                            ' not implemented yet')

        if not self.parameters.getVal('conformal'):
            predictions = conveyor.getVal('values')
        else:
            c0 = np.asarray(conveyor.getVal('c0'))
            c1 = np.asarray(conveyor.getVal('c1'))

            predictions = []
            for i, j in zip(c0, c1):
                prediction = ''
                if i == j:
                    prediction = 'out of AD'
                    applicability.append('out')
                if i != j:
                    if i == True:
                        prediction = 'Inactive'
                    else:
                        prediction = 'Active'
                    applicability.append('in')
                predictions.append(prediction)

        # Now create the spreadsheets for the prediction
        # First write the summary
        summary = ("Study name\n" + "Endpoint\n" + "QMRF-ID\n" +
                   "(Target)Compounds\n" +
                   "Compounds[compounds]\tName\tInChiKey\n")
        for name, inch in zip(names, inchikeys):
            summary += f'\t{name}\t{inch}\n'
        summary += ("\nFile\n" + "Author name\n" + "E-mail\n" + "Role\n" +
                    "Affiliation\n" + "Date\n")

        with open('summary_document.tsv', 'w') as out:
            out.write(summary)

        # Now the prediction details
        # Pandas is used to ease the table creation.
        reporting = pd.DataFrame()
        reporting['InChI'] = inchi
        reporting['CAS-RN'] = '-'
        reporting['SMILES'] = smiles
        reporting['prediction'] = predictions
        reporting['Applicability_domain'] = applicability
        reporting['reliability'] = '-'
        reporting['Structural_analogue_1_CAS'] = '-'
        reporting['Structural_analogue_1_smiles'] = '-'
        reporting['Structural_analogue_1_source'] = '-'
        reporting['Structural_analogue_1_experimental_value'] = '-'
        reporting['Structural_analogue_2_CAS'] = '-'
        reporting['Structural_analogue_2_smiles'] = '-'
        reporting['Structural_analogue_2_source'] = '-'
        reporting['Structural_analogue_2_experimental_value'] = '-'
        reporting['Structural_analogue_3_CAS'] = '-'
        reporting['Structural_analogue_3_smiles'] = '-'
        reporting['Structural_analogue_3_source'] = '-'
        reporting['Structural_analogue_3_experimental_value'] = '-'

        reporting.to_csv('prediction_report.tsv', sep='\t', index=False)

    def idataHash(self):
        ''' Create an md5 hash for a number of keys describing parameters
            relevant for idata.

            This hash is compared between runs, to check whether idata must
            recompute the MDs or not
        '''

        # update with any new idata-relevant parameter
        keylist = [
            'SDFile_name', 'SDFile_activity', 'SDFile_experimental',
            'normalize_method', 'ionize_method', 'convert3D_method',
            'computeMD_method', 'TSV_varnames', 'TSV_objnames',
            'TSV_activity', 'input_type'
        ]

        idata_params = []
        for i in keylist:
            idata_params.append(self.getVal(i))

        # MD_settings is a dictionary, obtain and sort the keys+values
        md_params = self.getDict('MD_settings')
        md_list = []
        for key in md_params:
            # combine key + value in a single string
            md_list.append(key + str(md_params[key]))

        md_list.sort()
        idata_params.append(md_list)

        # use pickle to obtain a byte buffer, necessary to generate the hexdigest
        p = pickle.dumps(idata_params)
        return hashlib.md5(p).hexdigest()
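# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original workflow). It shows how
# the Documentation class above can be driven to fill a field and export the
# tabular QMRF template. The endpoint name received by the caller is a
# placeholder: the model must already exist in the local repository and
# contain documentation.yaml, parameters.yaml and results.pkl.
def _example_export_qmrf(model_name, version=0):
    doc = Documentation(model_name, version=version, context='model')

    # fill a simple field and one entry of a nested field
    doc.setVal('Contact', 'author@example.org')
    doc.setInnerVal('Algorithm', 'algorithm', doc.parameters.getVal('model'))

    # writes 'QMRF_template3.tsv' in the current working directory
    doc.get_upf_template2()

    # md5 hash of the idata-relevant parameters, useful to detect when
    # molecular descriptors must be recomputed
    return doc.getVal('md5')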
def action_refresh(model=None, version=None):
    ''' Rebuild one or many models making use of existing parameter files and
        locally stored training series.
    '''
    import flame.context as context
    from flame.parameters import Parameters
    import logging

    # list endpoints relevant for the arguments
    if model is not None:
        model_list = [model]
    else:
        model_root = pathlib.Path(utils.model_repository_path())
        model_list = [x.stem for x in model_root.iterdir() if x.is_dir()]

    # list versions relevant for the arguments
    task_list = []
    for imodel in model_list:
        if version is not None:
            task_list.append((imodel, version))
        else:
            model_root = pathlib.Path(utils.model_tree_path(imodel))
            itask_list = [(imodel, utils.modeldir2ver(x.stem))
                          for x in model_root.iterdir() if x.is_dir()]
            # use "+=" and not "append" to merge the new list with the old one
            task_list += itask_list

    # analyze task_list and add ensemble models at the end
    # this is needed to have low-level models refreshed BEFORE refreshing the
    # high-level (ensemble) models, eliminating the need to refresh them
    # recursively
    LOG.info("Analyzing and sorting models...")

    # make sure the lower models are in task_list and, if not, force their inclusion
    for itask in task_list:
        param = Parameters()
        success, results = param.loadYaml(itask[0], itask[1])
        if not success:
            continue
        if param.getVal('input_type') == 'model_ensemble':
            ens_nams = param.getVal('ensemble_names')
            ens_vers = param.getVal('ensemble_versions')
            for i in range(len(ens_nams)):
                iver = 0
                inam = ens_nams[i]
                if (i < len(ens_vers)):
                    iver = ens_vers[i]
                if ((inam, iver)) not in task_list:
                    task_list.append((inam, iver))

    # create separate lists for regular and ensemble models
    # and add ensemble models at the end
    # this needs to be carried out after the previous step because
    # some of the lower level models could be an ensemble model itself
    mol_list = []
    ens_list = []
    for itask in task_list:
        param = Parameters()
        success, results = param.loadYaml(itask[0], itask[1])
        if not success:
            mol_list.append(itask)
            continue
        if param.getVal('input_type') == 'model_ensemble':
            ens_list.append(itask)
        else:
            mol_list.append(itask)

    task_list = mol_list + ens_list

    # show all models before starting
    LOG.info("Starting model refreshing task for the following models and versions")
    for itask in task_list:
        LOG.info(f'  model: {itask[0]} version: {itask[1]}')
    LOG.info("This can take some time, please be patient...")

    # now send the build command for each task
    for itask in task_list:
        if itask[1] != 0:
            # move version to /dev for building
            original_path = utils.model_path(itask[0], itask[1])   # veri
            destinat_path = utils.model_path(itask[0], 0)          # dev
            security_path = destinat_path + '_security'            # dev_sec
            shutil.move(destinat_path, security_path)              # dev --> dev_sec
            shutil.move(original_path, destinat_path)              # veri --> dev

        LOG.info(f'  refreshing model: {itask[0]} version: {itask[1]} '
                 f'({task_list.index(itask)+1} of {len(task_list)})...')

        # disable LOG output
        logging.disable(logging.ERROR)

        command_build = {
            'endpoint': itask[0],
            'infile': None,
            'param_file': None,
            'incremental': False
        }
        success, results = context.build_cmd(command_build)

        # enable LOG output
        logging.disable(logging.NOTSET)

        if itask[1] != 0:
            shutil.move(destinat_path, original_path)   # dev --> veri
            shutil.move(security_path, destinat_path)   # dev_sec --> dev

        if not success:
            LOG.error(results)

    LOG.info("Model refreshing task finished")
    return True, 'OK'
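# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module). action_refresh
# can rebuild a single endpoint/version or the whole local repository; the
# function itself resolves ensemble dependencies, so member models are always
# rebuilt before the ensembles that consume them. The arguments below are
# placeholders supplied by the caller.
def _example_refresh(model_name=None, version=None):
    # model_name=None and version=None refresh every version of every model
    success, message = action_refresh(model=model_name, version=version)
    if not success:
        LOG.error(f'model refresh failed: {message}')
    return success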
class Search:

    def __init__(self, space, version, output_format=None, label=None):
        LOG.debug('Starting search...')
        self.space = space
        self.version = version
        self.label = label
        self.param = Parameters()
        self.conveyor = Conveyor()

        self.conveyor.addVal(label, 'prediction_label', 'prediction label',
                             'method', 'single',
                             'Label used to identify the prediction')

        success, results = self.param.loadYaml(space, version, isSpace=True)
        if not success:
            LOG.critical('Unable to load space parameters. Aborting...')
            sys.exit()

        # add additional output formats included in the constructor
        # this is required to add JSON format as output when the object is
        # instantiated from a web service call, requiring this output
        if output_format != None:
            if output_format not in self.param.getVal('output_format'):
                self.param.appVal('output_format', output_format)

        return

    def set_single_CPU(self) -> None:
        ''' Forces the use of a single CPU '''
        LOG.debug('parameter "numCPUs" forced to be 1')
        self.param.setVal('numCPUs', 1)

    # def run(self, input_source, runtime_param=None, metric=None, numsel=None, cutoff=None):
    def run(self, param_dict):
        ''' Executes a default similarity search workflow '''

        LOG.debug(f'search run parameters: {param_dict}')

        metric = None
        numsel = None
        cutoff = None

        # path to endpoint
        epd = utils.space_path(self.space, self.version)
        if not os.path.isdir(epd):
            self.conveyor.setError(
                f'Unable to find space {self.space}, version {self.version}')
            # LOG.error(f'Unable to find space {self.space}')

        if 'infile' in param_dict:
            input_source = param_dict['infile']
        else:
            LOG.error('Unable to find the input file')
            self.conveyor.setError('unable to find the input file')

        if 'runtime_param' in param_dict:
            runtime_param = param_dict['runtime_param']
            if runtime_param is not None:
                LOG.debug(f'runtime similarity parameter file: {runtime_param}')
                try:
                    with open(runtime_param, 'r') as pfile:
                        rtparam = yaml.safe_load(pfile)
                        try:
                            metric = rtparam['similarity_metric']
                            numsel = rtparam['similarity_cutoff_num']
                            cutoff = rtparam['similarity_cutoff_distance']
                        except:
                            LOG.error('wrong format in the runtime similarity parameters')
                            self.conveyor.setError(
                                'wrong format in the runtime similarity parameters')
                except:
                    LOG.error('runtime similarity parameter file not found')
                    self.conveyor.setError(
                        'runtime similarity parameter file not found')
        else:
            try:
                metric = param_dict['metric']
                numsel = param_dict['numsel']
                cutoff = param_dict['cutoff']
            except:
                LOG.error('wrong format in the runtime similarity parameters')
                self.conveyor.setError(
                    'wrong format in the runtime similarity parameters')

        if not self.conveyor.getError():
            # uses the child classes within the 'space' folder,
            # to allow customization of the processing applied to each space
            modpath = utils.smodule_path(self.space, self.version)

            idata_child = importlib.import_module(modpath + ".idata_child")
            sapply_child = importlib.import_module(modpath + ".sapply_child")
            odata_child = importlib.import_module(modpath + ".odata_child")

            # run idata object, in charge of generating space data from the input
            try:
                idata = idata_child.IdataChild(self.param, self.conveyor,
                                               input_source)
            except:
                LOG.warning('Idata child architecture mismatch, defaulting to Idata parent')
                idata = Idata(self.param, self.conveyor, input_source)

            idata.run()
            LOG.debug(f'idata child {type(idata).__name__} completed `run()`')

            if not self.conveyor.getError():
                # make sure there is X data
                if not self.conveyor.isKey('xmatrix'):
                    LOG.debug('Failed to compute MDs')
                    self.conveyor.setError('Failed to compute MDs')

            if not self.conveyor.getError():
                # run sapply object, in charge of generating the search results from idata
                try:
                    sapply = sapply_child.SapplyChild(self.param, self.conveyor)
                except:
                    LOG.warning('Sapply child architecture mismatch, defaulting to Sapply parent')
                    sapply = Sapply(self.param, self.conveyor)

                sapply.run(cutoff, numsel, metric)
                LOG.debug(f'sapply child {type(sapply).__name__} completed `run()`')

            # run odata object, in charge of formatting the search results
            # note that if any of the above steps failed, an error has been
            # inserted in the conveyor and odata will take care of showing an
            # error message
            try:
                odata = odata_child.OdataChild(self.param, self.conveyor, self.label)
            except:
                LOG.warning('Odata child architecture mismatch, defaulting to Odata parent')
                odata = Odata(self.param, self.conveyor, self.label)

            return odata.run()
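# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module), assuming the
# reconstruction of run() above: when no 'runtime_param' entry is given, the
# three similarity settings are read directly from param_dict. The space name,
# input file and parameter values are placeholders.
def _example_similarity_search(space_name, query_file, version=0):
    search = Search(space_name, version, output_format='JSON', label='query')
    return search.run({
        'infile': query_file,   # SDFile/TSV with the query compounds
        'metric': 'euclidean',  # similarity_metric (placeholder value)
        'numsel': 10,           # similarity_cutoff_num
        'cutoff': 0.5           # similarity_cutoff_distance
    })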
class Build:

    def __init__(self, model, param_file=None, param_string=None, output_format=None):
        LOG.debug('Starting build...')
        self.model = model
        self.param = Parameters()
        self.conveyor = Conveyor()

        # identify the workflow type
        self.conveyor.setOrigin('learn')

        # generate a unique modelID
        self.conveyor.addMeta('modelID', utils.id_generator())
        LOG.debug(f'Generated new model with modelID: {self.conveyor.getMeta("modelID")}')

        # load parameters
        if param_file is not None:
            # use the param_file to update existing parameters at the model
            # directory and save changes to make them persistent
            success, message = self.param.delta(model, 0, param_file, iformat='YAML')
        elif param_string is not None:
            success, message = self.param.delta(model, 0, param_string, iformat='JSONS')
        else:
            # load parameter file at the model directory
            success, message = self.param.loadYaml(model, 0)

        # being unable to load parameters is a critical error
        if not success:
            LOG.critical(f'Unable to load model parameters. {message}. Aborting...')
            sys.exit(1)

        # add additional output formats included in the constructor
        # this is required to add JSON format as output when the object is
        # instantiated from a web service call, requiring this output
        if output_format is not None:
            if output_format not in self.param.getVal('output_format'):
                self.param.appVal('output_format', output_format)

        if self.param.getVal('confidential'):
            self.confidentialAuditParam()

    def confidentialAuditParam(self):
        import yaml

        original_method = self.param.getVal('model')
        if self.param.getVal('quantitative'):
            if original_method != 'PLSR':
                self.param.setVal('model', 'PLSR')
                LOG.info(f'CONFIDENTIALITY AUDIT: the model was set to PLSR, '
                         f'the original method {original_method} was not suitable '
                         f'to build confidential models')
        else:
            if original_method != 'PLSDA':
                self.param.setVal('model', 'PLSDA')
                LOG.info(f'CONFIDENTIALITY AUDIT: the model was set to PLSDA, '
                         f'the original method {original_method} was not suitable '
                         f'to build confidential models')

        # TODO: conformal support
        if self.param.getVal('conformal'):
            self.param.setVal('conformal', False)
            LOG.info('CONFIDENTIALITY AUDIT: conformal was set to False. '
                     'Conformal models are not supported for now in confidential models')

        parameters_file_path = utils.model_path(self.model, 0)
        parameters_file_name = os.path.join(parameters_file_path, 'parameters.yaml')
        with open(parameters_file_name, 'w') as pfile:
            yaml.dump(self.param.p, pfile)

    def get_ensemble(self):
        ''' Returns a Boolean indicating if the model uses external input
            sources and a list with these sources
        '''
        return self.param.getEnsemble()

    def extend_modelID(self, ensembleID):
        modelID = self.conveyor.getMeta('modelID')
        modelID = f'{modelID}-{ensembleID}'
        self.conveyor.addMeta('modelID', modelID)
        LOG.debug(f'modelID re-defined as {self.conveyor.getMeta("modelID")}')

    def set_single_CPU(self) -> None:
        ''' Forces the use of a single CPU '''
        LOG.debug('parameter "numCPUs" forced to be 1')
        self.param.setVal('numCPUs', 1)

    def run(self, input_source):
        ''' Executes a default building workflow '''

        # path to endpoint
        epd = utils.model_path(self.model, 0)
        # if not os.path.isdir(epd):
        #     self.conveyor.setError(f'Unable to find model {self.model}')
        #     #LOG.error(f'Unable to find model {self.model}')

        # import child classes
        # if not self.conveyor.getError():

        # uses the child classes within the 'model' folder,
        # to allow customization of the processing applied to each model
        modpath = utils.module_path(self.model, 0)

        idata_child = importlib.import_module(modpath+".idata_child")
        learn_child = importlib.import_module(modpath+".learn_child")
        odata_child = importlib.import_module(modpath+".odata_child")

        # run idata object, in charge of generating model data from the input
        try:
            idata = idata_child.IdataChild(self.param, self.conveyor, input_source)
        except:
            LOG.warning('Idata child architecture mismatch, defaulting to Idata parent')
            idata = Idata(self.param, self.conveyor, input_source)

        idata.run()
        LOG.debug(f'idata child {type(idata).__name__} completed `run()`')

        if not self.conveyor.getError():
            success, results = idata.preprocess_create()
            if not success:
                self.conveyor.setError(results)

        if not self.conveyor.getError():
            # check there is a suitable X and Y
            if not self.conveyor.isKey('xmatrix'):
                self.conveyor.setError('Failed to compute MDs')
            if not self.conveyor.isKey('ymatrix'):
                self.conveyor.setError('No activity data (Y) found in training series')

        # run optional chemical space building to support a "closest" training series object
        # if self.param.getVal('buildSimilarity'):
        if self.param.getVal('output_similar') is True:

            from flame.slearn import Slearn

            slearn_child = importlib.import_module(modpath+".slearn_child")

            if not self.conveyor.getError():
                # instantiate slearn (build a space from idata) and run it
                try:
                    slearn = slearn_child.SlearnChild(self.param, self.conveyor)
                except:
                    LOG.warning('Slearn child architecture mismatch, defaulting to Slearn parent')
                    slearn = Slearn(self.param, self.conveyor)

                slearn.run()
                LOG.debug(f'slearn child {type(slearn).__name__} completed `run()`')

        if not self.conveyor.getError():
            # instantiate learn (build a model from idata) and run it
            try:
                learn = learn_child.LearnChild(self.param, self.conveyor)
            except:
                LOG.warning('Learn child architecture mismatch, defaulting to Learn parent')
                learn = Learn(self.param, self.conveyor)

            learn.run()
            LOG.debug(f'learn child {type(learn).__name__} completed `run()`')

        # run odata object, in charge of formatting the results
        # note that if any of the above steps failed, an error has been inserted
        # in the conveyor and odata will take care of showing an error message
        try:
            odata = odata_child.OdataChild(self.param, self.conveyor)
        except:
            LOG.warning('Odata child architecture mismatch, defaulting to Odata parent')
            odata = Odata(self.param, self.conveyor)

        return odata.run()
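# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the original module): building an
# endpoint from a training series with the Build class above. The endpoint
# name and SDFile path are placeholders; the endpoint directory (version 0)
# must already exist and contain a parameters.yaml.
def _example_build(model_name, training_sdfile):
    build = Build(model_name, output_format='JSON')

    # get_ensemble() returns (is_ensemble, [external model sources])
    is_ensemble, sources = build.get_ensemble()
    if is_ensemble:
        LOG.info(f'this endpoint combines external sources: {sources}')

    return build.run(training_sdfile)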