def run_internal(self):
    ''' Builds a model using the internally defined machine learning tools.

    All input parameters are extracted from self.param.

    The main output is an instance of basemodel saved in
    the model folder as a pickle (model.pkl) and used for prediction.

    The results of building and validation are added to the conveyor,
    but also saved to the model folder as a pickle (info.pkl)
    for being displayed in manage tools.

    Every failure is reported through self.conveyor.setError(). The
    method returns None, except when saving the model fails, in which
    case it also returns the tuple (False, message).
    '''

    # check suitability of Y matrix
    if not self.param.getVal('quantitative'):
        success, yresult = utils.qualitative_Y(self.Y)
        if not success:
            self.conveyor.setError(yresult)
            return

    # expand with new methods here:
    registered_methods = [('RF', RF),
                          ('SVM', SVM),
                          ('GNB', GNB),
                          ('PLSR', PLSR),
                          ('PLSDA', PLSDA), ]

    # instantiate an appropriate child of base_model
    model_name = self.param.getVal('model')
    model = None
    for method_name, method_class in registered_methods:
        if method_name == model_name:
            model = method_class(self.X, self.Y, self.param)
            LOG.debug(f'Recognized learner: {model_name}')
            break

    # compare against None explicitly: a valid learner object must not be
    # rejected just because it happens to evaluate as falsy
    if model is None:
        message = f'Modeling method {model_name} not recognized'
        self.conveyor.setError(message)
        LOG.error(message)
        return

    # build model
    LOG.info('Starting model building')
    success, model_building_results = model.build()
    if not success:
        self.conveyor.setError(model_building_results)
        return

    self.conveyor.addVal(
        model_building_results,
        'model_build_info',
        'model building information',
        'method',
        'single',
        'Information about the model')

    # validate model
    LOG.info('Starting model validation')
    success, model_validation_results = model.validate()
    if not success:
        self.conveyor.setError(model_validation_results)
        return

    # model_validation_results is a dictionary which contains
    # model_validation_info and (optionally) Y_adj and Y_pred,
    # depending on the model type
    self.conveyor.addVal(
        model_validation_results['quality'],
        'model_valid_info',
        'model validation information',
        'method',
        'single',
        'Information about the model validation')

    # non-conformal qualitative and quantitative models
    if 'Y_adj' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Y_adj'],
            'Y_adj',
            'Y fitted',
            'result',
            'objs',
            'Y values of the training series fitted by the model')

    if 'Y_pred' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Y_pred'],
            'Y_pred',
            'Y predicted',
            'result',
            'objs',
            'Y values of the training series predicted by the model')

    # conformal qualitative models produce a list of tuples, indicating
    # if the object is predicted to belong to class 0 and 1; every column
    # is stored under its own key (c0, c1, ...)
    if 'classes' in model_validation_results:
        classes = model_validation_results['classes']
        for i in range(len(classes[0])):
            self.conveyor.addVal(
                classes[:, i].tolist(),
                f'c{i}',
                f'Class {i}',
                'result',
                'objs',
                'Conformal class assignment',
                'main')

    # TODO: conformal quantitative models produce the minimum and maximum
    # value of each prediction; these intervals are not exported yet
    # TODO: compute AD (when applicable)

    LOG.info('Model finished successfully')

    # save model
    try:
        model.save_model()
    except Exception as e:
        message = 'An error ocurred saving the model'
        LOG.error(f'Error saving model with exception {e}')
        # report the failure like every other failure of this method,
        # so it is not lost when the caller ignores the return value
        self.conveyor.setError(message)
        return False, message

    return
def external_validation(self):
    ''' When experimental values are available for the predicted
    compounds, run external validation.

    The experimental values are read from the conveyor ('ymatrix') and
    compared with the predictions stored there: 'values' for
    non-conformal models, 'c0'/'c1' for conformal qualitative models and
    'lower_limit'/'upper_limit' for conformal quantitative models.

    The resulting list of (key, description, value) tuples is added to
    the conveyor under the key 'external-validation'.

    For qualitative models, if the experimental values are not suitable
    a warning is set in the conveyor and nothing else is done.

    Raises ValueError when the experimental or the predicted vector is
    empty.
    '''

    def classification_results(TP, TN, FP, FN, MCC):
        ''' quality indexes shared by both qualitative variants '''
        # protect to avoid warnings in special cases (div by zero)
        sensitivity = (TP / (TP + FN)) if (TP + FN) > 0 else 0.0
        specificity = (TN / (TN + FP)) if (TN + FP) > 0 else 0.0
        return [
            ('TP', 'True positives in external-validation', float(TP)),
            ('TN', 'True negatives in external-validation', float(TN)),
            ('FP', 'False positives in external-validation', float(FP)),
            ('FN', 'False negatives in external-validation', float(FN)),
            ('Sensitivity', 'Sensitivity in external-validation',
             float(sensitivity)),
            ('Specificity', 'Specificity in external-validation',
             float(specificity)),
            ('MCC',
             'Mattews Correlation Coefficient in external-validation',
             float(MCC)),
        ]

    ext_val_results = []

    # Ye are the y values present in the input file
    Ye = np.asarray(self.conveyor.getVal("ymatrix"))

    quantitative = self.param.getVal("quantitative")
    conformal = self.param.getVal("conformal")

    # For qualitative models, make sure the Y is qualitative as well
    if not quantitative:
        qy, message = utils.qualitative_Y(Ye)
        if not qy:
            self.conveyor.setWarning(
                f'No qualitative activity suitable for external validation "{message}". Skipping.'
            )
            LOG.warning(
                f'No qualitative activity suitable for external validation "{message}". Skipping.'
            )
            return

    # there are four variants of external validation, depending if the
    # method is conformal or non-conformal and the model is qualitative
    # or quantitative
    if not conformal and not quantitative:
        # non-conformal & qualitative
        Yp = np.asarray(self.conveyor.getVal("values"))

        if Ye.size == 0:
            raise ValueError("Experimental activity vector is empty")
        if Yp.size == 0:
            raise ValueError("Predicted activity vector is empty")

        # the use of labels is compulsory to inform the confusion matrix
        # that it must return a 2x2 confussion matrix. Otherwise it will
        # fail when a single class is represented (all TP, for example)
        TN, FP, FN, TP = confusion_matrix(Ye, Yp, labels=[0, 1]).ravel()
        MCC = mcc(Ye, Yp)

        ext_val_results.extend(classification_results(TP, TN, FP, FN, MCC))

    elif not conformal:
        # non-conformal & quantitative
        Yp = np.asarray(self.conveyor.getVal("values"))

        if Ye.size == 0:
            raise ValueError("Experimental activity vector is empty")
        if Yp.size == 0:
            raise ValueError("Predicted activity vector is empty")

        Ym = np.mean(Ye)
        nobj = len(Yp)

        SSY0_out = np.sum(np.square(Ym - Ye))
        SSY_out = np.sum(np.square(Ye - Yp))
        scoringP = mean_squared_error(Ye, Yp)
        SDEP = np.sqrt(SSY_out / (nobj))
        # when all the experimental values are identical Q2 is undefined
        if SSY0_out == 0:
            Q2 = 0.0
        else:
            Q2 = 1.00 - (SSY_out / SSY0_out)

        ext_val_results.append(('scoringP', 'Scoring P', scoringP))
        ext_val_results.append(
            ('Q2', 'Determination coefficient in cross-validation', Q2))
        ext_val_results.append(
            ('SDEP', 'Standard Deviation Error of the Predictions', SDEP))

    elif not quantitative:
        # conformal & qualitative: one column per class, stating if the
        # object is predicted to belong to class 0 and to class 1
        Yp = np.concatenate(
            (np.asarray(self.conveyor.getVal('c0')).reshape(-1, 1),
             np.asarray(self.conveyor.getVal('c1')).reshape(-1, 1)),
            axis=1)

        if Ye.size == 0:
            raise ValueError("Experimental activity vector is empty")
        if Yp.size == 0:
            raise ValueError("Predicted activity vector is empty")

        c0_correct = 0
        c1_correct = 0
        not_predicted = 0
        c0_incorrect = 0
        c1_incorrect = 0

        Ye1 = []
        Yp1 = []
        for i, experimental in enumerate(Ye):
            real = float(experimental)
            predicted = Yp[i]

            # objects assigned to both classes or to none of them are
            # considered as not predicted
            if predicted[0] == predicted[1]:
                not_predicted += 1
                continue

            assigned = 0 if predicted[0] else 1
            Ye1.append(real)
            Yp1.append(assigned)

            if real == 0:
                if assigned == 0:
                    c0_correct += 1
                else:
                    c0_incorrect += 1
            elif real == 1:
                if assigned == 1:
                    c1_correct += 1
                else:
                    c1_incorrect += 1

        # MCC cannot be computed when no object got a single class
        MCC = mcc(Ye1, Yp1) if Ye1 else 0.0

        TN = c0_correct
        FP = c0_incorrect
        TP = c1_correct
        FN = c1_incorrect

        coverage = float((len(Yp) - not_predicted) / len(Yp))

        # Compute accuracy (% of correct predictions). When no object
        # could be evaluated the accuracy is undefined and it is reported
        # with the placeholder '-', which must not be converted to float
        evaluated = FP + FN + TN + TP
        if evaluated > 0:
            conformal_accuracy = float(TN + TP) / float(evaluated)
        else:
            LOG.error('Failed to compute conformal accuracy: '
                      'no prediction could be evaluated')
            conformal_accuracy = '-'

        ext_val_results.extend(classification_results(TP, TN, FP, FN, MCC))
        ext_val_results.append(
            ('Conformal_coverage',
             'Conformal coverage in external-validation',
             float(coverage)))
        ext_val_results.append(
            ('Conformal_accuracy',
             'Conformal accuracy in external-validation',
             conformal_accuracy))

    else:
        # conformal & quantitative: every prediction is an interval
        # flatten everything, so the comparisons below are element-wise
        # and never broadcast to a square matrix
        Yp_lower = np.asarray(self.conveyor.getVal('lower_limit')).ravel()
        Yp_upper = np.asarray(self.conveyor.getVal('upper_limit')).ravel()
        Ye_flat = Ye.ravel()

        if Ye_flat.size == 0:
            raise ValueError("Experimental activity vector is empty")
        if Yp_lower.size == 0 or Yp_upper.size == 0:
            raise ValueError("Predicted activity vector is empty")

        # mean width of the prediction intervals
        mean_interval = np.mean(np.abs(Yp_upper - Yp_lower))

        # fraction of experimental values falling inside its interval
        inside_interval = (Yp_lower < Ye_flat) & (Yp_upper > Ye_flat)
        accuracy = np.count_nonzero(inside_interval) / len(Ye_flat)

        # keep only two decimal figures
        conformal_accuracy = float("{0:.2f}".format(accuracy))
        conformal_mean_interval = float("{0:.2f}".format(mean_interval))

        ext_val_results.append(
            ('Conformal_mean_interval',
             'Conformal mean interval',
             conformal_mean_interval))
        ext_val_results.append(
            ('Conformal_accuracy',
             'Conformal accuracy',
             conformal_accuracy))

    self.conveyor.addVal(ext_val_results,
                         'external-validation',
                         'external validation',
                         'method',
                         'single',
                         'External validation results')
def run_internal(self):
    ''' Builds a model using the internally defined machine learning tools.

    All input parameters are extracted from self.param.

    The main output is an instance of basemodel saved in
    the model folder as a pickle (model.pkl) and used for prediction.

    The results of building and validation are added to the conveyor,
    but also saved to the model folder as a pickle (info.pkl)
    for being displayed in manage tools.

    The learner is looked up by name in self.registered_methods. Every
    failure is reported through self.conveyor.setError() and the method
    always returns None.
    '''

    model_name = self.param.getVal('model')

    # XGBOOST is imported only when requested. It is registered a single
    # time, so repeated calls do not pile up duplicated entries
    if model_name == 'XGBOOST':
        from flame.stats.XGboost import XGBOOST
        if all(name != 'XGBOOST' for name, _ in self.registered_methods):
            self.registered_methods.append(('XGBOOST', XGBOOST))

    # check suitability of Y matrix
    if not self.param.getVal('quantitative'):
        success, yresult = utils.qualitative_Y(self.Y)
        if not success:
            self.conveyor.setError(yresult)
            return

    # collect model information from parameters
    model_type_info = [
        ('quantitative', 'True if the endpoint is quantitative',
         self.param.getVal('quantitative')),
        ('conformal', 'True if the endpoint is conformal',
         self.param.getVal('conformal')),
        ('confidential', 'True if the model is confidential',
         self.param.getVal('confidential')),
        ('secret',
         'True for barebone models exported by a confidential models',
         False),
        ('ensemble', 'True if the model is an ensemble of models',
         self.param.getVal('input_type') == 'model_ensemble'),
        ('ensemble_names', 'List of ensemble models',
         self.param.getVal('ensemble_names')),
        ('ensemble_versions', 'List of ensemble versions',
         self.param.getVal('ensemble_versions')),
        ('conformal_confidence', 'Confidence of the conformal model',
         self.param.getVal('conformalConfidence')),
    ]

    self.conveyor.addVal(model_type_info, 'model_type_info',
                         'model type information', 'method', 'single',
                         'Information about the type of model')

    # instantiate an appropriate child of base_model
    model = None
    for method_name, method_class in self.registered_methods:
        if method_name == model_name:
            # we instantiate the subtype of base_model,
            # passing
            # - preteated X and Y matrices for model building
            # - model parameters (param)
            # - already obtained results (conveyor)
            model = method_class(self.X, self.Y, self.param, self.conveyor)
            LOG.debug(f'Recognized learner: {model_name}')
            break

    # compare against None explicitly: a valid learner object must not be
    # rejected just because it happens to evaluate as falsy
    if model is None:
        message = f'Modeling method {model_name} not recognized'
        self.conveyor.setError(message)
        LOG.error(message)
        return

    # the learner receives the conveyor and can set an error while it is
    # being instantiated
    if self.conveyor.getError():
        return

    # build model
    LOG.debug('Starting model building')
    success, model_building_results = model.build()
    if not success:
        self.conveyor.setError(model_building_results)
        return

    self.conveyor.addVal(model_building_results, 'model_build_info',
                         'model building information', 'method', 'single',
                         'Information about the model building')

    # feature importances are optional attributes of the learner
    if hasattr(model, 'feature_importances'):
        self.conveyor.addVal(
            model.feature_importances, 'feature_importances',
            'feature importances', 'method', 'vars',
            'Information about the relative importance of the model variables'
        )

    if hasattr(model, 'feature_importances_method'):
        self.conveyor.addVal(
            model.feature_importances_method, 'feature_importances_method',
            'feature importances_method', 'method', 'single',
            'Method used to compute the relative importance of the model variables'
        )

    # validate model
    if self.param.getVal('input_type') == 'model_ensemble':
        validation_method = 'ensemble validation'
    else:
        validation_method = self.param.getVal("ModelValidationCV")
    LOG.info(f'Validating the model using method: {validation_method}')

    success, model_validation_results = model.validate()
    if not success:
        self.conveyor.setError(model_validation_results)
        return

    # model_validation_results is a dictionary which contains
    # model_validation_info and (optionally) Y_adj and Y_pred,
    # depending on the model type
    self.conveyor.addVal(model_validation_results['quality'],
                         'model_valid_info',
                         'model validation information', 'method',
                         'single',
                         'Information about the model validation')

    # non-conformal qualitative and quantitative models
    if 'Y_adj' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Y_adj'], 'Y_adj', 'Y fitted',
            'result', 'objs',
            'Y values of the training series fitted by the model')

    if 'Y_pred' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Y_pred'], 'Y_pred', 'Y predicted',
            'result', 'objs',
            'Y values of the training series predicted by the model')

    if 'Conformal_prediction_ranges' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Conformal_prediction_ranges'],
            'Conformal_prediction_ranges', 'Conformal prediction ranges',
            'method', 'objs',
            'Interval for the cross-validated predictions')

    if 'Conformal_prediction_ranges_fitting' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Conformal_prediction_ranges_fitting'],
            'Conformal_prediction_ranges_fitting',
            'Conformal prediction ranges fitting', 'method', 'objs',
            'Interval for the predictions in fitting')

    # conformal qualitative models produce a list of tuples, indicating
    # if the object is predicted to belong to class 0 and 1; every column
    # is stored under its own key (c0, c1, ...)
    if 'classes' in model_validation_results:
        classes = model_validation_results['classes']
        for i in range(len(classes[0])):
            self.conveyor.addVal(classes[:, i].tolist(), f'c{i}',
                                 f'Class {i}', 'result', 'objs',
                                 'Conformal class assignment', 'main')

    # generate a projected space of the training series; when the method
    # is not defined in the parameters, it is chosen from the number of
    # X variables
    dimRed = self.param.getVal("dimensionality_reduction")
    if dimRed is None:
        _, nvarx = np.shape(self.X)
        dimRed = 't-SNE' if nvarx > 300 else 'PCA'

    if dimRed == 'PCA':
        generatePCASpace(self.X, self.param, self.conveyor)
    elif dimRed == 't-SNE':
        generateManifoldSpace(self.X, self.param, self.conveyor)

    # TODO: compute AD (when applicable)

    LOG.info('Model finished successfully')

    # save model
    model.save_model()

    return
def external_validation(self):
    ''' When experimental values are available for the predicted
    compounds, run external validation.

    The predictions ('values') and the experimental values ('ymatrix')
    are read from the conveyor. The resulting list of
    (key, description, value) tuples is added to the conveyor under the
    key 'external-validation'.

    Problems are logged and the method returns without adding any
    result. For qualitative models, if the experimental values are not
    suitable a warning is also set in the conveyor.
    '''

    if self.conveyor.getVal("values") is None:
        LOG.error("Predicted activity vector is empty")
        return

    if self.conveyor.getVal("ymatrix") is None:
        LOG.error("External activity vector is empty")
        return

    ext_val_results = []

    # Ye are the y values present in the input file
    Ye = np.asarray(self.conveyor.getVal("ymatrix"))
    Yp = np.asarray(self.conveyor.getVal("values"))

    quantitative = self.param.getVal("quantitative")

    # For qualitative models, make sure the Y is qualitative as well
    if not quantitative:
        qy, message = utils.qualitative_Y(Ye)
        if not qy:
            self.conveyor.setWarning(
                f'No qualitative activity suitable for external validation "{message}". Skipping.'
            )
            LOG.warning(
                f'No qualitative activity suitable for external validation "{message}". Skipping.'
            )
            return

    # there are two variants of external validation, depending if the
    # variable is qualitative or quantitative
    if not quantitative:
        # qualitative

        # predictions with value -1 are handled as objects without
        # prediction: they are removed and reported as conformal coverage
        pseudo_conformal = np.count_nonzero(Yp == -1) > 0
        if pseudo_conformal:
            predicted = Yp != -1
            nobj = len(Ye)
            Ye = Ye[predicted]
            Yp = Yp[predicted]
            coverage = len(Ye) / nobj

            ext_val_results.append(
                ('Conformal_coverage',
                 'Conformal coverage in external-validation', coverage))

        if Ye.size == 0:
            LOG.error("Experimental activity vector is empty")
            return

        if Yp.size == 0:
            LOG.error("Predicted activity vector is empty")
            return

        # the use of labels is compulsory to inform the confusion matrix
        # that it must return a 2x2 confussion matrix. Otherwise it will
        # fail when a single class is represented (all TP, for example)
        TN, FP, FN, TP = confusion_matrix(Ye, Yp, labels=[0, 1]).ravel()

        MCC = matthews_corrcoef(Ye, Yp)

        # protect to avoid warnings in special cases (div by zero)
        sensitivity = (TP / (TP + FN)) if (TP + FN) > 0 else 0.0
        specificity = (TN / (TN + FP)) if (TN + FP) > 0 else 0.0

        ext_val_results.extend([
            ('TP', 'True positives in external-validation', float(TP)),
            ('TN', 'True negatives in external-validation', float(TN)),
            ('FP', 'False positives in external-validation', float(FP)),
            ('FN', 'False negatives in external-validation', float(FN)),
            ('Sensitivity', 'Sensitivity in external-validation',
             float(sensitivity)),
            ('Specificity', 'Specificity in external-validation',
             float(specificity)),
            ('MCC',
             'Mattews Correlation Coefficient in external-validation',
             float(MCC)),
        ])

        if pseudo_conformal:
            # accuracy is undefined when the confusion matrix is empty;
            # in this case it is reported with the placeholder '-'
            evaluated = FP + FN + TN + TP
            if evaluated > 0:
                conformal_accuracy = float(TN + TP) / float(evaluated)
            else:
                LOG.error('Failed to compute conformal accuracy: '
                          'no prediction could be evaluated')
                conformal_accuracy = '-'

            ext_val_results.append(
                ('Conformal_accuracy',
                 'Conformal accuracy in external-validation',
                 conformal_accuracy))

    else:
        # quantitative

        if Yp.size == 0:
            LOG.error("Predicted activity vector is empty")
            return

        if Ye.size == 0:
            LOG.error("Experimental activity vector is empty")
            return

        Ym = np.mean(Ye)
        nobj = len(Yp)

        SSY0_out = np.sum(np.square(Ym - Ye))
        SSY_out = np.sum(np.square(Ye - Yp))
        scoringP = mean_squared_error(Ye, Yp)
        SDEP = np.sqrt(SSY_out / (nobj))
        # when all the experimental values are identical Q2 is undefined
        if SSY0_out == 0:
            Q2 = 0.0
        else:
            Q2 = 1.00 - (SSY_out / SSY0_out)

        ext_val_results.append(('scoringP', 'Scoring P', scoringP))
        ext_val_results.append(
            ('Q2', 'Determination coefficient in cross-validation', Q2))
        ext_val_results.append(
            ('SDEP', 'Standard Deviation Error of the Predictions', SDEP))

    self.conveyor.addVal(ext_val_results, 'external-validation',
                         'external validation', 'method', 'single',
                         'External validation results')
def run_internal(self):
    ''' Builds a model using the internally defined machine learning tools.

    All input parameters are extracted from self.param.

    The main output is an instance of basemodel saved in
    the model folder as a pickle (model.pkl) and used for prediction.

    The results of building and validation are added to the conveyor,
    but also saved to the model folder as a pickle (info.pkl)
    for being displayed in manage tools.

    The learner is looked up by name in self.registered_methods. Every
    failure is reported through self.conveyor.setError() and the method
    always returns None.
    '''

    # check suitability of Y matrix
    if not self.param.getVal('quantitative'):
        success, yresult = utils.qualitative_Y(self.Y)
        if not success:
            self.conveyor.setError(yresult)
            return

    # pre-process data
    success, message = self.preprocess()
    if not success:
        self.conveyor.setError(message)
        return

    # collect model information from parameters
    model_type_info = [
        ('quantitative', 'True if the endpoint is quantitative',
         self.param.getVal('quantitative')),
        ('conformal', 'True if the endpoint is conformal',
         self.param.getVal('conformal')),
        ('ensemble', 'True if the model is an ensemble of models',
         self.param.getVal('input_type') == 'model_ensemble'),
        ('ensemble_names', 'List of ensemble models',
         self.param.getVal('ensemble_names')),
        ('ensemble_versions', 'List of ensemble versions',
         self.param.getVal('ensemble_versions')),
        ('conformal_confidence', 'Confidence of the conformal model',
         self.param.getVal('conformalConfidence')),
    ]

    self.conveyor.addVal(
        model_type_info,
        'model_type_info',
        'model type information',
        'method',
        'single',
        'Information about the type of model')

    # instantiate an appropriate child of base_model
    model_name = self.param.getVal('model')
    model = None
    for method_name, method_class in self.registered_methods:
        if method_name == model_name:
            # we instantiate the subtype of base_model,
            # passing
            # - preteated X and Y matrices for model building
            # - model parameters (param)
            # - already obtained results (conveyor)
            model = method_class(self.X, self.Y, self.param, self.conveyor)
            LOG.debug(f'Recognized learner: {model_name}')
            break

    # compare against None explicitly: a valid learner object must not be
    # rejected just because it happens to evaluate as falsy
    if model is None:
        message = f'Modeling method {model_name} not recognized'
        self.conveyor.setError(message)
        LOG.error(message)
        return

    # the learner receives the conveyor and can set an error while it is
    # being instantiated
    if self.conveyor.getError():
        return

    # build model
    LOG.debug('Starting model building')
    success, model_building_results = model.build()
    if not success:
        self.conveyor.setError(model_building_results)
        return

    self.conveyor.addVal(
        model_building_results,
        'model_build_info',
        'model building information',
        'method',
        'single',
        'Information about the model building')

    # validate model
    if self.param.getVal('input_type') == 'model_ensemble':
        validation_method = 'ensemble validation'
    else:
        validation_method = self.param.getVal("ModelValidationCV")
    LOG.info(f'Validating the model using method: {validation_method}')

    success, model_validation_results = model.validate()
    if not success:
        self.conveyor.setError(model_validation_results)
        return

    # model_validation_results is a dictionary which contains
    # model_validation_info and (optionally) Y_adj and Y_pred,
    # depending on the model type
    self.conveyor.addVal(
        model_validation_results['quality'],
        'model_valid_info',
        'model validation information',
        'method',
        'single',
        'Information about the model validation')

    # non-conformal qualitative and quantitative models
    if 'Y_adj' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Y_adj'],
            'Y_adj',
            'Y fitted',
            'result',
            'objs',
            'Y values of the training series fitted by the model')

    if 'Y_pred' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Y_pred'],
            'Y_pred',
            'Y predicted',
            'result',
            'objs',
            'Y values of the training series predicted by the model')

    if 'Conformal_prediction_ranges' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Conformal_prediction_ranges'],
            'Conformal_prediction_ranges',
            'Conformal prediction ranges',
            'method',
            'objs',
            'Interval for the cross-validated predictions')

    if 'Conformal_prediction_ranges_fitting' in model_validation_results:
        self.conveyor.addVal(
            model_validation_results['Conformal_prediction_ranges_fitting'],
            'Conformal_prediction_ranges_fitting',
            'Conformal prediction ranges fitting',
            'method',
            'objs',
            'Interval for the predictions in fitting')

    # conformal qualitative models produce a list of tuples, indicating
    # if the object is predicted to belong to class 0 and 1; every column
    # is stored under its own key (c0, c1, ...)
    if 'classes' in model_validation_results:
        classes = model_validation_results['classes']
        for i in range(len(classes[0])):
            self.conveyor.addVal(
                classes[:, i].tolist(),
                f'c{i}',
                f'Class {i}',
                'result',
                'objs',
                'Conformal class assignment',
                'main')

    # TODO: compute AD (when applicable)

    # generate a proyected space and use it to generate graphics
    generateProjectedSpace(self.X, self.param, self.conveyor)

    LOG.info('Model finished successfully')

    # save model
    model.save_model()

    return