Example #1
    def regularProject(self, Xb, results):
        ''' Projects a collection of query objects on a regular model
        to obtain predictions '''

        Yp = self.estimator.predict(Xb)

        utils.add_result(results, Yp, 'values', 'Prediction', 'result', 'objs',
                         'Results of the prediction', 'main')
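The utils.add_result helper itself is not shown in these examples. Judging only from the call sites, a minimal stand-in might look like the sketch below; the parameter names and all manifest fields other than 'key' and 'type' (which Example #6 reads back) are assumptions, not the library's actual API.

    def add_result(results, value, key, name, vtype, dimension,
                   description, relevance=None):
        '''Hypothetical stand-in inferred from the call sites: store the
        value under `key` and register its metadata in the manifest.'''
        results[key] = value
        results.setdefault('manifest', []).append({
            'key': key,              # key used to retrieve the value
            'name': name,            # human-readable label
            'type': vtype,           # 'result', 'confidence', 'method', ...
            'dimension': dimension,  # 'objs', 'vars' or 'single'
            'description': description,
            'relevance': relevance,  # e.g. 'main' marks primary outputs
        })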
Example #2
    def conformalProject(self, Xb, results):
        ''' Projects a collection of query objects on a conformal model
        to obtain predictions '''

        prediction = self.conformal_pred.predict(
            Xb, significance=self.conformalSignificance)

        if self.quantitative:
            mean1 = np.mean(prediction, axis=1)
            lower_limit = prediction[:, 0]
            upper_limit = prediction[:, 1]
            utils.add_result(results, mean1, 'values', 'Prediction', 'result',
                             'objs', 'Results of the prediction', 'main')
            utils.add_result(results, lower_limit, 'lower_limit',
                             'Lower limit', 'confidence', 'objs',
                             'Lower limit of the conformal prediction')
            utils.add_result(results, upper_limit, 'upper_limit',
                             'Upper limit', 'confidence', 'objs',
                             'Upper limit of the conformal prediction')
        else:
            # for the moment the conformal predictor returns a boolean
            # table with one column per class, e.g.:
            # / c0 / c1 / c2 /
            # /True/True/False/

            for i in range(len(prediction[0])):
                class_key = 'c' + str(i)
                class_label = 'Class ' + str(i)
                class_list = prediction[:, i].tolist()
                utils.add_result(results, class_list, class_key, class_label,
                                 'result', 'objs',
                                 'Conformal class assignment', 'main')
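A toy illustration of the array shapes the two branches above consume, assuming a nonconformist-style conformal predictor:

    import numpy as np

    # quantitative: the predictor returns one [lower, upper] row per object
    prediction = np.array([[3.1, 4.7],
                           [2.0, 2.9]])
    mean1 = np.mean(prediction, axis=1)      # array([3.9 , 2.45])
    lower_limit = prediction[:, 0]           # array([3.1, 2. ])
    upper_limit = prediction[:, 1]           # array([4.7, 2.9])

    # qualitative: one boolean per class; both True means the predictor
    # could not single out one class at the requested significance
    prediction = np.array([[True, False],    # assigned to class 0
                           [False, True],    # assigned to class 1
                           [True, True]])    # no single-class assignment
    c0 = prediction[:, 0].tolist()           # [True, False, True]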
Example #3
    def external_validation(self):
        ''' when experimental values are available for the predicted compounds,
        run external validation '''

        ext_val_results = []
        
        # Ye are the y values present in the input file
        Ye = np.asarray(self.results["ymatrix"])

        # there are four variants of external validation, depending on whether
        # the method is conformal or non-conformal and whether the model is
        # qualitative or quantitative

        if not self.parameters["conformal"]:

            # non-conformal 
            if not self.parameters["quantitative"]:
                
                # non-conformal & qualitative
                Yp = np.asarray(self.results["values"])

                if Ye.size == 0:
                    raise ValueError("Experimental activity vector is empty")
                if Yp.size == 0:
                    raise ValueError("Predicted activity vector is empty")

                # labels are compulsory so that confusion_matrix always
                # returns a 2x2 matrix; otherwise it fails when a single
                # class is represented (all TP, for example)
                TN, FP, FN, TP = confusion_matrix(
                    Ye, Yp, labels=[0, 1]).ravel()

                MCC = mcc(Ye, Yp)

                # guard against division by zero when a class is empty

                if (TP+FN) > 0:
                    sensitivity = (TP / (TP + FN))
                else:
                    sensitivity = 0.0

                if (TN+FP) > 0:
                    specificity = (TN / (TN + FP))
                else:
                    specificity = 0.0

                ext_val_results.append(('TP_ex',
                                        'True positives in external-validation',
                                        float(TP)))
                ext_val_results.append(('TN_ex',
                                        'True negatives in external-validation',
                                        float(TN)))
                ext_val_results.append(('FP_ex',
                                        'False positives in external-validation',
                                        float(FP)))
                ext_val_results.append(('FN_ex',
                                        'False negatives in external-validation',
                                        float(FN)))
                ext_val_results.append(('Sensitivity_ex',
                                        'Sensitivity in external-validation',
                                        float(sensitivity)))
                ext_val_results.append(('Specificity_ex',
                                        'Specificity in external-validation',
                                        float(specificity)))
                ext_val_results.append(('MCC_ex',
                                        'Matthews Correlation Coefficient in external-validation',
                                        float(MCC)))

            else:

                # non-conformal & quantitative
                Yp = np.asarray(self.results["values"])

                if Ye.size == 0:
                    raise ValueError("Experimental activity vector is empty")
                if Yp.size == 0:
                    raise ValueError("Predicted activity vector is empty")

                Ym = np.mean(Ye)
                nobj = len(Yp)

                SSY0_out = np.sum(np.square(Ym - Ye))
                SSY_out = np.sum(np.square(Ye - Yp))
                scoringP = mean_squared_error(Ye, Yp)
                SDEP = np.sqrt(SSY_out / nobj)
                Q2 = 1.00 - (SSY_out / SSY0_out)

                ext_val_results.append(
                    ('scoringP_ex', 'Scoring P', scoringP))
                ext_val_results.append(
                    ('Q2_ex', 'Determination coefficient in external validation', Q2))
                ext_val_results.append(
                    ('SDEP_ex', 'Standard Deviation Error of the Predictions', SDEP))

            utils.add_result(self.results,
                             ext_val_results,
                             'external-validation',
                             'external validation',
                             'method',
                             'single',
                             'External validation results')

        else:
            # conformal external validation

            if not self.parameters["quantitative"]:
                
                # conformal & qualitative
                Yp = np.concatenate((np.asarray(self.results['c0']).reshape(
                    -1, 1), np.asarray(self.results['c1']).reshape(-1, 1)), axis=1)

                if Ye.size == 0:
                    raise ValueError("Experimental activity vector is empty")
                if Yp.size == 0:
                    raise ValueError("Predicted activity vector is empty")

                c0_correct = 0
                c1_correct = 0
                not_predicted = 0
                c0_incorrect = 0
                c1_incorrect = 0

                Ye1 = []
                Yp1 = []
                for i in range(len(Ye)):
                    real = float(Ye[i])
                    predicted = Yp[i]
                    if predicted[0] != predicted[1]:
                        Ye1.append(real)
                        if predicted[0]:
                            Yp1.append(0)
                        else:
                            Yp1.append(1)

                        if real == 0 and predicted[0]:
                            c0_correct += 1
                        if real == 0 and predicted[1]:
                            c0_incorrect += 1
                        if real == 1 and predicted[1]:
                            c1_correct += 1
                        if real == 1 and predicted[0]:
                            c1_incorrect += 1
                    else:
                        not_predicted += 1
                MCC = mcc(Ye1, Yp1)
                TN = c0_correct
                FP = c0_incorrect
                TP = c1_correct
                FN = c1_incorrect
                coverage = float((len(Yp) - not_predicted) / len(Yp))

                if (TP+FN) > 0:
                    sensitivity = (TP / (TP + FN))
                else:
                    sensitivity = 0.0
                if (TN+FP) > 0:
                    specificity = (TN / (TN + FP))
                else:
                    specificity = 0.0
                ext_val_results.append(('TP',
                                        'True positives in external-validation',
                                        float(TP)))
                ext_val_results.append(('TN',
                                        'True negatives in external-validation',
                                        float(TN)))
                ext_val_results.append(('FP',
                                        'False positives in external-validation',
                                        float(FP)))
                ext_val_results.append(('FN',
                                        'False negatives in external-validation',
                                        float(FN)))
                ext_val_results.append(('Coverage',
                                        'Conformal coverage in external-validation',
                                        float(coverage)))
                ext_val_results.append(('Sensitivity',
                                        'Sensitivity in external-validation',
                                        float(sensitivity)))
                ext_val_results.append(('Specificity',
                                        'Specificity in external-validation',
                                        float(specificity)))
                ext_val_results.append(('MCC',
                                        'Matthews Correlation Coefficient in external-validation',
                                        float(MCC)))
                utils.add_result(self.results,
                                 ext_val_results,
                                 'external-validation',
                                 'external validation',
                                 'method',
                                 'single',
                                 'External validation results')
            else:

                # conformal & quantitative
                Yp_lower = np.asarray(self.results['lower_limit'])
                Yp_upper = np.asarray(self.results['upper_limit'])

                # mean width of the conformal prediction intervals
                mean_interval = np.mean(np.abs(Yp_upper - Yp_lower))

                # fraction of experimental values inside their interval
                inside_interval = (Yp_lower < Ye) & (Yp_upper > Ye)
                accuracy = np.sum(inside_interval) / len(Ye)
                conformal_accuracy = float("{0:.2f}".format(accuracy))
                conformal_mean_interval = float(
                    "{0:.2f}".format(mean_interval))

                ext_val_results.append(('Conformal_mean_interval',
                                        'Conformal mean interval',
                                        conformal_mean_interval))
                ext_val_results.append(('Conformal_accuracy',
                                        'Conformal accuracy',
                                        conformal_accuracy))

                utils.add_result(self.results,
                                 ext_val_results,
                                 'external-validation',
                                 'external validation',
                                 'method',
                                 'single',
                                 'External validation results')
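As a quick sanity check of the non-conformal quantitative metrics above (scoringP, SDEP and Q2), a worked toy example with made-up vectors:

    import numpy as np
    from sklearn.metrics import mean_squared_error

    Ye = np.array([1.0, 2.0, 3.0, 4.0])   # experimental values
    Yp = np.array([1.1, 1.9, 3.2, 3.8])   # predicted values

    SSY0 = np.sum(np.square(np.mean(Ye) - Ye))  # total sum of squares: 5.0
    SSY = np.sum(np.square(Ye - Yp))            # residual sum of squares: 0.1
    SDEP = np.sqrt(SSY / len(Yp))               # ~0.158
    Q2 = 1.0 - SSY / SSY0                       # 0.98
    scoringP = mean_squared_error(Ye, Yp)       # 0.025, i.e. SDEP ** 2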
Example #4
    def _run_molecule(self):
        '''
        version of Run for molecular input
        '''

        # extract useful information from file

        success_inform = self.extractInformation(self.ifile)
        if 'error' in self.results:
            return

        nobj = self.results['obj_num']
        ncpu = min(nobj, self.parameters['numCPUs'])

        # copy the input file to a temp file which will be cleaned at the end
        temp_path = tempfile.mkdtemp()
        shutil.copy(self.ifile, temp_path)
        lfile = os.path.join(temp_path, os.path.basename(self.ifile))

        # Execute the workflow in 1 or n CPUs
        if ncpu > 1:
            LOG.debug('Entering molecule workflow for {} cpus'.format(ncpu))
            success, results = sdfu.split_SDFile(lfile, ncpu)

            if not success:
                self.results['error'] = 'unable to split input molecule'
                return

            split_files_names = results[0]
            split_files_sizes = results[1]

            pool = mp.Pool(ncpu)

            if self.parameters['mol_batch'] == 'series':
                results = pool.map(self.workflow_series, split_files_names)
            else:
                results = pool.map(self.workflow_objects, split_files_names)

            success, results = self.consolidate(results, split_files_sizes)

        else:

            if self.parameters['mol_batch'] == 'series':
                success, results = self.workflow_series(lfile)
            else:
                success, results = self.workflow_objects(lfile)

        # series processing (1 or n CPUs) can produce success == False if
        # any of the series/pieces contains an error. Abort the processing
        if not success:
            self.results['error'] = results
            return

        # check if any molecule failed to complete the workflow and then
        # amend object annotations in self.results

        success_workflow = results[2]

        if len(success_inform) != len(success_workflow):

            LOG.error('shape mismatch between informed and workflow results:'
                      f' ({len(success_inform)}, {len(success_workflow)}).'
                      ' Some molecules probably failed during the'
                      ' standardization or descriptor computation steps.')

            self.results['error'] = ('number of molecules informed'
                                     ' and processed does not match')

            return

        # Check whether any molecule that was not informed still
        # completed MD generation. This should never happen, because
        # such molecules do not pass the normalization step
        for i, (inform,
                workflow) in enumerate(zip(success_inform, success_workflow)):
            if workflow and not inform:

                LOG.critical(f'Molecule #{i} is `None` in RDKit'
                             ' but appears to have been processed. This means'
                             ' that there is a serious workflow issue and the'
                             ' molecule should be curated or removed.')

                self.results['error'] = ('Unknown error processing input'
                                         ' file. Probably the format is'
                                         ' wrong or not supported')
                return

        # check whether any informed molecule failed to complete
        # MD generation and, if so, amend the object annotations
        for inform, workflow in zip(success_inform, success_workflow):
            if inform and not workflow:
                self.ammend_objects(success_inform, success_workflow)
                break

        # remove the temp directory with all the temp files inside
        shutil.rmtree(temp_path)

        utils.add_result(self.results, results[0], 'xmatrix', 'X matrix',
                         'method', 'vars', 'Molecular descriptors')

        utils.add_result(self.results, results[1], 'var_nam', 'Var names',
                         'method', 'vars', 'Names of the X variables')

        return
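The parallel branch above follows a split/map/consolidate pattern: the SDFile is split into one chunk per CPU, each chunk runs the same workflow, and the partial results are merged. A self-contained sketch of the same pattern; the worker and the chunk file names are hypothetical:

    import multiprocessing as mp

    def process_chunk(path):
        '''Toy worker: returns the number of lines in one chunk file.'''
        with open(path) as f:
            return sum(1 for _ in f)

    if __name__ == '__main__':
        # hypothetical chunk files produced by a splitter like split_SDFile
        chunks = ['chunk_0.sdf', 'chunk_1.sdf']
        with mp.Pool(len(chunks)) as pool:
            partial = pool.map(process_chunk, chunks)
        total = sum(partial)   # consolidate the per-chunk results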
Example #5
    def extractInformation(self, ifile):
        '''
        Extracts molecule names, biological annotations and experimental
        values from an SDFile.

        All this information is added to the results using the method
        utils.add_result, so it is also inserted into the results manifest.
        '''

        # Initiate a RDKit SDFile iterator to process the molecules one by one
        try:
            suppl = Chem.SDMolSupplier(ifile)
            LOG.debug(f'mol supplier created from {ifile}')
        except Exception as e:
            LOG.debug('Unable to create mol supplier with the exception: '
                      f'{e}')
            self.results['error'] = f'unable to open {ifile}. {e}'
            return

        # Raise error if SDF is empty
        if len(suppl) == 0:
            LOG.critical('ifile {} is empty'.format(ifile))
            raise ValueError('Input SDF is empty')

        # Initiate lists which will contain the extracted values
        obj_nam = []
        obj_bio = []
        obj_exp = []
        obj_sml = []
        success_list = []
        obj_num = 0

        # Iterate for every molecule inside the SDFile
        for mol in suppl:

            # Do not try to process molecules not recognised by RDKit.
            # They will be removed at the pre-normalization step, which is
            # compulsory for every molecule
            if mol is None:
                LOG.error(
                    f'(@extractInformation) Unable to process molecule #{obj_num+1}'
                    f' in file {ifile}')
                # success_list.append(False)
                continue

            # extract the molecule name, using an sdfileutils algorithm
            name = sdfu.getName(mol,
                                count=obj_num,
                                field=self.parameters['SDFile_name'],
                                suppl=suppl)

            # extracts biological information (activity), which is used as
            # the dependent variable for model training and is provided as
            # a prediction for new compounds
            bio = None
            if self.parameters['SDFile_activity'] is not None:
                bio = utils.get_sdf_value(mol,
                                          self.parameters['SDFile_activity'])

            # extracts experimental information, if any.
            # Note that experimental information is used only in prediction,
            # as a value which overrides any model-predicted value
            exp = None
            if self.parameters['SDFile_experimental'] is not None:
                exp = utils.get_sdf_value(
                    mol, self.parameters['SDFile_experimental'])

            # generates a SMILES
            sml = None
            try:
                sml = Chem.MolToSmiles(mol)
            except Exception as e:
                LOG.error('while converting mol to smiles'
                          f' an exception has occurred: {e}')

            # assigns the information extracted from the SDFile to the corresponding lists
            obj_nam.append(name)
            obj_bio.append(bio)
            obj_exp.append(exp)
            obj_sml.append(sml)

            success_list.append(True)
            obj_num += 1

        # Insert the values as lists in 'results' using a utility function
        utils.add_result(self.results, obj_num, 'obj_num', 'Num mol', 'method',
                         'single',
                         'Number of molecules present in the input file')
        utils.add_result(self.results, obj_nam, 'obj_nam', 'Mol name', 'label',
                         'objs',
                         'Name of the molecule, as present in the input file')
        utils.add_result(self.results, obj_sml, 'SMILES', 'SMILES', 'smiles',
                         'objs', 'Structure of the molecule in SMILES format')

        if not utils.is_empty(obj_bio):
            utils.add_result(
                self.results, np.array(obj_bio, dtype=np.float64), 'ymatrix',
                'Activity', 'decoration', 'objs',
                'Biological annotation to be predicted by the model')

        if not utils.is_empty(obj_exp):
            utils.add_result(
                self.results, np.array(obj_exp, dtype=np.float64), 'experim',
                'Experim.', 'decoration', 'objs',
                'Experimental annotation present in the input file')

        LOG.debug(f'processed {obj_num} molecules'
                  f' from a supplier of {len(suppl)} without issues')

        return success_list
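utils.get_sdf_value is not shown in these examples. With RDKit, reading a named SD field usually reduces to HasProp/GetProp; a minimal sketch, assuming the helper returns a float or None:

    from rdkit import Chem

    def get_sdf_value(mol, field):
        '''Sketch of an SD-field reader: returns the named property as a
        float when present and numeric, None otherwise.'''
        if mol is None or not mol.HasProp(field):
            return None
        try:
            return float(mol.GetProp(field))
        except ValueError:
            return None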
Example #6
    def _run_ext_data(self):
        '''
        version of Run for inter-process input
        (calling another model to obtain input)
        '''

        # idata is a list of JSON from 1-n sources
        # the data usable for input must be listed in the ['meta']['main'] key

        # use first JSON to load common info like obj_nam, etc
        obj_common = ['label', 'decoration']

        # load object identifiers and decorations
        first_results = json.loads(self.idata[0])
        first_manifest = first_results['manifest']

        for item in first_manifest:
            if item['type'] in obj_common:
                item_key = item['key']
                self.results[item_key] = first_results[item_key]
                self.results['manifest'].append(item)

        # extract usable data from every source and add to 'combo' np.array
        combined_md = None
        combined_cf = None
        combined_md_names = []
        combined_cf_names = []

        for ijson in self.idata:
            i_result = json.loads(ijson)
            i_manifest = i_result['manifest']
            i_meta = i_result['meta']

            for item in i_manifest:
                if item['type'] == 'result':
                    item_key = item['key']

                    if combined_md is None:  # for first element just copy
                        combined_md = np.array(i_result[item_key],
                                               dtype=np.float64)
                        num_obj = len(i_result[item_key])
                    else:  # append laterally
                        if len(i_result[item_key]) != num_obj:
                            self.results['error'] = ('incompatible size of'
                                                     ' results obtained from'
                                                     ' external sources')
                            return

                        combined_md = np.c_[
                            combined_md,
                            np.array(i_result[item_key], dtype=np.float64)]

                    combined_md_names.append(item_key + ':' +
                                             i_meta['endpoint'] + ':' +
                                             str(i_meta['version']))

                if item['type'] == 'confidence':
                    item_key = item['key']
                    if combined_cf is None:  # for first element just copy
                        combined_cf = np.array(i_result[item_key],
                                               dtype=np.float64)
                    else:  # append laterally
                        combined_cf = np.c_[
                            combined_cf,
                            np.array(i_result[item_key], dtype=np.float64)]

                    combined_cf_names.append(item_key + ':' +
                                             i_meta['endpoint'] + ':' +
                                             str(i_meta['version']))

        utils.add_result(self.results, combined_md, 'xmatrix', 'X matrix',
                         'results', 'objs',
                         'Combined output from external sources')
        utils.add_result(self.results, combined_cf, 'confidence', 'Confidence',
                         'confidence', 'objs',
                         'Combined confidence from external sources')
        utils.add_result(self.results, combined_md_names, 'var_nam',
                         'Var. names', 'method', 'vars',
                         'Variable names from external sources')
        utils.add_result(self.results, combined_cf_names, 'conf_nam',
                         'Conf. names', 'method', 'vars',
                         'Confidence indexes from external sources')

        return
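The lateral append above relies on np.c_, which stacks arrays column-wise; a toy example of how the combined matrix grows with each source:

    import numpy as np

    a = np.array([[1.0], [2.0]])    # first source: 2 objects, 1 result each
    b = np.array([[10.0], [20.0]])  # second source, same number of objects
    combined = np.c_[a, b]          # shape (2, 2): one column per source
    c = np.array([[5.0], [6.0]])
    combined = np.c_[combined, c]   # shape (2, 3): appended laterally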
Example #7
    def _run_data(self):
        '''
        version of Run for data input (TSV tabular format)
        '''
        if not os.path.isfile(self.ifile):
            self.results['error'] = '{} not found'.format(self.ifile)
            # raise FileNotFoundError('{} not found'.format(self.ifile))
            return

        #  Reading TSV by hand
        with open(self.ifile, 'r') as fi:

            var_nam = []
            obj_nam = []
            smiles = []
            xmatrix = None

            for index, line in enumerate(fi):
                # we assume that the first row contains var names
                if index == 0 and self.parameters['TSV_varnames']:
                    var_nam = line.strip().split('\t')
                    var_nam = var_nam[1:]
                else:
                    value_list = line.strip().split('\t')

                    if self.parameters['TSV_objnames']:
                        # we assume that the first column contains object names
                        obj_nam.append(value_list[0])
                        value_list = value_list[1:]

                    if 'SMILES' in var_nam:
                        col = var_nam.index('SMILES')
                        smiles.append(value_list[col])
                        del value_list[col]

                    value_array = np.array(value_list, dtype=np.float64)
                    if xmatrix is None:  # first data row initializes the matrix
                        xmatrix = value_array
                    else:
                        xmatrix = np.vstack((xmatrix, value_array))

        obj_num = index + 1  # number of lines read from the file
        if self.parameters['TSV_varnames']:
            obj_num -= 1  # the first line contains var names, not an object

        # the SMILES column was removed from each value list above, so drop
        # it from var_nam as well, keeping names aligned with xmatrix columns
        if 'SMILES' in var_nam:
            var_nam.remove('SMILES')

        LOG.debug('loaded TSV with shape {}'.format(xmatrix.shape))

        # extract the column named in "TSV_activity" as the ymatrix
        activity_param = self.parameters['TSV_activity']
        LOG.debug('creating ymatrix from column {}'.format(activity_param))
        if activity_param in var_nam:
            col = var_nam.index(activity_param)
            ymatrix = xmatrix[:, col]
            xmatrix = np.delete(xmatrix, col, 1)
            utils.add_result(
                self.results, ymatrix, 'ymatrix', 'Activity', 'decoration',
                'objs', 'Biological anotation to be predicted by the model')

        utils.add_result(self.results, obj_num, 'obj_num', 'Num mol', 'method',
                         'single',
                         'Number of molecules present in the input file')
        utils.add_result(self.results, xmatrix, 'xmatrix', 'X matrix',
                         'method', 'vars', 'Molecular descriptors')

        if self.parameters['TSV_varnames']:
            utils.add_result(self.results, var_nam, 'var_nam', 'Var names',
                             'method', 'vars', 'Names of the X variables')

        if not self.parameters['TSV_objnames']:
            for i in range(obj_num):
                obj_nam.append('obj%.10d' % i)  # zero-padded sequential name

        utils.add_result(self.results, obj_nam, 'obj_nam', 'Mol name', 'label',
                         'objs',
                         'Name of the molecule, as present in the input file')

        if len(smiles) > 0:
            utils.add_result(self.results, smiles, 'SMILES', 'SMILES',
                             'smiles', 'objs',
                             'Structure of the molecule in SMILES format')
        return
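For reference, the layout the parser above expects with TSV_varnames and TSV_objnames both enabled, shown on a hypothetical two-object file:

    tsv = ('name\tSMILES\tMW\tactivity\n'
           'mol1\tCCO\t46.07\t1\n'
           'mol2\tCCN\t45.08\t0\n')
    # After parsing, with TSV_activity == 'activity':
    #   obj_nam == ['mol1', 'mol2']
    #   smiles  == ['CCO', 'CCN']
    #   ymatrix == [1.0, 0.0]
    #   xmatrix == [[46.07], [45.08]]   # only the MW column remains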
Example #8
    def run_internal(self):
        '''
        Builds a model using the internally defined machine learning tools.

        All input parameters are extracted from self.parameters.

        The main output is an instance of basemodel saved in
        the model folder as a pickle (model.pkl) and used for prediction.

        The results of building and validation are added to results,
        but also saved to the model folder as a pickle (info.pkl)
        for being displayed in manage tools.
        '''
        # expand with new methods here:
        registered_methods = [('RF', RF),
                              ('SVM', SVM),
                              ('GNB', GNB),
                              ('PLSR', PLSR),
                              ('PLSDA', PLSDA), ]

        # instantiate an appropriate child of base_model
        model = None
        for imethod in registered_methods:
            if imethod[0] == self.parameters['model']:
                model = imethod[1](self.X, self.Y, self.parameters)
                LOG.debug('Recognized learner: '
                          f"{self.parameters['model']}")
                break

        if not model:
            self.results['error'] = 'modeling method not recognised'
            LOG.error(f'Modeling method {self.parameters["model"]}'
                      ' not recognized')
            return

        # build model
        LOG.info('Starting model building')
        success, model_building_results = model.build()
        if not success:
            self.results['error'] = model_building_results
            return

        utils.add_result(self.results,
                         model_building_results,
                         'model_build_info',
                         'model building information',
                         'method',
                         'single',
                         'Information about the model')
        # self.results['model_build'] = results

        # validate model
        LOG.info('Starting model validation')
        success, model_validation_results = model.validate()
        if not success:
            self.results['error'] = model_validation_results
            return

        # model_validation_results is a tuple which contains
        # model_validation_info and (optionally) Y_adj and Y_pred,
        # depending on the model type
        utils.add_result(self.results,
                         model_validation_results[0],
                         'model_valid_info',
                         'model validation information',
                         'method',
                         'single',
                         'Information about the model validation')

        if len(model_validation_results) > 1:
            utils.add_result(self.results,
                             model_validation_results[1],
                             'Y_adj',
                             'Y fitted',
                             'result',
                             'objs',
                             'Y values of the training series fitted by the model')
        
        if len(model_validation_results) > 2:
            utils.add_result(self.results,
                             model_validation_results[2],
                             'Y_pred',
                             'Y predicted',
                             'result',
                             'objs',
                             'Y values of the training series predicted by the model')

        # TODO: compute AD (when applicable)

        LOG.info('Model finished successfully')

        # save model
        model_pkl_path = os.path.join(self.parameters['model_path'],
                                      'model.pkl')
        with open(model_pkl_path, 'wb') as handle:
            pickle.dump(model, handle, protocol=pickle.HIGHEST_PROTOCOL)
        LOG.debug('Model saved as: {}'.format(model_pkl_path))

        return
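The registered_methods list above implements a simple name-to-class dispatch table. The same idea is often written as a dict lookup; a sketch of the equivalent logic inside the same method, using the classes registered above:

    registered_methods = {'RF': RF,
                          'SVM': SVM,
                          'GNB': GNB,
                          'PLSR': PLSR,
                          'PLSDA': PLSDA}

    method_name = self.parameters['model']
    model_class = registered_methods.get(method_name)
    if model_class is None:
        self.results['error'] = 'modeling method not recognised'
    else:
        model = model_class(self.X, self.Y, self.parameters)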