Python getSMARTSrecalcDesc Examples

Programming Language: Python

Namespace/Package Name: AZutilities.getStructuralDesc

Method/Function: getSMARTSrecalcDesc

Examples at hotexamples.com: 4

Python getSMARTSrecalcDesc - 4 examples found. These are the top rated real world Python examples of AZutilities.getStructuralDesc.getSMARTSrecalcDesc extracted from open source projects. You can rate examples to help us improve the quality of examples.

Example #1

Show file

File: AZorngStructDescTest.py Project: girschic/AZOrange

 def test_smartsRecalc(self):
     """Test structural feature recalculation"""
     result = getStructuralDesc.getFTMDescResult(self.data, 0.9)
     smarts = result.domain.attributes[len(self.data.domain.attributes):]
     
     result2 = getStructuralDesc.getSMARTSrecalcDesc(self.data,smarts)
             
     #expected_atts = 3
     self.assertEqual(len(result),len(result2))

Example #2

Show file

File: getUnbiasedAccuracy.py Project: girschic/AZOrange

    def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
                
			parameters:
                algo - key for the structural feature generation algorithm (set dependent structural features that have to be calculated inside the crossvalidation)
                minsup - minimum support for the algorithm
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: " + str(algorithm))
            self.__log(" Minimum support parameter: " + str(minsup))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Var for saving each Fols result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check data in advance so that, by chance, it will not faill at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Var for saving each Fols result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []

                ### mods TG
                prediction_attribute = orange.FloatVariable("class_prob")
                domain = [data.domain.attributes, prediction_attribute, data.domain.classvar]
                data_new = orange.ExampleTable(domain)

                logTxt = ""
                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup)
                        trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)

                    testData = self.data.select(DataIdxs[foldN])
                    # print "IDX: ",
                    # print DataIdxs[foldN]
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        cut_off = orig_len - len(atts)
                        smarts = trainData.domain.attributes[cut_off:]
                        self.__log("  Number of structural features added: " + str(len(smarts)))
                        testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                        testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)

                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test if trainsets inside optimizer will respect dataSize criterias.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)

                    # save the prediction probabilities

                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj()
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models
            #   ONLY if there are more that one model stable!
            #   When only one or no stable models, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]

Example #3

Show file

File: getUnbiasedAccuracy.py Project: girschic/AZOrange

    def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
                
			parameters:
                algorithm - list of feature generation algorithms (set dependent features that have to be calculated inside the crossvalidation)
                params - dictionary of parameters
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if holdout:
            self.nExtFolds = 1

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            for i in algorithm:
                self.__log(" Algorithm: " + str(i))
            for j, v in params.iteritems():
                self.__log(" Parameter: " + str(j) + " = " + str(v))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = None
        if holdout:
            self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training")
            DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout)
        else:
            DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Var for saving each Fols result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check data in advance so that, by chance, it will not fail at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds
        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Var for saving each Fols result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []
                logTxt = ""

                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    refs = None
                    methods = [
                        "rdk_MACCS_keys",
                        "rdk_topo_fps",
                        "rdk_morgan_fps",
                        "rdk_morgan_features_fps",
                        "rdk_atompair_fps",
                    ]
                    train_domain = None
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                actData = orange.ExampleTable(trainData.domain)
                                for d in trainData:
                                    # only valid for simboosted qsar paper experiments!?
                                    if d.getclass() == "2":
                                        actData.append(d)

                                refs = structuralClustering.getReferenceStructures(
                                    actData,
                                    threshold=params["threshold"],
                                    minClusterSize=params["minClusterSize"],
                                    numThreads=2,
                                )
                                self.__log(
                                    " found "
                                    + str(len(refs))
                                    + " reference structures in "
                                    + str(len(actData))
                                    + " active structures"
                                )
                                orig_len = orig_len + (len(refs) * len(methods))
                                trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods)

                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, [])

                            elif algorithm[i] == "ECFP":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"])
                                train_domain = trainData_ecfp.domain
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, [])

                            else:
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_structDesc = getStructuralDesc.getStructuralDescResult(
                                    trainData, algorithm[i], params["minsup"]
                                )
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, [])

                                    # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab")
                    testData = self.data.select(DataIdxs[foldN])
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log(str(algorithm[i]))
                                testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, [])
                            elif algorithm[i] == "ECFP":
                                self.__log(str(algorithm[i]))
                                # testData_ecfp = orange.ExampleTable(train_domain)
                                tmp_dat = []
                                for d in testData:
                                    tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d)
                                    tmp_dat.append(tmp)
                                testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat)
                                if i == (len(algorithm) - 1):
                                    # 						print "removing atts"
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts)
                                else:
                                    # 						print "removing no atts"
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, [])

                            else:
                                cut_off = orig_len - len(atts)
                                smarts = trainData.domain.attributes[cut_off:]
                                self.__log("  Number of structural features added: " + str(len(smarts)))
                                testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, [])

                    #                testData.save("/home/girschic/proj/AZ/ProjDev/test.tab")
                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test if trainsets inside optimizer will respect dataSize criterias.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        # 		    self.__log("	run path:"+str(runPath))
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )

                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                print "Unexpected error:",
                print sys.exc_info()[0]
                print sys.exc_info()[1]
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models
            #   ONLY if there are more that one model stable!
            #   When only one or no stable models, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]

Example #4

Show file

File: getAccWOptParam.py Project: girschic/AZOrange

    def getAcc(self, algorithm = None, minsup = None, atts = None):
        """ For regression problems, it returns the RMSE and the R2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"R2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None

        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        
        if (self.algorithm):
            self.__log(" Additional structural features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: "+str(self.algorithm))
            self.__log(" Minimum support parameter: "+str(self.minsup))
        
        # Set the response type
        responseType =  self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification"  or "Regression"
        self.__log("  "+str(responseType))

        #Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) 
        
        #Var for saving each Fols result
        results = {}
        exp_pred = {}
        
        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models={}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  "+str([x for x in MLmethods]))
        for ml in MLmethods:
          self.__log("    > "+str(ml)+"...")
          try:
            #Var for saving each Fols result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN],negate=1)
                orig_len = len(trainData.domain.attributes)

		if (self.algorithm):
			# add structural descriptors to the training data (TG)
                	trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, self.algorithm, self.minsup)
                	trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, self.atts)

                runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam")
                trainData.save(os.path.join(runPath,"trainData.tab"))

                testData = self.data.select(DataIdxs[foldN])
		if (self.algorithm):
			# calculate the feature values for the test data (TG)
			cut_off = orig_len - len(self.atts)
                	smarts = trainData.domain.attributes[cut_off:]
			self.__log("  Number of structural features added: "+str(len(smarts)))
			testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData,smarts)
			testData = dataUtilities.attributeDeselectionData(testData_structDesc, self.atts)

                paramOptUtilities.getOptParam(
                    learner = MLmethods[ml], 
                    trainDataFile = os.path.join(runPath,"trainData.tab"), 
                    paramList = self.paramList, 
                    useGrid = False, 
                    verbose = self.verbose, 
                    queueType = self.queueType, 
                    runPath = runPath, 
                    nExtFolds = None, 
                    nFolds = self.nInnerFolds
                    )
                if not MLmethods[ml].optimized:
                    self.__log("       The learner "+str(ml)+" was not optimized.")
                    raise Exception("The learner "+str(ml)+" was not optimized.")
                miscUtilities.removeDir(runPath) 
		
                #Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
            res = self.createStatObj(results[ml], exp_pred[ml], responseType, self.nExtFolds)
            if self.verbose > 0: 
                print "AccWOptParamGetter!Results  "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = res.copy()
            self.__writeResults(res)
            self.__log("       OK")
          except:
            self.__log("       Learner "+str(ml)+" failed to optimize!")
            res = self.createStatObj()
            statistics[ml] = res.copy()

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models 
            #   ONLY if there are more that one model stable!
            stableML={}
            for modelName in statistics:
                if statistics[modelName]["StabilityValue"] < AZOC.QSARSTABILITYTHRESHOLD:   # Select only stable models
                    stableML[modelName] = statistics[modelName].copy()
            if len(stableML) >= 2:
                self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")
                if responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = "(0"
                    for ml in stableML:
                        exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in stableML:
                        exprTest0 += ", "+ml+" == "+CLASS0+" "
                    exprTest0 += "]),1)"
                    exprTest1 = exprTest0.replace(CLASS0,CLASS1)
                    expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                else:
                    R2sum = sum([stableML[ml]["R2"] for ml in stableML])
                    expression = "(1 / "+str(R2sum)+") * (0"
                    for ml in stableML:
                        expression += " + "+str(stableML[ml]["R2"])+" * "+ml+" "
                    expression += ")"

                #Var for saving each Fols result
                Cresults = []
                Cexp_pred = []
                self.__log("Calculating the statistics for a Consensus model")
                for foldN in range(self.nExtFolds):
                    testData = self.data.select(DataIdxs[foldN])
                    consensusClassifiers = {}
                    for learnerName in stableML:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)     
                    #Test the model
                    if responseType == "Classification":
                        Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, responseType, self.nExtFolds)
                statistics["Consensus"] = res.copy()
                statistics["Consensus"]["IndividualStatistics"] = stableML.copy()
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics
                 
        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]