Ejemplo n.º 1
0
    def optimizeParameters(self):
        """ Sets up the input learner with tuned parameters  """
        # Runs the external parameter optimizer over the connected dataset,
        # configures self.learner in place, and sends the tuned learner plus
        # the optimization log on the widget's output channels.

        # Reset state from any previous optimization run.
        self.clearErrors()
        self.tunedPars = None
        if hasattr(self.learner, "optimized"):
            self.learner.optimized = False

        # No learner connected: clear both outputs and bail out.
        if not self.learner:
            self.send("Learner - Tuned", None)
            self.send("Examples - Optimization Steps", None)
            self.updateInfo()
            return

        # Apply the parameters var with values  on configuration table of GUI (user could have changed them!)
        if not self.updateParametersFromTable():
            return

        # No dataset connected: clear both outputs and bail out.
        if not self.dataset:
            self.dataset = None
            self.send("Learner - Tuned", None)
            self.send("Examples - Optimization Steps", None)
            self.updateInfo()
            return

        # Progess Bar 1
        # Modal dialog covering the setup phase (3 coarse steps).
        optSteps = 3
        progress1 = QProgressDialog(
            "Gathering data and configuring the optimizer...", "Cancel", 0,
            optSteps, self, Qt.Dialog)  #, "progress", True )
        progress1.setWindowModality(Qt.WindowModal)
        bar1 = QProgressBar(progress1)
        bar1.show()
        progress1.setBar(bar1)
        #progress1.setTotalSteps(optSteps)
        progress1.setMinimumDuration(0)
        progress1.forceShow()
        progress1.setValue(0)
        # Short sleep so the dialog has a chance to paint before work starts.
        time.sleep(0.1)
        progress1.setValue(0)

        # Create path for running the optimizer
        # NOTE(review): randNr is never used below — presumably a leftover;
        # the scratch dir name comes from miscUtilities.createScratchDir.
        randNr = random.randint(0, 10000)
        if self.execEnv == 0:
            scratchdir = miscUtilities.createScratchDir(
                desc="OWParamOpt_Serial")
        else:
            # Non-serial (MPI/SGE) runs need a scratch dir reachable by all
            # nodes, hence the NFS base directory.
            scratchdir = miscUtilities.createScratchDir(
                desc="OWParamOpt_MPI", baseDir=AZOC.NFS_SCRATCHDIR)
        # Save the dataset to the optimizer running path
        OrngFile = os.path.join(scratchdir, "OrngData.tab")
        orange.saveTabDelimited(OrngFile, self.dataset)
        # Advance Progress Bar
        progress1.setValue(1)
        # Define the evaluation method to use
        # (regression vs classification is chosen from the class variable type;
        #  each *Methods entry holds [label, evaluation function, findMin flag])
        if self.dataset.domain.classVar.varType == orange.VarTypes.Continuous:
            fMin = self.RMethods[self.RMethod][2]
            evalM = self.RMethods[self.RMethod][1]
        else:
            fMin = self.CMethods[self.CMethod][2]
            evalM = self.CMethods[self.CMethod][1]
        try:
            # Write the learner's parameter space to the scratch dir as a
            # small python module, removing any stale copy first.
            if os.path.exists(
                    os.path.join(scratchdir, "AZLearnersParamsConfig.py")):
                os.system(
                    "rm " +
                    str(os.path.join(scratchdir, "AZLearnersParamsConfig.py")))
            paramFile = file(
                os.path.join(scratchdir, "AZLearnersParamsConfig.py"), "w")
            paramFile.write(self.learnerType + "= " + str(self.parameters) +
                            "\r\n")
            paramFile.close()

            progress1.setValue(2)
            # Run the optimizer which will configure the input learner and aditionaly return [<minimum of objective function found>, <optimized parameters>]
            # Serial
            print "ENV:", self.execEnv
            if self.execEnv == 0:
                print "Executing the optimizer in serial mode on local machine"
                optPID = self.optimizer(
                    learner=self.learner,
                    dataSet=OrngFile,
                    evaluateMethod=evalM,
                    findMin=fMin,
                    nFolds=self.nFolds,
                    samplingMethod=self.SMethods[self.SMethod][1],
                    runPath=scratchdir,
                    verbose=self.verbose,
                    externalControl=1,
                    useParameters=self.parameters,
                    useGridSearchFirst=self.UseGridSearch,
                    gridSearchInnerPoints=self.nInnerPoints,
                    np=None,
                    machinefile=None,
                    advancedMPIoptions="",
                )
            # Local mpi
            elif self.execEnv == 1:
                print "Executing the optimizer in parallel mode on local machine"
                optPID = self.optimizer(
                    learner=self.learner,
                    dataSet=OrngFile,
                    evaluateMethod=evalM,
                    findMin=fMin,
                    nFolds=self.nFolds,
                    samplingMethod=self.SMethods[self.SMethod][1],
                    runPath=scratchdir,
                    verbose=self.verbose,
                    externalControl=1,
                    useParameters=self.parameters,
                    useGridSearchFirst=self.UseGridSearch,
                    gridSearchInnerPoints=self.nInnerPoints,
                    machinefile=0)
            # Sge Molndal
            elif self.execEnv == 2:
                print "Executing the optimizer in parallel mode in the batch queue on the sge"
                print "*****************runPath*****************"
                optPID = self.optimizer(
                    learner=self.learner,
                    dataSet=OrngFile,
                    evaluateMethod=evalM,
                    findMin=fMin,
                    nFolds=self.nFolds,
                    samplingMethod=self.SMethods[self.SMethod][1],
                    runPath=scratchdir,
                    verbose=self.verbose,
                    externalControl=1,
                    useParameters=self.parameters,
                    useGridSearchFirst=self.UseGridSearch,
                    gridSearchInnerPoints=self.nInnerPoints,
                    np=8,
                    machinefile="qsub")  #, sgeEnv = "sge_seml")
            # Sge quick queue
            elif self.execEnv == 3:
                print "Executing the optimizer in parallel mode in the quick queue on the sge"
                print "*****************runPath*****************"
                optPID = self.optimizer(
                    learner=self.learner,
                    dataSet=OrngFile,
                    evaluateMethod=evalM,
                    findMin=fMin,
                    nFolds=self.nFolds,
                    samplingMethod=self.SMethods[self.SMethod][1],
                    runPath=scratchdir,
                    verbose=self.verbose,
                    externalControl=1,
                    useParameters=self.parameters,
                    useGridSearchFirst=self.UseGridSearch,
                    gridSearchInnerPoints=self.nInnerPoints,
                    np=8,
                    machinefile="qsub",
                    queueType="quick.q")  #, sgeEnv = "sge_seml")
            else:
                print "No SGE Env. selected. Nothing will happen."
        except:
            # NOTE(review): bare except — any failure (even KeyboardInterrupt)
            # lands here; the scratch dir is kept for post-mortem inspection.
            progress1.close()
            self.updateInfo()
            self.setErrors(
                "Some error(s) occurred during the optimization.\nCheck the " +
                str(scratchdir) +
                " and the output terminal for more information")
            self.send("Learner - Tuned", None)
            self.send("Examples - Optimization Steps", None)
            return

        progress1.setValue(3)

        # The optimizer returns an int PID on success; anything else is an
        # error report to surface to the user.
        if type(optPID) != types.IntType:
            progress1.close()
            self.updateInfo()
            self.setErrors("Some error(s) occurred during optimization:\n" +
                           str(optPID))
            self.send("Learner - Tuned", None)
            self.send("Examples - Optimization Steps", None)
            return

        progress1.close()

        # Progess Bar
        # Rough step estimate scaled by data size and parameter count, used
        # only to size the second (waiting) progress dialog.
        optSteps = (1 + round(
            (len(self.dataset) * len(self.dataset.domain.attributes) *
             self.nParameters) / 1000)) * 8
        print "Learner optimization started at " + time.asctime()
        print "Optimization steps = ", int(
            optSteps), " (estimated to aprox. ", optSteps / 2, " seconds)"
        progress = QProgressDialog("Learner optimization started at " +
                                   time.asctime() + " ,please wait...",
                                   "Abort Optimization", 0, optSteps, self,
                                   Qt.Dialog)  #, "progress", True )
        progress.setWindowModality(Qt.WindowModal)
        bar = QProgressBar(progress)
        bar.show()
        progress.setBar(bar)
        #progress.setTotalSteps(optSteps)
        progress.setMinimumDuration(0)
        stepsDone = 0
        progress.setValue(stepsDone)
        progress.forceShow()
        #Loop waiting for the optimizer to finish
        # Polls every 0.5s; when the step estimate runs out, the bar restarts
        # from 0 with an "taking longer than expected" message.
        while 1:
            if stepsDone < (progress.maximum() - 1):
                progress.setValue(stepsDone)
                stepsDone += 1
                time.sleep(0.5)
            else:
                bar.setTextVisible(False)
                progress.setLabelText(
                    "The optimizer is taking longer than expected, please wait some more time..."
                )
                stepsDone = 0
                progress.setValue(stepsDone)
                time.sleep(0.5)
            if progress.wasCanceled():
                # Try to stop the optimizer; if it cannot be stopped we keep
                # waiting until it finishes on its own.
                if not self.optimizer.stop():
                    progress.setLabelText(
                        "Could not stop the optimizer! Please wait until it finish..."
                    )
                else:
                    self.setErrors(
                        "Learner optimization stopped by user at " +
                        time.asctime(), "WARNING")
                    break
            if self.optimizer.isFinished():
                print "Learner optimization finished at " + time.asctime()
                break
        progress.setValue(progress.maximum() - 1)
        time.sleep(0.5)
        progress.setValue(progress.maximum())
        self.tunedPars = self.optimizer.tunedParameters
        if self.verbose > 0:
            if self.optimizer.usedMPI:
                print "appspack version used in fact: MPI"
            else:
                print "appspack version used in fact: SERIAL"
        # A list of tuned parameters plus a learner flagged as optimized means
        # success; anything else clears the outputs.
        if type(self.tunedPars
                ) != types.ListType or self.learner.optimized == False:
            self.send("Learner - Tuned", None)
            self.send("Examples - Optimization Steps", None)
        else:
            self.send("Learner - Tuned", self.learner)
            self.intRes = dataUtilities.DataTable(scratchdir +
                                                  "/optimizationLog.txt")
            self.send("Examples - Optimization Steps", self.intRes)
        self.updateInfo()

        # Keep the scratch dir around when verbose for debugging purposes.
        if self.verbose == 0:
            miscUtilities.removeDir(scratchdir)
        else:
            self.setErrors(
                "The directory " + str(scratchdir) +
                " was not deleted because verbose flag is ON", "DEBUG")
Ejemplo n.º 2
0
                self.classifier = None
                self.error(0,"ERROR: It was not possible to create a classifyer. Check any previous errors.")
            #print time.asctime(), " -> self.data = self.data"
        else:
            self.classifier = None
        #print time.asctime(), " -> self.send(...)"
        self.send("Classifier", self.classifier)


    def pbchange(self, val):
        """Forward a progress update to the widget's progress bar.

        Scales the incoming value by 100 before forwarding — presumably
        *val* is a 0..1 fraction; confirm against the caller.
        """
        percent = val * 100
        self.progressBarSet(percent)



##############################################################################
# Test the widget, run from DOS prompt
# > python OWDataTable.py)
# Make sure that a sample data set (adult_sample.tab) is in the directory

if __name__ == "__main__":
    # Manual smoke test: build the widget, feed it the sample table and
    # spin the Qt event loop until the user closes the window.
    app = QApplication(sys.argv)
    widget = OWCvRF()
    app.setMainWidget(widget)

    sampleData = dataUtilities.DataTable('adult_sample')
    widget.setData(sampleData)

    widget.show()
    app.exec_loop()
    widget.saveSettings()
Ejemplo n.º 3
0
def LLOOprob_b(idx, extTrain, measure=None):
    """
    Non-conformity score for example *idx* of *extTrain*, based on how well
    local models predict its nearest neighbours in a leave-one-out loop.

    Use the fraction of kNN correctly predicted by a local model
    Hard coded to 50 NN.
    Modeling method. RF of Tree?

    idx      -- index of the example being scored
    extTrain -- training table; needs > 51 examples, since the distance at
                sorted index 50 is used as the neighbourhood radius
    measure  -- optional pairwise distance; defaults to Euclidean over extTrain

    Returns the alpha score: the scored example's own alpha (extra weight)
    plus the mean alpha over its neighbourhood.
    """

    # Distances from idx to every OTHER example (idx itself is excluded here).
    distList = []
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            distList.append(dist)

    # Neighbourhood radius: distance to the 51st nearest neighbour
    # (sorted index 50). Smaller number of NN does not work with returnDFV.
    distList.sort()
    thresDist = distList[50]

    # Collect every example within the radius; this second pass includes idx
    # itself (dist == 0), so the scored example is part of its neighbourhood.
    kNN = []
    for runIdx in range(len(extTrain)):
        dist = measure(extTrain[idx], extTrain[runIdx])
        if dist <= thresDist:
            kNN.append(extTrain[runIdx])
    kNNtrain = dataUtilities.DataTable(kNN)

    # Leave-one-out over the neighbourhood: train a local RF on all but one
    # neighbour and record how (un)certain the prediction of the held-out one is.
    alphaList = []
    alphaEx = 0
    for iidx in range(len(kNNtrain)):

        # Deselect example iidx in kNNtrain
        idxList = range(0, iidx)
        idxList.extend(range(iidx + 1, len(kNNtrain)))
        train = kNNtrain.get_items(idxList)

        # Get prediction and pred probability
        model = AZorngRF.RFLearner(train)
        predList = model(kNNtrain[iidx], returnDFV=True)
        pred = predList[0].value
        prob = predList[1]
        actual = kNNtrain[iidx].get_class().value

        # The prob of the predEx is more important
        dist = measure(extTrain[idx], kNNtrain[iidx])

        # alpha should be greater the less certain the model
        try:
            if pred != actual:
                alpha = 1.0 + abs(prob)
            else:
                alpha = 1.0 - abs(prob)
            # dist ~ 0 means this held-out neighbour is idx itself; keep its
            # alpha separately so it gets extra weight in the final score.
            # (The original duplicated this check in both branches.)
            if dist < 0.001:
                alphaEx = alpha
            alphaList.append(alpha)
        except Exception:
            # Narrowed from a bare 'except:' so KeyboardInterrupt still works.
            # A failure here simply drops this neighbour from the average.
            pass

    # Guard: if every iteration failed, the original raised ZeroDivisionError;
    # fall back to the scored example's own alpha instead.
    if alphaList:
        alpha = alphaEx + sum(alphaList) / float(len(alphaList))
    else:
        alpha = alphaEx

    return alpha
Ejemplo n.º 4
0
    def sendpredictions(self):
        """Build a table holding the input data plus one column per connected
        predictor (predicted class/value, optionally per-class probabilities
        and top important variables) and send it on the "Predictions"
        channel. Domain-fix events logged by predictors are turned into
        QMessageBox warnings shown after sending."""
        def getValue(c, ex, getProb=None):
            """ Get the predicted value of ex using classifier c and gets the probability of symbol of order getProb"""
            if getProb != None:
                theValue = c(ex, orange.GetProbabilities)
                if hasattr(c, "isRealProb") and not c.isRealProb():
                    self.warning(
                        0,
                        "The probabilities are not available in this particular case."
                    )
                    return orange.Value('?')
            else:
                theValue = c(ex)
                #print "theValue: ",theValue
            if theValue:
                if getProb != None:
                    return orange.Value(theValue[getProb])
                else:
                    return orange.Value(theValue)
            else:
                # Prediction failed (e.g. incompatible domain) -> unknown value.
                self.warning(
                    0,
                    "Some example(s) were not able to be predicted. Check the domain compatibility of train and test datasets!"
                )
                return orange.Value("?")

        # Clear any previous warning; nothing to do without data and an
        # outcome variable.
        self.warning(0)
        if not self.data or not self.outvar:
            self.send("Predictions", None)
            return
        messages = []
        # predictions, data set with class predictions
        classification = self.outvar.varType == orange.VarTypes.Discrete

        # create a new domain for the new data handling the predictions
        domain = orange.Domain(self.data.domain.attributes +
                               [self.data.domain.classVar])

        # Add to the predictions the original meta attributes present in Data
        domain.addmetas(self.data.domain.getmetas())

        # Create the new Data Table containing the Data and the Predictions
        predictions = dataUtilities.DataTable(domain, self.data)

        # The number of examples to be predicted
        nEx = len(self.data)
        # the number of Learners
        nL = len(self.predictors)
        # The number of calculated iteractions
        nIter = 1
        # the number of iterations Done
        # NOTE(review): 'iter' shadows the builtin; kept unchanged here.
        iter = 0

        self.progressBarSet(0)
        self.progressBarInit()
        if self.verbose:
            for c in self.predictors.values():
                c.verbose = int(self.verbose)
        if classification:
            # One probability meta-column per selected class value and
            # predictor, named "<predictor>(<class value>)".
            if len(self.selectedClasses):
                nIter = (nEx * nL * len(self.selectedClasses)) + (nEx * nL)
                for c in self.predictors.values():
                    for i in self.selectedClasses:
                        m = orange.FloatVariable(
                            name="%s(%s)" %
                            (c.name, str(self.outvar.values[i])))
                        domain.addmeta(orange.newmetaid(), m)
                        for ex in predictions:
                            ex[m.name] = getValue(
                                c, orange.Example(self.data.domain, ex), i)
                            self.progressBarSet((iter * 100) / nIter)
                            iter += 1
            else:
                iter = 0
                nIter = nEx * nL

            # One predicted-class meta-column per predictor; the fixed-log is
            # reset so only fixes from THIS prediction pass are reported below.
            for c in self.predictors.values():
                if hasattr(c, 'examplesFixedLog'):
                    c.examplesFixedLog = {}
                m = orange.EnumVariable(name="%s" % c.name,
                                        values=self.outvar.values)
                domain.addmeta(orange.newmetaid(), m)
                for ex in predictions:
                    ex[m.name] = getValue(c,
                                          orange.Example(self.data.domain, ex))
                    self.progressBarSet((iter * 100) / nIter)
                    iter += 1

        else:
            # regression
            nIter = nEx * nL
            for c in self.predictors.values():
                if hasattr(c, 'examplesFixedLog'):
                    c.examplesFixedLog = {}
                m = orange.FloatVariable(name="%s" % c.name)
                domain.addmeta(orange.newmetaid(), m)
                for ex in predictions:
                    ex[m.name] = getValue(c,
                                          orange.Example(self.data.domain, ex))
                    self.progressBarSet((iter * 100) / nIter)
                    iter += 1

        if self.verbose:
            for c in self.predictors.values():
                c.verbose = 0

        #Compute and return individual Var Importance
        iter = 0
        nIter = nEx * nL
        if self.nVarImportance > 0:
            for c in self.predictors.values():
                if hasattr(c, 'getTopImportantVars'):
                    m = orange.StringVariable(name="%s" % c.name +
                                              "(Top Important Vars)")
                    domain.addmeta(orange.newmetaid(), m)
                    for ex in predictions:
                        topVars = c.getTopImportantVars(
                            ex, self.nVarImportance)
                        if topVars:
                            # Single var is shown bare; several are shown as
                            # the stringified list.
                            if len(topVars) == 1:
                                topVars = str(topVars[0])
                            else:
                                topVars = str(topVars)
                            #if topVars not in m.values:
                            #    m.values.append(topVars)
                            ex[m.name] = topVars
                        else:
                            ex[m.name] = "?"
                        self.progressBarSet((iter * 100) / nIter)
                        iter += 1

        self.progressBarFinished()
        # Turn each predictor's examplesFixedLog entries into
        # "QMessageBox.warning(...)" source strings, executed after sending.
        for c in self.predictors.values():
            if hasattr(c, 'examplesFixedLog') and (
                    'Missing Attributes' in c.examplesFixedLog
            ) and c.examplesFixedLog['Missing Attributes']:
                missingAttrs = ""
                for attr in c.examplesFixedLog['Missing Attributes']:
                    missingAttrs += "  " + attr + "\\n"
                messages.append("QMessageBox.warning( None, \"Missing Attributes\" ,"+\
                       "\"The following attributes were missing in the examples to be predicted:\\n" + \
                       missingAttrs + "\", QMessageBox.Ok)")
            if hasattr(c, 'examplesFixedLog') and ('Fixed Types of variables'
                                                   in c.examplesFixedLog):
                if 'Vars needing type fix' in c.examplesFixedLog:
                    msg = "Some variable types were fixed while predicting with " + c.name + "!\\nTypes Fixed: \\n"
                    for var in c.examplesFixedLog['Vars needing type fix']:
                        msg += "  " + var + ": " + str(
                            c.examplesFixedLog['Vars needing type fix']
                            [var]) + '\\n'
                else:
                    msg = "Some variable types were fixed while predicting with " + c.name + "!"
                messages.append("QMessageBox.warning( None, \"Fixed Types of variables in "+\
                        str(c.examplesFixedLog['Fixed Types of variables'])+
                        " examples\",\""+ msg+"\", QMessageBox.Ok)")
            if hasattr(c, 'examplesFixedLog') and ('Fixed Order of variables'
                                                   in c.examplesFixedLog):
                messages.append("QMessageBox.warning( None, \"Fixed Order of variables in "+\
                        str(c.examplesFixedLog['Fixed Order of variables'])+
                        " examples\",\"The order of variables in test data was not the same has in the original\\n"+\
                        "training set used on "+c.name+", so they were fixed.\", QMessageBox.Ok)")
            if hasattr(c, 'examplesFixedLog') and ('Fixed Number of variables'
                                                   in c.examplesFixedLog):
                messages.append("QMessageBox.warning( None, \"Fixed Number of variables in "+\
                        str(c.examplesFixedLog['Fixed Number of variables'])+
                        " examples\",\"The Number of variables in test data were not the same has in the original\\n"+\
                        "training set used on "+c.name+", so only the variables\\npresent in the training set were used .\", QMessageBox.Ok)")
        predictions.name = self.data.name
        self.send("Predictions", predictions)

        # Python 2 exec statement. The strings are built above from internal
        # logs (not user input), but this is fragile if attribute names ever
        # contain quotes — NOTE(review): consider calling QMessageBox directly.
        for msg in messages:
            exec(msg) in globals()
Ejemplo n.º 5
0
    def setUp(self):
        """Creates the training and testing data set attributes. """
        # Helper: resolve a data file relative to the AZOrange install root.
        def _azoPath(rel):
            return os.path.join(AZOC.AZORANGEHOME, rel)

        self.dataPathD = _azoPath("tests/source/data/iris.tab")
        self.dataPathC = _azoPath("tests/source/data/Reg_No_metas_Test.tab")

        # Read in the data
        self.inDataD = dataUtilities.DataTable(self.dataPathD)
        self.inDataC = dataUtilities.DataTable(self.dataPathC)

        # Full path to saved svm model
        global scratchdir
        self.modelPath = os.path.join(scratchdir, "model.svm")

        # Other datasets used by individual tests.
        contDataPath = _azoPath("tests/source/data/Reg_No_metas_Imp_Test.tab")
        SVMregDataPath = _azoPath("tests/source/data/Reg_No_metas_Train.tab")
        contTrainDataPath = _azoPath(
            "tests/source/data/Reg_No_metas_Imp_Train.tab")
        dataNoMetaTrainPath = _azoPath(
            "tests/source/data/BinClass_No_metas_Train.tab")
        missingTestDataPath = _azoPath(
            "tests/source/data/BinClass_No_metas_Train_missing.tab")

        # These 2 datasets are equal apart from the meta atribute
        dataNoMetaTestPath = _azoPath(
            "tests/source/data/BinClass_No_metas_SmallTest.tab")
        dataWMetaTestPath = _azoPath(
            "tests/source/data/BinClass_W_metas_SmallTest.tab")

        # Read in the data
        # Iris train/test split
        trainDataPath = _azoPath("tests/source/data/irisTrain.tab")
        testDataPath = _azoPath("tests/source/data/irisTest.tab")
        self.train_data = dataUtilities.DataTable(trainDataPath)
        self.test_data = dataUtilities.DataTable(testDataPath)

        missingInData = dataUtilities.DataTable(missingTestDataPath)
        contTrainData = dataUtilities.DataTable(contTrainDataPath)
        self.regTrainData = dataUtilities.DataTable(SVMregDataPath)
        contData = dataUtilities.DataTable(contDataPath)
        self.NoMetaTrain = dataUtilities.DataTable(dataNoMetaTrainPath)
        self.NoMetaTest = dataUtilities.DataTable(dataNoMetaTestPath)
        self.WMetaTest = dataUtilities.DataTable(dataWMetaTestPath)

        # The same missing-values table serves as both train and test.
        self.missingTrain = missingInData
        self.missingTest = missingInData
        self.contTrain = contTrainData
        self.contTest = contData

        # Data for domain fix handling
        self.noBadDataTrain = self.NoMetaTrain
        self.noBadDataTest = self.NoMetaTest
        self.badVarTypeData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarType.tab"))
        self.badVarNameData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarName.tab"))
        self.badVarOrderData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarOrder.tab"))
        self.badVarCountData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarCount.tab"))  #One less example
Ejemplo n.º 6
0
    def setUp(self):
        """Creates the training and testing data set attributes. """
        # Helper: resolve a data file relative to the AZOrange install root.
        def _azoPath(rel):
            return os.path.join(AZOC.AZORANGEHOME, rel)

        trainDataPath = _azoPath(
            "tests/source/data/BinClass_No_metas_Train.tab")
        testDataPath = _azoPath("tests/source/data/BinClass_No_metas_Test.tab")
        trainDataRegPath = _azoPath("tests/source/data/Reg_No_metas_Train.tab")
        testDataRegPath = _azoPath("tests/source/data/Reg_No_metas_Test.tab")
        missingTestDataPath = _azoPath(
            "tests/source/data/BinClass_No_metas_Train_missing.tab")
        irisPath = _azoPath("tests/source/data/iris.tab")
        # Read in the data
        missingInData = dataUtilities.DataTable(missingTestDataPath)
        self.trainData = dataUtilities.DataTable(trainDataPath)
        self.testData = dataUtilities.DataTable(testDataPath)
        self.trainDataReg = dataUtilities.DataTable(trainDataRegPath)
        self.testDataReg = dataUtilities.DataTable(testDataRegPath)
        self.irisData = dataUtilities.DataTable(irisPath)

        ##scPA
        dataNoMetaTrainPath = trainDataPath

        # These 2 datasets are equal apart from the meta atribute
        dataNoMetaTestPath = _azoPath(
            "tests/source/data/BinClass_No_metas_SmallTest.tab")
        dataWMetaTestPath = _azoPath(
            "tests/source/data/BinClass_W_metas_SmallTest.tab")

        # Read in the data
        # The same missing-values table serves as both train and test.
        self.missingTrain = missingInData
        self.missingTest = missingInData
        self.NoMetaTrain = dataUtilities.DataTable(dataNoMetaTrainPath)
        self.NoMetaTest = dataUtilities.DataTable(dataNoMetaTestPath)
        self.WMetaTest = dataUtilities.DataTable(dataWMetaTestPath)

        # Data for domain fix handling
        self.noBadDataTrain = self.NoMetaTrain
        self.noBadDataTest = self.NoMetaTest
        self.badVarTypeData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarType.tab"))
        self.badVarNameData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarName.tab"))
        self.badVarOrderData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarOrder.tab"))
        self.badVarCountData = dataUtilities.DataTable(
            _azoPath("tests/source/data/BinClass_BadVarCount.tab"))  #One less example
        self.RegDAttr = dataUtilities.DataTable(
            _azoPath("tests/source/data/Reg_No_metas_Imp_Train.tab"))
Ejemplo n.º 7
0
        self.applyButton.setEnabled(True)

    def onMetaButtonUpClick(self):
        """Move the selected meta attributes one slot towards the top."""
        direction = -1
        self.moveSelection("metaAttributes", "selectedMeta", direction)

    def onMetaButtonDownClick(self):
        """Move the selected meta attributes one slot towards the bottom."""
        direction = 1
        self.moveSelection("metaAttributes", "selectedMeta", direction)

    def onAttributesButtonUpClick(self):
        """Move the selected chosen attributes one slot towards the top."""
        direction = -1
        self.moveSelection("chosenAttributes", "selectedChosen", direction)

    def onAttributesButtonDownClick(self):
        """Move the selected chosen attributes one slot towards the bottom."""
        direction = 1
        self.moveSelection("chosenAttributes", "selectedChosen", direction)


if __name__ == "__main__":
    import sys
    # Manual smoke test: load iris, attach a "name" string meta attribute
    # holding each example's class label, then run the widget.
    irisData = dataUtilities.DataTable(r'..\..\doc\datasets\iris.tab')
    # add meta attribute
    irisData.domain.addmeta(orange.newmetaid(), orange.StringVariable("name"))
    for example in irisData:
        example["name"] = str(example.getclass())

    app = QApplication(sys.argv)
    widget = OWDataDomain()
    app.setMainWidget(widget)
    widget.show()
    widget.onDataInput(irisData)
    a.exec_loop() if False else app.exec_loop()
    widget.saveSettings()
Ejemplo n.º 8
0
learner = %(FullLearnerClass)s()

useDefaults = False
inF = open(inputFile,"r")
if "defaultX" in inputFile:
    useDefaults = True
    #These vars are not used at all for dafaul point. they will be just used to confirn the number of parameters to optimize
    #Vars are also used to create the intRes file as "asked by appspack"
    vars = [str(x).strip() for x in inF.readlines()][1:]
else:  
    vars = [types.FloatType(x) for x in inF.readlines()][1:]
inF.close()

# All Learner's parameters from config file
parameters = %(paramsConfigFile)s.%(learnerType)s
dataSet=dataUtilities.DataTable("%(dataset)s")
N_ATTR = len(dataSet.domain.attributes)
N_EX = len(dataSet) - floor(len(dataSet)/%(nFolds)s)

if dataSet.domain.classVar.varType == orange.VarTypes.Discrete:
    isClassifier = True 
else:
    isClassifier = False
    
#Parameter names to be optimized (sent directly or loaded ahead from input.apps) 
paramKeys = %(paramKeys)s

try:
    if paramKeys == None:
        if not os.path.isfile("%(runPath)sinput.apps"):
            if verbose > 0: print "ERROR: Cannot find the correspondence parameters between names and values! No input.apps file!"
Ejemplo n.º 9
0
        idx = idx + 1
        resDict[idx] = {"actualLabel": actualLabel, "prediction": prediction}

        #print "Break after the first example"
        #if idx == 1: break


if __name__ == "__main__":
    """
    Assumptions;
    Binary classification 

    This main will test the implemented CP methods in a 10 fold CV
    """

    data = dataUtilities.DataTable("MVpotAggrSeries2_DescPrep_Class.txt")
    descList = [
        '"HEP2C_RSV_A2_XTT;EC50 (uM);(Num)"', 'Structure', '"MV Number"'
    ]
    data = dataUtilities.attributeDeselectionData(data, descList)

    print "Please note that the class labels are not generalized and need to be checked for a new data set"
    print "Assumed to be A and N in comparision to RF predictions"
    methods = [
        "kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"
    ]  # Non-conformity score method
    methods = ["probPred"]
    cpMethod = "transductive"  # inductive or transductive

    #print "Temp position to save comp time!!"
    # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/
Ejemplo n.º 10
0
        resDict[idx] = {"actualLabel": actualLabel, "prediction": prediction}

        #print "Break after the first example"
        #if idx == 1: break


if __name__ == "__main__":
    """
    Assumptions;
    Binary classification 
    Class labels not generalized, assumed to be 'A' and 'N'

    This main will test the implemented CP methods in a 10 fold CV
    """

    data = dataUtilities.DataTable("trainData.tab")
    descList = ["SMILES", "SMILES_1"]
    data = dataUtilities.attributeDeselectionData(data, descList)

    print "Please note that the class labels are not generalized and need to be checked for a new data set"
    print "Assumed to be A and N"
    methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"]   # Non-conformity score method
    #methods = ["kNNratio"]
    cpMethod = "transductive"   # inductive or transductive

    #print "Temp position to save comp time!!"
    # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/
    #import instances
    #measure = instances.MahalanobisConstructor(data)
    measure = None
    methodIdx = 1
Ejemplo n.º 11
0
            molList.append(mol)
        else:
            print ex["Smiles"].value
            print ex["Leonumber"].value
    fps = [FingerprintMols.FingerprintMol(x) for x in molList]  # Topological
    #fps = [AllChem.GetMorganFingerprint(x, 2) for x in molList]
    #print "Length of data and fp ", len(data), len(fps)
    return fps


# Similarity threshold used downstream — TODO confirm against the truncated part of this script
THRS = 0.75

# Load a previously trained RF model and its predictor wrapper from disk
model = AZorngRF.RFread("OI_RFmodel")
predictor = AZOrangePredictor.AZOrangePredictor("OI_RFmodel")

train = dataUtilities.DataTable("BioActivityAZOdesc.txt")

# Calculate fingerprints for train and test sets
fps = getFps(train)

# NOTE(review): the successive `smiles = ...` assignments below override one
# another; only the LAST uncommented assignment is actually used.
#smiles = test[idx]["Smiles"].value
smiles = "CC(C)n1c(/C=C/[C@H](O)C[C@H](O)CC(=O)O)c(-c2ccc(F)cc2)c2ccccc21"
smiles = "Cc1cc(=Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH][nH]1"  # Train set
#smiles = "Cc1nc2c(CN3CCOCC3)cc(NC3=CC(C)NN3)nn2c1Cc1ccc(Cl)cc1F"  # From Drawing - Wrong no tautomer
smiles = "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH]n1"  #From drawing of Galilei structure
#smiles = "Cc1cc(=Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH][nH]1" # Canonicalized from drawing in Galilei
# SECURITY NOTE: the SMILES string is interpolated into a shell command; this
# is only safe because smiles is hardcoded above. Do NOT feed untrusted input
# through this path — use subprocess with an argument list instead.
cmd = "env -i HOME='$HOME' bash -l -c './cleanSmiles.sh " + '"' + smiles + '"' + "'"
print cmd
# Run the external standardizer and capture its (exit status, stdout) pair
status, cleanSmiles = commands.getstatusoutput(cmd)
print cleanSmiles
predictor.getDescriptors(cleanSmiles)
Ejemplo n.º 12
0
def RFread(dirPath,verbose = 0):
    """Read a RF model from disk and return it as a RFClassifier instance.

    dirPath -- directory holding the saved model files (trailing '/' allowed)
    verbose -- verbosity level; diagnostic messages are printed when > 0

    Returns None when required files are missing or cannot be loaded.
    """
    ##scPA
    # Normalize the path; this also removes any trailing '/'
    dirPath = os.path.realpath(str(dirPath))
    NTrainEx = 0
    basicStat = None
    # All model-related files are expected to live inside dirPath
    loadedRFclassifier = ml.CvRTrees()
    # Read the learner parameters saved along with the model (if present)
    paramsPath = os.path.join(dirPath,"parameters.pkl")
    if os.path.isfile(paramsPath):
        fileh = open(paramsPath,"r")
        try:
            parameters = pickle.load(fileh)
        finally:
            # close even if unpickling fails, so the handle never leaks
            fileh.close()
    else:
        parameters = {}
    if os.path.isfile(os.path.join(dirPath,"model.rf")):
        # New format: fixed, well-known file names
        filePath = os.path.join(dirPath,"model.rf")
        impDataPath = os.path.join(dirPath,"ImputeData.tab")
        varNamesPath = os.path.join(dirPath,"varNames.txt")
        loadedRFclassifier.load(filePath)
    else:
        # Old format: file names were derived from the model folder name, so
        # scan the folder and identify each file by suffix or content.
        files = os.listdir(dirPath)
        filePath = None
        impDataPath = None
        varNamesPath = None
        # Load the model when found
        for fileName in files:      # renamed from 'file' (shadowed builtin)
            if len(fileName) >= 9 and fileName[-9:] == "Saved.tab":
                impDataPath = os.path.join(dirPath,fileName)
            elif len(fileName) >= 12 and fileName[-12:] == "varNames.txt":
                varNamesPath = os.path.join(dirPath,fileName)
            elif filePath is None:
                # The RF model file is recognized by the
                # "opencv-ml-random-trees" tag within its first 10 lines.
                fh = open(os.path.join(dirPath,fileName),'r')
                try:
                    for i in range(10):
                        if "opencv-ml-random-trees" in fh.readline():
                            filePath = os.path.join(dirPath,fileName)
                            break
                finally:
                    fh.close()
                if filePath:
                    try:
                        loadedRFclassifier.load(filePath)
                    except Exception:
                        # Not a loadable RF file after all; keep scanning.
                        filePath = None
        if not filePath or not impDataPath or not varNamesPath:
            print "Error loading RF model: Missing files. Files found:",files
            return None
    try:
        impData = dataUtilities.DataTable(impDataPath,createNewOn=orange.Variable.MakeStatus.OK)
        classVar = impData.domain.classVar

        # Load the var names ordered the way they were used when training
        if (os.path.isfile(varNamesPath)):
            if len(impData) == 0:
                # An empty impute table signals that the model relies on
                # openCV's built-in missing value handling.
                useBuiltInMissValHandling = True
            else:
                useBuiltInMissValHandling = False
                impData = impData[0]
            varNamesFile = open(varNamesPath,"r")
            try:
                lines = varNamesFile.readlines()
            finally:
                varNamesFile.close()
            varNames = eval(lines[0].strip())
            if len(lines) >= 3:
                NTrainEx = eval(lines[1].strip())
                basicStat = eval(lines[2].strip())
            thisVer = True
        else:
            useBuiltInMissValHandling = False
            if verbose > 0: print "WARNING: The model loaded was probably saved with azorange version 0.2.1 or lower"
            varNames = [attr.name for attr in impData.domain.attributes]
            thisVer = False
    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        if verbose > 0: print "ERROR: It was not possible to load the impute data or the varNames."
        return None
    ##ecPA   also added , imputeData=impData to next call
    return RFClassifier(classifier = loadedRFclassifier, classVar = classVar, imputeData=impData, verbose = verbose, varNames = varNames, thisVer=thisVer, useBuiltInMissValHandling = useBuiltInMissValHandling, NTrainEx = NTrainEx, basicStat = basicStat, parameters = parameters)
Ejemplo n.º 13
0
    def __call__(self, trainingData, weight = None):
        """Creates an RF model from the data in trainingData.

        trainingData -- orange ExampleTable holding the training examples
        weight       -- optional weight meta id, forwarded to the base learner

        Returns an RFClassifier, or None when the data is rejected by the
        base learner, there are too few examples, or the priors are invalid.
        """
        if not AZBaseClasses.AZLearner.__call__(self,trainingData, weight):
            return None

        # Set the number of threads to be used by openCV
        cv.cvSetNumThreads(max(int(self.NumThreads),0))
        #Remove from the domain any unused values of discrete attributes including class
        trainingData = dataUtilities.getDataWithoutUnusedValues(trainingData,True)

        # Object holding the data req for predictions (model, domain, etc)
        #print time.asctime(), "=superRFmodel(trainingData.domain)"
        ##scPA
        # Remove meta attributes from training data
        #dataUtilities.rmAllMeta(trainingData)
        if len(trainingData.domain.getmetas()) == 0:
            trainData = trainingData
        else:
            trainData = dataUtilities.getCopyWithoutMeta(trainingData)
        # Impute the data and Convert the ExampleTable to CvMat
        if self.useBuiltInMissValHandling:
            #Create the imputer empty since we will not be using it
            impData = dataUtilities.DataTable(trainData.domain)
            CvMatrices = dataUtilities.ExampleTable2CvMat(trainData)
        else:
            #Create the imputer
            self.imputer = orange.ImputerConstructor_average(trainData)
            impData=self.imputer.defaults
            trainData = self.imputer(trainData)
            CvMatrices = dataUtilities.ExampleTable2CvMat(trainData)
            CvMatrices["missing_data_mask"] = None
        ##ecPA
        self.learner = ml.CvRTrees()#superRFmodel(trainData.domain)    #This call creates a scratchDir

        # Set RF model parameter values
        #  when nActVars defined as 0, use the sqrt of number of attributes so the user knows what will be used
        # This would be done in the C level if left as 0
        if self.nActVars == "0" and len(trainData.domain.attributes)>0:
            self.nActVars =  str(int(sqrt(len(trainData.domain.attributes))))
        #print time.asctime(), "=self.setParameters"
        params = self.setParameters(trainData)
        # Print values of the parameters
        if self.verbose > 0: self.printOuts(params)
        #**************************************************************************************************//
        #                      Check for irrational input arguments
        #**************************************************************************************************//
        if params.min_sample_count >= len(trainingData):
            if self.verbose > 0: print "ERROR! Invalid minSample: ",params.min_sample_count
            if self.verbose > 0: print "minSample must be smaller than the number of examples."
            if self.verbose > 0: print "The number of examples is: ",len(trainingData)
            if len(trainingData) > 10:
                if self.verbose > 0: print "minSample assigned to default value: 10"
                params.min_sample_count = 10
            else:
                if self.verbose > 0: print "Too few examples!!"
                if self.verbose > 0: print "Terminating"
                if self.verbose > 0: print "No random forest model built"
                return None
        if params.nactive_vars > len(trainingData.domain.attributes):
            if self.verbose > 0: print "ERROR! Invalid nActVars: ",params.nactive_vars
            if self.verbose > 0: print "nActVars must be smaller than or equal to the number of variables."
            if self.verbose > 0: print "The number of variables is: ", len(trainingData.domain.attributes)
            if self.verbose > 0: print "nActVars assigned to default value: sqrt(nVars)=",sqrt(len(trainingData.domain.attributes))
            # 0 tells openCV to fall back to its own default (sqrt of nVars)
            params.nactive_vars = 0;
        # Train RF model on data in openCVFile
        #print time.asctime(), "=Start Training"
        #Process the priors and Count the number of values in class var
        if  trainingData.domain.classVar.varType == orange.VarTypes.Discrete:
            cls_count = len(trainData.domain.classVar.values)
            priors = self.convertPriors(self.priors,trainingData.domain.classVar)
            if type(priors) == str: #If a string is returned, there was a failure, and it is the respective error message.
                print priors
                return None 
        else:
            cls_count = 0
            priors = None
        # Call the train method; priors are passed as a space-separated string
        self.learner.train( CvMatrices["matrix"],ml.CV_ROW_SAMPLE,CvMatrices["responses"],None,None,CvMatrices["varTypes"],CvMatrices["missing_data_mask"],params,cls_count,  priors and str(priors).replace(","," ") or None)
        if self.learner.get_var_importance():
            varImportanceList = self.learner.get_var_importance()
            varImportance = {}
            varName = []
            varImp = []
            # Map each variable name to its openCV-reported importance
            for idx,attr in enumerate(CvMatrices["varNames"]):
                varImportance[attr] = varImportanceList[idx]
            #Uncomment next lines if needed the outpuit already ordered
            #============================= begin =================================
            #    varName.append(attr)
            #    varImp.append(varImportanceList[idx])
            #Order the vars in terms of importance
            # insertion sort algorithm
            #for i in range(1, len(varImp)):
            #    save = varImp[i]
            #    saveName = varName[i]
            #    j = i
            #    while j > 0 and varImp[j - 1] < save:
            #        varImp[j] = varImp[j - 1]
            #        varName[j] = varName[j - 1]
            #        j -= 1
            #    varImp[j] = save
            #    varName[j] = saveName
            #For debug: test if assign var importance was correct
            #for attr in varImportance:
            #    if varImportance[attr] != varImp[varName.index(attr)]:
            #        print "ERROR: Variable importance of ", attr, " is not correct!"
            #OrderedVarImportance = {"VarNames":varName, "VarImportance":varImp}
            #=============================  end  =================================
        else:
            varImportance = {}
        #print time.asctime(), "=Done"
        # Save info about the variables used in the model (used by the write method)
        #attributeInfo = dataUtilities.DataTable(trainData.domain)
        # place the impute data as the first example of this data
        #attributeInfo.append(self.imputer.defaults)
        return RFClassifier(classifier = self.learner, classVar = impData.domain.classVar, imputeData=impData, verbose = self.verbose, varNames = CvMatrices["varNames"],thisVer=True,useBuiltInMissValHandling = self.useBuiltInMissValHandling, varImportance = varImportance, basicStat = self.basicStat, NTrainEx = len(trainingData), parameters = self.parameters)
Ejemplo n.º 14
0
        }

    if verbose:
        printStat(resDict, labels)

    return SVMparam, resDict


if __name__ == "__main__":
    """
    Assumptions;
    Binary classification 
    This main will test the implemented CP methods in a 10 fold CV
    """

    data = dataUtilities.DataTable('clusterTrain_bulk.txt')
    attrList = [
        '"HLM_XEN025;Mean;CLint (uL/min/mg);(Num)"', 'Structure', 'MV Number',
        "Class List"
    ]
    data = dataUtilities.attributeDeselectionData(data, attrList)

    method = "probPred"

    SVMparam = []
    resultsFile = "CPresultst.txt"
    fid = open(resultsFile, "w")
    fid.write(
        "Name\tActualLabel\tLabel1\tLabel2\tPvalue1\tPvalue2\tConf1\tConf2\tPrediction\n"
    )
    fid.close()
Ejemplo n.º 15
0
    def write(self, dirPath):
        """Save a Consensus model to disk, including the domain used.

        dirPath -- destination directory; created if missing, and any files
                   from a previously saved model in it are removed first.

        Returns True on success, False otherwise.
        """
        if not self.classVar or not self.domain or not self.varNames:
            self._setDomainAndClass()
        if not self.NTrainEx or not self.basicStat or not self.imputeData:
            self._setStatData()

        try:

            #This removes any trailing '/'
            dirPath = os.path.realpath(str(dirPath))

            dictionaryFilename = os.path.join(dirPath, 'learnerDict.pkl')
            expressionListFilename = os.path.join(dirPath,
                                                  'expressionList.pkl')
            expressionFilename = os.path.join(dirPath, 'expression.pkl')
            weightsFilename = os.path.join(dirPath, 'weights.pkl')

            # Remove files left over from a previously saved model.
            # NOTE: shell commands kept for backward compatibility; dirPath
            # is caller-supplied, not untrusted external input.
            if os.path.isdir(dirPath):
                modelFiles = glob.glob(os.path.join(dirPath, 'C*.model'))
                for Mfile in modelFiles:
                    os.system("rm -rf " + Mfile)

                os.system("rm -f " + os.path.join(dirPath, "trainDomain.tab"))
                os.system("rm -f " + os.path.join(dirPath, "learnerDict.pkl"))
                os.system("rm -f " +
                          os.path.join(dirPath, "expressionList.pkl"))
                os.system("rm -f " + os.path.join(dirPath, "expression.pkl"))
                os.system("rm -f " + os.path.join(dirPath, "weights.pkl"))

            # This assures that all related files will be inside a folder
            os.system("mkdir -p " + dirPath)

            # Save the models
            trainDomain = dataUtilities.DataTable(self.domain)

            #Save along with trainDomain file some dummy examples for compatibility
            ex = orange.Example(self.domain)
            for attr in self.domain:
                if attr.varType == orange.VarTypes.Discrete:
                    ex[attr] = attr.values[0]
                elif attr.varType == orange.VarTypes.Continuous:
                    ex[attr] = 0
                elif attr.varType == orange.VarTypes.String:
                    ex[attr] = "NA"

            trainDomain.append(ex)
            trainDomain.save(os.path.join(dirPath, "trainDomain.tab"))

            if isinstance(self.classifiers, list):
                # Plain list: save the models as C0.model, C1.model, ...
                for idx, c in enumerate(self.classifiers):
                    c.write(os.path.join(dirPath, "C" + str(idx) + ".model"))
            else:
                # Dict of named classifiers: save the models plus the
                # name -> index mapping, the expression and the weights.
                idx = 0
                dictionaryMapping = {}
                for k, c in self.classifiers.iteritems():
                    c.write(os.path.join(dirPath, "C" + str(idx) + ".model"))
                    dictionaryMapping[k] = idx
                    idx = idx + 1

                # try/finally so a failed pickle.dump cannot leak the handle
                output = open(dictionaryFilename, 'wb+')
                try:
                    pickle.dump(dictionaryMapping, output)
                finally:
                    output.close()

                # A list expression goes to its own file name; otherwise use
                # the single-expression file (same dump logic either way).
                if isinstance(self.expression, list):
                    expressionOutName = expressionListFilename
                else:
                    expressionOutName = expressionFilename
                output = open(expressionOutName, 'wb+')
                try:
                    pickle.dump(self.expression, output)
                finally:
                    output.close()

                if self.weights is not None:
                    output = open(weightsFilename, 'wb+')
                    try:
                        pickle.dump(self.weights, output)
                    finally:
                        output.close()

        except Exception:
            # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
            # are no longer swallowed.
            if self.verbose > 0:
                print "ERROR: Could not save the Consensus model to ", dirPath
            return False
        return True
Ejemplo n.º 16
0
    descFile = "descSelectionResults.txt"


    resultsFid = open(resultsFile, "w")
    resultsFid.write("Data\tTH\tTL\tFH\tFL\tCA\tMCC\n")
    resultsFid.close()
    descFid = open(descFile, "w")
    headerStr = ""
    for project in projectList:
        headerStr = headerStr + "MCC_CV_NO_"+project+"\t"+"MCC_rand_NO_"+project+"\t"+"MCC_ext"+project+"\t"
    headerStr = string.strip(headerStr)
    descFid.write("nDesc\t"+headerStr+"\tMCC_CV_AVG\tMCC_Rand_AVG\tMCC_Ext_AVG\n")
    descFid.close()
    MCCdict = {}
    for projectName in projectList:
        train = dataUtilities.DataTable("XEN025_NO_"+projectName+"Train.txt")
        randTest = dataUtilities.DataTable("XEN025_NO_"+projectName+"RandTest.txt")
        extTest = dataUtilities.DataTable("XEN025"+projectName+"Test.txt")
        resultsFid = open(resultsFile, "a")
        MCCdict[projectName] = {}
        MCCdict = Wrapper(train, randTest, extTest, resultsFid, projectName, MCCdict, descList)
        resultsFid.close()

    print MCCdict        
    descFid = open(descFile, "a")
    for nDesc in descList:
        wrtStr = ""
        descSumCV = 0
        descSumRand = 0
        descSumExt = 0
        for project in projectList:
Ejemplo n.º 17
0
def Consensusread(dirPath, verbose=0):
    """Read a Consensus model from disk and return it as a ConsensusClassifier instance.

    dirPath -- directory holding the saved Consensus model
    verbose -- verbosity level; diagnostic messages are printed when > 0

    Returns None when the model files are missing or cannot be loaded.
    """
    # Read data from disk
    #This removes any trailing '/'
    dirPath = os.path.realpath(str(dirPath))
    basicStat = None
    NTrainEx = None
    imputeData = None
    expression = None
    weights = None
    # This assures that all related files will be inside a folder
    try:
        domainFile = dataUtilities.DataTable(
            os.path.join(dirPath, "trainDomain.tab"))

        learnerFilename = os.path.join(dirPath, 'learnerDict.pkl')
        expressionListFilename = os.path.join(dirPath, 'expressionList.pkl')
        expressionFilename = os.path.join(dirPath, 'expression.pkl')
        weightsFilename = os.path.join(dirPath, 'weights.pkl')

        #Load the models
        modelFiles = glob.glob(os.path.join(dirPath, 'C*.model'))
        modelFiles.sort()
        if len(modelFiles) < 2:
            if verbose > 0: print "ERROR: Missing model files in ", dirPath
            return None
        else:

            if os.path.exists(learnerFilename):
                #
                # We have a custom expression to read: the models are keyed
                # by name through the pickled name -> index mapping.
                #
                dictionaryFile = open(learnerFilename, 'rb')
                try:
                    classifiers = pickle.load(dictionaryFile)
                finally:
                    dictionaryFile.close()

                models = []
                for mFile in modelFiles:
                    models.append(AZBaseClasses.modelRead(mFile))

                for k, v in classifiers.iteritems():
                    classifiers[k] = models[v]

                #Try to load the imputeData, basicStat and NTrainEx from a model that saved it!
                # Hoisted: the original rebuilt classifiers.itervalues().next()
                # for every single attribute lookup.
                firstModel = classifiers.itervalues().next()
                if getattr(firstModel, "basicStat", None) and not basicStat:
                    basicStat = firstModel.basicStat
                if getattr(firstModel, "NTrainEx", None) and not NTrainEx:
                    NTrainEx = firstModel.NTrainEx
                if getattr(firstModel, "imputeData", None) and not imputeData:
                    imputeData = firstModel.imputeData
                    domainFile = imputeData  #This is needed for domain compatibility between imputer and domain var

                # 'inFile' renamed from 'file' (shadowed builtin)
                if os.path.exists(expressionListFilename):
                    inFile = open(expressionListFilename)
                    try:
                        expression = pickle.load(inFile)
                    finally:
                        inFile.close()
                else:
                    inFile = open(expressionFilename)
                    try:
                        expression = pickle.load(inFile)
                    finally:
                        inFile.close()

                if os.path.exists(weightsFilename):
                    inFile = open(weightsFilename)
                    try:
                        weights = pickle.load(inFile)
                    finally:
                        inFile.close()

            else:
                #
                # Default expression to read: an anonymous ordered list.
                #
                classifiers = []
                for mFile in modelFiles:
                    classifiers.append(AZBaseClasses.modelRead(mFile))

                if not classifiers[-1]:
                    if verbose > 0:
                        print "ERROR: Could not load the model ", mFile
                    return None
                else:
                    #Try to load the imputeData, basicStat and NTrainEx from a model that saved it!
                    lastModel = classifiers[-1]
                    if getattr(lastModel, "basicStat", None) and not basicStat:
                        basicStat = lastModel.basicStat
                    if getattr(lastModel, "NTrainEx", None) and not NTrainEx:
                        NTrainEx = lastModel.NTrainEx
                    if getattr(lastModel, "imputeData", None) and not imputeData:
                        imputeData = lastModel.imputeData
                        domainFile = imputeData  #This is needed for domain compatibility between imputer and domain var

    except Exception:
        # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
        # are no longer swallowed.
        if verbose > 0:
            print "ERROR: It was not possible to load the Consensus model"
        return None
    return ConsensusClassifier(
        classifiers=classifiers,
        expression=expression,
        weights=weights,
        varNames=[attr.name for attr in domainFile.domain.attributes],
        classVar=domainFile.domain.classVar,
        verbose=verbose,
        domain=domainFile.domain,
        basicStat=basicStat,
        NTrainEx=NTrainEx,
        imputeData=imputeData)
Ejemplo n.º 18
0
                      (MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"])]
    avg3nearest = MD[0]["_train_av3nearest"]
    if avg3nearest < predictor.highConf:
        confStr = predictor.highConfString
    elif avg3nearest > predictor.lowConf:
        confStr = predictor.lowConfString
    else:
        confStr = predictor.medConfString

    return near3neighbors, confStr


if __name__ == "__main__":
    dataFile = "trainData.txt"
    testDataFile = "testData.txt"
    data = dataUtilities.DataTable(dataFile)
    testData = dataUtilities.DataTable(testDataFile)

    # This data contains SMILES and ID, which data and ex are assumed not to.
    attrList = ["SMILES", "ID"]
    data = dataUtilities.attributeDeselectionData(data, attrList)
    testData = dataUtilities.attributeDeselectionData(testData, attrList)

    # Select one ex
    selectionList = []
    for idx in range(len(testData)):
        selectionList.append(0)
    selectionList[0] = 1  # Select first ex
    ex = testData.select(selectionList)

    # One ex in exampleTable
Ejemplo n.º 19
0
    def test_BuiltIn_Impute(self):
        """Test RF BuiltIn missing values imputation
        Assure that imputation works for the rf models. Test on data with missing values
        """
        #This data is loaded here to speed up the test suite since it is too big
        contTestDataPath = os.path.join(AZOC.AZORANGEHOME,
                                        "tests/source/data/linearTest.tab")
        contTrainDataPath = os.path.join(AZOC.AZORANGEHOME,
                                         "tests/source/data/linearTrain.tab")
        contTrain = dataUtilities.DataTable(contTrainDataPath)
        contTest = dataUtilities.DataTable(contTestDataPath)

        # Pick two test examples and one attribute in each to manipulate.
        # NOTE(review): the assertion messages below mention "Desc 671" and
        # "Desc 138" while the attributes actually used are "Desc 71" and
        # "Desc 72"; the messages look stale — confirm and fix separately.
        ex1 = contTest[5]
        ex2 = contTest[2]
        AttrEx1 = "Desc 71"
        AttrEx2 = "Desc 72"
        self.assert_(ex1[AttrEx1] != "?",
                     "The var Desc 671 shouldn't be missing!")
        self.assert_(ex2[AttrEx2] != "?",
                     "The var Desc 138 shouldn't be missing!")

        imputer = orange.ImputerConstructor_average(contTrain)
        RFlearner = AZorngRF.RFLearner(NumThreads = 1, maxDepth = "20", minSample = "5", useSurrogates = "false", getVarVariance = "false", \
                                        nActVars = "0", nTrees = "100", forestAcc = "0.001", termCrit = "0",useBuiltInMissValHandling = True )
        rf = RFlearner(contTrain)

        # Prediction for data as it is
        P1 = rf(ex1)
        P2 = rf(ex2)

        # Predictions changing one continuous and one discrete variable to 0
        ex1[AttrEx1] = 0
        ex2[AttrEx2] = 0
        P1_0 = rf(ex1)
        P2_0 = rf(ex2)

        # Predictions changing the same continuous and discrete variable to it's correspondent imputation value
        #ex1["Desc 71"]=imputer.defaults["Desc 71"]
        #ex2["Desc 138"]=imputer.defaults["Desc 138"]
        #P1_imp=rf(ex1)
        #P2_imp=rf(ex2)

        # Predictions changing the same continuous and discrete variable to '?' wich means that the same imputation
        # as in the last case will have to be made inside the classifier. So, the predicted value must be the same
        ex1[AttrEx1] = "?"
        ex2[AttrEx2] = "?"
        self.assert_(ex1[AttrEx1] == "?",
                     "The var Desc 71 should be missing now!")
        self.assert_(ex2[AttrEx2] == "?",
                     "The var Desc 138 should be missing now!")
        P1Miss = rf(ex1)
        P2Miss = rf(ex2)

        # Test if the prediction made for the example with mising value is the same as the one
        # for the example which missing values were substituted using the same method as the classifier does.
        #self.assert_(P1_imp==P1Miss,"Imputation was not made correctly inside the classifier")
        #self.assert_(P2_imp==P2Miss,"Imputation was not made correctly inside the classifier")

        # Assure that if other substitutions on those variables were made, the predicted value would be different,
        # and so, this is a valid method for testing the imputation

        self.assert_(
            P1.value !=
            P2.value)  # Just to assure that we are not comaring equal examples
        self.assert_(
            P1.value != P1Miss.value,
            "The imputed 1 was the same as the original ... try other example")
        self.assert_(
            P1_0.value != P1Miss.value,
            "The imputed 1 was the same as the replaced by 0. The classifier may be replacing missing values by 0"
        )
        self.assert_(
            P2.value != P2Miss.value,
            "The missing imputed 2 was the same as the original ... try other example"
        )
        #self.assert_(P2_0.value!=P2Miss.value,"The missing imputed 2 was the same as the replaced by 0. The classifier may be replacing missing values by 0")

        self.assert_(rf.useBuiltInMissValHandling == True)
        #Test the imputer for saved models
        # Save the model to a unique scratch directory
        scratchdir = os.path.join(AZOC.SCRATCHDIR,
                                  "scratchdirTest" + str(time.time()))
        os.mkdir(scratchdir)
        modelPath = os.path.join(scratchdir, "RFModel")
        rf.write(modelPath)

        # Read in the model
        rfM = AZorngRF.RFread(modelPath)
        self.assert_(rfM.useBuiltInMissValHandling == True)
        # Predict the ex1 and ex2 which are still the examples with missing values '?'
        self.assert_(ex1[AttrEx1] == "?",
                     "Value of Var Desc 6 should be missing!")
        self.assert_(ex2[AttrEx2] == "?",
                     "Value of Var Desc 71 should be missing!")
        self.assert_(
            rfM(ex1) == P1Miss, "Imputation on loaded model is not correct")
        self.assert_(
            rfM(ex2) == P2Miss, "Imputation on loaded model is not correct")
        # Remove the scratch directory
        os.system("/bin/rm -rf " + scratchdir)
Ejemplo n.º 20
0
def rmClass(data):
    """Return a copy of *data* with the class variable stripped.

    Builds a class-less domain from the attributes only and converts the
    examples into it.
    """
    classlessDomain = orange.Domain(data.domain.attributes)
    return dataUtilities.DataTable(classlessDomain, data)
Ejemplo n.º 21
0
            exec(msg) in globals()
        ##ecPA


##############################################################################
# Test the widget, run from DOS prompt

if __name__ == "__main__":
    # Manual smoke test: start a Qt application and show the widget standalone.
    a = QApplication(sys.argv)
    ow = OWPredictions()
    a.setMainWidget(ow)  # old Qt-style API: make the widget the main window
    ow.show()

    import orngTree

    # Split iris 50/50 into a train and a test part.
    dataset = dataUtilities.DataTable('../../doc/datasets/iris.tab')
    #    dataset = dataUtilities.DataTable('../../doc/datasets/auto-mpg.tab')
    ind = orange.MakeRandomIndices2(p0=0.5)(dataset)
    data = dataset.select(ind, 0)
    test = dataset.select(ind, 1)
    # Domain built with classVar=False -> an unlabeled copy of the test set
    testnoclass = dataUtilities.DataTable(
        orange.Domain(test.domain.attributes, False), test)
    # Three learners to feed the widget with predictions to display
    tree = orngTree.TreeLearner(data)
    tree.name = "tree"
    maj = orange.MajorityLearner(data)
    maj.name = "maj"
    knn = orange.kNNLearner(data, k=10)
    knn.name = "knn"

    if 0:  # data set only
        ow.setData(test)
Ejemplo n.º 22
0
# NOTE(review): 'lines' is read from earlier in the file (raw rows of the
# original data file) -- not visible in this chunk; confirm it is defined
# upstream before this point.
dataFile = "baseDataTmp.txt"
fileH = open(dataFile, "w")
for idx, line in enumerate(lines):
    # First row (header): rename columns so the rewritten file exposes
    # "Measure"/"Activity" plus four human-readable discrete attribute names.
    if idx == 0:
        line = line.replace("[C]([C]=[C])",
                            "Measure").replace("activity", "Activity")
        line = line.replace(
            "[C](=[C][N][N])\t[C](=[C][N][O])\t[C](=[C][N][S])\t[C](=[C][O])",
            "DiscAttr1\tDiscAttr2\tAttr3\tYetOther")
    # Second row (first data row): swap the last numeric fields for discrete
    # values matching the renamed attributes above.
    if idx == 1:
        line = line.replace("C1=CC(=CC=C1[C@H]([C@@H](CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-]\t5959\t2\t0\t0\t0\t2\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0",\
                            "C1=CC(=CC=C1[C@H]([C@@H](CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-]\t5959\t2\t0\t0\t0\t2\t0.0\t0.0\t0.0\t0.0\tRed\tYES\tA\tB")
    fileH.write(line)
fileH.close()

# Load the rewritten file keeping meta attributes
data = dataUtilities.DataTable(dataFile, noMeta=False)
dataFile = "baseData.tab"

#[Br]([C])       [C](=[C])       [C](=[C][Br][I])        [C](=[C][Cl])   DiscAttr1       DiscAttr2       Attr3           YetOther
# continuous      continuous      continuous             continuous      Red Green Blue  YES NO          1 2 3 4 5       A B C 1 2
#     0                1                2                      3             4                5             6                  7

# Extend the value lists of the discrete attributes so that values not present
# in the file itself are still legal for later examples.
data.domain["DiscAttr1"].values.append("Green")
data.domain["DiscAttr1"].values.append("Blue")
data.domain["DiscAttr2"].values.append("NO")
for val in ["1", "2", "3", "4", "5"]:
    data.domain["Attr3"].values.append(val)
for val in ["A", "B", "C", "1", "2"]:
    data.domain["YetOther"].values.append(val)
data.domain["Activity"].values.append("POS")
data.domain["Activity"].values.append("NEG")
Ejemplo n.º 23
0
    print "Number of attributes ", nAttr
    print "Maximum number of desc combinations ", pow(2, nAttr)
    print "Ndesc must be lower than the max number of desc combinations"
    print NdescComb

    # Randomly sample Ndesc combinations
    attrList = getDescComb(data, nAttr, NdescComb)

    # Rank the accuracy of each descriptor by averaging the accuracy of all models including a descriptor

    # Select all descriptors above median accuracy and repeat the random sampling of desc combinations

    return attrList


if __name__ == "__main__":

    # Manual run: descriptor selection on a fixed, hand-picked attribute subset.
    dataFile = "trainDataAllEP.txt"
    data = dataUtilities.DataTable(dataFile)

    # Keep only these descriptors before running the selection
    attrList = [
        "IT03423_Seq_BF", "hERG_IW_pIC50", "IT03423_BF", "IT03423_perc101_BF",
        "Caco2_intrinsic", "ACDlogD74", "Conc_QTc", "IT03713_BF", "IT10850_BF",
        "IT22015_BF", "IT22016_BF"
    ]
    data = dataUtilities.attributeSelectionData(data, attrList)

    NdescComb = 100  # Number of desc combinations to sample in the first iteration
    attrList = descSelection(data, NdescComb)
    print attrList
Ejemplo n.º 24
0
    def createSignImg(self,smi,signature,atomColor,imgPath, endHeight = None):
        colors = []
        print "Creating signature image..."
        if not signature or not atomColor or not smi:
            print "Missing inputs:",str([smi,signature,atomColor])
            return "","",[], []
        if hasattr(self.model, "specialType") and self.model.specialType == 1:
            # Create an Orange ExampleTable with a smiles attribute
            smilesAttr = orange.EnumVariable("SMILEStoPred", values = [smi])
            myDomain = orange.Domain([smilesAttr], 0)
            smilesData = dataUtilities.DataTable(myDomain, [[smi]])
            preCalcData = None
            startHeight = 0
            dataSign,cmpdSignDict, cmpdSignList, sdfStr  = getSignatures.getSignatures(smilesData, startHeight, endHeight, preCalcData, returnAtomID=True)
            cmpdSignList = cmpdSignList[0]
            CLabDesc = []
            # create a mol file
            tmpFile = miscUtilities.generateUniqueFile(desc="NN", ext = "mol")
            file= open(tmpFile,"w")
            molStr=""
            for line in sdfStr[0]:
                if "$$$$" in line:
                    break
                molStr += line
                file.write(line)
            file.close()
        else: 
            CLabDesc,cmpdSignList, tmpFile, molStr  =  self.getClabDescSignList(smi, getMolFile=True)
        if not cmpdSignList or not tmpFile:
            print "Couldn't get the cmpd list or the mol file"
            return "","",[], []
        # create an RDKit mol
        mol = Chem.MolFromMolFile(tmpFile,True,False)
        if not mol:
            mol = Chem.MolFromMolFile(tmpFile,False,False)
        if not mol:
            print "Could not create mol for: ",smi
            return "","",[], []
        adj = GetAdjacencyMatrix(mol)
        # find the NN
        hights = []
        for i in miscUtilities.Range(0,len(cmpdSignList),mol.GetNumAtoms()):
            hList = cmpdSignList[i:i+mol.GetNumAtoms()]
            if len(hList):
                hights.append(cmpdSignList[i:i+mol.GetNumAtoms()])
       
        atoms = []
        hight = None
        for idx,h in enumerate(hights):
            if signature in h:
                for i,a in enumerate(h):
                    if a == signature:
                        atoms.append(i)
                hight = idx
                break
        if len(atoms) == 0:
            print "ERROR: Could not find the atom for ",signature
            return "signatureNOTfound","",[],[]
        #print "IniAtoms: ",atoms
        visitedAtoms = []
        for n in range(hight):
          for atom in copy.deepcopy(atoms):
             if atom not in visitedAtoms:    
                lNN = findNeighbors(atom,adj)
                visitedAtoms.append(atom)
                for lnn in lNN:
                    if lnn not in atoms: 
                        atoms.append(lnn)
        atoms.sort()
        os.system("rm " + tmpFile)
        #Specify the atom colors
        colors=[atomColor]*len(atoms)

        if not imgPath:
            return "",molStr,atoms,colors 
        try:
                #Draw the image
                MolDrawing.elemDict=defaultdict(lambda : (0,0,0))
                Draw.MolToImageFile(mol,imgPath,size=(300, 300), kekulize=True, wedgeBonds=True, highlightAtoms=atoms)
                #Color the Highlighted atoms with the choosen atomColor.
                # Only using one color
                if atomColor == 'r':
                    rgb = (255,0,0)
                elif atomColor == 'g':
                    rgb = (0,255,0)
                else:
                    rgb = (0,0,255)    #Blue
                    
                img = Image.open(imgPath)
                img = img.convert("RGBA")
                pixdata = img.getdata()
                newData = list()
                for item in pixdata:
                  if item[0] == 255 and item[1] == 0 and item[2] == 0:
                    newData.append(rgb + (255,) )
                  else:
                    newData.append(item)
                img.putdata(newData)
                img.save(imgPath)

                if os.path.isfile(imgPath):
                    return imgPath,molStr,atoms,colors
                else:
                    return "",molStr,atoms,colors
        except:
                return "",molStr,atoms,colors
Ejemplo n.º 25
0
import os
from AZutilities import dataUtilities
from AZutilities import getCinfonyDesc
import Orange
import orange

# Input data set; the output file gets an "RDKbulk" tag inserted just before
# the extension.
#fileName = "XEN025dragonNewHeaderResp.txt"
fileName = "LiuJCIM2015dragonNewHeaderResp.txt"
base, extension = os.path.splitext(fileName)
outFileName = "%sRDKbulk%s" % (base, extension)

# Compute all available rdkPhysChem descriptors and save the enriched table.
data = dataUtilities.DataTable(fileName)
descList = getCinfonyDesc.getAvailableDescs("rdkPhysChem")
newData = getCinfonyDesc.getRdkDescResult(data, descList)
#descList = getCinfonyDesc.getAvailableDescs("rdk")
#newData = getCinfonyDesc.getRdkDescResult(data, descList, radius = 3)
newData.save(outFileName)
Ejemplo n.º 26
0
 def getSmilesData(self, smiles):
     """Store a one-row ExampleTable holding *smiles* in self.smilesData."""
     # Class-less domain (classVar index 0) with a single string attribute
     smilesVar = orange.StringVariable("SMILEStoPred")
     domain = orange.Domain([smilesVar], 0)
     self.smilesData = dataUtilities.DataTable(domain, [[smiles]])
Ejemplo n.º 27
0
                                  resultsFile)
        idx = idx + 1
        resDict[idx] = {"actualLabel": actualLabel, "prediction": prediction}

        #print "Break after the first example"
        #if idx == 1: break


if __name__ == "__main__":
    """
    Assumptions:
    Binary classification.
    This main will test the implemented CP methods in a 10 fold CV.
    """

    # Load the data set and drop identifier/meta columns that must not be
    # used as descriptors.
    data = dataUtilities.DataTable("HLMSeries2_rdkPhysChemPrepClass.txt")
    attrList = [
        '"Medivir;HLM (XEN025);CLint (uL/min/mg);(Num)"', 'Structure',
        '"MV Number"', "rdk.MolecularFormula"
    ]
    data = dataUtilities.attributeDeselectionData(data, attrList)

    # Use every remaining attribute as the single descriptor list
    print "Select all attributes"
    descListList = [[]]
    for attr in data.domain.attributes:
        descListList[0].append(attr.name)

    #methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"]   # Non-conformity score method
    methods = ["probPred"]
    cpMethod = "transductive"  # inductive or transductive
Ejemplo n.º 28
0
    def getDescriptors(self, smiles):
        """Compute all descriptors required by self.model for *smiles*.

        Loads the SMILES into self.smilesData (via getSmilesData) and then
        appends, in turn, signature, C-Lab, Cinfony and BBRC descriptors,
        depending on what self.model.varNames requires and on which
        descriptor methods are listed in DescMethodsAvailable.  Retries up
        to 3 times when any compound ends up with only missing values.
        """
        self.getSmilesData(smiles)

        # Calculate descriptors defined in the model files
        descList = self.model.varNames

        # Keep a pristine copy of the input so a failed attempt can start over
        savedSmilesData = dataUtilities.DataTable(self.smilesData)

        #Try 3 time to get All compounds descriptors
        nTry = 3
        errorDesc = ""
        while nTry > 0:
           try:
           #if True:
                traceLog = "Model Location:"+str(self.modelLocation)+"\n"
                nBadEx = 0
                # Determine Signature and non-Signature descriptor names
                cinfonyDesc, clabDesc, signatureHeight, bbrcDesc, signDesc = descUtilities.getDescTypes(descList)
                # Signatures
                if "sign" in DescMethodsAvailable and signatureHeight:
                    traceLog += "Calculating signatures...\n"
                    print "Calculating signatures...."
                    preCalcData = dataUtilities.DataTable(self.preDefSignatureFile)
                    startHeight = 0                # Not used desc ignored in model prediction
                    endHeight = signatureHeight
                    self.smilesData  = getSignatures.getSignatures(self.smilesData, startHeight, endHeight, preCalcData)

                # C-Lab desc
                if "clab" in DescMethodsAvailable and clabDesc:
                    traceLog += "Calculating C-Lab...\n"
                    print "Calculating C-Lab desc...."
                    self.smilesData = ClabUtilities.appendCLabDesc(clabDesc, self.smilesData)

                # Cinfony
                if cinfonyDesc:
                    traceLog += "Calculating Cinfony...\n"
                    print "Calculating Cinfony desc..."
                    self.smilesData = getCinfonyDesc.getCinfonyDescResults(self.smilesData, cinfonyDesc, radius = 5)

                # bbrcDesc
                if "bbrc" in DescMethodsAvailable and bbrcDesc:
                    traceLog += "Calculating BBRC...\n"
                    print "Calculating BBRC desc..."
                    self.smilesData = getBBRCDesc.getBBRCDescResult(self.smilesData, algo = "FTM", minSupPar = 1, descList = bbrcDesc)

                # Detect if the descripts calaculation or something else went wrong!
                # An example counts as bad when ALL of its attributes are missing.
                for ex in self.smilesData:
                   if sum([ex[attr].isSpecial() for attr in self.smilesData.domain.attributes]) == len(self.smilesData.domain.attributes):
                        nBadEx +=1
                if nBadEx:
                    traceLog += "WARNING: Desc. Calculation: From the "+str(len(self.smilesData))+" compounds, "+str(nBadEx)+" could not be calculated!\n"
                    print "WARNING: Desc. Calculation: From the "+str(len(self.smilesData))+" compounds, "+str(nBadEx)+" could not be calculated!"
                    print "WARNING:   Tying again..."
                    # Restore the untouched input data and retry
                    self.smilesData = dataUtilities.DataTable(savedSmilesData)
                    nTry -= 1
                else:
                    nTry = 0
           #else:
           except Exception, e:
                errorDesc = "Error Calculating Descriptors:;"+traceLog+str(e)+";"
                nTry -= 1
Ejemplo n.º 29
0
            print "pred, prob, actual, correct  ", pred, prob, actual, correct
            fid = open("predictions_"+label+".txt", "a")
            fid.write(pred+"\t"+str(prob)+"\t"+actual+"\t"+str(correct)+"\n")
            fid.close()
        else:
            outAD = outAD + 1
    
    print CM
    MCC = round(evalUtilities.calcMCC(CM),3)
    print "MCC of test set ", MCC
    print "Fraction of outAD in test set ", float(outAD)/len(test)
    return MCC, float(outAD)/len(test)
    


# Manual test script: compare a random in-distribution test set against an
# external (non-IID) test set.
data = dataUtilities.DataTable("IIDsetAZOdesc.txt")
# Partition the data set into a test and a train set
# (p0=0.10: presumably ~10% of rows get index 0 -> randTest; verify against
# Orange SubsetIndices2 semantics)
indices2 = Orange.data.sample.SubsetIndices2(p0=0.10)
ind = indices2(data)
train = data.select(ind, 1)
randTest = data.select(ind, 0)

extTest = dataUtilities.DataTable("nonIIDtestAZOdesc.txt")
print "Train set ", len(train)
print "randTest set ", len(randTest)
print "extTest set ", len(extTest)

# Calculate fingerprints for train and test sets
fps = getFps(train)
fpsRandTest = getFps(randTest)
fpsExtTest = getFps(extTest)
Ejemplo n.º 30
0
            miscUtilities.removeDir(scratchdir)
        else:
            self.setErrors(
                "The directory " + str(scratchdir) +
                " was not deleted because verbose flag is ON", "DEBUG")


class ProgressBar:
    """Adapter mapping a fixed number of iterations onto an Orange widget's
    0-100 progress bar."""

    def __init__(self, widget, iterations):
        self.iter = iterations   # total number of steps expected
        self.widget = widget
        self.count = 0           # steps completed so far
        self.widget.progressBarInit()

    def advance(self):
        """Register one completed step and push the new percentage."""
        self.count = self.count + 1
        percentDone = int(self.count * 100 / self.iter)
        self.widget.progressBarSet(percentDone)

    def finish(self):
        """Signal the widget that the task is complete."""
        self.widget.progressBarFinished()


if __name__ == "__main__":
    # Manual smoke test: show the parameter-optimization widget on iris.
    appl = QApplication(sys.argv)
    ow = OWParamOpt()
    appl.setMainWidget(ow)  # old Qt-style API: make the widget the main window
    ow.show()
    dataset = dataUtilities.DataTable('iris.tab')
    ow.data(dataset)  # feed the widget through its data-input handler
    appl.exec_loop()