def optimizeParameters(self):
    """ Sets up the input learner with tuned parameters.

    Runs the external parameter optimizer (serial, local MPI or SGE,
    depending on self.execEnv) over the connected dataset, then sends the
    tuned learner on "Learner - Tuned" and the optimization log on
    "Examples - Optimization Steps".  On any failure both channels get None.
    """
    # Reset previous results/errors before a new optimization run
    self.clearErrors()
    self.tunedPars = None
    if hasattr(self.learner, "optimized"):
        self.learner.optimized = False
    # No learner connected: clear both output channels and bail out
    if not self.learner:
        self.send("Learner - Tuned", None)
        self.send("Examples - Optimization Steps", None)
        self.updateInfo()
        return
    # Apply the parameters var with values on configuration table of GUI (user could have changed them!)
    if not self.updateParametersFromTable():
        return
    # No dataset connected: clear both output channels and bail out
    if not self.dataset:
        self.dataset = None
        self.send("Learner - Tuned", None)
        self.send("Examples - Optimization Steps", None)
        self.updateInfo()
        return
    # Progress Bar 1: setup phase (3 steps)
    optSteps = 3
    progress1 = QProgressDialog(
        "Gathering data and configuring the optimizer...", "Cancel", 0,
        optSteps, self, Qt.Dialog)  #, "progress", True )
    progress1.setWindowModality(Qt.WindowModal)
    bar1 = QProgressBar(progress1)
    bar1.show()
    progress1.setBar(bar1)
    #progress1.setTotalSteps(optSteps)
    progress1.setMinimumDuration(0)
    progress1.forceShow()
    progress1.setValue(0)
    time.sleep(0.1)  # give the dialog a moment to paint
    progress1.setValue(0)
    # Create path for running the optimizer
    randNr = random.randint(0, 10000)  # NOTE(review): randNr is never used below — looks legacy
    if self.execEnv == 0:
        scratchdir = miscUtilities.createScratchDir(
            desc="OWParamOpt_Serial")
    else:
        # Parallel runs need a scratch dir on the shared NFS area
        scratchdir = miscUtilities.createScratchDir(
            desc="OWParamOpt_MPI", baseDir=AZOC.NFS_SCRATCHDIR)
    # Save the dataset to the optimizer running path
    OrngFile = os.path.join(scratchdir, "OrngData.tab")
    orange.saveTabDelimited(OrngFile, self.dataset)
    # Advance Progress Bar
    progress1.setValue(1)
    # Define the evaluation method to use: regression vs classification.
    # Index [1] of the method tuple is the evaluation callable, [2] the
    # find-minimum flag.
    if self.dataset.domain.classVar.varType == orange.VarTypes.Continuous:
        fMin = self.RMethods[self.RMethod][2]
        evalM = self.RMethods[self.RMethod][1]
    else:
        fMin = self.CMethods[self.CMethod][2]
        evalM = self.CMethods[self.CMethod][1]
    try:
        # Write the parameter-space config module the optimizer will import
        if os.path.exists(
                os.path.join(scratchdir, "AZLearnersParamsConfig.py")):
            os.system(
                "rm " +
                str(os.path.join(scratchdir, "AZLearnersParamsConfig.py")))
        paramFile = file(
            os.path.join(scratchdir, "AZLearnersParamsConfig.py"), "w")
        paramFile.write(self.learnerType + "= " + str(self.parameters) + "\r\n")
        paramFile.close()
        progress1.setValue(2)
        # Run the optimizer which will configure the input learner and aditionaly return [<minimum of objective function found>, <optimized parameters>]
        # Serial
        print "ENV:", self.execEnv
        if self.execEnv == 0:
            print "Executing the optimizer in serial mode on local machine"
            optPID = self.optimizer(
                learner=self.learner,
                dataSet=OrngFile,
                evaluateMethod=evalM,
                findMin=fMin,
                nFolds=self.nFolds,
                samplingMethod=self.SMethods[self.SMethod][1],
                runPath=scratchdir,
                verbose=self.verbose,
                externalControl=1,
                useParameters=self.parameters,
                useGridSearchFirst=self.UseGridSearch,
                gridSearchInnerPoints=self.nInnerPoints,
                np=None,
                machinefile=None,
                advancedMPIoptions="",
            )
        # Local mpi
        elif self.execEnv == 1:
            print "Executing the optimizer in parallel mode on local machine"
            optPID = self.optimizer(
                learner=self.learner,
                dataSet=OrngFile,
                evaluateMethod=evalM,
                findMin=fMin,
                nFolds=self.nFolds,
                samplingMethod=self.SMethods[self.SMethod][1],
                runPath=scratchdir,
                verbose=self.verbose,
                externalControl=1,
                useParameters=self.parameters,
                useGridSearchFirst=self.UseGridSearch,
                gridSearchInnerPoints=self.nInnerPoints,
                machinefile=0)
        # Sge Molndal
        elif self.execEnv == 2:
            print "Executing the optimizer in parallel mode in the batch queue on the sge"
            print "*****************runPath*****************"
            optPID = self.optimizer(
                learner=self.learner,
                dataSet=OrngFile,
                evaluateMethod=evalM,
                findMin=fMin,
                nFolds=self.nFolds,
                samplingMethod=self.SMethods[self.SMethod][1],
                runPath=scratchdir,
                verbose=self.verbose,
                externalControl=1,
                useParameters=self.parameters,
                useGridSearchFirst=self.UseGridSearch,
                gridSearchInnerPoints=self.nInnerPoints,
                np=8,
                machinefile="qsub")  #, sgeEnv = "sge_seml")
        elif self.execEnv == 3:
            print "Executing the optimizer in parallel mode in the quick queue on the sge"
            print "*****************runPath*****************"
            optPID = self.optimizer(
                learner=self.learner,
                dataSet=OrngFile,
                evaluateMethod=evalM,
                findMin=fMin,
                nFolds=self.nFolds,
                samplingMethod=self.SMethods[self.SMethod][1],
                runPath=scratchdir,
                verbose=self.verbose,
                externalControl=1,
                useParameters=self.parameters,
                useGridSearchFirst=self.UseGridSearch,
                gridSearchInnerPoints=self.nInnerPoints,
                np=8,
                machinefile="qsub",
                queueType="quick.q")  #, sgeEnv = "sge_seml")
        else:
            print "No SGE Env. selected. Nothing will happen."
    except:
        # Any failure while preparing/launching the optimizer: report it,
        # keep the scratch dir for inspection and clear both outputs.
        progress1.close()
        self.updateInfo()
        self.setErrors(
            "Some error(s) occurred during the optimization.\nCheck the " +
            str(scratchdir) +
            " and the output terminal for more information")
        self.send("Learner - Tuned", None)
        self.send("Examples - Optimization Steps", None)
        return
    progress1.setValue(3)
    # optPID must be an int (process id); anything else is an error message
    if type(optPID) != types.IntType:
        progress1.close()
        self.updateInfo()
        self.setErrors("Some error(s) occurred during optimization:\n" +
                       str(optPID))
        self.send("Learner - Tuned", None)
        self.send("Examples - Optimization Steps", None)
        return
    progress1.close()
    # Progress Bar 2: step count is a rough duration estimate scaled from
    # dataset size and number of tuned parameters (0.5 s per step below)
    optSteps = (1 + round(
        (len(self.dataset) * len(self.dataset.domain.attributes) *
         self.nParameters) / 1000)) * 8
    print "Learner optimization started at " + time.asctime()
    print "Optimization steps = ", int(
        optSteps), " (estimated to aprox. ", optSteps / 2, " seconds)"
    progress = QProgressDialog(
        "Learner optimization started at " + time.asctime() +
        " ,please wait...", "Abort Optimization", 0, optSteps, self,
        Qt.Dialog)  #, "progress", True )
    progress.setWindowModality(Qt.WindowModal)
    bar = QProgressBar(progress)
    bar.show()
    progress.setBar(bar)
    #progress.setTotalSteps(optSteps)
    progress.setMinimumDuration(0)
    stepsDone = 0
    progress.setValue(stepsDone)
    progress.forceShow()
    #Loop waiting for the optimizer to finish
    while 1:
        if stepsDone < (progress.maximum() - 1):
            progress.setValue(stepsDone)
            stepsDone += 1
            time.sleep(0.5)
        else:
            # Estimate exhausted: restart the bar and keep waiting
            bar.setTextVisible(False)
            progress.setLabelText(
                "The optimizer is taking longer than expected, please wait some more time..."
            )
            stepsDone = 0
            progress.setValue(stepsDone)
            time.sleep(0.5)
        if progress.wasCanceled():
            if not self.optimizer.stop():
                progress.setLabelText(
                    "Could not stop the optimizer! Please wait until it finish..."
                )
            else:
                self.setErrors(
                    "Learner optimization stopped by user at " +
                    time.asctime(), "WARNING")
                break
        if self.optimizer.isFinished():
            print "Learner optimization finished at " + time.asctime()
            break
    progress.setValue(progress.maximum() - 1)
    time.sleep(0.5)
    progress.setValue(progress.maximum())
    self.tunedPars = self.optimizer.tunedParameters
    if self.verbose > 0:
        if self.optimizer.usedMPI:
            print "appspack version used in fact: MPI"
        else:
            print "appspack version used in fact: SERIAL"
    # Publish results only when the optimizer produced a parameter list and
    # actually flagged the learner as optimized; otherwise clear outputs.
    if type(self.tunedPars
            ) != types.ListType or self.learner.optimized == False:
        self.send("Learner - Tuned", None)
        self.send("Examples - Optimization Steps", None)
    else:
        self.send("Learner - Tuned", self.learner)
        self.intRes = dataUtilities.DataTable(scratchdir +
                                              "/optimizationLog.txt")
        self.send("Examples - Optimization Steps", self.intRes)
    self.updateInfo()
    # Keep the scratch dir for debugging when verbose is on
    if self.verbose == 0:
        miscUtilities.removeDir(scratchdir)
    else:
        self.setErrors(
            "The directory " + str(scratchdir) +
            " was not deleted because verbose flag is ON", "DEBUG")
            # (fragment — start of enclosing method not visible) Failure
            # branch: clear the classifier output and report the error.
            self.classifier = None
            self.error(0,"ERROR: It was not possible to create a classifyer. Check any previous errors.")
            #print time.asctime(), " -> self.data = self.data"
        else:
            # No input data/learner: nothing to build
            self.classifier = None
        #print time.asctime(), " -> self.send(...)"
        # Emit the (possibly None) classifier on the "Classifier" channel
        self.send("Classifier", self.classifier)

    def pbchange(self, val):
        # Progress callback: val is a fraction; widget API expects percent.
        # assumes val is in [0, 1] — TODO confirm against the caller
        self.progressBarSet(val*100)

##############################################################################
# Test the widget, run from DOS prompt
# > python OWDataTable.py)
# Make sure that a sample data set (adult_sample.tab) is in the directory
if __name__=="__main__":
    a=QApplication(sys.argv)
    ow=OWCvRF()
    a.setMainWidget(ow)
    d = dataUtilities.DataTable('adult_sample')
    ow.setData(d)
    ow.show()
    a.exec_loop()
    ow.saveSettings()
def LLOOprob_b(idx, extTrain, measure=None):
    """ Use the fraction of kNN correctly predicted by a local model.
        Hard coded to 50 NN. Modeling method: RF.

        Builds the neighbourhood of extTrain[idx] (every example within the
        distance of its ~50th nearest neighbour), runs a leave-one-out loop
        over that neighbourhood with a local RF model, and scores each
        neighbour with alpha = 1 +/- |DFV| (wrong/right prediction).
        Returns the alpha of the probe example itself plus the mean alpha
        over the neighbourhood.

        idx      -- index of the probe example in extTrain
        extTrain -- orange ExampleTable used for neighbours and training
        measure  -- optional example distance; defaults to Euclidean
        Raises ValueError if extTrain has fewer than 2 examples.
    """
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    # Distances from the probe example to every other example
    distList = []
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            distList.append(measure(extTrain[idx], extTrain[runIdx]))
    if not distList:
        raise ValueError("LLOOprob_b needs at least 2 examples in extTrain")
    distList.sort()
    # Distance of the 50th NN; clamp for small data sets instead of the
    # original unconditional distList[50], which raised IndexError for
    # fewer than 52 examples.
    # Smaller number of NN does not work with returnDFV
    thresDist = distList[min(50, len(distList) - 1)]
    # Collect the probe example and all neighbours within thresDist
    # (the probe itself is included since its self-distance is 0)
    kNN = []
    for runIdx in range(len(extTrain)):
        dist = measure(extTrain[idx], extTrain[runIdx])
        if dist <= thresDist:
            kNN.append(extTrain[runIdx])
    kNNtrain = dataUtilities.DataTable(kNN)
    # Find the fraction of correctly predicted ex in a LOO over kNN
    alphaList = []
    alphaEx = 0
    for iidx in range(len(kNNtrain)):
        # Deselect example iidx in kNNtrain
        idxList = list(range(0, iidx))
        idxList.extend(range(iidx + 1, len(kNNtrain)))
        train = kNNtrain.get_items(idxList)
        # Get prediction and pred probability (decision function value)
        model = AZorngRF.RFLearner(train)
        predList = model(kNNtrain[iidx], returnDFV=True)
        pred = predList[0].value
        prob = predList[1]
        actual = kNNtrain[iidx].get_class().value
        dist = measure(extTrain[idx], kNNtrain[iidx])
        try:
            # alpha grows the more wrong/uncertain the local model is
            if pred != actual:
                alpha = 1.0 + abs(prob)
            else:
                alpha = 1.0 - abs(prob)
            # dist ~ 0 identifies the probe example itself; its alpha is
            # kept separately and added on top of the neighbourhood mean
            if dist < 0.001:
                alphaEx = alpha
            alphaList.append(alpha)
        except Exception:
            # e.g. non-numeric DFV from the local model; skip this neighbour
            # (was a bare except that also swallowed KeyboardInterrupt)
            pass
    # Guard the mean: the original divided unconditionally and raised
    # ZeroDivisionError when every iteration failed
    if alphaList:
        return alphaEx + sum(alphaList) / float(len(alphaList))
    return alphaEx
def sendpredictions(self):
    """Build a DataTable of the input data plus one meta column per
    predictor (and per selected class probability, for classification)
    and send it on the "Predictions" channel.  Collects any
    examplesFixedLog warnings from the predictors and shows them as
    QMessageBox popups at the end."""
    def getValue(c, ex, getProb=None):
        """ Get the predicted value of ex using classifier c and gets the probability of symbol of order getProb"""
        if getProb != None:
            theValue = c(ex, orange.GetProbabilities)
            # Some wrapped classifiers cannot produce real probabilities
            if hasattr(c, "isRealProb") and not c.isRealProb():
                self.warning(
                    0,
                    "The probabilities are not available in this particular case."
                )
                return orange.Value('?')
        else:
            theValue = c(ex)
        #print "theValue: ",theValue
        if theValue:
            if getProb != None:
                return orange.Value(theValue[getProb])
            else:
                return orange.Value(theValue)
        else:
            self.warning(
                0,
                "Some example(s) were not able to be predicted. Check the domain compatibility of train and test datasets!"
            )
            return orange.Value("?")

    self.warning(0)
    if not self.data or not self.outvar:
        self.send("Predictions", None)
        return
    messages = []
    # predictions, data set with class predictions
    classification = self.outvar.varType == orange.VarTypes.Discrete
    # create a new domain for the new data handling the predictions
    domain = orange.Domain(self.data.domain.attributes +
                           [self.data.domain.classVar])
    # Add to the predictions the original meta attributes present in Data
    domain.addmetas(self.data.domain.getmetas())
    # Create the new Data Table containing the Data and the Predictions
    predictions = dataUtilities.DataTable(domain, self.data)
    # The number of examples to be predicted
    nEx = len(self.data)
    # the number of Learners
    nL = len(self.predictors)
    # The number of calculated iteractions
    nIter = 1
    # the number of iterations Done
    iter = 0
    self.progressBarSet(0)
    self.progressBarInit()
    if self.verbose:
        for c in self.predictors.values():
            c.verbose = int(self.verbose)
    if classification:
        if len(self.selectedClasses):
            # probability columns for each selected class value, plus the
            # prediction column added by the loop after this if/else
            nIter = (nEx * nL * len(self.selectedClasses)) + (nEx * nL)
            for c in self.predictors.values():
                for i in self.selectedClasses:
                    m = orange.FloatVariable(
                        name="%s(%s)" % (c.name, str(self.outvar.values[i])))
                    domain.addmeta(orange.newmetaid(), m)
                    for ex in predictions:
                        ex[m.name] = getValue(
                            c, orange.Example(self.data.domain, ex), i)
                        self.progressBarSet((iter * 100) / nIter)
                        iter += 1
        else:
            iter = 0
            nIter = nEx * nL
        # predicted class value column for each predictor
        for c in self.predictors.values():
            if hasattr(c, 'examplesFixedLog'):
                c.examplesFixedLog = {}
            m = orange.EnumVariable(name="%s" % c.name,
                                    values=self.outvar.values)
            domain.addmeta(orange.newmetaid(), m)
            for ex in predictions:
                ex[m.name] = getValue(c, orange.Example(self.data.domain, ex))
                self.progressBarSet((iter * 100) / nIter)
                iter += 1
    else:
        # regression: one numeric prediction column per predictor
        nIter = nEx * nL
        for c in self.predictors.values():
            if hasattr(c, 'examplesFixedLog'):
                c.examplesFixedLog = {}
            m = orange.FloatVariable(name="%s" % c.name)
            domain.addmeta(orange.newmetaid(), m)
            for ex in predictions:
                ex[m.name] = getValue(c, orange.Example(self.data.domain, ex))
                self.progressBarSet((iter * 100) / nIter)
                iter += 1
    if self.verbose:
        for c in self.predictors.values():
            c.verbose = 0
    #Compute and return individual Var Importance
    iter = 0
    nIter = nEx * nL
    if self.nVarImportance > 0:
        for c in self.predictors.values():
            if hasattr(c, 'getTopImportantVars'):
                m = orange.StringVariable(name="%s" % c.name +
                                          "(Top Important Vars)")
                domain.addmeta(orange.newmetaid(), m)
                for ex in predictions:
                    topVars = c.getTopImportantVars(ex, self.nVarImportance)
                    if topVars:
                        if len(topVars) == 1:
                            topVars = str(topVars[0])
                        else:
                            topVars = str(topVars)
                        #if topVars not in m.values:
                        #    m.values.append(topVars)
                        ex[m.name] = topVars
                    else:
                        ex[m.name] = "?"
                    self.progressBarSet((iter * 100) / nIter)
                    iter += 1
    self.progressBarFinished()
    # Collect per-predictor fix-up logs into QMessageBox.warning calls;
    # they are exec'ed as source strings after the predictions are sent.
    for c in self.predictors.values():
        if hasattr(c, 'examplesFixedLog') and (
                'Missing Attributes' in c.examplesFixedLog
        ) and c.examplesFixedLog['Missing Attributes']:
            missingAttrs = ""
            for attr in c.examplesFixedLog['Missing Attributes']:
                missingAttrs += " " + attr + "\\n"
            messages.append("QMessageBox.warning( None, \"Missing Attributes\" ,"+\
                "\"The following attributes were missing in the examples to be predicted:\\n" + \
                missingAttrs + "\", QMessageBox.Ok)")
        if hasattr(c, 'examplesFixedLog') and ('Fixed Types of variables'
                                               in c.examplesFixedLog):
            if 'Vars needing type fix' in c.examplesFixedLog:
                msg = "Some variable types were fixed while predicting with " + c.name + "!\\nTypes Fixed: \\n"
                for var in c.examplesFixedLog['Vars needing type fix']:
                    msg += " " + var + ": " + str(
                        c.examplesFixedLog['Vars needing type fix']
                        [var]) + '\\n'
            else:
                msg = "Some variable types were fixed while predicting with " + c.name + "!"
            messages.append("QMessageBox.warning( None, \"Fixed Types of variables in "+\
                str(c.examplesFixedLog['Fixed Types of variables'])+ " examples\",\""+ msg+"\", QMessageBox.Ok)")
        if hasattr(c, 'examplesFixedLog') and ('Fixed Order of variables'
                                               in c.examplesFixedLog):
            messages.append("QMessageBox.warning( None, \"Fixed Order of variables in "+\
                str(c.examplesFixedLog['Fixed Order of variables'])+ " examples\",\"The order of variables in test data was not the same has in the original\\n"+\
                "training set used on "+c.name+", so they were fixed.\", QMessageBox.Ok)")
        if hasattr(c, 'examplesFixedLog') and ('Fixed Number of variables'
                                               in c.examplesFixedLog):
            messages.append("QMessageBox.warning( None, \"Fixed Number of variables in "+\
                str(c.examplesFixedLog['Fixed Number of variables'])+ " examples\",\"The Number of variables in test data were not the same has in the original\\n"+\
                "training set used on "+c.name+", so only the variables\\npresent in the training set were used .\", QMessageBox.Ok)")
    predictions.name = self.data.name
    self.send("Predictions", predictions)
    # NOTE(review): exec of dynamically-built source — safe only because the
    # strings are built locally, but fragile if attribute names contain quotes
    for msg in messages:
        exec(msg) in globals()
def setUp(self):
    """Creates the training and testing data set attributes. """
    dataDir = os.path.join(AZOC.AZORANGEHOME, "tests/source/data")
    load = dataUtilities.DataTable

    # Main discrete (iris) and continuous (regression) input data
    self.dataPathD = os.path.join(dataDir, "iris.tab")
    self.dataPathC = os.path.join(dataDir, "Reg_No_metas_Test.tab")
    self.inDataD = load(self.dataPathD)
    self.inDataC = load(self.dataPathC)

    # Full path to saved svm model; scratchdir is a module-level global
    global scratchdir
    self.modelPath = os.path.join(scratchdir, "model.svm")

    # Paths of the remaining fixture datasets
    contDataPath = os.path.join(dataDir, "Reg_No_metas_Imp_Test.tab")
    SVMregDataPath = os.path.join(dataDir, "Reg_No_metas_Train.tab")
    contTrainDataPath = os.path.join(dataDir, "Reg_No_metas_Imp_Train.tab")
    dataNoMetaTrainPath = os.path.join(dataDir, "BinClass_No_metas_Train.tab")
    missingTestDataPath = os.path.join(dataDir, "BinClass_No_metas_Train_missing.tab")
    # These 2 datasets are equal apart from the meta atribute
    dataNoMetaTestPath = os.path.join(dataDir, "BinClass_No_metas_SmallTest.tab")
    dataWMetaTestPath = os.path.join(dataDir, "BinClass_W_metas_SmallTest.tab")

    # Iris train/test split
    self.train_data = load(os.path.join(dataDir, "irisTrain.tab"))
    self.test_data = load(os.path.join(dataDir, "irisTest.tab"))

    # Load the remaining tables (missing-value table doubles as train & test)
    missingInData = load(missingTestDataPath)
    self.missingTrain = missingInData
    self.missingTest = missingInData
    self.regTrainData = load(SVMregDataPath)
    self.contTrain = load(contTrainDataPath)
    self.contTest = load(contDataPath)
    self.NoMetaTrain = load(dataNoMetaTrainPath)
    self.NoMetaTest = load(dataNoMetaTestPath)
    self.WMetaTest = load(dataWMetaTestPath)

    # Data for domain fix handling
    self.noBadDataTrain = self.NoMetaTrain
    self.noBadDataTest = self.NoMetaTest
    self.badVarTypeData = load(os.path.join(dataDir, "BinClass_BadVarType.tab"))
    self.badVarNameData = load(os.path.join(dataDir, "BinClass_BadVarName.tab"))
    self.badVarOrderData = load(os.path.join(dataDir, "BinClass_BadVarOrder.tab"))
    self.badVarCountData = load(os.path.join(dataDir, "BinClass_BadVarCount.tab"))  # One less example
def setUp(self):
    """Creates the training and testing data set attributes. """
    dataDir = os.path.join(AZOC.AZORANGEHOME, "tests/source/data")
    load = dataUtilities.DataTable

    # Classification and regression train/test splits, plus iris
    trainDataPath = os.path.join(dataDir, "BinClass_No_metas_Train.tab")
    self.trainData = load(trainDataPath)
    self.testData = load(os.path.join(dataDir, "BinClass_No_metas_Test.tab"))
    self.trainDataReg = load(os.path.join(dataDir, "Reg_No_metas_Train.tab"))
    self.testDataReg = load(os.path.join(dataDir, "Reg_No_metas_Test.tab"))
    self.irisData = load(os.path.join(dataDir, "iris.tab"))

    # Missing-value table doubles as both train and test input
    missingInData = load(os.path.join(dataDir, "BinClass_No_metas_Train_missing.tab"))
    self.missingTrain = missingInData
    self.missingTest = missingInData

    ##scPA
    # These 2 test datasets are equal apart from the meta atribute
    self.NoMetaTrain = load(trainDataPath)
    self.NoMetaTest = load(os.path.join(dataDir, "BinClass_No_metas_SmallTest.tab"))
    self.WMetaTest = load(os.path.join(dataDir, "BinClass_W_metas_SmallTest.tab"))

    # Data for domain fix handling
    self.noBadDataTrain = self.NoMetaTrain
    self.noBadDataTest = self.NoMetaTest
    self.badVarTypeData = load(os.path.join(dataDir, "BinClass_BadVarType.tab"))
    self.badVarNameData = load(os.path.join(dataDir, "BinClass_BadVarName.tab"))
    self.badVarOrderData = load(os.path.join(dataDir, "BinClass_BadVarOrder.tab"))
    self.badVarCountData = load(os.path.join(dataDir, "BinClass_BadVarCount.tab"))  # One less example
    self.RegDAttr = load(os.path.join(dataDir, "Reg_No_metas_Imp_Train.tab"))
        # (fragment — start of enclosing method not visible)
        # Re-enable the Apply button after a selection change
        self.applyButton.setEnabled(True)

    def onMetaButtonUpClick(self):
        # Move the selected meta attribute one position up
        self.moveSelection("metaAttributes", "selectedMeta", -1)

    def onMetaButtonDownClick(self):
        # Move the selected meta attribute one position down
        self.moveSelection("metaAttributes", "selectedMeta", 1)

    def onAttributesButtonUpClick(self):
        # Move the selected chosen attribute one position up
        self.moveSelection("chosenAttributes", "selectedChosen", -1)

    def onAttributesButtonDownClick(self):
        # Move the selected chosen attribute one position down
        self.moveSelection("chosenAttributes", "selectedChosen", 1)

# Stand-alone smoke test: load iris, add a "name" meta attribute and run
# the OWDataDomain widget event loop
if __name__ == "__main__":
    import sys
    data = dataUtilities.DataTable(r'..\..\doc\datasets\iris.tab')
    # add meta attribute
    data.domain.addmeta(orange.newmetaid(), orange.StringVariable("name"))
    for ex in data:
        ex["name"] = str(ex.getclass())
    a = QApplication(sys.argv)
    ow = OWDataDomain()
    a.setMainWidget(ow)
    ow.show()
    ow.onDataInput(data)
    a.exec_loop()
    ow.saveSettings()
learner = %(FullLearnerClass)s() useDefaults = False inF = open(inputFile,"r") if "defaultX" in inputFile: useDefaults = True #These vars are not used at all for dafaul point. they will be just used to confirn the number of parameters to optimize #Vars are also used to create the intRes file as "asked by appspack" vars = [str(x).strip() for x in inF.readlines()][1:] else: vars = [types.FloatType(x) for x in inF.readlines()][1:] inF.close() # All Learner's parameters from config file parameters = %(paramsConfigFile)s.%(learnerType)s dataSet=dataUtilities.DataTable("%(dataset)s") N_ATTR = len(dataSet.domain.attributes) N_EX = len(dataSet) - floor(len(dataSet)/%(nFolds)s) if dataSet.domain.classVar.varType == orange.VarTypes.Discrete: isClassifier = True else: isClassifier = False #Parameter names to be optimized (sent directly or loaded ahead from input.apps) paramKeys = %(paramKeys)s try: if paramKeys == None: if not os.path.isfile("%(runPath)sinput.apps"): if verbose > 0: print "ERROR: Cannot find the correspondence parameters between names and values! No input.apps file!"
idx = idx + 1 resDict[idx] = {"actualLabel": actualLabel, "prediction": prediction} #print "Break after the first example" #if idx == 1: break if __name__ == "__main__": """ Assumptions; Binary classification This main will test the implemented CP methods in a 10 fold CV """ data = dataUtilities.DataTable("MVpotAggrSeries2_DescPrep_Class.txt") descList = [ '"HEP2C_RSV_A2_XTT;EC50 (uM);(Num)"', 'Structure', '"MV Number"' ] data = dataUtilities.attributeDeselectionData(data, descList) print "Please note that the class labels are not generalized and need to be checked for a new data set" print "Assumed to be A and N in comparision to RF predictions" methods = [ "kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob" ] # Non-conformity score method methods = ["probPred"] cpMethod = "transductive" # inductive or transductive #print "Temp position to save comp time!!" # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/
resDict[idx] = {"actualLabel": actualLabel, "prediction": prediction} #print "Break after the first example" #if idx == 1: break if __name__ == "__main__": """ Assumptions; Binary classification Class labels not generalized, assumed to be 'A' and 'N' This main will test the implemented CP methods in a 10 fold CV """ data = dataUtilities.DataTable("trainData.tab") descList = ["SMILES", "SMILES_1"] data = dataUtilities.attributeDeselectionData(data, descList) print "Please note that the class labels are not generalized and need to be checked for a new data set" print "Assumed to be A and N" methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"] # Non-conformity score method #methods = ["kNNratio"] cpMethod = "transductive" # inductive or transductive #print "Temp position to save comp time!!" # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/ #import instances #measure = instances.MahalanobisConstructor(data) measure = None methodIdx = 1
molList.append(mol) else: print ex["Smiles"].value print ex["Leonumber"].value fps = [FingerprintMols.FingerprintMol(x) for x in molList] # Topological #fps = [AllChem.GetMorganFingerprint(x, 2) for x in molList] #print "Length of data and fp ", len(data), len(fps) return fps THRS = 0.75 model = AZorngRF.RFread("OI_RFmodel") predictor = AZOrangePredictor.AZOrangePredictor("OI_RFmodel") train = dataUtilities.DataTable("BioActivityAZOdesc.txt") # Calculate fingerprints for train and test sets fps = getFps(train) #smiles = test[idx]["Smiles"].value smiles = "CC(C)n1c(/C=C/[C@H](O)C[C@H](O)CC(=O)O)c(-c2ccc(F)cc2)c2ccccc21" smiles = "Cc1cc(=Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH][nH]1" # Train set #smiles = "Cc1nc2c(CN3CCOCC3)cc(NC3=CC(C)NN3)nn2c1Cc1ccc(Cl)cc1F" # From Drawing - Wrong no tautomer smiles = "Cc1cc(Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH]n1" #From drawing of Galilei structure #smiles = "Cc1cc(=Nc2cc(CN3CCOCC3)c3nc(C)c(Cc4ccc(Cl)cc4F)n3n2)[nH][nH]1" # Canonicalized from drawing in Galilei cmd = "env -i HOME='$HOME' bash -l -c './cleanSmiles.sh " + '"' + smiles + '"' + "'" print cmd status, cleanSmiles = commands.getstatusoutput(cmd) print cleanSmiles predictor.getDescriptors(cleanSmiles)
def RFread(dirPath,verbose = 0):
    """Read a RF model from disk and return as a RFClassifier instance.

    Supports two on-disk layouts: the new format (model.rf,
    ImputeData.tab, varNames.txt inside dirPath) and the old format
    (files discovered by suffix / content sniffing).  Returns None and
    prints a message on any loading problem.
    """
    # Read data from disk
    ##scPA
    #This removes any trailing '/'
    dirPath = os.path.realpath(str(dirPath))
    NTrainEx = 0
    basicStat = None
    # This assures that all related files will be inside a folder
    loadedRFclassifier = ml.CvRTrees()
    # Read the parameters (optional pickle alongside the model)
    if os.path.isfile(os.path.join(dirPath,"parameters.pkl")):
        fileh = open(os.path.join(dirPath,"parameters.pkl"),"r")
        parameters = pickle.load(fileh)
        fileh.close()
    else:
        parameters = {}
    if os.path.isfile(os.path.join(dirPath,"model.rf")):
        #New format: fixed file names inside the model folder
        filePath = os.path.join(dirPath,"model.rf")
        impDataPath = os.path.join(dirPath,"ImputeData.tab")
        varNamesPath = os.path.join(dirPath,"varNames.txt")
        loadedRFclassifier.load(filePath)
    else:
        #Old Format
        # For RF models we assume that the model file has the same name as the model folder
        #filePath = os.path.join(dirPath,os.path.split(dirPath)[1])
        #root, ext = os.path.splitext(filePath)
        files = os.listdir(dirPath)
        filePath = None
        impDataPath = None
        varNamesPath = None
        # Load the model when found: identify each file by suffix, and the
        # model itself by sniffing its first 10 lines for the OpenCV marker
        for file in files:
            if len(file) >= 9 and file[-9:] == "Saved.tab":
                impDataPath = os.path.join(dirPath,file)
            elif len(file) >= 12 and file[-12:] == "varNames.txt":
                varNamesPath = os.path.join(dirPath,file)
            elif filePath is None:
                # looking for opencv-ml-random-trees in first 10 lines
                fh = open(os.path.join(dirPath,file),'r')
                for i in range(10):
                    if "opencv-ml-random-trees" in fh.readline():
                        filePath = os.path.join(dirPath,file)
                        break
                fh.close()
        if filePath:
            try:
                loadedRFclassifier.load(filePath)
            except:
                # candidate file was not a loadable model after all
                filePath = None
    # All three files are required; NOTE(review): 'files' is only bound in
    # the old-format branch, but the new format always sets all three paths
    # so this print is unreachable there
    if not filePath or not impDataPath or not varNamesPath:
        print "Error loading RF model: Missing files. Files found:",files
        return None
    ##scPA
    try:
        impData = dataUtilities.DataTable(impDataPath,createNewOn=orange.Variable.MakeStatus.OK)
        classVar = impData.domain.classVar
        #Load the var names oredered the way it was used when training
        if (os.path.isfile(varNamesPath)):
            # An empty impute table means the model relies on OpenCV's
            # built-in missing value handling
            if len(impData) == 0:
                useBuiltInMissValHandling = True
            else:
                useBuiltInMissValHandling = False
                impData = impData[0]
            varNamesFile = open(varNamesPath,"r")
            lines = varNamesFile.readlines()
            # line 0: var name list; lines 1-2 (newer models): NTrainEx and
            # basic statistics.  NOTE(review): eval of file content — trusted
            # model dirs only
            varNames = eval(lines[0].strip())
            if len(lines) >= 3:
                NTrainEx = eval(lines[1].strip())
                basicStat = eval(lines[2].strip())
            varNamesFile.close()
            thisVer = True
        else:
            useBuiltInMissValHandling = False
            if verbose > 0: print "WARNING: The model loaded was probably saved with azorange version 0.2.1 or lower"
            varNames = [attr.name for attr in impData.domain.attributes]
            thisVer = False
    except:
        if verbose > 0: print "ERROR: It was not possible to load the impute data or the varNames."
        return None
    ##ecPA also added , imputeData=impData to nexti call
    return RFClassifier(classifier = loadedRFclassifier, classVar = classVar, imputeData=impData, verbose = verbose, varNames = varNames, thisVer=thisVer, useBuiltInMissValHandling = useBuiltInMissValHandling, NTrainEx = NTrainEx, basicStat = basicStat, parameters = parameters)
def __call__(self, trainingData, weight = None):
    """Creates an RF model from the data in trainingData.

    Returns an RFClassifier wrapping a trained OpenCV CvRTrees model, or
    None on invalid input.  Side effects: may rewrite self.nActVars and
    sets self.imputer / self.learner.
    """
    # Base-class call performs the common input validation
    if not AZBaseClasses.AZLearner.__call__(self,trainingData, weight):
        return None
    # Set the number of threads to be used by opencv
    cv.cvSetNumThreads(max(int(self.NumThreads),0))
    #Remove from the domain any unused values of discrete attributes including class
    trainingData = dataUtilities.getDataWithoutUnusedValues(trainingData,True)
    # Object holding the data req for predictions (model, domain, etc)
    #print time.asctime(), "=superRFmodel(trainingData.domain)"
    ##scPA
    # Remove meta attributes from training data
    #dataUtilities.rmAllMeta(trainingData)
    if len(trainingData.domain.getmetas()) == 0:
        trainData = trainingData
    else:
        trainData = dataUtilities.getCopyWithoutMeta(trainingData)
    # Impute the data and Convert the ExampleTable to CvMat
    if self.useBuiltInMissValHandling:
        #Create the imputer empty since we will not be using it
        impData = dataUtilities.DataTable(trainData.domain)
        CvMatrices = dataUtilities.ExampleTable2CvMat(trainData)
    else:
        #Create the imputer
        self.imputer = orange.ImputerConstructor_average(trainData)
        impData=self.imputer.defaults
        trainData = self.imputer(trainData)
        CvMatrices = dataUtilities.ExampleTable2CvMat(trainData)
        CvMatrices["missing_data_mask"] = None
    ##ecPA
    self.learner = ml.CvRTrees()#superRFmodel(trainData.domain)  #This call creates a scratchDir
    # Set RF model parameter values
    # when nActVars defined as 0, use the sqrt of number of attributes so the user knows what will be used
    # This would be done in the C level if left as 0
    if self.nActVars == "0" and len(trainData.domain.attributes)>0:
        self.nActVars = str(int(sqrt(len(trainData.domain.attributes))))
    #print time.asctime(), "=self.setParameters"
    params = self.setParameters(trainData)
    # Print values of the parameters
    if self.verbose > 0: self.printOuts(params)
    #**************************************************************************************************//
    #                       Check for irrational input arguments
    #**************************************************************************************************//
    # min_sample_count must be below the number of examples; fall back to 10
    # when possible, otherwise refuse to build a model
    if params.min_sample_count >= len(trainingData):
        if self.verbose > 0: print "ERROR! Invalid minSample: ",params.min_sample_count
        if self.verbose > 0: print "minSample must be smaller than the number of examples."
        if self.verbose > 0: print "The number of examples is: ",len(trainingData)
        if len(trainingData) > 10:
            if self.verbose > 0: print "minSample assigned to default value: 10"
            params.min_sample_count = 10
        else:
            if self.verbose > 0: print "Too few examples!!"
            if self.verbose > 0: print "Terminating"
            if self.verbose > 0: print "No random forest model built"
            return None
    # nactive_vars cannot exceed the attribute count; 0 lets OpenCV pick
    # sqrt(nVars) internally
    if params.nactive_vars > len(trainingData.domain.attributes):
        if self.verbose > 0: print "ERROR! Invalid nActVars: ",params.nactive_vars
        if self.verbose > 0: print "nActVars must be smaller than or equal to the number of variables."
        if self.verbose > 0: print "The number of variables is: ", len(trainingData.domain.attributes)
        if self.verbose > 0: print "nActVars assigned to default value: sqrt(nVars)=",sqrt(len(trainingData.domain.attributes))
        params.nactive_vars = 0;
    # Train RF model on data in openCVFile
    #print time.asctime(), "=Start Training"
    #Process the priors and Count the number of values in class var
    if trainingData.domain.classVar.varType == orange.VarTypes.Discrete:
        cls_count = len(trainData.domain.classVar.values)
        priors = self.convertPriors(self.priors,trainingData.domain.classVar)
        if type(priors) == str: #If a string is returned, there was a failure, and it is the respective error message.
            print priors
            return None
    else:
        # regression: no class values, no priors
        cls_count = 0
        priors = None
    # Call the train method (priors are passed as a space separated string)
    self.learner.train( CvMatrices["matrix"],ml.CV_ROW_SAMPLE,CvMatrices["responses"],None,None,CvMatrices["varTypes"],CvMatrices["missing_data_mask"],params,cls_count, priors and str(priors).replace(","," ") or None)
    # Map OpenCV's per-column importance back to attribute names
    if self.learner.get_var_importance():
        varImportanceList = self.learner.get_var_importance()
        varImportance = {}
        varName = []
        varImp = []
        for idx,attr in enumerate(CvMatrices["varNames"]):
            varImportance[attr] = varImportanceList[idx]
        #Uncomment next lines if needed the output already ordered
        #============================= begin =================================
        #    varName.append(attr)
        #    varImp.append(varImportanceList[idx])
        #Order the vars in terms of importance
        # insertion sort algorithm
        #for i in range(1, len(varImp)):
        #    save = varImp[i]
        #    saveName = varName[i]
        #    j = i
        #    while j > 0 and varImp[j - 1] < save:
        #        varImp[j] = varImp[j - 1]
        #        varName[j] = varName[j - 1]
        #        j -= 1
        #    varImp[j] = save
        #    varName[j] = saveName
        #For debug: test if assign var importance was correct
        #for attr in varImportance:
        #    if varImportance[attr] != varImp[varName.index(attr)]:
        #        print "ERROR: Variable importance of ", attr, " is not correct!"
        #OrderedVarImportance = {"VarNames":varName, "VarImportance":varImp}
        #============================= end =================================
    else:
        varImportance = {}
    #print time.asctime(), "=Done"
    # Save info about the variables used in the model (used by the write method)
    #attributeInfo = dataUtilities.DataTable(trainData.domain)
    # place the impute data as the first example of this data
    #attributeInfo.append(self.imputer.defaults)
    return RFClassifier(classifier = self.learner, classVar = impData.domain.classVar, imputeData=impData, verbose = self.verbose, varNames = CvMatrices["varNames"],thisVer=True,useBuiltInMissValHandling = self.useBuiltInMissValHandling, varImportance = varImportance, basicStat = self.basicStat, NTrainEx = len(trainingData), parameters = self.parameters)
} if verbose: printStat(resDict, labels) return SVMparam, resDict if __name__ == "__main__": """ Assumptions; Binary classification This main will test the implemented CP methods in a 10 fold CV """ data = dataUtilities.DataTable('clusterTrain_bulk.txt') attrList = [ '"HLM_XEN025;Mean;CLint (uL/min/mg);(Num)"', 'Structure', 'MV Number', "Class List" ] data = dataUtilities.attributeDeselectionData(data, attrList) method = "probPred" SVMparam = [] resultsFile = "CPresultst.txt" fid = open(resultsFile, "w") fid.write( "Name\tActualLabel\tLabel1\tLabel2\tPvalue1\tPvalue2\tConf1\tConf2\tPrediction\n" ) fid.close()
def write(self, dirPath):
    """ Save a Consensus model to disk including the domain used.

    Writes into *dirPath* (created if missing):
      - trainDomain.tab            : the training domain plus one dummy example
      - C<idx>.model               : one file per member classifier
      - learnerDict.pkl            : name -> index mapping (dict-based consensus only)
      - expressionList.pkl / expression.pkl : the consensus expression(s)
      - weights.pkl                : optional classifier weights
    Returns True on success, False on any failure (when verbose, the error
    is printed first).
    """
    # Lazily derive domain/class and statistics attributes if missing.
    if not self.classVar or not self.domain or not self.varNames:
        self._setDomainAndClass()
    if not self.NTrainEx or not self.basicStat or not self.imputeData:
        self._setStatData()
    try:
        #This removes any trailing '/'
        dirPath = os.path.realpath(str(dirPath))
        dictionaryFilename = os.path.join(dirPath, 'learnerDict.pkl')
        expressionListFilename = os.path.join(dirPath, 'expressionList.pkl')
        expressionFilename = os.path.join(dirPath, 'expression.pkl')
        weightsFilename = os.path.join(dirPath, 'weights.pkl')
        # Clean any files from a previous save so stale members don't survive.
        # NOTE(review): paths go unquoted into shell commands here and below;
        # a dirPath containing spaces would break the rm/mkdir — confirm callers
        # always pass shell-safe paths.
        if os.path.isdir(dirPath):
            modelFiles = glob.glob(os.path.join(dirPath, 'C*.model'))
            for Mfile in modelFiles:
                os.system("rm -rf " + Mfile)
            os.system("rm -f " + os.path.join(dirPath, "trainDomain.tab"))
            os.system("rm -f " + os.path.join(dirPath, "learnerDict.pkl"))
            os.system("rm -f " + os.path.join(dirPath, "expressionList.pkl"))
            os.system("rm -f " + os.path.join(dirPath, "expression.pkl"))
            os.system("rm -f " + os.path.join(dirPath, "weights.pkl"))
        # This assures that all related files will be inside a folder
        os.system("mkdir -p " + dirPath)
        # Save the models
        trainDomain = dataUtilities.DataTable(self.domain)
        #Save along with trainDomain file some dummy examples for compatibility
        # (one placeholder value per attribute, chosen by variable type).
        ex = orange.Example(self.domain)
        for attr in self.domain:
            if attr.varType == orange.VarTypes.Discrete:
                ex[attr] = attr.values[0]
            elif attr.varType == orange.VarTypes.Continuous:
                ex[attr] = 0
            elif attr.varType == orange.VarTypes.String:
                ex[attr] = "NA"
        trainDomain.append(ex)
        trainDomain.save(os.path.join(dirPath, "trainDomain.tab"))
        # Member classifiers may be held as a list (positional) or as a dict
        # (named); the dict case additionally persists the name->index map.
        if type(self.classifiers).__name__ == 'list':
            for idx, c in enumerate(self.classifiers):
                c.write(os.path.join(dirPath, "C" + str(idx) + ".model"))
        else:
            idx = 0
            dictionaryMapping = {}
            for k, c in self.classifiers.iteritems():
                c.write(os.path.join(dirPath, "C" + str(idx) + ".model"))
                dictionaryMapping[k] = idx
                idx = idx + 1
            output = open(dictionaryFilename, 'wb+')
            pickle.dump(dictionaryMapping, output)
            output.close()
        # A list of expressions and a single expression are stored under
        # different file names so the reader can tell them apart.
        if type(self.expression).__name__ == 'list':
            output = open(expressionListFilename, 'wb+')
            pickle.dump(self.expression, output)
            output.close()
        else:
            output = open(expressionFilename, 'wb+')
            pickle.dump(self.expression, output)
            output.close()
        if self.weights is not None:
            output = open(weightsFilename, 'wb+')
            pickle.dump(self.weights, output)
            output.close()
    except:
        # NOTE(review): blanket except — any failure (bad path, pickling error,
        # member classifier write failure) is reported only via the return value.
        if self.verbose > 0:
            print "ERROR: Could not save the Consensus model to ", dirPath
        return False
    return True
descFile = "descSelectionResults.txt" resultsFid = open(resultsFile, "w") resultsFid.write("Data\tTH\tTL\tFH\tFL\tCA\tMCC\n") resultsFid.close() descFid = open(descFile, "w") headerStr = "" for project in projectList: headerStr = headerStr + "MCC_CV_NO_"+project+"\t"+"MCC_rand_NO_"+project+"\t"+"MCC_ext"+project+"\t" headerStr = string.strip(headerStr) descFid.write("nDesc\t"+headerStr+"\tMCC_CV_AVG\tMCC_Rand_AVG\tMCC_Ext_AVG\n") descFid.close() MCCdict = {} for projectName in projectList: train = dataUtilities.DataTable("XEN025_NO_"+projectName+"Train.txt") randTest = dataUtilities.DataTable("XEN025_NO_"+projectName+"RandTest.txt") extTest = dataUtilities.DataTable("XEN025"+projectName+"Test.txt") resultsFid = open(resultsFile, "a") MCCdict[projectName] = {} MCCdict = Wrapper(train, randTest, extTest, resultsFid, projectName, MCCdict, descList) resultsFid.close() print MCCdict descFid = open(descFile, "a") for nDesc in descList: wrtStr = "" descSumCV = 0 descSumRand = 0 descSumExt = 0 for project in projectList:
def Consensusread(dirPath, verbose=0):
    """Read a Consensus model from disk and return as a ConsensusClassifier instance.

    Expects the directory layout produced by ConsensusClassifier.write():
    trainDomain.tab, C*.model files and optional learnerDict.pkl /
    expressionList.pkl / expression.pkl / weights.pkl pickles.
    Returns the ConsensusClassifier, or None on any failure.
    """
    # Read data from disk
    #This removes any trailing '/'
    dirPath = os.path.realpath(str(dirPath))
    basicStat = None
    NTrainEx = None
    imputeData = None
    expression = None
    weights = None
    # This assures that all related files will be inside a folder
    try:
        domainFile = dataUtilities.DataTable(
            os.path.join(dirPath, "trainDomain.tab"))
        learnerFilename = os.path.join(dirPath, 'learnerDict.pkl')
        expressionListFilename = os.path.join(dirPath, 'expressionList.pkl')
        expressionFilename = os.path.join(dirPath, 'expression.pkl')
        weightsFilename = os.path.join(dirPath, 'weights.pkl')
        #Load the models; sorted so C0, C1, ... line up with the saved indices
        modelFiles = glob.glob(os.path.join(dirPath, 'C*.model'))
        modelFiles.sort()
        # A consensus needs at least two member models.
        if len(modelFiles) < 2:
            if verbose > 0:
                print "ERROR: Missing model files in ", dirPath
            return None
        else:
            if os.path.exists(learnerFilename):
                #
                # We have a custom expression to read
                #
                # learnerDict.pkl maps classifier names to model-file indices;
                # replace each index with the actual loaded model.
                dictionaryFile = open(learnerFilename, 'rb')
                classifiers = pickle.load(dictionaryFile)
                dictionaryFile.close()
                models = []
                for mFile in modelFiles:
                    models.append(AZBaseClasses.modelRead(mFile))
                for k, v in classifiers.iteritems():
                    classifiers[k] = models[v]
                #Try to load the imputeData, basicStat and NTrainEx from a model that saved it!
                # NOTE(review): only an arbitrary single member (itervalues().next())
                # is consulted — members that did not save these stats are not tried.
                if hasattr(classifiers.itervalues().next(),
                           "basicStat") and classifiers.itervalues().next(
                           ).basicStat and not basicStat:
                    basicStat = classifiers.itervalues().next().basicStat
                if hasattr(classifiers.itervalues().next(),
                           "NTrainEx") and classifiers.itervalues().next(
                           ).NTrainEx and not NTrainEx:
                    NTrainEx = classifiers.itervalues().next().NTrainEx
                if hasattr(classifiers.itervalues().next(),
                           "imputeData") and classifiers.itervalues().next(
                           ).imputeData and not imputeData:
                    imputeData = classifiers.itervalues().next().imputeData
                    domainFile = imputeData  #This is needed for domain compatibility between imputer and domain var
                # Custom consensus: expression may be a list or a single pickle.
                if os.path.exists(expressionListFilename):
                    file = open(expressionListFilename)
                    expression = pickle.load(file)
                    file.close()
                else:
                    file = open(expressionFilename)
                    expression = pickle.load(file)
                    file.close()
                if os.path.exists(weightsFilename):
                    file = open(weightsFilename)
                    weights = pickle.load(file)
                    file.close()
            else:
                #
                # Default expression to read
                #
                # No learner dictionary: members are kept as a plain list and
                # each one must load successfully.
                classifiers = []
                for mFile in modelFiles:
                    classifiers.append(AZBaseClasses.modelRead(mFile))
                    if not classifiers[-1]:
                        if verbose > 0:
                            print "ERROR: Could not load the model ", mFile
                        return None
                    else:
                        #Try to load the imputeData, basicStat and NTrainEx from a model that saved it!
                        if hasattr(
                                classifiers[-1], "basicStat"
                        ) and classifiers[-1].basicStat and not basicStat:
                            basicStat = classifiers[-1].basicStat
                        if hasattr(classifiers[-1], "NTrainEx"
                                   ) and classifiers[-1].NTrainEx and not NTrainEx:
                            NTrainEx = classifiers[-1].NTrainEx
                        if hasattr(
                                classifiers[-1], "imputeData"
                        ) and classifiers[-1].imputeData and not imputeData:
                            imputeData = classifiers[-1].imputeData
                            domainFile = imputeData  #This is needed for domain compatibility between imputer and domain var
    except:
        # NOTE(review): blanket except — any I/O, pickle or Orange error is
        # collapsed into a None return; the root cause is not reported.
        if verbose > 0:
            print "ERROR: It was not possible to load the Consensus model"
        return None
    return ConsensusClassifier(
        classifiers=classifiers,
        expression=expression,
        weights=weights,
        varNames=[attr.name for attr in domainFile.domain.attributes],
        classVar=domainFile.domain.classVar,
        verbose=verbose,
        domain=domainFile.domain,
        basicStat=basicStat,
        NTrainEx=NTrainEx,
        imputeData=imputeData)
(MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"])] avg3nearest = MD[0]["_train_av3nearest"] if avg3nearest < predictor.highConf: confStr = predictor.highConfString elif avg3nearest > predictor.lowConf: confStr = predictor.lowConfString else: confStr = predictor.medConfString return near3neighbors, confStr if __name__ == "__main__": dataFile = "trainData.txt" testDataFile = "testData.txt" data = dataUtilities.DataTable(dataFile) testData = dataUtilities.DataTable(testDataFile) # This data contains SMILES and ID, which data and ex are assumed not to. attrList = ["SMILES", "ID"] data = dataUtilities.attributeDeselectionData(data, attrList) testData = dataUtilities.attributeDeselectionData(testData, attrList) # Select one ex selectionList = [] for idx in range(len(testData)): selectionList.append(0) selectionList[0] = 1 # Select first ex ex = testData.select(selectionList) # One ex in exampleTable
def test_BuiltIn_Impute(self):
    """Test RF BuiltIn missing values imputation.

    Trains an RF with useBuiltInMissValHandling=True on continuous data,
    then checks that a prediction for an example with a missing ('?') value
    differs both from the prediction on the original example and from the
    prediction with the value replaced by 0 (i.e. missing values are really
    imputed, not zeroed), and that this behaviour survives a write/read
    round trip of the model.
    """
    # This data is loaded here to speed up the test suite since it is too big
    contTestDataPath = os.path.join(AZOC.AZORANGEHOME,
                                    "tests/source/data/linearTest.tab")
    contTrainDataPath = os.path.join(AZOC.AZORANGEHOME,
                                     "tests/source/data/linearTrain.tab")
    contTrain = dataUtilities.DataTable(contTrainDataPath)
    contTest = dataUtilities.DataTable(contTestDataPath)
    ex1 = contTest[5]
    ex2 = contTest[2]
    AttrEx1 = "Desc 71"
    AttrEx2 = "Desc 72"
    # FIX: the assertion messages previously named unrelated attributes
    # ("Desc 671", "Desc 138", "Desc 6"); they are now built from the
    # variables actually under test so failures point at the right column.
    self.assert_(ex1[AttrEx1] != "?",
                 "The var %s shouldn't be missing!" % AttrEx1)
    self.assert_(ex2[AttrEx2] != "?",
                 "The var %s shouldn't be missing!" % AttrEx2)
    # Only used by the commented-out explicit-imputation checks below.
    imputer = orange.ImputerConstructor_average(contTrain)
    RFlearner = AZorngRF.RFLearner(
        NumThreads=1, maxDepth="20", minSample="5", useSurrogates="false",
        getVarVariance="false", nActVars="0", nTrees="100",
        forestAcc="0.001", termCrit="0", useBuiltInMissValHandling=True)
    rf = RFlearner(contTrain)
    # Prediction for data as it is
    P1 = rf(ex1)
    P2 = rf(ex2)
    # Predictions changing one continuous and one discrete variable to 0
    ex1[AttrEx1] = 0
    ex2[AttrEx2] = 0
    P1_0 = rf(ex1)
    P2_0 = rf(ex2)
    # Predictions changing the same variables to their imputation value
    # (kept for reference; would allow a direct equality check):
    #ex1["Desc 71"]=imputer.defaults["Desc 71"]
    #ex2["Desc 138"]=imputer.defaults["Desc 138"]
    #P1_imp=rf(ex1)
    #P2_imp=rf(ex2)
    # Predictions changing the same variables to '?', which forces the
    # classifier to impute internally; the result must differ from both
    # the original and the zero-substituted predictions.
    ex1[AttrEx1] = "?"
    ex2[AttrEx2] = "?"
    self.assert_(ex1[AttrEx1] == "?",
                 "The var %s should be missing now!" % AttrEx1)
    self.assert_(ex2[AttrEx2] == "?",
                 "The var %s should be missing now!" % AttrEx2)
    P1Miss = rf(ex1)
    P2Miss = rf(ex2)
    # Test if the prediction made for the example with missing value is the
    # same as for the example whose value was substituted with the same
    # method the classifier uses internally.
    #self.assert_(P1_imp==P1Miss,"Imputation was not made correctly inside the classifier")
    #self.assert_(P2_imp==P2Miss,"Imputation was not made correctly inside the classifier")
    # Assure that other substitutions on those variables yield different
    # predictions, so this is a valid way of testing the imputation.
    self.assert_(P1.value != P2.value)  # Just to assure we are not comparing equal examples
    self.assert_(
        P1.value != P1Miss.value,
        "The imputed 1 was the same as the original ... try other example")
    self.assert_(
        P1_0.value != P1Miss.value,
        "The imputed 1 was the same as the replaced by 0. The classifier may be replacing missing values by 0"
    )
    self.assert_(
        P2.value != P2Miss.value,
        "The missing imputed 2 was the same as the original ... try other example"
    )
    #self.assert_(P2_0.value!=P2Miss.value,"The missing imputed 2 was the same as the replaced by 0. The classifier may be replacing missing values by 0")
    self.assert_(rf.useBuiltInMissValHandling == True)
    #Test the imputer for saved models
    # Save the model
    scratchdir = os.path.join(AZOC.SCRATCHDIR,
                              "scratchdirTest" + str(time.time()))
    os.mkdir(scratchdir)
    modelPath = os.path.join(scratchdir, "RFModel")
    rf.write(modelPath)
    # Read in the model
    rfM = AZorngRF.RFread(modelPath)
    self.assert_(rfM.useBuiltInMissValHandling == True)
    # Predict ex1 and ex2, which still hold the missing '?' values
    self.assert_(ex1[AttrEx1] == "?",
                 "Value of Var %s should be missing!" % AttrEx1)
    self.assert_(ex2[AttrEx2] == "?",
                 "Value of Var %s should be missing!" % AttrEx2)
    self.assert_(
        rfM(ex1) == P1Miss, "Imputation on loaded model is not correct")
    self.assert_(
        rfM(ex2) == P2Miss, "Imputation on loaded model is not correct")
    # Remove the scratch directory
    os.system("/bin/rm -rf " + scratchdir)
def rmClass(data):
    """Return a copy of *data* recast onto a domain without the class variable.

    The new domain keeps exactly the attributes of *data*; dropping the class
    variable is implicit in building the domain from the attribute list alone.
    """
    classlessDomain = orange.Domain(data.domain.attributes)
    return dataUtilities.DataTable(classlessDomain, data)
exec(msg) in globals() ##ecPA ############################################################################## # Test the widget, run from DOS prompt if __name__ == "__main__": a = QApplication(sys.argv) ow = OWPredictions() a.setMainWidget(ow) ow.show() import orngTree dataset = dataUtilities.DataTable('../../doc/datasets/iris.tab') # dataset = dataUtilities.DataTable('../../doc/datasets/auto-mpg.tab') ind = orange.MakeRandomIndices2(p0=0.5)(dataset) data = dataset.select(ind, 0) test = dataset.select(ind, 1) testnoclass = dataUtilities.DataTable( orange.Domain(test.domain.attributes, False), test) tree = orngTree.TreeLearner(data) tree.name = "tree" maj = orange.MajorityLearner(data) maj.name = "maj" knn = orange.kNNLearner(data, k=10) knn.name = "knn" if 0: # data set only ow.setData(test)
dataFile = "baseDataTmp.txt" fileH = open(dataFile, "w") for idx, line in enumerate(lines): if idx == 0: line = line.replace("[C]([C]=[C])", "Measure").replace("activity", "Activity") line = line.replace( "[C](=[C][N][N])\t[C](=[C][N][O])\t[C](=[C][N][S])\t[C](=[C][O])", "DiscAttr1\tDiscAttr2\tAttr3\tYetOther") if idx == 1: line = line.replace("C1=CC(=CC=C1[C@H]([C@@H](CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-]\t5959\t2\t0\t0\t0\t2\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0\t0.0",\ "C1=CC(=CC=C1[C@H]([C@@H](CO)NC(=O)C(Cl)Cl)O)[N+](=O)[O-]\t5959\t2\t0\t0\t0\t2\t0.0\t0.0\t0.0\t0.0\tRed\tYES\tA\tB") fileH.write(line) fileH.close() data = dataUtilities.DataTable(dataFile, noMeta=False) dataFile = "baseData.tab" #[Br]([C]) [C](=[C]) [C](=[C][Br][I]) [C](=[C][Cl]) DiscAttr1 DiscAttr2 Attr3 YetOther # continuous continuous continuous continuous Red Green Blue YES NO 1 2 3 4 5 A B C 1 2 # 0 1 2 3 4 5 6 7 data.domain["DiscAttr1"].values.append("Green") data.domain["DiscAttr1"].values.append("Blue") data.domain["DiscAttr2"].values.append("NO") for val in ["1", "2", "3", "4", "5"]: data.domain["Attr3"].values.append(val) for val in ["A", "B", "C", "1", "2"]: data.domain["YetOther"].values.append(val) data.domain["Activity"].values.append("POS") data.domain["Activity"].values.append("NEG")
print "Number of attributes ", nAttr print "Maximum number of desc combinations ", pow(2, nAttr) print "Ndesc must be lower than the max number of desc combinations" print NdescComb # Randomly sample Ndesc combinations attrList = getDescComb(data, nAttr, NdescComb) # Rank the accuracy of each descriptor by averaging the accuracy of all models including a descriptor # Select all descriptors above median accuracy and repeat the random sampling of desc combinations return attrList if __name__ == "__main__": dataFile = "trainDataAllEP.txt" data = dataUtilities.DataTable(dataFile) attrList = [ "IT03423_Seq_BF", "hERG_IW_pIC50", "IT03423_BF", "IT03423_perc101_BF", "Caco2_intrinsic", "ACDlogD74", "Conc_QTc", "IT03713_BF", "IT10850_BF", "IT22015_BF", "IT22016_BF" ] data = dataUtilities.attributeSelectionData(data, attrList) NdescComb = 100 # Number of desc combinations to sample in the first iteration attrList = descSelection(data, NdescComb) print attrList
def createSignImg(self,smi,signature,atomColor,imgPath, endHeight = None):
    """Create a 2D depiction of *smi* with the atoms matching *signature* highlighted.

    Returns (imgPath, molStr, atoms, colors) on success, ("", "", [], []) on
    missing inputs or failures, and ("signatureNOTfound", "", [], []) when the
    signature does not occur in the compound. *atoms* are 0-based RDKit atom
    indices; *colors* repeats *atomColor* once per highlighted atom.
    """
    colors = []
    print "Creating signature image..."
    # Guard: all three inputs are required to locate and colour the atoms.
    if not signature or not atomColor or not smi:
        print "Missing inputs:",str([smi,signature,atomColor])
        return "","",[], []
    if hasattr(self.model, "specialType") and self.model.specialType == 1:
        # Create an Orange ExampleTable with a smiles attribute
        smilesAttr = orange.EnumVariable("SMILEStoPred", values = [smi])
        myDomain = orange.Domain([smilesAttr], 0)
        smilesData = dataUtilities.DataTable(myDomain, [[smi]])
        preCalcData = None
        startHeight = 0
        # Compute signatures up to endHeight, also returning per-atom IDs and
        # an SDF block for the compound.
        dataSign,cmpdSignDict, cmpdSignList, sdfStr = getSignatures.getSignatures(smilesData, startHeight, endHeight, preCalcData, returnAtomID=True)
        cmpdSignList = cmpdSignList[0]
        CLabDesc = []
        # create a mol file
        # NOTE(review): 'file' shadows the builtin here (pre-existing style).
        tmpFile = miscUtilities.generateUniqueFile(desc="NN", ext = "mol")
        file= open(tmpFile,"w")
        molStr=""
        # Copy the first SDF record up to (excluding) the "$$$$" terminator.
        for line in sdfStr[0]:
            if "$$$$" in line:
                break
            molStr += line
            file.write(line)
        file.close()
    else:
        # Default path: C-Lab provides the signature list and the mol file.
        CLabDesc,cmpdSignList, tmpFile, molStr = self.getClabDescSignList(smi, getMolFile=True)
    if not cmpdSignList or not tmpFile:
        print "Couldn't get the cmpd list or the mol file"
        return "","",[], []
    # create an RDKit mol (first try with sanitization, then without)
    mol = Chem.MolFromMolFile(tmpFile,True,False)
    if not mol:
        mol = Chem.MolFromMolFile(tmpFile,False,False)
    if not mol:
        print "Could not create mol for: ",smi
        return "","",[], []
    adj = GetAdjacencyMatrix(mol)
    # find the NN
    # cmpdSignList holds one signature per atom per height, concatenated;
    # slice it into per-height chunks of GetNumAtoms() entries.
    # (note: 'hights'/'hight' are pre-existing misspellings of height)
    hights = []
    for i in miscUtilities.Range(0,len(cmpdSignList),mol.GetNumAtoms()):
        hList = cmpdSignList[i:i+mol.GetNumAtoms()]
        if len(hList):
            hights.append(cmpdSignList[i:i+mol.GetNumAtoms()])
    # Find the first height containing the signature and collect every atom
    # index where it occurs at that height.
    atoms = []
    hight = None
    for idx,h in enumerate(hights):
        if signature in h:
            for i,a in enumerate(h):
                if a == signature:
                    atoms.append(i)
            hight = idx
            break
    if len(atoms) == 0:
        print "ERROR: Could not find the atom for ",signature
        return "signatureNOTfound","",[],[]
    #print "IniAtoms: ",atoms
    # Grow the atom set outwards: one adjacency expansion per height level,
    # so the highlight covers the whole signature neighborhood.
    visitedAtoms = []
    for n in range(hight):
        for atom in copy.deepcopy(atoms):
            if atom not in visitedAtoms:
                lNN = findNeighbors(atom,adj)
                visitedAtoms.append(atom)
                for lnn in lNN:
                    if lnn not in atoms:
                        atoms.append(lnn)
    atoms.sort()
    os.system("rm " + tmpFile)
    #Specify the atom colors
    colors=[atomColor]*len(atoms)
    # Without an output path, return just the highlight data.
    if not imgPath:
        return "",molStr,atoms,colors
    try:
        #Draw the image
        MolDrawing.elemDict=defaultdict(lambda : (0,0,0))
        Draw.MolToImageFile(mol,imgPath,size=(300, 300), kekulize=True, wedgeBonds=True, highlightAtoms=atoms)
        #Color the Highlighted atoms with the choosen atomColor.
        # Only using one color
        if atomColor == 'r':
            rgb = (255,0,0)
        elif atomColor == 'g':
            rgb = (0,255,0)
        else:
            rgb = (0,0,255)   #Blue
        # RDKit renders highlights in red; re-map pure-red pixels to the
        # requested colour by rewriting the image pixel data.
        img = Image.open(imgPath)
        img = img.convert("RGBA")
        pixdata = img.getdata()
        newData = list()
        for item in pixdata:
            if item[0] == 255 and item[1] == 0 and item[2] == 0:
                newData.append(rgb + (255,) )
            else:
                newData.append(item)
        img.putdata(newData)
        img.save(imgPath)
        if os.path.isfile(imgPath):
            return imgPath,molStr,atoms,colors
        else:
            return "",molStr,atoms,colors
    except:
        # Drawing is best-effort: on any failure return the highlight data
        # without an image path.
        return "",molStr,atoms,colors
# Script: compute RDKit physico-chemical descriptors in bulk for a data file
# and save the augmented table next to the input (suffix "RDKbulk").
import os
from AZutilities import dataUtilities
from AZutilities import getCinfonyDesc
import Orange
import orange

#fileName = "XEN025dragonNewHeaderResp.txt"
fileName = "LiuJCIM2015dragonNewHeaderResp.txt"
# Output keeps the input's extension: <name>RDKbulk<ext>
path, ext = os.path.splitext(fileName)
outFileName = path + "RDKbulk" + ext
data = dataUtilities.DataTable(fileName)
# Use the "rdkPhysChem" descriptor family; append results to the table.
descList = getCinfonyDesc.getAvailableDescs("rdkPhysChem")
newData = getCinfonyDesc.getRdkDescResult(data, descList)
# Alternative: full "rdk" descriptor set with Morgan radius 3.
#descList = getCinfonyDesc.getAvailableDescs("rdk")
#newData = getCinfonyDesc.getRdkDescResult(data, descList, radius = 3)
newData.save(outFileName)
def getSmilesData(self, smiles):
    """Store on self.smilesData a one-row Orange table holding *smiles*.

    The table's domain has a single string attribute ("SMILEStoPred") and
    no class variable.
    """
    attr = orange.StringVariable("SMILEStoPred")
    domain = orange.Domain([attr], 0)  # 0 -> no class variable
    self.smilesData = dataUtilities.DataTable(domain, [[smiles]])
resultsFile) idx = idx + 1 resDict[idx] = {"actualLabel": actualLabel, "prediction": prediction} #print "Break after the first example" #if idx == 1: break if __name__ == "__main__": """ Assumptions; Binary classification This main will test the implemented CP methods in a 10 fold CV """ data = dataUtilities.DataTable("HLMSeries2_rdkPhysChemPrepClass.txt") attrList = [ '"Medivir;HLM (XEN025);CLint (uL/min/mg);(Num)"', 'Structure', '"MV Number"', "rdk.MolecularFormula" ] data = dataUtilities.attributeDeselectionData(data, attrList) print "Select all attributes" descListList = [[]] for attr in data.domain.attributes: descListList[0].append(attr.name) #methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"] # Non-conformity score method methods = ["probPred"] cpMethod = "transductive" # inductive or transductive
def getDescriptors(self, smiles): self.getSmilesData(smiles) # Calculate descriptors defined in the model files descList = self.model.varNames savedSmilesData = dataUtilities.DataTable(self.smilesData) #Try 3 time to get All compounds descriptors nTry = 3 errorDesc = "" while nTry > 0: try: #if True: traceLog = "Model Location:"+str(self.modelLocation)+"\n" nBadEx = 0 # Determine Signature and non-Signature descriptor names cinfonyDesc, clabDesc, signatureHeight, bbrcDesc, signDesc = descUtilities.getDescTypes(descList) # Signatures if "sign" in DescMethodsAvailable and signatureHeight: traceLog += "Calculating signatures...\n" print "Calculating signatures...." preCalcData = dataUtilities.DataTable(self.preDefSignatureFile) startHeight = 0 # Not used desc ignored in model prediction endHeight = signatureHeight self.smilesData = getSignatures.getSignatures(self.smilesData, startHeight, endHeight, preCalcData) # C-Lab desc if "clab" in DescMethodsAvailable and clabDesc: traceLog += "Calculating C-Lab...\n" print "Calculating C-Lab desc...." self.smilesData = ClabUtilities.appendCLabDesc(clabDesc, self.smilesData) # Cinfony if cinfonyDesc: traceLog += "Calculating Cinfony...\n" print "Calculating Cinfony desc..." self.smilesData = getCinfonyDesc.getCinfonyDescResults(self.smilesData, cinfonyDesc, radius = 5) # bbrcDesc if "bbrc" in DescMethodsAvailable and bbrcDesc: traceLog += "Calculating BBRC...\n" print "Calculating BBRC desc..." self.smilesData = getBBRCDesc.getBBRCDescResult(self.smilesData, algo = "FTM", minSupPar = 1, descList = bbrcDesc) # Detect if the descripts calaculation or something else went wrong! for ex in self.smilesData: if sum([ex[attr].isSpecial() for attr in self.smilesData.domain.attributes]) == len(self.smilesData.domain.attributes): nBadEx +=1 if nBadEx: traceLog += "WARNING: Desc. Calculation: From the "+str(len(self.smilesData))+" compounds, "+str(nBadEx)+" could not be calculated!\n" print "WARNING: Desc. 
Calculation: From the "+str(len(self.smilesData))+" compounds, "+str(nBadEx)+" could not be calculated!" print "WARNING: Tying again..." self.smilesData = dataUtilities.DataTable(savedSmilesData) nTry -= 1 else: nTry = 0 #else: except Exception, e: errorDesc = "Error Calculating Descriptors:;"+traceLog+str(e)+";" nTry -= 1
print "pred, prob, actual, correct ", pred, prob, actual, correct fid = open("predictions_"+label+".txt", "a") fid.write(pred+"\t"+str(prob)+"\t"+actual+"\t"+str(correct)+"\n") fid.close() else: outAD = outAD + 1 print CM MCC = round(evalUtilities.calcMCC(CM),3) print "MCC of test set ", MCC print "Fraction of outAD in test set ", float(outAD)/len(test) return MCC, float(outAD)/len(test) data = dataUtilities.DataTable("IIDsetAZOdesc.txt") # Partition the data set into a test and a train set indices2 = Orange.data.sample.SubsetIndices2(p0=0.10) ind = indices2(data) train = data.select(ind, 1) randTest = data.select(ind, 0) extTest = dataUtilities.DataTable("nonIIDtestAZOdesc.txt") print "Train set ", len(train) print "randTest set ", len(randTest) print "extTest set ", len(extTest) # Calculate fingerprints for train and test sets fps = getFps(train) fpsRandTest = getFps(randTest) fpsExtTest = getFps(extTest)
miscUtilities.removeDir(scratchdir) else: self.setErrors( "The directory " + str(scratchdir) + " was not deleted because verbose flag is ON", "DEBUG") class ProgressBar: def __init__(self, widget, iterations): self.iter = iterations self.widget = widget self.count = 0 self.widget.progressBarInit() def advance(self): self.count += 1 self.widget.progressBarSet(int(self.count * 100 / self.iter)) def finish(self): self.widget.progressBarFinished() if __name__ == "__main__": appl = QApplication(sys.argv) ow = OWParamOpt() appl.setMainWidget(ow) ow.show() dataset = dataUtilities.DataTable('iris.tab') ow.data(dataset) appl.exec_loop()