def kNNratio(idx, extTrain, measure=None):
    """ Use the fraction of kNN with the same response. """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)
    distList = []
    if not measure:
        # measure = instances.MahalanobisConstructor(extTrain)
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            distList.append(dist)
    # Get the distance of the 10th NN
    distList.sort()
    thresDist = distList[9]
    # Find the labels of the 10 NN
    sameCount = 0
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            if dist <= thresDist:
                if extTrain[idx].get_class().value == extTrain[runIdx].get_class().value:
                    sameCount = sameCount + 1
    alpha = 1.00 - float(sameCount) / 10.0
    return alpha
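# A minimal usage sketch (not part of the original module): computing the
# kNNratio nonconformity score for every example in a train set. It assumes
# an Orange ExampleTable that still carries the "SMILES_1" string attribute
# and holds at least 11 examples; the helper name is illustrative.
def getAlphaList(extTrain, ncFunc=kNNratio):
    alphaList = []
    for idx in range(len(extTrain)):
        alphaList.append(ncFunc(idx, extTrain))
    return alphaList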
def avgNN(idx, extTrain, measure=None):
    """ Use the ratio between the distance to the kNN of the same and of the other class """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)
    distListSame = []
    distListDiff = []
    # measure = Orange.distance.Euclidean(extTrain)
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            if extTrain[idx].get_class().value == extTrain[runIdx].get_class().value:
                distListSame.append(dist)
            else:
                distListDiff.append(dist)
    distListSame.sort()
    avgSame = sum(distListSame[0:10]) / 10.0
    distListDiff.sort()
    avgDiff = sum(distListDiff[0:10]) / 10.0
    if avgDiff == 0:
        alpha = max(distListDiff)
    else:
        alpha = avgSame / float(avgDiff)
    return alpha
def probPred(idx, extTrain, SVMparam):
    """ Use the RF prediction probability to set the non-conf score """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)
    # Deselect example idx in extTrain
    idxList = range(0, idx)
    idxList.extend(range(idx + 1, len(extTrain)))
    train = extTrain.get_items(idxList)
    # Train a model
    model = AZorngRF.RFLearner(train)
    #model, SVMparam = trainSVMOptParam(train, SVMparam)
    # Predict example idx
    predList = model(extTrain[idx], returnDFV=True)
    pred = predList[0].value
    prob = predList[1]
    actual = extTrain[idx].get_class().value
    #print pred, actual, prob
    # More non conforming if prediction is different from actual label
    if pred != actual:
        alpha = 1.0 + abs(prob)
    else:
        alpha = 1.0 - abs(prob)
    #print alpha
    return alpha, SVMparam
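# Illustrative sketch (not the original module's API): how nonconformity
# scores such as the alphas above are typically turned into a transductive
# conformal p-value, by ranking the test example's score among all scores.
def getPValue(testAlpha, alphaList):
    # Fraction of examples at least as nonconforming as the test example
    nGreater = len([a for a in alphaList if a >= testAlpha])
    return float(nGreater + 1) / (len(alphaList) + 1)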
def getMahalanobisResults(predictor):
    if predictor.highConf == None and predictor.lowConf == None:
        return None, None
    testData = dataUtilities.attributeDeselectionData(predictor.exToPred, ["SMILEStoPred"])
    trainData = dataUtilities.DataTable(predictor.trainDataPath)
    ExampleFix = dataUtilities.ExFix(trainData.domain, None, False)
    exFixed1 = ExampleFix.fixExample(testData[0])
    if testData.hasMissingValues():
        averageImputer = orange.ImputerConstructor_average(trainData)
        dat = averageImputer(exFixed1)
    else:
        dat = exFixed1
    tab = dataUtilities.DataTable(trainData.domain)
    tab.append(dat)
    MD = calcMahalanobis(trainData, tab)
    near3neighbours = [(MD[0]["_train_id_near1"], MD[0]["_train_SMI_near1"]),
                       (MD[0]["_train_id_near2"], MD[0]["_train_SMI_near2"]),
                       (MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"])]
    avg3nearest = MD[0]["_train_av3nearest"]
    if avg3nearest < predictor.highConf:
        confStr = predictor.highConfString
    elif avg3nearest > predictor.lowConf:
        confStr = predictor.lowConfString
    else:
        confStr = predictor.medConfString
    return near3neighbours, confStr
def getAccStat(rankSumTuple, nDesc, train, randTest, extTest, resultsFid, projectName):
    print "Select features based on top ranked features"
    attrList = []
    for elem in rankSumTuple:
        if len(attrList) < nDesc:
            attrList.append(elem[0])
    train = dataUtilities.attributeSelectionData(train, attrList)
    train = dataUtilities.attributeDeselectionData(train, ['HLM_XEN025;Mean;CLint (uL/min/mg);(Num)'])
    print train.domain.attributes, len(train.domain.attributes), train.domain.classVar
    # Get accuracies
    learners = [AZorngRF.RFLearner(nTrees=100)]
    print "CV accuracy"
    MCC_CV = printCV(train, learners, resultsFid, projectName)
    Model = learners[0](train)
    print "Random Test set accuracy"
    MCC_rand = printTestSetAcc(Model, randTest, learners, resultsFid, projectName, True)
    print "External Test set accuracy"
    MCC_ext = printTestSetAcc(Model, extTest, learners, resultsFid, projectName, False)
    return MCC_CV, MCC_rand, MCC_ext
def probPred(idx, extTrain, SVMparam):
    """ Use the SVM decision value (DFV) to set the non-conf score """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)
    # Deselect example idx in extTrain
    idxList = range(0, idx)
    idxList.extend(range(idx + 1, len(extTrain)))
    train = extTrain.get_items(idxList)
    # Train a model
    # model = AZorngRF.RFLearner(train, nActVars = 2)
    model, SVMparam = trainSVMOptParam(train, SVMparam)
    # Predict example idx
    predList = model(extTrain[idx], returnDFV=True)
    pred = predList[0].value
    prob = predList[1]
    actual = extTrain[idx].get_class().value
    # print pred, actual, prob
    # More non conforming if prediction is different from actual label
    if pred != actual:
        alpha = 1.0 + abs(prob)
    else:
        alpha = 1.0 - abs(prob)
    # print alpha
    return alpha, SVMparam
def cross_validation_plusFTM(data, learners, k, f, att_list):
    """
    Perform k-fold cross validation and add FTM features (minsup = f) in each fold.
    The FTM features for each training fold are recalculated for the test fold (NO FTM run!)
    att_list - the list of attributes that will be removed before learning
    For reference see also:
    http://orange.biolab.si/doc/ofb/accuracy5.py
    http://orange.biolab.si/doc/ofb/c_performance.htm
    """
    acc = [0.0] * len(learners)
    roc = [0.0] * len(learners)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
        # print "len->train: ",
        # print len(train_data)
        # Add FTM features
        train_data_ftm = getFTMDescResult(train_data, f)
        minsupStr = str(f).replace(".", "")
        filename = data.name + "_ftm_" + minsupStr + "_" + str(test_fold) + ".tab"
        #train_data.save(filename)
        train_scaled = dataUtilities.attributeDeselectionData(train_data_ftm, att_list)
        # Recalculate and add the FTM features to the test fold
        test_data = data.select(selection, test_fold)
        smarts = train_data_ftm.domain.attributes[len(train_data.domain.attributes):]
        print "# FTM features: ",
        print len(smarts)
        test_data_ftm = getSMARTSrecalcDesc(test_data, smarts)
        test_scaled = dataUtilities.attributeDeselectionData(test_data_ftm, att_list)
        classifiers = []
        for l in learners:
            classifiers.append(l(train_scaled))
        acc1 = accuracy(test_scaled, classifiers)
        auroc1 = aroc(test_scaled, classifiers)
        print "%d: %s" % (test_fold + 1, acc1)
        print "%d: %s" % (test_fold + 1, auroc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
            roc[j] += auroc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j] / k
        roc[j] = roc[j] / k
    return acc, roc
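# Illustrative call (the file name, minsup value and attribute list are
# placeholders, not from the original code): 10-fold CV with FTM features,
# removing the SMILES column before learning.
#   data = dataUtilities.DataTable("molecules.tab")
#   learners = [AZorngRF.RFLearner(nTrees=100)]
#   acc, roc = cross_validation_plusFTM(data, learners, 10, 0.08, ["SMILES"])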
def getBBRCDescResult(dataIN, algo="FTM", minSupPar=2, ChisqSig=None, active=None, verbose=0, descList=[]):
    """ delegate to different algorithm methods """
    if not descList:
        descList = []
    outData = None
    if active is not None:
        activeLabel = active
    else:
        if dataIN.domain.classVar:
            activeLabel = dataIN.domain.classVar.values[0]  # For BBRC the active class can be any since it will only use the "count"
        else:
            activeLabel = None
    if algo == "FTM":
        # Using BBRC without class correlation
        BBRCCalc = BBRC(verbose=verbose)
        BBRCCalc.minsup = minSupPar
        BBRCCalc.active = activeLabel
        # Disabling class correlation
        BBRCCalc.DynamicUpperBound = False
        BBRCCalc.ChisqSig = 0.0
        BBRCCalc.Backbone = False
        outData = BBRCCalc.getDesc(dataIN)
    elif algo == "BBRC":
        BBRCCalc = BBRC(verbose=verbose)
        BBRCCalc.minsup = minSupPar
        BBRCCalc.active = activeLabel
        if ChisqSig is not None:
            if ChisqSig < 0 or ChisqSig > 1:
                print "ERROR: ChisqSig must be defined between 0 and 1"
                return None
            BBRCCalc.ChisqSig = ChisqSig
        else:
            BBRCCalc.ChisqSig = 0.95
        outData = BBRCCalc.getDesc(dataIN)
    elif algo == "LAST-PM":
        outData = getFMinerDescResult(dataIN, minSupPar, algo)
    else:
        print "Algorithm " + str(algo) + " is unknown!"
    if not outData:
        return None
    newAttrs = [attr.name for attr in outData.domain if attr.name not in dataIN.domain]
    if descList:
        desAttrs = [attr for attr in newAttrs if attr not in descList]
    else:
        desAttrs = []
    print "BBRC descriptors requested: " + str(len(descList) or "ALL")
    print "BBRC descriptors returned: " + str(len(newAttrs) - len(desAttrs))
    if desAttrs:
        outData = dataUtilities.attributeDeselectionData(outData, desAttrs)
    unknownAttrs = [attr for attr in descList if attr not in outData.domain]
    print "Attributes not found among the structural descriptors: ", len(unknownAttrs), " (set to 0.0)"
    outData = dataUtilities.attributeAddData(outData, unknownAttrs, orange.FloatVariable, 0.0)
    return outData
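# Illustrative call (the file name is a placeholder): compute FTM-style BBRC
# descriptors with a minimum support of 8 and class correlation disabled.
#   data = dataUtilities.DataTable("molecules.tab")
#   ftmData = getBBRCDescResult(data, algo="FTM", minSupPar=8)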
def buildConsensus(trainData, learners, MLMethods, logFile=None):
    log(logFile, "Building a consensus model based on optimized MLmethods: " + str([ml for ml in MLMethods]) + "...")
    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
        # Expression: If CAavg_{POS} >= CAavg_{NEG} -> POS, else -> NEG
        #   where CAavg_{POS} is the average of classification accuracies of all models predicting POS.
        CLASS0 = str(trainData.domain.classVar.values[0])
        CLASS1 = str(trainData.domain.classVar.values[1])
        # exprTest0
        exprTest0 = "(0"
        for ml in MLMethods:
            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(MLMethods[ml]["optAcc"]) + " "
        exprTest0 += ")/IF0(sum([False"
        for ml in MLMethods:
            exprTest0 += ", " + ml + " == " + CLASS0 + " "
        exprTest0 += "]),1)"
        # exprTest1
        exprTest1 = "(0"
        for ml in MLMethods:
            exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str(MLMethods[ml]["optAcc"]) + " "
        exprTest1 += ")/IF0(sum([False"
        for ml in MLMethods:
            exprTest1 += ", " + ml + " == " + CLASS1 + " "
        exprTest1 += "]),1)"
        # expression
        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
    else:
        Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods])
        expression = "(1 / " + str(Q2sum) + ") * (0"
        for ml in MLMethods:
            expression += " + " + str(MLMethods[ml]["optAcc"]) + " * " + ml + " "
        expression += ")"
    consensusLearners = {}
    for learnerName in learners:
        consensusLearners[learnerName] = learners[learnerName]
    learner = AZorngConsensus.ConsensusLearner(learners=consensusLearners, expression=expression)
    log(logFile, " Training Consensus Learner")
    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        log(logFile, "Found SMILES attribute:" + smilesAttr)
        if learner.specialType == 1:
            trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
            log(logFile, "Selected attrs: " + str([attr.name for attr in trainData.domain]))
        else:
            trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
            log(logFile, "Selected attrs: " + str([attr.name for attr in trainData.domain[0:3]] + ["..."] +
                                                  [attr.name for attr in trainData.domain[len(trainData.domain) - 3:]]))
    return learner(trainData)
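# For illustration only (learner names and accuracies are made up): with
# learners RF (optAcc 0.8) and SVM (optAcc 0.7) and classes POS/NEG, the
# generated discrete expression pair is
#   ["(0+( RF == POS )*0.8 +( SVM == POS )*0.7 )/IF0(sum([False, RF == POS , SVM == POS ]),1)
#     >= (0+( RF == NEG )*0.8 +( SVM == NEG )*0.7 )/IF0(sum([False, RF == NEG , SVM == NEG ]),1) -> POS",
#    " -> NEG"]
# i.e. the class whose voting models have the larger average optimization
# accuracy wins, with IF0 guarding against division by zero when no model
# votes for a class.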
def LLOO(idx, extTrain, measure=None):
    """ Use the fraction of kNN correctly predicted by a local model.
        Hard coded to 20 NN. Modeling method: RF or Tree? """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)
    distList = []
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            distList.append(dist)
    # Get the distance of the 20th NN
    distList.sort()
    thresDist = distList[19]
    # Find the labels of the 20 NN (note: the example itself also passes this
    # filter, since its distance to itself is 0)
    kNN = []
    for runIdx in range(len(extTrain)):
        dist = measure(extTrain[idx], extTrain[runIdx])
        if dist <= thresDist:
            kNN.append(extTrain[runIdx])
    kNNtrain = dataUtilities.DataTable(kNN)
    # Find the fraction of correctly predicted ex in a LOO over kNN
    corrPred = 0
    for idx in range(len(kNNtrain)):
        # Deselect example idx in kNNtrain
        idxList = range(0, idx)
        idxList.extend(range(idx + 1, len(kNNtrain)))
        train = kNNtrain.get_items(idxList)
        # Train a model
        model = AZorngRF.RFLearner(train)
        # model = Orange.classification.tree.TreeLearner(train)
        pred = model(kNNtrain[idx]).value
        actual = kNNtrain[idx].get_class().value
        if pred == actual:
            corrPred = corrPred + 1
    alpha = 1.0 - float(corrPred) / len(kNNtrain)
    return alpha
def getMahalanobisResults(predictor, invCovMatFile=None, centerFile=None, dataTableFile=None):
    domain = None
    if predictor.highConf == None and predictor.lowConf == None:
        return None, None
    if not dataTableFile and (not hasattr(predictor, "trainDataPath") or not predictor.trainDataPath):
        print "The predictor does not have a trainDataPath specified. We need it for calculating Mahalanobis results!"
        return None, None
    testData = dataUtilities.attributeDeselectionData(predictor.exToPred, ["SMILEStoPred"])
    if not dataTableFile:
        trainData = dataUtilities.DataTable(predictor.trainDataPath)
        domain = trainData.domain
    else:
        trainData = None
        domain = predictor.model.domain
    ExampleFix = dataUtilities.ExFix(domain, None, False)
    exFixed1 = ExampleFix.fixExample(testData[0])
    if testData.hasMissingValues():
        if not trainData:
            averageImputer = orange.Imputer_defaults(predictor.model.imputeData)
        else:
            averageImputer = orange.ImputerConstructor_average(trainData)
        dat = averageImputer(exFixed1)
    else:
        dat = exFixed1
    tab = dataUtilities.DataTable(domain)
    tab.append(dat)
    MD = calcMahalanobis(trainData, tab, invCovMatFile, centerFile, dataTableFile, domain)
    near3neighbors = [(MD[0]["_train_id_near1"], MD[0]["_train_SMI_near1"]),
                      (MD[0]["_train_id_near2"], MD[0]["_train_SMI_near2"]),
                      (MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"])]
    avg3nearest = MD[0]["_train_av3nearest"]
    if avg3nearest < predictor.highConf:
        confStr = predictor.highConfString
    elif avg3nearest > predictor.lowConf:
        confStr = predictor.lowConfString
    else:
        confStr = predictor.medConfString
    return near3neighbors, confStr
def probPredInd(trainSet, calSet):
    """ Use the RF prediction probability to set the non-conf score """
    attrList = ["SMILES_1"]
    trainSet = dataUtilities.attributeDeselectionData(trainSet, attrList)
    # Train a model
    model = AZorngRF.RFLearner(trainSet)
    # Get the list of NC for all ex in calSet
    alphaList = []
    for ex in calSet:
        alpha = getProbPredAlpha(model, ex)
        alphaList.append(alpha)
    return alphaList, model
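# Minimal inductive-CP sketch (illustrative, reusing the getProbPredAlpha
# helper assumed above): with the calibration scores in hand, scoring a new
# example needs only one model call and one ranking.
def getPValueInd(model, ex, alphaList):
    testAlpha = getProbPredAlpha(model, ex)
    nGreater = len([a for a in alphaList if a >= testAlpha])
    return float(nGreater + 1) / (len(alphaList) + 1)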
def filterDesc(data, zeroFracT=0.95, LowVarT=0.01, HighCorrT=0.95):
    print "Initial number of descriptors ", len(data.domain.attributes)
    # Find the descriptors for which the fraction of zeros is smaller than zeroFracT - keep these
    attrList = []
    rmAttrList = []
    for attr in data.domain.attributes:
        valueList = []
        nZero = 0
        for ex in data:
            value = ex[attr.name].value
            if value == 0:
                nZero = nZero + 1
            valueList.append(value)
        zeroFrac = float(nZero) / len(valueList)
        if zeroFrac < zeroFracT:
            attrList.append(attr.name)
        else:
            rmAttrList.append(attr.name)
    print "Descriptors deselected because of a large fraction of zeros: "
    print rmAttrList
    data = dataUtilities.attributeSelectionData(data, attrList)
    print "Remaining number of descriptors ", len(data.domain.attributes)
    # Filter descriptors based on normalized variance
    rmAttrList = []
    for attr in data.domain.attributes:
        valueList = []
        for ex in data:
            value = ex[attr.name].value
            valueList.append(value)
        variance = numpy.var(valueList)
        mean = numpy.mean(valueList)
        normVar = variance / mean
        if normVar < LowVarT:
            rmAttrList.append(attr.name)
    print "Descriptors deselected because of low variance "
    print rmAttrList
    data = dataUtilities.attributeDeselectionData(data, rmAttrList)
    print "Remaining number of descriptors ", len(data.domain.attributes)
    print "Correlation filter not implemented yet"
    return data
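# The HighCorrT parameter is accepted but never used above ("Correlation
# filter not implemented yet"). A minimal sketch of what that missing step
# could look like, assuming purely numeric descriptors; this is not the
# original author's implementation:
def filterHighCorr(data, HighCorrT=0.95):
    names = [attr.name for attr in data.domain.attributes]
    # Build an examples x descriptors matrix and its correlation matrix
    cols = numpy.array([[ex[name].value for name in names] for ex in data])
    corr = numpy.corrcoef(cols, rowvar=0)
    rmAttrList = []
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            # Drop the second descriptor of any highly correlated pair
            if abs(corr[i][j]) > HighCorrT and names[j] not in rmAttrList:
                rmAttrList.append(names[j])
    return dataUtilities.attributeDeselectionData(data, rmAttrList)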
def getDesc(trainDataFile):
    # Read the SAR for which to calculate descriptors
    sarData = dataUtilities.DataTable(trainDataFile)
    # Get names of RDK descriptors
    rdkDescs = getCinfonyDesc.getAvailableDescs("rdk")
    # Calculate the descriptors
    trainData = getCinfonyDesc.getCinfonyDescResults(sarData, rdkDescs)
    # Deselect the SMILES attribute
    attrList = ["SMILES"]
    trainData = dataUtilities.attributeDeselectionData(trainData, attrList)
    # Save the trainData set
    trainData.save("trainData.tab")
    return trainData
def minNN(idx, extTrain, maxDistRatio=None, measure=None):
    """ Use the ratio between the distance to the nearest neighbor of the same and of the other class.
        Two versions exist, with and without scaling with the max distance ratio within the train set. """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)
    distListSame = []
    distListDiff = []
    # measure = Orange.distance.Euclidean(extTrain)
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            if extTrain[idx].get_class().value == extTrain[runIdx].get_class().value:
                distListSame.append(dist)
            else:
                distListDiff.append(dist)
    minDistSame = min(distListSame)
    minDistDiff = min(distListDiff)
    if minDistDiff == 0:
        if maxDistRatio:
            alpha = 1.0
        else:
            alpha = max(distListDiff)
    else:
        if maxDistRatio:
            alpha = minDistSame / (float(minDistDiff) * maxDistRatio)
        else:
            alpha = minDistSame / float(minDistDiff)
    # fid = open("tempFile.txt", "a")
    # fid.write(str(minDistSame)+"\t"+str(minDistDiff)+"\t"+str(maxDistRatio)+"\t"+str(alpha)+"\n")
    # fid.close()
    return alpha
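# A sketch of how the maxDistRatio used above could be obtained (an
# assumption about the author's intent, not the original helper): the
# largest unscaled same/other nearest-neighbor distance ratio observed
# within the train set itself.
def getMaxDistRatio(extTrain, measure=None):
    ratios = []
    for idx in range(len(extTrain)):
        ratios.append(minNN(idx, extTrain, maxDistRatio=None, measure=measure))
    return max(ratios)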
def getDesc(trainDataFile):
    # Read the SAR for which to calculate descriptors
    sarData = dataUtilities.DataTable(trainDataFile)
    # Get names of RDK descriptors
    rdkDescs = getCinfonyDesc.getAvailableDescs("rdk")
    # Calculate the descriptors
    trainData = getCinfonyDesc.getCinfonyDescResults(sarData, rdkDescs)
    # Deselect all string attributes (e.g. the SMILES attribute)
    attrList = [attr.name for attr in trainData.domain.attributes
                if attr.varType == orange.Variable.String]
    trainData = dataUtilities.attributeDeselectionData(trainData, attrList)
    # Save the trainData set
    trainData.save("trainData.tab")
    return trainData
#if idx == 1: break

if __name__ == "__main__":
    """ Assumptions: Binary classification.
        This main will test the implemented CP methods in a 10-fold CV """
    data = dataUtilities.DataTable("HLMSeries2_rdkPhysChemPrepClass.txt")
    attrList = ['"Medivir;HLM (XEN025);CLint (uL/min/mg);(Num)"', 'Structure', '"MV Number"', "rdk.MolecularFormula"]
    data = dataUtilities.attributeDeselectionData(data, attrList)
    print "Select all attributes"
    descListList = [[]]
    for attr in data.domain.attributes:
        descListList[0].append(attr.name)
    #methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"]
    # Non-conformity score method
    methods = ["probPred"]
    cpMethod = "transductive"  # inductive or transductive
    #print "Temp position to save comp time!!"
    # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/
    #import instances
    #measure = instances.MahalanobisConstructor(data)
    measure = None
def getExamplesAndSetTrainBias(self, data, testAttrFilter, testFilterVal):
    """
        Collects and returns the examples that match the filterValue at the Attr defined.
        The remaining examples (that do not match the filterValue at the Attr defined)
        are placed in the trainBias to be added in all train events.
    """
    self.trainBias = None
    if testAttrFilter is not None and testFilterVal is not None and testAttrFilter in data.domain:
        if type(testFilterVal) != list:
            raise Exception("Invalid Attr filter value. It must be a list of strings")
        else:
            allDataEx = len(data)
            examples = orange.ExampleTable(data.domain)
            self.trainBias = orange.ExampleTable(data.domain)
            for ex in data:
                inExamples = False
                for Vfilter in testFilterVal:
                    if ex[testAttrFilter].value == Vfilter:
                        examples.append(ex)
                        inExamples = True
                        break
                if not inExamples:
                    self.trainBias.append(ex)
            print "INFO: Variable control validation:"
            print "         Examples in data: " + str(allDataEx)
            print "         Examples selected for validation: " + str(len(examples))
            print "         Examples to be appended to the train set: " + str(len(self.trainBias))
            examples = dataUtilities.attributeDeselectionData(examples, [testAttrFilter])
    elif testAttrFilter is not None and testFilterVal is None and testAttrFilter in data.domain:
        # Enable pre-selected indices
        self.fixedIdx = orange.LongList()
        allDataEx = len(data)
        examples = orange.ExampleTable(data.domain)
        self.trainBias = orange.ExampleTable(data.domain)
        foldsCounter = {}
        for ex in data:
            value = str(ex[testAttrFilter].value)
            if not miscUtilities.isNumber(value):
                raise Exception("Invalid fold value:" + str(value) + ". It must be str convertible to an int.")
            value = int(float(value))
            if value not in foldsCounter:
                foldsCounter[value] = 1
            else:
                foldsCounter[value] += 1
            if value != 0:
                examples.append(ex)
                self.fixedIdx.append(value - 1)
            else:
                self.trainBias.append(ex)
        print "INFO: Pre-selected " + str(len([f for f in foldsCounter if f != 0])) + " folds for CV:"
        print "         Examples in data: " + str(allDataEx)
        print "         Examples selected for validation: " + str(len(examples))
        print "         Examples to be appended to the train set: " + str(len(self.trainBias))
        examples = dataUtilities.attributeDeselectionData(examples, [testAttrFilter])
    else:
        examples = data
    return examples
if __name__ == "__main__":
    dataFile = "trainData.txt"
    testDataFile = "testData.txt"
    data = dataUtilities.DataTable(dataFile)
    testData = dataUtilities.DataTable(testDataFile)
    # This data contains SMILES and ID, which data and ex are assumed not to.
    attrList = ["SMILES", "ID"]
    data = dataUtilities.attributeDeselectionData(data, attrList)
    testData = dataUtilities.attributeDeselectionData(testData, attrList)
    # Select one ex
    selectionList = []
    for idx in range(len(testData)):
        selectionList.append(0)
    selectionList[0] = 1  # Select first ex
    ex = testData.select(selectionList)
    # One ex in an ExampleTable
    #MD = calcMahalanobis(data, ex)
    # Multiple ex in an ExampleTable
    MD = calcMahalanobis(data, testData)
    #print "Returned MD"
    #print MD
def getSimDescriptors(InReference, InData, methods, active_ids=None, pharmacophore_file=None, callBack=None):
    """ Calculates similarity descriptors for a training set (orange object) using the given
        similarity methods against the given actives.
        Possible method strings in methods are the names of the sim_* methods below,
        e.g. rdk_topo_fps for sim_rdk_topo_fps.
        callBack function, if defined, will be called on each step sending the percentage
        done (0-100): e.g. callBack(25)
        The callBack function shall return True or False, which will indicate to this method
        if the process is to be continued or not, e.g. if callBack(25) == False it indicates
        the caller wants to stop the process of calculating descriptors.
    """
    # Pre-process input Data to standardize the SMILES
    SMILESattr = getSMILESAttr(InData)
    if not SMILESattr:
        return None
    #TODO: Create a method in dataUtilities to standardize the attribute smilesName in place having the attr origSmiles as ID
    if "AZutilities.extraUtilities" in sys.modules and hasattr(extraUtilities, "StandardizeSMILES"):
        # Call a method for standardizing the SMILES in Data.
        # The method is expected to change the attribute defined as smiAttr in data object
        cleanedData = True
        # Process InData
        tmpDomain = orange.Domain([orange.StringVariable("OrigSMI_ID")] + [attr for attr in InData.domain])
        data = orange.ExampleTable(tmpDomain, InData)
        # Fill the OrigSMI_ID
        for ex in data:
            ex["OrigSMI_ID"] = ex[SMILESattr]
        extraUtilities.StandardizeSMILES(data, smiAttr=SMILESattr, cName="OrigSMI_ID")
        # Process Input actives
        activesDomain = orange.Domain([orange.StringVariable("OrigSMI_ID"), orange.StringVariable("SMILES")], 0)
        activesData = orange.ExampleTable(activesDomain)
        for act in InReference:
            activesData.append([act, act])
        extraUtilities.StandardizeSMILES(activesData, smiAttr="SMILES", cName="OrigSMI_ID")
        #print activesData.domain
        actives = []
        for ex in activesData:
            actives.append(str(ex["SMILES"].value))
    else:
        data = InData
        print "NO cleaning"
        actives = InReference
        cleanedData = False
    # Adjust the header
    atts = []
    for m in methods:
        count = 1
        for a in actives:
            attname = m + '(reference_' + str(count) + ')'
            #print "ATT: " + str(attname)
            #print "M: " + str(m)
            atts.append(orange.FloatVariable(attname))
            count += 1
    newdomain = orange.Domain(data.domain.attributes + atts, data.domain.classVar)
    newdata = orange.ExampleTable(newdomain, data)
    att_idx = 0
    # If callBack is defined, it will be called with the percentage done, i.e. 0-100
    if active_ids:
        nTotalSteps = len(newdata) * ((len(methods) - 1) * len(actives) + len(active_ids))
    else:
        nTotalSteps = len(methods) * len(actives) * len(newdata)
    stepsDone = 0
    # Fill up the data
    for m in methods:
        if m == 'rdk_topo_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_topo_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate topo fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_MACCS_keys':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_MACCS_keys(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate MACCS key"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_morgan_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_morgan_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate morgan fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_morgan_features_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_morgan_features_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate morgan features fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_atompair_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_atompair_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate atompair fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'azo_pharmacophore_fps':
            for a in active_ids:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    instance[atts[att_idx]] = orange.Value(atts[att_idx],
                                                           azo_pharmacophore_az_inhouse(a, instance, pharmacophore_file))
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
    if cleanedData:
        # Remove the fixed SMILES and revert to the Original SMILES
        newdata = dataUtilities.attributeDeselectionData(newdata, [SMILESattr])
        newdata.domain["OrigSMI_ID"].name = SMILESattr
    return newdata
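# Illustrative call (the reference SMILES and file name are placeholders):
# append Morgan-fingerprint similarities against two reference actives.
#   refs = ["c1ccccc1O", "CCN(CC)CC"]
#   data = dataUtilities.DataTable("molecules.tab")
#   simData = getSimDescriptors(refs, data, ["rdk_morgan_fps"])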
def getAcc(self, callBack=None, callBackWithFoldModel=None):
    """ For regression problems, it returns the RMSE and the Q2.
        For classification problems, it returns CA and the ConfMat.
        The return is made in a Dict: {"RMSE":0.2, "Q2":0.1, "CA":0.98, "CM":[[TP, FP],[FN, TN]]}
        For the EvalResults not supported for a specific learner/dataset, the respective result will be None.

        If the learner is a dict {"LearnerName":learner, ...}, the results will be a dict with
        results for all learners and for a consensus made out of those that were stable.

        If some error occurred, the respective values in the Dict will be None.
    """
    self.__log("Starting Calculating MLStatistics")
    statistics = {}
    if not self.__areInputsOK():
        return None
    # Set the response type
    self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
    self.__log("  " + str(self.responseType))

    # Create the Train and test sets
    if self.usePreDefFolds:
        DataIdxs = self.preDefIndices
    else:
        DataIdxs = self.sampler(self.data, self.nExtFolds)
    foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0]  # Folds used only from 1 on ... 0 is for the fixed train Bias
    nFolds = len(foldsN)
    # Fix the indexes based on DataIdxs:
    # (0s) represent the train set; (>= 1s) represent the test set folds
    if self.useVarCtrlCV:
        nShifted = [0] * nFolds
        for idx, isTest in enumerate(self.preDefIndices):  # self.preDefIndices == 0 are to be used in TrainBias
            if not isTest:
                if DataIdxs[idx]:
                    nShifted[DataIdxs[idx]] += 1
                    DataIdxs[idx] = 0
        for idx, shift in enumerate(nShifted):
            self.__log("In fold " + str(idx) + ", " + str(shift) + " examples were shifted to the train set.")

    # Vars for saving each fold's result
    optAcc = {}
    results = {}
    exp_pred = {}
    nTrainEx = {}
    nTestEx = {}

    # Set a dict of learners
    MLmethods = {}
    if type(self.learner) == dict:
        for ml in self.learner:
            MLmethods[ml] = self.learner[ml]
    else:
        MLmethods[self.learner.name] = self.learner

    models = {}
    self.__log("Calculating Statistics for MLmethods:")
    self.__log("  " + str([x for x in MLmethods]))

    # Check data in advance so that, by chance, it will not fail at the last fold!
    for foldN in foldsN:
        trainData = self.data.select(DataIdxs, foldN, negate=1)
        self.__checkTrainData(trainData)

    # Optional!!
    # Order learners so that PLS is the first
    sortedML = [ml for ml in MLmethods]
    if "PLS" in sortedML:
        sortedML.remove("PLS")
        sortedML.insert(0, "PLS")

    stepsDone = 0
    nTotalSteps = len(sortedML) * self.nExtFolds
    for ml in sortedML:
        startTime = time.time()
        self.__log("    > " + str(ml) + "...")
        try:
            # Vars for saving each fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in foldsN:
                if type(self.learner) == dict:
                    self.paramList = None
                trainData = self.data.select(DataIdxs, foldN, negate=1)
                testData = self.data.select(DataIdxs, foldN)
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:" + smilesAttr)
                    if MLmethods[ml].specialType == 1:
                        trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
                        testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name])
                        self.__log("Selected attrs: " + str([attr.name for attr in trainData.domain]))
                    else:
                        trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
                        testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                        self.__log("Selected attrs: " + str([attr.name for attr in trainData.domain[0:3]] + ["..."] +
                                                            [attr.name for attr in trainData.domain[len(trainData.domain) - 3:]]))
                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                # Test if train sets inside the optimizer will respect the dataSize criteria.
                # If not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                    dontOptimize = True
                else:
                    tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs, 1, negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True
                SpecialModel = None
                if dontOptimize:
                    logTxt += "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                            random_generator=random.randint(0, 100))
                        CA = evalUtilities.CA(res)[0]
                        optAcc[ml].append(CA)
                    else:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                            stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                            random_generator=random.randint(0, 100))
                        R2 = evalUtilities.R2(res)[0]
                        optAcc[ml].append(R2)
                else:
                    if MLmethods[ml].specialType == 1:
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds=5)
                            optAcc[ml].append(optInfo["Acc"])
                        else:
                            res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                                stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                                random_generator=random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(baseDir=AZOC.NFS_SCRATCHDIR,
                                                                 desc="AccWOptParam", seed=id(trainData))
                        trainData.save(os.path.join(runPath, "trainData.tab"))
                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                            fixedParams=self.fixedParams)
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log("       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized.")
                            self.__log("                It will be ignored")
                            #self.__log("                It will be set to default parameters")
                            self.__log("                DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            #MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5,
                                                                    stratified=orange.MakeRandomIndices.StratifiedIfPossible,
                                                                    random_generator=random.randint(0, 100))
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)
                            miscUtilities.removeDir(runPath)
                # Train the model
                if SpecialModel is not None:
                    model = SpecialModel
                else:
                    model = MLmethods[ml](trainData)
                models[ml].append(model)
                # Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model),
                                        evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather predictions
                    for n, ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred),
                                        evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
                if callBack:
                    stepsDone += 1
                    if not callBack((100 * stepsDone) / nTotalSteps):
                        return None
                if callBackWithFoldModel:
                    callBackWithFoldModel(model)

            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],
                                     self.responseType, self.nExtFolds, logTxt,
                                     labels=hasattr(self.data.domain.classVar, "values") and
                                     list(self.data.domain.classVar.values) or None)
            if self.verbose > 0:
                print "UnbiasedAccuracyGetter!Results " + ml + ":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            res["runningTime"] = time.time() - startTime
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log("       OK")
        except:
            self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
            error = str(sys.exc_info()[0]) + " " + \
                    str(sys.exc_info()[1]) + " " + \
                    str(traceback.extract_tb(sys.exc_info()[2]))
            self.__log(error)
            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

    if not statistics or len(statistics) < 1:
        self.__log("ERROR: No statistics to return!")
        return None
    elif len(statistics) > 1:
        # We still need to build a consensus model out of the stable models,
        # but ONLY if there is more than one stable model!
        # When only one or no stable models, build a consensus based on all models.
        # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
        consensusMLs = {}
        for modelName in statistics:
            StabilityValue = statistics[modelName]["StabilityValue"]
            if StabilityValue is not None and statistics[modelName]["stable"]:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])
        self.__log("Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods.")

        if len(consensusMLs) <= 1:  # We need more models to build a consensus!
            consensusMLs = {}
            for modelName in statistics:
                consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

        # Exclude specialType models
        excludeThis = []
        for learnerName in consensusMLs:
            if models[learnerName][0].specialType > 0:
                excludeThis.append(learnerName)
        for learnerName in excludeThis:
            consensusMLs.pop(learnerName)
            self.__log("    > Excluded special model " + learnerName)
        self.__log("    > Stable modules: " + str(consensusMLs.keys()))

        if len(consensusMLs) >= 2:
            # Vars for saving each fold's result
            startTime = time.time()
            Cresults = []
            Cexp_pred = []
            CnTrainEx = []
            CnTestEx = []
            self.__log("Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]))
            for foldN in range(self.nExtFolds):
                if self.responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    # exprTest0
                    exprTest0 = "(0"
                    for ml in consensusMLs:
                        exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest0 += ", " + ml + " == " + CLASS0 + " "
                    exprTest0 += "]),1)"
                    # exprTest1
                    exprTest1 = "(0"
                    for ml in consensusMLs:
                        exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str(optAcc[ml][foldN]) + " "
                    exprTest1 += ")/IF0(sum([False"
                    for ml in consensusMLs:
                        exprTest1 += ", " + ml + " == " + CLASS1 + " "
                    exprTest1 += "]),1)"
                    # Expression
                    expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                else:
                    Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                    expression = "(1 / " + str(Q2sum) + ") * (0"
                    for ml in consensusMLs:
                        expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                    expression += ")"

                testData = self.data.select(DataIdxs, foldN + 1)  # fold 0 is for the train Bias!!
                smilesAttr = dataUtilities.getSMILESAttr(testData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:" + smilesAttr)
                    testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                    self.__log("Selected attrs: " + str([attr.name for attr in testData.domain[0:3]] + ["..."] +
                                                        [attr.name for attr in testData.domain[len(testData.domain) - 3:]]))
                CnTestEx.append(len(testData))
                consensusClassifiers = {}
                for learnerName in consensusMLs:
                    consensusClassifiers[learnerName] = models[learnerName][foldN]
                model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                CnTrainEx.append(model.NTrainEx)
                # Test the model
                if self.responseType == "Classification":
                    Cresults.append((evalUtilities.getClassificationAccuracy(testData, model),
                                     evalUtilities.getConfMat(testData, model)))
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather predictions
                    for n, ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    Cresults.append((evalUtilities.calcRMSE(local_exp_pred),
                                     evalUtilities.calcRsqrt(local_exp_pred)))
                    # Save the experimental value and correspondent predicted value
                    Cexp_pred += local_exp_pred

            res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx,
                                     self.responseType, self.nExtFolds,
                                     labels=hasattr(self.data.domain.classVar, "values") and
                                     list(self.data.domain.classVar.values) or None)
            res["runningTime"] = time.time() - startTime
            statistics["Consensus"] = copy.deepcopy(res)
            statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
            self.__writeResults(statistics)
        self.__log("Returned multiple ML methods statistics.")
        return statistics

    # By default return the only existing statistics!
    self.__writeResults(statistics)
    self.__log("Returned only one ML method statistics.")
    return statistics[statistics.keys()[0]]
def getAcc(self, callBack = None, callBackWithFoldModel = None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None # Set the response type self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" self.__log(" "+str(self.responseType)) #Create the Train and test sets if self.usePreDefFolds: DataIdxs = self.preDefIndices else: DataIdxs = self.sampler(self.data, self.nExtFolds) foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0] #Folds used only from 1 on ... 0 are for fixed train Bias nFolds = len(foldsN) #Fix the Indexes based on DataIdxs # (0s) represents the train set ( >= 1s) represents the test set folds if self.useVarCtrlCV: nShifted = [0] * nFolds for idx,isTest in enumerate(self.preDefIndices): # self.preDefIndices == 0 are to be used in TrainBias if not isTest: if DataIdxs[idx]: nShifted[DataIdxs[idx]] += 1 DataIdxs[idx] = 0 for idx,shift in enumerate(nShifted): self.__log("In fold "+str(idx)+", "+str(shift)+" examples were shifted to the train set.") #Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} #Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models={} self.__log("Calculating Statistics for MLmethods:") self.__log(" "+str([x for x in MLmethods])) #Check data in advance so that, by chance, it will not faill at the last fold! for foldN in foldsN: trainData = self.data.select(DataIdxs,foldN,negate=1) self.__checkTrainData(trainData) #Optional!! 
# Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0,"PLS") stepsDone = 0 nTotalSteps = len(sortedML) * self.nExtFolds for ml in sortedML: startTime = time.time() self.__log(" > "+str(ml)+"...") try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] logTxt = "" for foldN in foldsN: if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs,foldN,negate=1) testData = self.data.select(DataIdxs,foldN) smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: self.__log("Found SMILES attribute:"+smilesAttr) if MLmethods[ml].specialType == 1: trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name]) testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name]) self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain])) else: trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr]) testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr]) self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]])) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) #Test if trainsets inside optimizer will respect dataSize criterias. # if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = self.sampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs,1,negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True SpecialModel = None if dontOptimize: logTxt += " Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n" self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: if MLmethods[ml].specialType == 1: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds = 5) optAcc[ml].append(optInfo["Acc"]) else: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam", seed = id(trainData)) trainData.save(os.path.join(runPath,"trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner = MLmethods[ml], trainDataFile = os.path.join(runPath,"trainData.tab"), paramList = self.paramList, useGrid = False, verbose = self.verbose, queueType = self.queueType, runPath = runPath, nExtFolds = None, nFolds = self.nInnerFolds, logFile = self.logFile, getTunedPars = True, fixedParams = self.fixedParams) if not MLmethods[ml] or 
not MLmethods[ml].optimized: self.__log(" WARNING: GETACCWOPTPARAM: The learner "+str(ml)+" was not optimized.") self.__log(" It will be ignored") #self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: "+runPath) #Set learner back to default #MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner "+str(ml)+" was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) #Train the model if SpecialModel is not None: model = SpecialModel else: model = MLmethods[ml](trainData) models[ml].append(model) #Test the model if self.responseType == "Classification": results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] # Predict using bulk-predict predictions = model(testData) # Gather predictions for n,ex in enumerate(testData): local_exp_pred.append((ex.getclass().value, predictions[n].value)) results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100*stepsDone)/nTotalSteps): return None if callBackWithFoldModel: callBackWithFoldModel(model) res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results "+ml+":\n" pprint(res) if not res: raise Exception("No results available!") res["runningTime"] = time.time() - startTime statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner "+str(ml)+" failed to create/optimize the model!") error = str(sys.exc_info()[0]) +" "+\ str(sys.exc_info()[1]) +" "+\ str(traceback.extract_tb(sys.exc_info()[2])) self.__log(error) res = self.createStatObj() statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: #We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0) consensusMLs={} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log("Found "+str(len(consensusMLs))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.") if len(consensusMLs) <= 1: # we need more models to build a consensus! 
consensusMLs={} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) # Exclude specialType models excludeThis = [] for learnerName in consensusMLs: if models[learnerName][0].specialType > 0: excludeThis.append(learnerName) for learnerName in excludeThis: consensusMLs.pop(learnerName) self.__log(" > Excluded special model " + learnerName) self.__log(" > Stable modules: " + str(consensusMLs.keys())) if len(consensusMLs) >= 2: #Var for saving each Fols result startTime = time.time() Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log("Calculating the statistics for a Consensus model based on "+str([ml for ml in consensusMLs])) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) # exprTest0 exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(optAcc[ml][foldN])+" " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", "+ml+" == "+CLASS0+" " exprTest0 += "]),1)" # exprTest1 exprTest1 = "(0" for ml in consensusMLs: exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(optAcc[ml][foldN])+" " exprTest1 += ")/IF0(sum([False" for ml in consensusMLs: exprTest1 += ", "+ml+" == "+CLASS1+" " exprTest1 += "]),1)" # Expression expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / "+str(Q2sum)+") * (0" for ml in consensusMLs: expression += " + "+str(optAcc[ml][foldN])+" * "+ml+" " expression += ")" testData = self.data.select(DataIdxs,foldN+1) # fold 0 if for the train Bias!! smilesAttr = dataUtilities.getSMILESAttr(testData) if smilesAttr: self.__log("Found SMILES attribute:"+smilesAttr) testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr]) self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]])) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression) CnTrainEx.append(model.NTrainEx) #Test the model if self.responseType == "Classification": Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] # Predict using bulk-predict predictions = model(testData) # Gather predictions for n,ex in enumerate(testData): local_exp_pred.append((ex.getclass().value, predictions[n].value)) Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None ) res["runningTime"] = time.time() - startTime statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics #By default return the only existing statistics! 
self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
def __init__(self, **kwds): self.verbose = 0 self.logFile = None self.resultsFile = None self.nExtFolds = 5 self.nInnerFolds = 5 self.data = None self.learner = None self.paramList = None self.queueType = "NoSGE" self.responseType = None self.fixedParams = {} self.testAttrFilter = None self.testFilterVal = None self.sampler = dataUtilities.SeedDataSampler # Append arguments to the __dict__ member variable self.__dict__.update(kwds) self.learnerName = "" self.preDefIndices = orange.LongList() self.usePreDefFolds = False self.useVarCtrlCV = False if self.testAttrFilter and self.testAttrFilter in self.data.domain: if self.testFilterVal and type(self.testFilterVal) == list and type(self.testAttrFilter) == str: self.useVarCtrlCV = True self.usePreDefFolds = False for ex in self.data: if ex[self.testAttrFilter].value in self.testFilterVal: # Compound selected to be allowed in the test set self.preDefIndices.append(1) else: # Compound to not include in the test set. Always to be shifted to the train self.preDefIndices.append(0) elif self.testFilterVal is None: self.usePreDefFolds = True self.useVarCtrlCV = False #Enable pre-selected-indices ( index 0 will be set for train Bias) foldsCounter = {} for ex in self.data: value = str(ex[self.testAttrFilter].value) if not miscUtilities.isNumber(value): self.__log("Invalid fold value:"+str(value)+". It must be str convertable to an int.") return False value = int(float(value)) if value not in foldsCounter: foldsCounter[value] = 1 else: foldsCounter[value] += 1 self.preDefIndices.append(value) self.__log( "INFO: Pre-selected "+str(len([f for f in foldsCounter.keys() if f != 0]))+" folds for CV:") self.__log( " Examples in data: "+str(sum(foldsCounter.values()))) self.__log( " Examples selected for validation: "+str(sum([foldsCounter[f] for f in foldsCounter if f != 0]))) self.__log( " Examples to be appended to the train set: "+ str(0 in foldsCounter.keys() and foldsCounter[0] or 0)) else: self.__log("ERROR: Attribute Filter Ctrl was selected, but attribute is not in expected format: " + str(self.testAttrFilter)) return False self.data = dataUtilities.attributeDeselectionData(self.data, [self.testAttrFilter]) else: self.usePreDefFolds = False self.useVarCtrlCV = False self.testAttrFilter = None self.testFilterVal = None
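# Hedged usage sketch for the two test-set control modes set up above
# ("Fold" and "TestSet" are hypothetical attribute names, and the class name
# is assumed from the log messages in this module):
#
#   # Pre-defined folds: the attribute holds an int per example; examples in
#   # fold 0 are always appended to the training set.
#   ua = UnbiasedAccuracyGetter(data=data, learner=learner,
#                               testAttrFilter="Fold", testFilterVal=None)
#
#   # Variable-controlled CV: only examples whose attribute value is in the
#   # given list may be placed in the external test folds.
#   ua = UnbiasedAccuracyGetter(data=data, learner=learner,
#                               testAttrFilter="TestSet", testFilterVal=["Yes"])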
def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None parameters: algo - key for the structural feature generation algorithm (set dependent structural features that have to be calculated inside the crossvalidation) minsup - minimum support for the algorithm atts - attributes to be removed before learning (e.g. meta etc...) """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if algorithm: self.__log(" Additional features to be calculated inside of cross-validation") self.__log(" Algorithm for structural features: " + str(algorithm)) self.__log(" Minimum support parameter: " + str(minsup)) # Set the response type self.responseType = ( self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" ) self.__log(" " + str(self.responseType)) # Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) # Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models = {} rocs = {} self.__log("Calculating Statistics for MLmethods:") self.__log(" " + str([x for x in MLmethods])) # Check data in advance so that, by chance, it will not faill at the last fold! for foldN in range(self.nExtFolds): trainData = self.data.select(DataIdxs[foldN], negate=1) self.__checkTrainData(trainData) # Optional!! 
# Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") for ml in sortedML: self.__log(" > " + str(ml) + "...") try: # Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] rocs[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] ### mods TG prediction_attribute = orange.FloatVariable("class_prob") domain = [data.domain.attributes, prediction_attribute, data.domain.classvar] data_new = orange.ExampleTable(domain) logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN], negate=1) orig_len = len(trainData.domain.attributes) # add structural descriptors to the training data (TG) if algorithm: trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup) trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts) testData = self.data.select(DataIdxs[foldN]) # print "IDX: ", # print DataIdxs[foldN] # calculate the feature values for the test data (TG) if algorithm: cut_off = orig_len - len(atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: " + str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts) testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts) nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) # Test if trainsets inside optimizer will respect dataSize criterias. # if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." 
) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) # save the prediction probabilities else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: self.__log(" Learner " + str(ml) + " failed to create/optimize the model!") res = self.createStatObj() statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! 
consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) if len(consensusMLs) >= 2: # Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]) ) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0, CLASS1) expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) # Test the model if self.responseType == "Classification": Cresults.append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics # By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
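# The consensus expression built above is a plain string later evaluated by
# AZorngConsensus. A standalone, runnable sketch of what it produces for two
# hypothetical learners ("RF", "CvSVM") with illustrative accuracies. Note
# that the exprTest0.replace(CLASS0, CLASS1) shortcut assumes the class label
# string does not occur anywhere else in the expression (e.g. inside a
# learner name), otherwise it would be corrupted too.
def _demoConsensusExpr():
    optAcc = {"RF": 0.81, "CvSVM": 0.78}   # hypothetical per-fold accuracies
    CLASS0, CLASS1 = "POS", "NEG"
    exprTest0 = "(0"
    for ml in optAcc:
        exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml]) + " "
    exprTest0 += ")/IF0(sum([False"
    for ml in optAcc:
        exprTest0 += ", " + ml + " == " + CLASS0 + " "
    exprTest0 += "]),1)"
    exprTest1 = exprTest0.replace(CLASS0, CLASS1)
    return [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]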
def getAcc(self, algorithm = None, minsup = None, atts = None): """ For regression problems, it returns the RMSE and the R2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"R2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if (self.algorithm): self.__log(" Additional structural features to be calculated inside of cross-validation") self.__log(" Algorithm for structural features: "+str(self.algorithm)) self.__log(" Minimum support parameter: "+str(self.minsup)) # Set the response type responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" self.__log(" "+str(responseType)) #Create the Train and test sets DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) #Var for saving each Fols result results = {} exp_pred = {} #Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models={} self.__log("Calculating Statistics for MLmethods:") self.__log(" "+str([x for x in MLmethods])) for ml in MLmethods: self.__log(" > "+str(ml)+"...") try: #Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN],negate=1) orig_len = len(trainData.domain.attributes) if (self.algorithm): # add structural descriptors to the training data (TG) trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, self.algorithm, self.minsup) trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, self.atts) runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam") trainData.save(os.path.join(runPath,"trainData.tab")) testData = self.data.select(DataIdxs[foldN]) if (self.algorithm): # calculate the feature values for the test data (TG) cut_off = orig_len - len(self.atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: "+str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData,smarts) testData = dataUtilities.attributeDeselectionData(testData_structDesc, self.atts) paramOptUtilities.getOptParam( learner = MLmethods[ml], trainDataFile = os.path.join(runPath,"trainData.tab"), paramList = self.paramList, useGrid = False, verbose = self.verbose, queueType = self.queueType, runPath = runPath, nExtFolds = None, nFolds = self.nInnerFolds ) if not MLmethods[ml].optimized: self.__log(" The learner "+str(ml)+" was not optimized.") raise Exception("The learner "+str(ml)+" was not optimized.") miscUtilities.removeDir(runPath) #Train the model model = MLmethods[ml](trainData) models[ml].append(model) #Test the model if responseType == "Classification": results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) 
results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred res = self.createStatObj(results[ml], exp_pred[ml], responseType, self.nExtFolds) if self.verbose > 0: print "AccWOptParamGetter!Results "+ml+":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = res.copy() self.__writeResults(res) self.__log(" OK") except: self.__log(" Learner "+str(ml)+" failed to optimize!") res = self.createStatObj() statistics[ml] = res.copy() if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: #We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! stableML={} for modelName in statistics: if statistics[modelName]["StabilityValue"] < AZOC.QSARSTABILITYTHRESHOLD: # Select only stable models stableML[modelName] = statistics[modelName].copy() if len(stableML) >= 2: self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.") if responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in stableML: exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" " exprTest0 += ")/IF0(sum([False" for ml in stableML: exprTest0 += ", "+ml+" == "+CLASS0+" " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0,CLASS1) expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1] else: R2sum = sum([stableML[ml]["R2"] for ml in stableML]) expression = "(1 / "+str(R2sum)+") * (0" for ml in stableML: expression += " + "+str(stableML[ml]["R2"])+" * "+ml+" " expression += ")" #Var for saving each Fols result Cresults = [] Cexp_pred = [] self.__log("Calculating the statistics for a Consensus model") for foldN in range(self.nExtFolds): testData = self.data.select(DataIdxs[foldN]) consensusClassifiers = {} for learnerName in stableML: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression) #Test the model if responseType == "Classification": Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) ) #Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, responseType, self.nExtFolds) statistics["Consensus"] = res.copy() statistics["Consensus"]["IndividualStatistics"] = stableML.copy() self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics #By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
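# For regression the consensus above is an R2-weighted average of the stable
# models' predictions. A standalone, runnable sketch of the expression string
# it generates ("RF"/"PLS" and the R2 values are hypothetical):
def _demoRegressionConsensusExpr():
    stableML = {"RF": {"R2": 0.62}, "PLS": {"R2": 0.55}}
    R2sum = sum([stableML[ml]["R2"] for ml in stableML])
    expression = "(1 / " + str(R2sum) + ") * (0"
    for ml in stableML:
        expression += " + " + str(stableML[ml]["R2"]) + " * " + ml + " "
    expression += ")"
    # e.g. "(1 / 1.17) * (0 + 0.62 * RF  + 0.55 * PLS )"
    return expression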
def buildModel(trainData, MLMethod, queueType = "NoSGE", verbose = 0, logFile = None): """ Buld the method passed in MLMethod and optimize ( "IndividualStatistics" not in MLMethod) if MLMethod is a Consensus ("individualStatistics" in MLMethod) , build each and optimize first all models and after build the consensus! """ log(logFile, "Building and optimizing learner: "+MLMethod["MLMethod"]+"...") learners = {} MLMethods = {} if "IndividualStatistics" in MLMethod: #It is a consensus and will certaily not contain any #special model as it was filtered in the getUnbiasedAcc for ML in MLMethod["IndividualStatistics"]: MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML]) else: ML = MLMethod["MLMethod"] if MLMETHODS[ML](name = ML).specialType == 1: # If is a special model and has a built-in optimizaer log(logFile, " This is a special model") smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: log(logFile,"Found SMILES attribute:"+smilesAttr) trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name]) optInfo, SpecialModel = MLMETHODS[ML](name = ML).optimizePars(trainData, folds = 5) return SpecialModel else: MLMethods[MLMethod["MLMethod"]] = MLMethod smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr]) # optimize all MLMethods for ML in MLMethods: log(logFile, " Optimizing MLmethod: "+ML) learners[ML] = MLMETHODS[ML](name = ML) runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "competitiveWorkflow_BuildModel") trainData.save(os.path.join(runPath,"trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner = learners[ML], trainDataFile = os.path.join(runPath,"trainData.tab"), useGrid = False, verbose = verbose, queueType = queueType, runPath = runPath, nExtFolds = None, logFile = logFile, getTunedPars = True) if not learners[ML].optimized: print "WARNING: competitiveWorkflow: The learner "+str(learners[ML])+" was not optimized." #print " Using default parameters" print " The "+str(learners[ML])+" will not be included" #print " Returning None" print " DEBUG can be made in: "+runPath #Setting default parameters #learners[ML] = learners[ML].__class__() #return None learners.pop(ML) continue else: print "Optimized learner ",learners[ML] if trainData.domain.classVar.varType == orange.VarTypes.Discrete: MLMethods[ML]["optAcc"] = tunedPars[0] else: res = orngTest.crossValidation([learners[ML]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator = random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] MLMethods[ML]["optAcc"] = R2 miscUtilities.removeDir(runPath) #Train the model if len(learners) == 1: log(logFile, " Building the model:"+learners.keys()[0]) model = learners[learners.keys()[0]](trainData) elif len(learners) >= 1: model = buildConsensus(trainData,learners,MLMethods) else: print "ERROR: No Learners were selected!" return None return model
from trainingMethods import AZorngRF
from trainingMethods import AZorngCvBoost
from trainingMethods import AZorngCvSVM
import orngTest
import orngStat
import orange
import string

data = dataUtilities.DataTable("IIDsetAZOdesc.txt")
test = dataUtilities.DataTable("nonIIDtestAZOdesc.txt")
descList = ["ID", "Smiles", "Conc", "Effect", "Conc_1", "Effect_1", "ID_1", "origSmiles_1", "BioActivity_1"]
data = dataUtilities.attributeDeselectionData(data, descList)

# Deselect descriptors with no variance
descList = ["rdk.fr_dihydropyridine", "rdk.fr_nitroso", "rdk.fr_benzodiazepine", "rdk.fr_thiocyan",
            "rdk.VSA_EState4", "rdk.VSA_EState6", "rdk.VSA_EState7", "rdk.VSA_EState1", "rdk.VSA_EState2",
            "rdk.VSA_EState3", "rdk.SlogP_VSA9", "rdk.SMR_VSA8", "rdk.fr_diazo", "rdk.fr_prisulfonamd",
            "rdk.fr_isocyan", "rdk.fr_azide", "rdk.fr_isothiocyan"]
data = dataUtilities.attributeDeselectionData(data, descList)
print "Length domain ", len(data.domain)

learner = AZorngCvSVM.CvSVMLearner(C=32, gamma=0.03125)
#learner = AZorngRF.RFLearner()
#learner = AZorngRF.RFLearner(stratify = "Yes")  # No effect
#learner = AZorngCvBoost.CvBoostLearner()
#learner.stratify = "Yes"  # No effect
#learner.priors = {"Active":0.80, "Inactive":0.20}
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X, simType): if simType == "Topological": fpsTrain = [FingerprintMols.FingerprintMol(x) for x in trainSmilesList] fp = FingerprintMols.FingerprintMol( Chem.MolFromSmiles(predEx[smilesAttrName].value)) elif simType == "Morgan": fpsTrain = [ AllChem.GetMorganFingerprint(x, 2) for x in trainSmilesList ] fp = AllChem.GetMorganFingerprint( Chem.MolFromSmiles(predEx[smilesAttrName].value), 2) elif simType == "MACCS": fpsTrain = [MACCSkeys.GenMACCSKeys(x) for x in trainSmilesList] fp = MACCSkeys.GenMACCSKeys( Chem.MolFromSmiles(predEx[smilesAttrName].value)) elif simType == "Mahalanobis": attrList = [smilesAttrName, nameAttr] predEx = dataUtilities.attributeDeselectionExample(predEx, attrList) fp = getDescVect(predEx) numTrain = dataUtilities.attributeDeselectionData(train, attrList) trainMat = [] for ex in numTrain: descVect = getDescVect(ex) trainMat.append(descVect) norm = Mahalanobis.create_inverse_covariance_norm(trainMat) else: print "This type of sim is not implemented ", simType simDict = {} idx = 0 simList = [] for ex in train: if simType == "Topological": sim = DataStructs.FingerprintSimilarity(fpsTrain[idx], fp) elif simType == "Morgan": sim = DataStructs.DiceSimilarity(fpsTrain[idx], fp) elif simType == "MACCS": sim = DataStructs.FingerprintSimilarity(fpsTrain[idx], fp) elif simType == "Mahalanobis": descVect = trainMat[idx] dist = Mahalanobis.compute_distance(fp, descVect, norm) sim = dist else: print "This type of sim is not implemented ", simType idx = idx + 1 simDict[ex[nameAttr].value] = sim simList.append(sim) if simType == "Mahalanobis": # Mahalanobis gives a distance while the other methods are similarities simList.sort() else: simList.sort(reverse=True) simList = simList[0:X] medSim = round(numpy.median(simList), 3) stdSim = round(numpy.std(simList), 3) minSim = round(min(simList), 3) maxSim = round(max(simList), 3) entropy = round(getRespVar(simList, simDict, train, nameAttr), 3) entropyClosest = round( getRespVar(simList[0:X / 2], simDict, train, nameAttr), 3) return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
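# A self-contained sketch of the RDKit similarity calls used above (standard
# RDKit API; the two SMILES strings are arbitrary examples):
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys
from rdkit.Chem.Fingerprints import FingerprintMols

molA = Chem.MolFromSmiles("c1ccccc1O")   # phenol
molB = Chem.MolFromSmiles("c1ccccc1N")   # aniline

# Topological (Daylight-like) fingerprints, Tanimoto similarity by default
simTopo = DataStructs.FingerprintSimilarity(FingerprintMols.FingerprintMol(molA),
                                            FingerprintMols.FingerprintMol(molB))
# Morgan (circular, radius 2) fingerprints scored with Dice, as in getXNN
simMorgan = DataStructs.DiceSimilarity(AllChem.GetMorganFingerprint(molA, 2),
                                       AllChem.GetMorganFingerprint(molB, 2))
# MACCS keys, again with the default Tanimoto metric
simMACCS = DataStructs.FingerprintSimilarity(MACCSkeys.GenMACCSKeys(molA),
                                             MACCSkeys.GenMACCSKeys(molB))
print "topo:", simTopo, "morgan:", simMorgan, "maccs:", simMACCS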
def buildConsensus(trainData, learners, MLMethods, logFile=None): log( logFile, "Building a consensus model based on optimized MLmethods: " + str([ml for ml in MLMethods]) + "...") if trainData.domain.classVar.varType == orange.VarTypes.Discrete: #Expression: If CAavg_{POS} ge CAavg_{NEG} -> POS else -> NEG # where CAavg_{POS} is the average of classification accuracies of all models predicting POS. CLASS0 = str(trainData.domain.classVar.values[0]) CLASS1 = str(trainData.domain.classVar.values[1]) #exprTest0 exprTest0 = "(0" for ml in MLMethods: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str( MLMethods[ml]["optAcc"]) + " " exprTest0 += ")/IF0(sum([False" for ml in MLMethods: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" # exprTest1 exprTest1 = "(0" for ml in MLMethods: exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str( MLMethods[ml]["optAcc"]) + " " exprTest1 += ")/IF0(sum([False" for ml in MLMethods: exprTest1 += ", " + ml + " == " + CLASS1 + " " exprTest1 += "]),1)" # expression expression = [ exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1 ] else: Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in MLMethods: expression += " + " + str( MLMethods[ml]["optAcc"]) + " * " + ml + " " expression += ")" consensusLearners = {} for learnerName in learners: consensusLearners[learnerName] = learners[learnerName] learner = AZorngConsensus.ConsensusLearner(learners=consensusLearners, expression=expression) log(logFile, " Training Consensus Learner") smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: log(logFile, "Found SMILES attribute:" + smilesAttr) if learner.specialType == 1: trainData = dataUtilities.attributeSelectionData( trainData, [smilesAttr, trainData.domain.classVar.name]) log( logFile, "Selected attrs: " + str([attr.name for attr in trainData.domain])) else: trainData = dataUtilities.attributeDeselectionData( trainData, [smilesAttr]) log(logFile,"Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] +\ [attr.name for attr in trainData.domain[len(trainData.domain)-3:]])) return learner(trainData)
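# What the discrete-class expression above computes, in plain Python: the
# average optimization accuracy of the learners voting for each class, with
# IF0(x, 1) guarding against a zero vote count. A hedged, standalone
# re-implementation of that vote (names and numbers are illustrative):
def _consensusVote(predictions, optAcc, classes=("POS", "NEG")):
    def avgAcc(cls):
        voters = [ml for ml in predictions if predictions[ml] == cls]
        if not voters:                 # plays the role of the IF0(..., 1) guard
            return 0.0
        return sum([optAcc[ml] for ml in voters]) / float(len(voters))
    # CLASS0 wins ties, exactly like "exprTest0 >= exprTest1 -> CLASS0"
    if avgAcc(classes[0]) >= avgAcc(classes[1]):
        return classes[0]
    return classes[1]

# e.g. _consensusVote({"RF": "POS", "CvSVM": "NEG", "PLS": "POS"},
#                     {"RF": 0.82, "CvSVM": 0.90, "PLS": 0.75})  ->  "NEG",
# because the single high-accuracy voter outweighs the two weaker ones.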
def buildModel(trainData, MLMethod, queueType="NoSGE", verbose=0, logFile=None): """ Buld the method passed in MLMethod and optimize ( "IndividualStatistics" not in MLMethod) if MLMethod is a Consensus ("individualStatistics" in MLMethod) , build each and optimize first all models and after build the consensus! """ log(logFile, "Building and optimizing learner: " + MLMethod["MLMethod"] + "...") learners = {} MLMethods = {} if "IndividualStatistics" in MLMethod: #It is a consensus and will certaily not contain any #special model as it was filtered in the getUnbiasedAcc for ML in MLMethod["IndividualStatistics"]: MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML]) else: ML = MLMethod["MLMethod"] if MLMETHODS[ML]( name=ML ).specialType == 1: # If is a special model and has a built-in optimizaer log(logFile, " This is a special model") smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: log(logFile, "Found SMILES attribute:" + smilesAttr) trainData = dataUtilities.attributeSelectionData( trainData, [smilesAttr, trainData.domain.classVar.name]) optInfo, SpecialModel = MLMETHODS[ML](name=ML).optimizePars( trainData, folds=5) return SpecialModel else: MLMethods[MLMethod["MLMethod"]] = MLMethod smilesAttr = dataUtilities.getSMILESAttr(trainData) if smilesAttr: trainData = dataUtilities.attributeDeselectionData( trainData, [smilesAttr]) # optimize all MLMethods for ML in MLMethods: log(logFile, " Optimizing MLmethod: " + ML) learners[ML] = MLMETHODS[ML](name=ML) runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="competitiveWorkflow_BuildModel") trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam(learner=learners[ML], trainDataFile=os.path.join( runPath, "trainData.tab"), useGrid=False, verbose=verbose, queueType=queueType, runPath=runPath, nExtFolds=None, logFile=logFile, getTunedPars=True) if not learners[ML].optimized: print "WARNING: competitiveWorkflow: The learner " + str( learners[ML]) + " was not optimized." #print " Using default parameters" print " The " + str(learners[ML]) + " will not be included" #print " Returning None" print " DEBUG can be made in: " + runPath #Setting default parameters #learners[ML] = learners[ML].__class__() #return None learners.pop(ML) continue else: print "Optimized learner ", learners[ML] if trainData.domain.classVar.varType == orange.VarTypes.Discrete: MLMethods[ML]["optAcc"] = tunedPars[0] else: res = orngTest.crossValidation( [learners[ML]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100)) R2 = evalUtilities.R2(res)[0] MLMethods[ML]["optAcc"] = R2 miscUtilities.removeDir(runPath) #Train the model if len(learners) == 1: log(logFile, " Building the model:" + learners.keys()[0]) model = learners[learners.keys()[0]](trainData) elif len(learners) >= 1: model = buildConsensus(trainData, learners, MLMethods) else: print "ERROR: No Learners were selected!" return None return model
def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None): """ For regression problems, it returns the RMSE and the Q2 For Classification problems, it returns CA and the ConfMat The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]} For the EvalResults not supported for a specific learner/datase, the respective result will be None if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus made out of those that were stable It some error occurred, the respective values in the Dict will be None parameters: algorithm - list of feature generation algorithms (set dependent features that have to be calculated inside the crossvalidation) params - dictionary of parameters atts - attributes to be removed before learning (e.g. meta etc...) """ self.__log("Starting Calculating MLStatistics") statistics = {} if not self.__areInputsOK(): return None if holdout: self.nExtFolds = 1 if algorithm: self.__log(" Additional features to be calculated inside of cross-validation") for i in algorithm: self.__log(" Algorithm: " + str(i)) for j, v in params.iteritems(): self.__log(" Parameter: " + str(j) + " = " + str(v)) # Set the response type self.responseType = ( self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression" ) self.__log(" " + str(self.responseType)) # Create the Train and test sets DataIdxs = None if holdout: self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training") DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout) else: DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) # Var for saving each Fols result optAcc = {} results = {} exp_pred = {} nTrainEx = {} nTestEx = {} # Set a dict of learners MLmethods = {} if type(self.learner) == dict: for ml in self.learner: MLmethods[ml] = self.learner[ml] else: MLmethods[self.learner.name] = self.learner models = {} rocs = {} self.__log("Calculating Statistics for MLmethods:") self.__log(" " + str([x for x in MLmethods])) # Check data in advance so that, by chance, it will not fail at the last fold! for foldN in range(self.nExtFolds): trainData = self.data.select(DataIdxs[foldN], negate=1) self.__checkTrainData(trainData) # Optional!! # Order Learners so that PLS is the first sortedML = [ml for ml in MLmethods] if "PLS" in sortedML: sortedML.remove("PLS") sortedML.insert(0, "PLS") stepsDone = 0 nTotalSteps = len(sortedML) * self.nExtFolds for ml in sortedML: self.__log(" > " + str(ml) + "...") try: # Var for saving each Fols result results[ml] = [] exp_pred[ml] = [] models[ml] = [] rocs[ml] = [] nTrainEx[ml] = [] nTestEx[ml] = [] optAcc[ml] = [] logTxt = "" for foldN in range(self.nExtFolds): if type(self.learner) == dict: self.paramList = None trainData = self.data.select(DataIdxs[foldN], negate=1) orig_len = len(trainData.domain.attributes) refs = None methods = [ "rdk_MACCS_keys", "rdk_topo_fps", "rdk_morgan_fps", "rdk_morgan_features_fps", "rdk_atompair_fps", ] train_domain = None # add structural descriptors to the training data (TG) if algorithm: for i in range(len(algorithm)): if algorithm[i] == "structClust": self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) actData = orange.ExampleTable(trainData.domain) for d in trainData: # only valid for simboosted qsar paper experiments!? 
if d.getclass() == "2": actData.append(d) refs = structuralClustering.getReferenceStructures( actData, threshold=params["threshold"], minClusterSize=params["minClusterSize"], numThreads=2, ) self.__log( " found " + str(len(refs)) + " reference structures in " + str(len(actData)) + " active structures" ) orig_len = orig_len + (len(refs) * len(methods)) trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods) if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_sim, []) elif algorithm[i] == "ECFP": self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"]) train_domain = trainData_ecfp.domain if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, []) else: self.__log("Algorithm " + str(i) + ": " + str(algorithm[i])) trainData_structDesc = getStructuralDesc.getStructuralDescResult( trainData, algorithm[i], params["minsup"] ) if i == (len(algorithm) - 1): trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts) else: trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, []) # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab") testData = self.data.select(DataIdxs[foldN]) # calculate the feature values for the test data (TG) if algorithm: for i in range(len(algorithm)): if algorithm[i] == "structClust": self.__log(str(algorithm[i])) testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods) if i == (len(algorithm) - 1): testData = dataUtilities.attributeDeselectionData(testData_sim, atts) else: testData = dataUtilities.attributeDeselectionData(testData_sim, []) elif algorithm[i] == "ECFP": self.__log(str(algorithm[i])) # testData_ecfp = orange.ExampleTable(train_domain) tmp_dat = [] for d in testData: tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d) tmp_dat.append(tmp) testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat) if i == (len(algorithm) - 1): # print "removing atts" testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts) else: # print "removing no atts" testData = dataUtilities.attributeDeselectionData(testData_ecfp, []) else: cut_off = orig_len - len(atts) smarts = trainData.domain.attributes[cut_off:] self.__log(" Number of structural features added: " + str(len(smarts))) testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts) if i == (len(algorithm) - 1): testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts) else: testData = dataUtilities.attributeDeselectionData(testData_structDesc, []) # testData.save("/home/girschic/proj/AZ/ProjDev/test.tab") nTrainEx[ml].append(len(trainData)) nTestEx[ml].append(len(testData)) # Test if trainsets inside optimizer will respect dataSize criterias. 
# if not, don't optimize, but still train the model dontOptimize = False if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20): dontOptimize = True else: tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds) tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1) if not self.__checkTrainData(tmpTrainData, False): dontOptimize = True if dontOptimize: logTxt += ( " Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n" ) self.__log(logTxt) if trainData.domain.classVar.varType == orange.VarTypes.Discrete: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) CA = evalUtilities.CA(res)[0] optAcc[ml].append(CA) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) else: runPath = miscUtilities.createScratchDir( baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData) ) # self.__log(" run path:"+str(runPath)) trainData.save(os.path.join(runPath, "trainData.tab")) tunedPars = paramOptUtilities.getOptParam( learner=MLmethods[ml], trainDataFile=os.path.join(runPath, "trainData.tab"), paramList=self.paramList, useGrid=False, verbose=self.verbose, queueType=self.queueType, runPath=runPath, nExtFolds=None, nFolds=self.nInnerFolds, logFile=self.logFile, getTunedPars=True, ) if not MLmethods[ml] or not MLmethods[ml].optimized: self.__log( " WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized." ) self.__log(" It will be ignored") # self.__log(" It will be set to default parameters") self.__log(" DEBUG can be done in: " + runPath) # Set learner back to default # MLmethods[ml] = MLmethods[ml].__class__() raise Exception("The learner " + str(ml) + " was not optimized.") else: if trainData.domain.classVar.varType == orange.VarTypes.Discrete: optAcc[ml].append(tunedPars[0]) else: res = orngTest.crossValidation( [MLmethods[ml]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator=random.randint(0, 100), ) R2 = evalUtilities.R2(res)[0] optAcc[ml].append(R2) miscUtilities.removeDir(runPath) # Train the model model = MLmethods[ml](trainData) models[ml].append(model) # Test the model if self.responseType == "Classification": results[ml].append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) roc = self.aroc(testData, [model]) rocs[ml].append(roc) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) results[ml].append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value exp_pred[ml] += local_exp_pred if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) if self.verbose > 0: print "UnbiasedAccuracyGetter!Results " + ml + ":\n" pprint(res) if not res: raise Exception("No results available!") statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) self.__log(" OK") except: print "Unexpected error:", print sys.exc_info()[0] print sys.exc_info()[1] self.__log(" Learner " + str(ml) + " failed to create/optimize the 
model!") res = self.createStatObj( results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml], self.responseType, self.nExtFolds, logTxt, rocs[ml], ) statistics[ml] = copy.deepcopy(res) self.__writeResults(statistics) if not statistics or len(statistics) < 1: self.__log("ERROR: No statistics to return!") return None elif len(statistics) > 1: # We still need to build a consensus model out of the stable models # ONLY if there are more that one model stable! # When only one or no stable models, build a consensus based on all models consensusMLs = {} for modelName in statistics: StabilityValue = statistics[modelName]["StabilityValue"] if StabilityValue is not None and statistics[modelName]["stable"]: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) self.__log( "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods." ) if len(consensusMLs) <= 1: # we need more models to build a consensus! consensusMLs = {} for modelName in statistics: consensusMLs[modelName] = copy.deepcopy(statistics[modelName]) if len(consensusMLs) >= 2: # Var for saving each Fols result Cresults = [] Cexp_pred = [] CnTrainEx = [] CnTestEx = [] self.__log( "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs]) ) for foldN in range(self.nExtFolds): if self.responseType == "Classification": CLASS0 = str(self.data.domain.classVar.values[0]) CLASS1 = str(self.data.domain.classVar.values[1]) exprTest0 = "(0" for ml in consensusMLs: exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " " exprTest0 += ")/IF0(sum([False" for ml in consensusMLs: exprTest0 += ", " + ml + " == " + CLASS0 + " " exprTest0 += "]),1)" exprTest1 = exprTest0.replace(CLASS0, CLASS1) expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1] else: Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs]) expression = "(1 / " + str(Q2sum) + ") * (0" for ml in consensusMLs: expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " " expression += ")" testData = self.data.select(DataIdxs[foldN]) CnTestEx.append(len(testData)) consensusClassifiers = {} for learnerName in consensusMLs: consensusClassifiers[learnerName] = models[learnerName][foldN] model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression) CnTrainEx.append(model.NTrainEx) # Test the model if self.responseType == "Classification": Cresults.append( ( evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model), ) ) else: local_exp_pred = [] for ex in testData: local_exp_pred.append((ex.getclass(), model(ex))) Cresults.append( (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred)) ) # Save the experimental value and correspondent predicted value Cexp_pred += local_exp_pred res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds) statistics["Consensus"] = copy.deepcopy(res) statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs) self.__writeResults(statistics) self.__log("Returned multiple ML methods statistics.") return statistics # By default return the only existing statistics! self.__writeResults(statistics) self.__log("Returned only one ML method statistics.") return statistics[statistics.keys()[0]]
#print "Break after the first example" #if idx == 1: break if __name__ == "__main__": """ Assumptions; Binary classification Class labels not generalized, assumed to be 'A' and 'N' This main will test the implemented CP methods in a 10 fold CV """ data = dataUtilities.DataTable("trainData.tab") descList = ["SMILES", "SMILES_1"] data = dataUtilities.attributeDeselectionData(data, descList) print "Please note that the class labels are not generalized and need to be checked for a new data set" print "Assumed to be A and N" methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"] # Non-conformity score method #methods = ["kNNratio"] cpMethod = "transductive" # inductive or transductive #print "Temp position to save comp time!!" # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/ #import instances #measure = instances.MahalanobisConstructor(data) measure = None methodIdx = 1 for method in methods:
extTest = dataUtilities.DataTable("nonIIDtestAZOdesc.txt") print "Train set ", len(train) print "randTest set ", len(randTest) print "extTest set ", len(extTest) # Calculate fingerprints for train and test sets fps = getFps(train) fpsRandTest = getFps(randTest) fpsExtTest = getFps(extTest) # Deselect descriptors with no variance descList = ["ID", "Smiles", "Conc", "Effect", "Conc_1", "Effect_1", "ID_1", "origSmiles_1", "BioActivity_1" \ ,"rdk.fr_dihydropyridine", "rdk.fr_nitroso", "rdk.fr_benzodiazepine", "rdk.fr_thiocyan", "rdk.VSA_EState4" ,"rdk.VSA_EState6" \ ,"rdk.VSA_EState7" ,"rdk.VSA_EState1" ,"rdk.VSA_EState2" ,"rdk.VSA_EState3" ,"rdk.SlogP_VSA9" ,"rdk.SMR_VSA8" ,"rdk.fr_diazo" \ ,"rdk.fr_prisulfonamd" ,"rdk.fr_isocyan" ,"rdk.fr_azide" ,"rdk.fr_isothiocyan"] train = dataUtilities.attributeDeselectionData(train, descList) print "Length domain ", len(train.domain) #learner = AZorngCvSVM.CvSVMLearner(C=32, gamma=0.03125) learner = AZorngRF.RFLearner() #learner = AZorngRF.RFLearner(stratify = "Yes") # No effect #learner = AZorngCvBoost.CvBoostLearner() #learner.stratify = "Yes" # No effect #learner.priors = {"Active":0.80, "Inactive":0.20} model = learner(train) thrsList = [0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85] fileName = "optThrs.txt" fid = open(fileName, "w") fid.write("Thrs\tMCC_IID\toutAD_IID\tMCC_nonIID\toutAD_nonIID\n") for thrs in thrsList:
def getSimDescriptors(InActives, InData, methods, active_ids=None, pharmacophore_file=None, callBack=None): """ calculates similarity descriptors for a training set (orange object) using the given similarity methods against the given actives Possible method strings in methods are the names of the sim_* methods below, e.g. rdk_topo_fps for sim_rdk_topo_fps callBack function, if defined, will be called on each step sending the pergentage done (0-100): e.g. callBack(25) the callBack function shall return True of False which will indicate to this method if the process it to be continued or Not. e.g. if callBack(25) == False it indicates the caller want's to stop the process of calculating descriptors """ # Pre-process input Data tto standardize the SMILES SMILESattr = getSMILESAttr(InData) if not SMILESattr: return None #TODO: Create a method in dataUtilities to standardize the attribute smilesName in place having the attr origSmiles as ID if "AZutilities.extraUtilities" in sys.modules and hasattr( extraUtilities, "StandardizeSMILES"): # Call a method for standardizing the SMILES in Data. # The method is expected to change the attribute defined as smiAttr in data object cleanedData = True # Process InData tmpDomain = orange.Domain([orange.StringVariable("OrigSMI_ID")] + [attr for attr in InData.domain]) data = orange.ExampleTable(tmpDomain, InData) # Fill the OrigSMI_ID for ex in data: ex["OrigSMI_ID"] = ex[SMILESattr] extraUtilities.StandardizeSMILES(data, smiAttr=SMILESattr, cName="OrigSMI_ID") # Process Input actives activesDomain = orange.Domain([ orange.StringVariable("OrigSMI_ID"), orange.StringVariable("SMILES") ], 0) activesData = orange.ExampleTable(activesDomain) for act in InActives: activesData.append([act, act]) extraUtilities.StandardizeSMILES(activesData, smiAttr="SMILES", cName="OrigSMI_ID") #print activesData.domain actives = [] for ex in activesData: actives.append(str(ex["SMILES"].value)) else: data = InData actives = InActives cleanedData = False # adjust the header atts = [] for m in methods: count = 1 for a in actives: attname = m + '(active_' + str(count) + ')' atts.append(orange.FloatVariable(attname)) count += 1 newdomain = orange.Domain(data.domain.attributes + atts, data.domain.classVar) newdata = orange.ExampleTable(newdomain, data) att_idx = 0 # if callBack is defined, it will be called with the percentage done, i.e. 
0-100 if active_ids: nTotalSteps = len(newdata) * ( (len(methods) - 1) * len(actives) + len(active_ids)) else: nTotalSteps = len(methods) * len(actives) * len(newdata) stepsDone = 0 # fill up the data for m in methods: if m == 'rdk_topo_fps': count = 1 for a in actives: attname = m + '(active_' + str(count) + ')' for j in range(len(newdata)): instance = newdata[j] tmp = orange.Value(atts[att_idx], orng_sim_rdk_topo_fps(a, instance)) instance[atts[att_idx]] = tmp if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None att_idx += 1 elif m == 'rdk_MACCS_keys': count = 1 for a in actives: attname = m + '(active_' + str(count) + ')' for j in range(len(newdata)): instance = newdata[j] tmp = orange.Value(atts[att_idx], orng_sim_rdk_MACCS_keys(a, instance)) instance[atts[att_idx]] = tmp if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None att_idx += 1 elif m == 'rdk_morgan_fps': count = 1 for a in actives: attname = m + '(active_' + str(count) + ')' for j in range(len(newdata)): instance = newdata[j] tmp = orange.Value(atts[att_idx], orng_sim_rdk_morgan_fps(a, instance)) instance[atts[att_idx]] = tmp if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None att_idx += 1 elif m == 'rdk_morgan_features_fps': count = 1 for a in actives: attname = m + '(active_' + str(count) + ')' for j in range(len(newdata)): instance = newdata[j] tmp = orange.Value( atts[att_idx], orng_sim_rdk_morgan_features_fps(a, instance)) instance[atts[att_idx]] = tmp if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None att_idx += 1 elif m == 'rdk_atompair_fps': count = 1 for a in actives: attname = m + '(active_' + str(count) + ')' for j in range(len(newdata)): instance = newdata[j] tmp = orange.Value(atts[att_idx], orng_sim_rdk_atompair_fps(a, instance)) instance[atts[att_idx]] = tmp if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None att_idx += 1 elif m == 'azo_pharmacophore_fps': count = 1 for a in active_ids: attname = m + '(active_' + str(count) + ')' for j in range(len(newdata)): instance = newdata[j] tmp = orange.Value( atts[att_idx], azo_pharmacophore_az_inhouse(a, instance, pharmacophore_file)) instance[atts[att_idx]] = tmp if callBack: stepsDone += 1 if not callBack((100 * stepsDone) / nTotalSteps): return None att_idx += 1 if cleanedData: #Remove the fixed SMILES and revert to the Original SMILES newdata = dataUtilities.attributeDeselectionData(newdata, [SMILESattr]) newdata.domain["OrigSMI_ID"].name = SMILESattr return newdata
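# Hedged usage sketch for getSimDescriptors, including the callBack protocol
# described in its docstring (the SMILES strings and the method list are
# illustrative; "data" is an Orange table with a SMILES attribute):
#
#   actives = ["c1ccccc1O", "CCN(CC)CC"]
#   def progress(pct):
#       print "descriptor calculation:", pct, "% done"
#       return True        # return False to abort the calculation
#
#   newData = getSimDescriptors(actives, data,
#                               ["rdk_topo_fps", "rdk_morgan_fps"],
#                               callBack=progress)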