Code Example #1
def kNNratio(idx, extTrain, measure=None):
    """
    Use the fraction of kNN with the same response.
    """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)

    distList = []
    if not measure:
        # measure = instances.MahalanobisConstructor(extTrain)
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            distList.append(dist)

    # Get the distance of the 10th NN
    distList.sort()
    thresDist = distList[9]

    # Find the labels of the 10 NN
    sameCount = 0
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            if dist <= thresDist:
                if extTrain[idx].get_class().value == extTrain[runIdx].get_class().value:
                    sameCount = sameCount + 1
    alpha = 1.00 - float(sameCount) / 10.0

    return alpha
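
A minimal driver sketch for these transductive non-conformity functions; the import path, the file name, and the loop are illustrative assumptions, not taken from the source:

# Hypothetical usage: compute one non-conformity score per training example.
from AZutilities import dataUtilities   # assumed AZOrange import path
data = dataUtilities.DataTable("train.tab")   # assumed input file
alphas = [kNNratio(i, data) for i in range(len(data))]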
Code Example #2
def avgNN(idx, extTrain, measure=None):
    """
    Use the ratio between the distance to the kNN of the same and of the other class
    """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)

    distListSame = []
    distListDiff = []
    # measure = Orange.distance.Euclidean(extTrain)
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            if extTrain[idx].get_class().value == extTrain[runIdx].get_class().value:
                distListSame.append(dist)
            else:
                distListDiff.append(dist)
    distListSame.sort()
    avgSame = sum(distListSame[0:10]) / 10.0
    distListDiff.sort()
    avgDiff = sum(distListDiff[0:10]) / 10.0
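    # Guard against a zero denominator: if the ten nearest other-class
    # neighbours are all at distance zero, fall back to the largest distance.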
    if avgDiff == 0:
        alpha = max(distListDiff)
    else:
        alpha = avgSame / float(avgDiff)

    return alpha
Code Example #3
def probPred(idx, extTrain, SVMparam):
    """
    Use the RF prediction probability to set the non-conf score
    """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)

    # Deselect example idx in extTrain
    idxList = range(0, idx)
    idxList.extend(range(idx + 1, len(extTrain)))
    train = extTrain.get_items(idxList)

    # Train a model
    model = AZorngRF.RFLearner(train)
    #model, SVMparam = trainSVMOptParam(train, SVMparam)

    # Predict example idx
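    # returnDFV=True also returns the model's decision-function value; its
    # magnitude is used below as a confidence weight on the score.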
    predList = model(extTrain[idx], returnDFV=True)
    pred = predList[0].value
    prob = predList[1]
    actual = extTrain[idx].get_class().value
    #print pred, actual, prob

    # More non conforming if prediction is different from actual label
    if pred != actual:
        alpha = 1.0 + abs(prob)
    else:
        alpha = 1.0 - abs(prob)

    #print alpha
    return alpha, SVMparam
Code Example #4
File: similarityMetrics.py Project: egonw/AZOrange
def getMahalanobisResults(predictor):

    if predictor.highConf == None and predictor.lowConf == None:
        return None, None
    testData = dataUtilities.attributeDeselectionData(predictor.exToPred, ["SMILEStoPred"])
    trainData = dataUtilities.DataTable(predictor.trainDataPath)
    ExampleFix = dataUtilities.ExFix(trainData.domain, None, False)
    exFixed1 = ExampleFix.fixExample(testData[0])
    if testData.hasMissingValues():
        averageImputer = orange.ImputerConstructor_average(trainData)
        dat = averageImputer(exFixed1)
    else:
        dat = exFixed1

    tab = dataUtilities.DataTable(trainData.domain)
    tab.append(dat)

    MD = calcMahalanobis(trainData, tab)
    near3neighbours = [
        (MD[0]["_train_id_near1"], MD[0]["_train_SMI_near1"]),
        (MD[0]["_train_id_near2"], MD[0]["_train_SMI_near2"]),
        (MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"]),
    ]
    avg3nearest = MD[0]["_train_av3nearest"]
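    # Map the average Mahalanobis distance to the three nearest training
    # neighbours onto the predictor's confidence bands.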
    if avg3nearest < predictor.highConf:
        confStr = predictor.highConfString
    elif avg3nearest > predictor.lowConf:
        confStr = predictor.lowConfString
    else:
        confStr = predictor.medConfString

    return near3neighbours, confStr
Code Example #5
File: descSelectionDragon.py Project: accsc/AZOrange
def getAccStat(rankSumTuple, nDesc, train, randTest, extTest, resultsFid,
               projectName):

    print "Select features based on top ranked features"
    attrList = []
    for elem in rankSumTuple:
        if len(attrList) < nDesc:
            attrList.append(elem[0])
    train = dataUtilities.attributeSelectionData(train, attrList)
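    # Remove the original continuous endpoint column (a project-specific
    # descriptor name) so it does not remain among the features.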
    train = dataUtilities.attributeDeselectionData(
        train, ['HLM_XEN025;Mean;CLint (uL/min/mg);(Num)'])

    print train.domain.attributes, len(train.domain.attributes), train.domain.classVar

    # Get accuracies
    learners = [AZorngRF.RFLearner(nTrees=100)]
    print "CV accuracy"
    MCC_CV = printCV(train, learners, resultsFid, projectName)
    Model = learners[0](train)
    print "Random Test set accuracy"
    MCC_rand = printTestSetAcc(Model, randTest, learners, resultsFid,
                               projectName, True)
    print "External Test set accuracy"
    MCC_ext = printTestSetAcc(Model, extTest, learners, resultsFid,
                              projectName, False)

    return MCC_CV, MCC_rand, MCC_ext
Code Example #6
def probPred(idx, extTrain, SVMparam):
    """
    Use the RF prediction probability to set the non-conf score
    """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)

    # Deselect example idx in extTrain
    idxList = range(0, idx)
    idxList.extend(range(idx + 1, len(extTrain)))
    train = extTrain.get_items(idxList)

    # Train a model
    # model = AZorngRF.RFLearner(train, nActVars = 2)
    model, SVMparam = trainSVMOptParam(train, SVMparam)

    # Predict example idx
    predList = model(extTrain[idx], returnDFV=True)
    pred = predList[0].value
    prob = predList[1]
    actual = extTrain[idx].get_class().value
    # print pred, actual, prob

    # More non conforming if prediction is different from actual label
    if pred != actual:
        alpha = 1.0 + abs(prob)
    else:
        alpha = 1.0 - abs(prob)

    # print alpha
    return alpha, SVMparam
Code Example #7
def cross_validation_plusFTM(data, learners, k, f, att_list):
    """
    Perform k-fold cross validation and add FTM features (minsup = f) in each fold 
    The FTM features for each training fold are recalculated for the test fold (NO FTM run!)
    att_list - the list of attributes that will be removed before learning
    For reference see also:
    http://orange.biolab.si/doc/ofb/accuracy5.py
    http://orange.biolab.si/doc/ofb/c_performance.htm
    """
    acc = [0.0]*len(learners)
    roc = [0.0]*len(learners)
    selection = orange.MakeRandomIndicesCV(data, folds=k)
    for test_fold in range(k):
        train_data = data.select(selection, test_fold, negate=1)
#        print "len->train: ",
#        print len(train_data)
        # add ftm features 
        train_data_ftm = getFTMDescResult(train_data, f)
        minsupStr = str(f).replace(".", "")
        filename = data.name + "_ftm_" + minsupStr + "_" + str(test_fold) + ".tab"
        #train_data.save(filename)
        train_scaled = dataUtilities.attributeDeselectionData(train_data_ftm, att_list)
        
        # recalc and add ftm features to test fold
        test_data = data.select(selection, test_fold)
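        # The FTM run appended its new SMARTS attributes after the original
        # ones, so slicing past the original attribute count recovers them.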
        smarts = train_data_ftm.domain.attributes[len(train_data.domain.attributes):]
        print "# FTM features: ",
        print len(smarts)
        test_data_ftm = getSMARTSrecalcDesc(test_data,smarts)
        test_scaled = dataUtilities.attributeDeselectionData(test_data_ftm, att_list)
                
        classifiers = []
        for l in learners:
            classifiers.append(l(train_scaled))
        acc1 = accuracy(test_scaled, classifiers)
        auroc1 = aroc(test_scaled, classifiers)
        print "%d: %s" % (test_fold+1, acc1)
        print "%d: %s" % (test_fold+1, auroc1)
        for j in range(len(learners)):
            acc[j] += acc1[j]
            roc[j] += auroc1[j]
    for j in range(len(learners)):
        acc[j] = acc[j]/k
        roc[j] = roc[j]/k
    return acc, roc
Code Example #8
File: getBBRCDesc.py Project: AZCompTox/AZOrange
def getBBRCDescResult(dataIN, algo = "FTM", minSupPar = 2, ChisqSig = None, active = None, verbose = 0, descList = []):
    """ delegate to different algorithm methods 
    """
    if not descList:
        descList = []
    outData = None
    if active is not None:
        activeLabel = active
    else:
        if dataIN.domain.classVar:
            activeLabel = dataIN.domain.classVar.values[0]   # For BBRC the active class can be any since it will only use the "count"
        else:
            activeLabel = None
            
    if (algo == "FTM"):              # Using BBRC without class correlation
        BBRCCalc = BBRC(verbose = verbose)
        BBRCCalc.minsup = minSupPar
        BBRCCalc.active = activeLabel
        #Disabling class correlation
        BBRCCalc.DynamicUpperBound = False 
        BBRCCalc.ChisqSig = 0.0
        BBRCCalc.Backbone = False

        outData = BBRCCalc.getDesc(dataIN)
    elif (algo == "BBRC"):
        BBRCCalc = BBRC(verbose = verbose)
        BBRCCalc.minsup = minSupPar
        BBRCCalc.active = activeLabel
        if ChisqSig is not None:       
            if ChisqSig < 0 or ChisqSig > 1:
                print "ERROR: ChisqSig must be defined between 0 and 1"
                return None 
            BBRCCalc.ChisqSig = ChisqSig
        else:
            BBRCCalc.ChisqSig = 0.95
        outData = BBRCCalc.getDesc(dataIN)
    elif (algo == "LAST-PM"):
        outData = getFMinerDescResult(dataIN, minSupPar, algo)
    else:
        print "Algorithm "+str(algo)+" is unknown!"
    if not outData:
        return None
    newAttrs = [attr.name for attr in outData.domain if attr.name not in dataIN.domain]
    if descList:
        desAttrs = [attr for attr in newAttrs if attr not in descList]
    else:
        desAttrs = []
    print "BBRC descriptors requested: "+str(len(descList) or  "ALL")
    print "BBRC descriptors returned: "+str(len(newAttrs)-len(desAttrs))
    if desAttrs:
        outData = dataUtilities.attributeDeselectionData(outData, desAttrs)
    unknownAttrs = [attr for attr in descList if attr not in outData.domain]
    print "Attributes not found among the structural descriptors: ",len(unknownAttrs)," (set to 0.0)"
    outData = dataUtilities.attributeAddData(outData, unknownAttrs, orange.FloatVariable, 0.0)
    return outData
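
A hypothetical call for this descriptor generator; the import path, file name, and minimum-support value are illustrative assumptions:

# Hypothetical usage: generate FTM substructure descriptors for a data table.
from AZutilities import dataUtilities   # assumed AZOrange import path
data = dataUtilities.DataTable("molecules.tab")   # assumed input file
ftmData = getBBRCDescResult(data, algo="FTM", minSupPar=6)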
Code Example #9
def buildConsensus(trainData, learners, MLMethods, logFile = None):
        log(logFile, "Building a consensus model based on optimized MLmethods: "+str([ml for ml in MLMethods])+"...")
        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
            #Expression: If  CAavg_{POS} ge CAavg_{NEG} -> POS  else -> NEG 
            #    where CAavg_{POS} is the average of classification accuracies of all models predicting POS.
            CLASS0 = str(trainData.domain.classVar.values[0])
            CLASS1 = str(trainData.domain.classVar.values[1])
            #exprTest0
            exprTest0 = "(0"
            for ml in MLMethods:
                exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(MLMethods[ml]["optAcc"])+" "
            exprTest0 += ")/IF0(sum([False"
            for ml in MLMethods:
                exprTest0 += ", "+ml+" == "+CLASS0+" "
            exprTest0 += "]),1)"
            # exprTest1
            exprTest1 = "(0"
            for ml in MLMethods:
                exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(MLMethods[ml]["optAcc"])+" "
            exprTest1 += ")/IF0(sum([False"
            for ml in MLMethods:
                exprTest1 += ", "+ml+" == "+CLASS1+" "
            exprTest1 += "]),1)"
            # expression
            expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
        else:
            Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods])
            expression = "(1 / "+str(Q2sum)+") * (0"
            for ml in MLMethods:
                expression += " + "+str(MLMethods[ml]["optAcc"])+" * " + ml +" "
            expression += ")" 

        consensusLearners = {}
        for learnerName in learners:
            consensusLearners[learnerName] = learners[learnerName]
        
        learner = AZorngConsensus.ConsensusLearner(learners = consensusLearners, expression = expression)
        log(logFile, "  Training Consensus Learner")
        smilesAttr = dataUtilities.getSMILESAttr(trainData)
        if smilesAttr:
            log(logFile,"Found SMILES attribute:"+smilesAttr)
            if learner.specialType == 1:
               trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
               log(logFile,"Selected attrs: "+str([attr.name for attr in trainData.domain]))
            else:
               trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])
               log(logFile,"Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] +\
                                              [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

        return learner(trainData)
Code Example #10
def LLOO(idx, extTrain, measure=None):
    """
    Use the fraction of kNN correctly predicted by a local model
    Hard coded to 20 NN.
    Modeling method: RF or Tree?
    """
    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)

    distList = []
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            distList.append(dist)

    # Get the distance of the 20th NN
    distList.sort()
    thresDist = distList[19]

    # Find the labels of the 20 NN
    kNN = []
    for runIdx in range(len(extTrain)):
        dist = measure(extTrain[idx], extTrain[runIdx])
        if dist <= thresDist:
            kNN.append(extTrain[runIdx])
    kNNtrain = dataUtilities.DataTable(kNN)

    # Find the fraction of correctly predicted ex in a LOO over kNN
    corrPred = 0
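    # Note: idx is reused below as the leave-one-out index; the original query
    # index is no longer needed at this point.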
    for idx in range(len(kNNtrain)):

        # Deselect example idx in extTrain
        idxList = range(0, idx)
        idxList.extend(range(idx + 1, len(kNNtrain)))
        train = kNNtrain.get_items(idxList)

        # Train a model
        model = AZorngRF.RFLearner(train)
        # model = Orange.classification.tree.TreeLearner(train)

        pred = model(kNNtrain[idx]).value
        actual = kNNtrain[idx].get_class().value
        if pred == actual:
            corrPred = corrPred + 1
    alpha = 1.0 - float(corrPred) / len(kNNtrain)

    return alpha
Code Example #11
def getMahalanobisResults(predictor,
                          invCovMatFile=None,
                          centerFile=None,
                          dataTableFile=None):
    domain = None
    if predictor.highConf == None and predictor.lowConf == None:
        return None, None
    if not dataTableFile and (not hasattr(predictor, "trainDataPath")
                              or not predictor.trainDataPath):
        print "The predictor does not have a trainDataPath specifyed. We need it for calculating Mahalanobis results!"
        return None, None
    testData = dataUtilities.attributeDeselectionData(predictor.exToPred,
                                                      ["SMILEStoPred"])
    if not dataTableFile:
        trainData = dataUtilities.DataTable(predictor.trainDataPath)
        domain = trainData.domain
    else:
        trainData = None
        domain = predictor.model.domain
    ExampleFix = dataUtilities.ExFix(domain, None, False)
    exFixed1 = ExampleFix.fixExample(testData[0])
    if testData.hasMissingValues():
        if not trainData:
            averageImputer = orange.Imputer_defaults(
                predictor.model.imputeData)
        else:
            averageImputer = orange.ImputerConstructor_average(trainData)
        dat = averageImputer(exFixed1)
    else:
        dat = exFixed1

    tab = dataUtilities.DataTable(domain)
    tab.append(dat)

    MD = calcMahalanobis(trainData, tab, invCovMatFile, centerFile,
                         dataTableFile, domain)
    near3neighbors = [(MD[0]["_train_id_near1"], MD[0]["_train_SMI_near1"]),
                      (MD[0]["_train_id_near2"], MD[0]["_train_SMI_near2"]),
                      (MD[0]["_train_id_near3"], MD[0]["_train_SMI_near3"])]
    avg3nearest = MD[0]["_train_av3nearest"]
    if avg3nearest < predictor.highConf:
        confStr = predictor.highConfString
    elif avg3nearest > predictor.lowConf:
        confStr = predictor.lowConfString
    else:
        confStr = predictor.medConfString

    return near3neighbors, confStr
Code Example #12
def probPredInd(trainSet, calSet):
    """
    Use the RF prediction probability to set the non-conf score
    """
    attrList = ["SMILES_1"]
    trainSet = dataUtilities.attributeDeselectionData(trainSet, attrList)

    # Train a model
    model = AZorngRF.RFLearner(trainSet)

    # Get the list of NC for all ex in calSet
    alphaList = []
    for ex in calSet:
        alpha = getProbPredAlpha(model, ex)
        alphaList.append(alpha)

    return alphaList, model
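
A sketch of the inductive usage implied by the docstring; trainSet, calSet, and testExample are assumed to be prepared elsewhere, and getProbPredAlpha is the helper already referenced above:

# Hypothetical inductive-CP driver: calibrate once, then score new examples.
alphaList, model = probPredInd(trainSet, calSet)
testAlpha = getProbPredAlpha(model, testExample)   # same NC function on a test example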
Code Example #13
File: descProcessing.py Project: tojojames/AZOrange
def filterDesc(data, zeroFracT = 0.95, LowVarT = 0.01, HighCorrT = 0.95):

    print "Initial number of descriptors ", len(data.domain.attributes)

    # Find the descriptors for which the fraction of zeros is smaller than zeroFracT - keep these
    attrList = []
    rmAttrList = []
    for attr in data.domain.attributes:
        valueList = []
        nZero = 0
        for ex in data:
            value = ex[attr.name].value
            if value == 0:
                nZero = nZero + 1
            valueList.append(value)
        zeroFrac = float(nZero)/len(valueList)
        if zeroFrac < zeroFracT:
            attrList.append(attr.name)
        else:
            rmAttrList.append(attr.name)
    print "Descriptors deselected because of a large fraction of zeros: "
    print rmAttrList
    data = dataUtilities.attributeSelectionData(data, attrList)
    print "Remaining number of descriptors ", len(data.domain.attributes)

    # Filter descriptors based on normalized variance
    rmAttrList = []
    for attr in data.domain.attributes:
        valueList = []
        for ex in data:
            value = ex[attr.name].value
            valueList.append(value)
        variance = numpy.var(valueList)
        mean = numpy.mean(valueList)
        normVar = variance/mean
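        # Note: this assumes a non-zero mean; a descriptor centred on zero
        # would need a guard against division by zero here.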
        if normVar < LowVarT:
            rmAttrList.append(attr.name)
        
    print "Descriptors deselected because of low variance "
    print rmAttrList
    data = dataUtilities.attributeDeselectionData(data, rmAttrList)
    print "Remaining number of descriptors ", len(data.domain.attributes)
    
    print "Correlation filter not implemented yet"

    return data
Code Example #14
def getDesc(trainDataFile):

    # Read the SAR for which to calculate descriptors
    sarData = dataUtilities.DataTable(trainDataFile) 

    # Get names of RDK descriptors
    rdkDescs = getCinfonyDesc.getAvailableDescs("rdk")
   
    # Calculate the descriptors 
    trainData = getCinfonyDesc.getCinfonyDescResults(sarData, rdkDescs)

    # Deselect the SMILES attribute
    attrList = ["SMILES"]
    trainData = dataUtilities.attributeDeselectionData(trainData, attrList)
     
    # Save the trainData set
    trainData.save("trainData.tab") 

    return trainData
Code Example #15
def minNN(idx, extTrain, maxDistRatio=None, measure=None):
    """
    Use the ratio between the distance to the nearest neighbor of the same and of the other class
    Two versions exist, with and without scaling with the max distance ratio within the train set. 
    """

    attrList = ["SMILES_1"]
    extTrain = dataUtilities.attributeDeselectionData(extTrain, attrList)

    distListSame = []
    distListDiff = []
    #measure = Orange.distance.Euclidean(extTrain)
    if not measure:
        measure = orange.ExamplesDistanceConstructor_Euclidean(extTrain)
    for runIdx in range(len(extTrain)):
        if runIdx != idx:
            dist = measure(extTrain[idx], extTrain[runIdx])
            if extTrain[idx].get_class().value == extTrain[runIdx].get_class().value:
                distListSame.append(dist)
            else:
                distListDiff.append(dist)
    minDistSame = min(distListSame)
    minDistDiff = min(distListDiff)
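    # Guard: an identical descriptor vector in the other class gives a zero
    # denominator below.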
    if minDistDiff == 0:
        if maxDistRatio:
            alpha = 1.0
        else:
            alpha = max(distListDiff)
    else:
        if maxDistRatio:
            alpha = minDistSame / (float(minDistDiff) * maxDistRatio)
        else:
            alpha = minDistSame / float(minDistDiff)

    #fid = open("tempFile.txt", "a")
    #fid.write(str(minDistSame)+"\t"+str(minDistDiff)+"\t"+str(maxDistRatio)+"\t"+str(alpha)+"\n")
    #fid.close()

    return alpha
Code Example #16
def getMahalanobisResults(predictor, invCovMatFile = None, centerFile = None, dataTableFile = None):
        domain = None
        if predictor.highConf == None and predictor.lowConf == None:
            return None, None
        if not hasattr(predictor,"trainDataPath") or not predictor.trainDataPath:
            print "The predictor does not have a trainDataPath specifyed. We need it for calculating Mahalanobis results!"
            return None, None
        testData = dataUtilities.attributeDeselectionData(predictor.exToPred,["SMILEStoPred"])
        if not dataTableFile:
            trainData = dataUtilities.DataTable(predictor.trainDataPath)
            domain = trainData.domain
        else:
            trainData = None
            domain = predictor.model.domain
        ExampleFix = dataUtilities.ExFix(domain,None,False)
        exFixed1 = ExampleFix.fixExample(testData[0])
        if testData.hasMissingValues():
            if not trainData:
                averageImputer = orange.Imputer_defaults(predictor.model.imputeData) 
            else:
                averageImputer = orange.ImputerConstructor_average(trainData)
            dat = averageImputer(exFixed1)
        else:
            dat = exFixed1

        tab = dataUtilities.DataTable(domain)
        tab.append(dat)

        MD = calcMahalanobis(trainData, tab, invCovMatFile, centerFile, dataTableFile, domain)
        near3neighbors = [ (MD[0]["_train_id_near1"], MD[0]["_train_SMI_near1"]), (MD[0]["_train_id_near2"], MD[0]["_train_SMI_near2"]), (MD[0]["_train_id_near3" ], MD[0]["_train_SMI_near3"]) ]
        avg3nearest = MD[0]["_train_av3nearest"]
        if avg3nearest < predictor.highConf:
            confStr = predictor.highConfString
        elif avg3nearest > predictor.lowConf:
            confStr = predictor.lowConfString
        else:
            confStr = predictor.medConfString

        return near3neighbors, confStr
Code Example #17
def getDesc(trainDataFile):

    # Read the SAR for which to calculate descriptors
    sarData = dataUtilities.DataTable(trainDataFile)

    # Get names of RDK descriptors
    rdkDescs = getCinfonyDesc.getAvailableDescs("rdk")

    # Calculate the descriptors
    trainData = getCinfonyDesc.getCinfonyDescResults(sarData, rdkDescs)

    # Deselect all string attributes (e.g. the SMILES column), which are not descriptors
    attrList = [
        attr.name for attr in trainData.domain.attributes
        if attr.varType == orange.Variable.String
    ]

    trainData = dataUtilities.attributeDeselectionData(trainData, attrList)

    # Save the trainData set
    trainData.save("trainData.tab")

    return trainData
Code Example #18
File: getBBRCDesc.py Project: tojojames/AZOrange
def getBBRCDescResult(dataIN,
                      algo="FTM",
                      minSupPar=2,
                      ChisqSig=None,
                      active=None,
                      verbose=0,
                      descList=[]):
    """ delegate to different algorithm methods 
    """
    if not descList:
        descList = []
    outData = None
    if active is not None:
        activeLabel = active
    else:
        if dataIN.domain.classVar:
            activeLabel = dataIN.domain.classVar.values[
                0]  # For BBRC the active class can be any since it will only use the "count"
        else:
            activeLabel = None

    if (algo == "FTM"):  # Using BBRC without class correlation
        BBRCCalc = BBRC(verbose=verbose)
        BBRCCalc.minsup = minSupPar
        BBRCCalc.active = activeLabel
        #Disabling class correlation
        BBRCCalc.DynamicUpperBound = False
        BBRCCalc.ChisqSig = 0.0
        BBRCCalc.Backbone = False

        outData = BBRCCalc.getDesc(dataIN)
    elif (algo == "BBRC"):
        BBRCCalc = BBRC(verbose=verbose)
        BBRCCalc.minsup = minSupPar
        BBRCCalc.active = activeLabel
        if ChisqSig is not None:
            if ChisqSig < 0 or ChisqSig > 1:
                print "ERROR: ChisqSig must be defined between 0 and 1"
                return None
            BBRCCalc.ChisqSig = ChisqSig
        else:
            BBRCCalc.ChisqSig = 0.95
        outData = BBRCCalc.getDesc(dataIN)
    elif (algo == "LAST-PM"):
        outData = getFMinerDescResult(dataIN, minSupPar, algo)
    else:
        print "Algorithm " + str(algo) + " is unknown!"
    if not outData:
        return None
    newAttrs = [
        attr.name for attr in outData.domain if attr.name not in dataIN.domain
    ]
    if descList:
        desAttrs = [attr for attr in newAttrs if attr not in descList]
    else:
        desAttrs = []
    print "BBRC descriptors requested: " + str(len(descList) or "ALL")
    print "BBRC descriptors returned: " + str(len(newAttrs) - len(desAttrs))
    if desAttrs:
        outData = dataUtilities.attributeDeselectionData(outData, desAttrs)
    unknownAttrs = [attr for attr in descList if attr not in outData.domain]
    print "Attributes not found among the structural descriptors: ", len(
        unknownAttrs), " (set to 0.0)"
    outData = dataUtilities.attributeAddData(outData, unknownAttrs,
                                             orange.FloatVariable, 0.0)
    return outData
Code Example #19
if __name__ == "__main__":
    """
    Assumptions:
    Binary classification 
    This main will test the implemented CP methods in a 10 fold CV
    """

    data = dataUtilities.DataTable("HLMSeries2_rdkPhysChemPrepClass.txt")
    attrList = [
        '"Medivir;HLM (XEN025);CLint (uL/min/mg);(Num)"', 'Structure',
        '"MV Number"', "rdk.MolecularFormula"
    ]
    data = dataUtilities.attributeDeselectionData(data, attrList)

    print "Select all attributes"
    descListList = [[]]
    for attr in data.domain.attributes:
        descListList[0].append(attr.name)

    #methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"]   # Non-conformity score method
    methods = ["probPred"]
    cpMethod = "transductive"  # inductive or transductive

    #print "Temp position to save comp time!!"
    # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/
    #import instances
    #measure = instances.MahalanobisConstructor(data)
    measure = None
Code Example #20
File: evalUtilities.py Project: tojojames/AZOrange
    def getExamplesAndSetTrainBias(self, data, testAttrFilter, testFilterVal):
        """
        Collects and returns the examples that match the filterValue at the Attr defined
        The remaining examples (that do not match the filterValue at the Attr defined) are
        placed in the trainBias to be added in all train events.
        """
        self.trainBias = None
        if testAttrFilter is not None and testFilterVal is not None and testAttrFilter in data.domain:
            if type(testFilterVal) != list:
                raise Exception(
                    "Invalid Attr filter value. It must be a list of strings")
            else:
                allDataEx = len(data)
                examples = orange.ExampleTable(data.domain)
                self.trainBias = orange.ExampleTable(data.domain)
                for ex in data:
                    inExamples = False
                    for Vfilter in testFilterVal:
                        if ex[testAttrFilter].value == Vfilter:
                            examples.append(ex)
                            inExamples = True
                            break
                    if not inExamples:
                        self.trainBias.append(ex)

                print "INFO: Variable control validation:"
                print "      Examples in data: " + str(allDataEx)
                print "      Examples selected for validation: " + str(
                    len(examples))
                print "      Examples to be appended to the train set: " + str(
                    len(self.trainBias))
                examples = dataUtilities.attributeDeselectionData(
                    examples, [testAttrFilter])
        elif testAttrFilter is not None and testFilterVal is None and testAttrFilter in data.domain:
            #Enable pre-selected-indices
            self.fixedIdx = orange.LongList()
            allDataEx = len(data)
            examples = orange.ExampleTable(data.domain)
            self.trainBias = orange.ExampleTable(data.domain)
            foldsCounter = {}
            for ex in data:
                value = str(ex[testAttrFilter].value)
                if not miscUtilities.isNumber(value):
                    raise Exception("Invalid fold value:" + str(value) +
                                    ". It must be str convertible to an int.")
                value = int(float(value))
                if value not in foldsCounter:
                    foldsCounter[value] = 1
                else:
                    foldsCounter[value] += 1
                if value != 0:
                    examples.append(ex)
                    self.fixedIdx.append(value - 1)
                else:
                    self.trainBias.append(ex)

            print "INFO: Pre-selected " + str(
                len([f for f in foldsCounter if f != 0])) + " folds for CV:"
            print "      Examples in data: " + str(allDataEx)
            print "      Examples selected for validation: " + str(
                len(examples))
            print "      Examples to be appended to the train set: " + str(
                len(self.trainBias))
            examples = dataUtilities.attributeDeselectionData(
                examples, [testAttrFilter])

        else:
            examples = data

        return examples
Code Example #21
if __name__ == "__main__":
    dataFile = "trainData.txt"
    testDataFile = "testData.txt"
    data = dataUtilities.DataTable(dataFile) 
    testData = dataUtilities.DataTable(testDataFile)

    # These files contain SMILES and ID attributes, which calcMahalanobis assumes are absent, so deselect them.
    attrList = ["SMILES", "ID"]
    data = dataUtilities.attributeDeselectionData(data, attrList)
    testData = dataUtilities.attributeDeselectionData(testData, attrList)

    # Select one ex
    selectionList = []
    for idx in range(len(testData)):
        selectionList.append(0)
    selectionList[0] = 1  # Select first ex
    ex = testData.select(selectionList)

    # One ex in exampleTable
    #MD = calcMahalanobis(data, ex)
    # Multiple ex in exampleTable
    MD = calcMahalanobis(data, testData)
    #print "Returned MD"
    #print MD
Code Example #22
File: SimBoostedQSAR.py Project: girschic/AZOrange
def getSimDescriptors(InReference, InData, methods, active_ids=None, pharmacophore_file=None, callBack=None):
    """ Calculates similarity descriptors for a training set (Orange object) using the
        given similarity methods against the given actives.
        Possible method strings in methods are the names of the sim_* methods below,
        e.g. rdk_topo_fps for sim_rdk_topo_fps.
        The callBack function, if defined, will be called on each step sending the percentage
        done (0-100), e.g. callBack(25). It shall return True or False, indicating to this
        method whether the process is to be continued or not; callBack(25) == False means the
        caller wants to stop the calculation of descriptors.
    """
    # Pre-process input data to standardize the SMILES
    SMILESattr = getSMILESAttr(InData)

    if not SMILESattr:
        return None

    # TODO: Create a method in dataUtilities to standardize the attribute smilesName in place,
    # having the attr origSmiles as ID
    if "AZutilities.extraUtilities" in sys.modules and hasattr(extraUtilities, "StandardizeSMILES"):
        # Call a method for standardizing the SMILES in Data.
        # The method is expected to change the attribute defined as smiAttr in the data object.
        cleanedData = True
        # Process InData
        tmpDomain = orange.Domain([orange.StringVariable("OrigSMI_ID")] + [attr for attr in InData.domain])
        data = orange.ExampleTable(tmpDomain, InData)
        # Fill the OrigSMI_ID
        for ex in data:
            ex["OrigSMI_ID"] = ex[SMILESattr]
        extraUtilities.StandardizeSMILES(data, smiAttr=SMILESattr, cName="OrigSMI_ID")
        # Process input actives
        activesDomain = orange.Domain([orange.StringVariable("OrigSMI_ID"), orange.StringVariable("SMILES")], 0)
        activesData = orange.ExampleTable(activesDomain)
        for act in InReference:
            activesData.append([act, act])
        extraUtilities.StandardizeSMILES(activesData, smiAttr="SMILES", cName="OrigSMI_ID")
        #print activesData.domain
        actives = []
        for ex in activesData:
            actives.append(str(ex["SMILES"].value))
    else:
        data = InData
        print "NO cleaning"
        actives = InReference
        cleanedData = False

    # Adjust the header
    atts = []
    for m in methods:
        count = 1
        for a in actives:
            attname = m + '(reference_' + str(count) + ')'
            #print "ATT: " + str(attname)
            #print "M: " + str(m)
            atts.append(orange.FloatVariable(attname))
            count += 1

    newdomain = orange.Domain(data.domain.attributes + atts, data.domain.classVar)
    newdata = orange.ExampleTable(newdomain, data)

    att_idx = 0
    # If callBack is defined, it will be called with the percentage done, i.e. 0-100
    if active_ids:
        nTotalSteps = len(newdata) * ((len(methods) - 1) * len(actives) + len(active_ids))
    else:
        nTotalSteps = len(methods) * len(actives) * len(newdata)
    stepsDone = 0

    # Fill up the data
    for m in methods:
        if m == 'rdk_topo_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_topo_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate topo fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_MACCS_keys':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_MACCS_keys(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate MACCS key"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_morgan_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_morgan_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate morgan fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_morgan_features_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_morgan_features_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate morgan features fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'rdk_atompair_fps':
            for a in actives:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    val = 0.0
                    try:
                        val = orng_sim_rdk_atompair_fps(a, instance)
                    except RuntimeError:
                        print str(a) + " and " + str(instance) + " - unable to calculate atompair fp"
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], val)
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1
        elif m == 'azo_pharmacophore_fps':
            for a in active_ids:
                for j in range(len(newdata)):
                    instance = newdata[j]
                    instance[atts[att_idx]] = orange.Value(atts[att_idx], azo_pharmacophore_az_inhouse(a, instance, pharmacophore_file))
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1

    if cleanedData:
        # Remove the fixed SMILES and revert to the original SMILES
        newdata = dataUtilities.attributeDeselectionData(newdata, [SMILESattr])
        newdata.domain["OrigSMI_ID"].name = SMILESattr
    return newdata
Code Example #23
File: evalUtilities.py Project: AZCompTox/AZOrange
    def getExamplesAndSetTrainBias(self, data, testAttrFilter, testFilterVal):
        """
        Collects and returns the examples that match the filterValue at the Attr defined
        The remaining examples (that do not match the filterValue at the Attr defined) are
        placed in the trainBias to be added in all train events.
        """
        self.trainBias = None
        if testAttrFilter is not None and  testFilterVal is not None and testAttrFilter in data.domain:
            if type(testFilterVal) != list:
                raise Exception("Invalid Attr filter value. It must be a list of strings")
            else:
                allDataEx = len(data)
                examples = orange.ExampleTable(data.domain)
                self.trainBias = orange.ExampleTable(data.domain)
                for ex in data:
                    inExamples = False
                    for Vfilter in testFilterVal:
                        if ex[testAttrFilter].value == Vfilter:
                            examples.append(ex)
                            inExamples = True
                            break
                    if not inExamples:
                        self.trainBias.append(ex)

                print "INFO: Variable control validation:"
                print "      Examples in data: "+str(allDataEx)
                print "      Examples selected for validation: "+str(len(examples))
                print "      Examples to be appended to the train set: "+str(len(self.trainBias))
                examples = dataUtilities.attributeDeselectionData(examples, [testAttrFilter])
        elif testAttrFilter is not None and testFilterVal is None and testAttrFilter in data.domain:
            #Enable pre-selected-indices
            self.fixedIdx = orange.LongList()
            allDataEx = len(data)
            examples = orange.ExampleTable(data.domain)
            self.trainBias = orange.ExampleTable(data.domain)
            foldsCounter = {}
            for ex in data:
                value = str(ex[testAttrFilter].value)
                if not miscUtilities.isNumber(value):
                    raise Exception("Invalid fold value:"+str(value)+". It must be str convertible to an int.")
                value = int(float(value))
                if value not in foldsCounter:
                    foldsCounter[value] = 1
                else:
                    foldsCounter[value] += 1
                if value != 0:
                    examples.append(ex)
                    self.fixedIdx.append(value - 1)
                else:
                    self.trainBias.append(ex)

            print "INFO: Pre-selected "+str(len([f for f in foldsCounter if f != 0]))+" folds for CV:"
            print "      Examples in data: "+str(allDataEx)
            print "      Examples selected for validation: "+str(len(examples))
            print "      Examples to be appended to the train set: "+str(len(self.trainBias))
            examples = dataUtilities.attributeDeselectionData(examples, [testAttrFilter])

        else:
            examples = data

        return examples
Code Example #24
    def getAcc(self, callBack=None, callBackWithFoldModel=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For the EvalResults not supported for a specific learner/datase, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            It some error occurred, the respective values in the Dict will be None
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        # Set the response type
        self.responseType = self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        self.__log("  " + str(self.responseType))

        #Create the Train and test sets
        if self.usePreDefFolds:
            DataIdxs = self.preDefIndices
        else:
            DataIdxs = self.sampler(self.data, self.nExtFolds)
        foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0]  # Folds used only from 1 on; 0 is for the fixed train bias
        nFolds = len(foldsN)
        #Fix the Indexes based on DataIdxs
        # (0s) represents the train set  ( >= 1s) represents the test set folds
        if self.useVarCtrlCV:
            nShifted = [0] * nFolds
            for idx, isTest in enumerate(self.preDefIndices):  # self.preDefIndices == 0 are to be used in TrainBias
                if not isTest:
                    if DataIdxs[idx]:
                        nShifted[DataIdxs[idx]] += 1
                        DataIdxs[idx] = 0
            for idx, shift in enumerate(nShifted):
                self.__log("In fold " + str(idx) + ", " + str(shift) +
                           " examples were shifted to the train set.")

        #Vars for saving each fold's results
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        #Check data in advance so that it will not, by chance, fail at the last fold!
        for foldN in foldsN:
            trainData = self.data.select(DataIdxs, foldN, negate=1)
            self.__checkTrainData(trainData)

        #Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds
        for ml in sortedML:
            startTime = time.time()
            self.__log("    > " + str(ml) + "...")
            try:
                #Vars for saving each fold's results
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []
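                # optAcc accumulates the inner-CV accuracy (CA or R2) per fold,
                # e.g. for weighting the methods in a consensus model later.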
                logTxt = ""
                for foldN in foldsN:
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs, foldN, negate=1)
                    testData = self.data.select(DataIdxs, foldN)
                    smilesAttr = dataUtilities.getSMILESAttr(trainData)
                    if smilesAttr:
                        self.__log("Found SMILES attribute:" + smilesAttr)
                        if MLmethods[ml].specialType == 1:
                            trainData = dataUtilities.attributeSelectionData(
                                trainData,
                                [smilesAttr, trainData.domain.classVar.name])
                            testData = dataUtilities.attributeSelectionData(
                                testData,
                                [smilesAttr, testData.domain.classVar.name])
                            self.__log(
                                "Selected attrs: " +
                                str([attr.name for attr in trainData.domain]))
                        else:
                            trainData = dataUtilities.attributeDeselectionData(
                                trainData, [smilesAttr])
                            testData = dataUtilities.attributeDeselectionData(
                                testData, [smilesAttr])
                            self.__log("Selected attrs: " + str(
                                [attr.name for attr in trainData.domain[0:3]] +
                                ["..."] + [
                                    attr.name for attr in trainData.
                                    domain[len(trainData.domain) - 3:]
                                ]))

                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    #Test if train sets inside the optimizer will respect the dataSize criteria.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (
                            len(trainData) *
                        (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs,
                                                        1,
                                                        negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    SpecialModel = None
                    if dontOptimize:
                        logTxt += "       Fold " + str(
                            foldN
                        ) + ": Too few compounds to optimize model hyper-parameters\n"
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = evalUtilities.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                stratified=orange.MakeRandomIndices.
                                StratifiedIfPossible,
                                random_generator=random.randint(0, 100))
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = evalUtilities.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                stratified=orange.MakeRandomIndices.
                                StratifiedIfPossible,
                                random_generator=random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        if MLmethods[ml].specialType == 1:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optInfo, SpecialModel = MLmethods[
                                    ml].optimizePars(trainData, folds=5)
                                optAcc[ml].append(optInfo["Acc"])
                            else:
                                res = evalUtilities.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    stratified=orange.MakeRandomIndices.
                                    StratifiedIfPossible,
                                    random_generator=random.randint(0, 100))
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)
                        else:
                            runPath = miscUtilities.createScratchDir(
                                baseDir=AZOC.NFS_SCRATCHDIR,
                                desc="AccWOptParam",
                                seed=id(trainData))
                            trainData.save(
                                os.path.join(runPath, "trainData.tab"))
                            tunedPars = paramOptUtilities.getOptParam(
                                learner=MLmethods[ml],
                                trainDataFile=os.path.join(
                                    runPath, "trainData.tab"),
                                paramList=self.paramList,
                                useGrid=False,
                                verbose=self.verbose,
                                queueType=self.queueType,
                                runPath=runPath,
                                nExtFolds=None,
                                nFolds=self.nInnerFolds,
                                logFile=self.logFile,
                                getTunedPars=True,
                                fixedParams=self.fixedParams)
                            if not MLmethods[ml] or not MLmethods[ml].optimized:
                                self.__log(
                                    "       WARNING: GETACCWOPTPARAM: The learner "
                                    + str(ml) + " was not optimized.")
                                self.__log(
                                    "                It will be ignored")
                                #self.__log("                It will be set to default parameters")
                                self.__log(
                                    "                    DEBUG can be done in: "
                                    + runPath)
                                #Set learner back to default
                                #MLmethods[ml] = MLmethods[ml].__class__()
                                raise Exception("The learner " + str(ml) +
                                                " was not optimized.")
                            else:
                                if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                    optAcc[ml].append(tunedPars[0])
                                else:
                                    res = evalUtilities.crossValidation(
                                        [MLmethods[ml]],
                                        trainData,
                                        folds=5,
                                        stratified=orange.MakeRandomIndices.
                                        StratifiedIfPossible,
                                        random_generator=random.randint(
                                            0, 100))
                                    R2 = evalUtilities.R2(res)[0]
                                    optAcc[ml].append(R2)

                                miscUtilities.removeDir(runPath)
                    #Train the model
                    if SpecialModel is not None:
                        model = SpecialModel
                    else:
                        model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    #Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (evalUtilities.getClassificationAccuracy(
                                testData, model),
                             evalUtilities.getConfMat(testData, model)))
                    else:
                        local_exp_pred = []
                        # Predict using bulk-predict
                        predictions = model(testData)
                        # Gather predictions
                        for n, ex in enumerate(testData):
                            local_exp_pred.append(
                                (ex.getclass().value, predictions[n].value))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred),
                             evalUtilities.calcRsqrt(local_exp_pred)))
                        #Save the experimental value and corresponding predicted value
                        exp_pred[ml] += local_exp_pred
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                    if callBackWithFoldModel:
                        callBackWithFoldModel(model)

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    labels=hasattr(self.data.domain.classVar, "values")
                    and list(self.data.domain.classVar.values) or None)
                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                res["runningTime"] = time.time() - startTime
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                self.__log("       Learner " + str(ml) +
                           " failed to create/optimize the model!")
                error = str(sys.exc_info()[0]) +" "+\
                            str(sys.exc_info()[1]) +" "+\
                            str(traceback.extract_tb(sys.exc_info()[2]))
                self.__log(error)

                res = self.createStatObj()
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models
            #   ONLY if there is more than one stable model!
            #   When only one or no stable models, build a consensus based on all models
            # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName][
                        "stable"]:
                    consensusMLs[modelName] = copy.deepcopy(
                        statistics[modelName])

            self.__log("Found " + str(len(consensusMLs)) +
                       " stable MLmethods out of " + str(len(statistics)) +
                       " MLmethods.")

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(
                        statistics[modelName])

            # Exclude specialType models
            excludeThis = []
            for learnerName in consensusMLs:
                if models[learnerName][0].specialType > 0:
                    excludeThis.append(learnerName)
            for learnerName in excludeThis:
                consensusMLs.pop(learnerName)
                self.__log("    > Excluded special model " + learnerName)
            self.__log("    > Stable modules: " + str(consensusMLs.keys()))

            if len(consensusMLs) >= 2:
                #Var for saving each Fold's result
                startTime = time.time()
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on "
                    + str([ml for ml in consensusMLs]))
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        # exprTest0
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(
                                optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        # exprTest1
                        exprTest1 = "(0"
                        for ml in consensusMLs:
                            exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str(
                                optAcc[ml][foldN]) + " "
                        exprTest1 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest1 += ", " + ml + " == " + CLASS1 + " "
                        exprTest1 += "]),1)"
                        # Expression
                        expression = [
                            exprTest0 + " >= " + exprTest1 + " -> " + CLASS0,
                            " -> " + CLASS1
                        ]
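                        # Illustrative (hypothetical) values: for two learners named RF and PLS
                        # with optAcc 0.8 / 0.7 and classes POS / NEG, exprTest0 evaluates to
                        #   "(0+( RF == POS )*0.8 +( PLS == POS )*0.7 )/IF0(sum([False, RF == POS , PLS == POS ]),1)"
                        # i.e. the summed optAcc of the models voting POS divided by how many
                        # voted POS (IF0 guards the division by zero); POS is predicted whenever
                        # its score is >= the corresponding score for NEG.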
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(
                                optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(
                        DataIdxs, foldN + 1)  # fold 0 is for the train Bias!!
                    smilesAttr = dataUtilities.getSMILESAttr(testData)
                    if smilesAttr:
                        self.__log("Found SMILES attribute:" + smilesAttr)
                        testData = dataUtilities.attributeDeselectionData(
                            testData, [smilesAttr])
                        self.__log("Selected attrs: " + str(
                            [attr.name
                             for attr in trainData.domain[0:3]] + ["..."] + [
                                 attr.name for attr in
                                 trainData.domain[len(trainData.domain) - 3:]
                             ]))

                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[
                            learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(
                        classifiers=consensusClassifiers,
                        expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    #Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (evalUtilities.getClassificationAccuracy(
                                testData, model),
                             evalUtilities.getConfMat(testData, model)))
                    else:
                        local_exp_pred = []
                        # Predict using bulk-predict
                        predictions = model(testData)
                        # Gather predictions
                        for n, ex in enumerate(testData):
                            local_exp_pred.append(
                                (ex.getclass().value, predictions[n].value))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred),
                             evalUtilities.calcRsqrt(local_exp_pred)))
                        #Save the experimental value and corresponding predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(
                    Cresults,
                    Cexp_pred,
                    CnTrainEx,
                    CnTestEx,
                    self.responseType,
                    self.nExtFolds,
                    labels=hasattr(self.data.domain.classVar, "values")
                    and list(self.data.domain.classVar.values) or None)
                res["runningTime"] = time.time() - startTime
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"][
                    "IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
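
A minimal usage sketch for the getter above. Hedged: the class name UnbiasedAccuracyGetter is inferred from its own log string, the learner constructors and the DataTable loader are assumed to exist as in the surrounding AZOrange codebase, and the exact constructor keywords may differ.

# Hypothetical driver (Python 2, as in the codebase); every name below is an assumption.
from trainingMethods import AZorngRF, AZorngPLS

data = dataUtilities.DataTable("trainData.tab")  # descriptor table with a class attribute
learners = {"RF": AZorngRF.RFLearner(), "PLS": AZorngPLS.PLSLearner()}
getter = UnbiasedAccuracyGetter(data=data, learner=learners, nExtFolds=5, nInnerFolds=5)
statistics = getter.getAcc()
# With a dict of learners, getAcc returns one statistics dict per learner plus a
# "Consensus" entry, each holding e.g. "CA"/"CM" (classification) or "RMSE"/"Q2" (regression).
if statistics:
    print statistics["Consensus"]["Q2"]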
Code example #30
    def getAcc(self, callBack = None, callBackWithFoldModel = None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For EvalResults not supported by a specific learner/dataset, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            If some error occurred, the respective values in the Dict will be None
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        # Set the response type
        self.responseType =  self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification"  or "Regression"
        self.__log("  "+str(self.responseType))

        #Create the Train and test sets
        if self.usePreDefFolds:
            DataIdxs = self.preDefIndices 
        else:
            DataIdxs = self.sampler(self.data, self.nExtFolds) 
        foldsN = [f for f in dict.fromkeys(DataIdxs) if f != 0] #Folds used only from 1 on ... 0 are for fixed train Bias
        nFolds = len(foldsN)
        #Fix the Indexes based on DataIdxs
        # (0s) represents the train set  ( >= 1s) represents the test set folds
        if self.useVarCtrlCV:
            nShifted = [0] * nFolds
            for idx,isTest in enumerate(self.preDefIndices):  # self.preDefIndices == 0 are to be used in TrainBias
                if not isTest:
                    if DataIdxs[idx]:
                        nShifted[DataIdxs[idx]] += 1
                        DataIdxs[idx] = 0
            for idx,shift in enumerate(nShifted):
                self.__log("In fold "+str(idx)+", "+str(shift)+" examples were shifted to the train set.")

        #Var for saving each Fold's result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}
        
        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models={}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  "+str([x for x in MLmethods]))

        #Check data in advance so that it does not, by chance, fail at the last fold!
        for foldN in foldsN:
            trainData = self.data.select(DataIdxs,foldN,negate=1)
            self.__checkTrainData(trainData)

        #Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0,"PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds  
        for ml in sortedML:
          startTime = time.time()
          self.__log("    > "+str(ml)+"...")
          try:
            #Var for saving each Fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            nTrainEx[ml] = []
            nTestEx[ml] = []
            optAcc[ml] = []
            logTxt = ""
            for foldN in foldsN:
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs,foldN,negate=1)
                testData = self.data.select(DataIdxs,foldN)
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    self.__log("Found SMILES attribute:"+smilesAttr)
                    if MLmethods[ml].specialType == 1:
                       trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name]) 
                       testData = dataUtilities.attributeSelectionData(testData, [smilesAttr, testData.domain.classVar.name]) 
                       self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain]))
                    else:
                       trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr]) 
                       testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr]) 
                       self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                nTrainEx[ml].append(len(trainData))
                nTestEx[ml].append(len(testData))
                #Test if trainsets inside optimizer will respect dataSize criteria.
                #  if not, don't optimize, but still train the model
                dontOptimize = False
                if self.responseType != "Classification" and (len(trainData)*(1-1.0/self.nInnerFolds) < 20):
                    dontOptimize = True
                else:                      
                    tmpDataIdxs = self.sampler(trainData, self.nInnerFolds)
                    tmpTrainData = trainData.select(tmpDataIdxs,1,negate=1)
                    if not self.__checkTrainData(tmpTrainData, False):
                        dontOptimize = True

                SpecialModel = None
                if dontOptimize:
                    logTxt += "       Fold "+str(foldN)+": Too few compounds to optimize model hyper-parameters\n"
                    self.__log(logTxt)
                    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                        CA = evalUtilities.CA(res)[0]
                        optAcc[ml].append(CA)
                    else:
                        res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                        R2 = evalUtilities.R2(res)[0]
                        optAcc[ml].append(R2)
                else:
                    if MLmethods[ml].specialType == 1:
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            optInfo, SpecialModel = MLmethods[ml].optimizePars(trainData, folds = 5)
                            optAcc[ml].append(optInfo["Acc"])
                        else:
                            res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam", seed = id(trainData))
                        trainData.save(os.path.join(runPath,"trainData.tab"))
                        tunedPars = paramOptUtilities.getOptParam(
                            learner = MLmethods[ml],
                            trainDataFile = os.path.join(runPath,"trainData.tab"),
                            paramList = self.paramList,
                            useGrid = False,
                            verbose = self.verbose,
                            queueType = self.queueType,
                            runPath = runPath,
                            nExtFolds = None,
                            nFolds = self.nInnerFolds,
                            logFile = self.logFile,
                            getTunedPars = True,
                            fixedParams = self.fixedParams)
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log("       WARNING: GETACCWOPTPARAM: The learner "+str(ml)+" was not optimized.")
                            self.__log("                It will be ignored")
                            #self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: "+runPath)
                            #Set learner back to default
                            #MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner "+str(ml)+" was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = evalUtilities.crossValidation([MLmethods[ml]], trainData, folds=5, stratified=orange.MakeRandomIndices.StratifiedIfPossible, random_generator = random.randint(0, 100))
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                #Train the model
                if SpecialModel is not None:
                    model = SpecialModel 
                else:
                    model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if self.responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    # Predict using bulk-predict
                    predictions = model(testData)
                    # Gather predictions
                    for n,ex in enumerate(testData):
                        local_exp_pred.append((ex.getclass().value, predictions[n].value))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and corresponding predicted value
                    exp_pred[ml] += local_exp_pred
                if callBack:
                     stepsDone += 1
                     if not callBack((100*stepsDone)/nTotalSteps): return None
                if callBackWithFoldModel:
                    callBackWithFoldModel(model) 

            res = self.createStatObj(results[ml], exp_pred[ml], nTrainEx[ml], nTestEx[ml],self.responseType, self.nExtFolds, logTxt, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None )
            if self.verbose > 0: 
                print "UnbiasedAccuracyGetter!Results  "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            res["runningTime"] = time.time() - startTime
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)
            self.__log("       OK")
          except:
            self.__log("       Learner "+str(ml)+" failed to create/optimize the model!")
            error = str(sys.exc_info()[0]) +" "+\
                        str(sys.exc_info()[1]) +" "+\
                        str(traceback.extract_tb(sys.exc_info()[2]))
            self.__log(error)
 
            res = self.createStatObj()
            statistics[ml] = copy.deepcopy(res)
            self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models 
            #   ONLY if there is more than one stable model!
            #   When only one or no stable models, build a consensus based on all models
            # ALWAYS exclude specialType models (MLmethods[ml].specialType > 0)
            consensusMLs={}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log("Found "+str(len(consensusMLs))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")

            if len(consensusMLs) <= 1:   # we need more models to build a consensus!
                consensusMLs={}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            # Exclude specialType models 
            excludeThis = []
            for learnerName in consensusMLs:
                if models[learnerName][0].specialType > 0:
                    excludeThis.append(learnerName)
            for learnerName in excludeThis:
                consensusMLs.pop(learnerName)
                self.__log("    > Excluded special model " + learnerName)
            self.__log("    > Stable modules: " + str(consensusMLs.keys()))

            if len(consensusMLs) >= 2:
                #Var for saving each Fold's result
                startTime = time.time()
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log("Calculating the statistics for a Consensus model based on "+str([ml for ml in consensusMLs]))
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        # exprTest0
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(optAcc[ml][foldN])+" "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", "+ml+" == "+CLASS0+" "
                        exprTest0 += "]),1)"
                        # exprTest1
                        exprTest1 = "(0"
                        for ml in consensusMLs:
                            exprTest1 += "+( "+ml+" == "+CLASS1+" )*"+str(optAcc[ml][foldN])+" "
                        exprTest1 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest1 += ", "+ml+" == "+CLASS1+" "
                        exprTest1 += "]),1)"
                        # Expression
                        expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / "+str(Q2sum)+") * (0"
                        for ml in consensusMLs:
                            expression += " + "+str(optAcc[ml][foldN])+" * "+ml+" "
                        expression += ")"

                    testData = self.data.select(DataIdxs,foldN+1)  # fold 0 is for the train Bias!!
                    smilesAttr = dataUtilities.getSMILESAttr(testData)
                    if smilesAttr:
                        self.__log("Found SMILES attribute:"+smilesAttr)
                        testData = dataUtilities.attributeDeselectionData(testData, [smilesAttr])
                        self.__log("Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] + [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)     
                    CnTrainEx.append(model.NTrainEx)
                    #Test the model
                    if self.responseType == "Classification":
                        Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                    else:
                        local_exp_pred = []
                        # Predict using bulk-predict
                        predictions = model(testData)
                        # Gather predictions
                        for n,ex in enumerate(testData):
                            local_exp_pred.append((ex.getclass().value, predictions[n].value))
                        Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and corresponding predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds, labels = hasattr(self.data.domain.classVar,"values") and list(self.data.domain.classVar.values) or None )
                res["runningTime"] = time.time() - startTime
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics
                 
        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
Code example #31
    def __init__(self, **kwds):
        self.verbose = 0
        self.logFile = None
        self.resultsFile = None
        self.nExtFolds = 5
        self.nInnerFolds = 5
        self.data = None
        self.learner = None
        self.paramList = None
        self.queueType = "NoSGE"
        self.responseType = None
        self.fixedParams = {} 
        self.testAttrFilter = None
        self.testFilterVal = None
        self.sampler = dataUtilities.SeedDataSampler
        # Append arguments to the __dict__ member variable 
        self.__dict__.update(kwds)
        self.learnerName = ""

        self.preDefIndices = orange.LongList()
        self.usePreDefFolds = False 
        self.useVarCtrlCV = False
        if self.testAttrFilter and self.testAttrFilter in self.data.domain:
            if self.testFilterVal and type(self.testFilterVal) == list and type(self.testAttrFilter) == str:
                self.useVarCtrlCV = True
                self.usePreDefFolds = False
                for ex in self.data:
                    if ex[self.testAttrFilter].value in self.testFilterVal: # Compound selected to be allowed in the test set
                        self.preDefIndices.append(1)
                    else:                                                 # Compound not to be included in the test set; always shifted to the train set
                        self.preDefIndices.append(0)
            elif self.testFilterVal is None:
                self.usePreDefFolds = True
                self.useVarCtrlCV = False
                #Enable pre-selected-indices  ( index 0 will be set for train Bias)
                foldsCounter = {}
                for ex in self.data:
                    value = str(ex[self.testAttrFilter].value)
                    if not miscUtilities.isNumber(value):
                        self.__log("Invalid fold value:"+str(value)+". It must be a string convertible to an int.")
                        return  # __init__ must return None; abort after logging the error
                    value = int(float(value))
                    if value not in foldsCounter:
                        foldsCounter[value] = 1
                    else:
                        foldsCounter[value] += 1
                    self.preDefIndices.append(value)

                self.__log( "INFO: Pre-selected "+str(len([f for f in foldsCounter.keys() if f != 0]))+" folds for CV:")
                self.__log( "      Examples in data: "+str(sum(foldsCounter.values())))
                self.__log( "      Examples selected for validation: "+str(sum([foldsCounter[f] for f in foldsCounter if f != 0])))
                self.__log( "      Examples to be appended to the train set: "+ str(0 in foldsCounter.keys() and foldsCounter[0] or 0))
            else:
                self.__log("ERROR: Attribute Filter Ctrl was selected, but attribute is not in expected format: " + str(self.testAttrFilter))
                return  # __init__ must return None; abort after logging the error
            self.data = dataUtilities.attributeDeselectionData(self.data, [self.testAttrFilter]) 
        else:
            self.usePreDefFolds = False
            self.useVarCtrlCV = False
            self.testAttrFilter = None
            self.testFilterVal = None
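
The constructor above supports two cross-validation modes through testAttrFilter / testFilterVal. A short sketch of both, under the same assumptions as the earlier driver; the attribute names "Set" and "Fold" are illustrative only.

# Mode 1: variability-controlled CV -- only examples whose "Set" value is listed in
# testFilterVal may remain in a test fold; all other examples are shifted to the train set.
getter = UnbiasedAccuracyGetter(data=data, learner=learners,
                                testAttrFilter="Set", testFilterVal=["test"])

# Mode 2: pre-defined folds -- the "Fold" attribute holds one integer per example;
# 0 reserves the example for the fixed train bias, values >= 1 name the external fold.
getter = UnbiasedAccuracyGetter(data=data, learner=learners,
                                testAttrFilter="Fold", testFilterVal=None)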
Code example #32
    def getProbabilitiesAsAttribute(self, algorithm=None, minsup=None, atts=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For EvalResults not supported by a specific learner/dataset, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            If some error occurred, the respective values in the Dict will be None
                
            parameters:
                algorithm - key for the structural feature generation algorithm (data-set-dependent structural features that have to be calculated inside the cross-validation)
                minsup - minimum support for the algorithm
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: " + str(algorithm))
            self.__log(" Minimum support parameter: " + str(minsup))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Var for saving each Fold's result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check data in advance so that it does not, by chance, fail at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order Learners so that PLS is the first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Var for saving each Fold's result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []

                ### mods TG
                # Build a new domain that carries the predicted class probability as an extra attribute
                prediction_attribute = orange.FloatVariable("class_prob")
                domain = orange.Domain(list(self.data.domain.attributes) + [prediction_attribute], self.data.domain.classVar)
                data_new = orange.ExampleTable(domain)

                logTxt = ""
                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup)
                        trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)

                    testData = self.data.select(DataIdxs[foldN])
                    # print "IDX: ",
                    # print DataIdxs[foldN]
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        cut_off = orig_len - len(atts)
                        smarts = trainData.domain.attributes[cut_off:]
                        self.__log("  Number of structural features added: " + str(len(smarts)))
                        testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                        testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)

                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Test if trainsets inside optimizer will respect dataSize criteria.
                    #  if not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)

                    # TODO (mods TG): store the prediction probabilities here (e.g. in data_new)

                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and corresponding predicted value
                        exp_pred[ml] += local_exp_pred

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj()
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models
            #   ONLY if there is more than one stable model!
            #   When only one or no stable models, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Var for saving each Fold's result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
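                        # NOTE: this string replace assumes CLASS0 never occurs inside CLASS1,
                        # a learner name, or a formatted weight; otherwise exprTest1 is corrupted.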
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and corresponding predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
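
A hedged invocation sketch for getProbabilitiesAsAttribute above. The algorithm key, minimum-support value, and attribute names are hypothetical; getStructuralDesc must actually support the chosen key.

# Hypothetical call: regenerate structural fragments inside every external CV fold,
# then drop the SMILES/meta columns before learning. All argument values are illustrative.
statistics = getter.getProbabilitiesAsAttribute(algorithm="FTM", minsup=0.2,
                                                atts=["SMILES_1", "Molecule_ID"])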
Code example #33
File: getAccWOptParam.py Project: girschic/AZOrange
    def getAcc(self, algorithm = None, minsup = None, atts = None):
        """ For regression problems, it returns the RMSE and the R2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"R2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For EvalResults not supported by a specific learner/dataset, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            If some error occurred, the respective values in the Dict will be None

        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None
        
        if algorithm:
            self.__log(" Additional structural features to be calculated inside of cross-validation")
            self.__log(" Algorithm for structural features: "+str(algorithm))
            self.__log(" Minimum support parameter: "+str(minsup))
        
        # Set the response type
        responseType =  self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification"  or "Regression"
        self.__log("  "+str(responseType))

        #Create the Train and test sets
        DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds) 
        
        #Var for saving each Fold's result
        results = {}
        exp_pred = {}
        
        #Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models={}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  "+str([x for x in MLmethods]))
        for ml in MLmethods:
          self.__log("    > "+str(ml)+"...")
          try:
            #Var for saving each Fold's result
            results[ml] = []
            exp_pred[ml] = []
            models[ml] = []
            for foldN in range(self.nExtFolds):
                if type(self.learner) == dict:
                    self.paramList = None

                trainData = self.data.select(DataIdxs[foldN],negate=1)
                orig_len = len(trainData.domain.attributes)

                if algorithm:
                    # add structural descriptors to the training data (TG)
                    trainData_structDesc = getStructuralDesc.getStructuralDescResult(trainData, algorithm, minsup)
                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)

                runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "AccWOptParam")
                trainData.save(os.path.join(runPath,"trainData.tab"))

                testData = self.data.select(DataIdxs[foldN])
                if algorithm:
                    # calculate the feature values for the test data (TG)
                    cut_off = orig_len - len(atts)
                    smarts = trainData.domain.attributes[cut_off:]
                    self.__log("  Number of structural features added: "+str(len(smarts)))
                    testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)

                paramOptUtilities.getOptParam(
                    learner = MLmethods[ml], 
                    trainDataFile = os.path.join(runPath,"trainData.tab"), 
                    paramList = self.paramList, 
                    useGrid = False, 
                    verbose = self.verbose, 
                    queueType = self.queueType, 
                    runPath = runPath, 
                    nExtFolds = None, 
                    nFolds = self.nInnerFolds
                    )
                if not MLmethods[ml].optimized:
                    self.__log("       The learner "+str(ml)+" was not optimized.")
                    raise Exception("The learner "+str(ml)+" was not optimized.")
                miscUtilities.removeDir(runPath) 
		
                #Train the model
                model = MLmethods[ml](trainData)
                models[ml].append(model)
                #Test the model
                if responseType == "Classification":
                    results[ml].append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                else:
                    local_exp_pred = []
                    for ex in testData:
                        local_exp_pred.append((ex.getclass(), model(ex)))
                    results[ml].append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                    #Save the experimental value and correspondent predicted value
                    exp_pred[ml] += local_exp_pred
            res = self.createStatObj(results[ml], exp_pred[ml], responseType, self.nExtFolds)
            if self.verbose > 0: 
                print "AccWOptParamGetter!Results  "+ml+":\n"
                pprint(res)
            if not res:
                raise Exception("No results available!")
            statistics[ml] = res.copy()
            self.__writeResults(res)
            self.__log("       OK")
          except:
            self.__log("       Learner "+str(ml)+" failed to optimize!")
            res = self.createStatObj()
            statistics[ml] = res.copy()

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            #We still need to build a consensus model out of the stable models 
            #   ONLY if more than one model is stable!
            stableML={}
            for modelName in statistics:
                if statistics[modelName]["StabilityValue"] < AZOC.QSARSTABILITYTHRESHOLD:   # Select only stable models
                    stableML[modelName] = statistics[modelName].copy()
            if len(stableML) >= 2:
                self.__log("Found "+str(len(stableML))+" stable MLmethods out of "+str(len(statistics))+" MLmethods.")
                if responseType == "Classification":
                    CLASS0 = str(self.data.domain.classVar.values[0])
                    CLASS1 = str(self.data.domain.classVar.values[1])
                    exprTest0 = "(0"
                    for ml in stableML:
                        exprTest0 += "+( "+ml+" == "+CLASS0+" )*"+str(stableML[ml]["CA"])+" "
                    exprTest0 += ")/IF0(sum([False"
                    for ml in stableML:
                        exprTest0 += ", "+ml+" == "+CLASS0+" "
                    exprTest0 += "]),1)"
                    exprTest1 = exprTest0.replace(CLASS0,CLASS1)
                    expression = [exprTest0+" >= "+exprTest1+" -> "+CLASS0," -> "+CLASS1]
                else:
                    R2sum = sum([stableML[ml]["R2"] for ml in stableML])
                    expression = "(1 / "+str(R2sum)+") * (0"
                    for ml in stableML:
                        expression += " + "+str(stableML[ml]["R2"])+" * "+ml+" "
                    expression += ")"

                #Var for saving each fold's result
                Cresults = []
                Cexp_pred = []
                self.__log("Calculating the statistics for a Consensus model")
                for foldN in range(self.nExtFolds):
                    testData = self.data.select(DataIdxs[foldN])
                    consensusClassifiers = {}
                    for learnerName in stableML:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers = consensusClassifiers, expression = expression)     
                    #Test the model
                    if responseType == "Classification":
                        Cresults.append((evalUtilities.getClassificationAccuracy(testData, model), evalUtilities.getConfMat(testData, model) ) )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append((evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred) ) )
                        #Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, responseType, self.nExtFolds)
                statistics["Consensus"] = res.copy()
                statistics["Consensus"]["IndividualStatistics"] = stableML.copy()
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics
                 
        #By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
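The statistics dict returned by getAcc above is keyed by learner name, with a "Consensus" entry added when enough stable models exist. A minimal consumer sketch, assuming each entry follows the shape documented in the docstring ({"RMSE":..., "R2":..., "CA":..., "CM":...}, with unsupported metrics set to None):

def summarizeStatistics(statistics):
    # Walk the per-learner results; keys follow the getAcc docstring above
    for mlName in statistics:
        res = statistics[mlName]
        if res is None:
            print mlName, ": no results (learner failed)"
        elif res.get("CA") is not None:
            print mlName, " CA =", res["CA"], " CM =", res["CM"]      # classification
        else:
            print mlName, " RMSE =", res["RMSE"], " R2 =", res["R2"]  # regression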
コード例 #34
0
def buildModel(trainData, MLMethod, queueType = "NoSGE", verbose = 0, logFile = None):
        """
        Build the method passed in MLMethod and optimize it ("IndividualStatistics" not in MLMethod).
        If MLMethod is a Consensus ("IndividualStatistics" in MLMethod), first build and optimize all individual models, then build the consensus!
        """
        log(logFile, "Building and optimizing learner: "+MLMethod["MLMethod"]+"...")
        learners = {}
        MLMethods = {}
        if "IndividualStatistics"  in MLMethod:         #It is a consensus and will certaily not contain any 
                                                        #special model as it was filtered in the getUnbiasedAcc
            for ML in MLMethod["IndividualStatistics"]:
                MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML])
        else:
            ML = MLMethod["MLMethod"]
            if MLMETHODS[ML](name = ML).specialType == 1:  # If it is a special model with a built-in optimizer
                log(logFile, "       This is a special model")
                smilesAttr = dataUtilities.getSMILESAttr(trainData)
                if smilesAttr:
                    log(logFile,"Found SMILES attribute:"+smilesAttr)
                    trainData = dataUtilities.attributeSelectionData(trainData, [smilesAttr, trainData.domain.classVar.name])
                optInfo, SpecialModel = MLMETHODS[ML](name = ML).optimizePars(trainData, folds = 5)
                return SpecialModel
            else:
                MLMethods[MLMethod["MLMethod"]] = MLMethod

        smilesAttr = dataUtilities.getSMILESAttr(trainData)
        if smilesAttr:
            trainData = dataUtilities.attributeDeselectionData(trainData, [smilesAttr])

        # optimize all MLMethods
        for ML in MLMethods:
            log(logFile, "  Optimizing MLmethod: "+ML)
            learners[ML] = MLMETHODS[ML](name = ML)

            runPath = miscUtilities.createScratchDir(baseDir = AZOC.NFS_SCRATCHDIR, desc = "competitiveWorkflow_BuildModel")
            trainData.save(os.path.join(runPath,"trainData.tab"))

            tunedPars = paramOptUtilities.getOptParam(
                learner = learners[ML],
                trainDataFile = os.path.join(runPath,"trainData.tab"),
                useGrid = False,
                verbose = verbose,
                queueType = queueType,
                runPath = runPath,
                nExtFolds = None,
                logFile = logFile,
                getTunedPars = True)

            
            if not learners[ML].optimized:
                print "WARNING: competitiveWorkflow: The learner "+str(learners[ML])+" was not optimized."
                #print "         Using default parameters"
                print "         The "+str(learners[ML])+" will not be included"
                #print "         Returning None"
                print "             DEBUG can be made in: "+runPath 
                #Setting default parameters
                #learners[ML] = learners[ML].__class__()   
                #return None
                learners.pop(ML)
                continue
            else:
                print "Optimized learner ",learners[ML]      
                if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                    MLMethods[ML]["optAcc"] = tunedPars[0] 
                else:
                    res = orngTest.crossValidation([learners[ML]], trainData, folds=5, strat=orange.MakeRandomIndices.StratifiedIfPossible, randomGenerator = random.randint(0, 100))
                    R2 = evalUtilities.R2(res)[0]  
                    MLMethods[ML]["optAcc"] = R2
                miscUtilities.removeDir(runPath)
        #Train the model
        if len(learners) == 1:
            log(logFile, "  Building the model:"+learners.keys()[0])
            model = learners[learners.keys()[0]](trainData)
        elif len(learners) > 1:
            model = buildConsensus(trainData,learners,MLMethods)  
        else:
            print "ERROR: No Learners were selected!"
            return None

        return model
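A hedged usage sketch for buildModel above: the two MLMethod shapes mirror its branching (plain method vs. consensus carrying "IndividualStatistics"); the learner names "RF" and "CvSVM" and the data file are illustrative assumptions, not captured from a run.

# Sketch only: file name and learner names are assumptions
trainData = dataUtilities.DataTable("trainData.tab")

plainMethod = {"MLMethod": "RF"}                      # single-learner branch
rfModel = buildModel(trainData, plainMethod, queueType="NoSGE", verbose=0)

consensusMethod = {"MLMethod": "Consensus",           # consensus branch
                   "IndividualStatistics": {"RF": {"MLMethod": "RF"},
                                            "CvSVM": {"MLMethod": "CvSVM"}}}
consensusModel = buildModel(trainData, consensusMethod)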
コード例 #35
0
from AZutilities import dataUtilities   # needed for DataTable / attributeDeselectionData below
from trainingMethods import AZorngRF
from trainingMethods import AZorngCvBoost
from trainingMethods import AZorngCvSVM
import orngTest
import orngStat
import orange
import string

data = dataUtilities.DataTable("IIDsetAZOdesc.txt")
test = dataUtilities.DataTable("nonIIDtestAZOdesc.txt")

descList = [
    "ID", "Smiles", "Conc", "Effect", "Conc_1", "Effect_1", "ID_1",
    "origSmiles_1", "BioActivity_1"
]
data = dataUtilities.attributeDeselectionData(data, descList)

# Deselect descriptors with no variance
descList = ["rdk.fr_dihydropyridine", "rdk.fr_nitroso", "rdk.fr_benzodiazepine", "rdk.fr_thiocyan", "rdk.VSA_EState4" ,"rdk.VSA_EState6" \
,"rdk.VSA_EState7" ,"rdk.VSA_EState1" ,"rdk.VSA_EState2" ,"rdk.VSA_EState3" ,"rdk.SlogP_VSA9" ,"rdk.SMR_VSA8" ,"rdk.fr_diazo" \
,"rdk.fr_prisulfonamd" ,"rdk.fr_isocyan" ,"rdk.fr_azide" ,"rdk.fr_isothiocyan"]
data = dataUtilities.attributeDeselectionData(data, descList)
print "Length domain ", len(data.domain)

learner = AZorngCvSVM.CvSVMLearner(C=32, gamma=0.03125)
#learner = AZorngRF.RFLearner()
#learner = AZorngRF.RFLearner(stratify = "Yes") # No effect
#learner = AZorngCvBoost.CvBoostLearner()
#learner.stratify = "Yes" # No effect
#learner.priors = {"Active":0.80, "Inactive":0.20}
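The script above stops right after choosing the learner; a minimal continuation sketch, assuming the standard Orange 2.x helpers behind the orngTest/orngStat imports (learnAndTestOnTestData and CA):

# Train on the deselected data and score on the held-out non-IID test set
res = orngTest.learnAndTestOnTestData([learner], data, test)
print "CA on non-IID test set ", orngStat.CA(res)[0]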
コード例 #36
0
    def __init__(self, **kwds):
        self.verbose = 0
        self.logFile = None
        self.resultsFile = None
        self.nExtFolds = 5
        self.nInnerFolds = 5
        self.data = None
        self.learner = None
        self.paramList = None
        self.queueType = "NoSGE"
        self.responseType = None
        self.fixedParams = {}
        self.testAttrFilter = None
        self.testFilterVal = None
        self.sampler = dataUtilities.SeedDataSampler
        # Append arguments to the __dict__ member variable
        self.__dict__.update(kwds)
        self.learnerName = ""

        self.preDefIndices = orange.LongList()
        self.usePreDefFolds = False
        self.useVarCtrlCV = False
        if self.testAttrFilter and self.testAttrFilter in self.data.domain:
            if self.testFilterVal and type(self.testFilterVal) == list and type(self.testAttrFilter) == str:
                self.useVarCtrlCV = True
                self.usePreDefFolds = False
                for ex in self.data:
                    if ex[self.testAttrFilter].value in self.testFilterVal:  # Compound selected to be allowed in the test set
                        self.preDefIndices.append(1)
                    else:  # Compound not to be included in the test set; always shifted to the train set
                        self.preDefIndices.append(0)
            elif self.testFilterVal is None:
                self.usePreDefFolds = True
                self.useVarCtrlCV = False
                #Enable pre-selected indices (index 0 marks examples always placed in the train set)
                foldsCounter = {}
                for ex in self.data:
                    value = str(ex[self.testAttrFilter].value)
                    if not miscUtilities.isNumber(value):
                        self.__log("Invalid fold value:" + str(value) +
                                   ". It must be str convertable to an int.")
                        return False
                    value = int(float(value))
                    if value not in foldsCounter:
                        foldsCounter[value] = 1
                    else:
                        foldsCounter[value] += 1
                    self.preDefIndices.append(value)

                self.__log("INFO: Pre-selected " +
                           str(len([f
                                    for f in foldsCounter.keys() if f != 0])) +
                           " folds for CV:")
                self.__log("      Examples in data: " +
                           str(sum(foldsCounter.values())))
                self.__log(
                    "      Examples selected for validation: " +
                    str(sum([foldsCounter[f]
                             for f in foldsCounter if f != 0])))
                self.__log(
                    "      Examples to be appended to the train set: " +
                    str(0 in foldsCounter.keys() and foldsCounter[0] or 0))
            else:
                self.__log("ERROR: Attribute Filter Ctrl was selected, but the attribute is not in the expected format: " +
                           str(self.testAttrFilter))
                return  # __init__ must not return a value; just abort
            self.data = dataUtilities.attributeDeselectionData(
                self.data, [self.testAttrFilter])
        else:
            self.usePreDefFolds = False
            self.useVarCtrlCV = False
            self.testAttrFilter = None
            self.testFilterVal = None
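The constructor above supports two filtering modes: a list in testFilterVal marks compounds allowed into the test folds (variance-controlled CV), while testFilterVal=None reads pre-defined integer fold indices from the attribute itself. A sketch under the assumption that the surrounding class (not shown in this excerpt) is instantiated as Getter; the attribute names are illustrative:

# Mode 1: variance-controlled CV; only compounds whose (assumed) "Series"
# attribute value is in the list may appear in the test folds
g = Getter(data=data, learner=learner,
           testAttrFilter="Series", testFilterVal=["A", "B"])

# Mode 2: pre-defined folds; the (assumed) "Fold" attribute holds
# int-convertible fold indices, with 0 meaning "always in the train set"
g = Getter(data=data, learner=learner,
           testAttrFilter="Fold", testFilterVal=None)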
コード例 #37
0
ファイル: PredConfMetrics.py プロジェクト: accsc/AZOrange
def getXNN(trainSmilesList, train, predEx, smilesAttrName, nameAttr, X,
           simType):

    if simType == "Topological":
        fpsTrain = [FingerprintMols.FingerprintMol(x) for x in trainSmilesList]
        fp = FingerprintMols.FingerprintMol(
            Chem.MolFromSmiles(predEx[smilesAttrName].value))
    elif simType == "Morgan":
        fpsTrain = [
            AllChem.GetMorganFingerprint(x, 2) for x in trainSmilesList
        ]
        fp = AllChem.GetMorganFingerprint(
            Chem.MolFromSmiles(predEx[smilesAttrName].value), 2)
    elif simType == "MACCS":
        fpsTrain = [MACCSkeys.GenMACCSKeys(x) for x in trainSmilesList]
        fp = MACCSkeys.GenMACCSKeys(
            Chem.MolFromSmiles(predEx[smilesAttrName].value))
    elif simType == "Mahalanobis":
        attrList = [smilesAttrName, nameAttr]
        predEx = dataUtilities.attributeDeselectionExample(predEx, attrList)
        fp = getDescVect(predEx)
        numTrain = dataUtilities.attributeDeselectionData(train, attrList)
        trainMat = []
        for ex in numTrain:
            descVect = getDescVect(ex)
            trainMat.append(descVect)
        norm = Mahalanobis.create_inverse_covariance_norm(trainMat)
    else:
        print "This type of sim is not implemented ", simType
        return None  # avoid a NameError below for an unknown simType

    simDict = {}
    idx = 0
    simList = []
    for ex in train:
        if simType == "Topological":
            sim = DataStructs.FingerprintSimilarity(fpsTrain[idx], fp)
        elif simType == "Morgan":
            sim = DataStructs.DiceSimilarity(fpsTrain[idx], fp)
        elif simType == "MACCS":
            sim = DataStructs.FingerprintSimilarity(fpsTrain[idx], fp)
        elif simType == "Mahalanobis":
            descVect = trainMat[idx]
            dist = Mahalanobis.compute_distance(fp, descVect, norm)
            sim = dist
        else:
            print "This type of sim is not implemented ", simType
        idx = idx + 1
        simDict[ex[nameAttr].value] = sim
        simList.append(sim)

    if simType == "Mahalanobis":  # Mahalanobis gives a distance while the other methods are similarities
        simList.sort()
    else:
        simList.sort(reverse=True)
    simList = simList[0:X]
    medSim = round(numpy.median(simList), 3)
    stdSim = round(numpy.std(simList), 3)
    minSim = round(min(simList), 3)
    maxSim = round(max(simList), 3)

    entropy = round(getRespVar(simList, simDict, train, nameAttr), 3)
    entropyClosest = round(
        getRespVar(simList[0:X / 2], simDict, train, nameAttr), 3)

    return medSim, stdSim, minSim, maxSim, entropy, entropyClosest
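getXNN mixes three fingerprint similarities and one Mahalanobis distance; the core ranking step reduces to sorting pairwise scores. A standalone sketch of the Morgan branch, assuming RDKit is available (the SMILES strings are illustrative):

from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

trainSmiles = ["CCO", "c1ccccc1", "CCN"]            # illustrative training set
query = Chem.MolFromSmiles("CCOC")

fpsTrain = [AllChem.GetMorganFingerprint(Chem.MolFromSmiles(s), 2)
            for s in trainSmiles]
fp = AllChem.GetMorganFingerprint(query, 2)

# Dice similarity as in the Morgan branch above; highest = most similar
simList = [DataStructs.DiceSimilarity(t, fp) for t in fpsTrain]
simList.sort(reverse=True)
print "top-2 similarities ", simList[0:2]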
コード例 #38
0
def buildConsensus(trainData, learners, MLMethods, logFile=None):
    log(
        logFile, "Building a consensus model based on optimized MLmethods: " +
        str([ml for ml in MLMethods]) + "...")
    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
        #Expression: If CAavg_{POS} >= CAavg_{NEG} -> POS, else -> NEG
        #    where CAavg_{POS} is the average of classification accuracies of all models predicting POS.
        CLASS0 = str(trainData.domain.classVar.values[0])
        CLASS1 = str(trainData.domain.classVar.values[1])
        #exprTest0
        exprTest0 = "(0"
        for ml in MLMethods:
            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(
                MLMethods[ml]["optAcc"]) + " "
        exprTest0 += ")/IF0(sum([False"
        for ml in MLMethods:
            exprTest0 += ", " + ml + " == " + CLASS0 + " "
        exprTest0 += "]),1)"
        # exprTest1
        exprTest1 = "(0"
        for ml in MLMethods:
            exprTest1 += "+( " + ml + " == " + CLASS1 + " )*" + str(
                MLMethods[ml]["optAcc"]) + " "
        exprTest1 += ")/IF0(sum([False"
        for ml in MLMethods:
            exprTest1 += ", " + ml + " == " + CLASS1 + " "
        exprTest1 += "]),1)"
        # expression
        expression = [
            exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1
        ]
    else:
        Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods])
        expression = "(1 / " + str(Q2sum) + ") * (0"
        for ml in MLMethods:
            expression += " + " + str(
                MLMethods[ml]["optAcc"]) + " * " + ml + " "
        expression += ")"

    consensusLearners = {}
    for learnerName in learners:
        consensusLearners[learnerName] = learners[learnerName]

    learner = AZorngConsensus.ConsensusLearner(learners=consensusLearners,
                                               expression=expression)
    log(logFile, "  Training Consensus Learner")
    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        log(logFile, "Found SMILES attribute:" + smilesAttr)
        if learner.specialType == 1:
            trainData = dataUtilities.attributeSelectionData(
                trainData, [smilesAttr, trainData.domain.classVar.name])
            log(
                logFile, "Selected attrs: " +
                str([attr.name for attr in trainData.domain]))
        else:
            trainData = dataUtilities.attributeDeselectionData(
                trainData, [smilesAttr])
            log(logFile,"Selected attrs: "+str([attr.name for attr in trainData.domain[0:3]] + ["..."] +\
                                           [attr.name for attr in trainData.domain[len(trainData.domain)-3:]]))

    return learner(trainData)
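For regression, buildConsensus weights each learner's prediction by its optimization accuracy and normalizes by their sum. A worked illustration of the string-building loop above, with two assumed learner names (dict iteration order may vary in Python 2):

MLMethods = {"RF": {"optAcc": 0.6}, "CvSVM": {"optAcc": 0.4}}   # assumed names
Q2sum = sum([MLMethods[ml]["optAcc"] for ml in MLMethods])
expression = "(1 / " + str(Q2sum) + ") * (0"
for ml in MLMethods:
    expression += " + " + str(MLMethods[ml]["optAcc"]) + " * " + ml + " "
expression += ")"
print expression   # e.g. (1 / 1.0) * (0 + 0.6 * RF  + 0.4 * CvSVM )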
コード例 #39
0
def buildModel(trainData,
               MLMethod,
               queueType="NoSGE",
               verbose=0,
               logFile=None):
    """
        Build the method passed in MLMethod and optimize it ("IndividualStatistics" not in MLMethod).
        If MLMethod is a Consensus ("IndividualStatistics" in MLMethod), first build and optimize all individual models, then build the consensus!
        """
    log(logFile,
        "Building and optimizing learner: " + MLMethod["MLMethod"] + "...")
    learners = {}
    MLMethods = {}
    if "IndividualStatistics" in MLMethod:  #It is a consensus and will certaily not contain any
        #special model as it was filtered in the getUnbiasedAcc
        for ML in MLMethod["IndividualStatistics"]:
            MLMethods[ML] = copy.deepcopy(MLMethod["IndividualStatistics"][ML])
    else:
        ML = MLMethod["MLMethod"]
        if MLMETHODS[ML](name=ML).specialType == 1:  # If it is a special model with a built-in optimizer
            log(logFile, "       This is a special model")
            smilesAttr = dataUtilities.getSMILESAttr(trainData)
            if smilesAttr:
                log(logFile, "Found SMILES attribute:" + smilesAttr)
                trainData = dataUtilities.attributeSelectionData(
                    trainData, [smilesAttr, trainData.domain.classVar.name])
            optInfo, SpecialModel = MLMETHODS[ML](name=ML).optimizePars(
                trainData, folds=5)
            return SpecialModel
        else:
            MLMethods[MLMethod["MLMethod"]] = MLMethod

    smilesAttr = dataUtilities.getSMILESAttr(trainData)
    if smilesAttr:
        trainData = dataUtilities.attributeDeselectionData(
            trainData, [smilesAttr])

    # optimize all MLMethods
    for ML in MLMethods:
        log(logFile, "  Optimizing MLmethod: " + ML)
        learners[ML] = MLMETHODS[ML](name=ML)

        runPath = miscUtilities.createScratchDir(
            baseDir=AZOC.NFS_SCRATCHDIR, desc="competitiveWorkflow_BuildModel")
        trainData.save(os.path.join(runPath, "trainData.tab"))

        tunedPars = paramOptUtilities.getOptParam(learner=learners[ML],
                                                  trainDataFile=os.path.join(
                                                      runPath,
                                                      "trainData.tab"),
                                                  useGrid=False,
                                                  verbose=verbose,
                                                  queueType=queueType,
                                                  runPath=runPath,
                                                  nExtFolds=None,
                                                  logFile=logFile,
                                                  getTunedPars=True)

        if not learners[ML].optimized:
            print "WARNING: competitiveWorkflow: The learner " + str(
                learners[ML]) + " was not optimized."
            #print "         Using default parameters"
            print "         The " + str(learners[ML]) + " will not be included"
            #print "         Returning None"
            print "             DEBUG can be made in: " + runPath
            #Setting default parameters
            #learners[ML] = learners[ML].__class__()
            #return None
            learners.pop(ML)
            continue
        else:
            print "Optimized learner ", learners[ML]
            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                MLMethods[ML]["optAcc"] = tunedPars[0]
            else:
                res = orngTest.crossValidation(
                    [learners[ML]],
                    trainData,
                    folds=5,
                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                    randomGenerator=random.randint(0, 100))
                R2 = evalUtilities.R2(res)[0]
                MLMethods[ML]["optAcc"] = R2
            miscUtilities.removeDir(runPath)
    #Train the model
    if len(learners) == 1:
        log(logFile, "  Building the model:" + learners.keys()[0])
        model = learners[learners.keys()[0]](trainData)
    elif len(learners) > 1:
        model = buildConsensus(trainData, learners, MLMethods)
    else:
        print "ERROR: No Learners were selected!"
        return None

    return model
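Both buildModel variants estimate the regression optAcc with an internal 5-fold cross-validation. A standalone sketch of that measurement, assuming Orange 2.x's orngTest/orngStat (orngStat.R2 standing in for AZOrange's evalUtilities.R2):

import random
import orange, orngTest, orngStat

def estimateOptAcc(learner, trainData):
    # Same pattern as above: stratified-if-possible 5-fold CV on the train set
    res = orngTest.crossValidation(
        [learner], trainData, folds=5,
        strat=orange.MakeRandomIndices.StratifiedIfPossible,
        randomGenerator=random.randint(0, 100))
    if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
        return orngStat.CA(res)[0]   # classification: accuracy
    return orngStat.R2(res)[0]       # regression: squared correlation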
コード例 #40
0
    def getAcc(self, callBack=None, algorithm=None, params=None, atts=None, holdout=None):
        """ For regression problems, it returns the RMSE and the Q2 
            For Classification problems, it returns CA and the ConfMat
            The return is made in a Dict: {"RMSE":0.2,"Q2":0.1,"CA":0.98,"CM":[[TP, FP],[FN,TN]]}
            For EvalResults not supported by a specific learner/dataset, the respective result will be None

            if the learner is a dict {"LearnerName":learner, ...} the results will be a dict with results for all Learners and for a consensus
                made out of those that were stable

            If some error occurred, the respective values in the Dict will be None

            parameters:
                algorithm - list of feature generation algorithms (set-dependent features that have to be calculated inside the cross-validation)
                params - dictionary of parameters
                atts - attributes to be removed before learning (e.g. meta etc...)
        """
        self.__log("Starting Calculating MLStatistics")
        statistics = {}
        if not self.__areInputsOK():
            return None

        if holdout:
            self.nExtFolds = 1

        if algorithm:
            self.__log(" Additional features to be calculated inside of cross-validation")
            for i in algorithm:
                self.__log(" Algorithm: " + str(i))
            for j, v in params.iteritems():
                self.__log(" Parameter: " + str(j) + " = " + str(v))

        # Set the response type
        self.responseType = (
            self.data.domain.classVar.varType == orange.VarTypes.Discrete and "Classification" or "Regression"
        )
        self.__log("  " + str(self.responseType))

        # Create the Train and test sets
        DataIdxs = None
        if holdout:
            self.__log("Using hold out evaluation with " + str(holdout) + "*100 % of data for training")
            DataIdxs = dataUtilities.SeedDataSampler_holdOut(self.data, holdout)
        else:
            DataIdxs = dataUtilities.SeedDataSampler(self.data, self.nExtFolds)

        # Var for saving each fold's result
        optAcc = {}
        results = {}
        exp_pred = {}
        nTrainEx = {}
        nTestEx = {}

        # Set a dict of learners
        MLmethods = {}
        if type(self.learner) == dict:
            for ml in self.learner:
                MLmethods[ml] = self.learner[ml]
        else:
            MLmethods[self.learner.name] = self.learner

        models = {}
        rocs = {}
        self.__log("Calculating Statistics for MLmethods:")
        self.__log("  " + str([x for x in MLmethods]))

        # Check the data in advance so that it will not, by chance, fail at the last fold!
        for foldN in range(self.nExtFolds):
            trainData = self.data.select(DataIdxs[foldN], negate=1)
            self.__checkTrainData(trainData)

        # Optional!!
        # Order learners so that PLS comes first
        sortedML = [ml for ml in MLmethods]
        if "PLS" in sortedML:
            sortedML.remove("PLS")
            sortedML.insert(0, "PLS")

        stepsDone = 0
        nTotalSteps = len(sortedML) * self.nExtFolds
        for ml in sortedML:
            self.__log("    > " + str(ml) + "...")
            try:
                # Var for saving each fold's result
                results[ml] = []
                exp_pred[ml] = []
                models[ml] = []
                rocs[ml] = []
                nTrainEx[ml] = []
                nTestEx[ml] = []
                optAcc[ml] = []
                logTxt = ""

                for foldN in range(self.nExtFolds):
                    if type(self.learner) == dict:
                        self.paramList = None

                    trainData = self.data.select(DataIdxs[foldN], negate=1)
                    orig_len = len(trainData.domain.attributes)
                    refs = None
                    methods = [
                        "rdk_MACCS_keys",
                        "rdk_topo_fps",
                        "rdk_morgan_fps",
                        "rdk_morgan_features_fps",
                        "rdk_atompair_fps",
                    ]
                    train_domain = None
                    # add structural descriptors to the training data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                actData = orange.ExampleTable(trainData.domain)
                                for d in trainData:
                                    # only valid for simboosted qsar paper experiments!?
                                    if d.getclass() == "2":
                                        actData.append(d)

                                refs = structuralClustering.getReferenceStructures(
                                    actData,
                                    threshold=params["threshold"],
                                    minClusterSize=params["minClusterSize"],
                                    numThreads=2,
                                )
                                self.__log(
                                    " found "
                                    + str(len(refs))
                                    + " reference structures in "
                                    + str(len(actData))
                                    + " active structures"
                                )
                                orig_len = orig_len + (len(refs) * len(methods))
                                trainData_sim = SimBoostedQSAR.getSimDescriptors(refs, trainData, methods)

                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_sim, [])

                            elif algorithm[i] == "ECFP":
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_ecfp = getCinfonyDesc.getCinfonyDescResults(trainData, ["rdk.FingerPrints"])
                                train_domain = trainData_ecfp.domain
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_ecfp, [])

                            else:
                                self.__log("Algorithm " + str(i) + ": " + str(algorithm[i]))
                                trainData_structDesc = getStructuralDesc.getStructuralDescResult(
                                    trainData, algorithm[i], params["minsup"]
                                )
                                if i == (len(algorithm) - 1):
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, atts)
                                else:
                                    trainData = dataUtilities.attributeDeselectionData(trainData_structDesc, [])

                                    # trainData.save("/home/girschic/proj/AZ/ProjDev/train.tab")
                    testData = self.data.select(DataIdxs[foldN])
                    # calculate the feature values for the test data (TG)
                    if algorithm:
                        for i in range(len(algorithm)):
                            if algorithm[i] == "structClust":
                                self.__log(str(algorithm[i]))
                                testData_sim = SimBoostedQSAR.getSimDescriptors(refs, testData, methods)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_sim, [])
                            elif algorithm[i] == "ECFP":
                                self.__log(str(algorithm[i]))
                                # testData_ecfp = orange.ExampleTable(train_domain)
                                tmp_dat = []
                                for d in testData:
                                    tmp = getCinfonyDesc.getRdkFPforTestInstance(train_domain, d)
                                    tmp_dat.append(tmp)
                                testData_ecfp = orange.ExampleTable(tmp_dat[0].domain, tmp_dat)
                                if i == (len(algorithm) - 1):
                                    # print "removing atts"
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, atts)
                                else:
                                    # print "removing no atts"
                                    testData = dataUtilities.attributeDeselectionData(testData_ecfp, [])

                            else:
                                cut_off = orig_len - len(atts)
                                smarts = trainData.domain.attributes[cut_off:]
                                self.__log("  Number of structural features added: " + str(len(smarts)))
                                testData_structDesc = getStructuralDesc.getSMARTSrecalcDesc(testData, smarts)
                                if i == (len(algorithm) - 1):
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, atts)
                                else:
                                    testData = dataUtilities.attributeDeselectionData(testData_structDesc, [])

                    #                testData.save("/home/girschic/proj/AZ/ProjDev/test.tab")
                    nTrainEx[ml].append(len(trainData))
                    nTestEx[ml].append(len(testData))
                    # Check whether the train sets inside the optimizer will respect the data-size criteria.
                    #  If not, don't optimize, but still train the model
                    dontOptimize = False
                    if self.responseType != "Classification" and (len(trainData) * (1 - 1.0 / self.nInnerFolds) < 20):
                        dontOptimize = True
                    else:
                        tmpDataIdxs = dataUtilities.SeedDataSampler(trainData, self.nInnerFolds)
                        tmpTrainData = trainData.select(tmpDataIdxs[0], negate=1)
                        if not self.__checkTrainData(tmpTrainData, False):
                            dontOptimize = True

                    if dontOptimize:
                        logTxt += (
                            "       Fold " + str(foldN) + ": Too few compounds to optimize model hyper-parameters\n"
                        )
                        self.__log(logTxt)
                        if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            CA = evalUtilities.CA(res)[0]
                            optAcc[ml].append(CA)
                        else:
                            res = orngTest.crossValidation(
                                [MLmethods[ml]],
                                trainData,
                                folds=5,
                                strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                randomGenerator=random.randint(0, 100),
                            )
                            R2 = evalUtilities.R2(res)[0]
                            optAcc[ml].append(R2)
                    else:
                        runPath = miscUtilities.createScratchDir(
                            baseDir=AZOC.NFS_SCRATCHDIR, desc="AccWOptParam", seed=id(trainData)
                        )
                        # self.__log("run path: " + str(runPath))
                        trainData.save(os.path.join(runPath, "trainData.tab"))

                        tunedPars = paramOptUtilities.getOptParam(
                            learner=MLmethods[ml],
                            trainDataFile=os.path.join(runPath, "trainData.tab"),
                            paramList=self.paramList,
                            useGrid=False,
                            verbose=self.verbose,
                            queueType=self.queueType,
                            runPath=runPath,
                            nExtFolds=None,
                            nFolds=self.nInnerFolds,
                            logFile=self.logFile,
                            getTunedPars=True,
                        )
                        if not MLmethods[ml] or not MLmethods[ml].optimized:
                            self.__log(
                                "       WARNING: GETACCWOPTPARAM: The learner " + str(ml) + " was not optimized."
                            )
                            self.__log("                It will be ignored")
                            # self.__log("                It will be set to default parameters")
                            self.__log("                    DEBUG can be done in: " + runPath)
                            # Set learner back to default
                            # MLmethods[ml] = MLmethods[ml].__class__()
                            raise Exception("The learner " + str(ml) + " was not optimized.")
                        else:
                            if trainData.domain.classVar.varType == orange.VarTypes.Discrete:
                                optAcc[ml].append(tunedPars[0])
                            else:
                                res = orngTest.crossValidation(
                                    [MLmethods[ml]],
                                    trainData,
                                    folds=5,
                                    strat=orange.MakeRandomIndices.StratifiedIfPossible,
                                    randomGenerator=random.randint(0, 100),
                                )
                                R2 = evalUtilities.R2(res)[0]
                                optAcc[ml].append(R2)

                            miscUtilities.removeDir(runPath)
                    # Train the model
                    model = MLmethods[ml](trainData)
                    models[ml].append(model)
                    # Test the model
                    if self.responseType == "Classification":
                        results[ml].append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                        roc = self.aroc(testData, [model])
                        rocs[ml].append(roc)
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        results[ml].append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        exp_pred[ml] += local_exp_pred
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )

                if self.verbose > 0:
                    print "UnbiasedAccuracyGetter!Results  " + ml + ":\n"
                    pprint(res)
                if not res:
                    raise Exception("No results available!")
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)
                self.__log("       OK")
            except:
                print "Unexpected error:",
                print sys.exc_info()[0]
                print sys.exc_info()[1]
                self.__log("       Learner " + str(ml) + " failed to create/optimize the model!")
                res = self.createStatObj(
                    results[ml],
                    exp_pred[ml],
                    nTrainEx[ml],
                    nTestEx[ml],
                    self.responseType,
                    self.nExtFolds,
                    logTxt,
                    rocs[ml],
                )
                statistics[ml] = copy.deepcopy(res)
                self.__writeResults(statistics)

        if not statistics or len(statistics) < 1:
            self.__log("ERROR: No statistics to return!")
            return None
        elif len(statistics) > 1:
            # We still need to build a consensus model out of the stable models,
            #   but ONLY if more than one model is stable!
            #   When there is at most one stable model, build a consensus based on all models
            consensusMLs = {}
            for modelName in statistics:
                StabilityValue = statistics[modelName]["StabilityValue"]
                if StabilityValue is not None and statistics[modelName]["stable"]:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            self.__log(
                "Found " + str(len(consensusMLs)) + " stable MLmethods out of " + str(len(statistics)) + " MLmethods."
            )

            if len(consensusMLs) <= 1:  # we need more models to build a consensus!
                consensusMLs = {}
                for modelName in statistics:
                    consensusMLs[modelName] = copy.deepcopy(statistics[modelName])

            if len(consensusMLs) >= 2:
                # Var for saving each fold's result
                Cresults = []
                Cexp_pred = []
                CnTrainEx = []
                CnTestEx = []
                self.__log(
                    "Calculating the statistics for a Consensus model based on " + str([ml for ml in consensusMLs])
                )
                for foldN in range(self.nExtFolds):
                    if self.responseType == "Classification":
                        CLASS0 = str(self.data.domain.classVar.values[0])
                        CLASS1 = str(self.data.domain.classVar.values[1])
                        exprTest0 = "(0"
                        for ml in consensusMLs:
                            exprTest0 += "+( " + ml + " == " + CLASS0 + " )*" + str(optAcc[ml][foldN]) + " "
                        exprTest0 += ")/IF0(sum([False"
                        for ml in consensusMLs:
                            exprTest0 += ", " + ml + " == " + CLASS0 + " "
                        exprTest0 += "]),1)"
                        exprTest1 = exprTest0.replace(CLASS0, CLASS1)
                        expression = [exprTest0 + " >= " + exprTest1 + " -> " + CLASS0, " -> " + CLASS1]
                    else:
                        Q2sum = sum([optAcc[ml][foldN] for ml in consensusMLs])
                        expression = "(1 / " + str(Q2sum) + ") * (0"
                        for ml in consensusMLs:
                            expression += " + " + str(optAcc[ml][foldN]) + " * " + ml + " "
                        expression += ")"

                    testData = self.data.select(DataIdxs[foldN])
                    CnTestEx.append(len(testData))
                    consensusClassifiers = {}
                    for learnerName in consensusMLs:
                        consensusClassifiers[learnerName] = models[learnerName][foldN]

                    model = AZorngConsensus.ConsensusClassifier(classifiers=consensusClassifiers, expression=expression)
                    CnTrainEx.append(model.NTrainEx)
                    # Test the model
                    if self.responseType == "Classification":
                        Cresults.append(
                            (
                                evalUtilities.getClassificationAccuracy(testData, model),
                                evalUtilities.getConfMat(testData, model),
                            )
                        )
                    else:
                        local_exp_pred = []
                        for ex in testData:
                            local_exp_pred.append((ex.getclass(), model(ex)))
                        Cresults.append(
                            (evalUtilities.calcRMSE(local_exp_pred), evalUtilities.calcRsqrt(local_exp_pred))
                        )
                        # Save the experimental value and correspondent predicted value
                        Cexp_pred += local_exp_pred

                res = self.createStatObj(Cresults, Cexp_pred, CnTrainEx, CnTestEx, self.responseType, self.nExtFolds)
                statistics["Consensus"] = copy.deepcopy(res)
                statistics["Consensus"]["IndividualStatistics"] = copy.deepcopy(consensusMLs)
                self.__writeResults(statistics)
            self.__log("Returned multiple ML methods statistics.")
            return statistics

        # By default return the only existing statistics!
        self.__writeResults(statistics)
        self.__log("Returned only one ML method statistics.")
        return statistics[statistics.keys()[0]]
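The consensus-selection step above keeps models flagged stable and falls back to all models when fewer than two qualify. A compact sketch of that filter, assuming each statistics entry carries the "StabilityValue" and "stable" fields set by createStatObj:

import copy

def pickConsensusMembers(statistics):
    # Keep only models flagged stable...
    members = {}
    for name in statistics:
        stat = statistics[name]
        if stat["StabilityValue"] is not None and stat["stable"]:
            members[name] = copy.deepcopy(stat)
    # ...but fall back to all models when fewer than two are stable
    if len(members) <= 1:
        members = copy.deepcopy(statistics)
    return members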
コード例 #41
0
ファイル: ConfPredClass.py プロジェクト: AZCompTox/AZOrange
        #print "Break after the first example"
        #if idx == 1: break


if __name__ == "__main__":
    """
    Assumptions:
    Binary classification 
    Class labels not generalized, assumed to be 'A' and 'N'

    This main will test the implemented CP methods in a 10-fold CV
    """

    data = dataUtilities.DataTable("trainData.tab")
    descList = ["SMILES", "SMILES_1"]
    data = dataUtilities.attributeDeselectionData(data, descList)

    print "Please note that the class labels are not generalized and need to be checked for a new data set"
    print "Assumed to be A and N"
    methods = ["kNNratio", "minNN", "avgNN", "probPred", "combo", "LLOO", "LLOOprob"]   # Non-conformity score method
    #methods = ["kNNratio"]
    cpMethod = "transductive"   # inductive or transductive

    #print "Temp position to save comp time!!"
    # Append to python path /home/kgvf414/dev/AZOrange0.5.5/orangeDependencies/src/orange/orange/Orange/distance/
    #import instances
    #measure = instances.MahalanobisConstructor(data)
    measure = None
    methodIdx = 1
    for method in methods:
コード例 #42
0
from AZutilities import dataUtilities
from trainingMethods import AZorngRF, AZorngCvSVM, AZorngCvBoost
# Note: train, randTest and getFps are defined earlier in the original script (elided here)
extTest = dataUtilities.DataTable("nonIIDtestAZOdesc.txt")
print "Train set ", len(train)
print "randTest set ", len(randTest)
print "extTest set ", len(extTest)

# Calculate fingerprints for train and test sets
fps = getFps(train)
fpsRandTest = getFps(randTest)
fpsExtTest = getFps(extTest)

# Deselect descriptors with no variance
descList = ["ID", "Smiles", "Conc", "Effect", "Conc_1",  "Effect_1", "ID_1", "origSmiles_1", "BioActivity_1" \
,"rdk.fr_dihydropyridine", "rdk.fr_nitroso", "rdk.fr_benzodiazepine", "rdk.fr_thiocyan", "rdk.VSA_EState4" ,"rdk.VSA_EState6" \
,"rdk.VSA_EState7" ,"rdk.VSA_EState1" ,"rdk.VSA_EState2" ,"rdk.VSA_EState3" ,"rdk.SlogP_VSA9" ,"rdk.SMR_VSA8" ,"rdk.fr_diazo" \
,"rdk.fr_prisulfonamd" ,"rdk.fr_isocyan" ,"rdk.fr_azide" ,"rdk.fr_isothiocyan"]
train = dataUtilities.attributeDeselectionData(train, descList)
print "Length domain ", len(train.domain)

#learner = AZorngCvSVM.CvSVMLearner(C=32, gamma=0.03125)
learner = AZorngRF.RFLearner()
#learner = AZorngRF.RFLearner(stratify = "Yes") # No effect
#learner = AZorngCvBoost.CvBoostLearner()
#learner.stratify = "Yes" # No effect
#learner.priors = {"Active":0.80, "Inactive":0.20}
model = learner(train)

thrsList = [0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85]
fileName = "optThrs.txt"
fid = open(fileName, "w")
fid.write("Thrs\tMCC_IID\toutAD_IID\tMCC_nonIID\toutAD_nonIID\n")
for thrs in thrsList:
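The scan loop's body is not shown above; it records, per threshold, the MCC and the fraction of compounds outside the applicability domain. For reference, a self-contained sketch of the MCC calculation such a loop would need (the confusion-matrix counts are illustrative):

import math

def mcc(TP, TN, FP, FN):
    # Matthews correlation coefficient from confusion-matrix counts
    denom = math.sqrt(float((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)))
    if denom == 0:
        return 0.0
    return (TP * TN - FP * FN) / denom

print "MCC ", mcc(TP=40, TN=35, FP=10, FN=15)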
コード例 #43
0
ファイル: SimBoostedQSAR.py プロジェクト: tojojames/AZOrange
def getSimDescriptors(InActives,
                      InData,
                      methods,
                      active_ids=None,
                      pharmacophore_file=None,
                      callBack=None):
    """ calculates similarity descriptors for a training set (orange object) using the 
                given similarity methods against the given actives
                Possible method strings in methods are the names of the sim_* methods below,
                e.g. rdk_topo_fps for sim_rdk_topo_fps
            callBack function, if defined, will be called on each step sending the pergentage done (0-100): 
                   e.g. callBack(25)
                the callBack function shall return True of False which will indicate to this method if the process it to be continued or Not.
                   e.g. if callBack(25) == False it indicates the caller want's to stop the process of calculating descriptors                 
        """
    # Pre-process input Data to standardize the SMILES
    SMILESattr = getSMILESAttr(InData)

    if not SMILESattr:
        return None

    #TODO: Create a method in dataUtilities to standardize the attribute smilesName in place having the attr origSmiles as ID
    if "AZutilities.extraUtilities" in sys.modules and hasattr(
            extraUtilities, "StandardizeSMILES"):
        # Call a method for standardizing the SMILES in Data.
        # The method is expected to change the attribute defined as smiAttr in data object
        cleanedData = True
        # Process InData
        tmpDomain = orange.Domain([orange.StringVariable("OrigSMI_ID")] +
                                  [attr for attr in InData.domain])
        data = orange.ExampleTable(tmpDomain, InData)
        #    Fill the OrigSMI_ID
        for ex in data:
            ex["OrigSMI_ID"] = ex[SMILESattr]
        extraUtilities.StandardizeSMILES(data,
                                         smiAttr=SMILESattr,
                                         cName="OrigSMI_ID")
        # Process  Input actives
        activesDomain = orange.Domain([
            orange.StringVariable("OrigSMI_ID"),
            orange.StringVariable("SMILES")
        ], 0)
        activesData = orange.ExampleTable(activesDomain)
        for act in InActives:
            activesData.append([act, act])
        extraUtilities.StandardizeSMILES(activesData,
                                         smiAttr="SMILES",
                                         cName="OrigSMI_ID")
        #print activesData.domain
        actives = []
        for ex in activesData:
            actives.append(str(ex["SMILES"].value))
    else:
        data = InData
        actives = InActives
        cleanedData = False

    # adjust the header
    atts = []
    for m in methods:
        count = 1
        for a in actives:
            attname = m + '(active_' + str(count) + ')'
            atts.append(orange.FloatVariable(attname))
            count += 1

    newdomain = orange.Domain(data.domain.attributes + atts,
                              data.domain.classVar)
    newdata = orange.ExampleTable(newdomain, data)

    att_idx = 0
    # if callBack is defined, it will be called with the percentage done, i.e. 0-100
    if active_ids:
        nTotalSteps = len(newdata) * (
            (len(methods) - 1) * len(actives) + len(active_ids))
    else:
        nTotalSteps = len(methods) * len(actives) * len(newdata)
    stepsDone = 0

    # fill up the data
    for m in methods:
        if m == 'rdk_topo_fps':
            count = 1
            for a in actives:
                attname = m + '(active_' + str(count) + ')'
                for j in range(len(newdata)):
                    instance = newdata[j]
                    tmp = orange.Value(atts[att_idx],
                                       orng_sim_rdk_topo_fps(a, instance))
                    instance[atts[att_idx]] = tmp
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None
                att_idx += 1

        elif m == 'rdk_MACCS_keys':
            count = 1
            for a in actives:
                attname = m + '(active_' + str(count) + ')'
                for j in range(len(newdata)):
                    instance = newdata[j]
                    tmp = orange.Value(atts[att_idx],
                                       orng_sim_rdk_MACCS_keys(a, instance))
                    instance[atts[att_idx]] = tmp
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                att_idx += 1

        elif m == 'rdk_morgan_fps':
            count = 1
            for a in actives:
                attname = m + '(active_' + str(count) + ')'
                for j in range(len(newdata)):
                    instance = newdata[j]
                    tmp = orange.Value(atts[att_idx],
                                       orng_sim_rdk_morgan_fps(a, instance))
                    instance[atts[att_idx]] = tmp
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                att_idx += 1

        elif m == 'rdk_morgan_features_fps':
            count = 1
            for a in actives:
                attname = m + '(active_' + str(count) + ')'
                for j in range(len(newdata)):
                    instance = newdata[j]
                    tmp = orange.Value(
                        atts[att_idx],
                        orng_sim_rdk_morgan_features_fps(a, instance))
                    instance[atts[att_idx]] = tmp
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                att_idx += 1

        elif m == 'rdk_atompair_fps':
            count = 1
            for a in actives:
                attname = m + '(active_' + str(count) + ')'
                for j in range(len(newdata)):
                    instance = newdata[j]
                    tmp = orange.Value(atts[att_idx],
                                       orng_sim_rdk_atompair_fps(a, instance))
                    instance[atts[att_idx]] = tmp
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                att_idx += 1

        elif m == 'azo_pharmacophore_fps':
            count = 1
            for a in active_ids:
                attname = m + '(active_' + str(count) + ')'
                for j in range(len(newdata)):
                    instance = newdata[j]
                    tmp = orange.Value(
                        atts[att_idx],
                        azo_pharmacophore_az_inhouse(a, instance,
                                                     pharmacophore_file))
                    instance[atts[att_idx]] = tmp
                    if callBack:
                        stepsDone += 1
                        if not callBack((100 * stepsDone) / nTotalSteps):
                            return None

                att_idx += 1

    if cleanedData:
        #Remove the fixed SMILES and revert to the Original SMILES
        newdata = dataUtilities.attributeDeselectionData(newdata, [SMILESattr])
        newdata.domain["OrigSMI_ID"].name = SMILESattr
    return newdata
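Each (method, active) pair above contributes one Float attribute named like 'rdk_topo_fps(active_1)'. A standalone sketch of computing one such similarity column with plain RDKit, independent of the orng_sim_* helpers (the SMILES are illustrative):

from rdkit import Chem, DataStructs
from rdkit.Chem.Fingerprints import FingerprintMols

active = "c1ccccc1O"                       # illustrative active
trainSmiles = ["CCO", "c1ccccc1", "CCN"]   # illustrative training set

fpActive = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(active))
column = []
for s in trainSmiles:
    fp = FingerprintMols.FingerprintMol(Chem.MolFromSmiles(s))
    # Tanimoto similarity to the active = the value stored in the new attribute
    column.append(DataStructs.FingerprintSimilarity(fpActive, fp))
print "rdk_topo_fps(active_1) column ", column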