Ejemplo n.º 1
0
    def Basic(trainSet,
              testSet,
              file,
              fout,
              name,
              vecin,
              clfName,
              isCount=False):

        auc = 0
        l = GLOB(clfName).getClassifier()
        l.buildClassifier(trainSet)

        vec = l.evaluateModel(testSet)
        actual = testSet[:, -1]

        if isCount:

            vals = DPLIB.getMeasuresCount(actual, vec)
            print(name + ":" + file + ": " + str(vals))
            fout.write("\n" + name + ":" + file + ": " + "Vals=" + str(vals))

        else:
            tvals = DPLIB.getConfMatrix(actual, vec)
            vals = DPLIB.getMeasures(tvals)
            auc = DPLIB.getAUC(actual, vec)
            print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))
            fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) +
                       ";" + "Vals=" + str(vals))
Ejemplo n.º 2
0
    def NNFilterMulti(trainSeti, testSet, file, fout, name, vecin, count,
                      clfName, tunelrn, vSets):

        startTime = Common.getCurrentTimeMil()

        trainSet = DPLIB.NNFilterMulti(trainSeti, testSet, count)

        l = GLOB(clfName, tunelrn).getClassifier()
        if (tunelrn):

            l = l.getTunedCLF(trainSet, vSets, fout, name, file)

            print("#TUNE-LRN-PARAMS-" + name + ":" + file + ": " +
                  str(l.selectedParams))

            fout.write("#TUNE-LRN-PARAMS-" + name + ":" + file + ": ")
            fout.write(str(l.selectedParams))
            fout.write("\n")
            sCheck = l.getCLFOptions()

            print("#SETSET-LRN-PARAMS-" + name + ":" + file + ": " +
                  str(sCheck))

            fout.write("#SETSET-LRN-PARAMS-" + name + ":" + file + ": ")
            fout.write(str(sCheck))
            fout.write("\n")

        l.buildClassifier(trainSet)

        vec = l.evaluateModel(testSet)

        vecin = vec

        tvals = DPLIB.getConfMatrix(testSet[:, -1], vecin)

        print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))

        fout.write("#CONF-TEST-" + name + ":" + file + ": ")
        fout.write(str(tvals))
        fout.write("\n")

        auc = DPLIB.getAUC(testSet[:, -1], vec)
        vals = DPLIB.getMeasures(tvals)

        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))

        fout.write(name + ":" + file + ": ")
        fout.write(str(vals))
        fout.write(" AUC = ")
        fout.write(str(auc))
        fout.write("\n")

        time = Common.getCurrentTimeMil() - startTime

        print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
        fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")

        return vecin
Ejemplo n.º 3
0
    def WMulti(files,
               file,
               testSet,
               fout,
               features,
               name,
               clfName,
               dp,
               convertToBinary=True):

        train = []
        for file2 in files:
            if (file2[0:3] == file[0:3] and file2 < file < 0):
                train.append(file2)

        if (len(train)):
            trainSet = DPLIB.LoadCSV(train, dp, features, convertToBinary)

            if (name.lower().find("infogain") >= 0):
                #int indi[] = DPLIB.fSelectInfoGain(trainSet);
                #if (DPLIB.useIterativeInfoGainSubsetting)
                #{
                #    indi = DPLIB.iterativeInfoGainSubsetting(trainSet, indi,clfName);
                #}
                #else
                #    indi = DPLIB.getTopX(indi);
                #trainSet = DPLIB.fSelectSet(trainSet, indi);
                #testSet = DPLIB.fSelectSet(testSet, indi);
                pass

            l = GLOB(clfName).getClassifier()
            l.buildClassifier(trainSet)
            vec = l.evaluateModel(testSet)

            tvals = DPLIB.getConfMatrix(testSet[:, -1], vec)
            auc = DPLIB.getAUC(testSet[:, -1], vec)
            vals = DPLIB.getMeasures(tvals)
            print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))

            fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) +
                       ";" + "Vals=" + str(vals))

        else:

            print(name + ":" + file + ": " + "!!!" + " AUC = !!!")
            fout.write("\n" + name + ":" + file + ": !!!")
Ejemplo n.º 4
0
    def LOC50(testSeti, file, fout, name, locIndex):
        startTime = Common.getCurrentTimeMil()
        spentISTime = 0
        tempTime = 0
        spentISTime = Common.getCurrentTimeMil()
        allloc = testSeti[:, locIndex]

        med = np.median(allloc)
        predicted = [1 if t >= med else 0 for t in allloc]
        actual = testSeti[:, -1]
        tvals = DPLIB.getConfMatrix(actual, predicted)

        print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))

        fout.write("#CONF-TEST-" + name + ":" + file + ": ")
        fout.write(str(tvals))
        fout.write("\n")

        vals = DPLIB.getMeasures(tvals)

        auc = DPLIB.getAUC(actual, predicted)
        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))

        fout.write(name + ":" + file + ": ")
        fout.write(str(vals))
        fout.write(" AUC = ")
        fout.write(str(auc))
        fout.write("\n")

        time = Common.getCurrentTimeMil() - startTime

        print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
        fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")

        print("#TIME-FOR-IS:" + name + ":" + file + ": " + str(time))
        fout.write("#TIME-FOR-IS:" + name + ":" + file + ": " + str(time) +
                   "\n")
Ejemplo n.º 5
0
    def CreateBuckets(self, trainSet, testSet, vSets, name, testCut, iternum,
                      save, superbit, stages, buckets, doprint, clfName,
                      tunelrn):

        out = []
        if self.isCount:
            keySet = list(
                DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
        else:
            keySet = list(
                DPLIB.getExtMeasures({
                    "tp": 1,
                    "tn": 2,
                    "fp": 3,
                    "fn": 4
                }).keys())

        out.append("#STARTED FOR-" + name + ":" + self.file + ": ")

        startTime = Common.getCurrentTimeMil()
        spentIsTime = 0
        tempTime = 0

        out.append("#Using also Label For train in LSH")

        if (vSets == None):

            vSets = []
            vSets.append(trainSet)

        if (save):
            DPLIB.SaveToCsv(
                trainSet, "MAIN-TRAIN-FILE-" + "ITER=" + str(iternum) + "--" +
                "METHOD=" + name + "--FILE=" + self.file + "--")
            DPLIB.SaveToCsv(
                testSet, "MAIN-TEST-FILE-" + "ITER=" + str(iternum) + "--" +
                "METHOD=" + name + "--FILE=" + self.file + "--")
            for i in range(len(vSets)):

                DPLIB.SaveToCsv(
                    trainSet,
                    "VSET-FILE-" + "INDEX=" + str(i) + "ITER=" + str(iternum) +
                    "--" + "METHOD=" + name + "--FILE=" + self.file + "--")

        np.random.shuffle(trainSet)
        np.random.shuffle(testSet)
        tempTime = Common.getCurrentTimeMil()
        count = len(trainSet)
        bins = {}
        # R^n
        n = trainSet.shape[1] - 1

        binid = 0
        #lshmin = LSHMinHash(stages, buckets, n);
        try:
            lshsuper = LSHSuperBit(stages=stages,
                                   buckets=buckets,
                                   dimensions=n)
        except Exception as ex:
            print('##SuperBit with specified parameters failed:' + str(ex))
            return None
        sp = 0.75
        # Compute a SuperBit signature, and a LSH hash
        for i in range(count):
            vector = trainSet[i, 1:].tolist()

            hash = None
            if (superbit):
                hash = lshsuper.hash(vector)
            else:
                ##Minhash support
                # #hash = lshmin.hash(vecBool);
                pass

            binid = hash[0]
            if not binid in bins.keys():
                bins[binid] = []

            bins[binid].append(trainSet[i])

        spentIsTime += Common.getCurrentTimeMil() - tempTime

        numBins = len(bins.keys())

        for binid in bins.keys():
            bins[binid] = np.array(bins[binid])

        out.append("#Number of BINS:" + name + ":" + self.file + ": " +
                   str(numBins))

        pop = []

        for i in bins.keys():

            trSet = bins[i]
            l = GLOB(clfName, tunelrn).getClassifier()

            #if (tunelrn):
            #    l = l.getTunedCLF(trSet, vSets,fout,name, file);

            l.buildClassifier(trSet)
            cf = 0
            j = 0

            allvecs = []
            confs = []
            allcfs = []
            allaucs = []
            valsA = None
            confsA = None
            aucA = 0.0
            for vSet in vSets:

                vec = None
                actuals = None

                vec = l.evaluateModel(vSet)
                actuals = vSet[:, -1]

                vals = None
                auc = 0
                if self.isCount:
                    vals = DPLIB.getMeasuresCount(actuals, vec)
                else:

                    auc = DPLIB.getAUC(actuals, vec)
                    aucA += auc
                    allaucs.append(auc)
                    if (testCut):
                        vCF = 0.1
                        bestCF = 0
                        bestCFVal = -1
                        bestVals = None

                        while True:

                            tvals = DPLIB.getConfMatrix(actuals, vec, vCF)
                            measures = DPLIB.getMeasures(tvals)
                            fit = measures["F"] * measures["GMean1"]
                            if (fit > bestCFVal or bestVals == None):

                                bestCFVal = fit
                                bestCF = vCF
                                bestVals = tvals

                            vCF += 0.1

                            if (vCF >= 1):
                                break

                        if (confsA == None):

                            confsA = {key: 0 for key in bestVals.keys()}

                        for j in confsA.keys():
                            confsA[j] += bestVals[j]

                        confs.append(bestVals)

                        vals = DPLIB.getMeasures(bestVals)
                        cf += bestCF
                        allcfs.append(bestCF)

                    else:

                        tvals = DPLIB.getConfMatrix(actuals, vec)

                        if (confsA == None):

                            confsA = {key: 0 for key in tvals.keys()}

                        for j in confsA.keys():
                            confsA[j] += tvals[j]

                        confs.append(tvals)

                        vals = DPLIB.getMeasures(tvals)
                        allcfs.append(DPLIB.DefaultCF)

                allvecs.append(vals)

                if (valsA == None):
                    valsA = {key: 0 for key in keySet}

                for j in keySet:
                    valsA[j] += vals[j]

            for j in keySet:
                valsA[j] /= len(vSets)

            h = None
            if not self.isCount:
                for j in confsA.keys():
                    confsA[j] /= len(vSets)

                if (testCut):
                    cf /= len(vSets)

                aucA /= len(vSets)

                h = CHRM_GIS(trSet, valsA, aucA)
                h.fitnesses = allvecs
                h.aucs = allaucs
                h.conf = confsA
                h.confs = confs
                h.allcfs = allcfs
                if (testCut):
                    h.bestCF = cf
                else:
                    h.bestCF = DPLIB.DefaultCF
            else:

                h = CHRM_GIS_Count(trSet, valsA)
                h.fitnesses = allvecs

            pop.append(h)
            l = None

        tempTime = Common.getCurrentTimeMil()
        pop = DPLIB.MySort(pop)
        spentIsTime += Common.getCurrentTimeMil() - tempTime
        top = pop[0]

        out.append("#Instances in Top:" + str(len(top.ds)))

        out.append("#STAGES:" + name + ":" + self.file + ": " + str(stages))
        out.append("#BUCKETS:" + name + ":" + self.file + ": " + str(buckets))
        if not self.isCount:
            out.append("#BEST-CF-VALUE:" + name + ":" + self.file + ": " +
                       str(top.bestCF))

        l = GLOB(clfName, tunelrn).getClassifier()

        if (tunelrn):

            l = l.getTunedCLF(top.ds, vSets, fout, name, file)

            out.append("#TUNE-LRN-PARAMS-" + name + ":" + self.file + ": " +
                       str(l.selectedParams))
            sCheck = l.getCLFOptions()
            out.append("#SETSET-LRN-PARAMS-" + name + ":" + self.file + ": " +
                       str(sCheck))

        l.buildClassifier(top.ds)

        vec = l.evaluateModel(testSet)

        out.append("#LSH-FOR-TOP-ONLY")

        if self.isCount:
            vals = DPLIB.getMeasuresCount(testSet[:, -1], vec)
            out.append(name + ":" + self.file + ": " + str(vals))
        else:
            tvals = DPLIB.getConfMatrix(testSet[:, -1], vec, top.bestCF)
            out.append("#CONF-TEST-" + name + ":" + self.file + ": " +
                       str(tvals))
            vals = DPLIB.getMeasures(tvals)
            auc = DPLIB.getAUC(testSet[:, -1], vec)
            vals['auc'] = auc
            out.append(name + ":" + self.file + ": " + str(vals))

        for i in range(len(pop)):

            pop[i] = None

        pop = None

        for i in bins.keys():
            bins[i] = None

        bins = None

        time = Common.getCurrentTimeMil() - startTime

        if (name.find("LSHTune") < 0):
            out.append("#TIME-FOR:" + name + ":" + self.file + ": " +
                       str(time))
            out.append("#TIME-FOR-IS:" + name + ":" + self.file + ": " +
                       str(spentIsTime))
            self.output = +out

        top.addToExtra("SPENT-TIME-IS", float(spentIsTime))

        return top, out
Ejemplo n.º 6
0
    def WCFolds(testSet, folds, file, fout, name, clfName):

        auc = 0
        preds = []
        actuals = []
        vals = None
        tssCopy = testSet[:, :]
        rnd = random.Random(Common.getCurrentTimeMil())
        np.random.shuffle(tssCopy)

        skf = StratifiedKFold(n_splits=folds)
        X = tssCopy[:, :-1]
        y = tssCopy[:, -1]
        for train_index, test_index in skf.split(X, y):

            cvtrain, cvtest = X[train_index], X[test_index]
            cvtrainY, cvtestY = y[train_index], y[test_index]

            cvtrain = np.append(cvtrain,
                                cvtrainY.reshape((len(cvtrainY), 1)),
                                axis=1)

            cvtest = np.append(cvtest,
                               cvtestY.reshape((len(cvtestY), 1)),
                               axis=1)

            if (name.lower().find("infogain") >= 0):
                pass
                #int indi[] = DPLIB.fSelectInfoGain(cvtrain);
                #if (DPLIB.useIterativeInfoGainSubsetting)
                #{
                #    indi = DPLIB.iterativeInfoGainSubsetting(cvtrain, indi, clfName);
                #}
                #else
                #    indi = DPLIB.getTopX(indi);
                #cvtrain = DPLIB.fSelectSet(cvtrain, indi);
                #cvtest = DPLIB.fSelectSet(cvtest, indi);

            m = GLOB(clfName).getClassifier()
            m.buildClassifier(cvtrain)

            vec = m.evaluateModel(cvtest)

            preds.append(vec)
            actuals.append(cvtestY)
            if vals == None:

                vals = DPLIB.getConfMatrix(cvtestY, vec)

            else:

                v2 = DPLIB.getConfMatrix(cvtestY, vec)

                for key in vals.keys():
                    vals[key] += v2[key]

        auc = DPLIB.getAUCCV(actuals, preds)
        vals1 = DPLIB.getMeasures(vals)
        print(name + ":" + file + ": " + str(vals1) + " AUC = " + str(auc))
        fout.write("\n" + name + ":" + file + ": " + " AUC = " + str(auc) +
                   ";" + "Vals=" + str(vals1))
Ejemplo n.º 7
0
    def NNFilter(trainSeti, testSet, file, fout, name, vecin, count, clfName,
                 tunelrn, vSets, testCut):

        startTime = Common.getCurrentTimeMil()
        spentISTime = 0
        tempTime = 0

        bestFit = 0.0
        bestCount = 0
        btrainSet = None
        cfbf = DPLIB.DefaultCF

        if (count == 0):
            for i in range(1, 11):

                tempTime = Common.getCurrentTimeMil()

                trainSet = DPLIB.NNFilter(trainSeti, testSet, i)

                spentISTime += Common.getCurrentTimeMil() - tempTime

                l = GLOB(clfName, tunelrn).getClassifier()

                if (tunelrn):
                    l = l.getTunedCLF(trainSet, vSets, fout, name, file)

                l.buildClassifier(trainSet)

                avgFit = 0.0
                j = 0
                for j in range(len(vSets)):

                    vec = l.evaluateModel(vSets[j])

                    tvals = DPLIB.getConfMatrix(vSets[j][:, -1], vec)
                    measures = DPLIB.getExtMeasures(tvals)
                    fit = measures["F"] * measures["GMean1"]
                    avgFit += fit

                avgFit /= len(vSets)

                if (avgFit > bestFit):
                    bestFit = avgFit
                    bestCount = i
                    btrainSet = trainSet[:, :]

            if (testCut):

                cf = 0
                j = 0

                trainSet = btrainSet

                l = GLOB(clfName, tunelrn).getClassifier()

                if (tunelrn):
                    l = l.getTunedCLF(trainSet, vSets, fout, name, file)

                l.buildClassifier(trainSet)
                avgFit = 0.0

                for j in range(len(vSets)):

                    vec = l.evaluateModel(vSets[j])
                    vCF = 0.1
                    bestCF = 0
                    bestCFVal = -1
                    bestVals = None

                    while True:
                        tvals = DPLIB.getConfMatrix(vSets[j][:, -1], vec, vCF)
                        measures = DPLIB.getExtMeasures(tvals)
                        fit = measures["F"] * measures["GMean1"]
                        if (fit > bestCFVal or bestVals == None):
                            bestCFVal = fit
                            bestCF = vCF
                            bestVals = tvals

                        vCF += 0.1
                        if vCF >= 1:
                            break
                    cf += bestCF

                cf /= vSets.size()
                cfbf = cf

        trainSet = None
        if (count == 0):
            trainSet = btrainSet
        else:
            tempTime = Common.getCurrentTimeMil()
            trainSet = DPLIB.NNFilter(trainSeti, testSet, count)
            spentISTime = Common.getCurrentTimeMil() - tempTime
            bestCount = count

        l = GLOB(clfName, tunelrn).getClassifier()

        if (tunelrn):
            l = l.getTunedCLF(trainSet, vSets, fout, name, file)

            print("#TUNE-LRN-PARAMS-" + name + ":" + file + ": " +
                  str(l.selectedParams))
            fout.write("#TUNE-LRN-PARAMS-" + name + ":" + file + ": ")
            fout.write(str(l.selectedParams))
            fout.write("\n")
            sCheck = l.getCLFOptions()

            print("#SETSET-LRN-PARAMS-" + name + ":" + file + ": " +
                  str(sCheck))
            fout.write("#SETSET-LRN-PARAMS-" + name + ":" + file + ": ")
            fout.write(str(sCheck))
            fout.write("\n")

        l.buildClassifier(trainSet)

        vec = l.evaluateModel(testSet)

        vecin = vec

        tvals = DPLIB.getConfMatrix(testSet[:, -1], vecin, cfbf)
        if (count == 0):

            print("#BESTCOUNT-" + name + ":" + file + ": " + str(bestCount))

            fout.write("#BESTCOUNT-" + name + ":" + file + ": ")
            fout.write(str(bestCount))
            fout.write("\n")

            print("#BESTFIT-" + name + ":" + file + ": " + str(bestFit))
            fout.write("#BESTFIT-" + name + ":" + file + ": ")
            fout.write(str(bestFit))
            fout.write("\n")

        print("#CONF-TEST-" + name + ":" + file + ": " + str(tvals))
        fout.write("#CONF-TEST-" + name + ":" + file + ": ")
        fout.write(str(tvals))
        fout.write("\n")
        if (testCut):

            print("#NN-BEST-CF-VALUE:" + name + ":" + file + ": " + str(cfbf))

            fout.write("#NN-BEST-CF-VALUE:" + name + ":" + file + ": ")
            fout.write(str(cfbf))
            fout.write("\n")

        vals = DPLIB.getMeasures(tvals)
        auc = DPLIB.getAUC(testSet[:, -1], vecin)
        print(name + ":" + file + ": " + str(vals) + " AUC = " + str(auc))

        fout.write(name + ":" + file + ": ")
        fout.write(str(vals))
        fout.write(" AUC = ")
        fout.write(str(auc))
        fout.write("\n")

        time = Common.getCurrentTimeMil() - startTime

        print("#TIME-FOR:" + name + ":" + file + ": " + str(time))
        fout.write("#TIME-FOR:" + name + ":" + file + ": " + str(time) + "\n")

        print("#TIME-FOR-IS:" + name + ":" + file + ": " + str(spentISTime))
        fout.write("#TIME-FOR-IS:" + name + ":" + file + ": " +
                   str(spentISTime) + "\n")

        return vecin