def Basic(trainSet, testSet, file, fout, name, vecin, clfName, isCount=False):
    """Train one classifier on ``trainSet``, score ``testSet`` and log the result.

    The classifier comes from ``GLOB(clfName).getClassifier()``. The last
    column of ``testSet`` is taken as the actual values. One result line is
    printed and a matching line (prefixed with a newline) is written to
    ``fout``.

    Args:
        trainSet: data handed to the classifier's ``buildClassifier``.
        testSet: 2-D indexable data; column ``-1`` holds the actual values.
        file: data-set label used in the log line.
        fout: object with a ``write`` method receiving the log line.
        name: method label used in the log line.
        vecin: not used by this function.
        clfName: classifier identifier passed to ``GLOB``.
        isCount: when true, report ``DPLIB.getMeasuresCount``; otherwise
            report the confusion-matrix measures together with the AUC.

    Returns:
        None.
    """
    model = GLOB(clfName).getClassifier()
    model.buildClassifier(trainSet)
    predictions = model.evaluateModel(testSet)
    labels = testSet[:, -1]
    tag = name + ":" + file + ": "

    if not isCount:
        measures = DPLIB.getMeasures(DPLIB.getConfMatrix(labels, predictions))
        area = DPLIB.getAUC(labels, predictions)
        print(tag + str(measures) + " AUC = " + str(area))
        fout.write("\n" + tag + " AUC = " + str(area) + ";" + "Vals=" + str(measures))
        return

    measures = DPLIB.getMeasuresCount(labels, predictions)
    print(tag + str(measures))
    fout.write("\n" + tag + "Vals=" + str(measures))
def NNFilterMulti(trainSeti, testSet, file, fout, name, vecin, count, clfName, tunelrn, vSets):
    """Filter the training data with ``DPLIB.NNFilterMulti`` and evaluate it.

    The filtered set is used to build a classifier obtained from
    ``GLOB(clfName, tunelrn)``; when ``tunelrn`` is truthy the classifier is
    first replaced by ``getTunedCLF(...)`` and its parameters are logged.
    Confusion matrix, measures, AUC and elapsed time are printed and written
    to ``fout``.

    Args:
        trainSeti: candidate training data passed to ``DPLIB.NNFilterMulti``.
        testSet: 2-D indexable data; column ``-1`` holds the actual values.
        file: data-set label used in the log lines.
        fout: object with a ``write`` method receiving the log lines.
        name: method label used in the log lines.
        vecin: ignored on input; the name is reused for the predictions.
        count: third argument forwarded to ``DPLIB.NNFilterMulti``.
        clfName: classifier identifier passed to ``GLOB``.
        tunelrn: whether to tune the learner before training.
        vSets: validation sets forwarded to ``getTunedCLF``.

    Returns:
        The value returned by ``evaluateModel(testSet)``.
    """
    began = Common.getCurrentTimeMil()
    tag = name + ":" + file + ": "

    def report(label, value):
        # One console line, then the same content on ``fout`` in three writes.
        text = str(value)
        print(label + tag + text)
        fout.write(label + tag)
        fout.write(text)
        fout.write("\n")

    filtered = DPLIB.NNFilterMulti(trainSeti, testSet, count)
    model = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        model = model.getTunedCLF(filtered, vSets, fout, name, file)
        report("#TUNE-LRN-PARAMS-", model.selectedParams)
        # NOTE(review): the option dump is assumed to belong to the tuning
        # branch; confirm against the original layout.
        report("#SETSET-LRN-PARAMS-", model.getCLFOptions())

    model.buildClassifier(filtered)
    vecin = model.evaluateModel(testSet)
    labels = testSet[:, -1]

    confusion = DPLIB.getConfMatrix(labels, vecin)
    report("#CONF-TEST-", confusion)

    area = DPLIB.getAUC(labels, vecin)
    measures = DPLIB.getMeasures(confusion)
    print(tag + str(measures) + " AUC = " + str(area))
    for piece in (tag, str(measures), " AUC = ", str(area), "\n"):
        fout.write(piece)

    elapsed = Common.getCurrentTimeMil() - began
    timing = "#TIME-FOR:" + tag + str(elapsed)
    print(timing)
    fout.write(timing + "\n")
    return vecin
def WMulti(files, file, testSet, fout, features, name, clfName, dp, convertToBinary=True):
    """Train on earlier files of the same project and evaluate on ``testSet``.

    A file from ``files`` is used for training when its first three
    characters equal those of ``file`` and it compares lower than ``file``
    with ``<``. If at least one such file exists they are loaded through
    ``DPLIB.LoadCSV``, a classifier from ``GLOB(clfName)`` is trained and the
    measures and AUC on ``testSet`` are logged. Otherwise a ``!!!``
    placeholder line is logged.

    Args:
        files: iterable of candidate file names.
        file: name of the file under test.
        testSet: 2-D indexable data; column ``-1`` holds the actual values.
        fout: object with a ``write`` method receiving the log line.
        features: forwarded to ``DPLIB.LoadCSV``.
        name: method label used in the log line.
        clfName: classifier identifier passed to ``GLOB``.
        dp: forwarded to ``DPLIB.LoadCSV``.
        convertToBinary: forwarded to ``DPLIB.LoadCSV``.

    Returns:
        None.
    """
    prefix = file[0:3]
    # The comparison used to read ``file2 < file < 0``; Python chains that
    # into ``file2 < file and file < 0`` and comparing a str with 0 raises
    # TypeError, so any earlier release crashed the call.
    train = [
        file2 for file2 in files
        if file2[0:3] == prefix and file2 < file
    ]
    tag = name + ":" + file + ": "

    if not train:
        print(tag + "!!!" + " AUC = !!!")
        fout.write("\n" + tag + "!!!")
        return

    trainSet = DPLIB.LoadCSV(train, dp, features, convertToBinary)
    if "infogain" in name.lower():
        # Info-gain feature selection is not implemented in this function;
        # the branch is intentionally a no-op.
        pass

    l = GLOB(clfName).getClassifier()
    l.buildClassifier(trainSet)
    vec = l.evaluateModel(testSet)
    actual = testSet[:, -1]
    tvals = DPLIB.getConfMatrix(actual, vec)
    auc = DPLIB.getAUC(actual, vec)
    vals = DPLIB.getMeasures(tvals)
    print(tag + str(vals) + " AUC = " + str(auc))
    fout.write("\n" + tag + " AUC = " + str(auc) + ";" + "Vals=" + str(vals))
def LOC50(testSeti, file, fout, name, locIndex):
    """Predict 1 for rows whose ``locIndex`` column is at or above its median.

    The predictions are compared with the last column of ``testSeti``; the
    confusion matrix, measures, AUC and elapsed time are printed and written
    to ``fout``.

    Args:
        testSeti: 2-D indexable data; column ``-1`` holds the actual values.
        file: data-set label used in the log lines.
        fout: object with a ``write`` method receiving the log lines.
        name: method label used in the log lines.
        locIndex: index of the column whose median is the decision threshold.

    Returns:
        None.
    """
    began = Common.getCurrentTimeMil()
    # The clock is sampled a second time here; the value is never read.
    Common.getCurrentTimeMil()
    tag = name + ":" + file + ": "

    column = testSeti[:, locIndex]
    threshold = np.median(column)
    predicted = [int(value >= threshold) for value in column]
    actual = testSeti[:, -1]

    confusion = DPLIB.getConfMatrix(actual, predicted)
    print("#CONF-TEST-" + tag + str(confusion))
    for piece in ("#CONF-TEST-" + tag, str(confusion), "\n"):
        fout.write(piece)

    measures = DPLIB.getMeasures(confusion)
    area = DPLIB.getAUC(actual, predicted)
    print(tag + str(measures) + " AUC = " + str(area))
    for piece in (tag, str(measures), " AUC = ", str(area), "\n"):
        fout.write(piece)

    elapsed = Common.getCurrentTimeMil() - began
    # Both timing lines report the same total elapsed time.
    for label in ("#TIME-FOR:", "#TIME-FOR-IS:"):
        line = label + tag + str(elapsed)
        print(line)
        fout.write(line + "\n")
def CreateBuckets(self, trainSet, testSet, vSets, name, testCut, iternum, save, superbit, stages, buckets, doprint, clfName, tunelrn):
    """Bucket ``trainSet`` with LSH, pick the best bucket and test with it.

    Steps performed:
      1. Optionally dump the train, test and validation sets through
         ``DPLIB.SaveToCsv``.
      2. Shuffle ``trainSet`` and ``testSet`` in place.
      3. Hash every training row with ``LSHSuperBit`` and group the rows by
         the first component of the hash.
      4. Train one classifier per bucket, score it on every validation set
         and wrap the averaged scores in ``CHRM_GIS`` (or ``CHRM_GIS_Count``
         when ``self.isCount`` is true).
      5. Order the candidates with ``DPLIB.MySort``, take the first one,
         train a classifier on its ``ds`` and evaluate it on ``testSet``.

    Reads ``self.isCount`` and ``self.file``; appends the collected log
    lines to ``self.output``.

    Args:
        trainSet: 2-D array of training rows; shuffled in place.
        testSet: 2-D array of test rows; shuffled in place; column ``-1``
            holds the actual values.
        vSets: list of validation sets, or ``None`` to validate on
            ``trainSet`` itself.
        name: method label used in log lines and file names.
        testCut: when true, search a cut-off in 0.1 steps per validation
            set instead of using ``DPLIB.DefaultCF``.
        iternum: iteration number, used only in saved file names.
        save: when true, write the data sets with ``DPLIB.SaveToCsv``.
        superbit: when true, hash with ``LSHSuperBit``.
        stages: forwarded to ``LSHSuperBit``.
        buckets: forwarded to ``LSHSuperBit``.
        doprint: not used by this method.
        clfName: classifier identifier passed to ``GLOB``.
        tunelrn: forwarded to ``GLOB``; when true the final learner is tuned.

    Returns:
        ``None`` if ``LSHSuperBit`` cannot be constructed; otherwise a tuple
        ``(top, out)`` with the selected candidate and the list of log lines.
    """
    out = []
    # The measure functions are called with dummy inputs; only the names of
    # the measures they return are used.
    if self.isCount:
        keySet = list(DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
    else:
        keySet = list(DPLIB.getExtMeasures({"tp": 1, "tn": 2, "fp": 3, "fn": 4}).keys())

    tag = name + ":" + self.file + ": "
    out.append("#STARTED FOR-" + tag)
    startTime = Common.getCurrentTimeMil()
    spentIsTime = 0
    out.append("#Using also Label For train in LSH")

    if vSets is None:
        vSets = [trainSet]

    if save:
        suffix = ("ITER=" + str(iternum) + "--" + "METHOD=" + name
                  + "--FILE=" + self.file + "--")
        DPLIB.SaveToCsv(trainSet, "MAIN-TRAIN-FILE-" + suffix)
        DPLIB.SaveToCsv(testSet, "MAIN-TEST-FILE-" + suffix)
        # Each validation set is saved under its own index (the training set
        # used to be written under every VSET file name).
        for i, vSet in enumerate(vSets):
            DPLIB.SaveToCsv(vSet, "VSET-FILE-" + "INDEX=" + str(i) + suffix)

    np.random.shuffle(trainSet)
    np.random.shuffle(testSet)

    tempTime = Common.getCurrentTimeMil()
    # Hash dimension: number of columns minus one, matching the
    # ``trainSet[i, 1:]`` slice hashed below.
    n = trainSet.shape[1] - 1
    try:
        lshsuper = LSHSuperBit(stages=stages, buckets=buckets, dimensions=n)
    except Exception as ex:
        print('##SuperBit with specified parameters failed:' + str(ex))
        return None

    # Group the training rows by the first component of their LSH hash.
    bins = {}
    for row in trainSet:
        # NOTE(review): only SuperBit hashing is implemented; with
        # ``superbit`` false the signature stays None and the indexing below
        # raises TypeError.
        signature = lshsuper.hash(row[1:].tolist()) if superbit else None
        bins.setdefault(signature[0], []).append(row)
    spentIsTime += Common.getCurrentTimeMil() - tempTime

    bins = {key: np.array(rows) for key, rows in bins.items()}
    out.append("#Number of BINS:" + tag + str(len(bins)))

    pop = []
    for trSet in bins.values():
        l = GLOB(clfName, tunelrn).getClassifier()
        l.buildClassifier(trSet)

        cf = 0
        allvecs = []
        confs = []
        allcfs = []
        allaucs = []
        valsA = None
        confsA = None
        aucA = 0.0

        for vSet in vSets:
            vec = l.evaluateModel(vSet)
            actuals = vSet[:, -1]

            if self.isCount:
                vals = DPLIB.getMeasuresCount(actuals, vec)
            else:
                auc = DPLIB.getAUC(actuals, vec)
                aucA += auc
                allaucs.append(auc)

                if testCut:
                    # Try cut-offs starting at 0.1 in steps of 0.1 and keep
                    # the one maximising F * GMean1.
                    vCF = 0.1
                    bestCF = 0
                    bestCFVal = -1
                    bestVals = None
                    while True:
                        tvals = DPLIB.getConfMatrix(actuals, vec, vCF)
                        measures = DPLIB.getMeasures(tvals)
                        fit = measures["F"] * measures["GMean1"]
                        if fit > bestCFVal or bestVals is None:
                            bestCFVal = fit
                            bestCF = vCF
                            bestVals = tvals
                        vCF += 0.1
                        if vCF >= 1:
                            break
                    chosen = bestVals
                    chosenCF = bestCF
                    cf += bestCF
                else:
                    chosen = DPLIB.getConfMatrix(actuals, vec)
                    chosenCF = DPLIB.DefaultCF

                if confsA is None:
                    confsA = {key: 0 for key in chosen.keys()}
                for key in confsA:
                    confsA[key] += chosen[key]
                confs.append(chosen)
                vals = DPLIB.getMeasures(chosen)
                allcfs.append(chosenCF)

            allvecs.append(vals)
            if valsA is None:
                valsA = {key: 0 for key in keySet}
            for key in keySet:
                valsA[key] += vals[key]

        # Turn the sums over the validation sets into averages.
        numSets = len(vSets)
        for key in keySet:
            valsA[key] /= numSets

        if self.isCount:
            h = CHRM_GIS_Count(trSet, valsA)
            h.fitnesses = allvecs
        else:
            for key in confsA:
                confsA[key] /= numSets
            if testCut:
                cf /= numSets
            aucA /= numSets
            h = CHRM_GIS(trSet, valsA, aucA)
            h.fitnesses = allvecs
            h.aucs = allaucs
            h.conf = confsA
            h.confs = confs
            h.allcfs = allcfs
            h.bestCF = cf if testCut else DPLIB.DefaultCF
        pop.append(h)

    tempTime = Common.getCurrentTimeMil()
    pop = DPLIB.MySort(pop)
    spentIsTime += Common.getCurrentTimeMil() - tempTime

    top = pop[0]
    out.append("#Instances in Top:" + str(len(top.ds)))
    out.append("#STAGES:" + tag + str(stages))
    out.append("#BUCKETS:" + tag + str(buckets))
    if not self.isCount:
        out.append("#BEST-CF-VALUE:" + tag + str(top.bestCF))

    l = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        # NOTE(review): ``fout`` is neither a parameter nor a local of this
        # method; unless it exists at module level this call raises
        # NameError. Confirm where the tuning log should go.
        l = l.getTunedCLF(top.ds, vSets, fout, name, self.file)
        out.append("#TUNE-LRN-PARAMS-" + tag + str(l.selectedParams))
        sCheck = l.getCLFOptions()
        out.append("#SETSET-LRN-PARAMS-" + tag + str(sCheck))

    l.buildClassifier(top.ds)
    vec = l.evaluateModel(testSet)
    out.append("#LSH-FOR-TOP-ONLY")

    actual = testSet[:, -1]
    if self.isCount:
        vals = DPLIB.getMeasuresCount(actual, vec)
        out.append(tag + str(vals))
    else:
        tvals = DPLIB.getConfMatrix(actual, vec, top.bestCF)
        out.append("#CONF-TEST-" + tag + str(tvals))
        vals = DPLIB.getMeasures(tvals)
        vals['auc'] = DPLIB.getAUC(actual, vec)
        out.append(tag + str(vals))

    elapsed = Common.getCurrentTimeMil() - startTime
    if name.find("LSHTune") < 0:
        out.append("#TIME-FOR:" + tag + str(elapsed))
        out.append("#TIME-FOR-IS:" + tag + str(spentIsTime))

    # Append this run's lines to the log kept on the instance. The previous
    # ``self.output = +out`` applied unary plus to a list (TypeError).
    # NOTE(review): assumes ``self.output`` is a list when it already exists.
    previous = getattr(self, "output", None)
    self.output = previous + out if previous else list(out)

    top.addToExtra("SPENT-TIME-IS", float(spentIsTime))
    return top, out
def WCFolds(testSet, folds, file, fout, name, clfName):
    """Run stratified k-fold cross-validation inside ``testSet`` and log it.

    A shuffled copy of ``testSet`` is split with
    ``StratifiedKFold(n_splits=folds)`` using the last column as the class.
    For every fold a classifier from ``GLOB(clfName)`` is trained and
    evaluated; the per-fold confusion matrices are summed key by key and the
    AUC is computed with ``DPLIB.getAUCCV`` over all folds.

    Args:
        testSet: 2-D array; column ``-1`` holds the actual values. It is not
            modified.
        folds: number of folds.
        file: data-set label used in the log line.
        fout: object with a ``write`` method receiving the log line.
        name: method label used in the log line.
        clfName: classifier identifier passed to ``GLOB``.

    Returns:
        None.
    """
    # Shuffle a real copy: ``testSet[:, :]`` is only a view on a NumPy array,
    # so shuffling it would reorder the caller's data as well.
    shuffled = np.array(testSet, copy=True)
    np.random.shuffle(shuffled)

    X = shuffled[:, :-1]
    y = shuffled[:, -1]
    skf = StratifiedKFold(n_splits=folds)

    preds = []
    actuals = []
    vals = None
    for train_index, test_index in skf.split(X, y):
        # Row selection keeps features and label column together.
        cvtrain = shuffled[train_index]
        cvtest = shuffled[test_index]
        cvtestY = y[test_index]

        if "infogain" in name.lower():
            # Info-gain feature selection is not implemented in this
            # function; the branch is intentionally a no-op.
            pass

        m = GLOB(clfName).getClassifier()
        m.buildClassifier(cvtrain)
        vec = m.evaluateModel(cvtest)
        preds.append(vec)
        actuals.append(cvtestY)

        fold_conf = DPLIB.getConfMatrix(cvtestY, vec)
        if vals is None:
            vals = fold_conf
        else:
            for key in vals:
                vals[key] += fold_conf[key]

    auc = DPLIB.getAUCCV(actuals, preds)
    vals1 = DPLIB.getMeasures(vals)
    tag = name + ":" + file + ": "
    print(tag + str(vals1) + " AUC = " + str(auc))
    fout.write("\n" + tag + " AUC = " + str(auc) + ";" + "Vals=" + str(vals1))
def NNFilter(trainSeti, testSet, file, fout, name, vecin, count, clfName, tunelrn, vSets, testCut):
    """Filter training data with ``DPLIB.NNFilter`` and evaluate on ``testSet``.

    With ``count == 0`` the neighbour count is searched over 1..10: for each
    value the filtered set is used to train a classifier that is scored on
    every validation set with ``F * GMean1``, and the count with the highest
    average is kept. If ``testCut`` is also true, a cut-off is searched in
    0.1 steps per validation set on that best filtered set and the average
    of the best cut-offs is used for the final confusion matrix. With any
    other ``count`` the data is filtered once with that value and
    ``DPLIB.DefaultCF`` is used.

    Args:
        trainSeti: candidate training data passed to ``DPLIB.NNFilter``.
        testSet: 2-D indexable data; column ``-1`` holds the actual values.
        file: data-set label used in the log lines.
        fout: object with a ``write`` method receiving the log lines.
        name: method label used in the log lines.
        vecin: ignored on input; the name is reused for the predictions.
        count: neighbour count, or 0 to search for it.
        clfName: classifier identifier passed to ``GLOB``.
        tunelrn: whether learners are tuned with ``getTunedCLF``.
        vSets: list of validation sets.
        testCut: whether to search the cut-off (only applied when
            ``count == 0``).

    Returns:
        The value returned by ``evaluateModel(testSet)``.
    """
    startTime = Common.getCurrentTimeMil()
    spentISTime = 0
    bestFit = 0.0
    bestCount = 0
    btrainSet = None
    cfbf = DPLIB.DefaultCF
    tag = name + ":" + file + ": "

    def new_classifier(data):
        # Fresh learner, tuned against the validation sets when requested.
        clf = GLOB(clfName, tunelrn).getClassifier()
        if tunelrn:
            clf = clf.getTunedCLF(data, vSets, fout, name, file)
        return clf

    def report(label, value):
        # Same line on the console and in the log file.
        line = label + tag + str(value)
        print(line)
        fout.write(line + "\n")

    if count == 0:
        for k in range(1, 11):
            tempTime = Common.getCurrentTimeMil()
            candidate = DPLIB.NNFilter(trainSeti, testSet, k)
            spentISTime += Common.getCurrentTimeMil() - tempTime

            l = new_classifier(candidate)
            l.buildClassifier(candidate)

            avgFit = 0.0
            for vSet in vSets:
                vec = l.evaluateModel(vSet)
                tvals = DPLIB.getConfMatrix(vSet[:, -1], vec)
                measures = DPLIB.getExtMeasures(tvals)
                avgFit += measures["F"] * measures["GMean1"]
            avgFit /= len(vSets)

            improved = avgFit > bestFit
            # Keep the first candidate as a fallback so a training set exists
            # even when no count scores above the initial 0.0.
            if improved or btrainSet is None:
                bestCount = k
                btrainSet = candidate[:, :]
                if improved:
                    bestFit = avgFit

        # NOTE(review): the cut-off search is assumed to apply only to the
        # count-search mode, since it trains on the best filtered set.
        if testCut:
            l = new_classifier(btrainSet)
            l.buildClassifier(btrainSet)
            cf = 0
            for vSet in vSets:
                vec = l.evaluateModel(vSet)
                vCF = 0.1
                bestCF = 0
                bestCFVal = -1
                bestVals = None
                while True:
                    tvals = DPLIB.getConfMatrix(vSet[:, -1], vec, vCF)
                    measures = DPLIB.getExtMeasures(tvals)
                    fit = measures["F"] * measures["GMean1"]
                    if fit > bestCFVal or bestVals is None:
                        bestCFVal = fit
                        bestCF = vCF
                        bestVals = tvals
                    vCF += 0.1
                    if vCF >= 1:
                        break
                cf += bestCF
            # ``vSets`` is a list: use len(); ``vSets.size()`` does not exist.
            cf /= len(vSets)
            cfbf = cf

    if count == 0:
        trainSet = btrainSet
    else:
        tempTime = Common.getCurrentTimeMil()
        trainSet = DPLIB.NNFilter(trainSeti, testSet, count)
        spentISTime = Common.getCurrentTimeMil() - tempTime
        bestCount = count

    l = new_classifier(trainSet)
    if tunelrn:
        report("#TUNE-LRN-PARAMS-", l.selectedParams)
        report("#SETSET-LRN-PARAMS-", l.getCLFOptions())

    l.buildClassifier(trainSet)
    vecin = l.evaluateModel(testSet)
    actual = testSet[:, -1]
    tvals = DPLIB.getConfMatrix(actual, vecin, cfbf)

    if count == 0:
        report("#BESTCOUNT-", bestCount)
        report("#BESTFIT-", bestFit)
    report("#CONF-TEST-", tvals)
    if testCut:
        report("#NN-BEST-CF-VALUE:", cfbf)

    vals = DPLIB.getMeasures(tvals)
    auc = DPLIB.getAUC(actual, vecin)
    result = tag + str(vals) + " AUC = " + str(auc)
    print(result)
    fout.write(result + "\n")

    elapsed = Common.getCurrentTimeMil() - startTime
    report("#TIME-FOR:", elapsed)
    report("#TIME-FOR-IS:", spentISTime)
    return vecin