def CreateBuckets(self, trainSet, testSet, vSets, name, testCut, iternum,
                  save, superbit, stages, buckets, doprint, clfName, tunelrn):
    """Bucket the training rows with SuperBit LSH and test the best bucket.

    Every training row is hashed with ``LSHSuperBit`` and grouped by the
    first element of its hash. One classifier is built per bucket and
    evaluated on every validation set; the buckets are ranked with
    ``DPLIB.MySort`` and the first-ranked bucket is used to build the
    classifier that is finally evaluated on ``testSet``.

    Args:
        trainSet: 2-D NumPy array of training rows; shuffled in place.
            Column 0 is left out of the hashed vector and the remaining
            columns (including the last one) are hashed.
        testSet: 2-D NumPy array of test rows; shuffled in place. The last
            column is used as the actual value.
        vSets: list of validation arrays, or ``None`` to validate on
            ``trainSet`` itself.
        name: method label used in every report line.
        testCut: when true (binary mode only), search for the confidence
            cut-off maximising ``F * GMean1`` on each validation set.
        iternum: iteration number, used only in the saved file names.
        save: when true, dump the train, test and validation sets via
            ``DPLIB.SaveToCsv``.
        superbit: must be true; the MinHash alternative is not implemented.
        stages: passed to ``LSHSuperBit``.
        buckets: passed to ``LSHSuperBit``.
        doprint: not used by this method.
        clfName: classifier name handed to ``GLOB``.
        tunelrn: tuning flag handed to ``GLOB``; when true the final
            classifier is tuned with ``getTunedCLF``.

    Returns:
        ``(top, out)`` where ``top`` is the first-ranked bucket chromosome
        and ``out`` the list of report lines, or ``None`` when the
        ``LSHSuperBit`` hasher cannot be constructed.

    Raises:
        NotImplementedError: if ``superbit`` is false.
    """
    out = []
    tag = name + ":" + self.file + ": "
    if self.isCount:
        keySet = list(
            DPLIB.getMeasuresCount([0, 1, 2, 3], [0, 1, 2, 3]).keys())
    else:
        keySet = list(
            DPLIB.getExtMeasures({
                "tp": 1,
                "tn": 2,
                "fp": 3,
                "fn": 4
            }).keys())

    out.append("#STARTED FOR-" + tag)
    startTime = Common.getCurrentTimeMil()
    spentIsTime = 0
    out.append("#Using also Label For train in LSH")

    if vSets is None:
        vSets = [trainSet]

    if save:
        suffix = ("ITER=" + str(iternum) + "--" + "METHOD=" + name +
                  "--FILE=" + self.file + "--")
        DPLIB.SaveToCsv(trainSet, "MAIN-TRAIN-FILE-" + suffix)
        DPLIB.SaveToCsv(testSet, "MAIN-TEST-FILE-" + suffix)
        for i, vSet in enumerate(vSets):
            # Save the validation set itself (the training set used to be
            # written here once per validation set).
            DPLIB.SaveToCsv(vSet, "VSET-FILE-" + "INDEX=" + str(i) + suffix)

    np.random.shuffle(trainSet)
    np.random.shuffle(testSet)

    tempTime = Common.getCurrentTimeMil()
    # Dimensionality of the hashed vectors: every column except the first.
    n = trainSet.shape[1] - 1
    try:
        lshsuper = LSHSuperBit(stages=stages, buckets=buckets, dimensions=n)
    except Exception as ex:
        print('##SuperBit with specified parameters failed:' + str(ex))
        return None

    if not superbit:
        # Only the SuperBit hasher exists; fail with a clear message instead
        # of indexing into a missing hash.
        raise NotImplementedError(
            "MinHash bucketing is not implemented; call with superbit=True")

    # Compute a SuperBit signature for each row and bucket by its first stage.
    bins = {}
    for row in trainSet:
        signature = lshsuper.hash(row[1:].tolist())
        bins.setdefault(signature[0], []).append(row)
    spentIsTime += Common.getCurrentTimeMil() - tempTime

    for binid in bins:
        bins[binid] = np.array(bins[binid])
    out.append("#Number of BINS:" + tag + str(len(bins)))

    pop = []
    for trSet in bins.values():
        clf = GLOB(clfName, tunelrn).getClassifier()
        clf.buildClassifier(trSet)

        cf = 0
        allvecs = []
        confs = []
        allcfs = []
        allaucs = []
        valsA = None
        confsA = None
        aucA = 0.0

        for vSet in vSets:
            vec = clf.evaluateModel(vSet)
            actuals = vSet[:, -1]
            if self.isCount:
                vals = DPLIB.getMeasuresCount(actuals, vec)
            else:
                auc = DPLIB.getAUC(actuals, vec)
                aucA += auc
                allaucs.append(auc)
                if testCut:
                    # Search the cut-off with the best F * GMean1.
                    # NOTE(review): the 0.1 step is accumulated in floating
                    # point, so a cut-off just below 1.0 is evaluated too;
                    # kept unchanged to preserve results.
                    vCF = 0.1
                    bestCF = 0
                    bestCFVal = -1
                    bestVals = None
                    while True:
                        tvals = DPLIB.getConfMatrix(actuals, vec, vCF)
                        measures = DPLIB.getMeasures(tvals)
                        fit = measures["F"] * measures["GMean1"]
                        if fit > bestCFVal or bestVals is None:
                            bestCFVal = fit
                            bestCF = vCF
                            bestVals = tvals
                        vCF += 0.1
                        if vCF >= 1:
                            break
                    chosenConf = bestVals
                    chosenCF = bestCF
                    cf += bestCF
                else:
                    chosenConf = DPLIB.getConfMatrix(actuals, vec)
                    chosenCF = DPLIB.DefaultCF

                if confsA is None:
                    confsA = {key: 0 for key in chosenConf.keys()}
                for key in confsA:
                    confsA[key] += chosenConf[key]
                confs.append(chosenConf)
                vals = DPLIB.getMeasures(chosenConf)
                allcfs.append(chosenCF)

            allvecs.append(vals)
            if valsA is None:
                valsA = {key: 0 for key in keySet}
            for key in keySet:
                valsA[key] += vals[key]

        # Average the accumulated measures over the validation sets.
        for key in keySet:
            valsA[key] /= len(vSets)

        if not self.isCount:
            for key in confsA:
                confsA[key] /= len(vSets)
            if testCut:
                cf /= len(vSets)
            aucA /= len(vSets)
            h = CHRM_GIS(trSet, valsA, aucA)
            h.fitnesses = allvecs
            h.aucs = allaucs
            h.conf = confsA
            h.confs = confs
            h.allcfs = allcfs
            h.bestCF = cf if testCut else DPLIB.DefaultCF
        else:
            h = CHRM_GIS_Count(trSet, valsA)
            h.fitnesses = allvecs
        pop.append(h)

    tempTime = Common.getCurrentTimeMil()
    pop = DPLIB.MySort(pop)
    spentIsTime += Common.getCurrentTimeMil() - tempTime

    top = pop[0]
    out.append("#Instances in Top:" + str(len(top.ds)))
    out.append("#STAGES:" + tag + str(stages))
    out.append("#BUCKETS:" + tag + str(buckets))
    if not self.isCount:
        out.append("#BEST-CF-VALUE:" + tag + str(top.bestCF))

    clf = GLOB(clfName, tunelrn).getClassifier()
    if tunelrn:
        # NOTE(review): `fout` and `file` are not defined in this method;
        # unless they are module-level names this call raises NameError.
        # Confirm the intended arguments (possibly `self.file`).
        clf = clf.getTunedCLF(top.ds, vSets, fout, name, file)
        out.append("#TUNE-LRN-PARAMS-" + tag + str(clf.selectedParams))
        sCheck = clf.getCLFOptions()
        out.append("#SETSET-LRN-PARAMS-" + tag + str(sCheck))

    clf.buildClassifier(top.ds)
    vec = clf.evaluateModel(testSet)
    out.append("#LSH-FOR-TOP-ONLY")
    if self.isCount:
        vals = DPLIB.getMeasuresCount(testSet[:, -1], vec)
        out.append(tag + str(vals))
    else:
        tvals = DPLIB.getConfMatrix(testSet[:, -1], vec, top.bestCF)
        out.append("#CONF-TEST-" + tag + str(tvals))
        vals = DPLIB.getMeasures(tvals)
        vals['auc'] = DPLIB.getAUC(testSet[:, -1], vec)
        out.append(tag + str(vals))

    # Drop the references to the per-bucket data before returning.
    for i in range(len(pop)):
        pop[i] = None
    pop = None
    for binid in bins:
        bins[binid] = None
    bins = None

    time = Common.getCurrentTimeMil() - startTime
    if name.find("LSHTune") < 0:
        out.append("#TIME-FOR:" + tag + str(time))
        out.append("#TIME-FOR-IS:" + tag + str(spentIsTime))
        # Was `self.output = +out`, which applies unary plus to a list and
        # raises TypeError; the report lines are appended instead.
        if getattr(self, "output", None) is None:
            self.output = []
        self.output += out

    top.addToExtra("SPENT-TIME-IS", float(spentIsTime))
    return top, out
def run(self, trainSeti, testSeti, name, fout, vSets, vSetType,
        fixedTrainSize, log, ignoreOK, threshold, thresholds, rejectedFits,
        rejectedPerfs, rejectedTestPerfs, clfName):
    """Evolve training subsets per test part and report test performance.

    The shuffled test set is split into ``self.numParts`` parts. For each
    part a population of random training subsets is created, scored by the
    summed p-values from ``DPLIB.checkSimilarity`` against the part, and
    evolved for up to ``self.numGens`` generations with ``GA`` crossover and
    mutation. Classifiers built on the first ``self.count`` chromosomes
    predict the part; predictions are accumulated over the parts and the
    overall measures are printed through ``self.prnt``.

    Args:
        trainSeti: 2-D NumPy array of training rows (copied, not modified
            here; it is also passed to ``DPLIB.SetBugCountForMut`` in count
            mode).
        testSeti: 2-D NumPy array of test rows (copied before shuffling).
        name: method label used in the report lines.
        fout: not used by this method.
        vSets: list of validation sets, or ``None``/empty to derive one from
            ``vSetType``. A list passed in empty is appended to in place.
            The validation sets are only reported here, not used for
            fitness.
        vSetType: one of ``'Train Set'``, ``'NN-Filter'``,
            ``'Multiple Random'``, ``'Single Random'``, ``'!!TEST!!'`` or
            ``'KS2'``.
        fixedTrainSize: when true every chromosome has ``self.chrmSize``
            rows; otherwise a random size is drawn.
        log: when true, option and validation-set statistics are printed.
        ignoreOK: not used by this method.
        threshold: not used by this method.
        thresholds: list receiving the best fitness of every part.
        rejectedFits, rejectedPerfs, rejectedTestPerfs: lists filled only on
            the rejection path (``isOK`` false), which the current code
            never takes.
        clfName: classifier name handed to ``GLOB``.

    Returns:
        ``isOK`` — always ``True`` in the current implementation.
    """

    def has_two_classes(ds):
        # A subset is usable only when its last column holds at least two
        # distinct values.
        return len(set(list(ds[:, -1]))) >= 2

    def report_vset(part, data):
        stats = DPLIB.getStats(data, True, True, True)
        self.prnt("#VSETINFO;;prt=" + str(part) + ";;For=" + name + "@" +
                  ":" + stats + "\n")

    def new_chromosome(ds):
        # Chromosome whose fitness is taken from the 'p-val' extra.
        if self.isCount:
            return CHRM_GIS_Count(ds, None, extraAsFitness='p-val')
        return CHRM_GIS(ds, None, None, extraAsFitness='p-val')

    mad = 0.0
    if self.isCount:
        mad = DPLIB.SetBugCountForMut(trainSeti)

    startTime = Common.getCurrentTimeMil()
    spentISTime = 0

    preds = []  # binary mode: per-learner predictions over all parts
    actuals = []  # count mode: per-learner actual values
    prrs = []  # count mode: per-learner predictions
    pop = []

    trainSet = np.copy(trainSeti)
    testSet = np.copy(testSeti)
    tstSize = len(testSet)
    partSize = int(tstSize / self.numParts)

    if log:
        self.prnt("#GIS-OPTIONS;;For=" + name + "@" + ":iters=" +
                  str(self.iters) + "-POPSIZE=" + str(self.popSize) +
                  "-NumParts=" + str(self.numParts) + "-NumGens=" +
                  str(self.numGens) + "-sizeTop=" + str(self.sizeTopP) +
                  "-Learner=" + clfName + "\n")

    isOK = True
    np.random.shuffle(testSet)
    self.FinalLearners = []
    self.FinalDatasets = []

    for p in range(self.numParts):
        self.prnt("\n" + str(p) + ": ")
        tempTime = Common.getCurrentTimeMil()
        pop = []
        start = p * partSize
        end = min((p + 1) * partSize, tstSize)
        if p == self.numParts - 1:
            # The last part absorbs the remainder of the test set.
            end = tstSize
        testPart = testSet[start:end, :]
        spentISTime += Common.getCurrentTimeMil() - tempTime

        if vSets is None or len(vSets) == 0:
            if vSets is None:
                vSets = []
            vSet = None
            if vSetType == 'Train Set':
                vSet = trainSeti
            elif vSetType == 'NN-Filter':
                tempTime = Common.getCurrentTimeMil()
                vSet = DPLIB.NNFilter(trainSet, testPart, 1)
                spentISTime += Common.getCurrentTimeMil() - tempTime
            elif vSetType in ('Multiple Random', 'Single Random'):
                # No validation set was supplied: draw distinct training rows,
                # as many as the test part has (capped at the training size).
                size = min(len(testPart), len(trainSet))
                picked = np.random.choice(len(trainSet), size=size,
                                          replace=False)
                vSet = trainSet[picked]
            elif vSetType == '!!TEST!!':
                # Upper bound. Should not be used.
                self.prnt("Should not be used.")
                vSet = testSeti
            # 'KS2' (and any unrecognised type) leaves vSet as None.
            if log and vSet is not None:
                report_vset(p, vSet)
            vSets.append(vSet)
        elif log:
            for vSet in vSets:
                report_vset(p, vSet)

        # Initial population: random subsets sampled with replacement.
        for _ in range(self.popSize):
            tempTime = Common.getCurrentTimeMil()
            if fixedTrainSize:
                size = self.chrmSize
            else:
                size = np.random.randint(self.chrmSize) + 10
            while True:
                trSet = [
                    trainSet[np.random.randint(len(trainSet))]
                    for _ in range(size)
                ]
                spentISTime += Common.getCurrentTimeMil() - tempTime
                trSet = np.array(trSet)
                if has_two_classes(trSet):
                    break

            tempTime = Common.getCurrentTimeMil()
            _, p_vals = DPLIB.checkSimilarity(trSet[:, :-1],
                                              testPart[:, :-1])
            h = new_chromosome(trSet)
            h.addToExtra('p-val', sum(p_vals))
            pop.append(h)
            spentISTime += Common.getCurrentTimeMil() - tempTime

        tempTime = Common.getCurrentTimeMil()
        pop = DPLIB.MySort(pop)
        spentISTime += Common.getCurrentTimeMil() - tempTime

        cnt = 0
        for g in range(self.numGens):
            self.prnt(str(g) + " ")
            tempTime = Common.getCurrentTimeMil()

            # Carry the first sizeTopP chromosomes over unchanged.
            newPop = [pop[i] for i in range(self.sizeTopP)]

            for _ in range(0, len(pop) - self.sizeTopP, 2):
                idx1 = 0
                idx2 = 0
                while idx1 == idx2:
                    if cnt >= 3:
                        # Tournament kept returning the same parent; fall
                        # back to uniform random picks.
                        idx1 = np.random.randint(len(pop))
                        idx2 = np.random.randint(len(pop))
                    else:
                        idx1 = GA.tornament(pop)
                        idx2 = GA.tornament(pop)
                        cnt += 1
                cnt = 0

                ds1 = pop[idx1].ds
                ds2 = pop[idx2].ds
                while True:
                    ds1, ds2 = GA.crossOver(ds1, ds2, fixedTrainSize,
                                            isCount=self.isCount)
                    if has_two_classes(ds1) and has_two_classes(ds2):
                        break
                    self.prnt('repeat cross')
                while True:
                    ds1 = GA.Mutate(ds1, isCount=self.isCount, mad=mad)
                    if has_two_classes(ds1):
                        break
                    self.prnt(
                        'repeat mut ds1, because all elements are of type one class'
                    )
                while True:
                    ds2 = GA.Mutate(ds2, isCount=self.isCount, mad=mad)
                    if has_two_classes(ds2):
                        break
                    self.prnt(
                        'repeat mut ds2, because all elements are of type one class'
                    )

                newPop.append(new_chromosome(ds1))
                newPop.append(new_chromosome(ds2))
            spentISTime += Common.getCurrentTimeMil() - tempTime

            for chrm in newPop:
                tempTime = Common.getCurrentTimeMil()
                _, p_vals = DPLIB.checkSimilarity(chrm.ds[:, :-1],
                                                  testPart[:, :-1])
                chrm.addToExtra('p-val', sum(p_vals))
                spentISTime += Common.getCurrentTimeMil() - tempTime

            tempTime = Common.getCurrentTimeMil()
            newPop = DPLIB.MySort(newPop)
            countComp = 0
            newPop, _ = DPLIB.CombinePops(pop, newPop)

            diff = abs(
                GA.GetMeanFittness(pop, countComp) -
                GA.GetMeanFittness(newPop, countComp))
            converged = diff < 0.000001
            pop = newPop
            # Stop early once the mean fitness stalls and the best one is
            # positive.
            if pop[0].getFitness() > 0.0 and converged:
                break
            spentISTime += Common.getCurrentTimeMil() - tempTime

        if self.count == 0:
            self.count = len(pop)

        for i in range(self.count):
            clf = GLOB(clfName).getClassifier()
            tds = pop[i].ds
            self.FinalLearners.append(clf)
            self.FinalDatasets.append(tds)
            clf.buildClassifier(tds)
            if self.isCount:
                actual = DPLIB.getActuals(testPart)
                prr = clf.evaluateModel(testPart)
                if len(actuals) == self.count:
                    # NOTE(review): `+` concatenates lists but adds NumPy
                    # arrays element-wise; confirm the types returned by
                    # getActuals/evaluateModel.
                    actuals[i] = actuals[i] + actual
                    prrs[i] = prrs[i] + prr
                else:
                    actuals.append(actual)
                    prrs.append(prr)
            else:
                vec = clf.evaluateModel(testPart)
                if len(preds) == self.count:
                    preds[i] += list(vec)
                else:
                    preds.append(list(vec))

        isOK = True
        thresholds.append(pop[0].getFitness())
        self.prnt()
        self.prnt("Best Top Fitness:" + str(pop[0].fitness))
        self.prnt("Best Fitness (mean):", pop[0].getMeanFitness())

    if self.isCount:
        vals = DPLIB.getMeasuresCountSet(actuals, prrs)
    else:
        vals1 = DPLIB.getConfMatrixSet(testSet[:, -1], preds)
        vals = DPLIB.getMeasures(vals1)

    if isOK:
        if not self.isCount:
            if len(preds) == 1:
                auc = DPLIB.getAUC(testSet[:, -1], preds[0])
            else:
                auc = DPLIB.getAUCSet(testSet[:, -1], preds)
            vals['auc'] = auc
            self.prnt()
            self.prnt("#CONF-TEST:" + name + ":" + self.file + ": " +
                      str(vals1))
            self.prnt()
            self.prnt(name + ":" + self.file + ": " + str(vals))
            self.prnt()
        else:
            self.prnt()
            self.prnt(name + ":" + self.file + ": " + str(vals))
            self.prnt()
    else:
        bestI = pop[0]
        rejectedFits.append(bestI.getFitness())
        rejectedPerfs.append(copy.deepcopy(bestI.fitness))
        rejectedTestPerfs.append(copy.deepcopy(vals))
        self.prnt("#NOTOKPREDS----" + name + ":" + self.file + ": " +
                  str(vals))
        if not self.isCount:
            self.prnt()
            self.prnt("#NOTOKPREDS----" + "#CONF-TEST:" + name + ":" +
                      self.file + ": " + str(vals1))

    time = Common.getCurrentTimeMil() - startTime
    self.prnt("#TIME-FOR:" + name + ":" + self.file + ": " + str(time))
    self.prnt("#TIME-FOR-IS:" + name + ":" + self.file + ": " +
              str(spentISTime))
    return isOK