def fisher_complexity(column, trainX, trainY, sel_features, thisfeature):
    """Return a Fisher-style separability score for one ECOC column.

    The classes coded ``1`` and the classes coded ``-1`` in ``column`` form
    two groups of training samples.  For each group the selected features
    are summed over the samples; the score is the squared difference of the
    means of those per-feature sums divided by the sum of their variances.

    Args:
        column: code of every class (read from the global ``classes`` list,
            in the same order); entries equal to 1 / -1 pick the two groups.
        trainX: 2-D training matrix, indexed ``[:, feature_indices]``.
        trainY: training labels, compared element-wise with class names.
        sel_features: per-method feature index lists.
        thisfeature: feature-selection method name, looked up in the global
            ``feature_method_index``.

    Returns:
        The score, or 0 when both variances are zero.
    """
    feature_method_index = gol.get_val("feature_method_index")
    classes = np.array(gol.get_val("classes"))
    column = np.array(column)
    # Keep only the feature columns chosen by this selection method.
    fsel_X = trainX[:, sel_features[feature_method_index[thisfeature]]]

    def _samples_coded_as(code):
        # Rows of fsel_X whose label belongs to a class coded `code`.
        coded_classes = classes[np.where(column == code)[0]]
        # Builtin bool: the `np.bool` alias was removed in NumPy 1.24.
        mask = np.zeros(len(trainY), dtype=bool)
        for label in coded_classes:
            mask = mask | np.array(trainY == label)
        return fsel_X[mask]

    sums_pos = np.sum(_samples_coded_as(1), axis=0)
    sums_neg = np.sum(_samples_coded_as(-1), axis=0)

    miu1 = np.mean(sums_pos)
    miu2 = np.mean(sums_neg)
    sigma1 = np.var(sums_pos)
    sigma2 = np.var(sums_neg)

    if sigma1 + sigma2 == 0:
        return 0
    return (miu1 - miu2) * (miu1 - miu2) / (sigma1 + sigma2)
def init_classes():
    """Load the class list from the three data files and publish it globally.

    Reads the "trainFile", "validationFile" and "testFile" paths from the
    global store, hands them to ``DataLoader.loadClasses`` and stores the
    result under "classes".
    """
    paths = [
        gol.get_val(key)
        for key in ("trainFile", "validationFile", "testFile")
    ]
    gol.set_val("classes", DataLoader.loadClasses(*paths))
def init_feature():
    """Run feature selection once and publish the results globally.

    The number of features requested is half the number of columns of
    ``Train_X`` plus one, capped at 75.  The results of
    ``FeatureSelection.select_features`` at indices 1..4 are stored, in that
    order, under "sel_features" (index 0 of the result is not used here).

    Also stores "feature_number", "feature_method_index" (method name ->
    position in ``FeatureSelection.feature_method``) and "feature_method".
    """
    Train_X = gol.get_val("Train_X")
    Train_Y = gol.get_val("Train_Y")
    trainFile = gol.get_val("trainFile")

    # Floor division keeps the count an integer under true division as well.
    feature_number = min(Train_X.shape[1] // 2 + 1, 75)

    fea = FeatureSelection.select_features(trainFile, Train_X, Train_Y,
                                           feature_number)
    sel_features = [fea[1], fea[2], fea[3], fea[4]]

    feature_method_index = dict(
        (c, i) for i, c in enumerate(FeatureSelection.feature_method))

    gol.set_val("sel_features", sel_features)
    gol.set_val("feature_number", feature_number)
    gol.set_val("feature_method_index", feature_method_index)
    gol.set_val("feature_method", FeatureSelection.feature_method)
def eval_func_information_gain(chromosome):
    """Return the mean information gain of the chromosome's ECOC matrix.

    The matrix is decoded from ``chromosome`` and evaluated by
    ``information_gain`` against the full training labels.
    """
    train_labels = gol.get_val("Train_Y")
    class_names = gol.get_val("classes")
    matrix, _features_used = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    gains = information_gain(train_labels, class_names, matrix)
    return np.mean(gains)
def eval_func_entropy(chromosome):
    """Return the mean information entropy of the chromosome's ECOC matrix.

    The matrix is decoded from ``chromosome`` and evaluated by
    ``information_entropy`` against the full training labels.
    """
    train_labels = gol.get_val("Train_Y")
    class_names = gol.get_val("classes")
    matrix, _features_used = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    per_column = information_entropy(train_labels, class_names, matrix)
    return np.mean(per_column)
def init_dataset():
    """Load the train / validation / test splits and publish them globally.

    Reads the three file paths from the global store, loads them with
    ``DataLoader.loadDataset`` and stores the six resulting arrays under
    "Train_X", "Train_Y", "validation_X", "validation_Y", "Test_X" and
    "Test_Y".
    """
    trainFile = gol.get_val("trainFile")
    testFile = gol.get_val("testFile")
    validationFile = gol.get_val("validationFile")

    # The previous version also computed a class count and a total sample
    # count here; neither value was used, so they were dropped.
    Train_X, Train_Y, validation_X, validation_Y, Test_X, Test_Y = \
        DataLoader.loadDataset(trainFile, validationFile, testFile)

    gol.set_val("Train_X", Train_X)
    gol.set_val("Train_Y", Train_Y)
    gol.set_val("validation_X", validation_X)
    gol.set_val("validation_Y", validation_Y)
    gol.set_val("Test_X", Test_X)
    gol.set_val("Test_Y", Test_Y)
def __init__(self, estimator, random_state=None):
    """Set up an unfitted classifier around ``estimator``.

    Args:
        estimator: base binary estimator; also handed to the ``Storager``.
        random_state: stored as given; not used inside this constructor.

    Configuration ("classes", "feature_method", "n_jobs", "root_path",
    "dataName") is read from the global store.
    """
    # Read the shared configuration first (same order as before).
    known_classes = gol.get_val("classes")
    method_names = gol.get_val("feature_method")
    jobs = gol.get_val("n_jobs")

    # Constructor arguments and configuration.
    self.estimator = estimator
    self.random_state = random_state
    self.classes = known_classes
    self.featuresNames = method_names
    self.n_jobs = jobs

    # State that is filled in later by other methods.
    self.classes_ = None
    self.code_book_ = None
    self.estimators_ = None
    self.estimator_type = None
    self.trainX = None
    self.weights = None
    self.infos_evaluations = []

    self.storager = Storager(gol.get_val("root_path"),
                             gol.get_val("dataName"),
                             self.estimator)
def get_gene_from_bank(features, ecocmatrix, confusion_matrix, classes):
    """Pick the gene-bank column that best targets the hard classes.

    A class is "hard" when its error rate (1 - diagonal / row sum of
    ``confusion_matrix``) is above the mean error rate.  Every gene in the
    global "genebank" is scored by how much better its recorded per-class
    accuracy is than the current one on those hard classes, weighted by the
    absolute code the gene assigns to the class (``fcolumn[i + 1]`` is read
    as the code of class ``i``).  Genes are tried best-first and the first
    one that ``_check_duplicate`` does not reject is returned.

    Args:
        features: unused; kept for interface compatibility.
        ecocmatrix: current coding matrix, passed to ``_check_duplicate``.
        confusion_matrix: 2-D matrix indexed ``[true_class, predicted]``.
        classes: class list; only its length is used.

    Returns:
        The chosen ``fcolumn`` or ``None`` when there is no hard class or no
        acceptable gene.
    """
    genebank = gol.get_val("genebank")
    DETA = 0.01  # keeps the score denominator away from zero

    n_classes = len(classes)
    conf = np.asarray(confusion_matrix, dtype=float)
    hits = np.array([conf[i, i] for i in range(n_classes)])
    support = np.array([np.sum(conf[i, :]) for i in range(n_classes)])

    # Error rate per class as a real array, so the comparison below is an
    # explicit element-wise one.  A class without any sample would give
    # 0/0 = NaN, poisoning the mean and silently disabling the selection;
    # such a class is treated as error-free instead.
    errors = np.zeros(n_classes)
    seen = support > 0
    errors[seen] = 1.0 - hits[seen] / support[seen]

    avg_errors = np.mean(errors)
    hard_classes = np.where(errors > avg_errors)[0]
    if len(hard_classes) == 0:
        return None  # every class is at the average: nothing to target

    # Score every gene and sort best-first.
    scored = []
    for (fcolumn, est_accuracy, class_accuracies,
         used_frequence) in genebank.genes:
        weights = [abs(fcolumn[i + 1]) for i in hard_classes]
        gain = sum(
            (errors[i] - (1 - class_accuracies[i])) * w
            for i, w in zip(hard_classes, weights))
        score = gain / sum(w + DETA for w in weights)
        scored.append((fcolumn, score))
    scored.sort(key=lambda s: s[1], reverse=True)

    # First candidate that is not already represented in the matrix.
    for fcolumn, _score in scored:
        if not _check_duplicate(ecocmatrix, fcolumn):
            return fcolumn
    return None
def predictFinal_withoutlocalimprovement(self, features_used_list,
                                         sel_features, train_X, train_Y,
                                         valid_X, valid_Y, test_X, test_Y):
    """Refit on train + validation data and score on the test set.

    The current ``self.code_book_`` is used as is (no column is added).
    Overwrites ``self.trainX``, ``self.trainY``, ``self.conMatrix`` and,
    through ``fit``, the fitted estimators.

    Returns:
        ``(score, accuracy)`` as produced by ``calculateFScore`` on the
        test labels.
    """
    self.feature_name = features_used_list
    self.trainX = np.append(train_X, valid_X, axis=0)
    self.trainY = np.append(train_Y, valid_Y)
    self.sel_features = sel_features
    feature_method_index = gol.get_val("feature_method_index")

    self.fit(self.trainX, self.trainY, features_used_list, sel_features,
             self.code_book_)
    check_is_fitted(self, 'estimators_')

    # One output column per base estimator, each on its own feature subset.
    outputs = []
    for col, estimator in enumerate(self.estimators_):
        feature_cols = sel_features[
            feature_method_index[features_used_list[col]]]
        outputs.append(
            corrected_predict_binary(estimator, test_X[:, feature_cols]))
    Y = np.array(outputs).T
    if self.estimator_type == 'decision_function':
        Y = _sigmoid_normalize(Y)

    nearest = self.get_distances(Y, self.code_book_,
                                 Weighted=True).argmin(axis=1)
    predicted = self.classes_[nearest]
    self.conMatrix = confusion_matrix(test_Y, predicted)
    score, accuracy = self.calculateFScore(predicted, test_Y)
    return score, accuracy
def predict_withoutlocalimprovement(self, features_used_list, sel_features,
                                    Train_X, Train_Y, Valid_X, valid_Y):
    """Score the already fitted estimators on the validation set.

    Per-column predictions are taken from the ``storager`` cache when
    available and computed (then cached) otherwise.  Sets
    ``self.conMatrix`` to the validation confusion matrix.

    Returns:
        ``(score, accuracy)`` as produced by ``calculateFScore``.
    """
    self.feature_name = features_used_list
    self.trainX = Train_X
    self.trainY = Train_Y
    self.sel_features = sel_features
    check_is_fitted(self, 'estimators_')
    feature_method_index = gol.get_val("feature_method_index")

    outputs = []
    for col, estimator in enumerate(self.estimators_):
        feature_cols = sel_features[
            feature_method_index[features_used_list[col]]]
        # The cache entry is selected by feature subset + code column.
        self.storager.setfeaturecode(feature_cols, self.code_book_[:, col])
        column_output = self.storager.load_prediction_valid()
        if column_output is None:
            column_output = corrected_predict_binary(
                estimator, Valid_X[:, feature_cols])
            self.storager.save_prediction_valid(column_output)
        outputs.append(column_output)

    Y = np.array(outputs).T
    if self.estimator_type == 'decision_function':
        Y = _sigmoid_normalize(Y)

    nearest = self.get_distances(Y, self.code_book_).argmin(axis=1)
    predicted = self.classes_[nearest]
    self.conMatrix = confusion_matrix(valid_Y, predicted)
    score, accuracy = self.calculateFScore(predicted, valid_Y)
    return score, accuracy
def get_distances(self, output_y, code_book_, Weighted=False):
    """Return weighted distances between outputs and code words.

    Args:
        output_y: base-classifier outputs.
        code_book_: coding matrix to compare against.
        Weighted: when true, the weights already stored in ``self.weights``
            are reused; otherwise they are recomputed from the global
            validation labels and stored first.
    """
    if Weighted:
        return weighting_corrected_euclidean_distances(
            output_y, code_book_, self.weights)

    validation_labels = gol.get_val("validation_Y")
    self.weights = get_weights(output_y, code_book_, validation_labels)
    return weighting_corrected_euclidean_distances(
        output_y, code_book_, self.weights)
def fit(self, X, y, features_used_list, sel_features, code_book):
    """Fit one binary estimator per column of ``code_book``.

    Args:
        X: training matrix, indexed ``[:, feature_indices]``.
        y: training labels; their sorted unique values become
            ``self.classes_`` and index the rows of ``code_book``.
        features_used_list: feature-selection method name per column.
        sel_features: per-method feature index lists.
        code_book: coding matrix, one row per class, one column per
            estimator.

    Estimators are restored from the ``storager`` cache when present and
    trained (then cached) otherwise.

    Returns:
        ``self``.
    """
    _check_estimator(self.estimator)
    if hasattr(self.estimator, "decision_function"):
        self.estimator_type = 'decision_function'  # unbounded scores
    else:
        self.estimator_type = 'predict_proba'  # scores in [0, 1]

    # np.unique already returns sorted values.
    self.classes_ = np.unique(y)
    self.code_book_ = code_book
    self.trainX = X
    feature_method_index = gol.get_val("feature_method_index")

    # Expand the code book to one code word per training sample.
    # Builtin int: the `np.int` alias was removed in NumPy 1.24.
    classes_index = dict((c, i) for i, c in enumerate(self.classes_))
    extend_ecocmatrix = np.array(
        [self.code_book_[classes_index[y[i]]] for i in range(X.shape[0])],
        dtype=int)

    self.estimators_ = list()
    for i in range(code_book.shape[1]):
        _column = self.code_book_[:, i]
        _features = feature_method_index[features_used_list[i]]
        self.storager.setfeaturecode(sel_features[_features], _column)
        est = self.storager.load_estimator_train()
        if est is None:
            # Cache miss: train on this column's feature subset.
            est = corrected_fit_binary(self.estimator,
                                       X[:, sel_features[_features]],
                                       extend_ecocmatrix[:, i])
            self.storager.save_estimator_train(est)
        self.estimators_.append(est)
    return self
def eval_func_fscore(chromosome):
    """Train on the training split and score on the validation split.

    The evaluation log returned by ``CC.TrainAndTest`` is attached to the
    chromosome as ``infos_evaluation``.

    Returns:
        ``(fscore, accuracy)``.
    """
    matrix, method_names = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    train_x = gol.get_val("Train_X")
    train_y = gol.get_val("Train_Y")
    valid_x = gol.get_val("validation_X")
    valid_y = gol.get_val("validation_Y")
    selected = gol.get_val("sel_features")

    classifier = CC(method_names, selected, matrix)
    result = classifier.TrainAndTest(train_x, train_y, valid_x, valid_y)
    fscore, accuracy, infos_evaluations = result
    chromosome.infos_evaluation = infos_evaluations
    return fscore, accuracy
def logMiddleInfo_callback(gp_engine):
    """Log test results of the current best individual.

    Two runs are logged through ``delog.logMiddle``: one through
    ``FinalTrainAndTest`` (with local improvement) and one through the
    ``*_withoutlocalimp`` pair (without it).
    """
    train = (gol.get_val("Train_X"), gol.get_val("Train_Y"))
    valid = (gol.get_val("validation_X"), gol.get_val("validation_Y"))
    test = (gol.get_val("Test_X"), gol.get_val("Test_Y"))
    sel_features = gol.get_val("sel_features")

    import sys
    from utils import delog

    sys.stdout.write("logMiddleInfo...")
    generation = gp_engine.getCurrentGeneration()
    champion = gp_engine.bestIndividual()
    matrix, method_names = TMConvertor.getMatrixDirectly_and_feature(champion)

    # With local improvement.
    improved = CC(method_names, sel_features, matrix)
    final_score, final_accuracy, _infos = improved.FinalTrainAndTest(
        train[0], train[1], valid[0], valid[1], test[0], test[1])
    delog.logMiddle(generation, final_accuracy, "AAAAA")
    delog.logMiddle(generation, final_score, "AAAAAfscore")

    # Without local improvement (fresh classifier, train first).
    plain = CC(method_names, sel_features, matrix)
    plain.TrainAndTest_withoutlocalimp(train[0], train[1],
                                       valid[0], valid[1])
    plain_score, plain_accuracy = plain.FinalTrainAndTest_withoutlocalimp(
        train[0], train[1], valid[0], valid[1], test[0], test[1])
    delog.logMiddle(generation, plain_accuracy, "BestAcc_no_impro")
    delog.logMiddle(generation, plain_score, "BestFscore_no_impro")

    sys.stdout.write("over\n")
    sys.stdout.flush()
def selfDefined_GTreeGPMutatorSubtree(genome, **args):
    """Subtree mutator that retries until the mutant is a legal ECOC tree.

    With probability ``args["pmut"]`` a random node of a deep copy of
    ``genome`` is replaced by a subtree grown with
    ``GTreeNode.buildGTreeGPGrow``.  A candidate is rejected, and a new one
    is built from the unmodified genome, when its matrix has too few or too
    many columns or when some class is missing from its leaves.  The first
    legal candidate replaces the root of ``genome``.

    Returns:
        The number of candidates that were built (0 when no mutation was
        drawn).
    """
    classes = gol.get_val("classes")
    ga_engine = args["ga_engine"]
    max_depth = genome.getParam("max_depth", None)
    mutations = 0

    if max_depth is None:
        Util.raiseException("You must specify the max_depth genome parameter !", ValueError)
    if max_depth < 0:
        Util.raiseException("The max_depth must be >= 1, if you want to use GTreeGPMutatorSubtree crossover !", ValueError)

    if not Util.randomFlipCoin(args["pmut"]):
        return int(mutations)

    while True:
        candidate = copy.deepcopy(genome)
        target = candidate.getRandomNode()
        assert target is not None
        depth = candidate.getNodeDepth(target)
        parent = target.getParent()
        mutations += 1

        subtree = GTreeNode.buildGTreeGPGrow(ga_engine, 0, max_depth - depth)
        if parent is None:
            candidate.setRoot(subtree)
        else:
            subtree.setParent(parent)
            parent.replaceChild(target, subtree)
        candidate.processNodes()

        ecocMatrix, feature_list = TMConverter.getMatrixDirectly_and_feature(candidate)
        # Column-count checks first; the leaf check only runs when they pass.
        if LC.tooLittleColumn(ecocMatrix):
            continue
        if LC.tooMuchColumn(ecocMatrix):
            continue

        # Every class must still appear in some leaf.
        missing = set(classes)
        for tree_node in candidate.nodes_list:
            if tree_node.isLeaf():
                missing = missing - set(tree_node.getData())
        if len(list(missing)) == 0:
            break

    genome.setRoot(candidate.getRoot())
    genome.processNodes()
    return int(mutations)
def eval_func_eucdist(chromosome):
    """Return the normalised Euclidean row distance of an individual.

    Pairwise distances between code words are divided by the square root of
    the number of columns; their sum is halved and divided by
    ``n * (n - 1)`` where ``n`` is the number of classes.
    """
    matrix, _features_used = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    n = len(gol.get_val("classes"))
    scaled = euclidean_distances(matrix, matrix) / np.sqrt(matrix.shape[1])
    return np.sum(scaled) / 2 / (n * (n - 1))
def update_referred_times(referred_fcolumn):
    """Increment the usage counter of the gene matching ``referred_fcolumn``.

    A stored gene matches when it equals the referred column, or when its
    first entry is equal and the remaining entries are the negation of the
    referred ones.  Only the first matching gene is updated.

    Args:
        referred_fcolumn: 1-D sequence compared against every stored
            ``fcolumn`` of the global "genebank".
    """
    genebank = gol.get_val("genebank")
    referred = np.asarray(referred_fcolumn)

    for (fcolumn, est_accuracy, class_accuracies,
         used_frequence) in genebank.genes:
        stored = np.asarray(fcolumn)

        # np.array_equal also copes with columns of different length, where
        # `==` followed by `.all()` is unreliable.
        identical = np.array_equal(referred, stored)
        mirrored = (
            referred.size > 0 and stored.size > 0
            and referred[0] == stored[0]
            and np.array_equal(referred[1:] * -1, stored[1:]))

        if identical or mirrored:
            used_frequence[0] += 1  # counter is a one-element list
            break
def eval_func_hamdist(chromosome):
    """Return the mean pairwise Hamming distance between code words.

    ``distance.hamming`` is summed over all unordered row pairs of the
    decoded matrix and divided by ``n * (n - 1) / 2`` where ``n`` is the
    number of classes.
    """
    matrix, _features_used = TMConvertor.getMatrixDirectly_and_feature(
        chromosome)
    classes = gol.get_val("classes")

    n_rows = len(matrix)
    total = sum(
        distance.hamming(matrix[a], matrix[b])
        for a in range(n_rows)
        for b in range(a + 1, n_rows))

    pair_count = len(classes) * (len(classes) - 1) / 2
    return total / pair_count
def add_gene_from_matrix(features, ecocmatrix, output_y, valid_y, classes):
    """Record randomly chosen columns of an evaluated matrix in the gene bank.

    For every base classifier (matrix column) the function counts, per
    class, how often the sign of its output matched the class's code on the
    validation samples (zero codes are skipped).  It then draws
    ``ceil(0.3 * n_columns)`` column indices at random, with replacement,
    and adds each as a gene: the feature id stacked in front of the column,
    the column's overall accuracy, its per-class accuracies and a fresh
    usage counter ``[0]``.

    Args:
        features: feature id per column.
        ecocmatrix: coding matrix (classes x columns).
        output_y: classifier outputs, one row per validation sample and one
            entry per column.
        valid_y: validation labels.
        classes: class list giving the row order of ``ecocmatrix``.
    """
    import random  # hoisted out of the sampling loop

    ADD_PERCENT = 0.3  # share of columns to be saved
    DETA = 0.01  # keeps the accuracy denominators away from zero
    genebank = gol.get_val("genebank")

    n_classes, n_cols = ecocmatrix.shape[0], ecocmatrix.shape[1]

    # Binarise a copy of the outputs to {-1, 0, 1}.
    output_y_bin = np.array(output_y)
    output_y_bin[output_y_bin > 0] = 1
    output_y_bin[output_y_bin < 0] = -1

    # t_num[j][c] / f_num[j][c]: how often classifier j was right / wrong
    # on samples of class c.
    t_num = np.zeros((n_cols, n_classes))
    f_num = np.zeros((n_cols, n_classes))
    class_row = dict((name, row) for row, name in enumerate(classes))
    for sample, sample_output in enumerate(output_y_bin):
        row = class_row[valid_y[sample]]
        codeword = ecocmatrix[row]
        for j in range(len(codeword)):
            if codeword[j] == 0:
                continue  # class not involved in this dichotomy
            if codeword[j] == sample_output[j]:
                t_num[j][row] += 1
            else:
                f_num[j][row] += 1

    # Overall accuracy of every base classifier.
    ests_accuracy = [
        float(sum(t_num[j])) / (sum(t_num[j]) + sum(f_num[j]) + DETA)
        for j in range(n_cols)
    ]

    # Save randomly drawn columns (duplicates are possible).
    n_saved = int(np.ceil(n_cols * ADD_PERCENT))
    for _ in range(n_saved):
        est_index = random.randint(0, n_cols - 1)
        fcolumn = np.hstack((features[est_index], ecocmatrix[:, est_index]))
        class_accuracies = [
            float(t_num[est_index][c])
            / (t_num[est_index][c] + f_num[est_index][c] + DETA)
            for c in range(n_classes)
        ]
        genebank.addgene(fcolumn, ests_accuracy[est_index],
                         class_accuracies, [0])
def Operation_F1(a, b):
    """Merge two operands into a new group tagged with "fclassify".

    The first occurrence of every known feature-method name is removed from
    each operand; the union of what remains is returned as a list with the
    tag in front.
    """
    method_names = gol.get_val("feature_method")
    left, right = list(a), list(b)
    for name in method_names:
        for operand in (left, right):
            if name in operand:
                operand.remove(name)
    merged = list(set(left) | set(right))
    return ["fclassify"] + merged
def logResultEveryGen_callback(gp_engine):
    """Print the best raw individual's matrix after every generation.

    At generation 0 a table header is printed and NumPy is told to print
    arrays in full.  In every generation the matrix of the best raw
    individual is printed with an extra first row holding the index of the
    feature-selection method of each column.
    """
    import sys

    if gp_engine.getCurrentGeneration() == 0:
        print("=" * 65)
        header = 'Gen' + ' ' * 12 + '%-8s %-8s %-8s %-10s %-10s %-10s'
        print(header % ('Max', 'Min', 'Avg', 'Best-Fscore', 'Best-Hamdist',
                        'Best-Accuracy'))
        # `threshold='nan'` is rejected by current NumPy; sys.maxsize is the
        # documented way to disable summarisation.
        np.set_printoptions(threshold=sys.maxsize)

    # Done in every generation.
    best = gp_engine.getPopulation().bestRaw()
    bestMatrix, feature_list = TMConvertor.getMatrixDirectly_and_feature(best)
    feature_method_index = gol.get_val("feature_method_index")
    feature_index_list = [feature_method_index[method]
                          for method in feature_list]
    bestMatrix = np.ndarray.tolist(bestMatrix)
    bestMatrix.insert(0, feature_index_list)
    print(np.array(bestMatrix))
def init_config():
    """Copy the static configuration into the global store.

    The three data-file paths are derived from the stored "dataName"; every
    other value is copied from the attribute of ``Configs`` with the same
    name.
    """
    dataName = gol.get_val("dataName")

    for key in ("n_jobs", "version"):
        gol.set_val(key, getattr(Configs, key))

    data_files = (
        ("testFile", "_test.data"),
        ("trainFile", "_train.data"),
        ("validationFile", "_validation.data"),
    )
    for key, suffix in data_files:
        gol.set_val(key, "data/" + dataName + suffix)

    copied = ("root_path", "growMethod", "freq_stats", "generations",
              "n_neighbors", "mutationRate", "crossoverRate",
              "populationSize")
    for key in copied:
        gol.set_val(key, getattr(Configs, key))
def main_run():
    """Run the whole GP search and print the final test evaluation.

    Initialises the global store, evolves coding trees with
    ``eval_func_fscore`` as fitness, then trains and tests a
    ``ConnectClassifier`` built from the best individual and prints the
    collected evaluation log followed by f-score, accuracy and the
    normalised Euclidean row distance of the final matrix.
    """
    # ---- variables preparation -------------------------------------
    Initializator.init_gol()
    gol.set_val("aimFolder", Configs.aimFolder)
    gol.set_val("dataName", Configs.dataName)
    Initializator.init_all()

    classes = gol.get_val("classes")
    maxDeap = gol.get_val("maxDeap")
    growMethod = gol.get_val("growMethod")
    generations = gol.get_val("generations")
    crossoverRate = gol.get_val("crossoverRate")
    mutationRate = gol.get_val("mutationRate")
    populationSize = gol.get_val("populationSize")
    freq_Stats = gol.get_val("freq_stats")

    Train_X = gol.get_val("Train_X")
    Train_Y = gol.get_val("Train_Y")
    validation_X = gol.get_val("validation_X")
    validation_Y = gol.get_val("validation_Y")
    Test_X = gol.get_val("Test_X")
    Test_Y = gol.get_val("Test_Y")
    sel_features = gol.get_val("sel_features")

    # ---- engine set-up ---------------------------------------------
    genome = GTree.GTreeGP()
    genome.setParams(max_depth=maxDeap, method=growMethod)
    genome.evaluator += EM.eval_func_fscore

    ga = GSimpleGA.GSimpleGA(genome)
    ga.setParams(gp_terminals=classes, gp_function_prefix="Operation")
    ga.setMinimax(Consts.minimaxType["maximize"])
    ga.setGenerations(generations)
    ga.setCrossoverRate(crossoverRate)
    ga.setMutationRate(mutationRate)
    ga.setPopulationSize(populationSize)
    ga.setElitismReplacement(1)

    # CB.printIndividuals_callback is deliberately not registered.
    ga.stepCallback += CB.checkAncients_callback
    ga.stepCallback += CB.logResultEveryGen_callback
    ga.stepCallback += CB.delogPopulation_callback
    ga.stepCallback += CB.logMiddleInfo_callback
    ga.stepCallback += CB.debug_callback

    print("------------------------------------------------------")
    ga(freq_stats=freq_Stats)
    best = ga.bestIndividual()

    # ---- final evaluation on the test split ------------------------
    FinalMatrix, features_used_list = \
        TMConvertor.getMatrixDirectly_and_feature(best)
    cc = ConnectClassifier(features_used_list, sel_features, FinalMatrix)
    finalScore, finalAccuracy, infos_evaluations = cc.FinalTrainAndTest(
        Train_X, Train_Y, validation_X, validation_Y, Test_X, Test_Y)

    # Normalised Euclidean distance between the rows of the final matrix.
    num_class = len(classes)
    num_cols = FinalMatrix.shape[1]
    scaled = euclidean_distances(FinalMatrix, FinalMatrix) / np.sqrt(num_cols)
    dist = np.sum(scaled) / 2 / (num_class * (num_class - 1))

    infos_evaluations.append("---------test------------")
    infos_evaluations.append("fscore: %f" % finalScore)
    infos_evaluations.append("accuracy: %f" % finalAccuracy)
    infos_evaluations.append("dist: %f" % dist)
    for text in infos_evaluations:
        print(text)
def DIYGTreeGPMutatorSubtree(genome, **args):
    """ The mutator of GTreeGP, Subtree Mutator

    This mutator will recreate random subtree of the tree using the grow algorithm.
    After the (possible) mutation, classes missing from the terminal nodes
    are written into randomly chosen terminals, and the whole procedure is
    repeated while the resulting matrix has duplicate rows, an all-zero row
    or too few columns.

    Unlike a copy-based mutator this one edits ``genome`` in place, so
    rejected attempts stay in the tree and are mutated again.

    Returns 0 at once when ``args["pmut"]`` is not positive; otherwise the
    mutation count of the LAST pass (the counter is reset every pass).

    .. versionadded:: 0.6
       The *GTreeGPMutatorSubtree* function
    """
    classes = gol.get_val("classes")
    ind = genome  # alias: `ind` and `genome` are the same object
    Illegal = True
    while Illegal==True:
        #mutator
        if args["pmut"] <= 0.0:
            return 0
        ga_engine = args["ga_engine"]
        max_depth = genome.getParam("max_depth", None)
        mutations = 0  # reset on every pass of the outer loop

        if max_depth is None:
            Util.raiseException("You must specify the max_depth genome parameter !", ValueError)

        # NOTE(review): the test is `< 0` while the message says `>= 1`.
        if max_depth < 0:
            Util.raiseException("The max_depth must be >= 1, if you want to use GTreeGPMutatorSubtree crossover !", ValueError)

        node = genome.getRandomNode()
        assert node is not None
        if Util.randomFlipCoin(args["pmut"]):
            depth = genome.getNodeDepth(node)
            mutations += 1
            # Grow a replacement that keeps the tree within max_depth.
            root_subtree = GTreeNode.buildGTreeGPGrow(ga_engine, 0, max_depth - depth)
            node_parent = node.getParent()
            if node_parent is None:
                genome.setRoot(root_subtree)
            else:
                root_subtree.setParent(node_parent)
                node_parent.replaceChild(node, root_subtree)
            genome.processNodes()

        # complete ? void : alter
        # code_comp = genome.getCompiledCode()
        # if any class not included in the terminal nodes.
        # `labels` ends up holding the classes absent from the terminals,
        # `nums` the positions of all terminals in nodes_list.
        labels = set(classes)
        nums = []
        for i in xrange(len(ind.nodes_list)):
            if ind.nodes_list[i].getType() == nodeType["TERMINAL"]:
                labels = labels - set(ind.nodes_list[i].getData())
                nums.append(i)
        labels = list(labels)
        #print labels
        # Repair is only attempted when there are at least as many
        # terminals as classes.
        if len(nums) >= len(classes) :
            # substitute randomly
            # NOTE(review): a random overwrite can remove another class, so
            # this loop has no guaranteed bound on its iterations.
            while len(labels):
                for j in xrange(len(labels)):
                    slice = random.sample(nums, 1)
                    ind.nodes_list[slice[0]].setData(labels[j])
                # if any class not included in the terminal nodes.
                labels = set(classes)
                nums = []
                for i in xrange(len(ind.nodes_list)):
                    if ind.nodes_list[i].getType() == nodeType["TERMINAL"]:
                        labels = labels - set(ind.nodes_list[i].getData())
                        nums.append(i)
                labels = list(labels)

        #illegal?
        Illegal = False
        ecocMatrix,feature_list = TMConverter.getMatrixDirectly_and_feature(ind)
        ###row###
        #1.Two rows having the same numbers
        if LC.sameRows(ecocMatrix):
            Illegal = True
            continue
        #2.There being a row with all 0
        elif LC.zeroRow(ecocMatrix):
            Illegal = True
            continue
        # 3. too few columns: also triggers another pass
        if LC.tooLittleColumn(ecocMatrix):
            Illegal = True
    return int(mutations)
def fisher_complexity_old(column,trainX, trainY, sel_features, thisfeature):
    """ fisher method old version

    Averages up to three Fisher-style ratios for one ECOC column:
    the ratio between the two groups (classes coded 1 vs. classes coded -1)
    and, when non-zero, one ratio inside each group.

    column      -- code of every class, in the order of the global "classes"
    trainX      -- training matrix, indexed [:, feature_indices]
    trainY      -- training labels
    sel_features, thisfeature -- select the feature columns through the
                   global "feature_method_index"

    Returns 0 when both between-group variances are zero, otherwise the
    mean of the collected ratios.
    """
    feature_method_index = gol.get_val("feature_method_index")
    fea_num = gol.get_val("feature_number")
    classes = gol.get_val("classes")
    miu1 = 0; miu2 = 0; sigma1 = 0; sigma2 = 0
    # assumes the selected feature subset has exactly fea_num columns -- TODO confirm
    d1 = np.zeros(fea_num); d2 = np.zeros(fea_num)
    d3 = np.zeros((len(classes),fea_num)) # the sum of each feature for every class
    miu3 = [0]*len(classes) # the average of d3
    sigma3 = [0]*len(classes)  # the variance of d3, per class
    fisher11 = 0; fisher12 = 0; fisher21 = 0; fisher22 = 0
    numleft = 0; numright = 0
    # number of training samples of every class
    num = [list(trainY).count(classes[i]) for i in xrange(len(classes))]

    # which feature selection method?
    d = trainX[:, sel_features[feature_method_index[thisfeature]]]

    # in
    # Per class: accumulate the feature sums, count the samples of each
    # group, and take mean / variance of the class's feature sums.
    for j in xrange(len(classes)):
        for m in xrange(len(trainY)):
            if (trainY[m] == classes[j]):
                d3[j,:] += d[m,:]
        if (column[j] == 1):
            numleft += num[j]
        elif (column[j] == -1):
            numright += num[j]
        miu3[j] = np.mean(d3[j])
        sigma3[j] = np.var(d3[j])

    # Turn the counts into within-group proportions (num is overwritten)
    # and build the proportion-weighted variance of each group.
    # NOTE(review): divides by numleft / numright, which are zero when a
    # group has no samples.
    for i in xrange(len(classes)):
        if (column[i] == 1):
            num[i] = float(num[i]) / numleft
            fisher12 += num[i] * sigma3[i]
        elif (column[i] == -1):
            num[i] = float(num[i]) / numright
            fisher22 += num[i] * sigma3[i]

    # Proportion-weighted squared mean differences over all class pairs
    # that fall in the same group.
    for i in xrange(len(classes)):
        for j in xrange(len(classes)):
            if (j > i):
                if (column[i] == 1 and column[j] == 1):
                    fisher11 += num[i] * num[j] * (miu3[i] - miu3[j]) * (miu3[i] - miu3[j])
                elif (column[i] == -1 and column[j] == -1):
                    fisher21 += num[i] * num[j] * (miu3[i] - miu3[j]) * (miu3[i] - miu3[j])

    # out
    # Feature sums of each whole group.
    for j in xrange(len(classes)):
        if column[j] == 1:
            d1[:] += d3[j,:]
        elif column[j] == -1:
            d2[:] += d3[j,:]
    miu1 = np.mean(d1)
    miu2 = np.mean(d2)
    sigma1 = np.var(d1)
    sigma2 = np.var(d2)
    if sigma1 + sigma2 == 0 :
        return 0
    fisher_sum = (miu1 - miu2) * (miu1 - miu2) / (sigma1 + sigma2)

    # final
    # The within-group ratios are only included when their numerator is
    # non-zero.  NOTE(review): fisher12 / fisher22 are not checked for zero.
    fisherSet = []
    fisherSet.append(fisher_sum)
    if (fisher11 != 0):
        fisher1 = float(fisher11 / fisher12)
        fisherSet.append(fisher1)
    if (fisher21 != 0):
        fisher2 = float(fisher21 / fisher22)
        fisherSet.append(fisher2)
    return np.mean(fisherSet)
def _crossover_supplement_ecoc(raw_ind, coming_node):
    """Repair an individual that lost classes through crossover.

    raw_ind     -- the individual after crossover; modified in place
    coming_node -- root of the subtree that was grafted into raw_ind

    If every class still appears in the leaves nothing is done.  Otherwise
    redundant leaves of the grafted subtree are relabelled with the missing
    classes; if classes are still missing afterwards, leaves of the grafted
    subtree are turned into function nodes with two new leaves (the old
    label and a missing class) as long as the depth limit allows.
    """
    import numpy as np
    from utils import gol
    classes = gol.get_val("classes")
    leafs = raw_ind.getLeafs()
    datas = [_node_.getData() for _node_ in leafs]
    # Nothing to repair when the number of distinct leaf labels equals the
    # number of classes.
    if len(classes) == len(np.unique(datas)):
        return

    # when lack class
    import copy
    # A copy of the individual rooted at the grafted node is used to
    # enumerate the grafted leaves.  NOTE(review): setRoot is called after
    # the deep copy, so the leaves obtained are presumably the live nodes of
    # coming_node, not copies -- confirm.
    node_ind = copy.deepcopy(raw_ind)
    node_ind.setRoot(coming_node)
    node_ind.processNodes()
    leafs_coming = node_ind.getLeafs()
    leafs_old = copy.deepcopy(leafs)
    # Labels of the leaves that did NOT come with the grafted subtree.
    # NOTE(review): list.remove raises ValueError if a grafted label is not
    # among the individual's leaf labels.
    datas_old = [_node_.getData() for _node_ in leafs_old]
    for _node_ in leafs_coming:
        datas_old.remove(_node_.getData())
    class_lack = [cls for cls in classes if cls not in datas]

    # find nodes_redundancy, they will be replaced with lacks
    # A grafted leaf is redundant when its label already exists outside the
    # graft or was already seen on an earlier grafted leaf.
    datas_temp_mid = []
    nodes_redundancy = []
    for _node_ in leafs_coming:
        if _node_.getData() in datas_old:
            nodes_redundancy.append(_node_)
        elif _node_.getData() in datas_temp_mid:
            nodes_redundancy.append(_node_)
        else:
            datas_temp_mid.append(_node_.getData())
    if len(nodes_redundancy) > 0:
        for i in xrange(len(nodes_redundancy)):
            nodes_redundancy[i].setData(class_lack[0])
            class_lack.remove(class_lack[0])
            if len(class_lack) <= 0:
                break

    # after replacing, it may still lack class
    if len(class_lack) > 0:
        import random
        from gp.GTreeNode import GTreeNodeGP
        max_depth = gol.get_val("maxDeap")
        nodes_grow_candidate = leafs_coming
        nodes_grow_candidate_new = nodes_grow_candidate
        # grow the tree to fill lacks
        # NOTE(review): if no candidate is below max_depth the candidate
        # list becomes empty while class_lack is not, and this loop would
        # not terminate -- confirm against callers.
        while len(class_lack) > 0:
            nodes_grow_candidate = nodes_grow_candidate_new
            nodes_grow_candidate_new = []
            for i in xrange(len(nodes_grow_candidate)):
                if raw_ind.getNodeDepth(nodes_grow_candidate[i]) < max_depth:
                    # The leaf becomes a function node whose children are
                    # its old label and one missing class.
                    _newnode_old = GTreeNodeGP(
                        nodes_grow_candidate[i].getData(),
                        node_type=nodeType['TERMINAL'],
                        parent=nodes_grow_candidate[i])
                    _newnode_lack = GTreeNodeGP(class_lack[0],
                                                node_type=nodeType['TERMINAL'],
                                                parent=nodes_grow_candidate[i])
                    nodes_grow_candidate[i].addChild(_newnode_old)
                    nodes_grow_candidate[i].addChild(_newnode_lack)
                    nodes_grow_candidate[i].setType(nodeType['NONTERMINAL'])
                    # One of Operation_F1 .. Operation_F4, chosen at random.
                    nodes_grow_candidate[i].setData('Operation_F' + str(random.randint(1, 4)))
                    class_lack.remove(class_lack[0])
                    if len(class_lack) <= 0:
                        break
                    # The new leaves are the candidates of the next round.
                    nodes_grow_candidate_new.append(_newnode_old)
                    nodes_grow_candidate_new.append(_newnode_lack)
            raw_ind.processNodes()
    raw_ind.processNodes()
def _genome_is_illegal(genome, classes):
    """Return True when `genome` decodes to an unusable coding matrix."""
    ecocMatrix, _feature_list = TMConvertor.getMatrixDirectly_and_feature(genome)
    # 1. the number of columns is out of range
    if LCheckers.tooLittleColumn(ecocMatrix):
        return True
    if LCheckers.tooMuchColumn(ecocMatrix):
        return True
    # 2. some class is not included in the terminal nodes
    missing = set(classes)
    for node in genome.nodes_list:
        if node.isLeaf():
            missing = missing - set(node.getData())
    return len(missing) > 0


def checkAncients_callback(gp_engine):
    """Repair illegal individuals of the initial population.

    Runs at generation 0 only.  Every individual whose matrix has too few
    or too many columns, or whose leaves miss a class, gets a random node
    of a deep copy replaced by a newly grown subtree; this is repeated,
    always starting from the unmodified individual, until a legal copy is
    found, whose root is then adopted.  Finally the population is
    re-evaluated and re-sorted.
    """
    if gp_engine.getCurrentGeneration() != 0:
        return

    from utils import delog
    delog.decache("check first Gen...")

    begin = 0
    end = gol.get_val("populationSize")
    classes = gol.get_val("classes")
    population = gp_engine.getPopulation()

    for position in range(begin, end):
        genome = population[position]
        max_depth = genome.getParam("max_depth", None)

        Illegal = _genome_is_illegal(genome, classes)

        if max_depth is None:
            Util.raiseException("You must specify the max_depth genome parameter !", ValueError)
        if max_depth < 0:
            Util.raiseException("The max_depth must be >= 1, if you want to use GTreeGPMutatorSubtree crossover !", ValueError)

        while Illegal:
            new_genome = copy.deepcopy(genome)
            node = new_genome.getRandomNode()
            assert node is not None
            depth = new_genome.getNodeDepth(node)
            node_parent = node.getParent()
            # Grow a replacement that keeps the tree within max_depth.
            root_subtree = GTreeNode.buildGTreeGPGrow(gp_engine, 0, max_depth - depth)
            if node_parent is None:
                new_genome.setRoot(root_subtree)
            else:
                root_subtree.setParent(node_parent)
                node_parent.replaceChild(node, root_subtree)
            new_genome.processNodes()

            Illegal = _genome_is_illegal(new_genome, classes)
            # Apply the mutation only once the copy is legal.
            if not Illegal:
                genome.setRoot(new_genome.getRoot())
                genome.processNodes()

    # Update the scores of the population.
    delog.deprint_string("over.")
    population.evaluate()
    population.sort()
def predictFinal(self, features_used_list, sel_features, train_X, train_Y,
                 valid_X, valid_Y, test_X, test_Y):
    """Locally improve the code book on validation data, then test.

    Steps:
      1. Score the fitted estimators on the validation split.
      2. Repeatedly fetch a candidate column from the gene bank, train one
         estimator for it and keep the column only if the validation
         accuracy strictly improves; stop at the first failure or when no
         candidate is available.
      3. Retrain all columns on train + validation data and score on the
         test split.

    After step 1 and after every accepted column the test performance of
    the current code book is appended to ``self.infos_evaluations``.

    Returns:
        ``(score, accuracy, self.infos_evaluations)`` for the test split.
    """
    self.feature_name = features_used_list
    self.trainY = train_Y
    self.sel_features = sel_features

    # prepare
    check_is_fitted(self, 'estimators_')
    feature_method_index = gol.get_val("feature_method_index")

    # try to restore output from cache
    output_y = []
    for i in range(len(self.estimators_)):
        _column = self.code_book_[:, i]
        _features = feature_method_index[features_used_list[i]]
        self.storager.setfeaturecode(sel_features[_features], _column)
        pre = self.storager.load_prediction_valid()
        if pre is None:
            pre = corrected_predict_binary(
                self.estimators_[i], valid_X[:, sel_features[_features]])
            self.storager.save_prediction_valid(pre)
        output_y.append(pre)
    output_y = np.array(output_y).T
    if self.estimator_type == 'decision_function':
        output_y = _sigmoid_normalize(output_y)

    # get score and confusion matrix
    pred = self.get_distances(output_y, self.code_book_).argmin(axis=1)
    score, accuracy = self.calculateFScore(self.classes_[pred], valid_Y)
    self.conMatrix = confusion_matrix(valid_Y, self.classes_[pred])
    # Keep the validation confusion matrix in a local: the test-logging call
    # below overwrites self.conMatrix with the TEST confusion matrix, which
    # must not steer the column selection.
    valid_conMatrix = self.conMatrix

    # log
    _message = "Performance without local improvement:"
    self.matrix_tracer(_message, score, accuracy)
    t_score, t_acc = self.predictFinal_withoutlocalimprovement(
        features_used_list, self.sel_features, train_X, train_Y, valid_X,
        valid_Y, test_X, test_Y)
    self.infos_evaluations.append("test-f-score:" + str(t_score))
    self.infos_evaluations.append("test-accuracy:" + str(t_acc))
    self.infos_evaluations.append(self.conMatrix)

    # adding column
    temp_features_name = dict(
        (c, i) for i, c in feature_method_index.items())
    add_counter = 0
    while True:
        features_digit = [
            feature_method_index[features_used_list[k]]
            for k in range(self.code_book_.shape[1])
        ]
        add_fcol = get_gene_from_bank(features_digit, self.code_book_,
                                      valid_conMatrix, self.classes_)
        if add_fcol is None:
            # genebank is empty, or no suitable column, stop.
            break

        # prepare new_ecocmatrix and output_y
        # add_fcol[0] is the feature id, add_fcol[1:] the class codes.
        new_ecocmatrix = np.hstack(
            (self.code_book_, np.array([add_fcol[1:]]).transpose()))

        # reconstruct the output_y because they can not be sigmoid
        # respectively
        new_y = []
        for i in range(len(self.estimators_)):
            _column = self.code_book_[:, i]
            _features = feature_method_index[features_used_list[i]]
            self.storager.setfeaturecode(sel_features[_features], _column)
            pre = self.storager.load_prediction_valid()
            if pre is None:
                pre = corrected_predict_binary(
                    self.estimators_[i],
                    valid_X[:, sel_features[_features]])
                self.storager.save_prediction_valid(pre)
            new_y.append(pre)

        # add new output of new column, need training
        new_estimator = self.fit_one(train_X, train_Y, sel_features,
                                     add_fcol)
        add_fcol_y = corrected_predict_binary(
            new_estimator, valid_X[:, sel_features[add_fcol[0]]])
        new_y.append(add_fcol_y)
        new_y = np.array(new_y).T
        if self.estimator_type == 'decision_function':
            new_y = _sigmoid_normalize(new_y)

        # calculate new accuracy and compare
        new_pred = self.get_distances(new_y,
                                      new_ecocmatrix).argmin(axis=1)
        new_score, new_accuracy = self.calculateFScore(
            self.classes_[new_pred], valid_Y)

        # update if there be any improvement
        if not new_accuracy > accuracy:
            # no improvement, stop.
            break

        output_y = new_y
        pred = new_pred
        score = new_score
        accuracy = new_accuracy
        self.code_book_ = new_ecocmatrix
        self.feature_name = np.hstack(
            (features_used_list, temp_features_name[add_fcol[0]]))
        features_used_list = self.feature_name
        self.estimators_.append(new_estimator)
        self.conMatrix = confusion_matrix(valid_Y, self.classes_[pred])
        valid_conMatrix = self.conMatrix

        # update used frequency
        update_referred_times(add_fcol)

        # update matrix tracer
        add_counter += 1
        _message = str(add_counter) + "Add one column:"
        self.matrix_tracer(_message, score, accuracy)
        t_score, t_acc = self.predictFinal_withoutlocalimprovement(
            features_used_list, self.sel_features, train_X, train_Y,
            valid_X, valid_Y, test_X, test_Y)
        self.infos_evaluations.append("test-f-score:" + str(t_score))
        self.infos_evaluations.append("test-accuracy:" + str(t_acc))
        self.infos_evaluations.append(self.conMatrix)

    ########
    # TEST #
    ########
    # retraining because different training set, try to restore estimators
    # from cache.
    classes_index = dict((c, i) for i, c in enumerate(self.classes_))
    final_train_x = np.vstack((train_X, valid_X))
    final_train_y = np.hstack((train_Y, valid_Y))
    self.estimators_ = list()
    for i in range(self.code_book_.shape[1]):
        _column = self.code_book_[:, i]
        _features = feature_method_index[features_used_list[i]]
        self.storager.setfeaturecode(sel_features[_features], _column)
        est = self.storager.load_estimator_test()
        if est is None:
            # need training
            # Separate index name: the comprehension must not reuse the
            # column index `i`.  Builtin int: `np.int` was removed in
            # NumPy 1.24.
            extend_column = np.array([
                _column[classes_index[final_train_y[s]]]
                for s in range(final_train_x.shape[0])
            ], dtype=int)
            est = corrected_fit_binary(
                self.estimator, final_train_x[:, sel_features[_features]],
                extend_column)
            self.storager.save_estimator_test(est)
        self.estimators_.append(est)

    # predicting because different training set, try to restore output from
    # cache.
    output_y = []
    for i in range(len(self.estimators_)):
        _column = self.code_book_[:, i]
        _features = feature_method_index[features_used_list[i]]
        self.storager.setfeaturecode(sel_features[_features], _column)
        pre = self.storager.load_prediction_test()
        if pre is None:
            pre = corrected_predict_binary(
                self.estimators_[i], test_X[:, sel_features[_features]])
            self.storager.save_prediction_test(pre)
        output_y.append(pre)
    output_y = np.array(output_y).T
    if self.estimator_type == 'decision_function':
        output_y = _sigmoid_normalize(output_y)

    # get score
    pred = self.get_distances(output_y, self.code_book_,
                              Weighted=True).argmin(axis=1)
    score, accuracy = self.calculateFScore(self.classes_[pred], test_Y)
    return score, accuracy, self.infos_evaluations
def debug_callback(gp_engine):
    """Debugging hook: fetch engine state without acting on it.

    Reads the ``genes`` attribute of the "genebank" object stored in ``gol``
    and asks ``gp_engine`` for its current generation. Neither value is used
    afterwards -- presumably the lookups are kept as an anchor for a
    breakpoint or temporary print statements (confirm with the author).

    :param gp_engine: engine object providing ``getCurrentGeneration()``.
    :return: None.
    """
    bank_genes = gol.get_val("genebank").genes
    generation = gp_engine.getCurrentGeneration()
    return None
def printIndividuals_callback(gp_engine):
    """Draw the leading individuals of the current population to a JPEG file.

    Every call increments the module-level counter ``numnum``. Unless the
    engine reports generation ``-1``, the individuals at population indices
    0..19 (fewer if the population is smaller) are rendered with
    pydot/Graphviz ``dot`` into ``Tree<numnum>.jpg``.

    Node appearance:

    * terminal nodes are coloured ``lightblue2``, all other nodes
      ``goldenrod2``;
    * a non-terminal node takes the ``shape`` attribute of its function (looked
      up by name in ``__main__``) when that attribute exists;
    * a non-terminal node is labelled with its function's ``representation``
      attribute when present; otherwise the sub-tree rooted at that node is
      evaluated with every class label bound to itself, and ``str()`` of the
      result becomes the label;
    * a terminal node is labelled with its own data.

    :param gp_engine: engine object providing ``getCurrentGeneration()`` and
        ``getPopulation()``.
    :return: None.
    """
    import pydot
    # Non-terminal node data is a function name that is resolved in __main__.
    import __main__ as main_module

    global numnum
    subtree_evaluator = GTree.GTreeGP()
    classes = gol.get_val("classes")
    numnum = numnum + 1
    begin = 0
    end = 20

    if gp_engine.getCurrentGeneration() == -1:
        return

    population = gp_engine.getPopulation()
    graph = pydot.Dot(graph_type="digraph")
    filename = 'Tree' + str(numnum) + '.jpg'

    # Never index past the population: a population smaller than `end`
    # used to raise IndexError here.
    last = min(end, len(population))

    # Terminals of a compiled sub-tree are class labels; bind each label to
    # itself in an explicit namespace instead of mutating locals(), whose
    # modification is not guaranteed to be visible to eval().
    terminal_namespace = dict((label, label) for label in classes)

    # Graphviz node ids keep counting across individuals so they stay unique
    # inside the single output graph.
    next_node_id = 0

    for position in range(begin, last):
        ind = population[position]
        subg = pydot.Cluster(
            "cluster_%d" % position,
            label="\"Ind. #%d - Score Raw/Fit.: %.4f/%.4f\""
            % (position, ind.getRawScore(), ind.getFitnessScore()))

        # One pydot node per tree node, remembered so edges can be drawn.
        nodes_dict = {}
        for node in ind.nodes_list:
            newnode = pydot.Node(str(next_node_id), style="filled")
            next_node_id += 1

            node_type = node.getType()

            # color
            if node_type == Consts.nodeType["TERMINAL"]:
                newnode.set_color("lightblue2")
            else:
                newnode.set_color("goldenrod2")

            # content of node
            if node_type == Consts.nodeType["NONTERMINAL"]:
                func = getattr(main_module, node.getData())
                if hasattr(func, "shape"):
                    newnode.set_shape(func.shape)
                if hasattr(func, "representation"):
                    newnode.set_label(func.representation)
                else:
                    subtree_evaluator.setRoot(node)
                    # NOTE(security): eval() runs code compiled from the GP
                    # tree; only safe while trees are built from trusted
                    # function and terminal names.
                    value = eval(subtree_evaluator.getCompiledCode(),
                                 globals(), terminal_namespace)
                    newnode.set_label(str(value))
                # A per-function "color" override is intentionally not
                # applied here (it was disabled in the original code).
            else:
                newnode.set_label(node.getData())

            nodes_dict[node] = newnode
            graph.add_node(newnode)

        # Depth-first walk from the root, linking every node to its parent.
        node_stack = [ind.getRoot()]
        while node_stack:
            current = node_stack.pop()
            parent = current.getParent()
            if parent is not None:
                parent_node = nodes_dict[parent]
                child_node = nodes_dict[current]
                graph.add_edge(pydot.Edge(parent_node, child_node))
            # Push children reversed so they are popped in original order.
            node_stack.extend(reversed(current.getChilds()))

        # NOTE(review): nodes and edges are added to the top-level graph, so
        # this cluster is attached without any members -- confirm whether
        # they were meant to be added to `subg` instead.
        graph.add_subgraph(subg)

    graph.write(filename, prog='dot', format="jpeg")