def main(args):
    setup = experiment_setups.parse(args.setup)
    # use_base_dir = False
    dirname = fileutil.run_dir(setup.dest_dir, setup.name,
                               setup.max_quantifier_length, setup.model_size,
                               setup.pareto_name)
    file_util = FileUtil(dirname)

    expressions = language_loader.load_all_evaluated_expressions(file_util)
    languages_0 = language_generator.generate_sampled(
        expressions, args.lang_size, int(args.sample_size / args.lang_size))
    universe = generator.generate_simplified_models(setup.model_size)

    measure_complexity = SumComplexityMeasurer(args.lang_size, 1)
    measure_informativeness = SimMaxInformativenessMeasurer(universe)
    pool = ProcessPool(nodes=setup.processes)

    languages = languages_0  # languages are iteratively updated in the loop below
    for gen in range(args.generations):
        print('GENERATION {0}'.format(gen))
        print('measuring')
        complexity = pool.map(measure_complexity, languages)
        informativeness = pool.map(measure_informativeness, languages)

        # both objectives are minimised: communicative cost
        # (1 - informativeness) and complexity
        measurements = [(1 - inf, comp)
                        for inf, comp in zip(informativeness, complexity)]

        print('calculating dominating')
        dominating_indices = pygmo.non_dominated_front_2d(measurements)
        dominating_languages = [languages[i] for i in dominating_indices]

        print('mutating')
        languages = sample_mutated(dominating_languages, args.sample_size,
                                   expressions)

    language_indices = [[e.index for e in lang]
                        for lang in dominating_languages]
    dominating_complexity = [complexity[i] for i in dominating_indices]
    dominating_informativeness = [
        informativeness[i] for i in dominating_indices
    ]

    file_util.dump_dill(dominating_complexity,
                        'complexity_wordcomplexity.dill')
    file_util.dump_dill(dominating_informativeness,
                        'informativeness_simmax.dill')
    file_util.dump_dill(language_indices, 'language_indices.dill')
    file_util.save_stringlist([list(map(str, lang)) for lang in languages],
                              'languages.txt')

    print("generate_evolutionary.py finished.")
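# A minimal, self-contained illustration (not from the original project) of
# the core step above: pygmo.non_dominated_front_2d treats each
# (comm_cost, complexity) pair as two objectives to minimise and returns the
# indices of the non-dominated points. The sample values are made up.
import pygmo

measurements = [(0.9, 1.0), (0.4, 3.0), (0.4, 2.0), (0.1, 5.0)]
front = pygmo.non_dominated_front_2d(measurements)
# (0.4, 2.0) dominates (0.4, 3.0), so index 1 is excluded and the front
# consists of indices 0, 2 and 3 (in whatever order pygmo returns them)
print(front)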
import csv

import pygmo as pyg


def plot_from_csv(filepath, title, perf_index, comp_index, add_ndf):
    '''Read fitness data from a csv file and pass it to plot_front.'''
    fitness = []
    ndf = []
    with open(filepath) as csv_file:
        csv_data = csv.reader(csv_file, delimiter=',')
        for row in csv_data:
            fitness.append([float(row[perf_index]), float(row[comp_index])])
    if add_ndf:
        ndf = pyg.non_dominated_front_2d(fitness)
    plot_front(title, fitness, ndf)
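# Hypothetical usage of plot_from_csv; the file name and column indices are
# illustrative assumptions, not taken from the original project.
plot_from_csv('results/fitness.csv', 'Pareto front',
              perf_index=0, comp_index=1, add_ndf=True)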
def write_ndf_csv(self, name):
    '''Write a csv file containing the non-dominated vectors of the optimisation.'''
    fitness_keys = list(self.fitness_dict.keys())
    fitness_values = list(self.fitness_dict.values())
    _, _, dc, ndr = pyg.fast_non_dominated_sorting(fitness_values)
    ndf = pyg.non_dominated_front_2d(fitness_values)
    logger.info(name + "\nNon dominated vectors: " + str(len(ndf)) +
                "\nDomination count: " + str(dc) +
                "\nNon domination ranks: " + str(ndr))
    pl.plot_front(name + " all fits", fitness_values, ndf)

    # Save ndf results to file
    with open(cfg.RESULTS_PATH + cfg.timestamp + '/NDF-' + name + '.csv',
              mode='w') as data_file:
        data_writer = csv.writer(data_file, delimiter=',', quotechar='"',
                                 quoting=csv.QUOTE_MINIMAL)
        for i in ndf:
            data = np.concatenate((fitness_keys[i], fitness_values[i]),
                                  axis=None)
            data = np.concatenate((data, self.complete_results[fitness_keys[i]]),
                                  axis=None)
            data_writer.writerow(data)
pareto_data = pd.DataFrame({
    'complexity': evo_comp,
    'comm_cost': [1 - x for x in evo_inf]
})
main_data = pd.read_csv(
    '../../../results/Final_length=12_size=10/tables/pandas_27_may_natural_degrees.csv'
)

comm_cost = list(pareto_data['comm_cost'].values) + list(
    main_data['comm_cost'].values)
complexity = list(pareto_data['complexity'].values) + list(
    main_data['complexity'].values)

dominating_indices = pygmo.non_dominated_front_2d(
    list(zip(complexity, comm_cost)))
dominating_complexity = [complexity[i] for i in dominating_indices]
dominating_comm_cost = [comm_cost[i] for i in dominating_indices]

# two stable sorts: order the front by increasing comm_cost and, within ties,
# by decreasing complexity
values = list(zip(dominating_comm_cost, dominating_complexity))
values.sort(key=lambda val: -val[1])
values.sort(key=lambda val: val[0])
values = [np.array(value) for value in values]

x = []
y = []
interval = .001
for (left, right) in zip(values[:-1], values[1:]):
    diff = right - left
    if norm(diff) == 0:
item.reset_index(inplace=True, drop=True)
languages = pd.concat([languages, item])
languages.reset_index(inplace=True, drop=True)

# Run the evolutionary algorithm for 100 generations, each with 2000 languages
num_of_gen = 100
sample_size = 2000
max_mut = 3
for i in range(1, num_of_gen + 1):
    temp = copy(languages)
    temp = prep(temp)
    tempcc = costcomplexity(temp)
    tempcc['new_col'] = list(
        zip(tempcc.costoflanguages, tempcc.complexityoflanguages))
    # convert the Series of (cost, complexity) tuples to a list, since pygmo
    # expects a 2-D array-like of points
    dominating_indices = pygmo.non_dominated_front_2d(list(tempcc['new_col']))
    dominating_languages_list = [
        tempcc.LANG[k] for k in dominating_indices
    ]  # note: this was previously bound by an outer variable i
    dominating_languages_df = languages[languages['LANG'].isin(
        dominating_languages_list)]
    languages = sample_mutated(dominating_languages_df, sample_size, i,
                               max_mut)
    languages = prep(languages)

costcomlang = costcomplexity(languages)
costcomlang.to_csv(Folder + 'finalgencostcom.csv', index=False)

# Find the dominant languages from the final generation + languages of Experiment1
allfinal = pd.read_csv(
    Folder +
        new_l.append([l1[i], l2[i]])
    return new_l


def create_configs(keys):
    configs = []
    l1 = list(itertools.permutations(keys, 2))
    for each in l1:
        for key in keys:
            if key not in each:
                # note: the membership test checks a different ordering
                # ([each[0], key, each[1]]) than the triple appended below
                if [each[0], key, each[1]] not in configs:
                    configs.append([key, each[0], each[1]])
    return configs


ndf, dl, dc, ndr = pg.fast_non_dominated_sorting(
    points=[[0, 1], [-1, 3], [2.3, -0.2], [1.1, -0.12], [1.1, 2.12],
            [-70, -100]])
ndf = pg.non_dominated_front_2d(
    points=[[0, 1], [-1, 3], [2.3, -0.2], [1.1, -0.12], [1.1, 2.12],
            [-70, -100]])
print(ndf)

df = pd.read_csv('output/data64.csv', sep="\t")
datasets = list(set(df["dataset"]))

# alternative metric sets, kept for reference:
# eval_labels={"silhouette_score":1,"calinski_harabasz_score":1,"davies_bouldin_score":-1,"SSE":-1,"nSSE":-1}
# eval_labels={"silhouette_score":1,"calinski_harabasz_score":1,"davies_bouldin_score":-1}
# eval_labels={"Baker_Hubert_Gamma":-1,"Banfeld_Raferty":-1,"Davies_Bouldin":-1,"Dunns_index":1,"McClain_Rao":-1,"PBM_index":1,"Ratkowsky_Lance":1,"Ray_Turi":-1,"Scott_Symons":-1,"Wemmert_Gancarski":1,"Xie_Beni":-1,"c_index":-1,"g_plus_index":-1,"i_index":1,"modified_hubert_t":1,"point_biserial":1,"s_dbw":-1,"silhouette":1,"tau_index":1,"IIndex":1,"SDBW":-1,"calinski_harabasz_score":1}
eval_labels = {"Banfeld_Raferty": -1, "Davies_Bouldin": -1, "Dunns_index": 1,
               "McClain_Rao": -1, "PBM_index": 1, "Ratkowsky_Lance": 1,
               "Ray_Turi": -1, "Scott_Symons": -1, "Xie_Beni": -1,
               "c_index": -1, "i_index": 1, "modified_hubert_t": 1,
               "point_biserial": 1, "s_dbw": -1, "silhouette": 1,
               "IIndex": 1, "SDBW": -1, "calinski_harabasz_score": 1}
configs = create_configs(list(eval_labels.keys()))
print(len(configs))
print(len(list(itertools.permutations(list(eval_labels.keys()), 3))))
infos = []
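# For reference (same points as above): pg.fast_non_dominated_sorting returns
# the list of fronts (ndf), the domination lists (dl), the domination counts
# (dc) and the non-domination ranks (ndr); its first front contains the same
# indices as pg.non_dominated_front_2d.
pts = [[0, 1], [-1, 3], [2.3, -0.2], [1.1, -0.12], [1.1, 2.12], [-70, -100]]
fronts, dl, dc, ndr = pg.fast_non_dominated_sorting(points=pts)
print(fronts[0])  # here only point 5, [-70, -100], is non-dominated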
""" Pareto Front """ PF_total = [] for pf in range(len(Sigma_pred.T)): r = Sigma_pred[:, ][:, pf] un = Sigma_unpop[:, ][:, pf] PF = [] points = [] for ii in range(len(r)): points.append([-r[ii], -un[ii]]) if len(np.unique(points)) == 2: PF.append(np.array([0])) else: PF.append(pg.non_dominated_front_2d(points)) PF_total.append([PF]) # plot PFs for user 0 pf = 0 r = Sigma_pred[:, ][:, pf] un = Sigma_unpop[:, ][:, pf] print("PFs for user {} is \n{}: ".format(pf, PF_total[pf][0][0])) n = list(np.arange(1, NP + 1)) fig, ax = plt.subplots() ax.scatter(r, un) for i, txt in enumerate(n): ax.annotate(txt, (r[i], un[i])) plt.scatter(r[PF_total[pf][0][0]],
import pygmo

import analysisutil

analysisutil.add_argument('complexity_strategy')
analysisutil.add_argument('informativeness_strategy')

(args, setup, file_util) = analysisutil.init()

languages = file_util.load_dill('languages.dill')
informativeness = file_util.load_dill('informativeness_{0}.dill'.format(
    args.informativeness_strategy))
complexity = file_util.load_dill('complexity_{0}.dill'.format(
    args.complexity_strategy))

measurements = [(1 - inf, comp)
                for inf, comp in zip(informativeness, complexity)]

dominating_indices = pygmo.non_dominated_front_2d(measurements)
dominating_languages = [(languages[i], complexity[i], informativeness[i])
                        for i in dominating_indices]

filename = '{0}/dominating_languages_{1}_{2}.txt'.format(
    file_util.folderName, args.complexity_strategy,
    args.informativeness_strategy)
with open(filename, 'w') as f:
    for lang, comp, inf in dominating_languages:
        f.write("{0}\nComplexity : {1}\nInformativeness : {2}\n".format(
            list(map(str, lang)), comp, inf))
import numpy as np
import pygmo as pg


def NSGA2_pygmo(model, fevals, lb, ub, cf=None):
    """Finds the estimated Pareto front of a GPy model using NSGA-II [1]_.

    Parameters
    ----------
    model : GPy.models.gp_regression.GPRegression
        GPy regression model on which to find the Pareto front of its mean
        prediction and standard deviation.
    fevals : int
        Maximum number of times to evaluate a location using the model.
    lb : (D, ) numpy.ndarray
        Lower bound box constraint for each of the D dimensions.
    ub : (D, ) numpy.ndarray
        Upper bound box constraint for each of the D dimensions.
    cf : callable, optional
        Constraint function that returns True if it is called with a valid
        decision vector, else False.

    Returns
    -------
    X_front : (F, D) numpy.ndarray
        The F D-dimensional locations on the estimated Pareto front.
    musigma_front : (F, 2) numpy.ndarray
        The corresponding mean response and standard deviation of the
        locations on the front such that a point X_front[i, :] has a mean
        prediction musigma_front[i, 0] and standard deviation
        musigma_front[i, 1].

    Notes
    -----
    NSGA-II [1]_ discards locations on the Pareto front if the size of the
    front is greater than the population size. We counteract this by storing
    every location and its corresponding mean and standard deviation and
    calculating the Pareto front from this, thereby making the most of every
    GP model evaluation.

    References
    ----------
    .. [1] Kalyanmoy Deb, Amrit Pratap, Sameer Agarwal, and T. Meyarivan.
       A fast and elitist multiobjective genetic algorithm: NSGA-II.
       IEEE Transactions on Evolutionary Computation 6, 2 (2002), 182–197.
    """
    # internal class for the pygmo optimiser
    class GPY_WRAPPER(object):
        def __init__(self, model, lb, ub, cf, evals):
            # model = GPy model
            # lb = np.array of lower bounds on X
            # ub = np.array of upper bounds on X
            # cf = callable constraint function
            # evals = total evaluations to be carried out
            self.model = model
            self.lb = lb
            self.ub = ub
            self.nd = lb.size
            self.got_cf = cf is not None
            self.cf = cf
            self.i = 0  # evaluation pointer

        def get_bounds(self):
            return (self.lb, self.ub)

        def get_nobj(self):
            return 2

        def fitness(self, X):
            X = np.atleast_2d(X)
            f = model_fitness(X, self.model, self.cf, self.got_cf,
                              self.i, self.i + X.shape[0])
            self.i += X.shape[0]
            return f

    # fitness function for the optimiser
    def model_fitness(X, model, cf, got_cf, start_slice, end_slice):
        valid = True

        # if we select a location that violates the constraint,
        # ensure it cannot dominate anything by having its fitness values
        # maximally bad (i.e. set to infinity)
        if got_cf:
            if not cf(X):
                f = [np.inf, np.inf]
                valid = False

        if valid:
            mu, sigmaSQR = model.predict(X, full_cov=False)
            # note the negated standard deviation here: NSGA-II minimises,
            # so minimising -sqrt(sigmaSQR) maximises the uncertainty
            f = [mu.flat[0], -np.sqrt(sigmaSQR).flat[0]]

        # store every point ever evaluated
        model_fitness.X[start_slice:end_slice, :] = X
        model_fitness.Y[start_slice:end_slice, :] = f
        return f

    # get the problem dimensionality
    D = lb.size

    # NSGA-II settings
    POPSIZE = D * 100
    N_GENS = int(np.ceil(fevals / POPSIZE))
    # the initial population is also evaluated, hence the extra POPSIZE
    TOTAL_EVALUATIONS = POPSIZE * (N_GENS + 1)

    nsga2 = pg.algorithm(pg.nsga2(gen=1,
                                  cr=0.8,       # cross-over probability
                                  eta_c=20.0,   # distribution index (cr)
                                  m=1 / D,      # mutation rate
                                  eta_m=20.0))  # distribution index (m)

    # preallocate the storage of every location and fitness to be evaluated
    model_fitness.X = np.zeros((TOTAL_EVALUATIONS, D))
    model_fitness.Y = np.zeros((TOTAL_EVALUATIONS, 2))

    # problem instance
    gpy_problem = GPY_WRAPPER(model, lb, ub, cf, TOTAL_EVALUATIONS)
    problem = pg.problem(gpy_problem)

    # initialise the population (this evaluates POPSIZE locations)
    population = pg.population(problem, size=POPSIZE)

    # evolve the population
    for i in range(N_GENS):
        population = nsga2.evolve(population)

    # indices of the non-dominated points across the entire NSGA-II run
    front_inds = pg.non_dominated_front_2d(model_fitness.Y)

    X_front = model_fitness.X[front_inds, :]
    musigma_front = model_fitness.Y[front_inds, :]

    # convert the standard deviations back to positive values; NSGA-II
    # minimised the negative standard deviation (i.e. maximised the
    # standard deviation)
    musigma_front[:, 1] *= -1

    return X_front, musigma_front
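# Hypothetical usage sketch for NSGA2_pygmo: fit a small GPy model to
# synthetic data and estimate its mean/uncertainty Pareto front. The data,
# kernel, bounds and evaluation budget below are illustrative assumptions,
# not values from the original code.
import GPy

rng = np.random.default_rng(0)
X = rng.uniform(0.0, 1.0, size=(20, 2))
y = np.sin(X.sum(axis=1, keepdims=True)) + 0.05 * rng.standard_normal((20, 1))

gp = GPy.models.GPRegression(X, y, GPy.kern.RBF(input_dim=2))
gp.optimize()

lb, ub = np.zeros(2), np.ones(2)  # unit box constraints
X_front, musigma_front = NSGA2_pygmo(gp, fevals=2000, lb=lb, ub=ub)
print(X_front.shape, musigma_front.shape)  # (F, 2) and (F, 2) for D = 2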