def bin_search(dist: np.ndarray, level: float):
    """Bisect between two vectors until one has the requested relative entropy.

    "Relative entropy" here means ``entropy(v) / entropy(np.ones(len(dist)))``.
    The search interval is chosen from ``dist``:

    * if ``dist`` is at or above ``level``, the interval runs from a random
      one-hot vector (lower bound) up to ``dist`` (upper bound);
    * otherwise it runs from ``dist`` (lower bound) up to the all-ones vector
      (upper bound).

    Parameters
    ----------
    dist : np.ndarray
        Starting vector; only its length and its entropy are used directly.
    level : float
        Target relative entropy.

    Returns
    -------
    tuple
        ``(vector, searched)``. ``searched`` is ``False`` when the freshly
        created bound already hits ``level`` exactly (no bisection needed) and
        ``True`` when the vector was taken from the bisection bounds.

    Notes
    -----
    ``entropy``, ``isclose`` and ``midpoint`` are resolved from the enclosing
    module. NOTE(review): termination relies on ``isclose`` eventually
    accepting one of the bounds; there is no iteration cap.
    """
    max_entropy = entropy(np.ones(len(dist)))

    if entropy(dist) / max_entropy >= level:
        upper = dist
        lower = np.zeros(len(dist))
        lower[np.random.randint(len(lower))] = 1.
        if entropy(lower) / max_entropy == level:
            return lower, False
    else:
        upper = np.ones(len(dist))
        lower = dist
        if entropy(upper) / max_entropy == level:
            return upper, False

    upper_ent = entropy(upper) / max_entropy
    lower_ent = entropy(lower) / max_entropy

    while not isclose(upper_ent, level) and not isclose(lower_ent, level):
        mid = midpoint(lower, upper)
        # Evaluate the midpoint once and reuse it for both the branch decision
        # and the cached bound entropy.
        mid_ent = entropy(mid) / max_entropy
        if mid_ent >= level:
            upper, upper_ent = mid, mid_ent
        else:
            lower, lower_ent = mid, mid_ent

    # upper_ent always mirrors entropy(upper) / max_entropy, so no recompute.
    if isclose(upper_ent, level):
        return upper, True
    return lower, True
def sample(n_states: int, level: float):
    """Draw a random vector over ``n_states`` aiming at a relative entropy.

    Parameters
    ----------
    n_states : int
        Length of the vector to draw.
    level : float
        Target entropy as a fraction of the entropy of the all-ones vector;
        must lie in ``[0, 1]``.

    Returns
    -------
    np.ndarray
        Either the initial Dirichlet draw (if it already matches ``level``
        exactly), the vector returned by ``bin_search`` (when it reports that
        no bisection was needed), or a new Dirichlet draw whose concentration
        parameters are 1000 times the vector found by ``bin_search``.

    Raises
    ------
    ValueError
        If ``level`` is below 0 or above 1.
    """
    if level > 1 or level < 0:
        raise ValueError("level should be between 0 and 1")

    uniform = np.ones(n_states)
    draw = np.random.dirichlet(uniform)
    max_entropy = entropy(uniform)

    if entropy(draw) / max_entropy == level:
        return draw

    support, sample_again = bin_search(draw, level)
    if not sample_again:
        return support

    # Scale the support up so the follow-up Dirichlet draw is concentrated
    # tightly around it.
    return np.random.dirichlet(1000 * support)
def entropy(self):
    """Handle the ``entropy`` sub-command: print the entropy of a measure.

    Parses ``sys.argv[3:]`` for a positional ``measure`` name (plus whatever
    ``Measure.parent_parser`` contributes; ``args.network`` is presumably
    defined there -- confirm), evaluates the function of that name from
    ``measures`` on ``args.network`` and prints
    ``measures.entropy`` of the result.

    Supported measure names: ``in_selectivity``, ``out_selectivity``,
    ``selectivity``, ``in_ipr`` and ``out_ipr``. Any other name ends the
    program through ``parser.error`` (usage message, exit status 2) instead
    of failing later on an unbound local.
    """
    parser = argparse.ArgumentParser(prog='entropy',
                                     parents=[Measure.parent_parser])
    parser.add_argument('measure')
    args = parser.parse_args(sys.argv[3:])

    supported = ('in_selectivity', 'out_selectivity', 'selectivity',
                 'in_ipr', 'out_ipr')
    if args.measure not in supported:
        parser.error("unknown measure '%s' (choose from %s)" %
                     (args.measure, ', '.join(supported)))

    # Look the function up lazily so only the requested measure is touched.
    # The original 'out_ipr' branch stored its result in args.measure, which
    # left measure_dict unbound.
    measure_dict = getattr(measures, args.measure)(args.network)
    print(measures.entropy(measure_dict))
def max_synergistic(input: dit.Distribution,
                    conditional: np.ndarray,
                    eps: float = 0.01):
    """Build a nudge vector from the variable pair with the lowest summed entropy.

    For every pair of variable positions, the outcomes of ``input`` are
    grouped by the joint state of that pair, and the entropies of the pmf
    entries of each group are summed. The pair with the smallest sum (strictly
    below the initial bound) is kept. For that pair, ``max_global`` is called
    once per group with an equal share of ``eps``, and its result is written
    into the group's positions of the nudge vector.

    Parameters
    ----------
    input : dit.Distribution
        Joint distribution. NOTE(review): the group size assumes every
        variable shares the alphabet ``input.alphabet[0]`` -- confirm.
    conditional : np.ndarray
        Iterated row-wise; rows whose position belongs to a group are handed
        to ``max_global``.
    eps : float, optional
        Total nudge budget, divided evenly over the groups.

    Returns
    -------
    tuple
        ``(nudge_vector, best_syn_vars)``: a vector of length ``len(input)``
        and the selected pair of variable positions (``(0, 1)`` if no pair
        beat the initial bound).
    """
    rv_names = input.get_rv_names()
    states = input.alphabet[0]
    n_joint_states = len(states)**2
    partition_size = int(len(input) / n_joint_states)
    max_entropy = n_joint_states * entropy(np.ones(partition_size))

    best_syn_vars = (0, 1)
    best_outcome_dict = {}
    lowest_entropy = max_entropy

    for pair in itertools.combinations(range(len(rv_names)), r=2):
        first, second = pair

        # One index array per joint state of the candidate pair.
        groups = {}
        for joint_state in itertools.product(states, repeat=2):
            groups[joint_state] = np.zeros(partition_size, dtype=int)

        for position, outcome in enumerate(input.outcomes):
            slots = groups[(outcome[first], outcome[second])]
            # Fill the first slot that still holds zero.
            slots[np.argmax(slots == 0)] = position

        pair_entropy = sum(
            entropy(input.pmf[slots]) for slots in groups.values())

        if pair_entropy < lowest_entropy:
            best_syn_vars = pair
            lowest_entropy = pair_entropy
            best_outcome_dict = groups

    # Use the best pair to find the nudge vector with the largest impact.
    nudge_vector = np.zeros(len(input))
    for indices in best_outcome_dict.values():
        group_rows = np.array(
            [row for pos, row in enumerate(conditional) if pos in indices])
        nudge_vector[indices] = max_global(input.pmf[indices], group_rows,
                                           eps / len(best_outcome_dict),
                                           False)
    return nudge_vector, best_syn_vars
def outputting(gen,ts,agents,freq,memory,positions,out,directory,run,optimization,exploration):
    """Summarise the agents' current strings and log or print the summary.

    For every agent the string in ``memory`` and the string in ``positions``
    are collected. Edit distance, length, entropy and complexity statistics
    are averaged over the agents. When ``out == True`` one ``;``-separated
    record is appended to the file at ``directory``; otherwise a subset of
    the statistics is printed.

    Parameters
    ----------
    gen, ts, run, optimization, exploration
        Written verbatim into the output record (``gen`` and ``ts`` are also
        printed in console mode).
    agents : sequence
        Keys into ``memory`` and ``positions``.
    freq : mapping
        Must contain ``'delete'``, ``'transmit'``, ``'invent'`` and
        ``'modification'``; only the length of each entry is used.
    memory, positions : mapping
        Per-agent strings (hashable, with a length).
    out : bool
        ``True`` appends to ``directory``; anything else prints.
    directory : str
        Path of the file opened in append mode.
    """
    def _mean(values):
        # numpy sum divided by the plain element count.
        return np.sum(np.asarray(values))/len(values)

    del_freq = len(freq['delete'])
    tra_freq = len(freq['transmit'])
    inv_freq = len(freq['invent'])
    mod_freq = len(freq['modification'])

    solutions = [memory[agent] for agent in agents]
    problems = [positions[agent] for agent in agents]
    pairs = list(zip(solutions, problems))

    # Raw edit distance per agent, then normalised by the longer string.
    edit_out = [edit_distance(sol, prob) for sol, prob in pairs]
    edit_norm_out = [
        dist/max(len(sol), len(prob))
        for dist, (sol, prob) in zip(edit_out, pairs)
    ]

    len_out = [len(sol) for sol in solutions]
    ent_out = [entropy(sol) for sol in solutions]
    complex_out = [string_complexity(sol) for sol in solutions]
    prob_ent_out = [entropy(prob) for prob in problems]
    # Distinct problem lengths only.
    prob_str_out = list({len(prob) for prob in problems})

    solu_pool = len(set(solutions))
    prob_pool = len(set(problems))
    pop_size = len(agents)

    lev = _mean(edit_out)
    lev_norm = _mean(edit_norm_out)
    s_len = _mean(len_out)
    p_len = _mean(prob_str_out)
    ent = _mean(ent_out)
    p_ent = _mean(prob_ent_out)
    sol_complexity = _mean(complex_out)

    if out == True:
        record = (run, gen, ts, pop_size, optimization, exploration,
                  solu_pool, prob_pool, s_len, p_len, ent, p_ent, lev,
                  lev_norm, tra_freq, inv_freq, del_freq, mod_freq,
                  sol_complexity)
        with open(directory,'a') as output:
            output.write(';'.join(str(field) for field in record)+'\n')
    else:
        print('Gen:',gen)
        print('TS: ',ts)
        print('modification:',mod_freq)
        print('Transmit:',tra_freq)
        print('Invent:',inv_freq)
        print('Delete:',del_freq)
        print('Solution Pool Size: ',solu_pool)
        print('LD(Norm): ', lev_norm)
        print('String length: ', s_len)
        print('String Entropy (Average): ', ent)
        print('String Complexity: ', sol_complexity)
        print('Problem Length: ', p_len)
def max_individual(input: dit.Distribution,
                   conditional: np.ndarray,
                   eps: float = 0.01,
                   minimal_entropy_idx=None):
    """Search sign alignments for the strongest nudge on one variable.

    The chosen variable is ``minimal_entropy_idx`` or, when that is not given,
    the variable whose marginal has the lowest entropy. ``input`` is
    conditioned on all other variables. For each resulting conditional
    distribution, every non-constant vector of ``-1``/``+1`` signs is applied
    to the matching rows of the rotated ``conditional`` and passed to
    ``find_max_impact``; the vector with the largest impact is stored.

    Parameters
    ----------
    input : dit.Distribution
        Joint distribution.
    conditional : np.ndarray
        Normalised by its total sum before use, then rotated with ``R``.
    eps : float, optional
        Nudge budget; each conditional distribution gets
        ``(eps / 2) / number_of_conditionals``.
    minimal_entropy_idx : int, optional
        Position of the variable to nudge. ``0`` is honoured as a valid
        position; any other falsy value triggers the automatic choice.

    Returns
    -------
    tuple
        ``(nudge_vector, total_max_impact, minimal_entropy_idx)`` where
        ``nudge_vector`` has one row per conditional distribution.

    Notes
    -----
    The inner loop enumerates ``2 ** rotated_conditional.shape[1]`` sign
    vectors per conditional distribution.
    """
    rvs = input.get_rv_names()
    conditional = conditional / conditional.sum()
    states = len(input.alphabet[0])

    # Accept an explicit 0; treat every other falsy value as "not supplied".
    if not (minimal_entropy_idx == 0 or minimal_entropy_idx):
        marginal_entropies = [
            entropy(input.marginal([rv], rv_mode='indices').pmf)
            for rv in range(len(rvs))
        ]
        minimal_entropy_idx = np.argmin(marginal_entropies)

    non_minimal_rvs = (rvs[:minimal_entropy_idx] +
                       rvs[minimal_entropy_idx + 1:])
    _non_minimal_marginal, minimal_conditional = input.condition_on(
        non_minimal_rvs)
    for cond_dist in minimal_conditional:
        cond_dist.make_dense()

    n_conditionals = len(minimal_conditional)
    nudge_vector = np.zeros((n_conditionals, len(minimal_conditional[0])))
    rotated_conditional = R(conditional, minimal_entropy_idx, len(rvs),
                            states)
    n_columns = rotated_conditional.shape[1]
    local_eps = (eps / 2) / n_conditionals

    total_max_impact = 0
    for i, mc_dist in enumerate(minimal_conditional):
        rows = rotated_conditional[i * states:(i + 1) * states, :]
        max_impact = 0
        for signs in itertools.product([-1, 1], repeat=n_columns):
            allignment = np.array(signs)
            # A constant sign vector carries no contrast between columns.
            if np.all(allignment == 1) or np.all(allignment == -1):
                continue
            scores = np.sum(allignment * rows, axis=1)
            vector, impact = find_max_impact(scores, mc_dist.pmf, local_eps)
            if impact > max_impact:
                nudge_vector[i, :] = vector
                max_impact = impact
        total_max_impact += max_impact

    return nudge_vector, total_max_impact, minimal_entropy_idx
def max_local(input: dit.Distribution,
              conditional: np.ndarray,
              eps: float = 0.01):
    """Compute one individual nudge per variable of ``input``.

    Variables are visited in order of increasing marginal entropy, and
    ``max_individual`` is called for each with an equal share of ``eps``.

    Parameters
    ----------
    input : dit.Distribution
        Joint distribution.
    conditional : np.ndarray
        Forwarded unchanged to ``max_individual``.
    eps : float, optional
        Total nudge budget, split evenly over the variables.

    Returns
    -------
    tuple
        ``(nudge_vectors, max_impacts)``: an array of shape
        ``(input.outcome_length(), len(input) // 3, 3)`` and a vector with
        one impact per variable.

    Notes
    -----
    NOTE(review): the trailing dimensions hard-code 3, presumably the number
    of states per variable -- confirm before using other alphabets.
    """
    rvs = input.get_rv_names()
    marginal_entropies = [
        entropy(input.marginal([rv], rv_mode='indices').pmf)
        for rv in range(len(rvs))
    ]
    sorted_rvs = np.argsort(marginal_entropies)

    n_vars = input.outcome_length()
    # One (hopefully different) nudge array per random variable.
    nudge_vectors = np.zeros((n_vars, int(len(input) / 3), 3))
    max_impacts = np.zeros(n_vars)

    for rv in sorted_rvs:
        vectors, impact, _ = max_individual(input, conditional,
                                            eps / len(sorted_rvs), rv)
        nudge_vectors[rv, :, :] = vectors
        max_impacts[rv] = impact

    return nudge_vectors, max_impacts
def max_local_nudge2(old_X: dit.Distribution,
                     YgivenX: np.ndarray,
                     eps: float = 0.01):
    """Apply an individual nudge to every variable of ``old_X`` in turn.

    A distribution with a single variable is delegated to
    ``max_global_nudge``. Otherwise the variables are visited in order of
    increasing marginal entropy. For each one, ``max_nudge`` (in
    ``'individual'`` mode, with an equal share of ``eps``) produces the
    nudges, and ``do_max_individual_nudge`` applies them to the running
    distribution. Finally the result is rebuilt over the outcomes of the
    densified ``old_X``, with probability ``0.0`` for outcomes that are
    missing from the nudged distribution.

    Parameters
    ----------
    old_X : dit.Distribution
        Input distribution. It is made dense in place as a side effect.
    YgivenX : np.ndarray
        Conditional passed through to the nudge helpers.
    eps : float, optional
        Total nudge budget.

    Returns
    -------
    dit.Distribution
        The nudged distribution, carrying the variable names and ``_mask``
        of ``old_X``.
    """
    if old_X.outcome_length() == 1:
        return max_global_nudge(old_X, YgivenX, eps)

    mask = old_X._mask
    base = old_X.get_base()
    new_X = old_X.copy(base=base)
    old_X.make_dense()
    rvs = old_X.get_rv_names()

    marginal_entropies = [
        entropy(old_X.marginal([rv], rv_mode='indices').pmf)
        for rv in range(len(rvs))
    ]
    sorted_rvs = np.argsort(marginal_entropies)
    outcomes = old_X.outcomes

    for rv in sorted_rvs:
        nudges, _ = max_nudge(new_X.copy('linear'),
                              YgivenX,
                              eps=(eps / len(sorted_rvs)),
                              nudge_type='individual',
                              minimal_entropy_idx=rv)
        new_X = do_max_individual_nudge(new_X, nudges, rv, True)
        new_X.make_dense()

    # Re-express the result on the original outcome set.
    probabilities = {}
    for outcome in outcomes:
        if outcome in new_X.outcomes:
            probabilities[outcome] = new_X[outcome]
        else:
            probabilities[outcome] = 0.0

    new_X = dit.Distribution(probabilities)
    new_X.set_rv_names(rvs)
    new_X._mask = mask
    return new_X
def run(optimizer,
        objectivefunc,
        dataset_List,
        NumOfRuns,
        params,
        export_flags,
        auto_cluster=True,
        n_clusters='supervised',
        labels_exist=True,
        metric='euclidean'):
    """
    It serves as the main interface of the framework for running the experiments.

    Parameters
    ----------
    optimizer : list
        The list of optimizers names
    objectivefunc : list
        The list of objective functions
    dataset_List : list
        The list of the names of the data sets files (without the ``.csv``
        extension)
    NumOfRuns : int
        The number of independent runs
    params : dict
        The parameters, read by key:
        1. PopulationSize (Size of population)
        2. Iterations (The number of iterations)
    export_flags : dict
        The Boolean flags, read by key:
        1. Export_avg (Exporting the averaged results in a file)
        2. Export_details (Exporting the detailed results in files)
        3. Export_details_labels (Exporting the labels detailed results in files)
        4. Export_convergence (Exporting the convergence plots)
        5. Export_boxplot (Exporting the box plots)
    auto_cluster : boolean, default = True
        Choose whether the number of clusters is detected automatically.
        If True, n_clusters must not be a list; pass one of the names
        described below.
        If False, specify a list of integers for n_clusters.
    n_clusters : string, or list, default = 'supervised'
        A list of the number of clusters for the datasets in dataset_List.
        Other values can be considered instead of specifying the real value,
        which are as follows:
        - supervised: derived from the true labels of the datasets
        - elbow: detected by the elbow method
        - gap: detected by gap analysis
        - silhouette: detected by the silhouette coefficient method
        - CH: detected by the Calinski-Harabasz index
        - DB: detected by the Davies Bouldin index
        - BIC: detected by the Bayesian Information Criterion score
        - min / max / median / majority: the minimum, maximum, median or
          majority vote of the values found by all detection techniques
    labels_exist : boolean, default = True
        Specify if labels exist as the last column of the csv file of the
        datasets in dataset_List. If the value is False, the following hold:
        - supervised value for n_clusters is not allowed
        - experiments, and experiments_details files contain only the
          evaluation measures for "SSE","TWCV","SC","DB","DI","STDev"
        - Export_boxplot is set for "SSE","TWCV","SC","DB","DI","STDev"
        - datasets are read from "datasets/unsupervised/" instead of
          "datasets/"
    metric : string, default = 'euclidean'
        The metric to use when calculating the distance between points if
        applicable for the objective function selected. It is forwarded to
        ``selector``; it must be one of the options allowed by
        scipy.spatial.distance.pdist for its metric parameter

    Returns
    -----------
    N/A

    Notes
    -----
    Invalid argument combinations print a message and end the program via
    ``sys.exit()``. Results are written below a new directory named after
    the current date and time.
    """
    # Maps each automatic n_clusters option to the clus_det function that
    # implements it. Names (not functions) are stored so only the detector
    # actually requested is looked up.
    detector_names = {
        'elbow': 'ELBOW',
        'gap': 'GAP_STATISTICS',
        'silhouette': 'SC',
        'DB': 'DB',
        'CH': 'CH',
        'BIC': 'BIC',
        'min': 'min_clusters',
        'max': 'max_clusters',
        'median': 'median_clusters',
        'majority': 'majority_clusters',
    }

    def _round2(value):
        # Two-decimal rounding through string formatting, as used for every
        # exported number.
        return float("%0.2f" % (value))

    if not labels_exist and n_clusters == 'supervised':
        print(
            'Syupervised value for n_clusters is not allowed when labels_exist value is false'
        )
        sys.exit()

    if isinstance(n_clusters, list):
        if len(n_clusters) != len(dataset_List):
            print(
                'Length of n_clusters list should equal the length of dataset_List list'
            )
            sys.exit()
        if min(n_clusters) < 2:
            # The check accepts 2, so the message says "at least".
            print('n_clusters value should be at least 2')
            sys.exit()
        if auto_cluster == True:
            print('n_clusters should be string if auto_cluster is true')
            sys.exit()
    else:
        if auto_cluster == False:
            print(
                'n_clusters should be a list of integers if auto_cluster is false'
            )
            sys.exit()
        if (isinstance(n_clusters, str) and n_clusters != 'supervised'
                and n_clusters not in detector_names):
            # An unknown name would otherwise be indexed per dataset and
            # yield single characters as cluster counts.
            print('Unknown n_clusters value: ' + n_clusters)
            sys.exit()

    # Select general parameters for all optimizers (population size, number of iterations) ....
    PopulationSize = params['PopulationSize']
    Iterations = params['Iterations']

    # Export results ?
    Export = export_flags['Export_avg']
    Export_details = export_flags['Export_details']
    Export_details_labels = export_flags['Export_details_labels']
    Export_convergence = export_flags['Export_convergence']
    Export_boxplot = export_flags['Export_boxplot']

    # Each flag records whether the matching CSV header was already written.
    Flag = False
    Flag_details = False
    Flag_details_Labels = False

    # CSV header for the convergence columns
    CnvgHeader = []

    if labels_exist:
        datasets_directory = "datasets/"  # the directory where the dataset is stored
    else:
        datasets_directory = "datasets/unsupervised/"  # the directory where the dataset is stored

    results_directory = time.strftime("%Y-%m-%d-%H-%M-%S") + '/'
    Path(results_directory).mkdir(parents=True, exist_ok=True)

    dataset_len = len(dataset_List)

    k = [-1] * dataset_len
    f = [-1] * dataset_len
    points = [0] * dataset_len
    labelsTrue = [0] * dataset_len

    for l in range(0, Iterations):
        CnvgHeader.append("Iter" + str(l + 1))

    # read all datasets
    module_directory = os.path.abspath(os.path.dirname(__file__))
    for h in range(dataset_len):
        dataset_filename = dataset_List[h] + '.csv'
        dataset_path = os.path.join(module_directory,
                                    datasets_directory + dataset_filename)

        # Read the dataset file and generate the points list and true values.
        # The context manager closes the handle once the data is loaded.
        with open(dataset_path, 'rt') as rawData:
            data = numpy.loadtxt(rawData, delimiter=",")

        nPoints, nValues = data.shape  # Number of points and Number of values for each point

        if labels_exist:
            f[h] = nValues - 1  # Dimension value
            points[h] = data[:, :-1].tolist()  # list of points
            labelsTrue[h] = data[:, -1].tolist()  # List of actual cluster of each points (last field)
        else:
            f[h] = nValues  # Dimension value
            points[h] = data.copy().tolist()  # list of points
            labelsTrue[h] = None  # no true labels available

        points[h] = preprocessing.normalize(points[h], norm='max', axis=0)

        # k: Number of clusters
        if isinstance(n_clusters, str):
            if n_clusters == 'supervised':
                k[h] = len(numpy.unique(data[:, -1]))
            else:
                detect = getattr(clus_det, detector_names[n_clusters])
                k[h] = detect(points[h])
        else:
            k[h] = n_clusters[h]

    for i in range(0, len(optimizer)):
        for j in range(0, len(objectivefunc)):
            for h in range(len(dataset_List)):
                HS = [0] * NumOfRuns
                CS = [0] * NumOfRuns
                VM = [0] * NumOfRuns
                AMI = [0] * NumOfRuns
                ARI = [0] * NumOfRuns
                Fmeasure = [0] * NumOfRuns
                SC = [0] * NumOfRuns
                accuracy = [0] * NumOfRuns
                DI = [0] * NumOfRuns
                DB = [0] * NumOfRuns
                stdev = [0] * NumOfRuns
                exSSE = [0] * NumOfRuns
                exTWCV = [0] * NumOfRuns
                purity = [0] * NumOfRuns
                entropy = [0] * NumOfRuns
                convergence = [0] * NumOfRuns
                executionTime = [0] * NumOfRuns

                for z in range(0, NumOfRuns):
                    print("Dataset: " + dataset_List[h])
                    print("k: " + str(k[h]))
                    print("Run no.: " + str(z))
                    print("Population Size: " + str(PopulationSize))
                    print("Iterations: " + str(Iterations))

                    objective_name = objectivefunc[j]
                    x = selector(optimizer[i], objective_name, k[h], f[h],
                                 PopulationSize, Iterations, points[h],
                                 metric)

                    # Measures that compare against the true labels.
                    if labels_exist:
                        HS[z] = measures.HS(labelsTrue[h], x.labelsPred)
                        CS[z] = measures.CS(labelsTrue[h], x.labelsPred)
                        VM[z] = measures.VM(labelsTrue[h], x.labelsPred)
                        AMI[z] = measures.AMI(labelsTrue[h], x.labelsPred)
                        ARI[z] = measures.ARI(labelsTrue[h], x.labelsPred)
                        Fmeasure[z] = measures.Fmeasure(
                            labelsTrue[h], x.labelsPred)
                        accuracy[z] = measures.accuracy(
                            labelsTrue[h], x.labelsPred)
                        purity[z] = measures.purity(labelsTrue[h],
                                                    x.labelsPred)
                        entropy[z] = measures.entropy(labelsTrue[h],
                                                      x.labelsPred)

                    # Measures that only need the points and the prediction.
                    SC[z] = measures.SC(points[h], x.labelsPred)
                    DI[z] = measures.DI(points[h], x.labelsPred)
                    DB[z] = measures.DB(points[h], x.labelsPred)
                    stdev[z] = measures.stdev(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    exSSE[z] = measures.SSE(x.bestIndividual, x.labelsPred,
                                            k[h], points[h])
                    exTWCV[z] = measures.TWCV(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])

                    executionTime[z] = x.executionTime
                    convergence[z] = x.convergence
                    optimizerName = x.optimizer
                    objfname = x.objfname

                    if (Export_details_labels == True):
                        ExportToFileDetailsLabels = results_directory + "experiment_details_Labels.csv"
                        with open(ExportToFileDetailsLabels, 'a',
                                  newline='\n') as out_details_labels:
                            writer_details = csv.writer(out_details_labels,
                                                        delimiter=',')
                            if (Flag_details_Labels == False
                                ):  # just one time to write the header of the CSV file
                                header_details = numpy.concatenate(
                                    [["Dataset", "Optimizer", "objfname",
                                      "k"]])
                                writer_details.writerow(header_details)
                                Flag_details_Labels = True
                            a = numpy.concatenate([[
                                dataset_List[h], optimizerName, objfname, k[h]
                            ], x.labelsPred])
                            writer_details.writerow(a)

                    if (Export_details == True):
                        ExportToFileDetails = results_directory + "experiment_details.csv"
                        with open(ExportToFileDetails, 'a',
                                  newline='\n') as out_details:
                            writer_details = csv.writer(out_details,
                                                        delimiter=',')
                            if (Flag_details == False
                                ):  # just one time to write the header of the CSV file
                                if labels_exist:
                                    header_details = numpy.concatenate([[
                                        "Dataset", "Optimizer", "objfname",
                                        "k", "ExecutionTime", "SSE", "Purity",
                                        "Entropy", "HS", "CS", "VM", "AMI",
                                        "ARI", "Fmeasure", "TWCV", "SC",
                                        "Accuracy", "DI", "DB", "STDev"
                                    ], CnvgHeader])
                                else:
                                    header_details = numpy.concatenate([[
                                        "Dataset", "Optimizer", "objfname",
                                        "k", "ExecutionTime", "SSE", "TWCV",
                                        "SC", "DI", "DB", "STDev"
                                    ], CnvgHeader])
                                writer_details.writerow(header_details)
                                Flag_details = True
                            if labels_exist:
                                a = numpy.concatenate([[
                                    dataset_List[h], optimizerName, objfname,
                                    k[h],
                                    _round2(executionTime[z]),
                                    _round2(exSSE[z]),
                                    _round2(purity[z]),
                                    _round2(entropy[z]),
                                    _round2(HS[z]),
                                    _round2(CS[z]),
                                    _round2(VM[z]),
                                    _round2(AMI[z]),
                                    _round2(ARI[z]),
                                    _round2(Fmeasure[z]),
                                    _round2(exTWCV[z]),
                                    _round2(SC[z]),
                                    _round2(accuracy[z]),
                                    _round2(DI[z]),
                                    _round2(DB[z]),
                                    _round2(stdev[z])
                                ], numpy.around(convergence[z], decimals=2)])
                            else:
                                a = numpy.concatenate([[
                                    dataset_List[h], optimizerName, objfname,
                                    k[h],
                                    _round2(executionTime[z]),
                                    _round2(exSSE[z]),
                                    _round2(exTWCV[z]),
                                    _round2(SC[z]),
                                    _round2(DI[z]),
                                    _round2(DB[z]),
                                    _round2(stdev[z])
                                ], numpy.around(convergence[z], decimals=2)])
                            writer_details.writerow(a)

                # Averages over all runs of this optimizer/objective/dataset.
                if (Export == True):
                    ExportToFile = results_directory + "experiment.csv"

                    with open(ExportToFile, 'a', newline='\n') as out:
                        writer = csv.writer(out, delimiter=',')
                        if (Flag == False
                            ):  # just one time to write the header of the CSV file
                            if labels_exist:
                                header = numpy.concatenate([[
                                    "Dataset", "Optimizer", "objfname", "k",
                                    "ExecutionTime", "SSE", "Purity",
                                    "Entropy", "HS", "CS", "VM", "AMI", "ARI",
                                    "Fmeasure", "TWCV", "SC", "Accuracy",
                                    "DI", "DB", "STDev"
                                ], CnvgHeader])
                            else:
                                header = numpy.concatenate([[
                                    "Dataset", "Optimizer", "objfname", "k",
                                    "ExecutionTime", "SSE", "TWCV", "SC",
                                    "DI", "DB", "STDev"
                                ], CnvgHeader])
                            writer.writerow(header)
                            Flag = True  # at least one experiment

                        avgSSE = str(_round2(sum(exSSE) / NumOfRuns))
                        avgTWCV = str(_round2(sum(exTWCV) / NumOfRuns))
                        avgPurity = str(_round2(sum(purity) / NumOfRuns))
                        avgEntropy = str(_round2(sum(entropy) / NumOfRuns))
                        avgHomo = str(_round2(sum(HS) / NumOfRuns))
                        avgComp = str(_round2(sum(CS) / NumOfRuns))
                        avgVmeas = str(_round2(sum(VM) / NumOfRuns))
                        avgAMI = str(_round2(sum(AMI) / NumOfRuns))
                        avgARI = str(_round2(sum(ARI) / NumOfRuns))
                        avgFmeasure = str(_round2(sum(Fmeasure) / NumOfRuns))
                        avgSC = str(_round2(sum(SC) / NumOfRuns))
                        avgAccuracy = str(_round2(sum(accuracy) / NumOfRuns))
                        avgDI = str(_round2(sum(DI) / NumOfRuns))
                        avgDB = str(_round2(sum(DB) / NumOfRuns))
                        avgStdev = str(_round2(sum(stdev) / NumOfRuns))

                        avgExecutionTime = _round2(
                            sum(executionTime) / NumOfRuns)
                        avgConvergence = numpy.around(numpy.mean(
                            convergence, axis=0, dtype=numpy.float64),
                                                      decimals=2).tolist()
                        if labels_exist:
                            a = numpy.concatenate([[
                                dataset_List[h], optimizerName, objfname,
                                k[h], avgExecutionTime, avgSSE, avgPurity,
                                avgEntropy, avgHomo, avgComp, avgVmeas,
                                avgAMI, avgARI, avgFmeasure, avgTWCV, avgSC,
                                avgAccuracy, avgDI, avgDB, avgStdev
                            ], avgConvergence])
                        else:
                            a = numpy.concatenate([[
                                dataset_List[h], optimizerName, objfname,
                                k[h], avgExecutionTime, avgSSE, avgTWCV,
                                avgSC, avgDI, avgDB, avgStdev
                            ], avgConvergence])
                        writer.writerow(a)

    if Export_convergence == True:
        conv_plot.run(results_directory, optimizer, objectivefunc,
                      dataset_List, Iterations)

    if Export_boxplot == True:
        if labels_exist:
            ev_measures = [
                'SSE', 'Purity', 'Entropy', 'HS', 'CS', 'VM', 'AMI', 'ARI',
                'Fmeasure', 'TWCV', 'SC', 'Accuracy', 'DI', 'DB', 'STDev'
            ]
        else:
            ev_measures = ['SSE', 'TWCV', 'SC', 'DI', 'DB', 'STDev']
        box_plot.run(results_directory, optimizer, objectivefunc,
                     dataset_List, ev_measures, Iterations)

    print("Execution completed")
def run(optimizer, objectivefunc, dataset_List, NumOfRuns, params,
        export_flags):
    """
    It serves as the main interface of the framework for running the experiments.

    Parameters
    ----------
    optimizer : list
        The list of optimizers names
    objectivefunc : list
        The list of objective functions; each entry is forwarded to
        ``selector``
    dataset_List : list
        The list of the names of the data sets files (without the ``.csv``
        extension); the last column of each file is read as the true label
    NumOfRuns : int
        The number of independent runs
    params : dict
        The parameters, read by key:
        1. PopulationSize (Size of population)
        2. Iterations (The number of iterations)
    export_flags : dict
        The Boolean flags, read by key:
        1. Export_avg (Exporting the averaged results in a file)
        2. Export_details (Exporting the detailed results in files)
        3. Export_details_labels (Exporting the labels detailed results in files)
        4. Export_convergence (Exporting the convergence plots)
        5. Export_boxplot (Exporting the box plots)

    Returns
    -----------
    N/A

    Notes
    -----
    Results are written below a new directory named after the current date
    and time.
    """
    def _round2(value):
        # Two-decimal rounding through string formatting, as used for every
        # exported number.
        return float("%0.2f" % (value))

    # Select general parameters for all optimizers (population size, number of iterations) ....
    PopulationSize = params['PopulationSize']
    Iterations = params['Iterations']

    # Export results ?
    Export = export_flags['Export_avg']
    Export_details = export_flags['Export_details']
    Export_details_labels = export_flags['Export_details_labels']
    Export_convergence = export_flags['Export_convergence']
    Export_boxplot = export_flags['Export_boxplot']

    # Flag: at least one experiment ran (also gates the averaged CSV header).
    # Flag_details / Flag_details_Labels: the matching CSV header was written.
    Flag = False
    Flag_details = False
    Flag_details_Labels = False

    # CSV header for the convergence columns
    CnvgHeader = []

    datasets_directory = "datasets/"  # the directory where the dataset is stored
    # Automatically generated name by date and time
    results_directory = time.strftime("%Y-%m-%d-%H-%M-%S") + '/'
    Path(results_directory).mkdir(parents=True, exist_ok=True)

    dataset_len = len(dataset_List)

    k = [-1] * dataset_len
    f = [-1] * dataset_len
    points = [0] * dataset_len
    labelsTrue = [0] * dataset_len

    for l in range(0, Iterations):
        CnvgHeader.append("Iter" + str(l + 1))

    # read all datasets
    module_directory = os.path.abspath(os.path.dirname(__file__))
    for h in range(dataset_len):
        dataset_filename = dataset_List[h] + '.csv'
        dataset_path = os.path.join(module_directory,
                                    datasets_directory + dataset_filename)

        # Read the dataset file and generate the points list and true values.
        # The context manager closes the handle once the data is loaded.
        with open(dataset_path, 'rt') as rawData:
            data = numpy.loadtxt(rawData, delimiter=",")

        nPoints, nValues = data.shape  # Number of points and Number of values for each point
        f[h] = nValues - 1  # Dimension value
        k[h] = len(numpy.unique(data[:, -1]))  # k: Number of clusters
        points[h] = data[:, :-1].tolist()  # list of points
        labelsTrue[h] = data[:, -1].tolist()  # List of actual cluster of each points (last field)

        points[h] = preprocessing.normalize(points[h], norm='max', axis=0)

    for i in range(0, len(optimizer)):
        for j in range(0, len(objectivefunc)):
            for h in range(len(dataset_List)):
                HS = [0] * NumOfRuns
                CS = [0] * NumOfRuns
                VM = [0] * NumOfRuns
                AMI = [0] * NumOfRuns
                ARI = [0] * NumOfRuns
                Fmeasure = [0] * NumOfRuns
                SC = [0] * NumOfRuns
                accuracy = [0] * NumOfRuns
                DI = [0] * NumOfRuns
                DB = [0] * NumOfRuns
                stdev = [0] * NumOfRuns
                exSSE = [0] * NumOfRuns
                exTWCV = [0] * NumOfRuns
                purity = [0] * NumOfRuns
                entropy = [0] * NumOfRuns
                convergence = [0] * NumOfRuns
                executionTime = [0] * NumOfRuns

                for z in range(0, NumOfRuns):
                    print("Dataset: " + dataset_List[h])
                    print("Run no.: " + str(z))
                    print("Population Size: " + str(PopulationSize))
                    print("Iterations: " + str(Iterations))

                    objective_name = objectivefunc[j]
                    x = selector(optimizer[i], objective_name, k[h], f[h],
                                 PopulationSize, Iterations, points[h])

                    HS[z] = measures.HS(labelsTrue[h], x.labelsPred)
                    CS[z] = measures.CS(labelsTrue[h], x.labelsPred)
                    VM[z] = measures.VM(labelsTrue[h], x.labelsPred)
                    AMI[z] = measures.AMI(labelsTrue[h], x.labelsPred)
                    ARI[z] = measures.ARI(labelsTrue[h], x.labelsPred)
                    Fmeasure[z] = measures.Fmeasure(labelsTrue[h],
                                                    x.labelsPred)
                    SC[z] = measures.SC(points[h], x.labelsPred)
                    accuracy[z] = measures.accuracy(labelsTrue[h],
                                                    x.labelsPred)
                    DI[z] = measures.DI(points[h], x.labelsPred)
                    DB[z] = measures.DB(points[h], x.labelsPred)
                    stdev[z] = measures.stdev(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    exSSE[z] = measures.SSE(x.bestIndividual, x.labelsPred,
                                            k[h], points[h])
                    exTWCV[z] = measures.TWCV(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    purity[z] = measures.purity(labelsTrue[h], x.labelsPred)
                    entropy[z] = measures.entropy(labelsTrue[h], x.labelsPred)

                    executionTime[z] = x.executionTime
                    convergence[z] = x.convergence
                    optimizerName = x.optimizer
                    objfname = x.objfname

                    if (Export_details_labels == True):
                        ExportToFileDetailsLabels = results_directory + "experiment_details_Labels.csv"
                        with open(ExportToFileDetailsLabels, 'a',
                                  newline='\n') as out_details_labels:
                            writer_details = csv.writer(out_details_labels,
                                                        delimiter=',')
                            if (Flag_details_Labels == False
                                ):  # just one time to write the header of the CSV file
                                header_details = numpy.concatenate(
                                    [["Dataset", "Optimizer", "objfname"]])
                                writer_details.writerow(header_details)
                                Flag_details_Labels = True
                            a = numpy.concatenate(
                                [[dataset_List[h], optimizerName, objfname],
                                 x.labelsPred])
                            writer_details.writerow(a)

                    if (Export_details == True):
                        ExportToFileDetails = results_directory + "experiment_details.csv"
                        with open(ExportToFileDetails, 'a',
                                  newline='\n') as out_details:
                            writer_details = csv.writer(out_details,
                                                        delimiter=',')
                            if (Flag_details == False
                                ):  # just one time to write the header of the CSV file
                                header_details = numpy.concatenate([[
                                    "Dataset", "Optimizer", "objfname",
                                    "ExecutionTime", "SSE", "Purity",
                                    "Entropy", "HS", "CS", "VM", "AMI", "ARI",
                                    "Fmeasure", "TWCV", "SC", "Accuracy",
                                    "DI", "DB", "STDev"
                                ], CnvgHeader])
                                writer_details.writerow(header_details)
                                Flag_details = True
                            a = numpy.concatenate([[
                                dataset_List[h], optimizerName, objfname,
                                _round2(executionTime[z]),
                                _round2(exSSE[z]),
                                _round2(purity[z]),
                                _round2(entropy[z]),
                                _round2(HS[z]),
                                _round2(CS[z]),
                                _round2(VM[z]),
                                _round2(AMI[z]),
                                _round2(ARI[z]),
                                _round2(Fmeasure[z]),
                                _round2(exTWCV[z]),
                                _round2(SC[z]),
                                _round2(accuracy[z]),
                                _round2(DI[z]),
                                _round2(DB[z]),
                                _round2(stdev[z])
                            ], numpy.around(convergence[z], decimals=2)])
                            writer_details.writerow(a)

                # Averages over all runs of this optimizer/objective/dataset.
                if (Export == True):
                    ExportToFile = results_directory + "experiment.csv"

                    with open(ExportToFile, 'a', newline='\n') as out:
                        writer = csv.writer(out, delimiter=',')
                        if (Flag == False
                            ):  # just one time to write the header of the CSV file
                            header = numpy.concatenate([[
                                "Dataset", "Optimizer", "objfname",
                                "ExecutionTime", "SSE", "Purity", "Entropy",
                                "HS", "CS", "VM", "AMI", "ARI", "Fmeasure",
                                "TWCV", "SC", "Accuracy", "DI", "DB", "STDev"
                            ], CnvgHeader])
                            writer.writerow(header)

                        avgSSE = str(_round2(sum(exSSE) / NumOfRuns))
                        avgTWCV = str(_round2(sum(exTWCV) / NumOfRuns))
                        avgPurity = str(_round2(sum(purity) / NumOfRuns))
                        avgEntropy = str(_round2(sum(entropy) / NumOfRuns))
                        avgHomo = str(_round2(sum(HS) / NumOfRuns))
                        avgComp = str(_round2(sum(CS) / NumOfRuns))
                        avgVmeas = str(_round2(sum(VM) / NumOfRuns))
                        avgAMI = str(_round2(sum(AMI) / NumOfRuns))
                        avgARI = str(_round2(sum(ARI) / NumOfRuns))
                        avgFmeasure = str(_round2(sum(Fmeasure) / NumOfRuns))
                        avgSC = str(_round2(sum(SC) / NumOfRuns))
                        avgAccuracy = str(_round2(sum(accuracy) / NumOfRuns))
                        avgDI = str(_round2(sum(DI) / NumOfRuns))
                        avgDB = str(_round2(sum(DB) / NumOfRuns))
                        avgStdev = str(_round2(sum(stdev) / NumOfRuns))

                        avgExecutionTime = _round2(
                            sum(executionTime) / NumOfRuns)
                        avgConvergence = numpy.around(numpy.mean(
                            convergence, axis=0, dtype=numpy.float64),
                                                      decimals=2).tolist()
                        a = numpy.concatenate([[
                            dataset_List[h], optimizerName, objfname,
                            avgExecutionTime, avgSSE, avgPurity, avgEntropy,
                            avgHomo, avgComp, avgVmeas, avgAMI, avgARI,
                            avgFmeasure, avgTWCV, avgSC, avgAccuracy, avgDI,
                            avgDB, avgStdev
                        ], avgConvergence])
                        writer.writerow(a)

                # Set for every optimizer/objective/dataset combination that
                # was processed, whether or not the averages were exported.
                Flag = True  # at least one experiment

    if Export_convergence == True:
        conv_plot.run(results_directory, optimizer, objectivefunc,
                      dataset_List, Iterations)

    if Export_boxplot == True:
        ev_measures = [
            'SSE', 'Purity', 'Entropy', 'HS', 'CS', 'VM', 'AMI', 'ARI',
            'Fmeasure', 'TWCV', 'SC', 'Accuracy', 'DI', 'DB', 'STDev'
        ]
        box_plot.run(results_directory, optimizer, objectivefunc,
                     dataset_List, ev_measures, Iterations)

    if (Flag == False):  # Failed to run at least one experiment
        print(
            "No Optomizer or Cost function is selected. Check lists of available optimizers and cost functions"
        )

    print("Execution completed")
ARI[z] = measures.ARI(labelsTrue[h], x.labelsPred) Fmeasure[z] = measures.Fmeasure(labelsTrue[h], x.labelsPred) SC[z] = measures.SC(points[h], x.labelsPred) accuracy[z] = measures.accuracy(labelsTrue[h], x.labelsPred) DI[z] = measures.DI(points[h], x.labelsPred) DB[z] = measures.DB(points[h], x.labelsPred) stdev[z] = measures.stdev(x.bestIndividual, x.labelsPred, k[h], points[h]) exSSE[z] = measures.SSE(x.bestIndividual, x.labelsPred, k[h], points[h]) exTWCV[z] = measures.TWCV(x.bestIndividual, x.labelsPred, k[h], points[h]) purity[z] = measures.purity(labelsTrue[h], x.labelsPred) entropy[z] = measures.entropy(labelsTrue[h], x.labelsPred) #Agg[z] = float("%0.2f"%(float("%0.2f"%(HS[z] + CS[z] + VM[z] + AMI[z] + ARI[z])) / 5)) executionTime[z] = x.executionTime convergence[z] = x.convergence optimizerName = x.optimizer objfname = x.objfname if (Export_details == True): with open(ExportToFileDetailsLabels, 'a', newline='\n') as out_details_labels: writer_details = csv.writer(out_details_labels, delimiter=',') if ( Flag_details_Labels == False ): # just one time to write the header of the CSV file
for topic in topics: print topic dataFile = PATH_TO_CLEAN_DATA + "/data_" + topic + ".txt" tweets = util.txtTolist(dataFile) k = [5, 10, 25, 50, 100] for ki in k: topk = "TOP_" + str(ki) CosineSimilarityVSM = [] method = "ALL_TWEETS" outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method util.createFilePath(outPath) val = measures.entropy(tweets) print(topic + topk + method + " Entropy : " + str(val)) method = "RANDOM_TWEETS" outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method util.createFilePath(outPath) results = tweets[0:ki] rfile = outPath + "/" + topic + "_" + topk + "_" + method + ".txt" print rfile util.listTotxt(rfile, results, "w+") val = measures.entropy(results) print(topic + " " + topk + method + " Entropy : " + str(val)) measures.get_ParaphraseSim(tweets, rfile, outPath, topic, ki) CosineSimilarityVSM.append(measures.get_VSMsim(rfile, tweets, results)) outFile = outPath + "/" + topic + "_" + topk + "_" + method + "_VSMSimilarityMatrix.csv"
for topic in topics : print topic dataFile=PATH_TO_CLEAN_DATA+"/data_"+topic+".txt" tweets=util.txtTolist(dataFile) k=[5,10,25,50,100] for ki in k : topk="TOP_"+str(ki) CosineSimilarityVSM=[] method="ALL_TWEETS" outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method util.createFilePath(outPath) val=measures.entropy(tweets) print(topic+topk+method+" Entropy : "+str(val)) method="RANDOM_TWEETS" outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method util.createFilePath(outPath) results=tweets[0:ki] rfile=outPath+"/"+topic+"_"+topk+"_"+method+".txt" print rfile util.listTotxt(rfile,results,"w+") val=measures.entropy(results) print(topic+" "+topk+method+" Entropy : "+str(val)) measures.get_ParaphraseSim(tweets,rfile,outPath,topic,ki) CosineSimilarityVSM.append(measures.get_VSMsim(rfile,tweets,results)) outFile=outPath+"/"+topic+"_"+topk+"_"+method+"_VSMSimilarityMatrix.csv"