def bin_search(dist: np.ndarray, level: float):
    """Bisect between two vectors until one has the requested entropy level.

    "Entropy level" means ``entropy(x) / entropy(np.ones(len(dist)))``.
    The starting bracket depends on ``dist``:

    * if the level of ``dist`` is already >= ``level``, ``dist`` is the upper
      end and a random one-hot vector is the lower end;
    * otherwise ``dist`` is the lower end and an all-ones vector (left
      unnormalised) is the upper end.

    The bracket is then halved with ``midpoint`` until the level of one end
    is ``isclose`` to ``level``.

    Args:
        dist: 1-D vector used as one end of the initial bracket.
        level: Target entropy level.

    Returns:
        A ``(vector, flag)`` tuple. ``flag`` is ``False`` only when the
        freshly built bracket end (the one-hot or the all-ones vector) hits
        ``level`` exactly and is returned as is; otherwise it is ``True`` and
        ``vector`` is the bracket end whose level is close to ``level``.
    """
    max_entropy = entropy(np.ones(len(dist)))
    dist_ent = entropy(dist) / max_entropy

    if dist_ent >= level:
        upper, upper_ent = dist, dist_ent
        lower = np.zeros(len(dist))
        lower[np.random.randint(len(lower))] = 1.
        lower_ent = entropy(lower) / max_entropy
        if lower_ent == level:
            return lower, False
    else:
        lower, lower_ent = dist, dist_ent
        upper = np.ones(len(dist))
        upper_ent = entropy(upper) / max_entropy
        if upper_ent == level:
            return upper, False

    # Each end's entropy level is carried along with the vector so it is
    # computed exactly once per candidate.
    while not isclose(upper_ent, level) and not isclose(lower_ent, level):
        mid = midpoint(lower, upper)
        mid_ent = entropy(mid) / max_entropy
        if mid_ent >= level:
            upper, upper_ent = mid, mid_ent
        else:
            lower, lower_ent = mid, mid_ent

    if isclose(upper_ent, level):
        return upper, True
    return lower, True
def sample(n_states: int, level: float):
    """Draw a random probability vector aiming at a given entropy level.

    A Dirichlet(1, ..., 1) draw is used as the starting point. If its entropy
    level (entropy divided by the entropy of the all-ones vector) is not
    exactly ``level``, ``bin_search`` is asked for a vector at that level.

    Args:
        n_states: Length of the returned vector.
        level: Target entropy level; must lie in ``[0, 1]``.

    Returns:
        A 1-D array of length ``n_states`` that sums to one.

    Raises:
        ValueError: If ``level`` is below 0 or above 1.
    """
    if level < 0 or level > 1:
        raise ValueError("level should be between 0 and 1")

    u = np.ones(n_states)
    d = np.random.dirichlet(u)
    max_entropy = entropy(u)
    if entropy(d) / max_entropy == level:
        return d

    support, sample_again = bin_search(d, level)
    if sample_again:
        # Scale the concentration parameters so the new draw is very narrow
        # around ``support``. The draw is random, so its entropy level is not
        # guaranteed to equal ``level`` exactly.
        return np.random.dirichlet(1000 * support)

    # bin_search can hand back its all-ones bracket end untouched; normalise
    # so the caller always receives a probability vector. A one-hot vector
    # already sums to one and is unaffected.
    return support / np.sum(support)
Example #3
0
    def entropy(self):
        """Sub-command: print the entropy of one measure of a network.

        Parses ``sys.argv[3:]`` with a parser built on
        ``Measure.parent_parser`` plus one positional ``measure`` argument,
        evaluates the function of that name from ``measures`` on
        ``args.network`` (presumably supplied by the parent parser -- confirm)
        and prints ``measures.entropy`` of the result.

        An unsupported measure name is reported through ``parser.error``,
        which prints the usage message and exits.
        """
        parser = argparse.ArgumentParser(prog='entropy',
                                         parents=[Measure.parent_parser])
        parser.add_argument('measure')
        args = parser.parse_args(sys.argv[3:])

        supported = ('in_selectivity', 'out_selectivity', 'selectivity',
                     'in_ipr', 'out_ipr')
        if args.measure not in supported:
            parser.error("unknown measure %r (choose from %s)" %
                         (args.measure, ', '.join(supported)))

        # Each supported name is also the name of the function in `measures`.
        measure_dict = getattr(measures, args.measure)(args.network)

        # Single-argument call form prints identically on Python 2 and 3.
        print(measures.entropy(measure_dict))
Example #4
0
    def entropy(self):
        """Sub-command: print the entropy of one measure of a network.

        Parses ``sys.argv[3:]`` with a parser built on
        ``Measure.parent_parser`` plus one positional ``measure`` argument,
        evaluates the function of that name from ``measures`` on
        ``args.network`` (presumably supplied by the parent parser -- confirm)
        and prints ``measures.entropy`` of the result.

        An unsupported measure name is reported through ``parser.error``,
        which prints the usage message and exits.
        """
        parser = argparse.ArgumentParser(prog='entropy',
                                         parents=[Measure.parent_parser])
        parser.add_argument('measure')
        args = parser.parse_args(sys.argv[3:])

        supported = ('in_selectivity', 'out_selectivity', 'selectivity',
                     'in_ipr', 'out_ipr')
        if args.measure not in supported:
            parser.error("unknown measure %r (choose from %s)" %
                         (args.measure, ', '.join(supported)))

        # Each supported name is also the name of the function in `measures`.
        measure_dict = getattr(measures, args.measure)(args.network)

        # Single-argument call form prints identically on Python 2 and 3.
        print(measures.entropy(measure_dict))
Example #5
0
def max_synergistic(input: dit.Distribution,
                    conditional: np.ndarray,
                    eps: float = 0.01):
    """Build a nudge vector for the variable pair with the lowest grouped entropy.

    For every pair of variable indices the outcomes of ``input`` are grouped
    by the values the pair takes, and ``entropy(input.pmf[group])`` is summed
    over the groups. The pair with the smallest sum wins, provided the sum is
    strictly below the initial bound
    ``len(states)**2 * entropy(np.ones(partition_size))``.

    For each group of the winning pair, ``max_global`` is called with the
    group's probabilities, the rows of ``conditional`` whose position is in
    the group, and ``eps / number_of_groups``; its result is written to the
    group's positions in the returned vector.

    Args:
        input: Joint distribution; ``input.alphabet[0]`` is used as the
            alphabet of every variable.
        conditional: Sequence indexed like ``input.outcomes`` (assumed --
            confirm against callers).
        eps: Total nudge budget, split evenly over the groups.

    Returns:
        ``(nudge_vector, best_syn_vars)``. When no pair beats the initial
        bound the vector is all zeros and the pair is ``(0, 1)``.
    """
    rvs = input.get_rv_names()
    states = input.alphabet[0]
    n_groups = len(states)**2
    partition_size = int(len(input) / n_groups)
    lowest_entropy = n_groups * entropy(np.ones(partition_size))
    best_syn_vars = (0, 1)
    best_outcome_dict = {}

    for synergy_vars in itertools.combinations(range(len(rvs)), r=2):
        first, second = synergy_vars
        # -1 marks an empty slot. Using 0 for that (as before) collides with
        # outcome index 0: its slot was treated as still empty and got
        # overwritten, leaving that group's indices rotated relative to the
        # ascending order in which the conditional rows are picked below.
        outcome_dict = {
            state: np.full(partition_size, -1, dtype=int)
            for state in itertools.product(states, repeat=2)
        }
        for i, outcome in enumerate(input.outcomes):
            slots = outcome_dict[outcome[first], outcome[second]]
            slots[np.argmax(slots == -1)] = i  # first empty slot
        for slots in outcome_dict.values():
            # Slots never filled keep pointing at outcome 0, as they always
            # did. NOTE(review): groups are presumably always full for a
            # dense input -- confirm.
            slots[slots == -1] = 0

        current_entropy = sum(
            entropy(input.pmf[indices]) for indices in outcome_dict.values())

        if current_entropy < lowest_entropy:
            best_syn_vars = synergy_vars
            lowest_entropy = current_entropy
            best_outcome_dict = outcome_dict

    # Use the best pair to find the nudge with the largest impact per group.
    nudge_vector = np.zeros(len(input))
    for indices in best_outcome_dict.values():
        nudge_vector[indices] = max_global(
            input.pmf[indices],
            np.array([d for i, d in enumerate(conditional) if i in indices]),
            eps / len(best_outcome_dict), False)

    return nudge_vector, best_syn_vars
Example #6
0
def outputting(gen,ts,agents,freq,memory,positions,out,directory,run,optimization,exploration):
	"""Summarise the current population and either log it to a file or print it.

	For every agent the stored solution (``memory[agent]``) is compared with
	its problem (``positions[agent]``): edit distance, normalised edit
	distance, lengths, entropy and string complexity are collected and
	averaged. Event counts are read from ``freq``.

	When ``out == True`` one ';'-separated line is appended to the file at
	``directory``; otherwise a selection of the figures is printed.
	"""
	def _average(values):
		# Plain sum / len, so an empty population behaves as it always did.
		return np.sum(np.asarray(values))/len(values)

	# Event counters.
	del_freq = len(freq['delete'])
	tra_freq = len(freq['transmit'])
	inv_freq = len(freq['invent'])
	mod_freq = len(freq['modification'])

	# Per-agent solution / problem strings.
	solutions = [memory[agent] for agent in agents]
	problems = [positions[agent] for agent in agents]
	pairs = list(zip(solutions, problems))

	distances = [edit_distance(sol, prob) for sol, prob in pairs]
	norm_distances = [
		edit_distance(sol, prob)/len(max([sol, prob], key=len))
		for sol, prob in pairs
	]
	solution_lengths = [len(sol) for sol in solutions]
	solution_entropies = [entropy(sol) for sol in solutions]
	solution_complexities = [string_complexity(sol) for sol in solutions]
	problem_entropies = [entropy(prob) for prob in problems]
	# Distinct problem lengths only: the average below is over unique lengths.
	distinct_problem_lengths = list(set([len(prob) for prob in problems]))

	solu_pool = len(list(set(solutions)))
	prob_pool = len(list(set(problems)))
	pop_size = len(agents)

	lev = _average(distances)
	lev_norm = _average(norm_distances)
	s_len = _average(solution_lengths)
	p_len = _average(distinct_problem_lengths)
	ent = _average(solution_entropies)
	p_ent = _average(problem_entropies)
	sol_complexity = _average(solution_complexities)

	if out == True:
		fields = (
			run, gen, ts, pop_size, optimization, exploration,
			solu_pool, prob_pool, s_len, p_len, ent, p_ent, lev, lev_norm,
			tra_freq, inv_freq, del_freq, mod_freq, sol_complexity,
		)
		with open(directory,'a') as output:
			output.write(';'.join([str(field) for field in fields])+'\n')

	else:
		print('Gen:',gen)
		print('TS: ',ts)
		print('modification:',mod_freq)
		print('Transmit:',tra_freq)
		print('Invent:',inv_freq)
		print('Delete:',del_freq)
		print('Solution Pool Size: ',solu_pool)
		print('LD(Norm): ', lev_norm)
		print('String length: ', s_len)
		print('String Entropy (Average): ', ent)
		print('String Complexity: ', sol_complexity)
		print('Problem Length: ', p_len)
Example #7
0
def max_individual(input: dit.Distribution,
                   conditional: np.ndarray,
                   eps: float = 0.01,
                   minimal_entropy_idx=None):
    """Search sign alignments for the highest-impact nudge on one variable.

    The variable to nudge is ``minimal_entropy_idx`` or, when that is not
    given, the one whose marginal has the lowest entropy. ``input`` is
    conditioned on all other variables; for each resulting conditional
    distribution every non-constant +/-1 alignment of the columns of the
    rotated conditional is scored, and ``find_max_impact`` is asked for a
    nudge given those scores. The best nudge per conditional is kept.

    Args:
        input: Joint distribution; ``input.alphabet[0]`` gives the number of
            states.
        conditional: Array that is normalised by its total before use.
        eps: Nudge budget; each conditional gets
            ``(eps / 2) / number_of_conditionals``.
        minimal_entropy_idx: Index of the variable to nudge, or ``None`` to
            choose it automatically.

    Returns:
        ``(nudge_vector, total_max_impact, minimal_entropy_idx)`` where
        ``nudge_vector`` has one row per conditional distribution.
    """
    rv_names = input.get_rv_names()
    conditional = conditional / conditional.sum()
    n_states = len(input.alphabet[0])

    # Index 0 is a legitimate choice; only a missing index triggers the search.
    if not (minimal_entropy_idx == 0 or minimal_entropy_idx):
        marginal_entropies = [
            entropy(input.marginal([rv], rv_mode='indices').pmf)
            for rv in range(len(rv_names))
        ]
        minimal_entropy_idx = np.argmin(marginal_entropies)

    other_rvs = (rv_names[:minimal_entropy_idx] +
                 rv_names[minimal_entropy_idx + 1:])
    _, minimal_conditional = input.condition_on(other_rvs)
    for cond_dist in minimal_conditional:
        cond_dist.make_dense()

    n_conditionals = len(minimal_conditional)
    nudge_vector = np.zeros((n_conditionals, len(minimal_conditional[0])))
    rotated_conditional = R(conditional, minimal_entropy_idx, len(rv_names),
                            n_states)
    n_columns = rotated_conditional.shape[1]
    budget = (eps / 2) / n_conditionals

    total_max_impact = 0
    for i, mc_dist in enumerate(minimal_conditional):
        rows = rotated_conditional[i * n_states:(i + 1) * n_states, :]

        best_impact = 0
        for signs in map(np.array,
                         itertools.product([-1, 1], repeat=n_columns)):
            # All-equal alignments are skipped.
            if np.all(signs == 1) or np.all(signs == -1):
                continue
            scores = np.sum(signs * rows, axis=1)
            vector, impact = find_max_impact(scores, mc_dist.pmf, budget)
            if impact > best_impact:
                nudge_vector[i, :] = vector
                best_impact = impact
        total_max_impact += best_impact

    return nudge_vector, total_max_impact, minimal_entropy_idx
Example #8
0
def max_local(input: dit.Distribution,
              conditional: np.ndarray,
              eps: float = 0.01):
    """Compute an individual nudge for every variable of ``input``.

    Variables are visited in order of increasing marginal entropy and
    ``max_individual`` is called for each with an equal share of ``eps``.

    Args:
        input: Joint distribution; every variable is assumed to share the
            alphabet ``input.alphabet[0]``.
        conditional: Passed through to ``max_individual``.
        eps: Total nudge budget, split evenly over the variables.

    Returns:
        ``(nudge_vectors, max_impacts)``. ``nudge_vectors`` has shape
        ``(n_variables, len(input) // n_states, n_states)`` and
        ``max_impacts`` has one entry per variable; both are indexed by
        variable index, not by visiting order.
    """
    rvs = input.get_rv_names()
    # Alphabet size, read the same way max_individual does, instead of the
    # previously hard-coded 3.
    n_states = len(input.alphabet[0])
    n_variables = input.outcome_length()

    sorted_rvs = np.argsort([
        entropy(input.marginal([rv], rv_mode='indices').pmf)
        for rv in range(len(rvs))
    ])

    nudge_vectors = np.zeros(
        (n_variables, len(input) // n_states, n_states))
    max_impacts = np.zeros(n_variables)
    for rv in sorted_rvs:
        nudge_vectors[rv, :, :], max_impacts[rv], _ = max_individual(
            input, conditional, eps / len(sorted_rvs), rv)
    return nudge_vectors, max_impacts
Example #9
0
def max_local_nudge2(old_X: dit.Distribution,
                     YgivenX: np.ndarray,
                     eps: float = 0.01):
    """Apply an individual nudge to each variable of ``old_X`` in turn.

    A single-variable distribution is delegated to ``max_global_nudge``.
    Otherwise the variables are visited in order of increasing marginal
    entropy; for each one ``max_nudge`` computes an 'individual' nudge with
    budget ``eps / n_variables`` and ``do_max_individual_nudge`` applies it
    to the running copy.

    Note that ``old_X`` itself is made dense in place.

    Returns:
        A new distribution over the outcomes of the (dense) ``old_X`` with
        the same variable names and mask; outcomes missing from the nudged
        distribution get probability 0.0.
    """
    if old_X.outcome_length() == 1:
        return max_global_nudge(old_X, YgivenX, eps)

    # Capture mask and working copy before old_X is densified.
    saved_mask = old_X._mask
    new_X = old_X.copy(base=old_X.get_base())
    old_X.make_dense()

    rv_names = old_X.get_rv_names()
    marginal_entropies = [
        entropy(old_X.marginal([rv], rv_mode='indices').pmf)
        for rv in range(len(rv_names))
    ]
    rv_order = np.argsort(marginal_entropies)
    reference_outcomes = old_X.outcomes

    for rv in rv_order:
        nudges, _ = max_nudge(new_X.copy('linear'),
                              YgivenX,
                              eps=(eps / len(rv_order)),
                              nudge_type='individual',
                              minimal_entropy_idx=rv)
        new_X = do_max_individual_nudge(new_X, nudges, rv, True)
        new_X.make_dense()

    # Re-express the result on the original outcome set.
    probabilities = {}
    for outcome in reference_outcomes:
        if outcome in new_X.outcomes:
            probabilities[outcome] = new_X[outcome]
        else:
            probabilities[outcome] = 0.0

    result = dit.Distribution(probabilities)
    result.set_rv_names(rv_names)
    result._mask = saved_mask
    return result
Example #10
0
# Identification columns that start every CSV row.
_ID_COLUMNS = ["Dataset", "Optimizer", "objfname", "k"]

# Result columns, in file order, when true labels are available / missing.
# Everything after "ExecutionTime" is also the list of box-plot measures.
_LABELLED_COLUMNS = [
    "ExecutionTime", "SSE", "Purity", "Entropy", "HS", "CS", "VM", "AMI",
    "ARI", "Fmeasure", "TWCV", "SC", "Accuracy", "DI", "DB", "STDev"
]
_UNLABELLED_COLUMNS = [
    "ExecutionTime", "SSE", "TWCV", "SC", "DI", "DB", "STDev"
]

# (column, function name in `measures`) for the measures that compare the
# predicted labels with the true ones, in evaluation order.
_SUPERVISED_MEASURES = (
    ("HS", "HS"), ("CS", "CS"), ("VM", "VM"), ("AMI", "AMI"), ("ARI", "ARI"),
    ("Fmeasure", "Fmeasure"), ("Accuracy", "accuracy"), ("Purity", "purity"),
    ("Entropy", "entropy"),
)

# n_clusters keyword -> name of the detection function in `clus_det`.
_CLUSTER_DETECTORS = {
    'elbow': 'ELBOW',
    'gap': 'GAP_STATISTICS',
    'silhouette': 'SC',
    'DB': 'DB',
    'CH': 'CH',
    'BIC': 'BIC',
    'min': 'min_clusters',
    'max': 'max_clusters',
    'median': 'median_clusters',
    'majority': 'majority_clusters',
}


def _round2(value):
    """Round ``value`` to two decimals via the '%0.2f' format used in the CSVs."""
    return float("%0.2f" % (value))


def _check_cluster_options(n_clusters, dataset_List, auto_cluster,
                           labels_exist):
    """Print a message and exit if the clustering options contradict each other."""
    if not labels_exist and n_clusters == 'supervised':
        print(
            'Supervised value for n_clusters is not allowed when labels_exist value is false'
        )
        sys.exit()

    if isinstance(n_clusters, list):
        if len(n_clusters) != len(dataset_List):
            print(
                'Length of n_clusters list should equal the length of dataset_List list'
            )
            sys.exit()
        if min(n_clusters) < 2:
            print('n_clusters value should be at least 2')
            sys.exit()
        if auto_cluster == True:
            print('n_clusters should be string if auto_cluster is true')
            sys.exit()
    elif auto_cluster == False:
        print(
            'n_clusters should be a list of integers if auto_cluster is false')
        sys.exit()


def _detect_cluster_count(n_clusters, index, data, points):
    """Return the number of clusters for the dataset at position ``index``."""
    if n_clusters == 'supervised':
        # Number of distinct values in the label (last) column.
        return len(numpy.unique(data[:, -1]))
    if isinstance(n_clusters, str) and n_clusters in _CLUSTER_DETECTORS:
        return getattr(clus_det, _CLUSTER_DETECTORS[n_clusters])(points)
    # Anything else is indexed per dataset (the explicit-list case).
    return n_clusters[index]


def run(optimizer,
        objectivefunc,
        dataset_List,
        NumOfRuns,
        params,
        export_flags,
        auto_cluster=True,
        n_clusters='supervised',
        labels_exist=True,
        metric='euclidean'):
    """
    It serves as the main interface of the framework for running the experiments.

    Parameters
    ----------
    optimizer : list
        The list of optimizers names
    objectivefunc : list
        The list of objective functions
    dataset_List : list
        The list of the names of the data sets files (without the '.csv'
        extension)
    NumOfRuns : int
        The number of independent runs
    params : dict
        The general parameters, read by key:
        1. 'PopulationSize' (size of population)
        2. 'Iterations' (the number of iterations)
    export_flags : dict
        The Boolean flags, read by key:
        1. 'Export_avg' (exporting the averaged results in a file)
        2. 'Export_details' (exporting the detailed results in files)
        3. 'Export_details_labels' (exporting the predicted labels in files)
        4. 'Export_convergence' (exporting the convergence plots)
        5. 'Export_boxplot' (exporting the box plots)
    auto_cluster : boolean, default = True
        Choose whether the number of clusters is detected automatically.
        If True, n_clusters must be one of the keywords listed below.
        If False, specify a list of integers for n_clusters.
    n_clusters : string, or list, default = 'supervised'
        A list of the number of clusters for the datasets in dataset_List,
        or one of the following keywords:
        - supervised: derived from the true labels of the datasets
        - elbow: detected by the elbow method
        - gap: detected by gap analysis
        - silhouette: detected by the silhouette coefficient
        - CH: detected by the Calinski-Harabasz index
        - DB: detected by the Davies Bouldin index
        - BIC: detected by the Bayesian Information Criterion score
        - min / max / median / majority: the minimum, maximum, median or
          majority vote of the values found by all detection techniques
    labels_exist : boolean, default = True
        Specify if labels exist as the last column of the csv file of the
        datasets in dataset_List. If False, the following hold:
        - datasets are read from "datasets/unsupervised/"
        - supervised value for n_clusters is not allowed
        - result files and box plots contain only
          "SSE","TWCV","SC","DI","DB","STDev"
    metric : string, default = 'euclidean'
        The metric to use when calculating the distance between points if
        applicable for the objective function selected. It must be one of
        the options allowed by scipy.spatial.distance.pdist for its metric
        parameter

    Returns
    -------
    N/A

    Notes
    -----
    Inconsistent clustering options print a message and terminate the
    process through ``sys.exit()``. Results are written to a directory named
    after the current date and time, created in the working directory.
    """
    _check_cluster_options(n_clusters, dataset_List, auto_cluster,
                           labels_exist)

    # General parameters shared by all optimizers.
    PopulationSize = params['PopulationSize']
    Iterations = params['Iterations']

    # Which outputs to produce.
    Export = export_flags['Export_avg']
    Export_details = export_flags['Export_details']
    Export_details_labels = export_flags['Export_details_labels']
    Export_convergence = export_flags['Export_convergence']
    Export_boxplot = export_flags['Export_boxplot']

    # Each CSV gets its header on the first row written during this call.
    avg_header_written = False
    details_header_written = False
    labels_header_written = False

    if labels_exist:
        datasets_directory = "datasets/"
        columns = _LABELLED_COLUMNS
    else:
        datasets_directory = "datasets/unsupervised/"
        columns = _UNLABELLED_COLUMNS

    results_directory = time.strftime("%Y-%m-%d-%H-%M-%S") + '/'
    Path(results_directory).mkdir(parents=True, exist_ok=True)

    dataset_len = len(dataset_List)

    k = [-1] * dataset_len  # number of clusters per dataset
    f = [-1] * dataset_len  # number of features per dataset
    points = [0] * dataset_len
    labelsTrue = [0] * dataset_len

    # CSV header for the convergence columns.
    CnvgHeader = ["Iter" + str(l + 1) for l in range(0, Iterations)]

    # Read all datasets.
    base_directory = os.path.abspath(os.path.dirname(__file__))
    for h in range(dataset_len):
        dataset_path = os.path.join(
            base_directory, datasets_directory + dataset_List[h] + '.csv')
        # The context manager closes the file once it has been parsed.
        with open(dataset_path, 'rt') as rawData:
            data = numpy.loadtxt(rawData, delimiter=",")

        _, nValues = data.shape  # number of points, values per point

        if labels_exist:
            f[h] = nValues - 1
            points[h] = data[:, :-1].tolist()
            labelsTrue[h] = data[:, -1].tolist()  # last field is the label
        else:
            f[h] = nValues
            points[h] = data.tolist()
            labelsTrue[h] = None

        points[h] = preprocessing.normalize(points[h], norm='max', axis=0)
        k[h] = _detect_cluster_count(n_clusters, h, data, points[h])

    for optimizer_key in optimizer:
        for objective_name in objectivefunc:
            for h in range(dataset_len):
                # One list of per-run values for every result column.
                scores = {
                    column: [0] * NumOfRuns
                    for column in _LABELLED_COLUMNS
                }
                convergence = [0] * NumOfRuns

                for z in range(0, NumOfRuns):
                    print("Dataset: " + dataset_List[h])
                    print("k: " + str(k[h]))
                    print("Run no.: " + str(z))
                    print("Population Size: " + str(PopulationSize))
                    print("Iterations: " + str(Iterations))

                    x = selector(optimizer_key, objective_name, k[h], f[h],
                                 PopulationSize, Iterations, points[h], metric)
                    labelsPred = x.labelsPred

                    if labels_exist:
                        for column, measure_name in _SUPERVISED_MEASURES:
                            scores[column][z] = getattr(
                                measures, measure_name)(labelsTrue[h],
                                                        labelsPred)

                    scores["SC"][z] = measures.SC(points[h], labelsPred)
                    scores["DI"][z] = measures.DI(points[h], labelsPred)
                    scores["DB"][z] = measures.DB(points[h], labelsPred)
                    scores["STDev"][z] = measures.stdev(
                        x.bestIndividual, labelsPred, k[h], points[h])
                    scores["SSE"][z] = measures.SSE(x.bestIndividual,
                                                    labelsPred, k[h],
                                                    points[h])
                    scores["TWCV"][z] = measures.TWCV(x.bestIndividual,
                                                      labelsPred, k[h],
                                                      points[h])
                    scores["ExecutionTime"][z] = x.executionTime
                    convergence[z] = x.convergence
                    optimizerName = x.optimizer
                    objfname = x.objfname
                    row_id = [dataset_List[h], optimizerName, objfname, k[h]]

                    if (Export_details_labels == True):
                        ExportToFileDetailsLabels = results_directory + "experiment_details_Labels.csv"
                        with open(ExportToFileDetailsLabels, 'a',
                                  newline='\n') as out_details_labels:
                            writer_details = csv.writer(out_details_labels,
                                                        delimiter=',')
                            if not labels_header_written:
                                writer_details.writerow(_ID_COLUMNS)
                                labels_header_written = True
                            writer_details.writerow(
                                numpy.concatenate([row_id, labelsPred]))

                    if (Export_details == True):
                        ExportToFileDetails = results_directory + "experiment_details.csv"
                        with open(ExportToFileDetails, 'a',
                                  newline='\n') as out_details:
                            writer_details = csv.writer(out_details,
                                                        delimiter=',')
                            if not details_header_written:
                                writer_details.writerow(
                                    numpy.concatenate(
                                        [_ID_COLUMNS + columns, CnvgHeader]))
                                details_header_written = True
                            run_values = [
                                _round2(scores[column][z])
                                for column in columns
                            ]
                            writer_details.writerow(
                                numpy.concatenate([
                                    row_id + run_values,
                                    numpy.around(convergence[z], decimals=2)
                                ]))

                if (Export == True):
                    ExportToFile = results_directory + "experiment.csv"
                    with open(ExportToFile, 'a', newline='\n') as out:
                        writer = csv.writer(out, delimiter=',')
                        if not avg_header_written:
                            writer.writerow(
                                numpy.concatenate(
                                    [_ID_COLUMNS + columns, CnvgHeader]))
                            avg_header_written = True

                        # Execution time stays a float and the other averages
                        # are stringified, as in the established row layout.
                        averages = [
                            _round2(sum(scores["ExecutionTime"]) / NumOfRuns)
                        ]
                        averages += [
                            str(_round2(sum(scores[column]) / NumOfRuns))
                            for column in columns[1:]
                        ]
                        avgConvergence = numpy.around(numpy.mean(
                            convergence, axis=0, dtype=numpy.float64),
                                                      decimals=2).tolist()
                        writer.writerow(
                            numpy.concatenate([
                                [
                                    dataset_List[h], optimizerName, objfname,
                                    k[h]
                                ] + averages, avgConvergence
                            ]))

    if Export_convergence == True:
        conv_plot.run(results_directory, optimizer, objectivefunc,
                      dataset_List, Iterations)

    if Export_boxplot == True:
        # Every result column except the execution time is plotted.
        ev_measures = columns[1:]
        box_plot.run(results_directory, optimizer, objectivefunc, dataset_List,
                     ev_measures, Iterations)

    print("Execution completed")
Example #11
0
def _two_decimals(value):
    """Return ``value`` rounded to two decimals via ``"%0.2f"`` formatting."""
    return float("%0.2f" % value)


def _mean_two_decimals(values, count):
    """Return ``sum(values) / count`` rounded to two decimals, as a string."""
    return str(float("%0.2f" % (sum(values) / count)))


def run(optimizer, objectivefunc, dataset_List, NumOfRuns, params,
        export_flags):
    """
    Run every optimizer / objective function / dataset combination.

    It serves as the main interface of the framework for running the
    experiments. Each combination is executed ``NumOfRuns`` times through
    ``selector``; the evaluation measures of every run are collected and,
    depending on ``export_flags``, written to CSV files and plotted.

    Parameters
    ----------
    optimizer : list
        The list of optimizers names; each entry is passed to ``selector``.
    objectivefunc : list
        The list of objective functions; each entry is passed to
        ``selector`` as the objective name.
    dataset_List : list
        The list of the names of the data sets files, without the ``.csv``
        extension. Files are read from the ``datasets/`` directory located
        next to this module. The last column of each file is used as the
        true label of the point.
    NumOfRuns : int
        The number of independent runs
    params : dict
        The general parameters, read by key:
        1. 'PopulationSize' (size of population)
        2. 'Iterations' (the number of iterations)
    export_flags : dict
        The Boolean flags, read by key:
        1. 'Export_avg' (exporting the averaged results in a file)
        2. 'Export_details' (exporting the detailed results in files)
        3. 'Export_details_labels' (exporting the predicted labels of
           every run in files)
        4. 'Export_convergence' (exporting the convergence plots)
        5. 'Export_boxplot' (exporting the box plots)

    Returns
    -----------
    N/A

    Notes
    -----
    A results directory named after the current date and time is created
    relative to the current working directory, even when nothing is
    exported.
    """

    # Select general parameters for all optimizers (population size, number of iterations)
    PopulationSize = params['PopulationSize']
    Iterations = params['Iterations']

    # Export results?
    Export = export_flags['Export_avg']
    Export_details = export_flags['Export_details']
    Export_details_labels = export_flags['Export_details_labels']
    Export_convergence = export_flags['Export_convergence']
    Export_boxplot = export_flags['Export_boxplot']

    # Track whether at least one experiment ran and whether each CSV
    # header has already been written.
    Flag = False
    Flag_details = False
    Flag_details_Labels = False

    datasets_directory = "datasets/"  # the directory where the dataset is stored
    # Automatically generated name by date and time
    results_directory = time.strftime("%Y-%m-%d-%H-%M-%S") + '/'
    Path(results_directory).mkdir(parents=True, exist_ok=True)

    dataset_len = len(dataset_List)

    k = [-1] * dataset_len
    f = [-1] * dataset_len
    points = [0] * dataset_len
    labelsTrue = [0] * dataset_len

    # CSV header for the convergence columns: Iter1 .. Iter<Iterations>
    CnvgHeader = ["Iter" + str(it + 1) for it in range(Iterations)]

    # Read all datasets
    for h in range(dataset_len):
        dataset_filename = dataset_List[h] + '.csv'
        dataset_path = os.path.join(
            os.path.abspath(os.path.dirname(__file__)),
            datasets_directory + dataset_filename)
        # Read the dataset file and generate the points list and true values.
        # The context manager guarantees the handle is closed even when
        # parsing fails.
        with open(dataset_path, 'rt') as rawData:
            data = numpy.loadtxt(rawData, delimiter=",")

        _, nValues = data.shape  # number of points, number of values per point
        f[h] = nValues - 1  # Dimension value
        k[h] = len(numpy.unique(data[:, -1]))  # k: Number of clusters
        points[h] = data[:, :-1].tolist()  # list of points
        # List of actual cluster of each point (last field)
        labelsTrue[h] = data[:, -1].tolist()

        points[h] = preprocessing.normalize(points[h], norm='max', axis=0)

    for requested_optimizer in optimizer:
        for objective_name in objectivefunc:
            for h in range(len(dataset_List)):
                HS = [0] * NumOfRuns
                CS = [0] * NumOfRuns
                VM = [0] * NumOfRuns
                AMI = [0] * NumOfRuns
                ARI = [0] * NumOfRuns
                Fmeasure = [0] * NumOfRuns
                SC = [0] * NumOfRuns
                accuracy = [0] * NumOfRuns
                DI = [0] * NumOfRuns
                DB = [0] * NumOfRuns
                stdev = [0] * NumOfRuns
                exSSE = [0] * NumOfRuns
                exTWCV = [0] * NumOfRuns
                purity = [0] * NumOfRuns
                entropy = [0] * NumOfRuns
                convergence = [0] * NumOfRuns
                executionTime = [0] * NumOfRuns

                for z in range(NumOfRuns):
                    print("Dataset: " + dataset_List[h])
                    print("Run no.: " + str(z))
                    print("Population Size: " + str(PopulationSize))
                    print("Iterations: " + str(Iterations))

                    x = selector(requested_optimizer, objective_name, k[h],
                                 f[h], PopulationSize, Iterations, points[h])

                    HS[z] = measures.HS(labelsTrue[h], x.labelsPred)
                    CS[z] = measures.CS(labelsTrue[h], x.labelsPred)
                    VM[z] = measures.VM(labelsTrue[h], x.labelsPred)
                    AMI[z] = measures.AMI(labelsTrue[h], x.labelsPred)
                    ARI[z] = measures.ARI(labelsTrue[h], x.labelsPred)
                    Fmeasure[z] = measures.Fmeasure(labelsTrue[h],
                                                    x.labelsPred)
                    SC[z] = measures.SC(points[h], x.labelsPred)
                    accuracy[z] = measures.accuracy(labelsTrue[h],
                                                    x.labelsPred)
                    DI[z] = measures.DI(points[h], x.labelsPred)
                    DB[z] = measures.DB(points[h], x.labelsPred)
                    stdev[z] = measures.stdev(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    exSSE[z] = measures.SSE(x.bestIndividual, x.labelsPred,
                                            k[h], points[h])
                    exTWCV[z] = measures.TWCV(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    purity[z] = measures.purity(labelsTrue[h], x.labelsPred)
                    entropy[z] = measures.entropy(labelsTrue[h], x.labelsPred)

                    executionTime[z] = x.executionTime
                    convergence[z] = x.convergence
                    optimizerName = x.optimizer
                    objfname = x.objfname

                    if Export_details_labels:
                        ExportToFileDetailsLabels = results_directory + "experiment_details_Labels.csv"
                        with open(ExportToFileDetailsLabels, 'a',
                                  newline='\n') as out_details_labels:
                            writer_details = csv.writer(out_details_labels,
                                                        delimiter=',')
                            # just one time to write the header of the CSV file
                            if not Flag_details_Labels:
                                header_details = numpy.concatenate(
                                    [["Dataset", "Optimizer", "objfname"]])
                                writer_details.writerow(header_details)
                                Flag_details_Labels = True
                            a = numpy.concatenate(
                                [[dataset_List[h], optimizerName, objfname],
                                 x.labelsPred])
                            writer_details.writerow(a)

                    if Export_details:
                        ExportToFileDetails = results_directory + "experiment_details.csv"
                        with open(ExportToFileDetails, 'a',
                                  newline='\n') as out_details:
                            writer_details = csv.writer(out_details,
                                                        delimiter=',')
                            # just one time to write the header of the CSV file
                            if not Flag_details:
                                header_details = numpy.concatenate([[
                                    "Dataset", "Optimizer", "objfname",
                                    "ExecutionTime", "SSE", "Purity",
                                    "Entropy", "HS", "CS", "VM", "AMI", "ARI",
                                    "Fmeasure", "TWCV", "SC", "Accuracy", "DI",
                                    "DB", "STDev"
                                ], CnvgHeader])
                                writer_details.writerow(header_details)
                                Flag_details = True
                            a = numpy.concatenate([[
                                dataset_List[h], optimizerName, objfname,
                                _two_decimals(executionTime[z]),
                                _two_decimals(exSSE[z]),
                                _two_decimals(purity[z]),
                                _two_decimals(entropy[z]),
                                _two_decimals(HS[z]),
                                _two_decimals(CS[z]),
                                _two_decimals(VM[z]),
                                _two_decimals(AMI[z]),
                                _two_decimals(ARI[z]),
                                _two_decimals(Fmeasure[z]),
                                _two_decimals(exTWCV[z]),
                                _two_decimals(SC[z]),
                                _two_decimals(accuracy[z]),
                                _two_decimals(DI[z]),
                                _two_decimals(DB[z]),
                                _two_decimals(stdev[z])
                            ], numpy.around(convergence[z], decimals=2)])
                            writer_details.writerow(a)

                if Export:
                    ExportToFile = results_directory + "experiment.csv"

                    with open(ExportToFile, 'a', newline='\n') as out:
                        writer = csv.writer(out, delimiter=',')
                        # just one time to write the header of the CSV file
                        if not Flag:
                            header = numpy.concatenate([[
                                "Dataset", "Optimizer", "objfname",
                                "ExecutionTime", "SSE", "Purity", "Entropy",
                                "HS", "CS", "VM", "AMI", "ARI", "Fmeasure",
                                "TWCV", "SC", "Accuracy", "DI", "DB", "STDev"
                            ], CnvgHeader])
                            writer.writerow(header)

                        avgSSE = _mean_two_decimals(exSSE, NumOfRuns)
                        avgTWCV = _mean_two_decimals(exTWCV, NumOfRuns)
                        avgPurity = _mean_two_decimals(purity, NumOfRuns)
                        avgEntropy = _mean_two_decimals(entropy, NumOfRuns)
                        avgHomo = _mean_two_decimals(HS, NumOfRuns)
                        avgComp = _mean_two_decimals(CS, NumOfRuns)
                        avgVmeas = _mean_two_decimals(VM, NumOfRuns)
                        avgAMI = _mean_two_decimals(AMI, NumOfRuns)
                        avgARI = _mean_two_decimals(ARI, NumOfRuns)
                        avgFmeasure = _mean_two_decimals(Fmeasure, NumOfRuns)
                        avgSC = _mean_two_decimals(SC, NumOfRuns)
                        avgAccuracy = _mean_two_decimals(accuracy, NumOfRuns)
                        avgDI = _mean_two_decimals(DI, NumOfRuns)
                        avgDB = _mean_two_decimals(DB, NumOfRuns)
                        avgStdev = _mean_two_decimals(stdev, NumOfRuns)

                        avgExecutionTime = _two_decimals(
                            sum(executionTime) / NumOfRuns)
                        # Element-wise mean of the per-run convergence curves
                        avgConvergence = numpy.around(numpy.mean(
                            convergence, axis=0, dtype=numpy.float64),
                                                      decimals=2).tolist()
                        a = numpy.concatenate([[
                            dataset_List[h], optimizerName, objfname,
                            avgExecutionTime, avgSSE, avgPurity, avgEntropy,
                            avgHomo, avgComp, avgVmeas, avgAMI, avgARI,
                            avgFmeasure, avgTWCV, avgSC, avgAccuracy, avgDI,
                            avgDB, avgStdev
                        ], avgConvergence])
                        writer.writerow(a)
                Flag = True  # at least one experiment

    if Export_convergence:
        conv_plot.run(results_directory, optimizer, objectivefunc,
                      dataset_List, Iterations)

    if Export_boxplot:
        ev_measures = [
            'SSE', 'Purity', 'Entropy', 'HS', 'CS', 'VM', 'AMI', 'ARI',
            'Fmeasure', 'TWCV', 'SC', 'Accuracy', 'DI', 'DB', 'STDev'
        ]
        box_plot.run(results_directory, optimizer, objectivefunc, dataset_List,
                     ev_measures, Iterations)

    if not Flag:  # Failed to run at least one experiment
        print(
            "No Optomizer or Cost function is selected. Check lists of available optimizers and cost functions"
        )

    print("Execution completed")
Example #12
0
                    ARI[z] = measures.ARI(labelsTrue[h], x.labelsPred)
                    Fmeasure[z] = measures.Fmeasure(labelsTrue[h],
                                                    x.labelsPred)
                    SC[z] = measures.SC(points[h], x.labelsPred)
                    accuracy[z] = measures.accuracy(labelsTrue[h],
                                                    x.labelsPred)
                    DI[z] = measures.DI(points[h], x.labelsPred)
                    DB[z] = measures.DB(points[h], x.labelsPred)
                    stdev[z] = measures.stdev(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    exSSE[z] = measures.SSE(x.bestIndividual, x.labelsPred,
                                            k[h], points[h])
                    exTWCV[z] = measures.TWCV(x.bestIndividual, x.labelsPred,
                                              k[h], points[h])
                    purity[z] = measures.purity(labelsTrue[h], x.labelsPred)
                    entropy[z] = measures.entropy(labelsTrue[h], x.labelsPred)
                    #Agg[z] = float("%0.2f"%(float("%0.2f"%(HS[z] + CS[z] + VM[z] + AMI[z] + ARI[z])) / 5))

                    executionTime[z] = x.executionTime
                    convergence[z] = x.convergence
                    optimizerName = x.optimizer
                    objfname = x.objfname

                    if (Export_details == True):
                        with open(ExportToFileDetailsLabels, 'a',
                                  newline='\n') as out_details_labels:
                            writer_details = csv.writer(out_details_labels,
                                                        delimiter=',')
                            if (
                                    Flag_details_Labels == False
                            ):  # just one time to write the header of the CSV file
Example #13
0
# For each topic, load its cleaned tweets and, for several selection sizes,
# report the entropy of the whole set and of a fixed-size selection.
for topic in topics:

    # print() with a single argument behaves the same under Python 2 and 3,
    # unlike the bare print statement.
    print(topic)
    dataFile = PATH_TO_CLEAN_DATA + "/data_" + topic + ".txt"
    tweets = util.txtTolist(dataFile)

    # Selection sizes to evaluate
    k = [5, 10, 25, 50, 100]

    for ki in k:
        topk = "TOP_" + str(ki)
        CosineSimilarityVSM = []

        # Baseline: entropy over the full set of tweets for this topic
        method = "ALL_TWEETS"
        outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
        util.createFilePath(outPath)
        val = measures.entropy(tweets)
        print(topic + topk + method + " Entropy : " + str(val))

        # NOTE(review): despite the "RANDOM_TWEETS" label, the selection is
        # the first ``ki`` tweets of the list, not a random sample.
        method = "RANDOM_TWEETS"
        outPath = PATH_TO_RESULTS + "/" + topic + "/" + topk + "/" + method
        util.createFilePath(outPath)
        results = tweets[:ki]
        rfile = outPath + "/" + topic + "_" + topk + "_" + method + ".txt"
        print(rfile)
        util.listTotxt(rfile, results, "w+")

        val = measures.entropy(results)
        print(topic + " " + topk + method + " Entropy : " + str(val))
        measures.get_ParaphraseSim(tweets, rfile, outPath, topic, ki)
        CosineSimilarityVSM.append(measures.get_VSMsim(rfile, tweets, results))
        outFile = outPath + "/" + topic + "_" + topk + "_" + method + "_VSMSimilarityMatrix.csv"
Example #14
0
for topic in topics :
	
	print topic
	dataFile=PATH_TO_CLEAN_DATA+"/data_"+topic+".txt"
	tweets=util.txtTolist(dataFile)
	
	k=[5,10,25,50,100]
	
	for ki in k :
		topk="TOP_"+str(ki)
		CosineSimilarityVSM=[]

		method="ALL_TWEETS"
		outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method
		util.createFilePath(outPath)
		val=measures.entropy(tweets)
		print(topic+topk+method+" Entropy : "+str(val))
	
		method="RANDOM_TWEETS"
		outPath=PATH_TO_RESULTS+"/"+topic+"/"+topk+"/"+method
		util.createFilePath(outPath)
		results=tweets[0:ki]
		rfile=outPath+"/"+topic+"_"+topk+"_"+method+".txt"
		print rfile
		util.listTotxt(rfile,results,"w+")

		val=measures.entropy(results)
		print(topic+" "+topk+method+" Entropy : "+str(val))
		measures.get_ParaphraseSim(tweets,rfile,outPath,topic,ki)
		CosineSimilarityVSM.append(measures.get_VSMsim(rfile,tweets,results))
		outFile=outPath+"/"+topic+"_"+topk+"_"+method+"_VSMSimilarityMatrix.csv"