def fig16_4():
    """Plot the mean final success rate (CI == 0 after k-means) on the G2
    datasets, averaged over all std sizes, as a function of dimensionality.

    Reads one CSV per (dimensions, std, init method) combination from the
    project results directory.

    Returns:
        tuple[Figure, str]: the matplotlib figure and the target .eps path.
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    pu.figure_setup()
    fig_size = pu.get_fig_size(15, 7)
    fig = plt.figure(figsize=fig_size)
    ax = fig.add_subplot()
    ax.title.set_text('Dataset G2 (CI final)')
    ax.set_prop_cycle(color=plt.cm.Set1.colors)
    dimensions_sizes = [
        '1', '2', '4', '8', '16', '32', '64', '128', '256', '512', '1024'
    ]
    std_sizes = list(range(10, 101, 10))
    for init_method in init_methods:
        final_percentages_means = []
        for dimensions_size in dimensions_sizes:
            final_percentages = []
            for std_size in std_sizes:
                df = pd.read_csv(results_dir /
                                 f"g2-{dimensions_size}-{std_size}" /
                                 f"{init_method}.csv")
                n_rows = df.shape[0]
                # Runs that ended with CI == 0 count as successes.
                final_zeros = df['ci_final'].value_counts().get(0, 0)
                final_percentages.append((final_zeros / n_rows) * 100)
            final_percentages_means.append(np.mean(final_percentages))
        label = "KMeansPP" if init_method == "kmeans++" else init_method
        ax.plot(dimensions_sizes, final_percentages_means, label=label)
    ax.set_xticks(dimensions_sizes)
    ax.set_xlabel('Dimensões')
    ax.set_yticks([99, 99.20, 99.40, 99.60, 99.80, 100])
    # Raw strings: '\%' is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+); the runtime value is unchanged.
    ax.set_yticklabels(
        [r'99\%', r'99,2\%', r'99,4\%', r'99,6\%', r'99,8\%', r'100\%'])
    ax.set_ylabel(r'Taxa de sucesso (\%)')
    plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    plt.tight_layout()
    filename = get_project_results_dir().joinpath('fig16_4.eps')
    return fig, str(filename)
def table6():
    """Build a LaTeX table with the minimum CI reached in 200k iterations
    for each init method on the A3 and Unbalanced datasets.

    Returns:
        str: the rendered LaTeX table (also printed to stdout).
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    table = LatexTable()
    table.add_caption(
        "CI mínimo atingido em 200k iterações para cada método nos datasets A3 e Unbalanced."
    )
    for dataset in ["a3", "unb"]:
        table.add_line()
        table.add_header([dataset])
        header = ['Método', 'CI mínimo', 'Iteração']
        table.add_header(header)
        for init_method in init_methods:
            row = []
            row.append(init_method)
            filepath = results_dir / f"{dataset}-200-r100" / f"{init_method}.csv"
            df = pd.read_csv(filepath)
            min_value = df['ci_final'].min()
            row.append(str(min_value))
            # Bug fix: the original did df['ci_final'].iloc[min_value],
            # which indexes by the minimum *value* and returns an arbitrary
            # CI instead of the iteration where the minimum was reached.
            # idxmin() returns the row label of the first minimum (assumes
            # the default RangeIndex corresponds to the iteration number —
            # TODO confirm against the CSV layout).
            min_iter = df['ci_final'].idxmin()
            row.append(str(min_iter))
            table.add_row(row)
    table.add_line()
    result = table.to_str()
    print(result)
    return result
def produce_report(init_method, dataset, experiment_data):
    """Append experiment rows to ``<results>/<dataset>/<init_method>.csv``.

    Args:
        init_method: name of the initialization method (used as file name).
        dataset: dataset name (used as directory name).
        experiment_data: list of dicts, all sharing the same keys, one per
            experiment run.

    The file is opened in append mode; the header is written only when the
    file is empty. (The original wrote the header on every call, which
    duplicated it in the middle of the CSV on repeated appends.)
    """
    if not experiment_data:
        return  # nothing to write; also avoids experiment_data[0] IndexError
    results_dir = get_project_results_dir()
    report_file = results_dir / dataset / f"{init_method}.csv"
    report_file.parent.mkdir(exist_ok=True)
    report_file.touch(exist_ok=True)
    with report_file.open('a') as f:
        writer = csv.DictWriter(f, experiment_data[0].keys())
        if report_file.stat().st_size == 0:
            writer.writeheader()
        writer.writerows(experiment_data)
def fig16_1():
    """Plot the initial success rate (CI == 0 before k-means refinement) on
    the DIM datasets as a function of dimensionality.

    Returns:
        tuple[Figure, str]: the matplotlib figure and the target .eps path.
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    pu.figure_setup()
    fig_size = pu.get_fig_size(15, 7)
    fig = plt.figure(figsize=fig_size)
    ax = fig.add_subplot()
    ax.title.set_text('Dataset DIM (CI inicial)')
    ax.set_prop_cycle(color=plt.cm.Set1.colors)
    dimensions_sizes = ['32', '64', '128', '256', '512', '1024']
    for init_method in init_methods:
        initial_percentages = []
        for dimensions_size in dimensions_sizes:
            df = pd.read_csv(results_dir / f"dim{dimensions_size}" /
                             f"{init_method}.csv")
            n_rows = df.shape[0]
            # Runs whose *initial* centroids already had CI == 0.
            initial_zeros = df['ci_initial'].value_counts().get(0, 0)
            initial_percentages.append((initial_zeros / n_rows) * 100)
        label = "KMeansPP" if init_method == "kmeans++" else init_method
        ax.plot(dimensions_sizes, initial_percentages, label=label)
    ax.set_xticks(dimensions_sizes)
    ax.set_xlabel('Dimensões')
    ax.set_yticks([0, 20, 40, 60, 80, 100])
    # Raw strings: '\%' is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+); the runtime value is unchanged.
    ax.set_yticklabels([r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%'])
    ax.set_ylabel(r'Taxa de sucesso (\%)')
    plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    plt.tight_layout()
    filename = get_project_results_dir().joinpath('fig16_1.eps')
    return fig, str(filename)
def fig14():
    """Plot the final success rate (CI == 0) on the B2 subset datasets as a
    function of the number of clusters k.

    Returns:
        tuple[Figure, str]: the matplotlib figure and the target .eps path.
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    pu.figure_setup()
    fig_size = pu.get_fig_size(15, 7)
    fig = plt.figure(figsize=fig_size)
    ax = fig.add_subplot()
    ax.set_prop_cycle(color=plt.cm.Set1.colors)
    cluster_sizes = list(range(10, 101, 10))
    for init_method in init_methods:
        final_percentages = []
        for cluster_size in cluster_sizes:
            df = pd.read_csv(results_dir / f"b2-sub-{cluster_size}" /
                             f"{init_method}.csv")
            n_rows = df.shape[0]
            # Runs that ended with CI == 0 count as successes.
            final_zeros = df['ci_final'].value_counts().get(0, 0)
            final_percentages.append((final_zeros / n_rows) * 100)
        label = "KMeansPP" if init_method == "kmeans++" else init_method
        ax.plot(cluster_sizes, final_percentages, label=label)
    ax.set_xticks(cluster_sizes)
    ax.set_xlabel('Clusters (k)')
    ax.set_yticks([0, 20, 40, 60, 80, 100])
    # Raw strings: '\%' is an invalid escape sequence (SyntaxWarning on
    # Python 3.12+); the runtime value is unchanged.
    ax.set_yticklabels([r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%'])
    ax.set_ylabel(r'Taxa de sucesso (\%)')
    plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    plt.tight_layout()
    filename = get_project_results_dir().joinpath('fig14.eps')
    return fig, str(filename)
def fig15():
    """Plot the mean relative CI (mean final CI divided by k) on the B2
    subset datasets as a function of the number of clusters k.

    Returns:
        tuple[Figure, str]: the matplotlib figure and the target .eps path.
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    pu.figure_setup()
    fig_size = pu.get_fig_size(15, 7)
    fig = plt.figure(figsize=fig_size)
    ax = fig.add_subplot()
    ax.set_prop_cycle(color=plt.cm.Set1.colors)
    cluster_sizes = list(range(10, 101, 10))
    for init_method in init_methods:
        relative_cis = []
        for cluster_size in cluster_sizes:
            df = pd.read_csv(results_dir / f"b2-sub-{cluster_size}" /
                             f"{init_method}.csv")
            # Normalize by k so datasets with different k are comparable.
            # (Removed an unused n_rows local from the original.)
            ci_mean = df['ci_final'].mean()
            relative_cis.append(ci_mean / cluster_size)
        label = "KMeansPP" if init_method == "kmeans++" else init_method
        ax.plot(cluster_sizes, relative_cis, label=label)
    ax.set_xticks(cluster_sizes)
    ax.set_xlabel('Clusters (k)')
    ax.set_ylabel('CI relativo (CI/k)')
    plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
    plt.tight_layout()
    filename = get_project_results_dir().joinpath('fig15.eps')
    return fig, str(filename)
def plot_hq_mtx(parameters_dict):
    """Plot one confusion matrix per LVQ variant for the given dataset.

    Args:
        parameters_dict: dict with keys 'X', 'y' (data and labels),
            'k_lst' (k-NN neighbor counts; the last one is used),
            'measures_lst' (at least 3 LVQ version names),
            'dataset_name' and 'target_names' (display labels).

    Saves the figure to ``<results>/<dataset_name>_cf_mtx.eps``.
    """
    X = parameters_dict['X']
    y = parameters_dict['y']
    k_lst = parameters_dict['k_lst']
    # Removed unused 'p_lst' local from the original.
    measures_lst = parameters_dict['measures_lst']
    dataset_name = parameters_dict['dataset_name']
    target_names = parameters_dict['target_names']
    pu.figure_setup()
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=1,
                                                        shuffle=True,
                                                        stratify=y)
    fig_size = pu.get_fig_size(15, 4.4)
    fig = plt.figure(figsize=fig_size)
    fig.suptitle(f'Dataset: {dataset_name.upper()}')
    # One subplot per LVQ version (first three entries of measures_lst).
    for i in range(3):
        ax = fig.add_subplot(1, 3, i + 1)
        ax.set_axisbelow(True)
        curr_name = measures_lst[i]
        pg = LVQ(prototypes_number=35, version=curr_name)
        s_set = pg.generate(X_train, y_train)
        # k-NN trained on the generated prototypes (s_set = (X, y)).
        classifier = Knn(n_neighbors=k_lst[-1]).fit(s_set[0], s_set[1])
        plot_confusion_matrix(classifier,
                              X_test,
                              y_test,
                              display_labels=target_names,
                              ax=ax,
                              cmap=plt.cm.Blues,
                              normalize=None)
        ax.set_title(curr_name)
    plt.tight_layout()
    filename = get_project_results_dir().joinpath(dataset_name + '_cf_mtx.eps')
    pu.save_fig(fig, str(filename))
def plot_hq_summary_p(parameters_dict):
    """Plot processing time and accuracy versus prototype count for each
    LVQ measure on one figure with two subplots.

    Args:
        parameters_dict: dict with keys 'p_lst' (prototype counts),
            'measures_lst' (measure names; '<name>-p' keys hold
            (times, accuracies) series pairs), and 'dataset_name'.

    Saves the figure to ``<results>/<dataset_name>_summary_p.eps``.
    """
    p_lst = parameters_dict['p_lst']
    # Removed unused 'k_lst' local from the original.
    measures_lst = parameters_dict['measures_lst']
    dataset_name = parameters_dict['dataset_name']
    fmt = ['ro--', 'g^--', 'bs--']

    def _plot_series(ax, ylabel, series_index):
        """Plot one series (0=time, 1=accuracy) per measure on ax."""
        ax.set_xlabel('Protótipos')
        ax.set_ylabel(ylabel)
        ax.set_axisbelow(True)
        for i, style in enumerate(fmt):
            curr_measure = f'{measures_lst[i]}-p'
            ax.plot(p_lst,
                    parameters_dict[curr_measure][series_index],
                    style,
                    markersize=1.5,
                    linewidth=0.5,
                    label=measures_lst[i])
        ax.set_xticks(p_lst)
        plt.legend()
        plt.tight_layout()

    pu.figure_setup()
    fig_size = pu.get_fig_size(15, 6)
    fig = plt.figure(figsize=fig_size)
    fig.suptitle(f'Dataset: {dataset_name.upper()}')
    # The two subplots were verbatim duplicates in the original; only the
    # y-label and the series index differ.
    _plot_series(fig.add_subplot(1, 2, 1), 'Tempo de Processamento (s)', 0)
    _plot_series(fig.add_subplot(1, 2, 2), 'Acurácia', 1)
    filename = get_project_results_dir().joinpath(dataset_name +
                                                  '_summary_p.eps')
    pu.save_fig(fig, str(filename))
def fig13():
    """Plot stacked success-rate bars (initial vs. final CI == 0) for the
    G2 datasets, split into low- and high-overlap groups.

    Returns:
        tuple[Figure, str]: the matplotlib figure and the target .eps path.
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    # Display names for the x axis (kmeans++ rendered as KMeansPP).
    init_methods_bar = [
        'Rand-P', 'Rand-C', 'Maxmin', 'KMeansPP', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    high_overlap_datasets = [
        'g2-2-40', 'g2-2-50', 'g2-2-60', 'g2-2-70', 'g2-2-80', 'g2-2-90',
        'g2-2-100', 'g2-4-50', 'g2-4-60', 'g2-4-70', 'g2-4-80', 'g2-4-90',
        'g2-4-100', 'g2-8-70', 'g2-8-80', 'g2-8-90', 'g2-8-100', 'g2-16-90',
        'g2-16-100',
    ]

    def _mean_success(dataset_dirs):
        """Per init method: (mean initial success %, final-minus-initial %).

        The second list holds the *increment* only, because it is drawn
        stacked on top of the initial bars.
        """
        initial, final = [], []
        for init_method in init_methods:
            initial_percentages = []
            final_percentages = []
            for dataset_dir in dataset_dirs:
                df = pd.read_csv(dataset_dir / f"{init_method}.csv")
                n_rows = df.shape[0]
                initial_zeros = df['ci_initial'].value_counts().get(0, 0)
                initial_percentages.append((initial_zeros / n_rows) * 100)
                final_zeros = df['ci_final'].value_counts().get(0, 0)
                final_percentages.append((final_zeros / n_rows) * 100)
            initial.append(np.mean(initial_percentages))
            final.append(np.mean(final_percentages) -
                         np.mean(initial_percentages))
        return initial, final

    def _draw_bars(ax, initial, final):
        """Draw the stacked bar pair and shared axis cosmetics on ax."""
        ax.bar(init_methods_bar, initial, label='Inicial', color='gray',
               edgecolor='black')
        ax.bar(init_methods_bar, final, bottom=initial, label='Final',
               color='white', edgecolor='black')
        ax.tick_params('x', labelrotation=70)
        ax.set_ylim([0, 110])
        ax.set_yticks([0, 20, 40, 60, 80, 100])
        # Raw strings: '\%' is an invalid escape sequence on Python 3.12+.
        ax.set_yticklabels(
            [r'0\%', r'20\%', r'40\%', r'60\%', r'80\%', r'100\%'])
        # Positional first argument works both before and after the
        # matplotlib 3.6 removal of the 'b' keyword (renamed 'visible').
        ax.grid(False, axis='x')

    pu.figure_setup()
    fig_size = pu.get_fig_size(15, 7)
    fig = plt.figure(figsize=fig_size)
    axs = fig.subplots(ncols=2)

    axs[0].title.set_text('Baixa sobreposição')
    low_overlap_dirs = [
        d for d in results_dir.glob('g2*')
        if d.stem not in high_overlap_datasets
    ]
    initial, final = _mean_success(low_overlap_dirs)
    _draw_bars(axs[0], initial, final)
    axs[0].set_ylabel(r'Taxa de sucesso (\%)')

    axs[1].title.set_text('Alta sobreposição')
    high_overlap_dirs = [results_dir / name for name in high_overlap_datasets]
    initial, final = _mean_success(high_overlap_dirs)
    _draw_bars(axs[1], initial, final)

    plt.legend()
    plt.tight_layout()
    filename = get_project_results_dir().joinpath('fig13.eps')
    return fig, str(filename)
def table3():
    """Build a LaTeX table reproducing Table 3 of the original article:
    initial/final CI values, success percentage, and iteration counts per
    init method over the benchmark datasets.

    Returns:
        str: the rendered LaTeX table (also printed to stdout).
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    table = LatexTable()
    table.add_caption(
        "Resultados para comparação com a Tabela 3 do artigo original.")
    table.add_line()

    # --- CI-values (initial) ------------------------------------------
    table.add_header(['CI-values (initial)'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32', 'Aver.',
    ]
    table.add_header(header)
    averages = []  # per-method initial averages, reused for 'Impr.' below
    for init_method in init_methods:
        row = [init_method]
        means = []
        for dataset in header[1:-1]:
            filepath = results_dir / dataset / f"{init_method}.csv"
            if not filepath.exists():
                # Missing result file counts as 0 so averages stay comparable.
                means.append(0.0)
                row.append('0.0')
                continue
            df = pd.read_csv(filepath)
            mean = df['ci_initial'].mean()
            means.append(mean)
            row.append(f"{mean:.1f}")
        average = np.mean(means)
        averages.append(average)
        row.append(f"\\bfseries{{{average:.1f}}}")
        table.add_row(row)
    table.add_line()

    # --- CI-values (final) --------------------------------------------
    table.add_header(['CI-values (final)'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32', 'Aver.', 'Impr.',
    ]
    table.add_header(header)
    for init_method, initial_average in zip(init_methods, averages):
        row = [init_method]
        means = []
        for dataset in header[1:-2]:
            filepath = results_dir / dataset / f"{init_method}.csv"
            if not filepath.exists():
                means.append(0.0)
                row.append('0.0')
                continue
            df = pd.read_csv(filepath)
            mean = df['ci_final'].mean()
            means.append(mean)
            row.append(f"{mean:.1f}")
        average = np.mean(means)
        row.append(f"\\bfseries{{{average:.1f}}}")
        # Relative improvement of final over initial CI, in percent.
        improvement = 100.0 - (average / initial_average) * 100
        row.append(f"\\bfseries{{{improvement:.0f}\\%}}")
        table.add_row(row)
    table.add_line()

    # --- Success-% -----------------------------------------------------
    table.add_header([r'Success-\%'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32', 'Aver.', 'Fails',
    ]
    table.add_header(header)
    for init_method in init_methods:
        row = [init_method]
        percentages = []
        for dataset in header[1:-2]:
            filepath = results_dir / dataset / f"{init_method}.csv"
            if not filepath.exists():
                percentages.append(0)
                row.append(r'0\%')
                continue
            df = pd.read_csv(filepath)
            zeros = df['ci_final'].value_counts().get(0, 0)
            n_rows = df.shape[0]
            percentage = (zeros / n_rows) * 100
            percentages.append(percentage)
            row.append(f"{percentage:.0f}\\%")
        average = np.mean(percentages)
        row.append(f"\\bfseries{{{average:.0f}\\%}}")
        # 'Fails' = number of datasets where the method never succeeded.
        row.append(f"\\bfseries{{{Counter(percentages)[0]}}}")
        table.add_row(row)
    table.add_line()

    # --- Number of iterations -----------------------------------------
    table.add_header(['Number of iterations'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32', 'Aver.',
    ]
    table.add_header(header)
    for init_method in init_methods:
        row = [init_method]
        means = []
        for dataset in header[1:-1]:
            filepath = results_dir / dataset / f"{init_method}.csv"
            if not filepath.exists():
                # Bug fix: the original appended to 'percentages' (a
                # leftover list from the previous section) and emitted a
                # '0\%' cell in this numeric column.
                means.append(0.0)
                row.append('0')
                continue
            df = pd.read_csv(filepath)
            mean = df['iterations'].mean()
            means.append(mean)
            row.append(f"{mean:.0f}")
        average = np.mean(means)
        row.append(f"\\bfseries{{{average:.0f}}}")
        table.add_row(row)
    table.add_line()

    result = table.to_str()
    print(result)
    return result
def table5():
    """Build a LaTeX table reproducing Table 5 of the original article:
    repeated-k-means (r100) CI values, success percentage, and running
    time per init method.

    Returns:
        str: the rendered LaTeX table (also printed to stdout).
    """
    results_dir = get_project_results_dir()
    init_methods = [
        'Rand-P', 'Rand-C', 'Maxmin', 'kmeans++', 'Bradley', 'Sorting',
        'Projection', 'Luxburg', 'Split'
    ]
    table = LatexTable()
    table.add_caption(
        "Resultados para comparação com a Tabela 5 do artigo original.")
    table.add_line()

    # --- CI-values (KM baseline vs. repeated KM) ----------------------
    table.add_header(['CI-values'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32', 'KM', 'RKM',
    ]
    table.add_header(header)
    # Baseline: average final CI of plain k-means (non-r100 results).
    km_averages = []
    for init_method in init_methods:
        means = []
        for dataset in header[1:-2]:
            filepath = results_dir / dataset / f"{init_method}.csv"
            df = pd.read_csv(filepath)
            means.append(df['ci_final'].mean())
        km_averages.append(np.mean(means))
    for init_method, km_average in zip(init_methods, km_averages):
        row = [init_method]
        means = []
        for dataset in header[1:-2]:
            filepath = results_dir / f"{dataset}-r100" / f"{init_method}.csv"
            df = pd.read_csv(filepath)
            mean = df['ci_final'].mean()
            means.append(mean)
            row.append(f"{mean:.1f}")
        row.append(f"\\bfseries{{{km_average:.1f}}}")
        average = np.mean(means)
        row.append(f"\\bfseries{{{average:.1f}}}")
        table.add_row(row)
    table.add_line()

    # --- Success-% -----------------------------------------------------
    table.add_header([r'Success-\%'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32', 'Aver.', 'Fails',
    ]
    table.add_header(header)
    for init_method in init_methods:
        row = [init_method]
        percentages = []
        for dataset in header[1:-2]:
            filepath = results_dir / f"{dataset}-r100" / f"{init_method}.csv"
            df = pd.read_csv(filepath)
            zeros = df['ci_final'].value_counts().get(0, 0)
            n_rows = df.shape[0]
            percentage = (zeros / n_rows) * 100
            percentages.append(percentage)
            row.append(f"{percentage:.0f}\\%")
        average = np.mean(percentages)
        row.append(f"\\bfseries{{{average:.0f}\\%}}")
        # 'Fails' = number of datasets where the method never succeeded.
        row.append(f"\\bfseries{{{Counter(percentages)[0]}}}")
        table.add_row(row)
    table.add_line()

    # --- Running time (s) ----------------------------------------------
    table.add_header(['Running time (s)'])
    header = [
        'Method', 's1', 's2', 's3', 's4', 'a1', 'a2', 'a3', 'unb', 'b1',
        'b2', 'dim32'
    ]
    table.add_header(header)
    for init_method in init_methods:
        row = [init_method]
        for dataset in header[1:]:
            filepath = results_dir / f"{dataset}-r100" / f"{init_method}.csv"
            df = pd.read_csv(filepath)
            mean = df['elapsed_time'].mean()
            # Bug fix: the original also did means.append(mean) here,
            # polluting a stale list left over from an earlier section.
            row.append(f"{mean:.1f}")
        table.add_row(row)
    table.add_line()

    result = table.to_str()
    print(result)
    return result