# Driver statements for the dataset speed-comparison evaluation.
#
# Visible steps, in textual order:
#   1. Create ds_folder and out_folder when they are not existing directories.
#   2. Build params_general: an OrderedDict whose 'calculate_kmeans' entry is
#      an (empty) OrderedDict.
#   3. Define three cluster-count lists:
#        clusters_big    = [100, 1000, 10000]
#        clusters_medium = [100, 500, 5000]
#        clusters_small  = [100, 250, 1000]
#   4. Call do_evaluations(...) for 'usps' (clusters_small); this call appears
#      before the `if not args.testmode:` check.
#   5. After `if not args.testmode:` call do_evaluations(...) for
#      sector (small), realsim (medium), mediamill (medium), caltech101 (big),
#      e2006 (small), avira201 (big), kdd2001 (big) and mnist800k (big).
#      All but sector are obtained through
#      load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, <file>).
#   6. `else:` branch: raise Exception when out_folder is not a directory.
#   7. Create output_path_latex when missing and call
#      result_evaluation_dataset_speed_comparison(out_folder, output_path_latex).
#
# NOTE(review): the statements are collapsed onto one physical line, so the
# nesting cannot be read off the text. The `else:` presumably belongs to an
# enclosing `if` that starts before this fragment (pairing it with
# `if not args.testmode:` would make the raise unreachable, since out_folder
# is created above) -- confirm against the original layout before reformatting.
# NOTE(review): whether steps 7's statements sit inside or after the `else:`
# branch is likewise not determinable from here -- TODO confirm.
if not os.path.isdir(ds_folder): os.makedirs(ds_folder) if not os.path.isdir(out_folder): os.makedirs(out_folder) params_general = collections.OrderedDict() params_general['calculate_kmeans'] = collections.OrderedDict() clusters_big = [100, 1000, 10000] clusters_medium = [100, 500, 5000] clusters_small = [100, 250, 1000] do_evaluations(load_usps_dataset(ds_folder), 'usps', out_folder, params_general, clusters_small) if not args.testmode: do_evaluations(load_sector_dataset(ds_folder), 'sector', out_folder, params_general, clusters_small) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'real_sim.scaled.bz2'), 'realsim', out_folder, params_general, clusters_medium) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'mediamill_static_label_scaled.bz2'), 'mediamill', out_folder, params_general, clusters_medium) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'caltech101.scaled.bz2'), 'caltech101', out_folder, params_general, clusters_big) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'e2006_static_label.scaled.bz2'), 'e2006', out_folder, params_general, clusters_small) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'avira_201.scaled.bz2'), 'avira201', out_folder, params_general, clusters_big) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'kdd.scaled.bz2'), 'kdd2001', out_folder, params_general, clusters_big) do_evaluations(load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'mnist800k.scaled.bz2'), 'mnist800k', out_folder, params_general, clusters_big) else: if not os.path.isdir(out_folder): raise Exception("cannot do evaluation with nonexisting output dir %s" % out_folder) if not os.path.isdir(output_path_latex): os.makedirs(output_path_latex) result_evaluation_dataset_speed_comparison(out_folder, output_path_latex)
# Driver statements for an evaluation that sweeps two parameter lists.
#
# Parameter lists (both are lists of floats, integer percent / 100.0):
#   args.testmode true:
#     truncated_svd_annz_percentage = range(6, 10, 2)  / 100 -> 0.06, 0.08
#     bv_annz                       = range(25, 35, 5) / 100 -> 0.25, 0.30
#   otherwise:
#     truncated_svd_annz_percentage = range(2, 42, 2)  / 100 -> 0.02 .. 0.40
#                                     (20 values, step 0.02)
#     bv_annz                       = range(5, 75, 5)  / 100 -> 0.05 .. 0.70
#                                     (14 values, step 0.05)
#
# Both lists are passed as the last two positional arguments of every
# do_evaluations(...) call visible here:
#   * 'usps'      with clusters_small (appears before the testmode check);
#   * after `if not args.testmode:` 'sector' (clusters_small),
#     'realsim' (clusters_medium) and 'mediamill' (clusters_medium).
#
# NOTE(review): ds_folder, out_folder, params_general and the clusters_* lists
# are defined outside this fragment.
# NOTE(review): the fragment stops inside an unterminated `do_evaluations(`
# call and the statements are collapsed onto one physical line, so neither the
# remaining calls nor the exact nesting are visible here -- confirm against
# the original layout before reformatting.
if args.testmode: truncated_svd_annz_percentage = [ float(x) / 100.0 for x in range(6, 10, 2) ] bv_annz = [float(x) / 100.0 for x in range(25, 35, 5)] else: truncated_svd_annz_percentage = [ float(x) / 100.0 for x in range(2, 42, 2) ] bv_annz = [float(x) / 100.0 for x in range(5, 75, 5)] do_evaluations(load_usps_dataset(ds_folder), 'usps', out_folder, params_general, clusters_small, truncated_svd_annz_percentage, bv_annz) if not args.testmode: do_evaluations(load_sector_dataset(ds_folder), 'sector', out_folder, params_general, clusters_small, truncated_svd_annz_percentage, bv_annz) do_evaluations( load_and_extract_dataset_from_github('fcl_datasets2', ds_folder, 'real_sim.scaled.bz2'), 'realsim', out_folder, params_general, clusters_medium, truncated_svd_annz_percentage, bv_annz) do_evaluations( load_and_extract_dataset_from_github( 'fcl_datasets2', ds_folder, 'mediamill_static_label_scaled.bz2'), 'mediamill', out_folder, params_general, clusters_medium, truncated_svd_annz_percentage, bv_annz) do_evaluations(
label=dataset_name) plt.legend() plt.grid(True) plt.xlabel('number of clusters') plt.ylabel('avoided full distance calculations (percent)') plt.title( r'Varying k and observing avoided full distance calculations (bv annz = 0.3)' ) destination_filename = join(dirname(__file__), "varying_k_evaluation.png") plt.savefig(destination_filename) print("plot was saved in the current folder to: %s" % destination_filename) if __name__ == "__main__": datasets = [] ds_folder = abspath( join(dirname(__file__), os.pardir, os.pardir, os.pardir, 'datasets')) with open(load_sector_dataset(ds_folder), 'r') as f: datasets.append(('sector', f.read())) with open(load_usps_dataset(ds_folder), 'r') as f: datasets.append(('usps', f.read())) do_evaluations(datasets)