t = myProf.Timer() # ensemble_file = '/home/diogoaos/QCThesis/datasets/gauss10e6_overlap/ensemble_500k_test2.h5' ensemble_file = '/media/Data/diogoaos_tmp/gaussseparated_ensembles/ensemble_500000_2sqrt.hdf' coassc_path_ssd = '/home/diogoaos/QCThesis/coassoc.h5' index_path_ssd = '/home/diogoaos/QCThesis/' coassc_path_spin = '/media/Data/diogoaos_tmp/coassoc.h5' index_path_spin = '/media/Data/diogoaos_tmp/' print "loading ensemble" t.reset() t.tic() ensemble = part.loadEnsembleFromFileHDF(ensemble_file) print 'load ensemble time: {}'.format(t.tac()) n_samples = part.n_samples_from_partition(ensemble[0]) n_partitions = len(ensemble) print "number of samples: {}".format(n_samples) print "number of partitions: {}".format(n_partitions) ma = eacSp._compute_max_assocs_from_ensemble(ensemble) ma *= 3 ma = int(ma) print "memory required: {} MB".format(ma * n_samples * 5 / (1024.0**2)) mat = eacSp.EAC_CSR(n_samples=n_samples, max_assocs=ma,
def get_ensemble(data_sampled, rule):
    """Load the cached ensemble for the current configuration, or build and cache it.

    The ensemble file lives in ``folder`` and is named
    ``ensemble_<n>_<rule.__doc__>.hdf``. If the file exists it is loaded;
    otherwise a new ensemble is generated with a K-Means generator and saved
    to that path. Afterwards the biggest cluster size of the ensemble is
    logged and the memory estimates for the different co-association matrix
    schemes are computed with ``compute_mems``.

    NOTE(review): this function reads several names that are not parameters
    (``n``, ``folder``, ``t``, ``n_partitions``, ``n_iters``,
    ``sparse_max_assocs_factor``, ``logger``, ``compute_mems``). They are
    presumably module-level globals set by the surrounding experiment
    script -- confirm before reusing this function elsewhere.

    Args:
        data_sampled: data handed to ``part.generateEnsemble`` when no cached
            ensemble file exists.
        rule: callable mapping the number of samples to an indexable pair
            ``(Kmin, Kmax)``; its ``__doc__`` is used as the configuration
            label in log messages and in the ensemble file name.

    Returns:
        None. The configuration is skipped (early return) when ``Kmax >= n``
        or ``Kmin <= 1``.
    """
    n_clusts = rule(n)

    logger.info("* * * * * * * * * * * * * * * * * *")
    logger.info("Num. samples: {}".format(n))
    logger.info("New config: {}".format(rule.__doc__))
    logger.info("* * * * * * * * * * * * * * * * * *")

    # Skip if the number of clusters is not smaller than the number of
    # samples. The original used `continue` here, which is only legal inside
    # a loop; inside this function the equivalent "skip" is an early return.
    if n_clusts[1] >= n:
        logger.info("Kmax too large for dataset size. Skipping...")
        return
    if n_clusts[0] <= 1:
        logger.info("Kmin too little. Skipping...")
        return

    ## generate ensemble
    logger.info("Checking for ensemble in folder...")
    generator = myKM.K_Means(cuda_mem="manual")

    # if there is an ensemble file load it, otherwise generate and save
    ensemble_filename = os.path.join(
        folder, "ensemble_{}_{}.hdf".format(n, rule.__doc__))
    if not os.path.exists(ensemble_filename):
        logger.info("No ensemble detected. Generating ensemble...")
        t.reset()
        t.tic()
        ensemble = part.generateEnsemble(data_sampled, generator, n_clusts,
                                         n_partitions, n_iters)
        t.tac()
        part.saveEnsembleToFileHDF(ensemble_filename, ensemble)
        logger.info("Saved ensemble in file: {}".format(ensemble_filename))
        t_ensemble = t.elapsed
    else:
        logger.info(
            "Ensemble detected in file {}. Loading ensemble...".format(
                ensemble_filename))
        ensemble = part.loadEnsembleFromFileHDF(ensemble_filename)
        # -1 marks "loaded from disk": no generation time was measured.
        t_ensemble = -1

    # ensemble_name = "ensemble_" + rule.__doc__ + ".hdf"
    # part.saveEnsembleToFileHDF(os.path.join(folder, ensemble_name), ensemble)

    max_cluster_size = myEAC.biggest_cluster_size(ensemble)
    logger.info("Maximum cluster size: {}".format(max_cluster_size))

    # # # # # # # # # # # # # # #
    # check memory usage for different matrix schemes

    # compute memory usage for each type of matrix
    # linear properties for condensed sparse matrix
    n_s = 0.05
    n_e = 1.0
    val_s = 1.0
    val_e = 0.05
    ma = max_cluster_size * sparse_max_assocs_factor

    mems = compute_mems(n, ma, n_s, n_e, val_s, val_e)

    # NOTE(review): none of the values below (nor t_ensemble) are used in the
    # visible part of this function; they are kept because later code may
    # rely on them -- confirm.
    f_mat = mems[0]  # full matrix
    fc_mat = mems[1]  # full condensed matrix
    sp_const = mems[2]  # sparse constant matrix
    sp_lin = mems[3]  # sparse linear matrix
    sp_const_mst = mems[4]
    sp_lin_mst = mems[5]
# Benchmark setup: load a stored ensemble from disk, report its size and the
# estimated memory footprint, and allocate the sparse co-association matrix.
t = myProf.Timer()

# ensemble_file = '/home/diogoaos/QCThesis/datasets/gauss10e6_overlap/ensemble_500k_test2.h5'
ensemble_file = "/media/Data/diogoaos_tmp/gaussseparated_ensembles/ensemble_500000_2sqrt.hdf"

# Output locations; the names suggest an SSD and a spinning-disk target
# (presumably used further down the script -- not visible here).
coassc_path_ssd = "/home/diogoaos/QCThesis/coassoc.h5"
index_path_ssd = "/home/diogoaos/QCThesis/"

coassc_path_spin = "/media/Data/diogoaos_tmp/coassoc.h5"
index_path_spin = "/media/Data/diogoaos_tmp/"

# The single-argument, parenthesised print form is used throughout so the
# script parses on Python 3 while printing exactly the same text on Python 2.
print("loading ensemble")
t.reset()
t.tic()
ensemble = part.loadEnsembleFromFileHDF(ensemble_file)
print("load ensemble time: {}".format(t.tac()))

n_samples = part.n_samples_from_partition(ensemble[0])
n_partitions = len(ensemble)
print("number of samples: {}".format(n_samples))
print("number of partitions: {}".format(n_partitions))

# Triple the max-associations figure computed from the ensemble and truncate
# it to an int (presumably a safety margin -- TODO confirm).
ma = eacSp._compute_max_assocs_from_ensemble(ensemble)
ma *= 3
ma = int(ma)

# 1024.0 ** 2 is a float, so this is true division on Python 2 as well.
# NOTE(review): the factor 5 looks like bytes per stored association --
# confirm against EAC_CSR's storage layout.
print("memory required: {} MB".format(ma * n_samples * 5 / (1024.0 ** 2)))

mat = eacSp.EAC_CSR(n_samples=n_samples, max_assocs=ma, condensed=True,
                    sort_mode="surgical")
ensemble_path = os.path.join(ensemble_dir, ensemble_filename) if not os.path.exists(ensemble_path): logger.info("No ensemble detected. Generating ensemble...") t.reset() t.tic() ensemble = part.generateEnsemble(data_sampled, generator, n_clusts, n_partitions, n_iters) t.tac() part.saveEnsembleToFileHDF(ensemble_path, ensemble) logger.info("Saved ensemble in file: {}".format(ensemble_path)) t_ensemble = t.elapsed else: logger.info( "Ensemble detected in file {}. Loading ensemble...".format( ensemble_path)) ensemble = part.loadEnsembleFromFileHDF(ensemble_path) t_ensemble = -1 # ensemble_name = "ensemble_" + rule.__doc__ + ".hdf" # part.saveEnsembleToFileHDF(os.path.join(folder, ensemble_name), ensemble) max_cluster_size = myEAC.biggest_cluster_size(ensemble) logger.info("Maximum cluster size: {}".format(max_cluster_size)) # # # # # # # # # # # # # # # check memory usage for different matrix schemes # compute memory usage for each type of matrix # linear properties for condensed sparse matrix n_s = 0.05