def transform(self):
    """Transform data by mapping it into the latent space.

    Note: this maps each input to the mean of the approximate posterior;
    one could alternatively sample from the Gaussian distribution.
    Only num_batches * batch_size points are returned, so the trailing
    partial batch is dropped: the result has shape
    (num_batches * batch_size, z_dim), e.g. (69952, 62) for the
    70000-point dataset at batch_size 64.
    """
    # Reload the data without shuffling before mapping it to z space,
    # so the returned codes line up with the original dataset indices.
    if self.label != -1:
        X, y = utils_parent.load_mnist(self.dataset_name, shuffle=False)
        d = split_data_according_to_label(X, y, self.num_labels)
        noshuffle_data_X = X[d[str(self.label)]]
        # d[str(self.label)] holds the indices of the points with this label
        noshuffle_data_y = d[str(self.label)]
    else:
        noshuffle_data_X, noshuffle_data_y = utils_parent.load_mnist(
            self.dataset_name, shuffle=False)
    # Encode batch by batch, conforming to the network's tensor definition,
    # then stack the per-batch codes into a single numpy array.
    codes = []
    for idx in range(self.num_batches):
        batch_images = noshuffle_data_X[idx * self.batch_size:(idx + 1) * self.batch_size]
        codes.append(self.sess.run(self.mu, feed_dict={self.inputs: batch_images}))
    r = np.concatenate(codes, axis=0)
    return r, noshuffle_data_y
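# Usage sketch (added for illustration; `VAE` is a hypothetical name for the
# class defined by the __init__ below -- adjust to the actual class name).
# Assumes the model graph (self.mu, self.inputs) is built and a checkpoint
# has been restored before transform() is called.
def example_transform_usage():
    with tf.Session() as sess:
        vae = VAE(sess, epoch=20, batch_size=64, z_dim=62,
                  dataset_name='fashion-mnist', checkpoint_dir='checkpoint',
                  result_dir='results', log_dir='logs')
        z, idx = vae.transform()  # z: (num_batches * batch_size, z_dim)
        np.save('latent_z.npy', z)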
def __init__(self, sess, epoch, batch_size, z_dim, dataset_name, checkpoint_dir,
             result_dir, log_dir, label=-1, num_labels=10, config_manager=None):
    self.sess = sess
    self.dataset_name = dataset_name
    self.checkpoint_dir = checkpoint_dir
    self.result_dir = result_dir
    self.log_dir = log_dir
    self.epoch = epoch
    self.batch_size = batch_size
    self.label = label
    self.config_manager = config_manager
    self.num_labels = num_labels

    if dataset_name == 'mnist' or dataset_name == 'fashion-mnist':
        # parameters
        self.input_height = 28
        self.input_width = 28
        self.output_height = 28
        self.output_width = 28

        self.z_dim = z_dim  # dimension of noise vector
        self.c_dim = 1

        # train
        self.learning_rate = 0.0002
        self.beta1 = 0.5

        # test
        self.sample_num = 64  # number of generated images to be saved

        # load mnist; if a label is given, restrict the training data to the
        # subset of MNIST/fashion-MNIST with that label
        if label != -1:
            X, y = utils_parent.load_mnist(self.dataset_name)
            # label_index[str(i)] holds the indices of the points with label i
            label_index = split_data_according_to_label(X, y, num_labels)
            # extract the data with the given label from the global training data
            self.data_X = X[label_index[str(label)]]
            # keep the original indices of the points with this label
            self.data_y = label_index[str(label)]
            # self.data_y = y[label_index[str(label)]]
        else:
            self.data_X, self.data_y = utils_parent.load_mnist(self.dataset_name)

        # number of batches for a single epoch
        self.num_batches = len(self.data_X) // self.batch_size
    elif dataset_name == 'imagenet':
        raise NotImplementedError
    else:
        raise NotImplementedError
def __init__(self, pattern="/global_index_cluster_data.npy",
             root_dir='../results/VAE_fashion-mnist_64_62',
             transform=None, list_idx=[0], dsname="fashion-mnist",
             num_labels=10, num_cluster=5):
    """
    Args:
        pattern (string): Path to the npy file.
        root_dir (string): Directory with all the images.
        transform (callable, optional): Optional transform to be applied
            on a sample.
        list_idx (list): indexes of the clusters to use as the train or
            test set, for example
                trainset = VGMMDataset(list_idx=[0, 1, 2, 3])
                testset = VGMMDataset(list_idx=[4])
        dsname (string): name of the dataset to load (currently
            "fashion-mnist").
    """
    X, y = utils_parent.load_mnist(dsname)
    y = y.argmax(axis=1)  # one-hot -> integer labels
    self.root_dir = root_dir
    self.pattern = pattern
    self.transform = transform
    # Build the cluster index if it has not been computed yet; otherwise load it.
    if not tf.gfile.Exists(self.root_dir + self.pattern):
        _, self.global_index = concatenate_data_from_dir(
            self.root_dir, num_labels=num_labels, num_clusters=num_cluster)
    else:
        self.global_index = np.load(self.root_dir + pattern, allow_pickle=True)
    self.list_idx = list_idx
    all_inds = []
    print('cluster index list:' + str(list_idx))
    # self.global_index is a dictionary of
    # {'0': [15352, 2152, 21, 25, ...], '1': [1121, 1252, 3195, ...], ...}
    for index in self.list_idx:
        to_append = self.global_index.item().get(str(index))
        print('\n size of cluster:' + str(np.shape(to_append)) + '\n')
        all_inds = np.append(all_inds, to_append)
    print(all_inds.shape)
    # np.append yields floats, so round back to integer indices
    self.all_inds = [round(a) for a in all_inds.tolist()]
    self.samples = {
        "x": X.take(self.all_inds, axis=0),
        "y": y.take(self.all_inds, axis=0)
    }
    print('\n size of dataset:' + str(np.shape(self.all_inds)) + '\n')
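# Usage sketch (added for illustration): the leave-one-cluster-out split from
# the docstring, fed to a standard DataLoader. Assumes torch is imported and
# the npy index file from a previous clustering run exists under root_dir.
def example_vgmm_split():
    trainset = VGMMDataset(list_idx=[0, 1, 2, 3])
    testset = VGMMDataset(list_idx=[4])
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=64,
                                              shuffle=True, num_workers=4)
    return trainloader, testset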
def counting_label(num_labels, num_clusters):
    """Compute the label distribution of each cluster."""
    # load data
    _, y = utils_parent.load_mnist(config.dataset_name)
    global_index = np.load(config.data_path + config.global_index_name,
                           allow_pickle=True)
    results = {}
    for i in range(num_clusters):
        index = global_index.item().get(str(i))
        # y is one-hot, so summing over the cluster gives per-label counts
        temp_y = np.sum(y[index], axis=0)
        total = np.sum(temp_y)
        temp_y = temp_y / total  # normalize counts to a distribution
        results[str(i)] = temp_y
    with open("distribution_y.txt", 'a') as lf:
        lf.write(str(results))
    return results
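# Self-contained toy illustration (added, not from the original source) of
# the normalization above: one-hot rows summed over a cluster give per-label
# counts, and dividing by the total gives the label distribution.
def _demo_label_distribution():
    import numpy as np
    y_toy = np.array([[1, 0, 0], [1, 0, 0], [0, 0, 1]])  # two 0s, one 2
    counts = np.sum(y_toy, axis=0)                        # -> [2, 0, 1]
    print(counts / np.sum(counts))                        # -> ~[0.667, 0., 0.333]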
def main(unused_argv):
    # parse arguments
    args = parse_args()
    if args is None:
        exit()
    # load training and eval data
    X, y = utils_parent.load_mnist(args.dataset)
    results_random_resample = cross_validation(X, y, config.num_clusters, args)
    results_shifted = cross_validation_for_clustered_data(
        X, y, config.data_path, config.num_labels, config.num_clusters, args)
    print("***********random************")
    print(results_random_resample)
    print("***********shifted************")
    print(results_shifted)
    utils_parent.write_results_convnet_to_csv("results_random.csv",
                                              results_random_resample)
    utils_parent.write_results_convnet_to_csv("results_cluster.csv",
                                              results_shifted)
def __init__(self, sess, epoch, batch_size, z_dim, dataset_name,
             checkpoint_dir, result_dir, log_dir):
    self.sess = sess
    self.dataset_name = dataset_name
    self.checkpoint_dir = checkpoint_dir
    self.result_dir = result_dir
    self.log_dir = log_dir
    self.epoch = epoch
    self.batch_size = batch_size

    if dataset_name == 'mnist' or dataset_name == 'fashion-mnist':
        # parameters
        self.input_height = 28
        self.input_width = 28
        self.output_height = 28
        self.output_width = 28

        self.z_dim = z_dim  # dimension of noise vector
        self.y_dim = 10     # dimension of code vector (label)
        self.c_dim = 1

        # train
        self.learning_rate = 0.0002
        self.beta1 = 0.5

        # test
        self.sample_num = 64  # number of generated images to be saved

        # code
        self.len_discrete_code = 10   # categorical distribution (i.e. label)
        self.len_continuous_code = 2  # gaussian distribution (e.g. rotation, thickness)

        # load mnist
        self.data_X, self.data_y = utils_parent.load_mnist(self.dataset_name)

        # number of batches for a single epoch
        self.num_batches = len(self.data_X) // self.batch_size
    else:
        raise NotImplementedError
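# Minimal instantiation sketch (added for illustration; `InfoGAN` is a
# hypothetical name for a class built around the __init__ above, suggested
# by its discrete/continuous code fields):
def example_instantiation():
    with tf.Session() as sess:
        model = InfoGAN(sess, epoch=20, batch_size=64, z_dim=62,
                        dataset_name='mnist', checkpoint_dir='checkpoint',
                        result_dir='results', log_dir='logs')
        print('batches per epoch:', model.num_batches)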
def cross_validation(num_labels, num_cluster, args):
    print("cross validation for random resampling")
    best_acc = 0
    resize = cf.resize
    start_epoch, num_epochs, batch_size, optim_type = \
        cf.start_epoch, cf.num_epochs, cf.batch_size, cf.optim_type
    results = {}
    X, y = utils_parent.load_mnist('fashion-mnist')
    kf = KFold(n_splits=num_cluster, shuffle=True)
    i = 0
    # KFold.split yields (train, test) index pairs; a debug check:
    # for xx in kf.split(X, y):
    #     print(xx)
    for train_eval_idx, test_idx in kf.split(X, y):
        cv_idx = i
        i = i + 1
        trainset, evalset, testset, inputs, outputs = prepare_data_for_normal_cv(
            args, train_eval_idx, test_idx, resize)

        # Hyper parameter settings
        use_cuda = cf.use_cuda()
        if use_cuda:
            torch.cuda.set_device(0)
        best_acc = 0  # reset per fold

        trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                                  shuffle=True, num_workers=4)
        evalloader = torch.utils.data.DataLoader(evalset, batch_size=batch_size,
                                                 shuffle=False, num_workers=4)
        testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                                 shuffle=False, num_workers=4)
        # num_workers: how many subprocesses to use for data loading.
        # 0 means the data is loaded in the main process (default: 0).

        # Model
        print('\n[Phase 2] : Model setup')
        if args.resume:
            # Load checkpoint
            print('| Resuming from checkpoint...')
            assert os.path.isdir('checkpoint'), 'Error: No checkpoint directory found!'
            _, file_name = getNetwork(args, inputs, outputs)
            checkpoint = torch.load('./checkpoint/' + args.dataset + os.sep +
                                    file_name + args.cv_type + str(cv_idx) + '.t7')
            net = checkpoint['net']
            best_acc = checkpoint['acc']
            start_epoch = checkpoint['epoch']
        else:
            print('| Building net type [' + args.net_type + ']...')
            net, file_name = getNetwork(args, inputs, outputs)

        if use_cuda:
            net.cuda()

        vi = GaussianVariationalInference(torch.nn.CrossEntropyLoss())

        # logfile = os.path.join('diagnostics_Bayes{}_{}.txt'.format(args.net_type, args.dataset))
        logfile_train = os.path.join(
            'diagnostics_Bayes{}_{}_cv{}_train_rand.txt'.format(
                args.net_type, args.dataset, i))
        logfile_test = os.path.join(
            'diagnostics_Bayes{}_{}_cv{}_test_rand.txt'.format(
                args.net_type, args.dataset, i))
        logfile_eval = os.path.join(
            'diagnostics_Bayes{}_{}_cv{}_val_rand.txt'.format(
                args.net_type, args.dataset, i))

        print('\n[Phase 3] : Training model')
        print('| Training Epochs = ' + str(num_epochs))
        print('| Initial Learning Rate = ' + str(args.lr))
        print('| Optimizer = ' + str(optim_type))

        elapsed_time = 0
        train_return = []
        test_return = []
        eval_return = []
        for epoch in range(start_epoch, start_epoch + num_epochs):
            start_time = time.time()

            temp_train_return = train(epoch, trainset, inputs, net, batch_size,
                                      trainloader, resize, num_epochs, use_cuda,
                                      vi, logfile_train)
            temp_eval_return = test(epoch, evalset, inputs, batch_size, evalloader,
                                    net, use_cuda, num_epochs, resize, vi,
                                    logfile_eval, file_name)
            temp_test_return = test(epoch, testset, inputs, batch_size, testloader,
                                    net, use_cuda, num_epochs, resize, vi,
                                    logfile_test, "test")
            train_return = np.append(train_return, temp_train_return)
            eval_return = np.append(eval_return, temp_eval_return)
            test_return = np.append(test_return, temp_test_return)
            print(temp_train_return)
            print(temp_eval_return)
            print(temp_test_return)

            epoch_time = time.time() - start_time
            elapsed_time += epoch_time
            print('| Elapsed time : %d:%02d:%02d' % (cf.get_hms(elapsed_time)))

        print('\n[Phase 4] : Testing model')
        print('* Test results : Acc@1 = %.2f%%' % (best_acc))
        # i was already incremented, so folds are keyed "1"..str(num_cluster)
        results[str(i)] = {
            "train": train_return,
            "test": test_return,
            "eval": eval_return
        }
    print(results)
    return results
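# Possible call site (added for illustration; assumes this module's
# cross_validation signature and the CSV helper used in main() elsewhere
# in the project):
def example_run_cross_validation(args):
    results = cross_validation(config.num_labels, config.num_clusters, args)
    utils_parent.write_results_convnet_to_csv("results_random.csv", results)
    return results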
                        type=bool,
                        help="debug mode has smaller data")
    parser.add_argument('--cv_idx', default=0, type=int, help='index of cv')
    args = parser.parse_args()
    global cv_idx
    cv_idx = 0
    if args.cv_type == "vgmm":
        global result
        # result = {}
        with MyPool(multiprocessing.cpu_count()) as p:
            result = p.map(cross_validation_for_clustered_data_parallel,
                           list(range(config_parent.num_clusters)))
        # result = cross_validation_for_clustered_data(
        #     num_labels=config_parent.num_labels,
        #     num_cluster=config_parent.num_clusters, args=args)
    else:
        X, y = utils_parent.load_mnist('fashion-mnist')
        kf = KFold(n_splits=config_parent.num_clusters)
        global global_rand_idx
        global_rand_idx = {}
        i = 0
        for train_eval_idx, test_idx in kf.split(X, y):
            global_rand_idx[str(i)] = {
                "train_eval_idx": train_eval_idx,
                "test_idx": test_idx
            }
            i += 1  # key the next fold
        with MyPool(multiprocessing.cpu_count()) as p:
            result = p.map(cross_validation_parallel,
                           list(range(config_parent.num_clusters)))
        # result = cross_validation(config_parent.num_labels,
        #                           config_parent.num_clusters, args)
    final_file_prefix = "Bayes_" + args.cv_type + '_' + args.net_type + \
        '_cross_validation_result'
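# Added illustration: argparse's type=bool pitfall. Any non-empty string,
# including "False", parses as True; an action='store_true' flag is the
# usual alternative for the --debug style option above.
def _demo_bool_flag():
    import argparse
    p = argparse.ArgumentParser()
    p.add_argument('--debug', action='store_true',
                   help='debug mode has smaller data')
    print(p.parse_args(['--debug']).debug)  # True; flag omitted -> False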
        transform=transform_train)
    trainset_refactor = refactor_dataset_class.VGMMDataset(
        transform=transform_train)
    trainloader_org = torch.utils.data.DataLoader(trainset_org,
                                                  batch_size=batch_size,
                                                  shuffle=True, num_workers=4)
    trainloader_refactor = torch.utils.data.DataLoader(trainset_refactor,
                                                       batch_size=batch_size,
                                                       shuffle=False,
                                                       num_workers=4)
    # num_workers: how many subprocesses to use for data loading.
    # 0 means the data is loaded in the main process (default: 0).

    import utils_parent
    X, y = utils_parent.load_mnist("fashion-mnist")
    # sanity checks on the raw labels (one-hot) and their argmax form
    print(type(y), y.shape)
    ynew = y.argmax(axis=1)
    print(type(ynew), ynew)

    # inspect one batch from each loader
    for batch_idx, (inputs_value, targets) in enumerate(trainloader_org):
        print(inputs_value)
        print(targets)
        print("......")
        print(targets.dtype)
        print(targets.shape)
        break
    for batch_idx, (inputs_value, targets) in enumerate(trainloader_refactor):