def train_ocgd(epoch_num=10, optim_type='BCGD2', startPoint=None, logdir='test', update_min=True,
               z_dim=128, batchsize=64, loss_name='WGAN', model_name='dc',
               data_path='None', dataname='cifar10', device='cpu', gpu_num=1, collect_info=False):
    lr_d = 0.01
    lr_g = 0.01
    dataset = get_data(dataname=dataname, path='../datas/%s' % data_path)
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim)
    D.to(device)
    G.to(device)
    if startPoint is not None:
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        print('Start from %s' % startPoint)
    optimizer = OCGD(max_params=G.parameters(), min_params=D.parameters(),
                     udpate_min=update_min, device=device)
    loss_list = []
    count = 0
    for e in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((real_x.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            D_loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake)
            optimizer.zero_grad()
            optimizer.step(loss=D_loss)
            if count % 100 == 0:
                print('Iter %d, Loss: %.5f' % (count, D_loss.item()))
            loss_list.append(D_loss.item())
            count += 1
        print('epoch{%d/%d}' % (e, epoch_num))
    name = 'overtrainD.pth' if update_min else 'overtrainG.pth'
    save_checkpoint(path=logdir, name=name, D=D, G=G)
    loss_data = pd.DataFrame(loss_list)
    loss_data.to_csv('logs/train_oneside.csv')
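# Example invocation of train_ocgd (a hypothetical sketch; the checkpoint path, log
# directory and device are assumptions, not part of the original script). With
# update_min=True only the discriminator is updated while the generator stays frozen,
# which matches the 'overtrainD.pth' checkpoint name above.
if __name__ == '__main__':
    train_ocgd(epoch_num=5,
               startPoint='checkpoints/pretrain.pth',  # hypothetical starting checkpoint
               logdir='ocgd_overtrain',
               update_min=True,
               z_dim=128, batchsize=64,
               loss_name='WGAN', model_name='dc',
               data_path='cifar10', dataname='cifar10',
               device='cuda:0')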
def generate(dataname, path, save_path, batch_size=64, device='cpu'):
    dataset = get_data(dataname=dataname, path='../datas/%s' % path)
    real_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    real_set = next(iter(real_loader))
    real_set = real_set[0].to(device)
    if not os.path.exists('figs/%s' % save_path):
        os.makedirs('figs/%s' % save_path)
    vutils.save_image(real_set, 'figs/%s/%s.png' % (save_path, dataname), normalize=True)
def train_cifar(config):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(device)
    # learning_rate = 0.0001
    # batch_size = 64
    # z_dim = 128
    if config['model'] == 'dc':
        D = GoodDiscriminator()
        G = GoodGenerator()
    elif config['model'] == 'ResGAN':
        D = ResNet32Discriminator(n_in=3, num_filters=128, batchnorm=False)
        G = ResNet32Generator(z_dim=config['z_dim'], num_filters=128, batchnorm=True)
    elif config['model'] == 'DCGAN':
        D = DC_discriminator()
        G = DC_generator(z_dim=config['z_dim'])
    dataset = get_data(dataname=config['dataset'], path='../datas/%s' % config['datapath'])
    # dataset = CIFAR10(root='../datas/cifar10', train=True, transform=transform, download=True)
    trainer = WGAN_GP(model_name='dc-wgp', D=D, G=G, device=device, dataset=dataset,
                      z_dim=config['z_dim'], batchsize=config['batchsize'],
                      lr_d=config['lr_d'], lr_g=config['lr_g'],
                      show_iter=config['show_iter'], gp_weight=config['gp_weight'],
                      d_penalty=config['d_penalty'], d_iter=config['d_iter'],
                      noise_shape=(64, config['z_dim']), gpu_num=config['gpu_num'],
                      weight_path=config['weight_path'], startPoint=config['startPoint'])
    trainer.train_epoch(is_flag=config['eval_is'], fid_flag=config['eval_fid'],
                        epoch_num=config['epoch_num'], dirname=config['logdir'],
                        dataname=config['dataset'], gp=True, d_penalty=config['d_penalty'])
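# train_cifar expects a plain dict. The keys below are exactly the ones read inside
# the function; the values are illustrative assumptions only and should be adapted
# to the experiment at hand.
if __name__ == '__main__':
    config = {
        'model': 'dc',            # 'dc', 'ResGAN' or 'DCGAN'
        'z_dim': 128,
        'dataset': 'cifar10',
        'datapath': 'cifar10',    # resolved to ../datas/<datapath>
        'batchsize': 64,
        'lr_d': 1e-4,
        'lr_g': 1e-4,
        'show_iter': 500,
        'gp_weight': 10.0,
        'd_penalty': 0.0,
        'd_iter': 5,
        'gpu_num': 1,
        'weight_path': None,
        'startPoint': None,
        'eval_is': False,
        'eval_fid': False,
        'epoch_num': 100,
        'logdir': 'wgan-gp',
    }
    train_cifar(config)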
def run(config):
    # Get loader
    config['drop_last'] = False
    loader = get_data(dataname=config['dataset'], path=config['data_path'])
    # Load inception net
    net = load_inception_net(parallel=config['parallel'])
    pool, logits = [], []
    device = torch.device('cuda:0')
    print(device)
    for i, (x, y) in enumerate(tqdm(loader)):
        x = x.to(device)
        with torch.no_grad():
            pool_val, logits_val = net(x)
            pool += [np.asarray(pool_val.cpu())]
            logits += [np.asarray(F.softmax(logits_val, 1).cpu())]
    pool, logits = [np.concatenate(item, 0) for item in [pool, logits]]
    # uncomment to save pool, logits, and labels to disk
    # print('Saving pool, logits, and labels to disk...')
    # np.savez(config['dataset']+'_inception_activations.npz',
    #          {'pool': pool, 'logits': logits, 'labels': labels})
    # Calculate inception metrics and report them
    print('Calculating inception metrics...')
    IS_mean, IS_std = calculate_inception_score(logits)
    print('Training data from dataset %s has IS of %5.5f +/- %5.5f'
          % (config['dataset'], IS_mean, IS_std))
    # Prepare mu and sigma, save to disk. Remove "hdf5" by default
    # (the FID code also knows to strip "hdf5")
    print('Calculating means and covariances...')
    mu, sigma = np.mean(pool, axis=0), np.cov(pool, rowvar=False)
    print('Saving calculated means and covariances to disk...')
    np.savez('metrics/stats/' + config['dataset'] + '_inception_moments.npz',
             **{'mu': mu, 'sigma': sigma})
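# The moments written above can later be read back with plain numpy. A small usage
# sketch follows; the file name assumes config['dataset'] == 'cifar10'.
if __name__ == '__main__':
    import numpy as np

    stats = np.load('metrics/stats/cifar10_inception_moments.npz')
    mu, sigma = stats['mu'], stats['sigma']  # mean and covariance of Inception pool features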
def main_Core50(conf, run, close_at_the_end=False): # Prepare configurations files conf['solver_file_first_batch'] = conf['solver_file_first_batch'].replace( 'X', conf['model']) conf['solver_file'] = conf['solver_file'].replace('X', conf['model']) conf['init_weights_file'] = conf['init_weights_file'].replace( 'X', conf['model']) conf['tmp_weights_file'] = conf['tmp_weights_file'].replace( 'X', conf['model']) train_filelists = conf['train_filelists'].replace('RUN_X', run) test_filelist = conf['test_filelist'].replace('RUN_X', run) run_on_the_fly = True # If True, tells the train_utils.get_data(...) script not to cache batch data on disk (Path(conf['exp_path']) / 'CM').mkdir(exist_ok=True, parents=True) (Path(conf['exp_path']) / 'EwC').mkdir(exist_ok=True, parents=True) (Path(conf['exp_path']) / 'Syn').mkdir(exist_ok=True, parents=True) if 'brn_past_weight' not in conf or conf['brn_past_weight'] is None: if conf['rehearsal_is_latent']: conf['brn_past_weight'] = 20000 else: conf['brn_past_weight'] = 10000 # To change if needed the network prototxt if conf['rehearsal_is_latent']: solver_param = caffe_pb2.SolverParameter() with open(conf['solver_file']) as f: txtf.Merge(str(f.read()), solver_param) next_batches_net_prototxt_path = Path(solver_param.net) if not next_batches_net_prototxt_path.stem.endswith('b'): print( 'Error dealing with latent rehearsal: invalid net prototxt name!' ) exit(1) next_batches_net_prototxt_path_orig = next_batches_net_prototxt_path.parent / ( next_batches_net_prototxt_path.stem[:-1] + next_batches_net_prototxt_path.suffix) moving_avg_fraction = 1.0 - (1.0 / conf['brn_past_weight']) train_utils.modify_net_prototxt( str(next_batches_net_prototxt_path_orig), str(next_batches_net_prototxt_path), moving_average_fraction=moving_avg_fraction) if conf['model'] == 'MobileNetV1': rehearsal_layer_mapping_for_mobilenetv1 = { 'data': ([-1, 3, 128, 128], 'conv1'), 'conv2_1/dw': ([-1, 32, 64, 64], 'conv2_1/sep'), #conv2_1 / dw(128, 32, 64, 64) # conv2_1 / sep(128, 64, 64, 64) 'conv2_2/dw': ([-1, 64, 32, 32], 'conv2_2/sep'), #conv2_2 / dw(128, 64, 32, 32) # conv2_2 / sep(128, 128, 32, 32) 'conv3_1/dw': ([-1, 128, 32, 32], 'conv3_1/sep'), #conv3_1 / dw(128, 128, 32, 32) # conv3_1 / sep(128, 128, 32, 32) 'conv3_2/dw': ([-1, 128, 16, 16], 'conv3_2/sep'), #conv3_2 / dw(128, 128, 16, 16) # conv3_2 / sep(128, 256, 16, 16) 'conv4_1/dw': ([-1, 256, 16, 16], 'conv4_1/sep'), #conv4_1 / dw(128, 256, 16, 16) # conv4_1 / sep(128, 256, 16, 16) 'conv4_2/dw': ([-1, 256, 8, 8], 'conv4_2/sep'), #conv4_2 / dw(128, 256, 8, 8) # conv4_2 / sep(128, 512, 8, 8) 'conv5_1/dw': ([-1, 512, 8, 8], 'conv5_1/sep'), #conv5_1 / dw(512, 1, 3, 3) # conv5_1 / sep(512, 512, 1, 1) 'conv5_2/dw': ([-1, 512, 8, 8], 'conv5_2/sep'), #conv5_2 / dw(512, 1, 3, 3) # conv5_2 / sep(512, 512, 1, 1) 'conv5_3/dw': ([-1, 512, 8, 8], 'conv5_3/sep'), #conv5_3 / dw(512, 1, 3, 3) # conv5_3 / sep(512, 512, 1, 1) 'conv5_4/dw': ([-1, 512, 8, 8], 'conv5_4/sep'), #conv5_4 / dw(512, 1, 3, 3) # conv5_4 / sep(512, 512, 1, 1) 'conv5_5/dw': ([-1, 512, 8, 8], 'conv5_5/sep'), #conv5_5 / dw(512, 1, 3, 3) # conv5_5 / sep(512, 512, 1, 1) 'conv5_6/dw': ([-1, 512, 4, 4], 'conv5_6/sep'), #conv5_6 / dw(512, 1, 3, 3) # conv5_6 / sep(1024, 512, 1, 1) 'conv6/dw': ([-1, 1024, 4, 4], 'conv6/sep'), #conv6 / dw(1024, 1, 3, 3) # conv6 / sep(1024, 1024, 1, 1) 'pool6': ([-1, 1024, 1, 1], 'mid_fc7') #avg_pool(1024) # mid_fc7(50, 1024, 1, 1)(50, ) } current_mapping = rehearsal_layer_mapping_for_mobilenetv1[ conf['rehearsal_layer']] if 'rehearsal_stop_layer' not in conf 
or conf[ 'rehearsal_stop_layer'] is None: conf['rehearsal_stop_layer'] = current_mapping[1] rehe_lat_surgery.create_concat_layer_from_net_template( str(next_batches_net_prototxt_path), str(next_batches_net_prototxt_path), conf['rehearsal_layer'], current_mapping[0], current_mapping[1], original_input=21, rehearsal_input=107) else: raise RuntimeError('Unsupported model for latent rehearsal:', conf['model']) # Parse the solver prototxt # for more details see - https://stackoverflow.com/questions/31823898/changing-the-solver-parameters-in-caffe-through-pycaffe if conf['initial_batch'] == 0: print('Solver proto: ', conf['solver_file_first_batch']) solver_param = caffe_pb2.SolverParameter() with open(conf['solver_file_first_batch']) as f: txtf.Merge(str(f.read()), solver_param) net_prototxt = solver_param.net # Obtains the path to the net prototxt print('Net proto: ', net_prototxt) else: print('Solver proto: ', conf['solver_file']) solver_param = caffe_pb2.SolverParameter() with open(conf['solver_file']) as f: txtf.Merge(str(f.read()), solver_param) net_prototxt = solver_param.net # Obtains the path to the net prototxt print('Net proto: ', net_prototxt) # Obtain class labels if conf['class_labels'] != '': # More complex than a simple loadtxt because of the unicode representation in python 3 label_str = np.loadtxt(conf['class_labels'], dtype=bytes, delimiter="\n").astype(str) # Obtain minibatch size from net proto train_minibatch_size, test_minibatch_size = train_utils.extract_minibatch_size_from_prototxt_with_input_layers( net_prototxt) print(' test minibatch size: ', test_minibatch_size) print(' train minibatch size: ', train_minibatch_size) # Load test set print("Recovering Test Set: ", test_filelist, " ...") start = time.time() test_x, test_y = train_utils.get_data(test_filelist, conf['db_path'], conf['exp_path'], on_the_fly=run_on_the_fly, verbose=conf['verbose']) assert (test_x.shape[0] == test_y.shape[0]) if conf['num_classes'] < 50: # Checks if we are doing category-based classification test_y = test_y // 5 test_y = test_y.astype(np.float32) test_patterns = test_x.shape[0] test_x, test_y, test_iterat = train_utils.pad_data(test_x, test_y, test_minibatch_size) print(' -> %d patterns of %d classes (%.2f sec.)' % (test_patterns, len(np.unique(test_y)), time.time() - start)) print(' -> %.2f -> %d iterations for full evaluation' % (test_patterns / test_minibatch_size, test_iterat)) # Load training patterns in batches (by now assume the same number in all batches) batch_count = conf['num_batches'] train_patterns = train_utils.count_lines_in_batches( batch_count, train_filelists) train_iterations_per_epoch = np.zeros(batch_count, int) train_iterations = np.zeros(batch_count, int) test_interval_epochs = conf['test_interval_epochs'] test_interval = np.zeros(batch_count, float) for batch in range(batch_count): if conf["rehearsal"] and batch > 0: train_patterns[batch] += conf["rehearsal_memory"] train_iterations_per_epoch[batch] = int( np.ceil(train_patterns[batch] / train_minibatch_size)) test_interval[ batch] = test_interval_epochs * train_iterations_per_epoch[batch] if (batch == 0): train_iterations[batch] = train_iterations_per_epoch[batch] * conf[ 'num_epochs_first_batch'] else: train_iterations[ batch] = train_iterations_per_epoch[batch] * conf['num_epochs'] print("Batch %2d: %d patterns, %d iterations (%d iter. 
per epochs - test every %.1f iter.)" \ % (batch, train_patterns[batch], train_iterations[batch], train_iterations_per_epoch[batch], test_interval[batch])) # Create evaluation points # -> iterations which are boundaries of batches batch_iter = [0] iter = 0 for batch in range(batch_count): iter += train_iterations[batch] batch_iter.append(iter) # Calculates the iterations where the network will be evaluated eval_iters = [ 1 ] # Start with 1 (instead of 0) because the test net is aligned to the train one after solver.step(1) for batch in range(batch_count): start = batch_iter[batch] end = batch_iter[batch + 1] start += test_interval[batch] while start < end: eval_iters.append(int(start)) start += test_interval[batch] eval_iters.append(end) # Iterations which are epochs in the evaluation range epochs_iter = [] for batch in range(batch_count): start = batch_iter[batch] end = batch_iter[batch + 1] start += train_iterations_per_epoch[batch] while start <= end: epochs_iter.append(int(start)) start += train_iterations_per_epoch[batch] prev_train_loss = np.zeros(len(eval_iters)) prev_test_acc = np.zeros(len(eval_iters)) prev_train_acc = np.zeros(len(eval_iters)) prev_exist = filelog.TryLoadPrevTrainingLog(conf['train_log_file'], prev_train_loss, prev_test_acc, prev_train_acc) train_loss = np.copy( prev_train_loss ) # Copying allows to correctly visualize the graph in case we start from initial_batch > 0 test_acc = np.copy(prev_test_acc) train_acc = np.copy(prev_train_acc) epochs_tick = False if batch_count > 30 else True # For better visualization visualization.Plot_Incremental_Training_Init('Incremental Training', eval_iters, epochs_iter, batch_iter, train_loss, test_acc, 5, conf['accuracy_max'], prev_exist, prev_train_loss, prev_test_acc, show_epochs_tick=epochs_tick) filelog.Train_Log_Init(conf['train_log_file']) filelog.Train_LogDetails_Init(conf['train_log_file']) start_train = time.time() eval_idx = 0 # Evaluation iterations counter global_eval_iter = 0 # Global iterations counter first_round = True initial_batch = conf['initial_batch'] if initial_batch > 0: # Move forward by skipping unnecessary evaluation global_eval_iter = batch_iter[initial_batch] while eval_iters[eval_idx] < global_eval_iter: eval_idx += 1 eval_idx += 1 for batch in range(initial_batch, batch_count): print( '\nBATCH = {:2d} ----------------------------------------------------' .format(batch)) if batch == 0: solver = caffe.get_solver( conf['solver_file_first_batch'] ) # Load the solver for the first batch and create net(s) if conf['init_weights_file'] != '': solver.net.copy_from(conf['init_weights_file']) print('Network created and Weights loaded from: ', conf['init_weights_file']) # Test solver.share_weights(solver.test_nets[0]) print('Weights shared with Test Net') accuracy, _, pred_y = train_utils.test_network_with_accuracy_layer( solver, test_x, test_y, test_iterat, test_minibatch_size, prediction_level_Model[conf['model']], return_prediction=True) if conf['strategy'] in ['cwr+', 'ar1', 'ar1free']: cwr.zeros_cwr_layer_bias_lr(solver.net, cwr_layers_Model[conf['model']]) class_updates = np.full(conf['num_classes'], conf['initial_class_updates_value'], dtype=np.float32) cons_w = cwr.init_consolidated_weights( solver.net, cwr_layers_Model[conf['model']], conf['num_classes'] ) # allocate space for consolidated weights and initialze to 0 cwr.reset_weights( solver.net, cwr_layers_Model[conf['model']], conf['num_classes'] ) # reset weights to 0 (done here for the first batch to keep initial stats correct) # 
cwr.reset_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) # reset weights to 0 (done here for the first batch to keep initial stats correct) if conf['strategy'] in ['ar1', 'ar1free']: ewcData, synData = syn.create_syn_data( solver.net ) # ewcData stores optimal weights + normalized fisher; trajectory store unnormalized summed grad*deltaW if conf['rehearsal_is_latent']: reha_data_size = solver.net.blobs[ conf['rehearsal_layer']].data[0].size rehearsal.allocate_memory(conf['rehearsal_memory'], reha_data_size, 1) else: rehearsal.allocate_memory(conf['rehearsal_memory'], test_x[0].size, 1) elif batch == 1: solver = caffe.get_solver( conf['solver_file']) # load solver and create net if first_round: solver.net.copy_from(conf['init_weights_file']) print('Network created and Weights loaded from: ', conf['init_weights_file']) else: solver.net.copy_from(conf['tmp_weights_file']) print('Network created and Weights loaded from: ', conf['tmp_weights_file']) solver.share_weights(solver.test_nets[0]) if first_round: print('Loading consolidated weights...') class_updates = np.full(conf['num_classes'], conf['initial_class_updates_value'], dtype=np.float32) rand_w, cons_w = cwr.copy_initial_weights( solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) if conf['strategy'] in ['ar1']: ewcData, synData = syn.create_syn_data( solver.net ) # ewcData stores optimal weights + normalized fisher; trajectory store unnormalized summed grad*deltaW if conf['strategy'] in ['cwr+']: cwr.zeros_non_cwr_layers_lr( solver.net, cwr_layers_Model[conf['model']]) # blocca livelli sotto if conf['strategy'] in ['cwr+', 'ar1', 'ar1free']: if 'cwr_lr_mult' in conf.keys() and conf['cwr_lr_mult'] != 1: cwr.zeros_cwr_layer_bias_lr( solver.net, cwr_layers_Model[conf['model']], force_weights_lr_mult=conf['cwr_lr_mult']) else: cwr.zeros_cwr_layer_bias_lr( solver.net, cwr_layers_Model[conf['model']]) cwr.set_brn_past_weight(solver.net, conf['brn_past_weight']) # Initializes some data structures used for reporting stats. 
Executed once (in the first round) if first_round: if batch == 1 and (conf['strategy'] in ['cwr', 'cwr+', 'ar1', 'ar1free']): print('Cannot start from batch 1 in ', conf['strategy'], ' strategy!') sys.exit(0) visualization.PrintNetworkArchitecture(solver.net) # If accuracy layer is defined in the prototxt also in TRAIN mode -> log train accuracy too (not in the plot) try: report_train_accuracy = True err = solver.net.blobs[ 'accuracy'].num # Assume this is stable for prototxt of successive batches except: report_train_accuracy = False first_round = False if conf['compute_param_stats']: param_change = {} param_stats = train_utils.stats_initialize_param(solver.net) # nonzero_activations = train_utils.stats_activations_initialize(solver.net) # Load training data for the current batch # Note that the file lists are provided in the batch_filelists folder current_train_filelist = train_filelists.replace( 'XX', str(batch).zfill(2)) print("Recovering training data: ", current_train_filelist, " ...") batch_x, batch_y = train_utils.get_data(current_train_filelist, conf['db_path'], conf['exp_path'], on_the_fly=run_on_the_fly, verbose=conf['verbose']) print("Done.") if conf['num_classes'] < 50: # Category based classification batch_y = batch_y // 5 batch_t = train_utils.compute_one_hot_vectors(batch_y, conf['num_classes']) # Load patterns from Rehearsal Memory rehe_x, rehe_y = rehearsal.get_samples() rehe_t = train_utils.compute_one_hot_vectors(rehe_y, conf['num_classes']) # Detects how many patterns per class are present in the current batch if batch == 0: classes_in_cur_train = batch_y.astype(np.int) else: classes_in_cur_train = np.concatenate( (batch_y.astype(np.int), rehe_y.astype(np.int))) unique_y, y_freq = np.unique(classes_in_cur_train, return_counts=True) if conf['strategy'] in ['cwr+', 'ar1', 'ar1free' ] and batch > initial_batch: cwr.reset_weights( solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) # Reset weights of CWR layers to 0 # Loads previously consolidated weights # This procedure, explained in Fine-Grained Continual Learning (https://arxiv.org/pdf/1907.03799.pdf), # is necessary in the NIC scenario if 'cwr_nic_load_weight' in conf.keys( ) and conf['cwr_nic_load_weight']: cwr.load_weights_nic(solver.net, cwr_layers_Model[conf['model']], unique_y, cons_w) if conf['strategy'] in ['ar1'] and batch > initial_batch: syn.weight_stats(solver.net, batch, ewcData, conf['ewc_clip_to']) solver.net.blobs['ewc'].data[...] 
= ewcData # Convert labels to float32 batch_y = batch_y.astype(np.float32) assert (batch_x.shape[0] == batch_y.shape[0]) rehe_y = rehe_y.astype(np.float32) avg_train_loss = 0 avg_train_accuracy = 0 avg_count = 0 if conf['strategy'] in ['syn', 'ar1']: syn.init_batch(solver.net, ewcData, synData) reharshal_size = conf[ "rehearsal_memory"] if batch > initial_batch else 0 orig_in_minibatch = np.round( train_minibatch_size * batch_x.shape[0] / (batch_x.shape[0] + reharshal_size)).astype(np.int) reha_in_minibatch = train_minibatch_size - orig_in_minibatch print(' -> Current Batch: %d patterns, External Memory: %d patterns' % (batch_x.shape[0], reharshal_size)) print( ' -> per minibatch (size %d): %d from current batch and %d from external memory' % (train_minibatch_size, orig_in_minibatch, reha_in_minibatch)) # Padding and shuffling batch_x, orig_iters_per_epoch = train_utils.pad_data_single( batch_x, orig_in_minibatch) batch_y, _ = train_utils.pad_data_single(batch_y, orig_in_minibatch) batch_t, _ = train_utils.pad_data_single(batch_t, orig_in_minibatch) batch_x, batch_y, batch_t = train_utils.shuffle_in_unison( (batch_x, batch_y, batch_t), 0) if conf['rehearsal_is_latent']: req_shape = (batch_x.shape[0], ) + solver.net.blobs[ conf['rehearsal_layer']].data.shape[1:] latent_batch_x = np.zeros(req_shape, dtype=np.float32) # Padding and shuffling of rehasal patterns reha_iters_per_epoch = 0 if reharshal_size > 0: rehe_x, reha_iters_per_epoch = train_utils.pad_data_single( rehe_x, reha_in_minibatch) rehe_y, _ = train_utils.pad_data_single(rehe_y, reha_in_minibatch) rehe_t, _ = train_utils.pad_data_single(rehe_t, reha_in_minibatch) rehe_x, rehe_y, rehe_t = train_utils.shuffle_in_unison( (rehe_x, rehe_y, rehe_t), 0) # shuffle print( ' -> iterations per epoch (with padding): %d, %d (initial %d)' % (orig_iters_per_epoch, reha_iters_per_epoch, train_iterations_per_epoch[batch])) # The main solver loop (per batch) it = 0 while it < train_iterations[batch]: # The following part is pretty much straight-forward # The current batch is split in minibatches (which size was previously detected by looking at the net prototxt) # The minibatch is loaded in blobs 'data', 'data_reha', 'label' and 'target' it_mod_orig = it % orig_iters_per_epoch orig_start = it_mod_orig * orig_in_minibatch orig_end = (it_mod_orig + 1) * orig_in_minibatch if conf['rehearsal_is_latent']: solver.net.blobs['data'].data[ ...] = batch_x[orig_start:orig_end] else: solver.net.blobs['data'].data[:orig_in_minibatch] = batch_x[ orig_start:orig_end] # Provide data to input layers (new patterns) solver.net.blobs['label'].data[:orig_in_minibatch] = batch_y[ orig_start:orig_end] solver.net.blobs['target'].data[:orig_in_minibatch] = batch_t[ orig_start:orig_end] # Provide data to input layers (reharsal patterns) if reharshal_size > 0: it_mod_reha = it % reha_iters_per_epoch reha_start = it_mod_reha * reha_in_minibatch reha_end = (it_mod_reha + 1) * reha_in_minibatch if conf['rehearsal_is_latent']: solver.net.blobs['data_reha'].data[ ...] 
= rehe_x[reha_start:reha_end] else: solver.net.blobs['data'].data[orig_in_minibatch:] = rehe_x[ reha_start:reha_end] solver.net.blobs['label'].data[orig_in_minibatch:] = rehe_y[ reha_start:reha_end] solver.net.blobs['target'].data[orig_in_minibatch:] = rehe_t[ reha_start:reha_end] if conf['strategy'] in ['ar1']: syn.pre_update(solver.net, ewcData, synData) # Explicit (net.step(1)) solver.net.clear_param_diffs() solver.net.forward() # start=None, end=None if batch > 0 and conf['strategy'] in ['cwr+', 'cwr']: solver.net.backward( end='mid_fc7' ) # In CWR+ we stop the backward step at the CWR layer else: if batch > 0 and 'rehearsal_stop_layer' in conf.keys( ) and conf['rehearsal_stop_layer'] is not None: # When using latent replay we stop the backward step at the latent rehearsal layer solver.net.backward(end=conf['rehearsal_stop_layer']) else: solver.net.backward() if conf['rehearsal_is_latent']: # Save latent features of new patterns (only during the first epoch) if batch > 0 and it < orig_iters_per_epoch: latent_batch_x[orig_start:orig_end] = solver.net.blobs[ conf['rehearsal_layer']].data # Weights update solver.apply_update() if conf['strategy'] == 'ar1': syn.post_update(solver.net, ewcData, synData, cwr_layers_Model[conf['model']]) print('+', end='', flush=True) global_eval_iter += 1 avg_count += 1 avg_train_loss += solver.net.blobs['loss'].data if report_train_accuracy: avg_train_accuracy += solver.net.blobs['accuracy'].data if global_eval_iter == eval_iters[eval_idx]: # Evaluation point if avg_count > 0: avg_train_loss /= avg_count avg_train_accuracy /= avg_count train_loss[eval_idx] = avg_train_loss print('\nIter {:>4}'.format(it + 1), '({:>4})'.format(global_eval_iter), ': Train Loss = {:.5f}'.format(avg_train_loss), end='', flush=True) if report_train_accuracy: train_acc[eval_idx] = avg_train_accuracy print(' Train Accuracy = {:.5f}%'.format( avg_train_accuracy * 100), end='', flush=True) compute_confusion_matrix = True if ( conf['confusion_matrix'] and it == train_iterations[batch] - 1) else False # last batch iter # The following lines are executed only if this is the last iteration for the current batch if conf['strategy'] in [ 'cwr+', 'ar1', 'ar1free' ] and it == train_iterations[batch] - 1: cwr.consolidate_weights_cwr_plus( solver.net, cwr_layers_Model[conf['model']], unique_y, y_freq, class_updates, cons_w) class_updates[unique_y] += y_freq print(class_updates) cwr.load_weights( solver.net, cwr_layers_Model[conf['model']], conf['num_classes'], cons_w) # Load consolidated weights for testing accuracy, _, pred_y = train_utils.test_network_with_accuracy_layer( solver, test_x, test_y, test_iterat, test_minibatch_size, prediction_level_Model[conf['model']], return_prediction=compute_confusion_matrix) test_acc[eval_idx] = accuracy * 100 print(' Test Accuracy = {:.5f}%'.format(accuracy * 100)) # Batch(Re)Norm Stats train_utils.print_bn_stats(solver.net) visualization.Plot_Incremental_Training_Update( eval_idx, eval_iters, train_loss, test_acc) filelog.Train_Log_Update(conf['train_log_file'], eval_iters[eval_idx], accuracy, avg_train_loss, report_train_accuracy, avg_train_accuracy) avg_train_loss = 0 avg_train_accuracy = 0 avg_count = 0 eval_idx += 1 # Next eval it += 1 # Next iter # Current batch training concluded if conf['strategy'] in ['ar1']: syn.update_ewc_data(solver.net, ewcData, synData, batch, conf['ewc_clip_to'], c=conf['ewc_w']) if conf['save_ewc_histograms']: visualization.EwcHistograms(ewcData, 100, save_as=conf['exp_path'] + 'Syn/F_' + str(batch) + '.png') if 
conf['rehearsal_is_latent']: if batch == 0: reha_it = 0 while reha_it < orig_iters_per_epoch: orig_start = reha_it * orig_in_minibatch orig_end = (reha_it + 1) * orig_in_minibatch solver.net.blobs['data'].data[ ...] = batch_x[orig_start:orig_end] solver.net.forward() latent_batch_x[orig_start:orig_end] = solver.net.blobs[ conf['rehearsal_layer']].data reha_it += 1 rehearsal.update_memory(latent_batch_x, batch_y.astype(np.int), batch) else: rehearsal.update_memory(batch_x, batch_y.astype(np.int), batch) if compute_confusion_matrix: # Computes the confusion matrix and logs + plots it cnf_matrix = confusion_matrix(test_y, pred_y, range(conf['num_classes'])) if batch == 0: prev_class_accuracies = np.zeros(conf['num_classes']) else: prev_class_accuracies = current_class_accuracies current_class_accuracies = np.diagonal( cnf_matrix) / cnf_matrix.sum(axis=1) deltas = current_class_accuracies - prev_class_accuracies classes_in_batch = set(batch_y.astype(np.int)) classes_non_in_batch = set(range( conf['num_classes'])) - classes_in_batch mean_class_in_batch = np.mean(deltas[list(classes_in_batch)]) std_class_in_batch = np.std(deltas[list(classes_in_batch)]) mean_class_non_in_batch = np.mean( deltas[list(classes_non_in_batch)]) std_class_non_in_batch = np.std(deltas[list(classes_non_in_batch)]) print( 'InBatch -> mean = %.2f%% std = %.2f%%, OutBatch -> mean = %.2f%% std = %.2f%%' % (mean_class_in_batch * 100, std_class_in_batch * 100, mean_class_non_in_batch * 100, std_class_non_in_batch * 100)) filelog.Train_LogDetails_Update(conf['train_log_file'], batch, mean_class_in_batch, std_class_in_batch, mean_class_non_in_batch, std_class_non_in_batch) visualization.plot_confusion_matrix( cnf_matrix, normalize=True, title='CM after batch: ' + str(batch), save_as=conf['exp_path'] + 'CM/CM_' + str(batch) + '.png') if conf['compute_param_stats']: train_utils.stats_compute_param_change_and_update_prev( solver.net, param_stats, batch, param_change) if batch == 0: solver.net.save(conf['tmp_weights_file']) print('Weights saved to: ', conf['tmp_weights_file']) del solver print('Training Time: %.2f sec' % (time.time() - start_train)) if conf['compute_param_stats']: stats_normalization = True train_utils.stats_normalize(solver.net, param_stats, batch_count, param_change, stats_normalization) visualization.Plot3d_param_stats(solver.net, param_change, batch_count, stats_normalization) filelog.Train_Log_End(conf['train_log_file']) filelog.Train_LogDetails_End(conf['train_log_file']) visualization.Plot_Incremental_Training_End(close=close_at_the_end)
        callbacks=[], max_steps=0)
    trainer.test(model=model, datamodule=datamodule)
    bt_test_preds = model.preds[0]
    bt_test_labels = model.labels[0]
    if last_one_only:
        bt_test_df = bt_test_df.groupby('user_id').last()
    bt_test_df['model_pred'] = bt_test_preds.cpu()
else:
    if args.model == 'sakt_legacy' or (args.model == 'dkt' and args.dataset == 'ednet'):
        bt_test_data, _ = get_chunked_data(bt_test_df, max_length=500,
                                           train_split=1.0, stride=1)
    else:
        bt_test_data, _ = get_data(bt_test_df, train_split=1.0, randomize=False,
                                   model_name=args.model)
    bt_test_batch = prepare_batches(bt_test_data, 128, False)
    bt_test_preds = eval_batches(model, bt_test_batch, 'cuda', model_name=args.model)
    bt_test_df['model_pred'] = bt_test_preds
    if last_one_only:
        bt_test_df = bt_test_df.groupby('user_id').last()

# 4. CHECK PASS CONDITION AND RUN CASE-SPECIFIC ANALYSIS.
test_funcs = {
    'repetition': test_repeated_feed,
    'insertion':
def train_mnist(epoch_num=10, show_iter=100, logdir='test', model_weight=None,
                load_d=False, load_g=False, compare_path=None, info_time=100,
                run_select=None, device='cpu'):
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('MNIST, discriminator lr: %.3f, generator lr: %.3f' % (lr_d, lr_g))
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    if compare_path is not None:
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat([p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    if run_select is not None:
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' % (logdir, current_time, lr_d))
    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)
    timer = time.time()
    count = 0
    fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            fake_x_c = fake_x.clone().detach()
            # update generator
            d_fake = D(fake_x)
            writer.add_scalars('Discriminator output',
                               {'Generated image': d_fake.mean().item(),
                                'Real image': d_real.mean().item()},
                               global_step=count)
            G_loss = get_loss(name='JSD', g_loss=True, d_fake=d_fake)
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()
            gg = torch.norm(torch.cat([p.grad.contiguous().view(-1) for p in G.parameters()]), p=2)
            d_fake_c = D(fake_x_c)
            D_loss = get_loss(name='JSD', g_loss=False, d_real=d_real, d_fake=d_fake_c)
            if compare_path is not None and count % info_time == 0:
                diff = get_diff(net=D, model_vec=model_vec)
                writer.add_scalar('Distance from checkpoint', diff.item(), global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        writer.add_scalars('L2 norm of pred difference',
                                           {'Total': diff.item(),
                                            'real set': diff_real.item(),
                                            'fake set': diff_fake.item()},
                                           global_step=count)
            d_optimizer.zero_grad()
            D_loss.backward()
            d_optimizer.step()
            gd = torch.norm(torch.cat([p.grad.contiguous().view(-1) for p in D.parameters()]), p=2)
            writer.add_scalars('Loss', {'D_loss': D_loss.item(),
                                        'G_loss': G_loss.item()}, global_step=count)
            writer.add_scalars('Grad', {'D grad': gd.item(),
                                        'G grad': gg.item()}, global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs'
                      % (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s/' % logdir
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img, path + 'iter_%d.png' % count, normalize=True)
                save_checkpoint(path=logdir, name='SGD-%.3f_%d.pth' % (lr_d, count), D=D, G=G)
            count += 1
    writer.close()
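# get_diff is imported from elsewhere in the repo and is not shown here. The sketch
# below (named get_diff_sketch to make clear it is an assumption, not the actual
# implementation) illustrates the behaviour the calls above rely on: the L2 distance
# between the discriminator's flattened parameters and the reference vector built
# from the checkpoint loaded via compare_path.
def get_diff_sketch(net, model_vec):
    # Flatten all parameters of `net` into a single vector and measure how far
    # the network has drifted from the stored reference vector.
    current_vec = torch.cat([p.contiguous().view(-1) for p in net.parameters()])
    return torch.norm(current_vec - model_vec, p=2)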
def train_cgd(epoch_num=10, milestone=None, optim_type='ACGD', startPoint=None, start_n=0,
              z_dim=128, batchsize=64, tols={'tol': 1e-10, 'atol': 1e-16},
              l2_penalty=0.0, momentum=0.0, loss_name='WGAN', model_name='dc',
              data_path='None', show_iter=100, logdir='test', dataname='cifar10',
              device='cpu', gpu_num=1, collect_info=False):
    lr_d = 0.01
    lr_g = 0.01
    dataset = get_data(dataname=dataname, path='../datas/%s' % data_path)
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' % (logdir, current_time, lr_d))
    if optim_type == 'BCGD':
        optimizer = BCGD(max_params=G.parameters(), min_params=D.parameters(),
                         lr_max=lr_g, lr_min=lr_d, momentum=momentum,
                         tol=tols['tol'], atol=tols['atol'], device=device)
        scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    elif optim_type == 'ICR':
        optimizer = ICR(max_params=G.parameters(), min_params=D.parameters(),
                        lr=lr_d, alpha=1.0, device=device)
        scheduler = icrScheduler(optimizer, milestone)
    elif optim_type == 'ACGD':
        optimizer = ACGD(max_params=G.parameters(), min_params=D.parameters(),
                         lr_max=lr_g, lr_min=lr_d, tol=tols['tol'], atol=tols['atol'],
                         device=device, solver='cg')
        scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone)
    if startPoint is not None:
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        optimizer.load_state_dict(chk['optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0
    if model_name == 'DCGAN' or model_name == 'DCGAN-WBN':
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        scheduler.step(epoch=e)
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if model_name == 'DCGAN' or model_name == 'DCGAN-WBN':
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake,
                            l2_weight=l2_penalty, D=D)
            optimizer.zero_grad()
            optimizer.step(loss)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , Loss: %.5f, time: %.3fs' % (count, loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img, path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s%.3f_%d.pth'
                                     % (optim_type, model_name, lr_g, count + start_n),
                                D=D, G=G, optimizer=optimizer)
            writer.add_scalars('Discriminator output',
                               {'Generated image': d_fake.mean().item(),
                                'Real image': d_real.mean().item()},
                               global_step=count)
            writer.add_scalar('Loss', loss.item(), global_step=count)
            if collect_info:
                cgd_info = optimizer.get_info()
                writer.add_scalar('Conjugate Gradient/iter num', cgd_info['iter_num'],
                                  global_step=count)
                writer.add_scalar('Conjugate Gradient/running time', cgd_info['time'],
                                  global_step=count)
                writer.add_scalars('Delta', {'D gradient': cgd_info['grad_y'],
                                             'G gradient': cgd_info['grad_x'],
                                             'D hvp': cgd_info['hvp_y'],
                                             'G hvp': cgd_info['hvp_x'],
                                             'D cg': cgd_info['cg_y'],
                                             'G cg': cgd_info['cg_x']},
                                   global_step=count)
            count += 1
    writer.close()
def train(epoch_num=10, milestone=None, optim_type='Adam', momentum=0.5, lr_d=1e-4, lr_g=1e-4,
          startPoint=None, start_n=0, z_dim=128, batchsize=64, loss_name='WGAN',
          model_name='dc', model_config=None, data_path='None', show_iter=100,
          logdir='test', dataname='cifar10', device='cpu', gpu_num=1, saturating=False):
    dataset = get_data(dataname=dataname, path=data_path)
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config)
    D.apply(weights_init_d).to(device)
    G.apply(weights_init_g).to(device)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    # writer = SummaryWriter(log_dir='logs/%s/%s' % (logdir, current_time))
    d_optimizer = Adam(D.parameters(), lr=lr_d, betas=(momentum, 0.99))
    g_optimizer = Adam(G.parameters(), lr=lr_g, betas=(momentum, 0.99))
    if startPoint is not None:
        chk = torch.load(startPoint)
        D.load_state_dict(chk['D'])
        G.load_state_dict(chk['G'])
        d_optimizer.load_state_dict(chk['d_optim'])
        g_optimizer.load_state_dict(chk['g_optim'])
        print('Start from %s' % startPoint)
    if gpu_num > 1:
        D = nn.DataParallel(D, list(range(gpu_num)))
        G = nn.DataParallel(G, list(range(gpu_num)))
    timer = time.time()
    count = 0
    if 'DCGAN' in model_name:
        fixed_noise = torch.randn((64, z_dim, 1, 1), device=device)
    else:
        fixed_noise = torch.randn((64, z_dim), device=device)
    for e in range(epoch_num):
        print('======Epoch: %d / %d======' % (e, epoch_num))
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            if 'DCGAN' in model_name:
                z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
            else:
                z = torch.randn((d_real.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            d_loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake)
            d_optimizer.zero_grad()
            g_optimizer.zero_grad()
            d_loss.backward()
            d_optimizer.step()
            if not saturating:
                if 'DCGAN' in model_name:
                    z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device)
                else:
                    z = torch.randn((d_real.shape[0], z_dim), device=device)
                fake_x = G(z)
                d_fake = D(fake_x)
                g_loss = get_loss(name=loss_name, g_loss=True, d_fake=d_fake)
                g_optimizer.zero_grad()
                g_loss.backward()
            else:
                g_loss = d_loss
            g_optimizer.step()
            # writer.add_scalar('Loss/D loss', d_loss.item(), count)
            # writer.add_scalar('Loss/G loss', g_loss.item(), count)
            # writer.add_scalars('Discriminator output', {'Generated image': d_fake.mean().item(),
            #                                             'Real image': d_real.mean().item()},
            #                    global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter %d, D Loss: %.5f, G loss: %.5f, time: %.2f s'
                      % (count, d_loss.item(), g_loss.item(), time_cost))
                timer = time.time()
                with torch.no_grad():
                    fake_img = G(fixed_noise).detach()
                    path = 'figs/%s_%s/' % (dataname, logdir)
                    if not os.path.exists(path):
                        os.makedirs(path)
                    vutils.save_image(fake_img, path + 'iter_%d.png' % (count + start_n),
                                      normalize=True)
                save_checkpoint(path=logdir,
                                name='%s-%s_%d.pth' % (optim_type, model_name, count + start_n),
                                D=D, G=G, optimizer=d_optimizer, g_optimizer=g_optimizer)
            count += 1
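# get_loss is shared by all of the training loops above but defined elsewhere in the
# repo. The stand-in below (get_loss_sketch, an assumption rather than the repo's
# implementation) illustrates the two loss families these scripts request: 'WGAN'
# (critic scores used directly) and 'JSD' (the standard GAN loss on logits, with the
# non-saturating generator objective when g_loss=True). The optional l2_weight/D
# arguments used by train_cgd are omitted for brevity.
import torch
import torch.nn.functional as F

def get_loss_sketch(name, g_loss, d_real=None, d_fake=None):
    if name == 'WGAN':
        # Wasserstein objective: the critic maximises d_real - d_fake,
        # so the minimised discriminator loss is d_fake - d_real.
        return -d_fake.mean() if g_loss else d_fake.mean() - d_real.mean()
    if name == 'JSD':
        if g_loss:
            # Non-saturating generator loss: push D(G(z)) towards the "real" label.
            return F.binary_cross_entropy_with_logits(d_fake, torch.ones_like(d_fake))
        return (F.binary_cross_entropy_with_logits(d_real, torch.ones_like(d_real)) +
                F.binary_cross_entropy_with_logits(d_fake, torch.zeros_like(d_fake)))
    raise ValueError('unknown loss: %s' % name)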
def train_d(epoch_num=10, logdir='test', optim='SGD', loss_name='JSD', show_iter=500,
            model_weight=None, load_d=False, load_g=False, compare_path=None,
            info_time=100, run_select=None, device='cpu'):
    lr_d = 0.001
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('discriminator lr: %.3f' % lr_d)
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    if compare_path is not None:
        discriminator = dc_D().to(device)
        model_weight = torch.load(compare_path)
        discriminator.load_state_dict(model_weight['D'])
        model_vec = torch.cat([p.contiguous().view(-1) for p in discriminator.parameters()])
        print('Load discriminator from %s' % compare_path)
    if run_select is not None:
        fixed_data = torch.load(run_select)
        real_set = fixed_data['real_set']
        fake_set = fixed_data['fake_set']
        real_d = fixed_data['real_d']
        fake_d = fixed_data['fake_d']
        fixed_vec = fixed_data['pred_vec']
        print('load fixed data set')
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    # writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' % (logdir, current_time, lr_d))
    if optim == 'SGD':
        d_optimizer = SGD(D.parameters(), lr=lr_d)
        print('Optimizer SGD')
    else:
        d_optimizer = BCGD2(max_params=G.parameters(), min_params=D.parameters(),
                            lr_max=lr_g, lr_min=lr_d, update_max=False,
                            device=device, collect_info=True)
        print('Optimizer BCGD2')
    timer = time.time()
    count = 0
    d_losses = []
    g_losses = []
    for e in range(epoch_num):
        tol_correct = 0
        tol_dloss = 0
        tol_gloss = 0
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((real_x.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            D_loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake)
            tol_dloss += D_loss.item() * real_x.shape[0]
            G_loss = get_loss(name=loss_name, g_loss=True, d_real=d_real, d_fake=d_fake)
            tol_gloss += G_loss.item() * fake_x.shape[0]
            if compare_path is not None and count % info_time == 0:
                diff = get_diff(net=D, model_vec=model_vec)
                # writer.add_scalar('Distance from checkpoint', diff.item(), global_step=count)
                if run_select is not None:
                    with torch.no_grad():
                        d_real_set = D(real_set)
                        d_fake_set = D(fake_set)
                        diff_real = torch.norm(d_real_set - real_d, p=2)
                        diff_fake = torch.norm(d_fake_set - fake_d, p=2)
                        d_vec = torch.cat([d_real_set, d_fake_set])
                        diff = torch.norm(d_vec.sub_(fixed_vec), p=2)
                        # writer.add_scalars('L2 norm of pred difference',
                        #                    {'Total': diff.item(),
                        #                     'real set': diff_real.item(),
                        #                     'fake set': diff_fake.item()},
                        #                    global_step=count)
            d_optimizer.zero_grad()
            if optim == 'SGD':
                D_loss.backward()
                d_optimizer.step()
                gd = torch.norm(torch.cat([p.grad.contiguous().view(-1) for p in D.parameters()]), p=2)
                gg = torch.norm(torch.cat([p.grad.contiguous().view(-1) for p in G.parameters()]), p=2)
            else:
                d_optimizer.step(D_loss)
                cgdInfo = d_optimizer.get_info()
                gd = cgdInfo['grad_y']
                gg = cgdInfo['grad_x']
                # writer.add_scalars('Grad', {'update': cgdInfo['update']}, global_step=count)
            tol_correct += (d_real > 0).sum().item() + (d_fake < 0).sum().item()
            # writer.add_scalars('Loss', {'D_loss': D_loss.item(),
            #                             'G_loss': G_loss.item()}, global_step=count)
            # writer.add_scalars('Grad', {'D grad': gd,
            #                             'G grad': gg}, global_step=count)
            # writer.add_scalars('Discriminator output', {'Generated image': d_fake.mean().item(),
            #                                             'Real image': d_real.mean().item()},
            #                    global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs'
                      % (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                save_checkpoint(path=logdir, name='FixG-%.3f_%d.pth' % (lr_d, count), D=D, G=G)
            count += 1
def train_g(epoch_num=10, logdir='test', loss_name='JSD', show_iter=500,
            model_weight=None, load_d=False, load_g=False, device='cpu'):
    lr_d = 0.01
    lr_g = 0.01
    batchsize = 128
    z_dim = 96
    print('MNIST, discriminator lr: %.3f' % lr_d)
    dataset = get_data(dataname='MNIST', path='../datas/mnist')
    dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4)
    D = dc_D().to(device)
    G = dc_G(z_dim=z_dim).to(device)
    D.apply(weights_init_d)
    G.apply(weights_init_g)
    if model_weight is not None:
        chk = torch.load(model_weight)
        if load_d:
            D.load_state_dict(chk['D'])
            print('Load D from %s' % model_weight)
        if load_g:
            G.load_state_dict(chk['G'])
            print('Load G from %s' % model_weight)
    from datetime import datetime
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    # writer = SummaryWriter(log_dir='logs/%s/%s_%.3f' % (logdir, current_time, lr_g))
    d_optimizer = SGD(D.parameters(), lr=lr_d)
    g_optimizer = SGD(G.parameters(), lr=lr_g)
    timer = time.time()
    count = 0
    for e in range(epoch_num):
        for real_x in dataloader:
            real_x = real_x[0].to(device)
            d_real = D(real_x)
            z = torch.randn((real_x.shape[0], z_dim), device=device)
            fake_x = G(z)
            d_fake = D(fake_x)
            D_loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake)
            G_loss = get_loss(name=loss_name, g_loss=True, d_real=d_real, d_fake=d_fake)
            d_optimizer.zero_grad()
            g_optimizer.zero_grad()
            G_loss.backward()
            g_optimizer.step()
            print('D_loss: {}, G_loss: {}'.format(D_loss.item(), G_loss.item()))
            # writer.add_scalars('Loss', {'D_loss': D_loss.item(),
            #                             'G_loss': G_loss.item()},
            #                    global_step=count)
            # writer.add_scalars('Discriminator output', {'Generated image': d_fake.mean().item(),
            #                                             'Real image': d_real.mean().item()},
            #                    global_step=count)
            if count % show_iter == 0:
                time_cost = time.time() - timer
                print('Iter :%d , D_loss: %.5f, G_loss: %.5f, time: %.3fs'
                      % (count, D_loss.item(), G_loss.item(), time_cost))
                timer = time.time()
                save_checkpoint(path=logdir, name='FixD-%.3f_%d.pth' % (lr_d, count), D=D, G=G)
            count += 1
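# save_checkpoint is called by every training loop above but imported from another
# module. A minimal stand-in is sketched below; the directory layout and the state-dict
# keys are assumptions. Note that train_cgd reads back chk['optim'] while train reads
# chk['d_optim'] and chk['g_optim'], so the real helper presumably names the optimizer
# entries according to how each trainer reloads them.
import os
import torch

def save_checkpoint_sketch(path, name, D, G, optimizer=None, g_optimizer=None):
    # Store model (and optionally optimizer) state dicts under checkpoints/<path>/<name>.
    chk_dir = os.path.join('checkpoints', path)
    os.makedirs(chk_dir, exist_ok=True)
    state = {'D': D.state_dict(), 'G': G.state_dict()}
    if optimizer is not None:
        state['optim'] = optimizer.state_dict()
    if g_optimizer is not None:
        state['g_optim'] = g_optimizer.state_dict()
    torch.save(state, os.path.join(chk_dir, name))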
def main_Core50(conf, run, close_at_the_end = False): # Prepare configurations files conf['solver_file_first_batch'] = conf['solver_file_first_batch'].replace('X', conf['model']) conf['solver_file'] = conf['solver_file'].replace('X', conf['model']) conf['init_weights_file'] = conf['init_weights_file'].replace('X', conf['model']) conf['tmp_weights_file'] = conf['tmp_weights_file'].replace('X', conf['model']) train_filelists = conf['train_filelists'].replace('RUN_X', run) test_filelist = conf['test_filelist'].replace('RUN_X', run) # For run 0 store/load binary files # For the rest of runs read single files (slower, but saves disk space) #run_on_the_fly = False if run == 'run0' else True run_on_the_fly = True # This is the procedure we applied to obtain the reduced test set # train_utils.reduce_filelist(test_filelist, test_filelist+"3", 20) (Path(conf['exp_path']) / 'CM').mkdir(exist_ok=True, parents=True) (Path(conf['exp_path']) / 'EwC').mkdir(exist_ok=True, parents=True) (Path(conf['exp_path']) / 'Syn').mkdir(exist_ok=True, parents=True) # Parse the solver prototxt # for more details see - https://stackoverflow.com/questions/31823898/changing-the-solver-parameters-in-caffe-through-pycaffe print('Solver proto: ', conf['solver_file_first_batch']) solver_param = caffe_pb2.SolverParameter() with open(conf['solver_file_first_batch']) as f: txtf.Merge(str(f.read()), solver_param) net_prototxt = solver_param.net # Obtains the path to the net prototxt print('Net proto: ',net_prototxt) # Obtain class labels if conf['class_labels'] != '': # More complex than a simple loadtxt because of the unicode representation in python 3 label_str = np.loadtxt(conf['class_labels'], dtype=bytes, delimiter="\n").astype(str) # Obtain minibatch size from net proto train_minibatch_size, test_minibatch_size = train_utils.extract_minibatch_size_from_prototxt_with_input_layers(net_prototxt) print(' test minibatch size: ', test_minibatch_size) print(' train minibatch size: ', train_minibatch_size) # Is the network using target vectors (besides the labels)? 
need_target = train_utils.net_use_target_vectors(net_prototxt) # Load test set print ("Recovering Test Set: ", test_filelist, " ...") start = time.time() test_x, test_y = train_utils.get_data(test_filelist, conf['db_path'], conf['exp_path'], on_the_fly=run_on_the_fly, verbose = conf['verbose']) assert(test_x.shape[0] == test_y.shape[0]) if conf['num_classes'] == 10: # Checks if we are doing category-based classification test_y = test_y // 5 test_y = test_y.astype(np.float32) test_patterns = test_x.shape[0] test_x, test_y, test_iterat = train_utils.pad_data(test_x, test_y, test_minibatch_size) print (' -> %d patterns of %d classes (%.2f sec.)' % (test_patterns, len(np.unique(test_y)), time.time() - start)) print (' -> %.2f -> %d iterations for full evaluation' % (test_patterns / test_minibatch_size, test_iterat)) # Load training patterns in batches (by now assume the same number in all batches) batch_count = conf['num_batches'] train_patterns = train_utils.count_lines_in_batches(batch_count,train_filelists) train_iterations_per_epoch = np.zeros(batch_count, int) train_iterations = np.zeros(batch_count, int) test_interval_epochs = conf['test_interval_epochs'] test_interval = np.zeros(batch_count, float) for batch in range(batch_count): train_iterations_per_epoch[batch] = int(np.ceil(train_patterns[batch] / train_minibatch_size)) test_interval[batch] = test_interval_epochs * train_iterations_per_epoch[batch] if (batch == 0): train_iterations[batch] = train_iterations_per_epoch[batch] * conf['num_epochs_first_batch'] else: train_iterations[batch] = train_iterations_per_epoch[batch] * conf['num_epochs'] print ("Batch %2d: %d patterns, %d iterations (%d iter. per epochs - test every %.1f iter.)" \ % (batch, train_patterns[batch], train_iterations[batch], train_iterations_per_epoch[batch], test_interval[batch])) # Create evaluation points # -> iterations which are boundaries of batches batch_iter = [0] iter = 0 for batch in range(batch_count): iter += train_iterations[batch] batch_iter.append(iter) # Calculates the iterations where the network will be evaluated eval_iters = [1] # Start with 1 (insted of 0) because the test net is aligned to the train one after solver.step(1) for batch in range(batch_count): start = batch_iter[batch] end = batch_iter[batch+1] start += test_interval[batch] while start < end: eval_iters.append(int(start)) start += test_interval[batch] eval_iters.append(end) # Iterations which are epochs in the evaluation range epochs_iter = [] for batch in range(batch_count): start = batch_iter[batch] end = batch_iter[batch+1] start += train_iterations_per_epoch[batch] while start <= end: epochs_iter.append(int(start)) start += train_iterations_per_epoch[batch] prev_train_loss = np.zeros(len(eval_iters)) prev_test_acc = np.zeros(len(eval_iters)) prev_exist = filelog.TryLoadPrevTrainingLog(conf['train_log_file'], prev_train_loss, prev_test_acc) train_loss = np.copy(prev_train_loss) # Copying allows to correctly visualize the graph in case we start from initial_batch > 0 test_acc = np.copy(prev_test_acc) train_acc = np.zeros(len(eval_iters)) epochs_tick = False if batch_count > 30 else True # For better visualization visualization.Plot_Incremental_Training_Init('Incremental Training', eval_iters, epochs_iter, batch_iter, train_loss, test_acc, 5, conf['accuracy_max'], prev_exist, prev_train_loss, prev_test_acc, show_epochs_tick = epochs_tick) filelog.Train_Log_Init(conf['train_log_file']) filelog.Train_LogDetails_Init(conf['train_log_file']) start_train = time.time() eval_idx = 0 # 
Evaluation iterations counter global_eval_iter = 0 # Global iterations counter first_round = True initial_batch = conf['initial_batch'] if initial_batch > 0: # Move forward by skipping unnecessary evaluation global_eval_iter = batch_iter[initial_batch] while eval_iters[eval_idx] < global_eval_iter: eval_idx += 1 eval_idx += 1 for batch in range(initial_batch, batch_count): print ('\nBATCH = {:2d} ----------------------------------------------------'.format(batch)) if batch == 0: solver = caffe.get_solver(conf['solver_file_first_batch']) # Load the solver for the first batch and create net(s) if conf['init_weights_file'] !='': solver.net.copy_from(conf['init_weights_file']) print('Network created and Weights loaded from: ', conf['init_weights_file']) # Test solver.test_nets[0].copy_from(conf['init_weights_file']) accuracy, _ , pred_y = train_utils.test_network_with_accuracy_layer(solver, test_x, test_y, test_iterat, test_minibatch_size, prediction_level_Model[conf['model']], return_prediction = True) # BatchNorm Stats train_utils.print_bn_stats(solver.net) if conf['strategy'] in ['cwr+','ar1']: cwr.zeros_cwr_layer_bias_lr(solver.net, cwr_layers_Model[conf['model']]) class_updates = np.zeros(conf['num_classes'], dtype=np.float32) cons_w = cwr.init_consolidated_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) # Allocate space for consolidated weights and initialze them to 0 cwr.reset_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) # Reset cwr weights to 0 (done here for the first batch to keep initial stats correct) if conf['strategy'] == 'cwr' or conf['dynamic_head_expansion'] == True: class_updates = np.zeros(conf['num_classes'], dtype=np.float32) rand_w, cons_w = cwr.copy_initial_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) # Random values for cwr layers (since they do not exist in pretrained models) if conf['strategy'] in ['syn','ar1']: # ewcData stores optimal weights + normalized fisher; trajectory stores unnormalized summed grad*deltaW ewcData, synData = syn.create_syn_data(solver.net) elif batch == 1: solver = caffe.get_solver(conf['solver_file']) # Load the solver for the next batches and create net(s) solver.net.copy_from(conf['tmp_weights_file']) print('Network created and Weights loaded from: ', conf['tmp_weights_file']) if conf['strategy'] in ['cwr','cwr+']: cwr.zeros_non_cwr_layers_lr(solver.net, cwr_layers_Model[conf['model']]) # In CWR we freeze every layer except the CWR one(s) # By providing a cwr_lr_mult multiplier we can use a different Learning Rate for CWR and non-CWR cwr_layers_Model # Note that a similar result can be achieved by manually editing the net prototxt if conf['strategy'] in ['cwr+', 'ar1']: if 'cwr_lr_mult' in conf.keys() and conf['cwr_lr_mult'] != 1: cwr.zeros_cwr_layer_bias_lr(solver.net, cwr_layers_Model[conf['model']], force_weights_lr_mult = conf['cwr_lr_mult']) else: cwr.zeros_cwr_layer_bias_lr(solver.net, cwr_layers_Model[conf['model']]) cwr.set_brn_past_weight(solver.net, 10000) # Initializes some data structures used for reporting stats. 
Executed once (in the first round) if first_round: if batch == 1 and (conf['strategy'] in ['ewc','cwr', 'cwr+', 'syn', 'ar1']): print('Cannot start from batch 1 in ', conf['strategy'], ' strategy!') sys.exit(0) visualization.PrintNetworkArchitecture(solver.net) # if accuracy layer is defined in the prototxt also in TRAIN mode -> log also train accuracy (not in the plot) try: report_train_accuracy = True err = solver.net.blobs['accuracy'].num # assume this is stable for prototxt of successive batches except: report_train_accuracy = False first_round = False if conf['compute_param_stats']: param_change = {} param_stats = train_utils.stats_initialize_param(solver.net) # Load training data for the current batch # Note that the file lists are provided in the batch_filelists folder current_train_filelist = train_filelists.replace('XX', str(batch).zfill(2)) print ("Recovering training data: ", current_train_filelist, " ...") load_start = time.time() train_x, train_y = train_utils.get_data(current_train_filelist, conf['db_path'], conf['exp_path'], on_the_fly=run_on_the_fly, verbose = conf['verbose']) print ("Done.") if conf['num_classes'] == 10: # Category based classification train_y = train_y // 5 # If target values (e.g. one hot vectors) are needed we need to create them from numerical class labels if need_target: target_y = train_utils.compute_one_hot_vectors(train_y, conf['num_classes']) train_x, tmp_iters = train_utils.pad_data_single(train_x, train_minibatch_size) train_y, _ = train_utils.pad_data_single(train_y, train_minibatch_size) target_y, _ = train_utils.pad_data_single(target_y, train_minibatch_size) if batch>0 and conf['strategy'] == 'lwf': if conf['lwf_weight'] > 0: weight_old = conf['lwf_weight'] else: weight_old = 1 - (train_patterns[batch] / np.sum(train_patterns[0:batch+1])) x_min = 2.0/3.0 x_max = 0.9 y_min = 0.45 y_max = 0.60 # if weight_old > x_max: weight_old = x_max # Clip weight_old weight_old = y_min + (weight_old - x_min)*(y_max-y_min)/(x_max-x_min) print('Lwf Past Weight: %.2f' % (weight_old)) target_y = lwf.update_target_vectors(solver, train_x, train_y, conf['num_classes'], train_iterations_per_epoch[batch], train_minibatch_size, weight_old) if conf['dynamic_head_expansion'] == True: train_utils.dynamic_head_expansion(solver.net, cwr_layers_Model[conf['model']], conf['num_classes'], train_y, rand_w) if conf['strategy'] == 'cwr' and batch > initial_batch: cwr.load_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes'], rand_w) # Reset net weights if conf['strategy'] in ['cwr+','ar1'] and batch > initial_batch: cwr.reset_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes']) # Reset weights of CWR layers to 0 (maximal head approach!) # Loads previously consolidated weights # This procedure, explained in the paper, is necessary in the NIC scenario if 'cwr_nic_load_weight' in conf.keys() and conf['cwr_nic_load_weight']: cwr.load_weights_nic(solver.net, cwr_layers_Model[conf['model']], train_y, cons_w) train_x, train_y, target_y = train_utils.shuffle_in_unison((train_x, train_y, target_y), 0) if conf['strategy'] in ['ewc','syn','ar1'] and batch > initial_batch: #syn.weight_stats(solver.net, batch, ewcData, conf['ewc_clip_to']) # Makes ewc info available to the network for successive training # The 'ewc' blob will be used by our C++ code (see the provided custom "sgd_solver.cpp") solver.net.blobs['ewc'].data[...] = ewcData else: #TODO: review branch (is it necessary?) 
train_x, tmp_iters = train_utils.pad_data_single(train_x, train_minibatch_size) train_y, _ = train_utils.pad_data_single(train_y, train_minibatch_size) train_x, train_y = train_utils.shuffle_in_unison((train_x, train_y), 0) # apply temporal coherence strategy to modify labels if batch > 0 and conf['strategy'] != 'naive': train_x, train_y = train_utils.predict_labels_temporal_coherence(solver, train_x, train_y, conf['num_classes'], train_iterations_per_epoch[batch], train_minibatch_size, conf['strategy'], 0.80) # ATTENTION: if patterns have been removed, do the padding again! print (' -> %d patterns (of %d classes) after padding and shuffling (%.2f sec.)' % (train_x.shape[0], len(np.unique(train_y)), time.time()-load_start)) assert(train_iterations[batch] >= tmp_iters) # convert labels to float32 train_y = train_y.astype(np.float32) assert(train_x.shape[0] == train_y.shape[0]) # training avg_train_loss = 0 avg_train_accuracy = 0 avg_count = 0 if conf['strategy'] in ['syn','ar1']: syn.init_batch(solver.net, ewcData, synData) # The main solver loop (per batch) it = 0 while it < train_iterations[batch]: # The following part is pretty much straightforward # The current batch is split in minibatches (whose size was previously detected by looking at the net prototxt) # The minibatch is loaded into blobs 'data', 'label' and 'target' # a step(1) is executed (which performs forward + backward + weights update) it_mod = it % train_iterations_per_epoch[batch] start = it_mod * train_minibatch_size end = (it_mod + 1) * train_minibatch_size if conf['verbose']: avgl = avga = 0 if avg_count > 0: avgl = avg_train_loss / avg_count print ('Iter {:>4}'.format(it+1), '({:>4})'.format(global_eval_iter), ': Train Loss = {:.5f}'.format(avgl), end='', flush = True) if report_train_accuracy: if avg_count > 0: avga = avg_train_accuracy / avg_count print (' Train Accuracy = {:.5f}%'.format(avga*100), flush = True) else: print ('+', end = '', flush=True) # Provide data to input layers solver.net.blobs['data'].data[...] = train_x[start:end] solver.net.blobs['label'].data[...] = train_y[start:end] if need_target: solver.net.blobs['target'].data[...] = target_y[start:end] if conf['strategy'] in ['syn','ar1']: syn.pre_update(solver.net, ewcData, synData) # SGD by Caffe if conf['strategy'] in ['cwr+','cwr'] and batch > initial_batch: solver.net.clear_param_diffs() solver.net.forward() # start=None, end=None solver.net.backward(end='mid_fc7') solver.apply_update() else: solver.step(1) #train_utils.print_bn_stats(solver.net) # If enabled, saves the gradient magnitude of the prediction_level stats to file # train_utils.gradient_stats(prediction_level_Model[conf['model']], global_eval_iter, solver.net, train_y, start, end) if conf['strategy'] == 'syn': syn.post_update(solver.net, ewcData, synData) if conf['strategy'] == 'ar1': syn.post_update(solver.net, ewcData, synData, cwr_layers_Model[conf['model']]) global_eval_iter += 1 avg_count += 1 avg_train_loss += solver.net.blobs['loss'].data if report_train_accuracy: avg_train_accuracy += solver.net.blobs['accuracy'].data # Early stopping (a.k.a.
Limited) if conf['strategy'] == '_syn' and avg_count > 0 and avg_train_loss/avg_count < syn.target_train_loss_accuracy_per_batch(batch): # enable by removing "_" on demand it = train_iterations[batch]-1 # skip to last iter global_eval_iter = eval_iters[eval_idx] # enable evaluation point now if global_eval_iter == eval_iters[eval_idx]: # Evaluation point if avg_count > 0: avg_train_loss /= avg_count avg_train_accuracy /= avg_count train_loss[eval_idx] = avg_train_loss print ('\nIter {:>4}'.format(it+1), '({:>4})'.format(global_eval_iter), ': Train Loss = {:.5f}'.format(avg_train_loss), end='', flush = True) if report_train_accuracy: train_acc[eval_idx] = avg_train_accuracy print (' Train Accuracy = {:.5f}%'.format(avg_train_accuracy*100), end='', flush = True) compute_confusion_matrix = True if (conf['confusion_matrix'] and it == train_iterations[batch]-1) else False # last batch iter # The following lines are executed only if this is the last iteration for the current batch if conf['strategy'] in ['cwr', 'cwr+', 'ar1'] and it == train_iterations[batch]-1: if conf['strategy'] == 'cwr': batch_weight = conf['cwr_batch0_weight'] if batch == initial_batch else 1 cwr._consolidate_weights_cwr(solver.net, cwr_layers_Model[conf['model']], train_y, cons_w, batch_weight, class_updates = class_updates) class_updates[train_y.astype(np.int)] += 1 # Increase the update counters of the trained classes else: unique_y, y_freq = np.unique(train_y.astype(np.int), return_counts=True) cwr.consolidate_weights_cwr_plus(solver.net, cwr_layers_Model[conf['model']], unique_y, y_freq, class_updates, cons_w) class_updates[unique_y] += y_freq # print(class_updates) cwr.load_weights(solver.net, cwr_layers_Model[conf['model']], conf['num_classes'], cons_w) # Load consolidated weights for testing accuracy, _, pred_y = train_utils.test_network_with_accuracy_layer(solver, test_x, test_y, test_iterat, test_minibatch_size, prediction_level_Model[conf['model']], return_prediction = compute_confusion_matrix) test_acc[eval_idx] = accuracy*100 print (' Test Accuracy = {:.5f}%'.format(accuracy*100)) # Batch(Re)Norm Stats train_utils.print_bn_stats(solver.net) visualization.Plot_Incremental_Training_Update(eval_idx, eval_iters, train_loss, test_acc) filelog.Train_Log_Update(conf['train_log_file'], eval_iters[eval_idx], accuracy, avg_train_loss, report_train_accuracy, avg_train_accuracy) avg_train_loss = 0 avg_train_accuracy = 0 avg_count = 0 eval_idx += 1 # next eval it += 1 # next iter # Current batch training concluded if conf['strategy'] == 'ewc': if batch == initial_batch: ewcData, fisher = ewc.create_ewc_data(solver.net) # ewcData stores optimal weights + normalized fisher; fisher stores unnormalized summed fisher print ("Computing Fisher Information and Storing Optimal Weights...") ewc.update_ewc_data(ewcData, fisher, solver.net, train_x, train_y, target_y, train_iterations_per_epoch[batch], train_minibatch_size, batch, conf['ewc_clip_to'], conf['ewc_w']) print ("Done.") if conf['save_ewc_histograms']: visualization.EwcHistograms(ewcData, 100, save_as = conf['exp_path'] + 'EwC/F_' + str(batch) + '.png') if conf['strategy'] in ['syn','ar1']: syn.update_ewc_data(solver.net, ewcData, synData, batch, conf['ewc_clip_to']) if conf['save_ewc_histograms']: visualization.EwcHistograms(ewcData, 100, save_as = conf['exp_path'] + 'Syn/F_' + str(batch) + '.png') if compute_confusion_matrix: # Computes the confusion matrix, then logs and plots it cnf_matrix = confusion_matrix(test_y, pred_y) if batch == 0: prev_class_accuracies = np.zeros(conf['num_classes'])
else: prev_class_accuracies = current_class_accuracies current_class_accuracies = np.diagonal(cnf_matrix) / cnf_matrix.sum(axis = 1) deltas = current_class_accuracies - prev_class_accuracies classes_in_batch = set(train_y.astype(np.int)) classes_non_in_batch = set(range(conf['num_classes']))-classes_in_batch mean_class_in_batch = np.mean(deltas[list(classes_in_batch)]) std_class_in_batch = np.std(deltas[list(classes_in_batch)]) mean_class_non_in_batch = np.mean(deltas[list(classes_non_in_batch)]) std_class_non_in_batch = np.std(deltas[list(classes_non_in_batch)]) print('InBatch -> mean = %.2f%% std = %.2f%%, OutBatch -> mean = %.2f%% std = %.2f%%' % (mean_class_in_batch*100, std_class_in_batch*100, mean_class_non_in_batch*100, std_class_non_in_batch*100)) filelog.Train_LogDetails_Update(conf['train_log_file'], batch, mean_class_in_batch, std_class_in_batch, mean_class_non_in_batch, std_class_non_in_batch) visualization.plot_confusion_matrix(cnf_matrix, normalize = True, title='CM after batch: ' + str(batch), save_as = conf['exp_path'] + 'CM/CM_' + str(batch) + '.png') if conf['compute_param_stats']: train_utils.stats_compute_param_change_and_update_prev(solver.net, param_stats, batch, param_change) if batch == 0: solver.net.save(conf['tmp_weights_file']) print('Weights saved to: ', conf['tmp_weights_file']) del solver print('Training Time: %.2f sec' % (time.time() - start_train)) if conf['compute_param_stats']: stats_normalization = True train_utils.stats_normalize(solver.net, param_stats, batch_count, param_change, stats_normalization) visualization.Plot3d_param_stats(solver.net, param_change, batch_count, stats_normalization) filelog.Train_Log_End(conf['train_log_file']) filelog.Train_LogDetails_End(conf['train_log_file']) visualization.Plot_Incremental_Training_End(close = close_at_the_end)
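The consolidation step in the loop above (cwr.consolidate_weights_cwr_plus followed by the class_updates bookkeeping) amounts to a per-class running average of the output-layer rows, weighted by how many samples of each class have been seen so far. The snippet below is a minimal NumPy sketch of that idea only; the helper name consolidate and its exact weighting are assumptions for illustration, and the repository's cwr module may re-center or rescale the rows differently.

import numpy as np

def consolidate(cons_w, new_w, class_updates, unique_y, y_freq):
    # Blend freshly trained rows (new_w) into the consolidated rows (cons_w):
    # each class row is a running average weighted by how many samples of that
    # class were seen in past batches (class_updates) vs. in the current one (y_freq).
    # NOTE: illustrative only; the actual cwr.* functions may differ in detail.
    for c, freq in zip(unique_y, y_freq):
        past = class_updates[c]
        cons_w[c] = (cons_w[c] * past + new_w[c] * freq) / (past + freq)
    class_updates[unique_y] += y_freq
    return cons_w, class_updates

# Toy usage: 4 classes, 3-dimensional output rows, classes 0 and 2 in the current batch.
cons_w = np.zeros((4, 3), dtype=np.float32)
new_w = np.random.randn(4, 3).astype(np.float32)
class_updates = np.zeros(4, dtype=np.float32)
cons_w, class_updates = consolidate(cons_w, new_w, class_updates,
                                    unique_y=np.array([0, 2]),
                                    y_freq=np.array([10, 5], dtype=np.float32))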
def train_sim(epoch_num=10, optim_type='ACGD', startPoint=None, start_n=0, z_dim=128, batchsize=64, l2_penalty=0.0, momentum=0.0, log=False, loss_name='WGAN', model_name='dc', model_config=None, data_path='None', show_iter=100, logdir='test', dataname='CIFAR10', device='cpu', gpu_num=1): lr_d = 1e-4 lr_g = 1e-4 dataset = get_data(dataname=dataname, path=data_path) dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4) D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config) D.apply(weights_init_d).to(device) G.apply(weights_init_g).to(device) optim_d = RMSprop(D.parameters(), lr=lr_d) optim_g = RMSprop(G.parameters(), lr=lr_g) if startPoint is not None: chk = torch.load(startPoint) D.load_state_dict(chk['D']) G.load_state_dict(chk['G']) optim_d.load_state_dict(chk['d_optim']) optim_g.load_state_dict(chk['g_optim']) print('Start from %s' % startPoint) if gpu_num > 1: D = nn.DataParallel(D, list(range(gpu_num))) G = nn.DataParallel(G, list(range(gpu_num))) timer = time.time() count = 0 if 'DCGAN' in model_name: fixed_noise = torch.randn((64, z_dim, 1, 1), device=device) else: fixed_noise = torch.randn((64, z_dim), device=device) for e in range(epoch_num): print('======Epoch: %d / %d======' % (e, epoch_num)) for real_x in dataloader: real_x = real_x[0].to(device) d_real = D(real_x) if 'DCGAN' in model_name: z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device) else: z = torch.randn((d_real.shape[0], z_dim), device=device) fake_x = G(z) d_fake = D(fake_x) loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake, l2_weight=l2_penalty, D=D) D.zero_grad() G.zero_grad() loss.backward() optim_d.step() optim_g.step() if count % show_iter == 0: time_cost = time.time() - timer print('Iter :%d , Loss: %.5f, time: %.3fs' % (count, loss.item(), time_cost)) timer = time.time() with torch.no_grad(): fake_img = G(fixed_noise).detach() path = 'figs/%s_%s/' % (dataname, logdir) if not os.path.exists(path): os.makedirs(path) vutils.save_image(fake_img, path + 'iter_%d.png' % (count + start_n), normalize=True) save_checkpoint( path=logdir, name='%s-%s%.3f_%d.pth' % (optim_type, model_name, lr_g, count + start_n), D=D, G=G, optimizer=optim_d, g_optimizer=optim_g) if wandb and log: wandb.log({ 'Real score': d_real.mean().item(), 'Fake score': d_fake.mean().item(), 'Loss': loss.item() }) count += 1
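train_sim delegates the loss construction to get_loss(name=loss_name, g_loss=False, ...), which is defined elsewhere in the repository. As a rough, hedged sketch of what the WGAN branch of such a helper typically computes (the function name wgan_d_loss is made up here, and the l2_weight term mirroring the l2_penalty argument is an assumption):

import torch

def wgan_d_loss(d_real, d_fake, l2_weight=0.0):
    # WGAN critic loss: push scores on real samples up and on generated samples down.
    loss = d_fake.mean() - d_real.mean()
    if l2_weight > 0.0:
        # Assumed optional penalty on the raw critic outputs (keeps scores bounded).
        loss = loss + l2_weight * (d_real ** 2 + d_fake ** 2).mean()
    return loss

# Toy check with random critic scores.
print(wgan_d_loss(torch.randn(8, 1), torch.randn(8, 1), l2_weight=1e-4).item())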
def train_cgd(epoch_num=10, optim_type='ACGD', startPoint=None, start_n=0, z_dim=128, batchsize=64, tols={ 'tol': 1e-10, 'atol': 1e-16 }, l2_penalty=0.0, momentum=0.0, loss_name='WGAN', model_name='dc', model_config=None, data_path='None', show_iter=100, logdir='test', dataname='CIFAR10', device='cpu', gpu_num=1, ada_train=True, log=False, collect_info=False, args=None): lr_d = args['lr_d'] lr_g = args['lr_g'] dataset = get_data(dataname=dataname, path=data_path) dataloader = DataLoader(dataset=dataset, batch_size=batchsize, shuffle=True, num_workers=4) D, G = get_model(model_name=model_name, z_dim=z_dim, configs=model_config) D.apply(weights_init_d).to(device) G.apply(weights_init_g).to(device) if optim_type == 'BCGD': optimizer = BCGD(max_params=G.parameters(), min_params=D.parameters(), lr_max=lr_g, lr_min=lr_d, momentum=momentum, tol=tols['tol'], atol=tols['atol'], device=device) # scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone) elif optim_type == 'ICR': optimizer = ICR(max_params=G.parameters(), min_params=D.parameters(), lr=lr_d, alpha=1.0, device=device) # scheduler = icrScheduler(optimizer, milestone) elif optim_type == 'ACGD': optimizer = ACGD(max_params=G.parameters(), min_params=D.parameters(), lr_max=lr_g, lr_min=lr_d, tol=tols['tol'], atol=tols['atol'], device=device, solver='cg') # scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone) if startPoint is not None: chk = torch.load(startPoint) D.load_state_dict(chk['D']) G.load_state_dict(chk['G']) # optimizer.load_state_dict(chk['optim']) print('Start from %s' % startPoint) if gpu_num > 1: D = nn.DataParallel(D, list(range(gpu_num))) G = nn.DataParallel(G, list(range(gpu_num))) timer = time.time() count = 0 if 'DCGAN' in model_name: fixed_noise = torch.randn((64, z_dim, 1, 1), device=device) else: fixed_noise = torch.randn((64, z_dim), device=device) mod = 10 accs = torch.tensor([0.8 for _ in range(mod)]) for e in range(epoch_num): # scheduler.step(epoch=e) print('======Epoch: %d / %d======' % (e, epoch_num)) for real_x in dataloader: real_x = real_x[0].to(device) d_real = D(real_x) if 'DCGAN' in model_name: z = torch.randn((d_real.shape[0], z_dim, 1, 1), device=device) else: z = torch.randn((d_real.shape[0], z_dim), device=device) fake_x = G(z) d_fake = D(fake_x) loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake, l2_weight=l2_penalty, D=D) optimizer.zero_grad() optimizer.step(loss) num_correct = torch.sum(d_real > 0) + torch.sum(d_fake < 0) acc = num_correct.item() / (d_real.shape[0] + d_fake.shape[0]) accs[count % mod] = acc acc_indicator = sum(accs) / mod if acc_indicator > 0.9: ada_ratio = 0.05 elif acc_indicator < 0.80: ada_ratio = 0.1 else: ada_ratio = 1.0 if ada_train: optimizer.set_lr(lr_max=lr_g, lr_min=ada_ratio * lr_d) if count % show_iter == 0 and count != 0: time_cost = time.time() - timer print('Iter :%d , Loss: %.5f, time: %.3fs' % (count, loss.item(), time_cost)) timer = time.time() with torch.no_grad(): fake_img = G(fixed_noise).detach() path = 'figs/%s_%s/' % (dataname, logdir) if not os.path.exists(path): os.makedirs(path) vutils.save_image(fake_img, path + 'iter_%d.png' % (count + start_n), normalize=True) save_checkpoint(path=logdir, name='%s-%s_%d.pth' % (optim_type, model_name, count + start_n), D=D, G=G, optimizer=optimizer) if wandb and log: wandb.log( { 'Real score': d_real.mean().item(), 'Fake score': d_fake.mean().item(), 'Loss': loss.item(), 'Acc_indicator': acc_indicator, 'Ada ratio': ada_ratio }, step=count, ) if collect_info and wandb: 
cgd_info = optimizer.get_info() wandb.log( { 'CG iter num': cgd_info['iter_num'], 'CG runtime': cgd_info['time'], 'D gradient': cgd_info['grad_y'], 'G gradient': cgd_info['grad_x'], 'D hvp': cgd_info['hvp_y'], 'G hvp': cgd_info['hvp_x'], 'D cg': cgd_info['cg_y'], 'G cg': cgd_info['cg_x'] }, step=count) count += 1
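The adaptive step in train_cgd keeps a rolling window of the discriminator's real/fake classification accuracy over the last mod = 10 iterations and rescales lr_min accordingly. The standalone helper below reproduces that rule for readability only; the name adaptive_d_lr is made up, but the thresholds are copied from the loop above.

import torch

def adaptive_d_lr(accs, acc, count, lr_d, mod=10):
    # Overwrite the oldest entry of the rolling window, then average it.
    accs[count % mod] = acc
    acc_indicator = accs.sum().item() / mod
    # Same rule as the training loop: scale the discriminator lr by 0.05 when the
    # window accuracy exceeds 0.9, by 0.1 when it drops below 0.8, and leave it
    # unchanged in between.
    if acc_indicator > 0.9:
        ada_ratio = 0.05
    elif acc_indicator < 0.80:
        ada_ratio = 0.1
    else:
        ada_ratio = 1.0
    return ada_ratio * lr_d, acc_indicator

accs = torch.tensor([0.8 for _ in range(10)])
lr_min, indicator = adaptive_d_lr(accs, acc=0.95, count=0, lr_d=1e-4)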
def train_scg(config, tols, milestone, device='cpu'): lr_d = config['lr_d'] lr_g = config['lr_g'] optim_type = config['optimizer'] z_dim = config['z_dim'] model_name = config['model'] epoch_num = config['epoch_num'] show_iter = config['show_iter'] loss_name = config['loss_type'] l2_penalty = config['d_penalty'] logdir = config['logdir'] start_n = config['startn'] dataset = get_data(dataname=config['dataset'], path='../datas/%s' % config['datapath']) dataloader = DataLoader(dataset=dataset, batch_size=config['batchsize'], shuffle=True, num_workers=4) inner_loader = DataLoader(dataset=dataset, batch_size=config['batchsize'], shuffle=True, num_workers=4) D, G = get_model(model_name=model_name, z_dim=z_dim) D.apply(weights_init_d).to(device) G.apply(weights_init_g).to(device) optimizer = SCG(max_params=G.parameters(), min_params=D.parameters(), lr_max=lr_g, lr_min=lr_d, tol=tols['tol'], atol=tols['atol'], dataloader=inner_loader, device=device, solver='cg') scheduler = lr_scheduler(optimizer=optimizer, milestone=milestone) if config['checkpoint'] is not None: startPoint = config['checkpoint'] chk = torch.load(startPoint) D.load_state_dict(chk['D']) G.load_state_dict(chk['G']) optimizer.load_state_dict(chk['optim']) print('Start from %s' % startPoint) gpu_num = config['gpu_num'] if gpu_num > 1: D = nn.DataParallel(D, list(range(gpu_num))) G = nn.DataParallel(G, list(range(gpu_num))) timer = time.time() count = 0 if model_name == 'DCGAN' or model_name == 'DCGAN-WBN': fixed_noise = torch.randn((64, z_dim, 1, 1), device=device) else: fixed_noise = torch.randn((64, z_dim), device=device) for e in range(epoch_num): scheduler.step(epoch=e) print('======Epoch: %d / %d======' % (e, epoch_num)) for real_x in dataloader: optimizer.zero_grad() real_x = real_x[0] if model_name == 'DCGAN' or model_name == 'DCGAN-WBN': z = torch.randn((real_x.shape[0], z_dim, 1, 1), device=device) else: z = torch.randn((real_x.shape[0], z_dim), device=device) def closure(train_x): train_x = train_x.to(device) fake_x = G(z) d_fake = D(fake_x) d_real = D(train_x) loss = get_loss(name=loss_name, g_loss=False, d_real=d_real, d_fake=d_fake, l2_weight=l2_penalty, D=D) return loss loss = optimizer.step(closure=closure, img=real_x) if count % show_iter == 0: time_cost = time.time() - timer print('Iter :%d , Loss: %.5f, time: %.3fs' % (count, loss.item(), time_cost)) timer = time.time() with torch.no_grad(): fake_img = G(fixed_noise).detach() path = 'figs/%s_%s/' % (config['dataset'], logdir) if not os.path.exists(path): os.makedirs(path) vutils.save_image(fake_img, path + 'iter_%d.png' % (count + start_n), normalize=True) save_checkpoint(path=logdir, name='%s-%s%.3f_%d.pth' % (optim_type, model_name, lr_g, count + start_n), D=D, G=G, optimizer=optimizer) count += 1
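Unlike the other trainers, SCG is driven through a closure that rebuilds the loss from data handed back by the optimizer, so the objective can be re-evaluated internally during the inner solves (optimizer.step(closure=closure, img=real_x) above). As a generic illustration of the closure calling pattern itself, here is the same convention with torch.optim.LBFGS, which is only a stand-in and not part of this project:

import torch
import torch.nn as nn

model = nn.Linear(4, 1)
opt = torch.optim.LBFGS(model.parameters(), lr=0.1)  # stand-in closure-based optimizer
x, y = torch.randn(16, 4), torch.randn(16, 1)

def closure():
    # The optimizer may call this several times per step, so it must rebuild
    # the graph (zero grads, forward, backward) on every call.
    opt.zero_grad()
    loss = ((model(x) - y) ** 2).mean()
    loss.backward()
    return loss

loss = opt.step(closure)
print(loss.item())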
def trains(start, end, step, epoch_num, model_name, weight_prefix, dataname, data_path, preload_path=None): import pandas as pd modes = ['lcgd', 'cgd', 'SGD', 'Adam', 'RMSProp'] device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') print(device) lr = 0.05 z_dim = 128 if model_name == 'dc': D = dc_D() G = dc_G(z_dim=z_dim) elif model_name == 'DC': D = GoodDiscriminator() G = GoodGenerator() dataset = get_data(dataname=dataname, path='../datas/%s' % data_path) trainer = VisionData(D=D, G=G, device=device, dataset=dataset, z_dim=z_dim, batchsize=128, lr_d=lr, lr_g=lr, show_iter=500, weight_decay=0.0, d_penalty=0.0, g_penalty=0, noise_shape=(64, z_dim), gp_weight=0) d_loss_list = [] g_loss_list = [] row_names = [ '%s%d.pth' % (weight_prefix, i) for i in range(start, end, step) ] for weight_path in row_names: if preload_path is not None: num_fcin = trainer.D.linear.in_features trainer.D.linear = nn.Linear(num_fcin, 10) trainer.load_checkpoint(preload_path, count=0, load_g=False, load_d=True) trainer.D.linear = nn.Linear(num_fcin, 1) trainer.D.to(device) print('Load pretrained discriminator from %s' % preload_path) trainer.load_checkpoint(weight_path, count=0, load_g=True, load_d=False) d_losses, g_losses = trainer.train_d(epoch_num=epoch_num, mode=modes[2], logname='train_is', dataname='CIFAR') d_loss_list.append(d_losses) g_loss_list.append(g_losses) df = pd.DataFrame(d_loss_list, index=row_names) gf = pd.DataFrame(g_loss_list, index=row_names) print(df, gf) df.to_csv(r'eval_results/preCIFAR_d_loss.csv') gf.to_csv(r'eval_results/preCIFAR_g_loss.csv')
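trains() sweeps a series of generator checkpoints named weight_prefix + iteration + '.pth', retrains only the discriminator on each of them (modes[2], i.e. plain SGD), and writes the resulting per-checkpoint loss curves to CSV. A purely hypothetical invocation, with every path and the checkpoint prefix made up for illustration and to be replaced by checkpoints actually produced by the training loops above:

# Hypothetical paths and prefix; adjust to your own checkpoints and data layout.
trains(start=0, end=50000, step=5000, epoch_num=1,
       model_name='DC', weight_prefix='checkpoints/ACGD-dc_',
       dataname='cifar10', data_path='cifar10',
       preload_path='checkpoints/pretrained_d.pth')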