def train_model(model, dataloaders, criterion, optimizer, args, start_epoch=1, num_epochs=25):
    """
    Trains the 3D CNN model.

    :param model: Model object to train
    :param dataloaders: Dictionary with 'train' and 'val' dataloaders
    :param criterion: PyTorch criterion instance
    :param optimizer: PyTorch optimizer instance
    :param args: Argument namespace (history path, checkpoint settings, etc.)
    :param start_epoch: Epoch to resume training from
    :param num_epochs: Number of epochs to train for
    :return: model, train_loss_history, val_loss_history, train_acc_history,
             val_acc_history, train_f1_score, val_f1_score, plot_epoch
    """
    # Initialize session history in the history file
    init_session_history(args)
    since = time.time()

    train_acc_history = []
    val_acc_history = []
    train_loss_history = []
    val_loss_history = []
    train_f1_score = []
    val_f1_score = []
    plot_epoch = []

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(start_epoch, num_epochs):
        # Each epoch has a training and a validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
                train_pred_classes = []
                train_ground_truths = []
            else:
                model.eval()  # Set model to evaluation mode
                val_pred_classes = []
                val_ground_truths = []

            running_loss = 0.0
            running_corrects = 0
            train_n_total = 1

            pbar = tqdm(dataloaders[phase])

            # Iterate over data
            for sample in pbar:
                inputs = sample["video"].to(device)
                labels = sample["action"].to(device)

                # Zero the parameter gradients
                optimizer.zero_grad()

                # Forward pass; track gradient history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, torch.max(labels, 1)[1])
                    _, preds = torch.max(outputs, 1)

                    if phase == 'train':
                        train_pred_classes.extend(preds.detach().cpu().numpy())
                        train_ground_truths.extend(torch.max(labels, 1)[1].detach().cpu().numpy())
                    else:
                        val_pred_classes.extend(preds.detach().cpu().numpy())
                        val_ground_truths.extend(torch.max(labels, 1)[1].detach().cpu().numpy())

                    # Backward pass + optimizer step only in the training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()

                # Statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == torch.max(labels, 1)[1])
                pbar.set_description('Phase: {} || Epoch: {} || Loss {:.5f} '.format(
                    phase, epoch, running_loss / train_n_total))
                train_n_total += 1

            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects.double() / len(dataloaders[phase].dataset)

            # Report elapsed time
            time_elapsed = time.time() - since
            print('{} phase complete in {:.0f}m {:.0f}s'.format(phase, time_elapsed // 60, time_elapsed % 60))
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            # For checkpointing and the confusion matrix
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                val_loss_history.append(epoch_loss)
                val_pred_classes = np.asarray(val_pred_classes)
                val_ground_truths = np.asarray(val_ground_truths)
                val_accuracy, val_f1, val_precision, val_recall = get_acc_f1_precision_recall(
                    val_pred_classes, val_ground_truths)
                val_f1_score.append(val_f1)
                val_confusion_matrix = np.array_str(confusion_matrix(
                    val_ground_truths, val_pred_classes, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
                print('Epoch: {} || Val_Acc: {} || Val_Loss: {}'.format(epoch, val_accuracy, epoch_loss))
                print(f'val: \n{val_confusion_matrix}')

                # Deep-copy the model if this is the best validation accuracy so far
                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    best_model_wts = copy.deepcopy(model.state_dict())

                # Set current loss to val loss for the history write
                val_loss = epoch_loss

            if phase == 'train':
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)
                train_pred_classes = np.asarray(train_pred_classes)
                train_ground_truths = np.asarray(train_ground_truths)
                train_accuracy, train_f1, train_precision, train_recall = get_acc_f1_precision_recall(
                    train_pred_classes, train_ground_truths)
                train_f1_score.append(train_f1)
                train_confusion_matrix = np.array_str(confusion_matrix(
                    train_ground_truths, train_pred_classes, labels=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]))
                print('Epoch: {} || Train_Acc: {} || Train_Loss: {}'.format(epoch, train_accuracy, epoch_loss))
                print(f'train: \n{train_confusion_matrix}')
                plot_epoch.append(epoch)

                # Set current loss to train loss for the history write
                train_loss = epoch_loss

        # Save weights
        model_name = save_weights(model, args, epoch, optimizer)

        # Write history after the train and validation phases
        write_history(
            args.history_path,
            model_name,
            train_loss,
            val_loss,
            train_accuracy,
            val_accuracy,
            train_f1,
            val_f1,
            train_precision,
            val_precision,
            train_recall,
            val_recall,
            train_confusion_matrix,
            val_confusion_matrix
        )

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Load best model weights
    model.load_state_dict(best_model_wts)
    return model, train_loss_history, val_loss_history, train_acc_history, val_acc_history, train_f1_score, val_f1_score, plot_epoch
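# --- Hedged illustration (not from the original repo) ---------------------------
# train_model above relies on a project helper, get_acc_f1_precision_recall, that
# is not shown here. The sketch below is one plausible implementation, assuming
# macro-averaged metrics via scikit-learn; the project's actual helper may differ.
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score


def get_acc_f1_precision_recall(pred_classes, ground_truths):
    """Return (accuracy, f1, precision, recall) for arrays of integer class labels."""
    acc = accuracy_score(ground_truths, pred_classes)
    f1 = f1_score(ground_truths, pred_classes, average='macro')
    precision = precision_score(ground_truths, pred_classes, average='macro', zero_division=0)
    recall = recall_score(ground_truths, pred_classes, average='macro', zero_division=0)
    return acc, f1, precision, recall


# Example: two of three predictions correct.
print(get_acc_f1_precision_recall(np.array([0, 1, 2]), np.array([0, 1, 1])))
# ---------------------------------------------------------------------------------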
for i in xrange(len(gl)):
    gl[i] /= num_batches_u
for i in xrange(len(cl)):
    cl[i] /= num_batches_u

if (epoch >= anneal_lr_epoch) and (epoch % anneal_lr_every_epoch == 0):
    lr = lr * anneal_lr_factor
    cla_lr *= anneal_lr_factor_cla

t = time.time() - start
line = "*Epoch=%d Time=%.2f LR=%.5f\n" % (epoch, t, lr) + \
       "DisLosses: " + str(dl) + "\nGenLosses: " + str(gl) + \
       "\nInfLosses: " + str(il) + "\nClaLosses: " + str(cl)
print(line)
with open(logfile, 'a') as f:
    f.write(line + "\n")

# Random generation for visualization
if epoch % vis_epoch == 0:
    import utils.paramgraphics as paramgraphics
    tail = '-' + str(epoch) + '.png'
    ran_y = np.int32(np.repeat(np.arange(num_classes), num_classes))
    x_gen = generate(ran_y)
    x_gen = x_gen.reshape((z_generated * num_classes, -1))
    image = paramgraphics.mat_to_img(x_gen.T, dim_input, colorImg=colorImg,
                                     scale=generation_scale,
                                     save_path=os.path.join(sample_path, 'sample' + tail))

if epoch % 200 == 0:
    from utils.checkpoints import save_weights
    params = ll.get_all_params(dis_layers + [classifier, ] + gen_layers + disxz_layers + inf_layers)
    save_weights(os.path.join(outfolder, 'model_epoch' + str(epoch) + '.npy'), params, None)
    save_weights(os.path.join(outfolder, 'average' + str(epoch) + '.npy'), cla_param_avg, None)
def train(config):
    # Set random seed to ensure identical network initializations.
    # Note that cuDNN's convolutions are nondeterministic, so this
    # does not guarantee that two networks will behave identically.
    lasagne.random.set_rng(np.random.RandomState(1234))

    # Load config file
    config_module = imp.load_source('config', config.model_definition)
    cfg = config_module.cfg

    # Get model
    model = config_module.get_model()

    # Compile functions
    log(config.log_file, 'Compiling theano functions...')
    test_function, test_vars, model = make_test_function(cfg, model, config)
    tfuncs, tvars, model = make_training_functions(cfg, model, config)
    tfuncs.update(test_function)
    tvars.update(test_vars)

    weights = config.weights
    if weights == -1:
        start_epoch = 0
    else:
        ld = config.log_dir
        WEIGHTS = config.weights
        ckptfile = os.path.join(ld, config.snapshot_prefix + str(WEIGHTS) + '.npz')
        log(config.log_file, 'Loaded weights.')
        start_epoch = WEIGHTS + 1
        ACC_LOGGER.load(
            (os.path.join(ld, "{}_acc_train_accuracy.csv".format(config.name)),
             os.path.join(ld, "{}_acc_eval_accuracy.csv".format(config.name))),
            epoch=WEIGHTS)
        LOSS_LOGGER.load(
            (os.path.join(ld, "{}_loss_train_loss.csv".format(config.name)),
             os.path.join(ld, '{}_loss_eval_loss.csv'.format(config.name))),
            epoch=WEIGHTS)
        metadata = checkpoints.load_weights(ckptfile, model['l_out'])

    itr = 0

    # Load data and shuffle training examples.
    # Note that this loads the entire dataset into RAM! If you don't
    # have a lot of RAM, consider only loading chunks of this at a time.
    log(config.log_file, 'Loading Data')
    x_test = np.load(os.path.join(config.data, 'test.npz'))['features']
    y_test = np.load(os.path.join(config.data, 'test.npz'))['targets']
    x = np.load(os.path.join(config.data, 'train.npz'))['features']

    # Seed the shuffle
    np.random.seed(42)
    # Define shuffle indices
    index = np.random.permutation(len(x))
    # Shuffle inputs
    x = x[index]
    # Shuffle targets to match inputs
    y = np.load(os.path.join(config.data, 'train.npz'))['targets'][index]

    # Define size of chunk to be loaded into GPU memory
    chunk_size = cfg['batch_size'] * cfg['batches_per_chunk']
    # Determine number of chunks
    num_chunks = int(math.ceil(len(y) / float(chunk_size)))
    # Get current learning rate
    new_lr = np.float32(tvars['learning_rate'].get_value())

    # Loop across training epochs!
    begin = start_epoch
    end = cfg['max_epochs'] + start_epoch
    log(config.log_file, 'Starting Training')
    for epoch in xrange(begin, end + 1):
        # Evaluate on the test set, then save and plot the loggers
        evaluate(x_test, y_test, cfg, tfuncs, tvars, config, epoch=epoch)
        ACC_LOGGER.save(config.log_dir)
        LOSS_LOGGER.save(config.log_dir)
        ACC_LOGGER.plot(dest=config.log_dir)
        LOSS_LOGGER.plot(dest=config.log_dir)

        # Update learning rate
        if isinstance(cfg['learning_rate'], dict) and epoch > 0:
            if any(x == epoch for x in cfg['learning_rate'].keys()):
                lr = np.float32(tvars['learning_rate'].get_value())
                new_lr = cfg['learning_rate'][epoch]
                log(config.log_file, 'Changing learning rate from {} to {}'.format(lr, new_lr))
                tvars['learning_rate'].set_value(np.float32(new_lr))
        if cfg['decay_rate'] and epoch > 0:
            lr = np.float32(tvars['learning_rate'].get_value())
            new_lr = lr * (1 - cfg['decay_rate'])
            log(config.log_file, 'Changing learning rate from {} to {}'.format(lr, new_lr))
            tvars['learning_rate'].set_value(np.float32(new_lr))

        # Loop across chunks!
        for chunk_index in xrange(num_chunks):
            # Define upper index of chunk to load.
            # If you start doing complicated things with data loading, consider
            # wrapping all of this into its own little function.
            upper_range = min(len(y), (chunk_index + 1) * chunk_size)

            # Get current chunk
            x_shared = np.asarray(x[chunk_index * chunk_size:upper_range, :, :, :, :], dtype=np.float32)
            y_shared = np.asarray(y[chunk_index * chunk_size:upper_range], dtype=np.float32)

            # Get repeatable seed to shuffle jittered and unjittered instances within the chunk.
            # Note that this seed varies between chunks, but will be constant across epochs.
            np.random.seed(chunk_index)
            # Get shuffled chunk indices for a second round of shuffling
            indices = np.random.permutation(2 * len(x_shared))
            # Get number of batches in this chunk
            num_batches = 2 * len(x_shared) // cfg['batch_size']

            # Combine data with jittered data, then shuffle, rescale the binary
            # voxels from {0, 1} to {-1, 3}, and load into GPU memory.
            tvars['X_shared'].set_value(
                4.0 * np.append(x_shared, jitter_chunk(x_shared, cfg, chunk_index), axis=0)[indices] - 1.0,
                borrow=True)
            tvars['y_shared'].set_value(np.append(y_shared, y_shared, axis=0)[indices], borrow=True)

            lvs, accs = [], []

            # Loop across batches!
            for bi in xrange(num_batches):
                [classifier_loss, class_acc] = tfuncs['update_iter'](bi)
                # Record batch loss and accuracy
                lvs.append(classifier_loss)
                accs.append(class_acc)
                # Update iteration counter
                itr += 1
                if itr % max(config.train_log_frq / config.batch_size, 1) == 0:
                    [closs, c_acc] = [float(np.mean(lvs)), 1.0 - float(np.mean(accs))]
                    ACC_LOGGER.log(c_acc, epoch, "train_accuracy")
                    LOSS_LOGGER.log(closs, epoch, "train_loss")
                    lvs, accs = [], []
                    log(config.log_file,
                        'TRAINING: epoch: {0:^3d}, itr: {1:d}, c_loss: {2:.6f}, class_acc: {3:.5f}'.format(
                            epoch, itr, closs, c_acc))

        if not (epoch % cfg['checkpoint_every_nth']) or epoch == end:
            weights_fname = os.path.join(config.log_dir, config.snapshot_prefix + str(epoch))
            checkpoints.save_weights(weights_fname, model['l_out'],
                                     {'itr': itr, 'ts': time.time(), 'learning_rate': new_lr})

    log(config.log_file, 'Training done')
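# --- Hedged illustration (not from the original repo) ---------------------------
# Standalone sketch of the chunked-loading arithmetic and the 4*x - 1 rescaling
# used in the chunk loop above ({0,1} voxels become {-1,3}). The sizes are made
# up, and jitter_chunk is replaced by a plain copy purely for illustration.
import math
import numpy as np

batch_size, batches_per_chunk = 4, 8
chunk_size = batch_size * batches_per_chunk               # examples pushed to the GPU at once
x = (np.random.rand(100, 1, 32, 32, 32) > 0.5).astype(np.float32)  # fake binary voxel grids
y = np.random.randint(0, 10, size=100).astype(np.float32)

num_chunks = int(math.ceil(len(y) / float(chunk_size)))
for chunk_index in range(num_chunks):
    upper_range = min(len(y), (chunk_index + 1) * chunk_size)
    x_chunk = x[chunk_index * chunk_size:upper_range]
    y_chunk = y[chunk_index * chunk_size:upper_range]

    # Double the chunk (original + "jittered" copy), shuffle, then rescale.
    doubled = np.append(x_chunk, x_chunk.copy(), axis=0)
    indices = np.random.permutation(len(doubled))
    x_gpu = 4.0 * doubled[indices] - 1.0
    y_gpu = np.append(y_chunk, y_chunk, axis=0)[indices]
    assert set(np.unique(x_gpu)) <= {-1.0, 3.0}
# ---------------------------------------------------------------------------------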
def main(args):
    # Load config file
    config_module = imp.load_source('config', args.config_path)
    cfg = config_module.cfg

    # Define weights file name
    weights_fname = str(args.config_path)[:-3] + '.npz'

    # Define training metrics filename
    metrics_fname = weights_fname[:-4] + 'METRICS.jsonl'

    # Prepare logs
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s| %(message)s')
    logging.info('Metrics will be saved to {}'.format(metrics_fname))
    mlog = metrics_logging.MetricsLogger(metrics_fname, reinitialize=True)

    # Get model and compile theano functions
    model = config_module.get_model()
    logging.info('Compiling theano functions...')
    tfuncs, tvars = make_training_functions(cfg, model)

    logging.info('Training...')

    # Iteration counter. One iteration corresponds to one minibatch.
    itr = 0
    # Best true-positive rate
    best_tp = 0

    for epoch in xrange(cfg['max_epochs']):
        # Prepare data loader
        loader = data_loader(cfg, args.train_file)

        # Update learning rate. Note that this version of the function does not
        # support a decay rate; see other training files in the discriminative
        # section for that.
        if isinstance(cfg['learning_rate'], dict) and epoch > 0:
            if any(x == epoch for x in cfg['learning_rate'].keys()):
                lr = np.float32(tvars['learning_rate'].get_value())
                new_lr = cfg['learning_rate'][epoch]
                logging.info('Changing learning rate from {} to {}'.format(lr, new_lr))
                tvars['learning_rate'].set_value(np.float32(new_lr))

        # Initialize epoch-wise chunk counter
        iter_counter = 0

        # Initialize epoch-wise metrics
        vloss_e, floss_e, closs_e, d_kl_e, c_acc_e, acc_e = 0, 0, 0, 0, 0, 0

        # Train!
        for x_shared, y_shared in loader:  # Loop across chunks
            # Increment chunk counter
            iter_counter += 1

            # Determine number of batches in this chunk; this should only differ from
            # cfg['batches_per_chunk'] if we're at the end of the dataset.
            num_batches = len(x_shared) // cfg['batch_size']

            # Load chunk into memory
            tvars['X_shared'].set_value(x_shared, borrow=True)
            tvars['y_shared'].set_value(y_shared, borrow=True)

            # Initialize chunk-wise metrics
            voxel_lvs, feature_lvs, class_lvs, kl_divs, class_accs, accs = [], [], [], [], [], []

            for bi in xrange(num_batches):  # Loop across batches within chunk
                # Update!
                results = tfuncs['update_iter'](bi)

                # Assign results. The layout of the results list depends on whether
                # the introspective and discriminative terms are enabled.
                voxel_loss = results[0]
                feature_loss = results[1] if cfg['introspect'] else 0
                classifier_loss = results[1 + cfg['introspect']] if cfg['discriminative'] else 0
                kl_div = results[1 + cfg['introspect'] + cfg['discriminative']]
                class_acc = results[2 + cfg['introspect'] + cfg['discriminative']] if cfg['discriminative'] else 0
                acc = results[2 + cfg['introspect'] + 2 * cfg['discriminative']]

                # Append results to the chunk-wise result lists; these are averaged later.
                voxel_lvs.append(voxel_loss)
                feature_lvs.append(feature_loss)
                class_lvs.append(classifier_loss)
                kl_divs.append(kl_div)
                class_accs.append(class_acc)
                accs.append(acc)

                # Increment batch counter
                itr += 1

            # Average metrics across the chunk
            [vloss, floss, closs, d_kl, c_acc, acc] = [
                float(np.mean(voxel_lvs)), float(np.mean(feature_lvs)),
                float(np.mean(class_lvs)), float(np.mean(kl_divs)),
                1.0 - float(np.mean(class_accs)), 1.0 - float(np.mean(accs))
            ]

            # Update epoch-wise metrics
            vloss_e, floss_e, closs_e, d_kl_e, c_acc_e, acc_e = [
                vloss_e + vloss, floss_e + floss, closs_e + closs,
                d_kl_e + d_kl, c_acc_e + c_acc, acc_e + acc
            ]

            # Report and log chunk-wise metrics
            logging.info(
                'epoch: {}, itr: {}, v_loss: {}, f_loss: {}, c_loss: {}, D_kl: {}, class_acc: {}, acc: {}'
                .format(epoch, itr, vloss, floss, closs, d_kl, c_acc, acc))
            mlog.log(epoch=epoch, itr=itr, vloss=vloss, floss=floss,
                     acc=acc, d_kl=d_kl, c_acc=c_acc)

        # Average metrics across the epoch
        vloss_e, floss_e, closs_e, d_kl_e, c_acc_e, acc_e = [
            vloss_e / iter_counter, floss_e / iter_counter, closs_e / iter_counter,
            d_kl_e / iter_counter, c_acc_e / iter_counter, acc_e / iter_counter
        ]

        # Report and log epoch-wise metrics
        logging.info(
            'Training metrics, Epoch {}, v_loss: {}, f_loss: {}, c_loss: {}, D_kl: {}, class_acc: {}, acc: {}'
            .format(epoch, vloss_e, floss_e, closs_e, d_kl_e, c_acc_e, acc_e))
        mlog.log(epoch=epoch, vloss_e=vloss_e, floss_e=floss_e, closs_e=closs_e,
                 d_kl_e=d_kl_e, c_acc_e=c_acc_e, acc_e=acc_e)

        # Every Nth epoch, save weights
        if not (epoch % cfg['checkpoint_every_nth']):
            checkpoints.save_weights(weights_fname, model['l_out'],
                                     {'itr': itr, 'ts': time.time()})

    # When training is complete, check test performance
    test_loader = test_data_loader(cfg, 'shapenet10_test_nr.tar')
    logging.info('Examining performance on test set')

    # Initialize test metrics
    test_error, test_class_error, latent_values, tp, tn = [], [], [], [], []

    # Initialize true class array for 2D manifold plots
    true_class = np.array([], dtype=np.int)

    for x_shared, y_shared in test_loader:  # Loop across test chunks
        # Calculate number of batches
        num_batches = len(x_shared) // cfg['batch_size']

        # Load test chunk into memory
        tvars['X_shared'].set_value(x_shared, borrow=True)
        tvars['y_shared'].set_value(y_shared, borrow=True)

        # Update true class array for 2D manifold plots
        true_class = np.append(true_class, np.argmax(y_shared, axis=1))

        for bi in xrange(num_batches):  # Loop across minibatches
            # Get test results
            test_results = tfuncs['test_function'](bi)

            # Assign test results; the offsets depend on cfg['discriminative']
            batch_test_error = test_results[0]
            batch_test_class_error = test_results[1] if cfg['discriminative'] else 0
            latents = test_results[1 + cfg['discriminative']]
            batch_tp = test_results[2 + cfg['discriminative']]
            batch_tn = test_results[3 + cfg['discriminative']]

            test_error.append(batch_test_error)
            test_class_error.append(batch_test_class_error)
            latent_values.append(latents)
            tp.append(batch_tp)
            tn.append(batch_tn)

    # Average results
    t_error = 1 - float(np.mean(test_error))
    true_positives = float(np.mean(tp))
    true_negatives = float(np.mean(tn))
    t_class_error = 1 - float(np.mean(test_class_error))
    Zs = np.asarray(latent_values, np.float32)

    # Report and log results
    logging.info(
        'Test Accuracy: {}, Classification Test Accuracy: {}, True Positives: {}, True Negatives: {}'
        .format(t_error, t_class_error, true_positives, true_negatives))
    mlog.log(test_error=t_error, t_class_error=t_class_error,
             true_positives=true_positives, true_negatives=true_negatives)

    # Optionally plot and save the 2D manifold if using only 2 latent variables
    if np.shape(Zs)[2] == 2:
        Zs = np.reshape(Zs, (np.shape(Zs)[0] * np.shape(Zs)[1], 1, 2))
        ygnd = np.asarray(true_class, np.int)
        plt.scatter(Zs[:, 0, 0], Zs[:, 0, 1], s=30, c=ygnd, alpha=0.5)
        plt.savefig('figs/' + weights_fname[:-4] + str(epoch) + '.png')
        plt.clf()

    logging.info('training done')

    checkpoints.save_weights(weights_fname, model['l_out'],
                             {'itr': itr, 'ts': time.time()})
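# --- Hedged illustration (not from the original repo) ---------------------------
# The result-list indexing in the update loop above shifts with the
# cfg['introspect'] and cfg['discriminative'] flags. This toy unpacker mirrors
# that arithmetic; the element layout is inferred from the indexing, not taken
# from the original code.
def unpack_results(results, introspect, discriminative):
    voxel_loss = results[0]
    feature_loss = results[1] if introspect else 0
    classifier_loss = results[1 + introspect] if discriminative else 0
    kl_div = results[1 + introspect + discriminative]
    class_acc = results[2 + introspect + discriminative] if discriminative else 0
    acc = results[2 + introspect + 2 * discriminative]
    return voxel_loss, feature_loss, classifier_loss, kl_div, class_acc, acc


# Both flags on: a 6-element result list unpacks positionally.
print(unpack_results(['vox', 'feat', 'cls', 'kl', 'cls_err', 'err'], True, True))
# Both flags off: only voxel loss, KL divergence, and reconstruction error remain.
print(unpack_results(['vox', 'kl', 'err'], False, False))
# ---------------------------------------------------------------------------------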
def main(args):
    # Set random seed to ensure identical network initializations.
    # Note that cuDNN's convolutions are nondeterministic, so this
    # does not guarantee that two networks will behave identically.
    lasagne.random.set_rng(np.random.RandomState(1234))

    # Load config file
    config_module = imp.load_source('config', args.config_path)
    cfg = config_module.cfg

    # Get weights and metrics filenames
    weights_fname = str(args.config_path)[:-3] + '.npz'
    metrics_fname = weights_fname[:-4] + 'METRICS.jsonl'

    # Prepare logs
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s| %(message)s')
    logging.info('Metrics will be saved to {}'.format(metrics_fname))
    mlog = metrics_logging.MetricsLogger(metrics_fname, reinitialize=(not args.resume))

    # Get model
    model = config_module.get_model()

    # Compile functions
    logging.info('Compiling theano functions...')
    tfuncs, tvars, model = make_training_functions(cfg, model)

    # Resume training if the weights file exists and the resume flag is set
    if os.path.isfile(weights_fname) and args.resume:
        print('loading weights')
        metadata = checkpoints.load_weights(weights_fname, model['l_out'])

    # GPU memory info; currently not used, but you can potentially
    # use this information to monitor GPU memory usage.
    baseGPUmem = sbcuda.cuda_ndarray.cuda_ndarray.mem_info()[0] / 1024. / 1024 / 1024

    # Training loop
    logging.info('Training...')
    itr = 0

    # Load data and shuffle training examples.
    # Note that this loads the entire dataset into RAM! If you don't
    # have a lot of RAM, consider only loading chunks of this at a time.
    x = np.load(args.data_path)['features']

    # Seed the shuffle
    np.random.seed(42)
    # Define shuffle indices
    index = np.random.permutation(len(x))
    # Shuffle inputs
    x = x[index]
    # Shuffle targets to match inputs
    y = np.load(args.data_path)['targets'][index]

    # Define size of chunk to be loaded into GPU memory
    chunk_size = cfg['batch_size'] * cfg['batches_per_chunk']
    # Determine number of chunks
    num_chunks = int(math.ceil(len(y) / float(chunk_size)))
    # Get current learning rate
    new_lr = np.float32(tvars['learning_rate'].get_value())

    # Loop across training epochs!
    for epoch in xrange(cfg['max_epochs']):
        # Tic
        epoch_start_time = time.time()

        # Update learning rate
        if isinstance(cfg['learning_rate'], dict) and epoch > 0:
            if any(x == epoch for x in cfg['learning_rate'].keys()):
                lr = np.float32(tvars['learning_rate'].get_value())
                new_lr = cfg['learning_rate'][epoch]
                logging.info('Changing learning rate from {} to {}'.format(lr, new_lr))
                tvars['learning_rate'].set_value(np.float32(new_lr))
        if cfg['decay_rate'] and epoch > 0:
            lr = np.float32(tvars['learning_rate'].get_value())
            new_lr = lr * (1 - cfg['decay_rate'])
            logging.info('Changing learning rate from {} to {}'.format(lr, new_lr))
            tvars['learning_rate'].set_value(np.float32(new_lr))

        # Loop across chunks!
        for chunk_index in xrange(num_chunks):
            # Define upper index of chunk to load.
            # If you start doing complicated things with data loading, consider
            # wrapping all of this into its own little function.
            upper_range = min(len(y), (chunk_index + 1) * chunk_size)

            # Get current chunk
            x_shared = np.asarray(x[chunk_index * chunk_size:upper_range, :, :, :, :], dtype=np.float32)
            y_shared = np.asarray(y[chunk_index * chunk_size:upper_range], dtype=np.float32)

            # Get repeatable seed to shuffle jittered and unjittered instances within the chunk.
            # Note that this seed varies between chunks, but will be constant across epochs.
            np.random.seed(chunk_index)
            # Get shuffled chunk indices for a second round of shuffling
            indices = np.random.permutation(2 * len(x_shared))
            # Get number of batches in this chunk
            num_batches = 2 * len(x_shared) // cfg['batch_size']

            # Combine data with jittered data, then shuffle, rescale the binary
            # voxels from {0, 1} to {-1, 3}, and load into GPU memory.
            tvars['X_shared'].set_value(
                4.0 * np.append(x_shared, jitter_chunk(x_shared, cfg, chunk_index), axis=0)[indices] - 1.0,
                borrow=True)
            tvars['y_shared'].set_value(np.append(y_shared, y_shared, axis=0)[indices], borrow=True)

            # Prepare loss values
            lvs, accs = [], []

            # Loop across batches!
            for bi in xrange(num_batches):
                # Train!
                [classifier_loss, class_acc] = tfuncs['update_iter'](bi)
                # Record batch loss and accuracy
                lvs.append(classifier_loss)
                accs.append(class_acc)
                # Update iteration counter
                itr += 1

            # Average losses and accuracies across the chunk
            [closs, c_acc] = [float(np.mean(lvs)), 1.0 - float(np.mean(accs))]

            # Report and log losses and accuracies
            logging.info('epoch: {0:^3d}, itr: {1:d}, c_loss: {2:.6f}, class_acc: {3:.5f}'.format(
                epoch, itr, closs, c_acc))
            mlog.log(epoch=epoch, itr=itr, closs=closs, c_acc=c_acc)

        # Every Nth epoch, save weights
        if not (epoch % cfg['checkpoint_every_nth']):
            checkpoints.save_weights(weights_fname, model['l_out'],
                                     {'itr': itr, 'ts': time.time(), 'learning_rate': new_lr})

    logging.info('training done')
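# --- Hedged illustration (not from the original repo) ---------------------------
# The per-epoch update new_lr = lr * (1 - cfg['decay_rate']) above is an
# exponential schedule: after N epochs, lr_N = lr_0 * (1 - decay_rate) ** N.
# The numbers below are made up for the example.
lr0, decay_rate, epochs = 0.002, 0.003, 80
lr = lr0
for _ in range(epochs):
    lr *= (1 - decay_rate)
print(round(lr, 6), round(lr0 * (1 - decay_rate) ** epochs, 6))  # both ~0.001573
# ---------------------------------------------------------------------------------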