def _init_dataset(self):
    """Build the train/val datasets and their DataLoaders from self.cfg.

    Side effects: sets self.batch_size, self.train_queue, self.valid_queue.
    """
    trainset = get_dataset(mode='train')
    valset = get_dataset(mode='val')
    print('No. of train images: ', len(trainset))
    print('No. of val images: ', len(valset))
    self.batch_size = self.cfg['training']['batch_size']
    # Honor the configured worker count. The original code built this dict
    # from the config and then ignored it, hard-coding num_workers=8 and
    # pin_memory=True in both loaders.
    kwargs = {
        'num_workers': self.cfg['training']['n_workers'],
        'pin_memory': True
    }
    self.train_queue = data.DataLoader(trainset,
                                       batch_size=self.batch_size,
                                       shuffle=True,
                                       drop_last=False,
                                       **kwargs)
    self.valid_queue = data.DataLoader(valset,
                                       batch_size=self.batch_size,
                                       drop_last=False,
                                       **kwargs)
def main(args):
    """Train a Seq2Seq model end-to-end.

    Loads pickled encoder/decoder vocabularies, builds train/val iterators,
    constructs Encoder/Decoder/Seq2Seq on CUDA, then trains for args.epochs,
    saving the state dict whenever validation loss improves.
    """
    # Is GPU usable?  Hard requirement: the model is moved to CUDA below.
    assert torch.cuda.is_available()
    # Load encoder/decoder vocabularies.
    logger.debug("Loading Vocabulary...")
    # encoder vocabulary
    with open(args.en_vocab_path, "rb") as f:
        en_vocab = pickle.load(f)
    logger.debug("Encoder vocab size: {}".format(len(en_vocab)))
    # decoder vocabulary
    with open(args.de_vocab_path, "rb") as f:
        de_vocab = pickle.load(f)
    logger.debug("Decoder vocab size: {}".format(len(de_vocab)))
    en_size, de_size = len(en_vocab), len(de_vocab)
    logger.debug("[source_vocab]:%d [target_vocab]:%d" % (en_size, de_size))
    # Set up train and val dataloaders.
    logger.debug("Preparing dataset...")
    train_iter = get_dataset(args.train_path, en_vocab, de_vocab,
                             args.batch_size, args.shuffle, args.num_workers)
    val_iter = get_dataset(args.val_path, en_vocab, de_vocab, args.batch_size,
                           args.shuffle, args.num_workers)
    # Build the seq2seq model.
    logger.debug("Instantiating models...")
    encoder = Encoder(en_size, args.embed_dim, args.hidden_dim,
                      n_layers=args.en_n_layers, dropout=args.en_dropout)
    decoder = Decoder(args.embed_dim, args.hidden_dim, de_size,
                      n_layers=args.de_n_layers, dropout=args.de_dropout)
    seq2seq = Seq2Seq(encoder, decoder).cuda()
    # Optionally warm-start from a previously saved state dict.
    if args.pre_trained_path is not None:
        seq2seq.load_state_dict(torch.load(args.pre_trained_path))
        logger.debug("Load pre trained model: {0}".format(
            args.pre_trained_path))
    optimizer = optim.Adam(seq2seq.parameters(), lr=args.lr)
    logger.debug(seq2seq)
    # Training and validation loop.
    best_val_loss = None
    for epoch in range(1, args.epochs + 1):
        train(epoch, seq2seq, optimizer, train_iter, de_size, args.grad_clip,
              en_vocab, de_vocab)
        val_loss = evaluate(seq2seq, val_iter, de_size, en_vocab, de_vocab)
        logger.debug("VAL LOSS: {0:.5f} (epoch={1})".format(val_loss, epoch))
        # Save the model if the validation loss is the best we've seen so far.
        if (best_val_loss is None) or (val_loss < best_val_loss):
            logger.debug("save model (epoch={0})".format(epoch))
            torch.save(seq2seq.state_dict(), args.save_model_path)
            best_val_loss = val_loss
def train():
    """Train the Siamese network, logging summaries and checkpointing."""
    train_ds = data.get_dataset(train=True)
    next_batch = train_ds.make_one_shot_iterator().get_next()

    net = Siamese()
    step_op = tf.train.AdamOptimizer(FLAGS.lr).minimize(net.loss)

    # TensorBoard summaries for loss and accuracy.
    tf.summary.scalar('loss', net.loss)
    tf.summary.scalar('acc', net.accuracy)
    summary_op = tf.summary.merge_all()
    saver = tf.train.Saver()

    with tf.Session() as sess:
        writer = tf.summary.FileWriter(FLAGS.summaries_dir, sess.graph)
        sess.run(tf.global_variables_initializer())
        for step in trange(FLAGS.n_iters):
            x1, x2, y = sess.run(next_batch)
            feed = {
                net.x1: x1,
                net.x2: x2,
                net.y: y,
            }
            _, loss, summary = sess.run([step_op, net.loss, summary_op],
                                        feed_dict=feed)
            assert not np.isnan(loss), 'Model diverged with loss = NaN'
            writer.add_summary(summary, step)
            # Periodic checkpoint every 1000 iterations.
            if step % 1000 == 0:
                saver.save(sess, FLAGS.model_path)
        print('Training completed, model saved:',
              saver.save(sess, FLAGS.model_path))
def run(self, use_gpu, learners, params_grid, dataset_dir, result_file,
        out_dir):
    """Run every learner over the parameter grid, recording results.

    Already-evaluated (hash-matched) combinations are skipped; failures are
    caught and reported so the sweep continues.
    """
    dataset = get_dataset(self.name, dataset_dir)
    device_type = 'GPU' if use_gpu else 'CPU'
    for learner_cls in learners:
        learner = learner_cls(dataset, self.task, self.metric, use_gpu)
        algorithm_name = learner.name() + '-' + device_type
        print('Started to train ' + algorithm_name)
        for params in ParameterGrid(params_grid):
            params_str = params_to_str(params)
            print(params_str)
            log_file = os.path.join(out_dir, self.name, algorithm_name,
                                    params_str + '.log')
            # Skip combinations that are already present in the result file.
            hash_id = Track.hash(self.name, algorithm_name, self.task,
                                 params_str)
            if check_exists(hash_id, result_file):
                print('Skipped: already evaluated')
                continue
            try:
                elapsed = learner.run(params, log_file)
                print('Timing: ' + str(elapsed) + ' sec')
                track = parse_log(algorithm_name, self.name, self.task,
                                  params_str, log_file, params['iterations'])
                update_result_file(track, result_file)
            except Exception as e:
                print('Exception during training: ' + repr(e))
def plot_principal_component_examples(run_id,
                                      dataset_name,
                                      analysis_tag="",
                                      dset_split="validation",
                                      dset_mode='color_mask_crop',
                                      n_samples_per_component=15,
                                      truncate_n=20,
                                      show_metadata=False):
    """Plot grids of example images for each PCA component.

    Loads precomputed per-component example lists from
    {analysis_id}_{dset_split}_pca_component_examples.json and, for the first
    `truncate_n` components, shows `n_samples_per_component` examples sampled
    evenly along the coefficient-sorted axis.
    """
    analysis_id = run_id if not analysis_tag else "{}_{}".format(
        run_id, analysis_tag)
    pca_examples_savepath = os.path.join(
        cfg.PREDS_DIR,
        "{}_{}_pca_component_examples.json".format(analysis_id, dset_split))
    print("Loading PCA axis examples from: {}".format(pca_examples_savepath))
    with open(pca_examples_savepath) as infile:
        component_examples = json.load(infile)
    ds = get_dataset(dataset_name, dset_mode=dset_mode, one_sample_only=True)
    # get_dataset returns a (train, val) pair — pick the requested split.
    ds = ds[0] if dset_split == "train" else ds[1]

    def _load_image(imageid):
        # Image loader callback handed to the plotting helper.
        return ds.visualize_item(imageid)

    im_grid = []
    metadata_grid = []
    metadata_rows = []
    for component_data in component_examples[:truncate_n]:
        title = "Component {} (Var explained: {}. Sing. value: {})".format(
            component_data['component_i'],
            round(component_data['explained_variance_ratio'], 2),
            round(component_data['singular_value'], 2))
        metadata_rows.append(title)
        examples = component_data['samples_sorted']
        # Evenly spaced picks across the sorted example list.
        indices_to_sample = np.round(
            np.linspace(0, len(examples) - 1, n_samples_per_component))
        examples = [examples[int(i)] for i in indices_to_sample]
        row_images = []
        row_metadata = []
        for elt in examples:
            # Each example is a (coefficient, image-id) pair.
            coeff, imid = elt
            row_images.append(imid)
            elt_title = "{} ({})".format(
                imid, coeff) if show_metadata else str(round(coeff, 2))
            row_metadata.append(elt_title)
        im_grid.append(row_images)
        metadata_grid.append(row_metadata)
    img_rows_plt(rows=im_grid,
                 metadata=metadata_grid if show_metadata else None,
                 im_load_func=_load_image,
                 row_metadata=metadata_rows if show_metadata else None)
def data():
    """Split the dataset 85/15 into train/test and replicate train 10-fold."""
    labels, images = get_dataset()
    split = train_test_split(images, labels, test_size=0.15)
    X_train, X_test, Y_train, Y_test = split
    # Duplicate every training sample (and its label) 10 times.
    X_train, Y_train = (np.repeat(arr, 10, axis=0)
                        for arr in (X_train, Y_train))
    return X_train, Y_train, X_test, Y_test
def main(cfg):
    """Build data loaders and the model from cfg, then train or test.

    cfg['mode'] == 'train': train+val loaders, Adam optimizer, checkpoint
    tracking both model and optimizer, then train().
    Any other mode: batch-size-1 test loader, model-only checkpoint, test().
    """
    # The original tested cfg['mode'] == 'train' three separate times; the
    # whole flow is a single two-way branch. The dead `optimizer = None`
    # assignment in the test path (never read) is dropped.
    if cfg['mode'] == 'train':
        train_dataset = get_dataset(mode=cfg['mode'], cfg=cfg)
        val_dataset = get_dataset(mode='val', cfg=cfg)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=cfg['train']['batch_size'],
            num_workers=8,
            shuffle=True,
            collate_fn=collate_remove_none,
            worker_init_fn=worker_init_fn)
        val_loader = torch.utils.data.DataLoader(
            val_dataset,
            batch_size=cfg['val']['batch_size'],
            num_workers=8,
            shuffle=False,
            collate_fn=collate_remove_none,
            worker_init_fn=worker_init_fn)
        model = get_network(cfg, device='cuda:0', dataset=train_dataset)
        optimizer = optim.Adam(model.parameters(), lr=1e-4)
        checkpoint = CheckpointIO(cfg['out']['checkpoint_dir'],
                                  model=model,
                                  optimizer=optimizer)
        load_dict = checkpoint.load(cfg['train']['pretrained'])
        train(train_loader, val_loader, model, optimizer, checkpoint, cfg)
    else:
        test_dataset = get_dataset(mode=cfg['mode'], cfg=cfg, return_idx=True)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=1,
                                                  num_workers=4,
                                                  shuffle=False)
        model = get_network(cfg, device='cuda:0', dataset=test_dataset)
        checkpoint = CheckpointIO(cfg['out']['checkpoint_dir'], model=model)
        load_dict = checkpoint.load(cfg['test']['pretrained'])
        test(test_loader, test_dataset, model, cfg)
def get_data(train_or_test):
    """Return the batched (and, for training, augmented+prefetched) dataflow."""
    is_train = train_or_test == 'train'
    ds = data_loader.get_dataset(train_or_test)
    normalize = imgaug.MeanVarianceNormalize(all_channel=True)
    if is_train:
        # Photometric jitter before normalization during training.
        augmentors = [
            imgaug.Brightness(15),
            imgaug.Contrast((0.8, 1.2)),
            normalize,
        ]
    else:
        augmentors = [normalize]
    ds = AugmentImageComponent(ds, augmentors)
    # Keep the smaller remainder batch only when evaluating.
    ds = BatchData(ds, BATCH_SIZE, remainder=not is_train)
    if is_train:
        ds = PrefetchData(ds, PREFETCH_SIZE, NR_PROC)
    return ds
def get_data(train_or_test):
    """Build the dataflow for one split; train gets augmentation + prefetch."""
    training = (train_or_test == 'train')
    flow = data_loader.get_dataset(train_or_test)
    # Normalization applies to both splits; jitter only to training.
    augs = [imgaug.MeanVarianceNormalize(all_channel=True)]
    if training:
        augs = [imgaug.Brightness(15), imgaug.Contrast((0.8, 1.2))] + augs
    flow = AugmentImageComponent(flow, augs)
    # Evaluation keeps the final partial batch; training drops it.
    flow = BatchData(flow, BATCH_SIZE, remainder=not training)
    if training:
        flow = PrefetchData(flow, PREFETCH_SIZE, NR_PROC)
    return flow
def run(self, use_gpu, learners, params_grid, dataset_dir, out_dir):
    """Train each learner across the parameter grid, printing timings.

    Exceptions from individual runs are caught and reported so the sweep
    continues with the next parameter set.
    """
    dataset = get_dataset(self.name, dataset_dir)
    device_type = 'GPU' if use_gpu else 'CPU'
    for learner_cls in learners:
        learner = learner_cls(dataset, self.task, self.metric, use_gpu)
        algorithm_name = learner.name() + '-' + device_type
        print('Started to train ' + algorithm_name)
        # Log directory is the same for every grid point of this learner.
        log_dir_name = os.path.join(out_dir, self.name, algorithm_name)
        for params in ParameterGrid(params_grid):
            print(params)
            try:
                elapsed = learner.run(params, log_dir_name)
                print('Timing: ' + str(elapsed) + ' sec')
            except Exception as e:
                print('Exception during training: ' + repr(e))
def plot_nearest_neighbors(run_id,
                           dataset_name,
                           analysis_tag="",
                           dset_split="validation",
                           dset_mode='color_mask_crop',
                           subset_n=100,
                           n_to_show=20,
                           plot_metadata=True):
    """Plot image grids of precomputed nearest neighbors.

    Loads {analysis_id}_{dset_split}_{subset_n}_nearest_neighbors.json and
    shows one row per probe item for the first `n_to_show` probes.
    """
    analysis_id = run_id if not analysis_tag else "{}_{}".format(
        run_id, analysis_tag)
    nn_savepath = os.path.join(
        cfg.PREDS_DIR,
        "{}_{}_{}_nearest_neighbors.json".format(analysis_id, dset_split,
                                                 subset_n))
    print("Loading nearest neighbor data from: {}".format(nn_savepath))
    with open(nn_savepath) as infile:
        nn_data = json.load(infile)
    # Keep only the first n_to_show probe items (JSON preserves insertion order).
    ks = list(nn_data.keys())[:n_to_show]
    nn_data = {k: nn_data[k] for k in ks}
    ds = get_dataset(dataset_name, dset_mode=dset_mode, one_sample_only=True)
    # get_dataset returns a (train, val) pair — pick the requested split.
    ds = ds[0] if dset_split == "train" else ds[1]

    def _load_image(imageid):
        # Image loader callback handed to the plotting helper.
        return ds.visualize_item(imageid)

    im_grid = []
    metadata_grid = []
    for probe_item, neighbor_data in nn_data.items():
        # Each neighbor entry is indexed as (image-id, score); presumably the
        # score is a distance/similarity — TODO confirm against the producer.
        row = [elt[0] for elt in neighbor_data]
        row_metadata = [
            "{}: {}".format(elt[0], elt[1]) for elt in neighbor_data
        ]
        im_grid.append(row)
        metadata_grid.append(row_metadata)
    img_rows_plt(rows=im_grid,
                 metadata=metadata_grid if plot_metadata else None,
                 im_load_func=_load_image)
def get_default_test_loader():
    """Return a DataLoader over the last 20% of the dataset (test split)."""
    dataset = get_dataset()
    # 80/20 train-test boundary.
    split = int(0.8 * len(dataset.data))
    # Wrap the held-out slice as torch tensors.
    features = torch.tensor(dataset.data[split:]).float()
    targets = torch.tensor(dataset.target[split:]).long()
    test = data_utils.TensorDataset(features, targets)
    return data_utils.DataLoader(
        test,                 # dataset to load from
        batch_size=BATCH_SIZE,  # examples per batch
        shuffle=False,
        sampler=None,         # a sampler would require shuffle=False
        num_workers=5,        # loader subprocesses
        pin_memory=False)     # no GPU pinning
epoch_loss = running_loss / len(test_Y) epoch_acc = running_corrects.double() / len(test_Y) finaltestletteracc = finaltestletteracc + len(test_Y) * epoch_acc if i_batch % 25 == 0: print("Letter accuracy =", epoch_acc) wtestingepoc.append(wordaccuracies(testpredictedletters, testactualletters)) testingepoc.append(finaltestletteracc / len(test)) print("Testing acc = :", finaltestletteracc / len(test)) #word accuracies function #gettinig data using DataLoader class , modified code dataset = get_dataset() #word accuracies function based on letters, required dataset.nextletter info def wordaccuracies(pred, actual): incorrectwords = 0 totalwords = 0 flag = True for i in range(len(pred)): if pred[i] != actual[i]: flag = False if dataset.nextletter[split + i] == -1: if flag == False: incorrectwords += 1
# --- linear-evaluation setup (script top level) ---
print_status(torch.cuda.device_count())
print_status('Using CUDA..')

best_acc = 0  # best test accuracy
start_epoch = 0  # start from epoch 0 or last checkpoint epoch

# Seed only when a non-zero seed was requested.
if args.seed != 0:
    torch.manual_seed(args.seed)

# Data
print_status('==> Preparing data..')
if not (args.train_type == 'linear_eval'):
    # NOTE(review): asserting a non-empty string literal always passes, so
    # this branch never aborts — presumably `assert False, 'wrong train
    # phase...'` or a raise was intended. Left as-is (doc-only change).
    assert ('wrong train phase...')
else:
    trainloader, traindst, testloader, testdst = data_loader.get_dataset(args)

# Output dimensionality per dataset.
if args.dataset == 'cifar-10' or args.dataset == 'mnist':
    num_outputs = 10
elif args.dataset == 'cifar-100':
    num_outputs = 100

# ResNet50 bottleneck blocks expand channels 4x; other models don't.
if args.model == 'ResNet50':
    expansion = 4
else:
    expansion = 1

# Model
print_status('==> Building model..')
train_type = args.train_type
writer.add_pr_curve('PR_Curve/test', np.asarray(y_actual_test), np.asarray(y_pred_test)) print( f"Test - Loss: {total_loss}, Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1-score: {f1}, ROC_AUC: {roc_auc}, PRC_AUC: {prc_auc}, PCC: {pcc}, Sensitivity: {sensitivity}, PPV: {PPV}, SRCC: {srcc}" ) if epoch % config.ckpt_num == 0: torch.save(model.state_dict(), config.model_name) if __name__ == "__main__": torch.manual_seed(3) # for reproducibility device = config.device epochs = config.epochs dataset_cls, train_loader, val_loader, test_loader, peptide_embedding, mhc_embedding = get_dataset( device) model = MHCAttnNet(peptide_embedding, mhc_embedding) # model.load_state_dict(torch.load(config.model_name)) model.to(device) print(model) print('Total parameters', sum(p.numel() for p in model.parameters())) print('Trainable parameters', sum(p.numel() for p in model.parameters() if p.requires_grad)) loss_fn = nn.CrossEntropyLoss() optimizer = optim.Adam(model.parameters()) fit(model=model, train_dl=train_loader, val_dl=val_loader, test_dl=test_loader, loss_fn=loss_fn,
def run(
    model_name='anp',  # choices = ['anp', 'np']
    dataset_name='apollo.npy',
    mask_fname='apollo_train_mask',
    window_size=100,
    sample_size=100,  # set -1 to run vanilla neural processes
    sample_scale_sq=31.25,
    emb_dim=512,
    eps=1e-3,
    fix_eps=150,
    mask_size=5,
    use_rotation_aug=True,
    use_scaling_aug=True,
    learning_rate=1e-4,
    batch_size=512,
    max_epoch=500,
    epoch_split=10,
    # FOR TRAIN
    model_path='',  # model path
    recon_nodata=False,  # reconstruct on no-data gaps
    # FOR RECONSTRUCTION
    # model_path='anp_{EXPERIMENT_ID}.pth',  # model path
    # recon_nodata=True,  # reconstruct on no-data gaps
):
    """Train (or run reconstruction with) a neural-process model.

    Predicts the center pixel of a window from a subsampled context; when
    recon_nodata is True, runs a single pass over the test loader and saves
    predictions instead of training.
    """
    np.random.seed(7)
    torch.manual_seed(7)
    print(f'Experiment ID: {EXPERIMENT_ID}')
    print(DEVICE)
    # Force an odd window so a unique center pixel exists.
    if window_size % 2 == 0:
        window_size += 1
    assert window_size > mask_size
    assert sample_size < window_size**2
    center_idx = window_size ** 2 // 2  # index of center pixel
    # loaders
    train_loader, valid_loader, test_loader = get_dataset(
        dataset_name=dataset_name,
        window_size=window_size,
        batch_size=batch_size,
        mask_fname=mask_fname,
        mask_size=mask_size,
        epoch_split=epoch_split,
        recon_nodata=recon_nodata,
    )
    # models
    if model_name == 'np':
        model = NP(
            x_dim=2,
            y_dim=1,
            emb_dim=emb_dim,
            dist='Gaussian',
            stochastic=False,
        )
    elif model_name == 'anp':
        model = ANP(
            x_dim=2,
            y_dim=1,
            emb_dim=emb_dim,
            dist='Gaussian',
            stochastic=False,
        )
    else:
        raise NotImplementedError
    # load model (optional warm start / reconstruction weights)
    if model_path:
        model_path = os.path.join(RESULT_DIR, model_path)
        model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    # initialize model and optimizer
    if recon_nodata:
        # Reconstruction needs a single pass and no optimizer.
        max_epoch = 1
    else:
        optimizer = Adam(model.parameters(), lr=learning_rate)
    # initialize variables
    x_grid = tensor(grid(window_size, window_size, scale=2))
    # p_grid: Gaussian sampling weights over pixel offsets from the center.
    p_grid = tensor(grid(window_size, window_size, scale=(window_size-1)/2))
    p_grid = torch.exp(-1/sample_scale_sq *
                       (p_grid[0, :, 0]**2 + p_grid[0, :, 1]**2))
    best_valid_loss = np.inf
    if recon_nodata:
        loaders = {'test': test_loader}
    else:
        loaders = {'train': train_loader,
                   'valid': valid_loader,
                   'test': test_loader}
    for epoch in range(max_epoch):
        for run_type, loader in loaders.items():
            # initialize variables
            if run_type == 'train':
                model.train()
            else:
                model.eval()
            losses = []
            y_true = []
            y_pred = []
            y_sig = []
            idx = []
            tic = time()
            for i, batch_data in enumerate(loader):
                y_context, context_mask, target_value, idx0, idx1 = batch_data
                bs = y_context.size(0)
                x_context = x_grid.expand(bs, -1, -1)
                context_mask[:, center_idx:center_idx + 1] = 0  # mask a center
                non_context = ~context_mask
                if 0 < sample_size:
                    if run_type == 'train':
                        # Random distance-weighted subsample of context pixels.
                        sample_idx = torch.multinomial(p_grid, sample_size,
                                                       replacement=False)
                        x_context = x_context[:, sample_idx]
                        y_context = y_context[:, sample_idx]
                        context_mask = context_mask[:, sample_idx]
                        non_context = non_context[:, sample_idx]
                    else:
                        # Deterministic: per-sample top-k weights among valid
                        # context pixels only.
                        sample_idx = []
                        for ib in range(bs):
                            prob = p_grid.clone()
                            prob[non_context[ib]] = 0
                            sample_idx.append(
                                torch.topk(prob, sample_size)[1])
                        sample_idx = torch.cat(sample_idx)
                        batch_idx = torch.arange(bs).unsqueeze(-1).expand(
                            -1, sample_size).flatten()
                        x_context = x_context[batch_idx, sample_idx].view(
                            bs, sample_size, -1)
                        y_context = y_context[batch_idx, sample_idx].view(
                            bs, sample_size, -1)
                        context_mask = context_mask[batch_idx,
                                                    sample_idx].view(
                                                        bs, sample_size)
                        non_context = non_context[batch_idx,
                                                  sample_idx].view(
                                                      bs, sample_size)
                # scale: per-window standardization over valid context pixels
                ml = context_mask.sum(dim=1, keepdim=True).unsqueeze(-1)
                y_context[non_context] = 0.0
                mean = y_context.sum(dim=1, keepdim=True) / ml
                scale = (y_context - mean)**2
                scale[non_context] = 0.0
                scale = torch.sqrt(scale.sum(dim=1, keepdim=True) / ml)
                y_context = (y_context - mean) / scale
                # augment (train only)
                if run_type == 'train':
                    if use_rotation_aug:
                        # Random 2D rotation of the context coordinates.
                        theta = torch.rand(
                            bs, 1, 1, dtype=torch.float32, device=DEVICE
                        ) * (math.pi * 2)
                        cth = torch.cos(theta)
                        sth = torch.sin(theta)
                        x_context = torch.cat(
                            (x_context[:, :, 0:1] * cth
                             - x_context[:, :, 1:2] * sth,
                             x_context[:, :, 0:1] * sth
                             + x_context[:, :, 1:2] * cth),
                            dim=-1
                        )
                    if use_scaling_aug:
                        # Random amplitude scaling in [0.5, 1.5); `scale` is
                        # adjusted too so the de-standardization stays correct.
                        y_scale = torch.rand(
                            bs, 1, 1, dtype=torch.float32, device=DEVICE
                        ) + 0.5
                        y_context *= y_scale
                        scale *= y_scale
                # target value: predict the center pixel
                x_center = zeros(bs, 1, 2)
                y_target = model(
                    x_context, y_context, context_mask, non_context, x_center
                )
                mu, logvar = torch.chunk(y_target, 2, dim=-1)
                # Fix sigma to eps during the first fix_eps epochs.
                if epoch <= fix_eps:
                    sigma = eps * ones_like(logvar)
                else:
                    sigma = eps + torch.exp(0.5 * logvar)
                # rescale back to the original value range
                mu = mu * scale + mean
                sigma *= scale
                # compute loss (Gaussian NLL up to constants) and update
                loss = torch.mean(
                    0.5 * ((target_value - mu) / sigma) ** 2
                    + torch.log(sigma),
                )
                if run_type == 'train':
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                losses.append(loss.item())
                y_true.append(to_np(target_value))
                y_pred.append(to_np(mu))
                y_sig.append(to_np(sigma))
                idx.append(np.concatenate((to_np(idx0), to_np(idx1)), 1))
            # report results
            y_true = np.concatenate(y_true)
            y_pred = np.concatenate(y_pred)
            y_sig = np.concatenate(y_sig)
            idx = np.concatenate(idx)
            # save sample results
            if recon_nodata:
                fname = os.path.join(
                    RESULT_DIR, f'recon_{EXPERIMENT_ID}.npz'
                )
                np.savez(
                    fname,
                    y_pred=y_pred.flatten(),
                    y_sig=y_sig.flatten(),
                    idx=idx
                )
            else:
                l1_err = np.mean(np.abs(y_true - y_pred))
                rmse = np.sqrt(np.mean((y_true - y_pred)**2))
                loss = np.mean(losses)
                # Checkpoint on best validation loss.
                if run_type == 'valid' and loss < best_valid_loss:
                    print('Best !!')
                    best_valid_loss = loss
                    # save model
                    fname = os.path.join(
                        RESULT_DIR, f'{model_name}_{EXPERIMENT_ID}.pth'
                    )
                    torch.save(model.state_dict(), fname)
                report_dict = {
                    'epoch': epoch,
                    f'{run_type}__loss': float(loss.item()),
                    f'{run_type}__l1err': float(l1_err),
                    f'{run_type}__rmse': float(rmse),
                    f'{run_type}__epochtime': float(time() - tic),
                }
                pprint(report_dict)
# Device selection: prefer CUDA when available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print("cuda")
else:
    print("CPU")

if __name__ == '__main__':
    if not os.path.exists(args.save):
        os.makedirs(args.save)
    # Data Loader
    # Optionally promote tensors to double precision via the transformer.
    transformer = ToDouble if args.double else Identity
    train_dataset, test_dataset, num_classes = get_dataset(
        args.dataset, tensor_type_transformer=transformer)
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        test_dataset, batch_size=args.test_batch_size, shuffle=False)
    # Infinite batch generator; one "epoch" equals batches_per_epoch draws.
    data_gen = inf_generator(train_loader)
    batches_per_epoch = len(train_loader)
    print("Train Data: {}, Test Data: {}".format(len(train_dataset),
                                                 len(test_dataset)))
    print("Total number of classes: {}".format(num_classes))
torch.manual_seed(args.seed) world_size = args.ngpu torch.distributed.init_process_group( 'nccl', init_method='env://', world_size=world_size, rank=args.local_rank, ) # Data print_status('==> Preparing data..') if not (args.train_type == 'contrastive'): assert ('wrong train phase...') else: trainloader, traindst, testloader, testdst, train_sampler = data_loader.get_dataset( args) # Model print_status('==> Building model..') torch.cuda.set_device(args.local_rank) model = model_loader.get_model(args) if args.model == 'ResNet18': expansion = 1 elif args.model == 'ResNet50': expansion = 4 else: assert ('wrong model type') projector = Projector(expansion=expansion) if 'Rep' in args.advtrain_type:
def train(args, device):
    """Federated training with error feedback, sparsification and optional
    quantization of client model differences.

    Each round, every client takes one local SGD step, the parameter-server
    (PS) model aggregates sparsified client deltas, and a top-k mask over the
    aggregated delta is kept for the next round. Returns per-epoch test
    accuracies (percent), with the initial accuracy prepended.
    """
    num_client = args.num_client
    trainset, testset = dl.get_dataset(args)
    sample_inds = dl.get_indices(trainset, args)
    # PS model
    net_ps = get_net(args).to(device)
    net_users = [get_net(args).to(device) for u in range(num_client)]
    optimizers = [
        torch.optim.SGD(net_users[cl].parameters(),
                        lr=args.lr,
                        weight_decay=1e-4) for cl in range(num_client)
    ]
    criterions = [nn.CrossEntropyLoss() for u in range(num_client)]
    testloader = torch.utils.data.DataLoader(testset,
                                             batch_size=args.bs,
                                             shuffle=False,
                                             num_workers=2)
    # NOTE(review): these schedulers are created but never stepped in this
    # function — LR changes go through sf.adjust_learning_rate below.
    schedulers = [
        torch.optim.lr_scheduler.StepLR(optimizers[cl],
                                        step_size=30,
                                        gamma=0.1) for cl in range(num_client)
    ]
    # synch all clients models with PS
    [sf.pull_model(net_users[cl], net_ps) for cl in range(num_client)]
    net_sizes, net_nelements = sf.get_model_sizes(net_ps)
    ind_pairs = sf.get_indices(net_sizes, net_nelements)
    # Training-set size by dataset (cifar10: 50k, otherwise assumed 60k).
    N_s = (50000 if args.dataset_name == 'cifar10' else 60000)
    modelsize = sf.count_parameters(net_ps)
    errorCorCof = 1  # error-feedback decay coefficient
    layer_types = []
    for p in net_ps.named_parameters():
        names = p[0]
        layer_types.append(names.split('.'))
    errors = []  # per-client error-feedback accumulators
    accuracys = []
    ps_model_mask = torch.ones(modelsize).to(device)
    sf.initialize_zero(net_ps)
    currentLR = args.lr
    for cl in range(num_client):
        errors.append(torch.zeros(modelsize).to(device))
    # Rounds per epoch so that all N_s samples are (approximately) covered.
    runs = math.ceil(N_s / (args.bs * num_client))
    acc = evaluate_accuracy(net_ps, testloader, device)
    accuracys.append(acc * 100)
    for epoch in tqdm(range(args.num_epoch)):
        if epoch == args.errDecayVals[0] and args.errorDecay is True:
            errorCorCof = args.errDecayVals[1]
        # Linear LR warm-up from 0.1 to args.lr over the first 5 epochs.
        if args.warmUp and epoch < 5:
            for cl in range(num_client):
                for param_group in optimizers[cl].param_groups:
                    if epoch == 0:
                        param_group['lr'] = 0.1
                    else:
                        lr_change = (args.lr - 0.1) / 4
                        param_group['lr'] = (lr_change * epoch) + 0.1
        if epoch in args.lr_change:
            for cl in range(num_client):
                sf.adjust_learning_rate(optimizers[cl], epoch,
                                        args.lr_change, args.lr)
        currentLR = sf.get_LR(optimizers[0])
        for run in range(runs):
            for cl in range(num_client):
                trainloader = DataLoader(dl.DatasetSplit(
                    trainset, sample_inds[cl]),
                                         batch_size=args.bs,
                                         shuffle=True)
                # One local SGD step per client per round (break after the
                # first batch).
                for data in trainloader:
                    inputs, labels = data
                    inputs, labels = inputs.to(device), labels.to(device)
                    optimizers[cl].zero_grad()
                    predicts = net_users[cl](inputs)
                    loss = criterions[cl](predicts, labels)
                    loss.backward()
                    optimizers[cl].step()
                    break
            ps_model_flat = sf.get_model_flattened(net_ps, device)
            ps_model_dif = torch.zeros_like(ps_model_flat)
            for cl in range(num_client):
                model_flat = sf.get_model_flattened(net_users[cl], device)
                # Add accumulated error feedback before computing the delta.
                model_flat.add_(errors[cl] * currentLR * errorCorCof)
                difmodel = (model_flat.sub(ps_model_flat)).to(device)
                difmodel_clone = torch.clone(difmodel).to(device)
                # Compression is skipped during warm-up epochs.
                if not (args.warmUp and epoch < 5):
                    if args.layer_wise_spars and args.worker_LWS:
                        sf.sparse_timeC_alt(difmodel, args.sparsity_window,
                                            10, args.lws_sparsity_w,
                                            ps_model_mask, ind_pairs, device)
                    else:
                        sf.sparse_timeC(difmodel, args.sparsity_window, 10,
                                        ps_model_mask, device)
                    # NOTE(review): quantization grouped under the same
                    # warm-up guard — confirm against original formatting.
                    if args.quantization:
                        sf.groups(difmodel, args.num_groups,
                                  args.denominator, device)
                # New error = what compression removed (rescaled by LR).
                errors[cl] = (difmodel_clone.sub(difmodel)) / currentLR
                ps_model_dif.add_(difmodel / num_client)
            ps_model_flat.add_(ps_model_dif)
            # Top-k mask over the aggregated delta for the next round.
            topk = math.ceil(ps_model_dif.nelement() / args.sparsity_window)
            ind = torch.topk(ps_model_dif.abs(), k=topk, dim=0)[1]
            if not (args.warmUp and epoch < 5):
                if args.layer_wise_spars:
                    ps_model_mask = sf.sparse_special_mask(
                        ps_model_flat, args.sparsity_window,
                        args.lws_sparsity, ind_pairs, device)
                else:
                    ps_model_mask *= 0
                    ps_model_mask[ind] = 1
            sf.make_model_unflattened(net_ps, ps_model_flat, net_sizes,
                                      ind_pairs)
            # Broadcast the updated PS model back to every client.
            [sf.pull_model(net_users[cl], net_ps) for cl in range(num_client)]
            '''
            if run %10 == 0: ##debug
                acc = evaluate_accuracy(net_ps, testloader, device)
                print('accuracy:', acc * 100)
                break
            '''
        acc = evaluate_accuracy(net_ps, testloader, device)
        accuracys.append(acc * 100)
        print(
            'accuracy:',
            acc * 100,
        )
    return accuracys
else: batch_loss = train_step(adj, nodes, targ) print('Epoch {} Batch {} Batch Loss {:.4f} '.format( epoch, batch, batch_loss.numpy())) if batch % args.checkpoint == 0: ckpt_save_path = ckpt_manager.save() print("Saving checkpoint \n") print('Time {} \n'.format(time.time() - start)) pbar.update(1) elif args.enc_type == 'rnn' and args.dec_type == "rnn": OUTPUT_DIR += '/' + args.enc_type + '_' + args.dec_type dataset, BUFFER_SIZE, BATCH_SIZE,\ steps_per_epoch, vocab_inp_size, vocab_tgt_size, target_lang = get_dataset(args) step = 0 if args.decay is not None: learning_rate = CustomSchedule(args.emb_dim, warmup_steps=args.decay_steps) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=0.9, beta2=0.98, epsilon=1e-9) else: optimizer = tf.train.AdamOptimizer(beta1=0.9, beta2=0.98, epsilon=1e-9)
def train():
    """Train the TF model for `epochs` epochs, testing after each epoch.

    Writes per-epoch loss / train-acc / test-acc to summary.txt and
    checkpoints the model every epoch.
    """
    print('loading data')
    x, y, x_test, y_test = get_dataset()  # use pca
    n_train = x.shape[0]
    n_test = x_test.shape[0]
    n_batches = n_train // batch_size
    print('train samples: %d' % n_train)
    print('test samples: %d' % n_test)
    # Re-arrange the training data (and test targets) into batches.
    x, y, y_test = get_batches(x, y, y_test)
    config = tf.ConfigProto(device_count={'GPU': 1},
                            allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        print('creating model')
        model = create_model(sess)
        start_time = time.time()
        total_loss = []
        train_acc = []
        test_acc = []
        # GO !!
        for e in range(epochs):
            print('working on epoch {0}/{1}'.format(e + 1, epochs))
            epoch_start_time = time.time()
            epoch_loss, epoch_acc = 0, 0
            for i in range(n_batches):
                print('working on epoch {0}, batch {1}/{2}'.format(
                    e+1, i+1, n_batches))
                enc_in, dec_out = x[i], y[i]
                _, output, step_loss, _ = model.step(sess, enc_in, dec_out)
                step_acc = evaluate_batch(output, dec_out) / batch_size
                epoch_loss += step_loss
                epoch_acc += step_acc
                print('current batch loss: {:.2f}'.format(step_loss))
            epoch_time = time.time() - epoch_start_time
            print('epoch {0}/{1} finish in {2:.2f} s'.format(
                e+1, epochs, epoch_time))
            # Average the accumulated per-batch loss/accuracy.
            epoch_loss /= n_batches
            epoch_acc /= n_batches
            total_loss.append(epoch_loss)
            train_acc.append(epoch_acc)
            print('average epoch loss: {:.4f}'.format(epoch_loss))
            print('average epoch acc: {:.4f}'.format(epoch_acc))
            print('saving model...')
            model.saver.save(sess, ckpt_path, model.global_step.eval())
            # test after each epoch
            output = model.step(sess, x_test, y_test, is_train=False)[0]
            step_acc = evaluate_batch(output, y_test, n_test) / n_test
            test_acc.append(step_acc)
            print('test acc: %.4f\n' % step_acc)
        print('training finish in {:.2f} s'.format(
            time.time() - start_time))
        # One line per epoch: loss, train accuracy, test accuracy.
        with open(os.path.join(store_path, 'summary.txt'), 'w') as f:
            for i in range(epochs):
                f.write('{0}\t{1}\t{2}\n'.format(total_loss[i], train_acc[i],
                                                 test_acc[i]))
from training import get_trainer from testing import get_tester cfg = load_config('config.yaml') is_train = cfg['mode']['train'] is_val = cfg['mode']['val'] is_test = cfg['mode']['test'] mode = 'train' if is_train or is_val else 'test' device = torch.device("cuda:0" if ( torch.cuda.is_available() and not cfg[mode]['no_cuda']) else "cpu") torch.cuda.set_device(device) if __name__ == '__main__': if mode == 'train': train_dataset = get_dataset(name=cfg['data']['dataset'], mode='train', data_path=cfg['data']['data_path'], device=device) val_dataset = get_dataset(name=cfg['data']['dataset'], mode='val', data_path=cfg['data']['data_path'], device=device) train_loader = DataLoader(train_dataset, batch_size=cfg['train']['batch_size'], num_workers=4, shuffle=True) val_loader = DataLoader(val_dataset, batch_size=cfg['val']['batch_size'], num_workers=4, shuffle=True) #The renderer is implemented separately. Becuase it is replacable and it is not present in inference. renderer = get_renderer(name=cfg['train']['model']['renderer'],
from sklearn import svm
from data_loader import get_dataset

# Small SVM sweep: regularization strength C x kernel, reporting test score
# and support-vector count for each combination.
train_x, train_y, test_x, test_y = get_dataset()  # use pca
for penalty in [1, 10, 100]:
    print('C = %d' % penalty)
    for kernel in ['linear', 'rbf']:
        print(kernel)
        clf = svm.SVC(kernel=kernel, C=penalty)
        clf.fit(train_x, train_y)
        print('score: %.4f' % clf.score(test_x, test_y))
        print('support vector: %d' % clf.support_vectors_.shape[0])
def generate_embedding_vectors(run_id,
                               analysis_tag="",
                               num_workers=20,
                               use_gpu=True,
                               dset_mode=None,
                               dset_split="validation",
                               dataset_name="deepfashion",
                               principal_encoder='1',
                               *useless_args,
                               **useless_kwargs):
    """Run the trained encoder over one dataset split and dump embeddings.

    Loads the checkpoint {run_id}.pth, encodes every item in the chosen
    split, and writes {itemid: embedding-list} JSON to PREDS_DIR.
    """
    ckpt_path = os.path.join(cfg.CKPT_DIR, "{}.pth".format(run_id))
    print("ckpt path: {}".format(ckpt_path))
    analysis_id = run_id if not analysis_tag else "{}_{}".format(
        run_id, analysis_tag)
    preds_path = os.path.join(
        cfg.PREDS_DIR, "{}_{}_embedding.json".format(analysis_id, dset_split))
    print("preds savepath: {}".format(preds_path))
    if use_gpu:
        if not torch.cuda.is_available():
            raise RuntimeError("cuda not available")
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    print('DEVICE', device)
    # load the ckpt
    print("Loading model from path: {}".format(ckpt_path))
    ckpt = torch.load(ckpt_path)
    # Fall back to the checkpoint's dset_mode unless one was given.
    dset_mode = ckpt['dset_mode'] if dset_mode is None else dset_mode
    model_type = ckpt.get('model_type', 'siamese')
    # model (frozen encoder, inference mode)
    model = get_model(model_type,
                      freeze_encoder=True,
                      train_mode=False,
                      principal_encoder=principal_encoder)
    enc_dim = model.enc_dim
    model = nn.DataParallel(model)
    model.load_state_dict(ckpt['model_state_dict'])
    model.to(device)
    print("USING MODEL TYPE {} ON DSET {}".format(model_type, dataset_name))
    print("Using dset mode: {}".format(dset_mode))
    # data loader
    ds = get_dataset(dataset_name, dset_mode, one_sample_only=True)
    # get_dataset returns a (train, val) pair — pick the requested split.
    ds = ds[0] if dset_split == "train" else ds[1]
    itemids = ds.get_itemids()
    # ds = Subset(ds, range(200))
    dl = DataLoader(ds,
                    batch_size=cfg.BATCH_SIZE,
                    shuffle=False,
                    num_workers=num_workers)
    encodings_arr = np.zeros((len(ds), enc_dim))
    with torch.no_grad():
        for i, x in tqdm(enumerate(dl), total=len(ds) / cfg.BATCH_SIZE):
            x = x.to(device)
            enc = model(x)
            # Write this batch's encodings into the preallocated array.
            encodings_arr[i * cfg.BATCH_SIZE:(i + 1) * cfg.BATCH_SIZE, :] = \
                enc.cpu().numpy()
    print(encodings_arr.shape)
    # Map item ids to their embedding vectors (JSON-serializable lists).
    encodings = {}
    for i in range(len(encodings_arr)):
        k = itemids[i]
        encoding_vec = encodings_arr[i, :]
        encodings[k] = encoding_vec.tolist()
    # TODO: load ckpt
    with open(preds_path, "w") as outfile:
        print("Saving preds to: {}".format(preds_path))
        json.dump(encodings, outfile)
import sys
from data_loader import get_dataset

# Datasets to download/prepare into the output directory.
DATA_NAMES = [
    "abalone", "airline", "airline-one-hot", "epsilon", "higgs", "letters",
    "msrank", "msrank-classification"
]

if __name__ == "__main__":
    # Usage: python <script> <out_dir>
    out_dir = sys.argv[1]
    print('out_dir: ' + str(out_dir))
    for dataset_name in DATA_NAMES:
        print('Processing ' + dataset_name)
        get_dataset(dataset_name, out_dir)
def main():
    """Train LeNet or AlexNet on the OCR letters data with the LBFGS
    optimizer, tracking letter- and word-level accuracy every epoch, then
    save per-epoch accuracy curves and the final model.

    Relies on module-level configuration: ``args``, ``device``,
    ``BATCH_SIZE``, ``PATH``, and the helpers ``get_dataset``,
    ``process_data``, ``get_default_*_loader``, ``letter_accuracy``,
    ``word_accuracy`` and ``save_accuracies`` defined alongside it.
    """
    print("Loading data...\n")
    dataset = get_dataset()
    (train_X, train_Y), (test_X, test_Y) = process_data(dataset)

    # Convert the dataset into torch tensors
    train = data_utils.TensorDataset(
        torch.tensor(train_X).float(),
        torch.tensor(train_Y).long())
    test = data_utils.TensorDataset(
        torch.tensor(test_X).float(),
        torch.tensor(test_Y).long())

    train_loader = data_utils.DataLoader(train,
                                         batch_size=BATCH_SIZE,
                                         shuffle=True,
                                         num_workers=5,
                                         sampler=None,
                                         pin_memory=False)
    test_loader = data_utils.DataLoader(
        test,  # dataset to load from
        batch_size=BATCH_SIZE,  # examples per batch (default: 1)
        shuffle=False,
        sampler=None,  # if a sampling method is specified, `shuffle` must be False
        num_workers=5,  # subprocesses to use for sampling
        pin_memory=False)  # whether to return an item pinned to GPU

    # Word-structured loaders used for word-level accuracy only.
    default_train_loader = get_default_train_loader()
    default_test_loader = get_default_test_loader()

    if args.model == "lenet":
        print("Running LeNet on OCR")
        model = LeNet()
    else:
        print("Running AlexNet on OCR")
        model = AlexNet(num_classes=26)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    # LBFGS re-evaluates the loss several times per step via the closure below.
    optimizer = optim.LBFGS(model.parameters(), history_size=5, max_iter=5)

    NUM_EPOCHS = args.num_epochs if args.num_epochs is not None else 100

    print("Starting Training...\n")
    letter_training_accuracies = []
    letter_test_accuracies = []
    word_training_accuracies = []
    word_test_accuracies = []

    for epoch in range(NUM_EPOCHS):
        print("Processing epoch {}".format(epoch + 1))
        for i_batch, sample in enumerate(train_loader, 0):
            train_X = sample[0]
            train_Y = sample[1]
            train_X, train_Y = train_X.to(device), train_Y.to(device)
            # one-hot targets -> class indices, as CrossEntropyLoss expects
            train_Y_labels = torch.max(train_Y, 1)[1]

            def closure():
                # Called (possibly multiple times) by optimizer.step():
                # must zero grads, compute the loss, backprop, and return it.
                optimizer.zero_grad()
                outputs = model(train_X)
                tr_loss = criterion(outputs, train_Y_labels)
                print('Loss at epoch {}, batch {}: {}'.format(
                    epoch + 1, i_batch, tr_loss.item()))
                tr_loss.backward()
                del outputs
                return tr_loss

            optimizer.step(closure)
            del train_X, train_Y, train_Y_labels

        # Calculate the letter-level accuracy on the training and the test set
        letter_training_accuracy = letter_accuracy(train_loader, model)
        letter_test_accuracy = letter_accuracy(test_loader, model)
        letter_training_accuracies.append(letter_training_accuracy)
        letter_test_accuracies.append(letter_test_accuracy)

        # Word-level accuracy uses the default (word-structured) loaders.
        word_training_accuracy = word_accuracy(default_train_loader, model)
        word_test_accuracy = word_accuracy(default_test_loader, model)
        word_training_accuracies.append(word_training_accuracy)
        word_test_accuracies.append(word_test_accuracy)

        print('\nLetter Training Accuracy on epoch {}: {}'.format(
            epoch + 1, letter_training_accuracy))
        print('Letter Test Accuracy on epoch {}: {}'.format(
            epoch + 1, letter_test_accuracy))
        print('Word Training Accuracy on epoch {}: {}'.format(
            epoch + 1, word_training_accuracy))
        # FIX: this label previously read "Word Training Accuracy" while
        # printing the *test* accuracy (copy-paste bug).
        print('Word Test Accuracy on epoch {}: {}\n'.format(
            epoch + 1, word_test_accuracy))

    final_letter_test_accuracy = letter_accuracy(test_loader, model)
    final_word_test_accuracy = word_accuracy(default_test_loader, model)
    print("Letter Test accuracy of {} on OCR Data: {}".format(
        args.model, final_letter_test_accuracy))
    print("Word Test accuracy of {} on OCR Data: {}".format(
        args.model, final_word_test_accuracy))

    save_accuracies(letter_training_accuracies, letter_test_accuracies,
                    "letter", args.model, "lbfgs")
    save_accuracies(word_training_accuracies, word_test_accuracies, "word",
                    args.model, "lbfgs")

    # Save the model
    print("Saving {} model to {}".format(args.model, PATH))
    torch.save(model, PATH)
def main(): parser = argparse.ArgumentParser() parser.add_argument('--use-gpu', action='store_true') parser.add_argument('--datasets', default='datasets') parser.add_argument('--iterations', default=1000, type=int) parser.add_argument('--result', default='result.json') parser.add_argument('--table', default='common-table.txt') args = parser.parse_args() sTime = time.strftime("%m%d_%H_%M_%S", time.localtime()) #sTime = '{:0>2d}_{:0>2d}_{:0>2d}_{:0>2d}'.format(*time.gmtime()[1:5]) args.result = f'./result/result_{sTime}.json' experiments_names = [ #'abalone', "MICROSOFT", "YEAR", "YAHOO", #"HIGGS", # #"CLICK", #'EPSILON', #'airline', #'epsilon', #'higgs', #'letters', #'msrank', #'msrank-classification', #'synthetic', #'synthetic-5k-features' ] #learners = [#XGBoostLearner,LightGBMLearner,#CatBoostLearner ] iterations = args.iterations logs_dir = 'logs' params_grid = { 'iterations': [iterations], 'max_depth': [6], 'learning_rate': [0.03, 0.07, 0.15] } #args.datasets = "L:/Datasets/" nEXP = len(experiments_names) for i, experiment_name in enumerate(experiments_names): print(f"\n********************* {experiment_name} {i+1}/{nEXP} ......", end="") data_tuple, desc = get_dataset(experiment_name, args.datasets) print( f"\r********************* {experiment_name} {i+1}/{nEXP} *********************\n{desc}" ) experiment = EXPERIMENTS[experiment_name] #experiment.run(args.use_gpu, learners, params_grid, args.datasets, args.result, logs_dir) experiment.run(args.use_gpu, learners, params_grid, data_tuple, args.result, logs_dir) stats = get_experiment_stats(args.result, args.use_gpu, niter=iterations) print_all_in_one_table(stats, args.use_gpu, learners, params=(6.0, 1.0), output=args.table)
def main(verbose=1,
         print_freq=100,
         restore=True,
         ckpt_path=None,
         val_freq=1,
         run_id="model",
         dset_mode="grayscale_mask",
         model_type="siamese",
         dataset_name="deepfashion",
         ckpt_type="siamese",
         freeze_encoder_until_it=1000):
    """Train a siamese (or dual) encoder with a contrastive loss.

    Resumes from ``ckpt_savepath`` when ``restore`` is truthy, or from an
    explicit ``ckpt_path`` (mutually exclusive with ``restore``).  Saves a
    checkpoint after every epoch and on keyboard interrupt.

    Args:
        verbose, print_freq: accepted but not read in this body.
        restore: resume from the run's own checkpoint file.
        ckpt_path: explicit checkpoint to initialize from instead.
        val_freq: validate every this many epochs.
        run_id: names the checkpoint file and the log directory.
        dset_mode: dataset mode; may be overridden by a restored checkpoint.
        model_type: model architecture key for get_model.
        dataset_name: dataset key for get_dataset.
        ckpt_type: architecture the checkpoint at ckpt_path was saved from.
        freeze_encoder_until_it: iteration after which all params are unfrozen.
    """
    print("TRAINING MODEL {} ON DATASET {}".format(model_type, dataset_name))
    # restore and ckpt_path are mutually exclusive ways to initialize weights.
    if restore and ckpt_path:
        raise RuntimeError("Specify restore 0R ckpt_path")
    ckpt_savepath = os.path.join(cfg.CKPT_DIR, "{}.pth".format(run_id))
    print("Saving ckpts to {}".format(ckpt_savepath))
    logs_savepath = os.path.join(cfg.LOGDIR, run_id)
    print("Saving logs to {}".format(logs_savepath))
    if restore or ckpt_path:
        print("Restoring weights from {}".format(
            ckpt_savepath if restore else ckpt_path))
    if cfg.USE_GPU:
        if not torch.cuda.is_available():
            raise RuntimeError("cuda not available")
        device = torch.device('cuda')
    else:
        device = torch.device("cpu")
    print('DEVICE', device)
    # model
    model = get_model(model_type)
    model = DataParallel(model)
    # must call this before constructing the optimizer:
    # https://pytorch.org/docs/stable/optim.html
    model.to(device)
    # set up training
    # TODO better one?
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=0.01,
                                momentum=0.9,
                                weight_decay=0.0001)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=3,
                                                   gamma=0.1)
    criterion = ContrastiveLoss()
    initial_epoch = 0
    iteration = 0
    # becomes True once the encoder has been unfrozen mid-training
    unfrozen = False
    if ckpt_path:
        # Initialize weights only (no optimizer/epoch state) from an
        # explicit checkpoint, converting siamese -> dual if needed.
        ckpt = torch.load(ckpt_path)
        state_dict = ckpt['model_state_dict']
        if ckpt_type == model_type:
            model.load_state_dict(state_dict)
        elif model_type == 'dual' and ckpt_type == 'siamese':
            model = load_siamese_ckpt_into_dual(model, state_dict)
        else:
            raise NotImplementedError()
    elif restore:
        # Full resume: weights, optimizer state, epoch/iteration counters,
        # and the dset_mode the checkpoint was trained with.
        if os.path.exists(ckpt_savepath):
            print("LOADING MODEL")
            ckpt = torch.load(ckpt_savepath)
            model.load_state_dict(ckpt['model_state_dict'])
            optimizer.load_state_dict(ckpt['optimizer_state_dict'])
            initial_epoch = ckpt['epoch']
            iteration = ckpt['it']
            dset_mode = ckpt.get('dset_mode', dset_mode)
        else:
            raise RuntimeError("Should not get here! Check for bugs")
    print("Using dset_mode {}".format(dset_mode))
    # dataset
    train_ds, test_ds = get_dataset(dataset_name, dset_mode)
    # train_ds = Subset(train_ds, range(500))
    # test_ds = Subset(test_ds, range(100))
    train_dl = DataLoader(train_ds,
                          batch_size=cfg.BATCH_SIZE,
                          shuffle=True,
                          num_workers=cfg.NUM_WORKERS)
    test_dl = DataLoader(test_ds,
                         batch_size=cfg.BATCH_SIZE,
                         shuffle=False,
                         num_workers=cfg.NUM_WORKERS)
    # training loop
    start = time.time()
    try:
        for epoch in range(initial_epoch, cfg.NUM_EPOCHS):
            logger = SummaryWriter(logs_savepath)
            # effectively puts the model in train mode.
            # Opposite of model.eval()
            model.train()
            print("Epoch {}".format(epoch))
            for i, (im1, im2, y) in tqdm(enumerate(train_dl),
                                         total=len(train_ds) /
                                         cfg.BATCH_SIZE):
                iteration += 1
                # After freeze_encoder_until_it iterations, make every
                # parameter trainable (one-time switch).
                if not unfrozen and iteration > freeze_encoder_until_it:
                    print("Unfreezing encoder")
                    unfrozen = True
                    for param in model.parameters():
                        param.requires_grad = True
                logger.add_scalar('DataTime', time.time() - start, iteration)
                im1 = im1.to(device)
                im2 = im2.to(device)
                y = y.to(device)
                # model returns one encoding per branch of the pair
                enc1, enc2 = model(im1, im2)
                loss = criterion(enc1, enc2, y)
                # I think this zeros out previous gradients (in case people
                # want to accumulate gradients?)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # logging
                logger.add_scalar('TrainLoss', loss.item(), iteration)
                logger.add_scalar('ItTime', time.time() - start, iteration)
                start = time.time()
                # display metrics
            # do some validation
            if (epoch + 1) % val_freq == 0:
                print("Validating...")
                model.eval()  # puts model in validation mode
                with torch.no_grad():
                    for i, (im1, im2, y) in tqdm(enumerate(test_dl),
                                                 total=len(test_ds) /
                                                 cfg.BATCH_SIZE):
                        im1 = im1.to(device)
                        im2 = im2.to(device)
                        y = y.to(device)
                        enc1, enc2 = model(im1, im2)
                        loss = criterion(enc1, enc2, y)
                        logger.add_scalar('ValLoss', loss, iteration)
            # end of epoch
            lr_scheduler.step()
            save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer,
                      dset_mode, dataset_name, model_type)
    except KeyboardInterrupt:
        # Best-effort save so an interrupted run can be resumed.
        print('Got keyboard interrupt, saving model...')
        save_ckpt(ckpt_savepath, model, epoch, iteration, optimizer,
                  dset_mode, dataset_name, model_type)
f1 = 2. * prec * recall / (prec + recall) if (prec + recall) > 0 else 0 report = 'Evaluation: F1 %.4f (%.4f %i/%i, %.4f %i/%i, %i)' % \ (f1, prec, cnt_match, cnt_pred, recall, cnt_match, cnt_label, cnt_length) with open(os.path.join(args.save_dir, 'train_log.txt'), 'a') as f: f.write(report + '\n') print(report) if f1 > args.max_f1: args.max_f1 = f1 torch.save(model.state_dict(), os.path.join(args.save_dir, 'f1_%.4f_params.pkl' % f1)) if __name__ == '__main__': # prepare dataseut train_set, test_set = get_dataset(args) # define model Model = getattr(models, args.model) if args.device > -1: os.environ['CUDA_VISIBLE_DEVICES'] = '{}'.format(args.device) model = Model(args).cuda() # record localtime = time.asctime(time.localtime(time.time())) with open(os.path.join(args.save_dir, 'train_log.txt'), 'a') as f: f.write('*********** %s ***********\n' % localtime) with open(os.path.join(args.save_dir, 'config.json'), 'wt') as f: json.dump(vars(args), f, indent=2) # train
def main(verbose: int = 1,
         print_freq: int = 100,
         restore: Union[bool, str] = True,
         val_freq: int = 1,
         run_id: str = "model",
         dset_name: str = "memento_frames",
         model_name: str = "frames",
         freeze_until_it: int = 1000,
         additional_metrics: Mapping[str, Callable] = {'rc': rc},
         debug_n: Optional[int] = None,
         batch_size: int = cfg.BATCH_SIZE,
         require_strict_model_load: bool = False,
         restore_optimizer=True,
         optim_string='adam',
         lr=0.01) -> None:
    """Train a memorability model with multiple summed losses, periodic
    validation metrics, and per-epoch checkpointing.

    Args:
        verbose: when truthy, print the stacked per-loss tensor each step.
        print_freq: accepted but not read in this body.
        restore: True resumes from the run's last checkpoint; a string is
            treated as an explicit checkpoint path; False starts fresh.
        val_freq: validate every this many epochs.
        run_id: names the save/log directories under cfg.DATA_SAVEDIR.
        dset_name, model_name: keys for get_dataset / get_model.
        freeze_until_it: iteration after which all params are unfrozen.
        additional_metrics: name -> fn(labels, preds, losses) evaluated on
            the full validation set.  NOTE(review): mutable default dict —
            shared across calls; safe only if callers never mutate it.
        debug_n: if set, truncate train/test sets to this many samples.
        batch_size: DataLoader batch size.
        require_strict_model_load: strictness flag for state-dict loading.
        restore_optimizer: also restore optimizer state when resuming.
        optim_string: 'adam' or 'sgd'.
        lr: learning rate.
    """
    print("TRAINING MODEL {} ON DATASET {}".format(model_name, dset_name))
    ckpt_savedir = os.path.join(cfg.DATA_SAVEDIR, run_id, cfg.CKPT_DIR)
    print("Saving ckpts to {}".format(ckpt_savedir))
    logs_savepath = os.path.join(cfg.DATA_SAVEDIR, run_id, cfg.LOGDIR)
    print("Saving logs to {}".format(logs_savepath))
    utils.makedirs([ckpt_savedir, logs_savepath])
    last_ckpt_path = os.path.join(ckpt_savedir, "last_model.pth")
    device = utils.set_device()
    print('DEVICE', device)
    # model
    model = get_model(model_name, device)
    # print("model", model)
    model = DataParallel(model)
    # must call this before constructing the optimizer:
    # https://pytorch.org/docs/stable/optim.html
    model.to(device)
    # set up training
    # TODO better one?
    if optim_string == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    elif optim_string == 'sgd':
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=lr,
                                    momentum=0.9,
                                    weight_decay=0.0001)
    else:
        raise RuntimeError(
            "Unrecognized optimizer string {}".format(optim_string))
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                   step_size=5,
                                                   gamma=0.1)
    # criterion = MemAlphaLoss(device=device)
    # criterion = MemMSELoss()
    # criterion = lambda x, y: MemMSELoss()(x, y) +
    # CaptionsLoss(device=device)(x, y)
    # Named losses; their values are stacked and summed into one scalar.
    losses = {
        'mem_mse':
        MemMSELoss(device=device, weights=np.load("memento_weights.npy")),
        'captions':
        CaptionsLoss(device=device,
                     class_weights=cap_utils.get_vocab_weights())
    }
    initial_epoch = 0
    iteration = 0
    # becomes True once every parameter has been made trainable
    unfrozen = False
    if restore:
        # restore=True -> resume from last_ckpt_path; a string restores
        # from that explicit path instead.
        ckpt_path = restore if isinstance(restore, str) else last_ckpt_path
        if os.path.exists(ckpt_path):
            print("Restoring weights from {}".format(ckpt_path))
            ckpt = torch.load(ckpt_path)
            utils.try_load_state_dict(model, ckpt['model_state_dict'],
                                      require_strict_model_load)
            if restore_optimizer:
                utils.try_load_optim_state(optimizer,
                                           ckpt['optimizer_state_dict'],
                                           require_strict_model_load)
            initial_epoch = ckpt['epoch']
            iteration = ckpt['it']
    else:
        ckpt_path = last_ckpt_path
    # dataset
    train_ds, val_ds, test_ds = get_dataset(dset_name)
    assert val_ds or test_ds
    if debug_n is not None:
        # Tiny subsets for fast debugging runs.
        train_ds = Subset(train_ds, range(debug_n))
        test_ds = Subset(test_ds, range(debug_n))
    train_dl = DataLoader(train_ds,
                          batch_size=batch_size,
                          shuffle=True,
                          num_workers=cfg.NUM_WORKERS)
    test_dl = DataLoader(test_ds,
                         batch_size=batch_size,
                         shuffle=False,
                         num_workers=cfg.NUM_WORKERS)
    # training loop
    start = time.time()
    try:
        for epoch in range(initial_epoch, cfg.NUM_EPOCHS):
            logger = SummaryWriter(logs_savepath)
            # effectively puts the model in train mode.
            # Opposite of model.eval()
            model.train()
            print("Epoch {}".format(epoch))
            for i, (x, y_) in tqdm(enumerate(train_dl),
                                   total=len(train_ds) / batch_size):
                y: ModelOutput[MemModelFields] = ModelOutput(y_)
                iteration += 1
                # One-time unfreeze of all parameters after the warmup
                # iteration count has passed.
                if not unfrozen and iteration > freeze_until_it:
                    print("Unfreezing encoder")
                    unfrozen = True
                    for param in model.parameters():
                        param.requires_grad = True
                logger.add_scalar('DataTime', time.time() - start, iteration)
                x = x.to(device)
                y = y.to_device(device)
                out = ModelOutput(model(x, y.get_data()))
                # One value per named loss; stacked then summed below.
                loss_vals = {name: l(out, y) for name, l in losses.items()}
                # print("loss_vals", loss_vals)
                loss = torch.stack(list(loss_vals.values()))
                if verbose:
                    print("stacked loss", loss)
                loss = loss.sum()
                # loss = criterion(out, y)
                # I think this zeros out previous gradients (in case people
                # want to accumulate gradients?)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                # logging
                utils.log_loss(logger, loss, loss_vals, iteration)
                logger.add_scalar('ItTime', time.time() - start, iteration)
                start = time.time()
                # display metrics
            # do some validation
            if (epoch + 1) % val_freq == 0:
                print("Validating...")
                model.eval()  # puts model in validation mode
                val_iteration = iteration
                with torch.no_grad():
                    # Accumulate labels/preds over the whole validation set
                    # so metrics can be computed on everything at once.
                    labels: Optional[ModelOutput[MemModelFields]] = None
                    preds: Optional[ModelOutput[MemModelFields]] = None
                    val_losses = []
                    for i, (x, y_) in tqdm(enumerate(test_dl),
                                           total=len(test_ds) / batch_size):
                        val_iteration += 1
                        y = ModelOutput(y_)
                        y_numpy = y.to_numpy()
                        labels = y_numpy if labels is None else labels.merge(
                            y_numpy)
                        x = x.to(device)
                        y = y.to_device(device)
                        out = ModelOutput(model(x, y.get_data()))
                        out_numpy = out.to_device('cpu').to_numpy()
                        preds = out_numpy if preds is None else preds.merge(
                            out_numpy)
                        loss_vals = {
                            name: l(out, y)
                            for name, l in losses.items()
                        }
                        loss = torch.stack(list(loss_vals.values())).sum()
                        utils.log_loss(logger,
                                       loss,
                                       loss_vals,
                                       val_iteration,
                                       phase='val')
                        val_losses.append(loss)
                    print("Calculating validation metric...")
                    # print("preds", {k:
                    # v.shape for k, v in preds.items()})
                    # assert False
                    metrics = {
                        fname: f(labels, preds, losses)
                        for fname, f in additional_metrics.items()
                    }
                    print("Validation metrics", metrics)
                    # Only scalar metrics can go to TensorBoard.
                    for k, v in metrics.items():
                        if isinstance(v, numbers.Number):
                            logger.add_scalar('Metric_{}'.format(k), v,
                                              iteration)
                    metrics['total_val_loss'] = sum(val_losses)
                    # Checkpoint named after the epoch and its metrics.
                    ckpt_path = os.path.join(
                        ckpt_savedir, utils.get_ckpt_path(epoch, metrics))
                    save_ckpt(ckpt_path, model, epoch, iteration, optimizer,
                              dset_name, model_name, metrics)
            # end of epoch
            lr_scheduler.step()
            save_ckpt(last_ckpt_path, model, epoch, iteration, optimizer,
                      dset_name, model_name)
    except KeyboardInterrupt:
        # Best-effort save so an interrupted run can be resumed.
        print('Got keyboard interrupt, saving model...')
        save_ckpt(last_ckpt_path, model, epoch, iteration, optimizer,
                  dset_name, model_name)
help='Beta 1 for Adam optimizer') parser.add_argument('--lr', type=float, default=0.0002, help='Learning rate') parser.add_argument('--epochs', type=int, default=20, help='Number of iterations to train') parser.add_argument('--feature_size', type=int, default=100, help='Size of random noise') # Parse all the arguments args = parser.parse_args() # Get dataset and data loader dataset, num_channels = get_dataset(args.dataset) data_loader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True, num_workers=2) # Check whether GPU is available device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Instantiate generator and discriminator generator = Generator(args.feature_size, num_channels).to(device) discriminator = Discriminator(num_channels=num_channels).to(device) # Select loss function and optimizer loss_fn = torch.nn.BCELoss() optimizer_d = optim.Adam(discriminator.parameters(),