def __init__(self, *args, **kwargs) -> None:
    super().__init__(*args, **kwargs)
    self.n = 1000
    self.butterfly = utils.get_img(path.join('example', 'butterfly.png'))
    # Batching
    self.butterfly = self.butterfly.repeat(16, 1, 1, 1)
    self.m = torch.Tensor([
        [3.2, 0.016, -68],
        [1.23, 1.7, -54],
        [0.008, 0.0001, 1],
    ])
    if cuda.is_available():
        self.butterfly = self.butterfly.cuda()
        self.m = self.m.cuda()
        with utils.Timer('Warm-up: {}'):
            for _ in range(100):
                _ = core_warp.warp(
                    self.butterfly,
                    self.m,
                    sizes='auto',
                    kernel='bicubic',
                    fill_value=0,
                )
            cuda.synchronize()
def all_reduce_thread(self, input):
    input_device = input.get_device()
    if input_device == 0:
        data_list = [input]
        for i in range(self.allreduce_num - 1):
            data_list.append(self.queue[i].get())
        cuda.synchronize()
        # total_sum = Synchronize.data_list[0].cpu().clone()
        # for i in range(1, Synchronize.device_num):
        #     total_sum = total_sum + Synchronize.data_list[i].cpu()
        # for i in range(0, Synchronize.device_num):
        #     with torch.cuda.device_of(Synchronize.data_list[i]):
        #         Synchronize.result_list[i] = total_sum.clone().cuda()
        cuda.nccl.all_reduce(data_list)
        cuda.synchronize()
        for i in range(self.allreduce_num - 1):
            self.queue[i].task_done()
    else:
        self.queue[input_device - 1].put(input)
        self.queue[input_device - 1].join()
    return input
def __init__(self, seed=0):
    manual_seed(seed)
    cuda0 = device('cuda:0')
    self.W = normal(zeros((10, 784)), ones((10, 784))).to(device=cuda0)
    self.w0 = normal(zeros((10, 1)), ones((10, 1))).to(device=cuda0)
    cuda.synchronize()
    return
def run_node(self, n: Node) -> Any:
    """
    Timing wrapper around executing an FX Node
    """
    start = time.perf_counter()
    result = super().run_node(n)
    synchronize()
    sec = time.perf_counter() - start
    for prof in self.profile_stats:
        prof.record(n, sec)
    return result
def test_warp_bicubic(self) -> None:
    with utils.Timer('Bicubic warping: {}'):
        for _ in range(self.n):
            _ = core_warp.warp(
                self.butterfly,
                self.m,
                sizes='auto',
                kernel='bicubic',
                fill_value=0,
            )
        cuda.synchronize()
def _train(data, opt=True):
    total = 0
    for y, x in data:
        y, x = y.to(device), x.to(device)
        pred_y = model(x)
        l = loss(pred_y, y)
        total += l.item()
        if opt:
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
    cuda.synchronize()
    return total
def gradient_default(self, X, Y):
    N = X.size()[1]
    W_ext = unsqueeze(self.forward_model.W, 0).expand(N, -1, -1)
    w0_ext = unsqueeze(self.forward_model.w0, 0).expand(N, -1, -1)
    X_ext = transpose(unsqueeze(X, 0), 0, 2)
    Y_ext = transpose(unsqueeze(Y, 0), 0, 2)
    cuda.synchronize()
    return (
        torch_sum(bmm(bmm(W_ext, X_ext) + w0_ext - Y_ext,
                      transpose(X_ext, 1, 2)), dim=0) * 2 / N,  # W gradient
        unsqueeze(torch_sum(self.forward_model(X) - Y, dim=1) * 2 / N, 1)  # w0 gradient
    )
def profile(device, name, model, example_inputs, args):
    model = torch.fx.symbolic_trace(model)
    prof = FXProfiler(model)
    for _ in range(args.warmup):
        model(*example_inputs)
    for _ in range(args.repeat):
        synchronize()
        prof.run(*example_inputs)
    for aggregate, stats in zip(PROFILES, prof.profile_stats):
        print(f"{device:4} {name:20} {aggregate.name:13} {stats.summary()}")
        aggregate.update(stats, name=name)
    return model
def cuda_time(fname, f, *args, **kwargs):
    start = cuda.Event(enable_timing=True)
    end = cuda.Event(enable_timing=True)
    cuda.synchronize()
    start.record()
    ret = f(*args, **kwargs)
    end.record()
    cuda.synchronize()
    t = start.elapsed_time(end) / 1000
    fmt = "'{}' ran in {:.2e} seconds"
    print(fmt.format(fname, t), flush=True)
    return ret
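# Hedged usage sketch for the cuda_time wrapper above (an illustration, not part of the
# original source): times a single matrix multiply on the default CUDA device. The
# function name and tensor sizes here are assumptions chosen only for the example.
def _example_cuda_time_usage():
    import torch
    x = torch.randn(2048, 2048, device='cuda')
    # Returns the matmul result and prints "'matmul' ran in ... seconds".
    return cuda_time('matmul', torch.matmul, x, x)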
def _store_side_effects(self):
    """
    Sub routine for proc_side_effects
    """
    # Code ran so we need to store the side-effects
    forced = NamespaceStack.get_forced()
    forced_objects = [each[2] for each in forced]
    # Looks like everything in forced will be forced to disk
    # There may be some redundancies between forced and self.args, that's what REDUNDANT is for
    # On redundancies, we skip from self.args, not from namespace_stack
    materialize_additionals = False
    # First, write everything new in self.args to disk
    for arg in self.args:
        if NamespaceStack.is_comparable(arg) and arg in forced_objects:
            # arg will be written from forced
            Writer.store(REDUNDANT, self.static_key, self.global_key)
            # If optimizer was modified, you'll also want to materialize the network
            materialize_additionals = True
        else:
            # write this arg to disk, it's not in forced
            if hasattr(arg, 'state_dict'):
                Writer.store(deepcopy_cpu(arg.state_dict()), self.static_key, self.global_key)
            else:
                # Not state_dict()
                if hasattr(arg, 'cpu'):
                    Writer.store(arg.cpu(), self.static_key, self.global_key)
                else:
                    Writer.store(copy.deepcopy(arg), self.static_key, self.global_key)
    # Enter a separator
    Writer.store(SEPARATOR, self.static_key, self.global_key)
    # If I should materialize a node in a group, materialize the entire group (forced)
    if materialize_additionals:
        for l, k, v in forced:
            Writer.store(str(l), self.static_key, self.global_key)
            Writer.store(k, self.static_key, self.global_key)
            Writer.store(deepcopy_cpu(v.state_dict()), self.static_key, self.global_key)
    cuda.synchronize()
def forked_write():
    cuda.synchronize()
    pid = os.fork()
    if not pid:
        path = flags.LOG_PATH.absolute.split('.')
        path.insert(-1, str(Writer.lsn))
        path = '.'.join(path)
        fd = open(path, 'w')
        os.nice(1)  # child process gets lower priority and starts flushing
        for each in Writer.write_buffer:
            # the dict can have 'value' or 'state'
            if 'value' in each and not isinstance(each['value'], str):
                each['value'] = Writer.serialize(each['value'])
            fd.write(json.dumps(each) + '\n')
        fd.close()
        os._exit(0)
    else:
        Writer.write_buffer = []  # parent process resets buffer
def test_net(net, writer, te, out_maps, noise, den_var, epoch, conv_field, GPU, cuda):
    if GPU == 1:
        cuda.synchronize()
    time_te = time.time()
    err_te = []
    net.eval()  # train(False)
    # we're just computing the test set error so we won't be updating the gradients or weights
    with torch.no_grad():
        for i, input in enumerate(te):
            if GPU == 1:
                input = input.cuda()
            target = input.data * (out_maps - 1)  # switch from training to output space
            channels = target.shape[-3]
            if noise != 0:
                input = scramble_images(input, noise, den_var, GPU)  # NOT SETUP FOR MULTI-CHANNEL
            output = net(input.float())
            # reshape output from flat filters to channels * filters per channel
            output = torch.reshape(output, (output.shape[0], out_maps, channels,
                                            output.shape[-2], output.shape[-1]))
            loss = F.cross_entropy(output, target.long())  # compute the loss
            err_te.append(loss.data)
            if i % 10 == 0:  # log loss to tensorboard
                writer.add_scalar('test_loss', loss.data, epoch * len(te))
                # writer.add_histogram('conv1_weight', net[0].weight[0], epoch)  # if you want to watch the evolution of the filters
                # writer.add_histogram('conv1_grad', net[0].weight.grad[0], epoch)
    if GPU == 1:
        cuda.synchronize()
    time_te = time.time() - time_te
    return err_te, time_te
def train_net(net, optimizer, writer, tr, epoch, out_maps, noise, den_var, conv_field, GPU, cuda):
    if GPU == 1:
        cuda.synchronize()  # synchronize for timing purposes
    time_tr = time.time()
    err_tr = []
    net.train(True)
    for i, input in enumerate(tr):
        if GPU == 1:
            input = input.cuda(non_blocking=True)
        target = input.data * (out_maps - 1)  # switch from training to output space
        channels = target.shape[-3]
        if noise != 0:
            # introduce uniform noise to training samples (second term controls magnitude),
            # not setup for multi-channel
            input = scramble_images(input, noise, den_var, GPU)
        output = net(input.float())
        # reshape output from flat filters to channels * filters per channel
        output = torch.reshape(output, (output.shape[0], out_maps, channels,
                                        output.shape[-2], output.shape[-1]))
        # compute the loss between the network output, and our target
        loss = F.cross_entropy(output, target.long())
        err_tr.append(loss.data)  # record loss
        optimizer.zero_grad()  # reset gradients from previous passes
        loss.backward()  # back-propagation
        optimizer.step()  # update parameters
        if i % 10 == 0:  # log loss to tensorboard
            writer.add_scalar('training_loss', loss.data, epoch * len(tr) + i)
    if GPU == 1:
        cuda.synchronize()
    time_tr = time.time() - time_tr
    return err_tr, time_tr
def _measure_performance(g, mem):
    tm = TicToc()
    tt = 0
    f = 1
    if g == -1:
        dev = torch.device('cpu')
    else:
        dev = torch.device('cuda:%s' % g)
    dtt = torch.double
    a = torch.eye(1024, 1024, dtype=dtt, device=dev)
    a.addmm_(a, a)
    if g >= 0:
        tcd.synchronize(device=dev)
    while tt < 1.0 and mem > 8.0 * (f * 2048.0) ** 2:
        tm.tic()
        a = torch.eye(f * 2048, f * 2048, dtype=dtt, device=dev)
        a.addmm_(a, a)
        if g >= 0:
            tcd.synchronize(device=dev)
        tt = tm.toc_val()
        f *= 2
    print('%s:%s - speed: %s' % (dev.type, dev.index, (float(f) ** 3) / tt))
    del a
    if g >= 0:
        tcd.synchronize(device=dev)
        tcd.empty_cache()
    return (float(f) ** 3) / tt
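# Hedged usage sketch for _measure_performance above (an illustration, not from the
# original source; it relies on the TicToc and tcd helpers the function already uses).
# The memory budget of 8 GiB is an assumed value that bounds the largest probe matrix.
def _example_measure_performance():
    cpu_speed = _measure_performance(-1, 8 * 1024 ** 3)  # device index -1 means CPU
    gpu_speed = _measure_performance(0, 8 * 1024 ** 3)   # first CUDA device
    return cpu_speed, gpu_speed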
def run_inf(model, size, model_name, start_bs=64, logging=True):
    bs = start_bs
    finish = False
    if logging:
        print('Dataset Size:', size)
    while not finish:
        try:
            start = time.perf_counter()
            total_lat = 0
            num_iter = 0
            data_loader = DataLoader(dataset=RandomDataset(size), batch_size=bs)
            with torch.no_grad():
                for _, (img, lb) in enumerate(data_loader):
                    iter_start = time.perf_counter()
                    img = img.cuda()
                    out = model(img)
                    cuda.synchronize()
                    total_lat += time.perf_counter() - iter_start
                    num_iter += 1
            finish = True
            if logging:
                print('Batch Size:', bs)
                print('Latency(s): {:.2f}'.format(total_lat / num_iter))
                print('FPS: {:.2f}'.format(LEN / (time.perf_counter() - start)))
        except Exception as e:
            bs -= 2
def _network_execution_time(network, batch):
    # forward pass
    start = time()
    cuda.synchronize()
    out, _ = network(batch)
    cuda.synchronize()
    t_forward = time() - start
    # backward pass
    start = time()
    cuda.synchronize()
    out.backward(out)
    cuda.synchronize()
    t_backward = time() - start
    return t_forward, t_backward
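# Hedged usage sketch for _network_execution_time above (an illustration, not from the
# original source). The helper unpacks `out, _ = network(batch)`, so the toy module here
# is written to return a (output, extra) tuple; the layer sizes are arbitrary assumptions.
def _example_network_timing():
    import torch

    class _TupleNet(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.fc = torch.nn.Linear(128, 10)

        def forward(self, x):
            return self.fc(x), None

    net = _TupleNet().cuda()
    batch = torch.randn(64, 128, device='cuda')
    t_fwd, t_bwd = _network_execution_time(net, batch)
    print('forward: {:.4f}s, backward: {:.4f}s'.format(t_fwd, t_bwd))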
def _sync_cuda():
    from torch import cuda
    cuda.synchronize()
def train(
    model_id,
    sequences_per_img=5,
    batch_size=10,
    resnet_conv_feature_size=2048,
    start_from=None,
    input_json_file_name=None,
    input_label_h5_file_name=None,
    label_smoothing=0,
    structure_loss_weight=1,
    train_sample_method="sample",
    train_beam_size=1,
    struc_use_logsoftmax=True,
    train_sample_n=5,
    structure_loss_type="seqnll",
    optimizer_type=NOAM,
    noamopt_factor=1,
    noamopt_warmup=20000,
    core_optimizer="sgd",
    learning_rate=0.0005,
    optimizer_alpha=0.9,
    optimizer_beta=0.999,
    optimizer_epsilon=1e-8,
    weight_decay=0,
    load_best_score=True,
    max_epochs=50,
    scheduled_sampling_start=-1,
    scheduled_sampling_increase_every=5,
    scheduled_sampling_increase_prob=0.05,
    scheduled_sampling_max_prob=0.25,
    self_critical_after=-1,
    structure_after=-1,
    cached_tokens="coco-train-idxs",
    grad_clip_value=0.1,
    grad_clip_mode=CLIP_VALUE,
    log_loss_iterations=25,
    save_every_epoch=True,
    save_checkpoint_iterations=3000,
    save_history_ckpt=True,
    eval_language_model=True,
):
    #
    # File names
    info_file_name = (
        join(start_from, "infos_" + model_id + ".pkl") if start_from is not None else ""
    )
    history_file_name = (
        join(start_from, "histories_" + model_id + ".pkl") if start_from is not None else ""
    )
    model_file_name = join(start_from, "model.pth") if start_from is not None else ""
    optimizer_file_name = (
        join(start_from, "optimizer.pth") if start_from is not None else ""
    )
    #
    # Load data
    loader = DataLoader(
        sequences_per_img,
        batch_size=batch_size,
        use_fc=True,
        use_att=True,
        use_box=0,
        norm_att_feat=0,
        norm_box_feat=0,
        input_json_file_name=input_json_file_name,
        input_label_h5_file_name=input_label_h5_file_name,
    )
    vocab_size = loader.vocab_size
    seq_length = loader.seq_length
    #
    # Initialize training info
    infos = {
        "iter": 0,
        "epoch": 0,
        "loader_state_dict": None,
        "vocab": loader.get_vocab(),
    }
    #
    # Load existing training state information, if there is any
    if start_from is not None and isfile(info_file_name):
        # with open(info_file_name, "rb") as f:
        assert True
    #
    # Create data logger
    histories = defaultdict(dict)
    if start_from is not None and isfile(history_file_name):
        with open(history_file_name, "rb") as f:
            histories.update(pickle_load(f))
    # tensorboard logger
    tb_summary_writer = SummaryWriter(checkpoint_path)
    #
    # Create our model
    vocab = loader.get_vocab()
    model = Transformer(
        vocab_size, resnet_conv_feature_size=resnet_conv_feature_size
    ).cuda()
    #
    # Load pretrained weights:
    if start_from is not None and isfile(model_file_name):
        model.load_state_dict(torch_load(model_file_name))
    #
    # Wrap generation model with loss function (used for training)
    # This allows the loss function to be computed separately on each machine
    lw_model = LossWrapper(
        model,
        label_smoothing=label_smoothing,
        structure_loss_weight=structure_loss_weight,
        train_sample_method=train_sample_method,
        train_beam_size=train_beam_size,
        struc_use_logsoftmax=struc_use_logsoftmax,
        train_sample_n=train_sample_n,
        structure_loss_type=structure_loss_type,
    )
    #
    # Wrap with dataparallel
    dp_model = DataParallel(model)
    dp_lw_model = DataParallel(lw_model)
    #
    # Build optimizer
    if optimizer_type == NOAM:
        optimizer = get_std_opt(model, factor=noamopt_factor, warmup=noamopt_warmup)
    elif optimizer_type == REDUCE_LR:
        optimizer = build_optimizer(
            model.parameters(),
            core_optimizer=core_optimizer,
            learning_rate=learning_rate,
            optimizer_alpha=optimizer_alpha,
            optimizer_beta=optimizer_beta,
            optimizer_epsilon=optimizer_epsilon,
            weight_decay=weight_decay,
        )
        optimizer = ReduceLROnPlateau(optimizer, factor=0.5, patience=3)
    else:
        raise Exception("Only supports NoamOpt and ReduceLROnPlateau optimization types")
    #
    # Load the optimizer
    if start_from is not None and isfile(optimizer_file_name):
        optimizer.load_state_dict(torch_load(optimizer_file_name))
    #
    # Prepare for training
    iteration = infos["iter"]
    epoch = infos["epoch"]
    #
    # For back compatibility
    if "iterators" in infos:
        infos["loader_state_dict"] = {
            split: {
                "index_list": infos["split_ix"][split],
                "iter_counter": infos["iterators"][split],
            }
            for split in ["train", "val", "test"]
        }
    loader.load_state_dict(infos["loader_state_dict"])
    if load_best_score == 1:
        best_val_score = infos.get("best_val_score", None)
    if optimizer_type == NOAM:
        optimizer._step = iteration
    #
    # Assure in training mode
    dp_lw_model.train()
    epoch_done = True
    #
    # Start training
    try:
        while True:
            #
            # Check max epochs
            if epoch >= max_epochs and max_epochs != -1:
                break
            #
            # Update end of epoch data
            if epoch_done:
                #
                # Assign the scheduled sampling prob
                if epoch > scheduled_sampling_start and scheduled_sampling_start >= 0:
                    frac = (epoch - scheduled_sampling_start) // scheduled_sampling_increase_every
                    ss_prob = min(
                        scheduled_sampling_increase_prob * frac,
                        scheduled_sampling_max_prob,
                    )
                    model.ss_prob = ss_prob
                #
                # If start self critical training
                if self_critical_after != -1 and epoch >= self_critical_after:
                    sc_flag = True
                    init_scorer(cached_tokens)
                else:
                    sc_flag = False
                #
                # If start structure loss training
                if structure_after != -1 and epoch >= structure_after:
                    struc_flag = True
                    init_scorer(cached_tokens)
                else:
                    struc_flag = False
                #
                # End epoch update
                epoch_done = False
            #
            # Compute time to load data
            start = time.time()
            data = loader.get_batch("train")
            load_data_time = time.time() - start
            print(f"Time to load data: {load_data_time} seconds")
            ########################
            # SYNC
            ########################
            synchronize()
            #
            # Compute time to complete epoch
            start = time.time()
            #
            # Make sure data is in GPU memory
            tmp = [
                data["fc_feats"],
                data["att_feats"],
                data["labels"],
                data["masks"],
                data["att_masks"],
            ]
            tmp = [_ if _ is None else _.cuda() for _ in tmp]
            fc_feats, att_feats, labels, masks, att_masks = tmp
            #
            # Reset gradient
            optimizer.zero_grad()
            # print("MADE IT TO THE MODEL EVALUATION")
            #
            # Evaluate model
            model_out = dp_lw_model(
                fc_feats,
                att_feats,
                labels,
                masks,
                att_masks,
                data["gts"],
                torch_arange(0, len(data["gts"])),
                sc_flag,
                struc_flag,
            )
            #
            # Average loss over training batch
            loss = model_out["loss"].mean()
            #
            # Compute gradient
            loss.backward()
            #
            # Clip gradient
            if grad_clip_value != 0:
                gradient_clipping_functions[grad_clip_mode](
                    model.parameters(), grad_clip_value
                )
            #
            # Update
            optimizer.step()
            train_loss = loss.item()
            end = time.time()
            ########################
            # SYNC
            ########################
            synchronize()
            #
            # Output status
            if struc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, lm_loss = {:.3f}, struc_loss = {:.3f}, time/batch = {:.3f}".format(
                        iteration,
                        epoch,
                        train_loss,
                        model_out["lm_loss"].mean().item(),
                        model_out["struc_loss"].mean().item(),
                        end - start,
                    )
                )
            elif not sc_flag:
                print(
                    "iter {} (epoch {}), train_loss = {:.3f}, time/batch = {:.3f}".format(
                        iteration, epoch, train_loss, end - start
                    )
                )
            else:
                print(
                    "iter {} (epoch {}), avg_reward = {:.3f}, time/batch = {:.3f}".format(
                        iteration, epoch, model_out["reward"].mean(), end - start
                    )
                )
            #
            # Update the iteration and epoch
            iteration += 1
            if data["bounds"]["wrapped"]:
                epoch += 1
                epoch_done = True
            #
            # Write the training loss summary
            if iteration % log_loss_iterations == 0:
                tb_summary_writer.add_scalar("train_loss", train_loss, iteration)
                if optimizer_type == NOAM:
                    current_lr = optimizer.rate()
                elif optimizer_type == REDUCE_LR:
                    current_lr = optimizer.current_lr
                tb_summary_writer.add_scalar("learning_rate", current_lr, iteration)
                tb_summary_writer.add_scalar(
                    "scheduled_sampling_prob", model.ss_prob, iteration
                )
                if sc_flag:
                    tb_summary_writer.add_scalar(
                        "avg_reward", model_out["reward"].mean(), iteration
                    )
                elif struc_flag:
                    tb_summary_writer.add_scalar(
                        "lm_loss", model_out["lm_loss"].mean().item(), iteration
                    )
                    tb_summary_writer.add_scalar(
                        "struc_loss", model_out["struc_loss"].mean().item(), iteration
                    )
                    tb_summary_writer.add_scalar(
                        "reward", model_out["reward"].mean().item(), iteration
                    )
                    tb_summary_writer.add_scalar(
                        "reward_var", model_out["reward"].var(1).mean(), iteration
                    )
                histories["loss_history"][iteration] = (
                    train_loss if not sc_flag else model_out["reward"].mean()
                )
                histories["lr_history"][iteration] = current_lr
                histories["ss_prob_history"][iteration] = model.ss_prob
            #
            # Update infos
            infos["iter"] = iteration
            infos["epoch"] = epoch
            infos["loader_state_dict"] = loader.state_dict()
            #
            # Make evaluation on validation set, and save model
            if (
                iteration % save_checkpoint_iterations == 0 and not save_every_epoch
            ) or (epoch_done and save_every_epoch):
                #
                # Evaluate model on Validation set of COCO
                eval_kwargs = {"split": "val", "dataset": input_json_file_name}
                val_loss, predictions, lang_stats = eval_split(
                    dp_model,
                    lw_model.crit,
                    loader,
                    verbose=True,
                    verbose_beam=False,
                    verbose_loss=True,
                    num_images=-1,
                    split="val",
                    lang_eval=False,
                    dataset="coco",
                    beam_size=1,
                    sample_n=1,
                    remove_bad_endings=False,
                    dump_path=False,
                    dump_images=False,
                    job_id="FUN_TIME",
                )
                #
                # Reduces learning rate if no improvement in objective
                if optimizer_type == REDUCE_LR:
                    if "CIDEr" in lang_stats:
                        optimizer.scheduler_step(-lang_stats["CIDEr"])
                    else:
                        optimizer.scheduler_step(val_loss)
                #
                # Write validation result into summary
                tb_summary_writer.add_scalar("validation loss", val_loss, iteration)
                if lang_stats is not None:
                    for k, v in lang_stats.items():
                        tb_summary_writer.add_scalar(k, v, iteration)
                histories["val_result_history"][iteration] = {
                    "loss": val_loss,
                    "lang_stats": lang_stats,
                    "predictions": predictions,
                }
                #
                # Save model if it is improving on validation result
                if eval_language_model:
                    current_score = lang_stats["CIDEr"]
                else:
                    current_score = -val_loss
                best_flag = False
                if best_val_score is None or current_score > best_val_score:
                    best_val_score = current_score
                    best_flag = True
                #
                # Dump miscellaneous information
                infos["best_val_score"] = best_val_score
                #
                # Save checkpoints...it seems only the most recent one keeps histories,
                # and it's overwritten each time
                save_checkpoint(
                    model,
                    infos,
                    optimizer,
                    checkpoint_dir=checkpoint_path,
                    histories=histories,
                    append="RECENT",
                )
                if save_history_ckpt:
                    save_checkpoint(
                        model,
                        infos,
                        optimizer,
                        checkpoint_dir=checkpoint_path,
                        append=str(epoch) if save_every_epoch else str(iteration),
                    )
                if best_flag:
                    save_checkpoint(
                        model,
                        infos,
                        optimizer,
                        checkpoint_dir=checkpoint_path,
                        append="BEST",
                    )
    except (RuntimeError, KeyboardInterrupt):
        print(f'{BAR("=", 20)}Save checkpoint on exception...')
        save_checkpoint(
            model, infos, optimizer, checkpoint_dir=checkpoint_path, append="EXCEPTION"
        )
        print(f'...checkpoint saved.{BAR("=", 20)}')
        stack_trace = format_exc()
        print(stack_trace)
def update(self, W_grad, w0_grad, step_size):
    self.W -= W_grad * step_size
    self.w0 -= w0_grad * step_size
    cuda.synchronize()
    return
def weight_norm(self):
    self.norm = sqrt(norm(self.W)**2 + norm(self.w0)**2)
    cuda.synchronize()
    return self.norm
def train(self, train_loader, loss_fn, optimizer, train_metrics, test_loader=None, test_metrics=None,
          num_epochs=10, lr_schedule=None, save_models="all", model_dir=os.getcwd(), notebook_mode=False,
          batch_log=True, save_logs=None, display_metrics=True, save_metrics=True):

    if save_models not in ["all", "best"]:
        raise ValueError("save models must be 'all' or 'best' , {} is invalid".format(save_models))
    if save_models == "best" and test_loader is None:
        raise ValueError("save models can only be best when testloader is provided")
    if test_loader is not None:
        if test_metrics is None:
            raise ValueError("You must provide a metric for your test data")
        elif len(test_metrics) == 0:
            raise ValueError("test metrics cannot be an empty list")

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    models_all = os.path.join(model_dir, "all_models")
    models_best = os.path.join(model_dir, "best_models")
    if not os.path.exists(models_all):
        os.mkdir(models_all)
    if not os.path.exists(models_best) and test_loader is not None:
        os.mkdir(models_best)

    from tqdm import tqdm_notebook
    from tqdm import tqdm

    best_metric = 0.0
    train_start_time = time()
    for e in tqdm(range(num_epochs)):
        print("Epoch {} of {}".format(e, num_epochs))
        for metric in train_metrics:
            metric.reset()
        self.model.train()
        self.on_epoch_start(e)

        running_loss = torch.Tensor([0.0])
        train_loss = 0.0
        data_len = 0

        if notebook_mode and batch_log:
            progress_ = tqdm_notebook(enumerate(train_loader))
        elif batch_log:
            progress_ = tqdm(enumerate(train_loader))
        else:
            progress_ = enumerate(train_loader)

        main_batch_size = 0
        init_time = time()
        for i, data in progress_:
            self.on_batch_start(e, i)
            if isinstance(data, list) or isinstance(data, tuple):
                inputs = data[0]
            else:
                inputs = data
            batch_size = inputs.size(0)
            if main_batch_size < batch_size:
                main_batch_size = batch_size
            if len(self.__input_hooks) > 0:
                for hook in self.__input_hooks:
                    inputs = hook(inputs)
                if isinstance(data, list):
                    data[0] = inputs
                elif isinstance(data, tuple):
                    data = (inputs, data[1])
                else:
                    data = inputs
            self.__train_func__(data, optimizer, loss_fn, train_metrics, running_loss, e, i)
            data_len += batch_size
            train_loss = running_loss.item() / data_len
            if batch_log:
                progress_message = ""
                for metric in train_metrics:
                    progress_message += "Train {} : {}".format(metric.name, metric.getValue())
                progress_.set_description("{}/{} batches ".format(
                    int(ceil(data_len / main_batch_size)),
                    int(ceil(len(train_loader.dataset) / main_batch_size))))
                progress_dict = {"Train Loss": train_loss}
                for metric in train_metrics:
                    progress_dict["Train " + metric.name] = metric.getValue()
                progress_.set_postfix(progress_dict)
            self.on_batch_end(e, i, train_metrics, train_loss)

        if self.cuda:
            cuda.synchronize()
        self.loss_history.append(train_loss)
        duration = time() - init_time

        if lr_schedule is not None:
            lr = lr_schedule(e)
            adjust_learning_rate(lr, optimizer)

        model_file = os.path.join(models_all, "model_{}.pth".format(e))
        self.save_model(model_file)

        logfile = None
        if save_logs is not None:
            logfile = open(save_logs, "a")

        print(os.linesep + "Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))
        if logfile is not None:
            logfile.write(os.linesep + "Epoch: {}, Duration: {} , Train Loss: {}".format(e, duration, train_loss))

        if test_loader is not None:
            message = "Accuracy did not improve"
            current_best = best_metric
            self.evaluate(test_loader, test_metrics)
            result = test_metrics[0].getValue()
            if result > current_best:
                best_metric = result
                message = "{} improved from {} to {}".format(test_metrics[0].name, current_best, result)
                model_file = os.path.join(models_best, "model_{}.pth".format(e))
                self.save_model(model_file)
                print(os.linesep + "{} New Best Model saved in {}".format(message, model_file))
                if logfile is not None:
                    logfile.write(os.linesep + "{} New Best Model saved in {}".format(message, model_file))
            else:
                print(os.linesep + message)
                if logfile is not None:
                    logfile.write(os.linesep + message)
            for metric in test_metrics:
                print("Test {} : {}".format(metric.name, metric.getValue()))
                if logfile is not None:
                    logfile.write(os.linesep + "Test {} : {}".format(metric.name, metric.getValue()))

        for metric in train_metrics:
            print("Train {} : {}".format(metric.name, metric.getValue()))
            if logfile is not None:
                logfile.write(os.linesep + "Train {} : {}".format(metric.name, metric.getValue()))

        if logfile is not None:
            logfile.close()

        for metric in train_metrics:
            metric.add_history()

        epoch_arr = [x for x in range(e + 1)]
        if display_metrics or save_metrics:
            save_path = None
            if save_metrics:
                save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))
            visualize(epoch_arr,
                      [PlotInput(value=self.loss_history, name="Train Loss", color="red")],
                      display=display_metrics, save_path=save_path)

        if test_loader is not None and (display_metrics or save_metrics):
            for metric in test_metrics:
                save_path = None
                if save_metrics:
                    save_path = os.path.join(model_dir, "test_{}_epoch_{}.png".format(metric.name, e))
                visualize(epoch_arr,
                          [PlotInput(value=metric.history, name="Test " + metric.name, color="blue")],
                          display=display_metrics, save_path=save_path)

        for metric in train_metrics:
            if save_metrics:
                save_path = os.path.join(model_dir, "test_{}_epoch_{}.png".format(metric.name, e))
            visualize(epoch_arr,
                      [PlotInput(value=metric.history, name="Test " + metric.name, color="blue")],
                      display=display_metrics, save_path=save_path)

        self.on_epoch_end(e, train_metrics, test_metrics, train_loss, duration)

    train_end_time = time() - train_start_time
    self.on_training_completed(train_metrics, test_metrics, train_end_time)
def train(self, target, source, gen_optimizer, disc_optimizer, num_epochs=10, disc_steps=1,
          gen_lr_schedule=None, disc_lr_schedule=None, model_dir=os.getcwd(), save_interval=100,
          notebook_mode=False, batch_log=True, save_logs=None, display_metrics=True, save_metrics=True):

    assert (len(target.dataset) == len(source.dataset))
    assert (disc_steps < len(target.dataset))

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    self.model_dir = model_dir
    models_gen = os.path.join(model_dir, "gen_models")
    models_disc = os.path.join(model_dir, "disc_models")
    if not os.path.exists(models_gen):
        os.mkdir(models_gen)
    if not os.path.exists(models_disc):
        os.mkdir(models_disc)

    iterations = 0

    from tqdm import tqdm_notebook
    from tqdm import tqdm

    train_start_time = time()
    for e in tqdm(range(num_epochs)):
        self.gen_model.train()
        self.disc_model.train()
        self.on_epoch_start(e)

        running_gen_loss = torch.Tensor([0.0])
        running_disc_loss = torch.Tensor([0.0])
        gen_loss = 0.0
        disc_loss = 0.0
        gen_data_len = 0
        disc_data_len = 0

        if notebook_mode and batch_log:
            progress_ = tqdm_notebook(enumerate(zip(target, source)))
        elif batch_log:
            progress_ = tqdm(enumerate(zip(target, source)))
        else:
            progress_ = enumerate(zip(target, source))

        init_time = time()
        for i, (t, s) in progress_:
            if isinstance(t, list) or isinstance(t, tuple):
                inputs = t[0]
            else:
                inputs = t
            batch_size = inputs.size(0)
            disc_data_len += batch_size
            if len(self.__input_hooks) > 0:
                for hook in self.__input_hooks:
                    inputs = hook(inputs)
                if isinstance(t, list):
                    t[0] = inputs
                elif isinstance(t, tuple):
                    t = (inputs, t[1])
                else:
                    t = inputs

            self.__disc_train_func__(t, s, disc_optimizer, running_disc_loss, e, i)
            disc_loss = (running_disc_loss.data[0] / disc_data_len).item()

            if (i + 1) % disc_steps == 0:
                self.__gen_train_func__(t, s, gen_optimizer, running_gen_loss, e, i)
                gen_data_len += batch_size
                gen_loss = (running_gen_loss.data[0] / gen_data_len).item()

            if batch_log:
                progress_dict = {"Gen Loss": gen_loss, "Disc Loss": disc_loss}
                progress_.set_postfix(progress_dict)

            iterations += 1
            if iterations % save_interval == 0:
                self.save(s, iterations)
                self.show(s, iterations)

            self.on_batch_end(e, i, gen_loss, disc_loss)

        if self.cuda:
            cuda.synchronize()

        duration = time() - init_time
        self.disc_loss_history.append(disc_loss)
        self.gen_loss_history.append(gen_loss)

        if gen_lr_schedule is not None:
            lr = gen_lr_schedule(e)
            adjust_learning_rate(lr, gen_optimizer)
        if disc_lr_schedule is not None:
            lr = disc_lr_schedule(e)
            adjust_learning_rate(lr, disc_optimizer)

        model_file = os.path.join(models_gen, "gen_model_{}.pth".format(e))
        self.save_generator(model_file)
        model_file = os.path.join(models_disc, "disc_model_{}.pth".format(e))
        self.save_discriminator(model_file)

        print("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(
            e, duration, gen_loss, disc_loss))
        if save_logs is not None:
            logfile = open(save_logs, "a")
            logfile.write("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(
                e, duration, gen_loss, disc_loss))
            logfile.close()

        epoch_arr = [x for x in range(e + 1)]
        if display_metrics or save_metrics:
            save_path = None
            if save_metrics:
                save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))
            visualize(epoch_arr, [
                PlotInput(value=self.gen_loss_history, name="Generator Loss", color="red"),
                PlotInput(value=self.disc_loss_history, name="Discriminator Loss", color="red")
            ], display=display_metrics, save_path=save_path)

        self.on_epoch_end(e, gen_loss, disc_loss, duration)

    train_end_time = time() - train_start_time
    self.on_training_completed(train_end_time)
def main():
    # For reproducibility
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    train_loader, val_loader = getDataLoader(args, logger)

    # Network
    aanet = nets.AANet(
        args.max_disp,
        num_downsample=args.num_downsample,
        feature_type=args.feature_type,
        no_feature_mdconv=args.no_feature_mdconv,
        feature_pyramid=args.feature_pyramid,
        feature_pyramid_network=args.feature_pyramid_network,
        feature_similarity=args.feature_similarity,
        aggregation_type=args.aggregation_type,
        useFeatureAtt=args.useFeatureAtt,
        num_scales=args.num_scales,
        num_fusions=args.num_fusions,
        num_stage_blocks=args.num_stage_blocks,
        num_deform_blocks=args.num_deform_blocks,
        no_intermediate_supervision=args.no_intermediate_supervision,
        refinement_type=args.refinement_type,
        mdconv_dilation=args.mdconv_dilation,
        deformable_groups=args.deformable_groups).to(device)

    # logger.info('%s' % aanet) if local_master else None
    if local_master:
        structure_of_net = os.path.join(args.checkpoint_dir, 'structure_of_net.txt')
        with open(structure_of_net, 'w') as f:
            f.write('%s' % aanet)

    if args.pretrained_aanet is not None:
        logger.info('=> Loading pretrained AANet: %s' % args.pretrained_aanet)
        # Enable training from a partially pretrained model
        utils.load_pretrained_net(aanet, args.pretrained_aanet, no_strict=(not args.strict))

    aanet.to(device)
    logger.info('=> Use %d GPUs' % torch.cuda.device_count()) if local_master else None

    # if torch.cuda.device_count() > 1:
    if args.distributed:
        # aanet = torch.nn.DataParallel(aanet)  # try distributed training
        aanet = torch.nn.SyncBatchNorm.convert_sync_batchnorm(aanet)
        aanet = torch.nn.parallel.DistributedDataParallel(
            aanet, device_ids=[local_rank], output_device=local_rank)
        synchronize()

    # Save parameters
    num_params = utils.count_parameters(aanet)
    logger.info('=> Number of trainable parameters: %d' % num_params)
    save_name = '%d_parameters' % num_params
    # This is an empty file; its name just records how many trainable parameters the model has.
    open(os.path.join(args.checkpoint_dir, save_name), 'a').close() if local_master else None

    # Optimizer
    # Learning rate for offset learning is set 0.1 times those of existing layers
    specific_params = list(filter(utils.filter_specific_params, aanet.named_parameters()))
    base_params = list(filter(utils.filter_base_params, aanet.named_parameters()))
    specific_params = [kv[1] for kv in specific_params]  # kv is a tuple (key, value)
    base_params = [kv[1] for kv in base_params]
    specific_lr = args.learning_rate * 0.1
    params_group = [
        {'params': base_params, 'lr': args.learning_rate},
        {'params': specific_params, 'lr': specific_lr},
    ]
    optimizer = torch.optim.Adam(params_group, weight_decay=args.weight_decay)

    # Resume training
    if args.resume:
        # 1. resume AANet
        start_epoch, start_iter, best_epe, best_epoch = utils.resume_latest_ckpt(
            args.checkpoint_dir, aanet, 'aanet')
        # 2. resume Optimizer
        utils.resume_latest_ckpt(args.checkpoint_dir, optimizer, 'optimizer')
    else:
        start_epoch = 0
        start_iter = 0
        best_epe = None
        best_epoch = None

    # LR scheduler
    if args.lr_scheduler_type is not None:
        last_epoch = start_epoch if args.resume else start_epoch - 1
        if args.lr_scheduler_type == 'MultiStepLR':
            milestones = [int(step) for step in args.milestones.split(',')]
            # The last_epoch argument matters: when resuming, the learning rate is
            # automatically adjusted to match last_epoch.
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=milestones, gamma=args.lr_decay_gamma,
                last_epoch=last_epoch)
        else:
            raise NotImplementedError

    # model.Model(object) is a further wrapper around AANet.
    train_model = model.Model(args, logger, optimizer, aanet, device, start_iter,
                              start_epoch, best_epe=best_epe, best_epoch=best_epoch)

    logger.info('=> Start training...')

    trainLoss_dict, trainLossKey, valLoss_dict, valLossKey = getLossRecord(netName="AANet")

    if args.evaluate_only:
        assert args.val_batch_size == 1
        # Test mode: --evaluate_only should be set and --mode should be 'test'.
        train_model.validate(val_loader, local_master, valLoss_dict, valLossKey)
        # Save losses for analysis
        save_loss_for_matlab(trainLoss_dict, valLoss_dict)
    else:
        for epoch in range(start_epoch, args.max_epoch):
            # Main training loop (epochs)!!!
            if not args.evaluate_only:
                # ensure distributed workers sample different data:
                # set a different random seed by passing epoch to the sampler
                if args.distributed:
                    train_loader.sampler.set_epoch(epoch)
                    logger.info('train_loader.sampler.set_epoch({})'.format(epoch))
                train_model.train(train_loader, local_master, trainLoss_dict, trainLossKey)
            if not args.no_validate:
                # In training mode: validate while training.
                train_model.validate(val_loader, local_master, valLoss_dict, valLossKey)
            if args.lr_scheduler_type is not None:
                lr_scheduler.step()  # Adjust learning rate
            # Save losses for analysis. Saved once at the end of every epoch, overwriting the
            # previous save, so we don't have to wait for training to finish.
            save_loss_for_matlab(trainLoss_dict, valLoss_dict)

    logger.info('=> End training\n\n')
def main():
    # For reproducibility
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    np.random.seed(args.seed)

    train_loader, val_loader = getDataLoader(args, logger)

    net = selectModel(args.model)
    # logger.info('%s' % net) if local_master else None

    # if args.pretrained_net is not None:
    #     logger.info('=> Loading pretrained Net: %s' % args.pretrained_net)
    #     # Enable training from a partially pretrained model
    #     utils.load_pretrained_net(net, args.pretrained_net, strict=args.strict, logger=logger)

    net.to(device)

    # if torch.cuda.device_count() > 1:
    if args.distributed:
        # aanet = torch.nn.DataParallel(aanet)  # try distributed training
        net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
        net = torch.nn.parallel.DistributedDataParallel(
            net, device_ids=[local_rank], output_device=local_rank)
        synchronize()

    # Save parameters
    num_params = utils.count_parameters(net)
    logger.info('=> Number of trainable parameters: %d' % num_params)

    # Special parts of the network get their own learning rate:
    # specific_lr = args.learning_rate * 0.1
    params_group = setInitLR(net, args)

    # Optimizer
    optimizer = torch.optim.Adam(params_group, weight_decay=args.weight_decay)

    # Resume training
    if args.resume:
        # 1. resume Net
        start_epoch, start_iter, best_epe, best_epoch = utils.resume_latest_ckpt(
            args.checkpoint_dir, net, 'net_latest', False, logger)
        # 2. resume Optimizer
        utils.resume_latest_ckpt(args.checkpoint_dir, optimizer, 'optimizer_latest', True, logger)
    else:
        start_epoch = 0
        start_iter = 0
        best_epe = None
        best_epoch = None

    # LR scheduler
    if args.lr_scheduler_type is not None:
        last_epoch = start_epoch if args.resume else start_epoch - 1
        if args.lr_scheduler_type == 'MultiStepLR':
            milestones = [int(step) for step in args.milestones.split(',')]
            # The last_epoch argument matters: when resuming, the learning rate is
            # automatically adjusted to match last_epoch.
            lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
                optimizer, milestones=milestones, gamma=args.lr_decay_gamma,
                last_epoch=last_epoch)
        else:
            raise NotImplementedError

    # model.Model(net) is a further wrapper around net.
    train_model = model.Model(args, logger, optimizer, net, device, start_iter,
                              start_epoch, best_epe=best_epe, best_epoch=best_epoch)

    logger.info('=> Start training...')
    for epoch in range(start_epoch, args.max_epoch):
        # Main training loop (epochs)!!!
        # ensure distributed workers sample different data:
        # set a different random seed by passing epoch to the sampler
        if args.distributed:
            train_loader.sampler.set_epoch(epoch)
            logger.info('train_loader.sampler.set_epoch({})'.format(epoch))
        train_model.train(train_loader, local_master)
        if args.do_validate:
            # In training mode: validate while training.
            train_model.validate(val_loader, local_master)
        if args.lr_scheduler_type is not None:
            lr_scheduler.step()  # Adjust learning rate

    logger.info('=> End training\n\n')
def generate_samples(n_samples, sample_batch_size, sample_x_dim, sample_y_dim, conv_field,
                     generator, bound_type, GPU, cuda, training_data, out_maps, boundary_layers,
                     noise, den_var, channels, temperature, dataset_size):
    if GPU == 1:
        cuda.synchronize()
    time_ge = time.time()

    sample_x_padded = sample_x_dim + 2 * conv_field * boundary_layers
    sample_y_padded = sample_y_dim + conv_field * boundary_layers  # don't need to pad the bottom

    # add extra padding by conv_field in both x-directions, and in the +y direction,
    # which we will remove later
    sample_batch_size, changed = get_sample_batch_size(
        sample_batch_size, generator, sample_x_padded, sample_y_padded, conv_field, channels, GPU)
    if changed:
        print('Sample batch size changed to {}'.format(sample_batch_size))

    if n_samples < sample_batch_size:
        n_samples = sample_batch_size
    batches = int(np.ceil(n_samples / sample_batch_size))
    n_samples = sample_batch_size * batches
    sample = torch.ByteTensor(n_samples, channels, sample_y_dim, sample_x_dim)  # sample placeholder
    print('Generating {} Samples'.format(n_samples))

    for batch in range(batches):  # can't do these all at once so we do it in batches
        print('Batch {} of {} batches'.format(batch + 1, batches))
        # needs to be explicitly padded by the convolutional field
        sample_batch = torch.FloatTensor(sample_batch_size, channels,
                                         sample_y_padded + 2 * conv_field,
                                         sample_x_padded + 2 * conv_field)
        sample_batch.fill_(0)  # initialize with minimum value

        if bound_type > 0:
            sample_batch = build_boundary(sample_batch, sample_batch_size, training_data,
                                          conv_field, generator, bound_type, out_maps,
                                          noise, den_var, dataset_size, GPU)
        if GPU == 1:
            sample_batch = sample_batch.cuda()

        # generator.train(False)
        generator.eval()
        with torch.no_grad():  # we will not be updating weights
            for i in tqdm.tqdm(range(conv_field, sample_y_padded + conv_field)):  # for each pixel
                for j in range(conv_field, sample_x_padded + conv_field):
                    for k in range(channels):
                        # query the network about only the area within the receptive field
                        out = generator(sample_batch[:, :, i - conv_field:i + conv_field + 1,
                                                     j - conv_field:j + conv_field + 1].float())
                        # reshape output from flat filters to channels * filters per channel
                        out = torch.reshape(out, (out.shape[0], out_maps, channels,
                                                  out.shape[-2], out.shape[-1]))
                        # normalize temperature, graded against the boundary
                        normed_temp = torch.mean(
                            torch.abs(out[:, 1:, k, 0, 0])) * (temperature)  # + np.exp(- i/conv_field/2))
                        # remove the lowest element (the boundary)
                        probs = F.softmax(out[:, 1:, k, 0, 0] / normed_temp, dim=1).data
                        # convert output back to training space
                        sample_batch[:, k, i, j] = \
                            (torch.multinomial(probs, 1).float() + 1).squeeze(1) / (out_maps - 1)
                        del out, probs

        for k in range(channels):
            # convert back to input space
            sample[batch * sample_batch_size:(batch + 1) * sample_batch_size, k, :, :] = \
                sample_batch[:, k,
                             (boundary_layers + 1) * conv_field:-conv_field,
                             (boundary_layers + 1) * conv_field:-((boundary_layers + 1) * conv_field)] \
                * (out_maps - 1) - 1

    if GPU == 1:
        cuda.synchronize()
    time_ge = time.time() - time_ge
    return sample, time_ge, sample_batch_size, n_samples
                    batch_size=batch_size, shuffle=True, num_workers=4)

model = draw(seq_len)
model.cuda()

# setup optimizer
optimizer = optim.Adam(model.parameters(), lr=0.0002, betas=(0.5, 0.999))

# train
for epoch in range(25):
    for i, (data, _) in enumerate(loader, 0):
        input = Variable(data).cuda()
        recon_batch, mu_t, logvar_t = model(input, seq_len)
        # loss = Variable(torch.FloatTensor(1).fill_(0).cuda())
        cuda.synchronize()  # otherwise synchronize error
        loss = loss_function(recon_batch, input, mu_t, logvar_t, seq_len,
                             input.size(0), img_size)
        model.zero_grad()
        loss.backward()
        optimizer.step()
        if i % 10 == 0:
            ##########################
            # Visualization
            ##########################
            images = make_grid(recon_batch.data[:8])
            writer.add_image('output', images, i)
            images = make_grid(data[:8])
            writer.add_image('images', images, i)
            writer.add_scalar('error', loss.data[0], i)
with open(fname, "w", newline="") as f:
    csv_file = csv.writer(f, delimiter=',')
    #
    # Iterate to convergence
    eval_counter = 0
    train_loss = L(MNIST_Data.X, MNIST_Data.Y)
    train_loss_delta = train_loss
    while eval_counter <= max_iter:
        print(eval_counter)
        #
        # Iterate over batches
        for (x, y) in MNIST_Data:
            #
            # Compute gradient and gradient norm
            grad_W, grad_w0 = L.gradient(x, y)
            cuda.synchronize()
            #
            # Update weights
            cuda.synchronize()
            F.update(grad_W, grad_w0, step_size)
            #
            # Update time step
            # step_size = dynamic_stepper.next()
        #
        # Update convergence criterion check
        train_loss = L(MNIST_Data.X, MNIST_Data.Y)
        test_loss = L(MNIST_Data_test.X, MNIST_Data_test.Y)
        eval_counter += 1
        #
        # Write progress
        csv_file.writerow(
def train(self, target, source, gen_optimizer, disc_optimizer, num_epochs=10, disc_steps=1,
          gen_lr_schedule=None, disc_lr_schedule=None, model_dir=os.getcwd(), save_interval=100,
          notebook_mode=False, batch_log=True, save_logs=None, display_metrics=True, save_metrics=True):

    assert (len(target.dataset) == len(source.dataset))
    assert (disc_steps < len(target.dataset))

    if not os.path.exists(model_dir):
        os.mkdir(model_dir)
    self.model_dir = model_dir
    models_gen = os.path.join(model_dir, "gen_models")
    models_disc = os.path.join(model_dir, "disc_models")
    if not os.path.exists(models_gen):
        os.mkdir(models_gen)
    if not os.path.exists(models_disc):
        os.mkdir(models_disc)

    iterations = 0

    from tqdm import tqdm_notebook
    from tqdm import tqdm

    train_start_time = time()
    for e in tqdm(range(num_epochs)):
        self.gen_model.train()
        self.disc_model.train()
        self.on_epoch_start(e)

        running_gen_loss = torch.Tensor([0.0])
        running_disc_loss = torch.Tensor([0.0])
        gen_loss = 0.0
        disc_loss = 0.0
        gen_data_len = 0
        disc_data_len = 0

        if notebook_mode and batch_log:
            progress_ = tqdm_notebook(enumerate(zip(target, source)))
        elif batch_log:
            progress_ = tqdm(enumerate(zip(target, source)))
        else:
            progress_ = enumerate(zip(target, source))

        init_time = time()
        for i, (t, s) in progress_:
            if isinstance(t, list) or isinstance(t, tuple):
                inputs = t[0]
            else:
                inputs = t
            batch_size = inputs.size(0)
            disc_data_len += batch_size
            if len(self.__input_hooks) > 0:
                for hook in self.__input_hooks:
                    inputs = hook(inputs)
                if isinstance(t, list):
                    t[0] = inputs
                elif isinstance(t, tuple):
                    t = (inputs, t[1])
                else:
                    t = inputs

            self.__disc_train_func__(t, s, disc_optimizer, running_disc_loss, e, i)
            disc_loss = running_disc_loss.data[0] / disc_data_len

            if (i + 1) % disc_steps == 0:
                self.__gen_train_func__(t, s, gen_optimizer, running_gen_loss, e, i)
                gen_data_len += batch_size
                gen_loss = running_gen_loss.data[0] / gen_data_len

            if batch_log:
                progress_dict = {"Gen Loss": gen_loss, "Disc Loss": disc_loss}
                progress_.set_postfix(progress_dict)

            iterations += 1
            if iterations % save_interval == 0:
                self.save(s, iterations)
                self.show(s, iterations)

            self.on_batch_end(e, i, gen_loss, disc_loss)

        if self.cuda:
            cuda.synchronize()

        duration = time() - init_time
        self.disc_loss_history.append(disc_loss)
        self.gen_loss_history.append(gen_loss)

        if gen_lr_schedule is not None:
            lr = gen_lr_schedule(e)
            adjust_learning_rate(lr, gen_optimizer)
        if disc_lr_schedule is not None:
            lr = disc_lr_schedule(e)
            adjust_learning_rate(lr, disc_optimizer)

        model_file = os.path.join(models_gen, "gen_model_{}.pth".format(e))
        self.save_generator(model_file)
        model_file = os.path.join(models_disc, "disc_model_{}.pth".format(e))
        self.save_discriminator(model_file)

        print("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(
            e, duration, gen_loss, disc_loss))
        if save_logs is not None:
            logfile = open(save_logs, "a")
            logfile.write("Epoch: {}, Duration: {} , Gen Loss: {} Disc Loss: {}".format(
                e, duration, gen_loss, disc_loss))
            logfile.close()

        epoch_arr = [x for x in range(e + 1)]
        if display_metrics or save_metrics:
            save_path = None
            if save_metrics:
                save_path = os.path.join(model_dir, "epoch_{}_loss.png".format(e))
            visualize(epoch_arr, [
                PlotInput(value=self.gen_loss_history, name="Generator Loss", color="red"),
                PlotInput(value=self.disc_loss_history, name="Discriminator Loss", color="red")
            ], display=display_metrics, save_path=save_path)

        self.on_epoch_end(e, gen_loss, disc_loss, duration)

    train_end_time = time() - train_start_time
    self.on_training_completed(train_end_time)