def train(cfg):
    n_class = int(cfg["data"]["n_class"])
    img_h = int(cfg["data"]["img_h"])
    img_w = int(cfg["data"]["img_w"])
    batch_size = int(cfg["training"]["batch_size"])
    epochs = int(cfg["training"]["epochs"])
    lr = float(cfg["training"]["optimizer"]["lr"])
    momentum = float(cfg["training"]["optimizer"]["momentum"])
    w_decay = float(cfg["training"]["optimizer"]["weight_decay"])
    step_size = int(cfg["training"]["lr_schedule"]["step_size"])
    gamma = float(cfg["training"]["lr_schedule"]["gamma"])

    configs = ("FCNs-BCEWithLogits_batch{}_epoch{}_RMSprop_scheduler-step{}-gamma{}"
               "_lr{}_momentum{}_w_decay{}_input_size{}_03091842").format(
        batch_size, epochs, step_size, gamma, lr, momentum, w_decay, img_h)
    print("Configs:", configs)

    root_dir = cfg["data"]["root_dir"]
    train_file = os.path.join(root_dir, cfg["data"]["train_file"])
    print(train_file)
    val_file = os.path.join(root_dir, cfg["data"]["val_file"])
    mean_file = os.path.join(root_dir, cfg["data"]["mean_file"])
    class_weight_file = os.path.join(root_dir, cfg["data"]["class_weight_file"])

    model_dir = cfg["training"]["model_dir"]
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    model_path = os.path.join(model_dir, configs)

    use_gpu = torch.cuda.is_available()
    num_gpu = list(range(torch.cuda.device_count()))
    continue_train = False

    # MeanRGB_train = ComputeMeanofInput(train_file)
    # MeanRGB_train = np.load(mean_file)
    MeanRGB_train = np.array([0.0, 0.0, 0.0])
    print("MeanRGB_train: {}".format(MeanRGB_train))

    train_data = ScanNet2d(csv_file=train_file, phase='train',
                           trainsize=(img_h, img_w), MeanRGB=MeanRGB_train)
    val_data = ScanNet2d(csv_file=val_file, phase='val',
                         trainsize=(img_h, img_w), MeanRGB=MeanRGB_train)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=1)
    val_loader = DataLoader(val_data, batch_size=batch_size, shuffle=False, num_workers=1)

    # class_weight = trainer.computer_class_weights(train_file)
    class_weight = np.load(class_weight_file)
    print("class_weight: {}".format(class_weight))
    class_weight = torch.from_numpy(class_weight)
    print("shape of class weight {}".format(class_weight.shape))

    vgg_model = VGGNet(requires_grad=True, remove_fc=True)
    fcn_model = FCN8s(encoder_net=vgg_model, n_class=n_class)
    if use_gpu:
        ts = time.time()
        vgg_model = vgg_model.cuda()
        fcn_model = fcn_model.cuda()
        fcn_model = nn.DataParallel(fcn_model, device_ids=num_gpu)
        class_weight = class_weight.cuda()
        print("Finish cuda loading, time elapsed {}".format(time.time() - ts))

    L = nn.BCEWithLogitsLoss(reduction='none')
    optimizer = optim.RMSprop(fcn_model.parameters(), lr=lr,
                              momentum=momentum, weight_decay=w_decay)
    # optimizer = optim.SGD(fcn_model.parameters(), lr=lr, momentum=momentum, weight_decay=w_decay)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    score_dir = os.path.join("scores", configs)
    if not os.path.exists(score_dir):
        os.makedirs(score_dir)
    log_headers = [
        'epoch',
        'train/loss', 'train/acc', 'train/acc_cls', 'train/mean_iu', 'train/fwavacc',
        'val/loss', 'val/acc', 'val/acc_cls', 'val/mean_iu', 'val/fwavacc',
        'elapsed_time',
    ]
    if not os.path.exists(os.path.join(score_dir, 'log.csv')):
        with open(os.path.join(score_dir, 'log.csv'), 'w') as f:
            f.write(','.join(log_headers) + '\n')

    IU_scores = np.zeros((epochs, n_class + 1))
    pixel_scores = np.zeros(epochs)
    writer = SummaryWriter()
    # color_mapping = util.GenerateColorMapping(n_class)
    best_mean_iu = 0
    epoch_loss = 0.0

    if continue_train:
        model_path = ("C:\\Users\\ji\\Documents\\FCN-VGG16\\models\\"
                      "FCNs-BCEWithLogits_batch1_epoch500_RMSprop_scheduler-step50-gamma0.5"
                      "_lr0.0001_momentum0.0_w_decay1e-05")
        fcn_model = torch.load(model_path)
        fcn_model.train()

    for epoch in range(epochs):
        fcn_model.train()
        scheduler.step()
        ts = time.time()
        running_loss = 0.0
        label_preds = []
        label_trues = []
        for i, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # Variable is a no-op wrapper on modern PyTorch; kept for compatibility
            if use_gpu:
                inputs = Variable(batch['X'].cuda())
                labels = Variable(batch['Y'].cuda())
            else:
                inputs, labels = Variable(batch['X']), Variable(batch['Y'])

            outputs = fcn_model(inputs)
            # collapse N x C x H x W to (N*H*W) x C, average per pixel to get one
            # value per class, weight by class frequency, then reduce to a scalar
            loss = L(outputs, labels)
            loss = loss.permute(0, 2, 3, 1).reshape(-1, n_class + 1)
            loss = torch.mean(loss, dim=0)
            loss = torch.mul(loss, class_weight)
            loss = torch.mean(loss)
            loss.backward()
            optimizer.step()

            if i == 0 and epoch == 0:
                # keep a fixed mini-batch for visualizing training progress
                visIn = inputs[:3]
                visLabel = batch['l'][:3]

            epoch_loss += loss.item()
            running_loss += loss.item()
            if i % 10 == 9:
                print("epoch{}, iter{}, Iterloss: {}".format(epoch, i, running_loss / 10))
                writer.add_scalar('train/iter_loss', running_loss / 10,
                                  epoch * len(train_loader) + i)
                running_loss = 0.0

            # accumulate per-image predictions and targets for the epoch metrics
            outputs = outputs.data.cpu().numpy()
            N, _, h, w = outputs.shape
            pred = outputs.transpose(0, 2, 3, 1).reshape(-1, n_class + 1) \
                          .argmax(axis=1).reshape(N, h, w)
            target = batch['l'].cpu().numpy().reshape(N, h, w)
            for lt, lp in zip(target, pred):
                label_trues.append(lt)
                label_preds.append(lp)

        metrics = util.label_accuracy_score(label_trues, label_preds, n_class + 1)
        with open(os.path.join(score_dir, "log.csv"), 'a') as f:
            log = [epoch] + [epoch_loss] + list(metrics) + [''] * 7
            log = map(str, log)
            f.write(','.join(log) + '\n')

        writer.add_scalar('train/epoch_loss', epoch_loss, epoch)
        print("Finish epoch{}, epoch loss {}, time elapsed {}".format(
            epoch, epoch_loss, time.time() - ts))
        epoch_loss = 0.0

        writer.add_scalar('train/mean_iu', metrics[2], epoch)
        writer.add_scalar('train/acc', metrics[0], epoch)
        writer.add_scalar('train/acc_cls', metrics[1], epoch)

        # visualize training progress on the fixed mini-batch
        visOut = fcn_model(visIn)
        preds_v, targets_v = util.visulaize_output(visOut, visLabel, n_class)
        writer.add_images('train/predictions', torch.from_numpy(preds_v),
                          global_step=epoch, dataformats='NHWC')
        writer.add_images('train/targets', torch.from_numpy(targets_v),
                          global_step=epoch, dataformats='NHWC')

        if not os.path.exists(model_path):
            os.makedirs(model_path)
        torch.save(fcn_model, os.path.join(model_path, str(epoch)))

        best_mean_iu = val_model(epoch, val_loader, fcn_model, use_gpu, n_class,
                                 IU_scores, pixel_scores, score_dir, writer,
                                 best_mean_iu, model_path, L)
    writer.flush()
    writer.close()
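# The loss reduction in train() above flattens N x C x H x W logits to
# (N*H*W) x C, averages over pixels to get one value per class, then applies
# the class weights before the final mean. A minimal sketch of that reduction
# on random tensors, assuming only torch; shapes and weights are illustrative.
import torch
import torch.nn as nn

N, C, H, W = 2, 5, 4, 4                           # illustrative shapes
logits = torch.randn(N, C, H, W)
targets = torch.randint(0, 2, (N, C, H, W)).float()
class_weight = torch.rand(C)                      # illustrative per-class weights

loss = nn.BCEWithLogitsLoss(reduction='none')(logits, targets)
loss = loss.permute(0, 2, 3, 1).reshape(-1, C)    # (N*H*W) x C
per_class = loss.mean(dim=0)                      # one value per class
weighted = (per_class * class_weight).mean()      # scalar used for backward()

# same result computed class by class, as a sanity check
check = sum(loss[:, c].mean() * class_weight[c] for c in range(C)) / C
assert torch.allclose(weighted, check)
print(weighted.item())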
def get_summary_writer():
    name = str(datetime.datetime.now())[:19]
    utils.make_dir(PATH['TF_LOGS'])
    logs_path = os.path.join(PATH['TF_LOGS'], name)
    return SummaryWriter(logs_path)
def step(self, action):
    observation, reward, done, info = super(MicroRTSStatsRecorder, self).step(action)
    self.raw_rewards += [info["raw_rewards"]]
    if done:
        raw_rewards = np.array(self.raw_rewards).sum(0)
        raw_names = [str(rf) for rf in self.rfs]
        info['microrts_stats'] = dict(zip(raw_names, raw_rewards))
        self.raw_rewards = []
    return observation, reward, done, info


# TRY NOT TO MODIFY: setup the environment
experiment_name = f"{args.gym_id}__{args.exp_name}__{args.seed}__{int(time.time())}"
writer = SummaryWriter(f"runs/{experiment_name}")
writer.add_text(
    'hyperparameters',
    "|param|value|\n|-|-|\n%s" % ('\n'.join([f"|{key}|{value}|" for key, value in vars(args).items()])))
if args.prod_mode:
    import wandb
    run = wandb.init(project=args.wandb_project_name, entity=args.wandb_entity,
                     sync_tensorboard=True, config=vars(args), name=experiment_name,
                     monitor_gym=True, save_code=True)
    writer = SummaryWriter(f"/tmp/{experiment_name}")

# TRY NOT TO MODIFY: seeding
def test_discrete_bcq(args=get_args()):
    # envs
    env = make_atari_env(args)
    args.state_shape = env.observation_space.shape or env.observation_space.n
    args.action_shape = env.action_space.shape or env.action_space.n
    # should be N_FRAMES x H x W
    print("Observations shape:", args.state_shape)
    print("Actions shape:", args.action_shape)
    # make environments
    test_envs = ShmemVectorEnv(
        [lambda: make_atari_env_watch(args) for _ in range(args.test_num)])
    # seed
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    test_envs.seed(args.seed)
    # model
    feature_net = DQN(*args.state_shape, args.action_shape,
                      device=args.device, features_only=True).to(args.device)
    policy_net = Actor(feature_net, args.action_shape, device=args.device,
                       hidden_sizes=args.hidden_sizes,
                       softmax_output=False).to(args.device)
    imitation_net = Actor(feature_net, args.action_shape, device=args.device,
                          hidden_sizes=args.hidden_sizes,
                          softmax_output=False).to(args.device)
    optim = torch.optim.Adam(
        list(policy_net.parameters()) + list(imitation_net.parameters()),
        lr=args.lr)
    # define policy
    policy = DiscreteBCQPolicy(policy_net, imitation_net, optim, args.gamma,
                               args.n_step, args.target_update_freq,
                               args.eps_test, args.unlikely_action_threshold,
                               args.imitation_logits_penalty)
    # load a previous policy
    if args.resume_path:
        policy.load_state_dict(
            torch.load(args.resume_path, map_location=args.device))
        print("Loaded agent from: ", args.resume_path)
    # buffer
    assert os.path.exists(args.load_buffer_name), \
        "Please run atari_dqn.py first to get expert's data buffer."
    if args.load_buffer_name.endswith('.pkl'):
        buffer = pickle.load(open(args.load_buffer_name, "rb"))
    elif args.load_buffer_name.endswith('.hdf5'):
        buffer = VectorReplayBuffer.load_hdf5(args.load_buffer_name)
    else:
        print(f"Unknown buffer format: {args.load_buffer_name}")
        exit(0)
    # collector
    test_collector = Collector(policy, test_envs, exploration_noise=True)
    # log
    log_path = os.path.join(
        args.logdir, args.task, 'bcq',
        f'seed_{args.seed}_{datetime.datetime.now().strftime("%m%d-%H%M%S")}')
    writer = SummaryWriter(log_path)
    writer.add_text("args", str(args))
    logger = TensorboardLogger(writer, update_interval=args.log_interval)

    def save_fn(policy):
        torch.save(policy.state_dict(), os.path.join(log_path, 'policy.pth'))

    def stop_fn(mean_rewards):
        return False

    # watch agent's performance
    def watch():
        print("Setup test envs ...")
        policy.eval()
        policy.set_eps(args.eps_test)
        test_envs.seed(args.seed)
        print("Testing agent ...")
        test_collector.reset()
        result = test_collector.collect(n_episode=args.test_num,
                                        render=args.render)
        pprint.pprint(result)
        rew = result["rews"].mean()
        print(f'Mean reward (over {result["n/ep"]} episodes): {rew}')

    if args.watch:
        watch()
        exit(0)

    result = offline_trainer(policy, buffer, test_collector, args.epoch,
                             args.update_per_epoch, args.test_num,
                             args.batch_size, stop_fn=stop_fn,
                             save_fn=save_fn, logger=logger)
    pprint.pprint(result)
    watch()
def initialize():
    # Training settings
    parser = argparse.ArgumentParser(
        description='PyTorch ImageNet Example',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--train-dir', default='/home/cys/datas/data_back/imagenet12/train/',
                        help='path to training data')
    parser.add_argument('--val-dir', default='/home/cys/datas/data_back/imagenet12/val/',
                        help='path to validation data')
    parser.add_argument('--log-dir', default='./logs',
                        help='tensorboard/checkpoint log directory')
    parser.add_argument('--checkpoint-format', default='checkpoint-{epoch}.pth.tar',
                        help='checkpoint file format')
    parser.add_argument('--fp16-allreduce', action='store_true', default=False,
                        help='use fp16 compression during allreduce')
    parser.add_argument('--batches-per-allreduce', type=int, default=1,
                        help='number of batches processed locally before '
                             'executing allreduce across workers; it multiplies '
                             'total batch size.')
    # Default settings from https://arxiv.org/abs/1706.02677.
    parser.add_argument('--model', default='resnet50',
                        help='Model (resnet35, resnet50, resnet101, resnet152, resnext50, resnext101)')
    parser.add_argument('--batch-size', type=int, default=32,
                        help='input batch size for training')
    parser.add_argument('--val-batch-size', type=int, default=32,
                        help='input batch size for validation')
    parser.add_argument('--epochs', type=int, default=90,
                        help='number of epochs to train')
    parser.add_argument('--base-lr', type=float, default=0.0125,
                        help='learning rate for a single GPU')
    parser.add_argument('--lr-decay', nargs='+', type=int, default=[30, 60, 80],
                        help='epoch intervals to decay lr')
    parser.add_argument('--warmup-epochs', type=float, default=5,
                        help='number of warmup epochs')
    parser.add_argument('--momentum', type=float, default=0.9,
                        help='SGD momentum')
    parser.add_argument('--wd', type=float, default=0.00005,
                        help='weight decay')
    parser.add_argument('--label-smoothing', type=float, default=0.1,
                        help='label smoothing (default 0.1)')
    # KFAC Parameters
    parser.add_argument('--kfac-update-freq', type=int, default=10,
                        help='iters between kfac inv ops (0 = no kfac) (default: 10)')
    parser.add_argument('--kfac-cov-update-freq', type=int, default=1,
                        help='iters between kfac cov ops (default: 1)')
    parser.add_argument('--kfac-update-freq-alpha', type=float, default=10,
                        help='KFAC update freq multiplier (default: 10)')
    parser.add_argument('--kfac-update-freq-decay', nargs='+', type=int, default=None,
                        help='KFAC update freq schedule (default None)')
    parser.add_argument('--stat-decay', type=float, default=0.95,
                        help='Alpha value for covariance accumulation (default: 0.95)')
    parser.add_argument('--damping', type=float, default=0.002,
                        help='KFAC damping factor (default 0.002)')
    parser.add_argument('--damping-alpha', type=float, default=0.5,
                        help='KFAC damping decay factor (default: 0.5)')
    parser.add_argument('--damping-decay', nargs='+', type=int, default=[40, 80],
                        help='KFAC damping decay schedule (default [40, 80])')
    parser.add_argument('--kl-clip', type=float, default=0.001,
                        help='KL clip (default: 0.001)')
    parser.add_argument('--diag-blocks', type=int, default=1,
                        help='Number of blocks to approx layer factor with (default: 1)')
    parser.add_argument('--diag-warmup', type=int, default=0,
                        help='Epoch to start diag block approximation at (default: 0)')
    parser.add_argument('--distribute-layer-factors', action='store_true', default=None,
                        help='Compute A and G for a single layer on different workers. '
                             'None to determine automatically based on worker and '
                             'layer count.')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--single-threaded', action='store_true', default=False,
                        help='disables multi-threaded dataloading')
    parser.add_argument('--seed', type=int, default=42,
                        help='random seed')

    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    hvd.init()
    torch.manual_seed(args.seed)
    args.verbose = 1 if hvd.rank() == 0 else 0
    if args.verbose:
        print(args)

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    cudnn.benchmark = True

    args.log_dir = os.path.join(
        args.log_dir,
        "imagenet_resnet50_kfac{}_gpu_{}_{}".format(
            args.kfac_update_freq, hvd.size(),
            datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
    args.checkpoint_format = os.path.join(args.log_dir, args.checkpoint_format)
    os.makedirs(args.log_dir, exist_ok=True)

    # If set > 0, will resume training from a given checkpoint.
    args.resume_from_epoch = 0
    for try_epoch in range(args.epochs, 0, -1):
        if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)):
            args.resume_from_epoch = try_epoch
            break

    # Horovod: broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    args.resume_from_epoch = hvd.broadcast(torch.tensor(args.resume_from_epoch),
                                           root_rank=0,
                                           name='resume_from_epoch').item()

    # Horovod: write TensorBoard logs on first worker.
    try:
        if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'):
            from torch.utils.tensorboard import SummaryWriter
        else:
            from tensorboardX import SummaryWriter
        args.log_writer = SummaryWriter(args.log_dir) if hvd.rank() == 0 else None
    except ImportError:
        args.log_writer = None

    return args
def _setup_writer(self):
    self.writer = SummaryWriter(log_dir=self.log_dir)

def __del__(self):
    self._remove()
[MODEL]
  - {cfg.model['type']}
[OPTIMIZER]
  - {optimizer.__class__.__name__}
  - hyper params : {optimizer.defaults}
[SCHEDULER]
  - {scheduler.__class__.__name__}
  - state_dict : {scheduler.state_dict()}
<-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><-><->
''')

min_val_loss = 99999
with SummaryWriter(log_dir=log_dir) as writer:
    for epoch in range(initial_epoch, cfg.runtime['epochs'] + initial_epoch):
        losses = {'train': defaultdict(lambda: 0), 'val': defaultdict(lambda: 0)}
        counts = {'train': 0, 'val': 0}
        result = []
        for phase, (images, image_metas, gt_bboxes, gt_labels) in tqdm(
                chain(dataloaders),
                total=sum(len(dl) for dl in dataloaders.values()),
                desc=f'[Epoch {epoch:3}]'):
            if phase == 'train':
                model.train()
                optimizer.zero_grad()
def calc_qvals(rewards):
    res = []
    sum_r = 0.0
    for r in reversed(rewards):
        sum_r *= GAMMA
        sum_r += r
        res.append(sum_r)
    res = list(reversed(res))
    mean_q = np.mean(res)
    return [q - mean_q for q in res]


if __name__ == "__main__":
    env = gym.make("CartPole-v0")
    writer = SummaryWriter(comment="-cartpole-reinforce-baseline")

    net = PGN(env.observation_space.shape[0], env.action_space.n)
    print(net)

    agent = ptan.agent.PolicyAgent(net, preprocessor=ptan.agent.float32_preprocessor,
                                   apply_softmax=True)
    exp_source = ptan.experience.ExperienceSourceFirstLast(env, agent, gamma=GAMMA)
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    step_idx = 0
    done_episodes = 0
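# calc_qvals above turns per-step rewards into discounted returns and subtracts
# their mean as a baseline. A quick worked check with a constant reward stream,
# assuming GAMMA = 0.99 (an assumption; the constant is defined elsewhere):
# q_t = sum_{k>=t} GAMMA^(k-t) * r_k  ->  [1 + 0.99 + 0.99^2, 1 + 0.99, 1]
GAMMA = 0.99
qvals = calc_qvals([1.0, 1.0, 1.0])
raw = [1.0 + 0.99 + 0.99 ** 2, 1.0 + 0.99, 1.0]   # [2.9701, 1.99, 1.0]
mean = sum(raw) / 3.0
assert all(abs(q - (r - mean)) < 1e-9 for q, r in zip(qvals, raw))
# the baseline-subtracted values sum to ~0, which reduces gradient variance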
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--device', default='0,1,2,3', type=str, required=False,
                        help='which GPUs to use')
    parser.add_argument('--model_config', default='config/model_config_small.json',
                        type=str, required=False, help='model config file')
    parser.add_argument('--tokenizer_path', default='cache/vocab_small.txt',
                        type=str, required=False, help='vocabulary file')
    parser.add_argument('--raw_data_path', default='data/train.json',
                        type=str, required=False, help='raw training corpus')
    parser.add_argument('--tokenized_data_path', default='data/tokenized/',
                        type=str, required=False,
                        help='where to store the tokenized corpus')
    parser.add_argument('--raw', action='store_true', help='tokenize the raw corpus first')
    parser.add_argument('--epochs', default=5, type=int, required=False,
                        help='number of training epochs')
    parser.add_argument('--batch_size', default=8, type=int, required=False,
                        help='training batch size')
    parser.add_argument('--lr', default=1.5e-4, type=float, required=False,
                        help='learning rate')
    parser.add_argument('--warmup_steps', default=2000, type=int, required=False,
                        help='number of warmup steps')
    parser.add_argument('--log_step', default=1, type=int, required=False,
                        help='report the loss every this many steps')
    parser.add_argument('--stride', default=768, type=int, required=False,
                        help='stride of the sampling window over the training data')
    parser.add_argument('--gradient_accumulation', default=1, type=int, required=False,
                        help='gradient accumulation steps')
    parser.add_argument('--fp16', action='store_true', help='mixed-precision training')
    parser.add_argument('--fp16_opt_level', default='O1', type=str, required=False)
    parser.add_argument('--max_grad_norm', default=1.0, type=float, required=False)
    parser.add_argument('--num_pieces', default=100, type=int, required=False,
                        help='number of pieces to split the training corpus into')
    parser.add_argument('--min_length', default=128, type=int, required=False,
                        help='minimum article length to include')
    parser.add_argument('--output_dir', default='model/', type=str, required=False,
                        help='model output directory')
    parser.add_argument('--pretrained_model', default='', type=str, required=False,
                        help='pretrained model to start training from')
    parser.add_argument('--writer_dir', default='tensorboard_summary/', type=str,
                        required=False, help='Tensorboard log directory')
    parser.add_argument('--segment', action='store_true',
                        help='segment Chinese at the word level')
    parser.add_argument('--bpe_token', action='store_true', help='use subword (BPE) tokens')
    parser.add_argument('--encoder_json', default="tokenizations/encoder.json",
                        type=str, help="encoder.json")
    parser.add_argument('--vocab_bpe', default="tokenizations/vocab.bpe",
                        type=str, help="vocab.bpe")

    args = parser.parse_args()
    print('args:\n' + args.__repr__())

    if args.segment:
        from tokenizations import tokenization_bert_word_level as tokenization_bert
    else:
        from tokenizations import tokenization_bert

    os.environ["CUDA_VISIBLE_DEVICES"] = args.device  # set which GPUs the program may use

    model_config = pytorch_transformers.modeling_gpt2.GPT2Config.from_json_file(args.model_config)
    print('config:\n' + model_config.to_json_string())
    n_ctx = model_config.n_ctx
    if args.bpe_token:
        full_tokenizer = get_encoder(args.encoder_json, args.vocab_bpe)
    else:
        full_tokenizer = tokenization_bert.BertTokenizer(vocab_file=args.tokenizer_path)
    full_tokenizer.max_len = n_ctx
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print('using device:', device)

    raw_data_path = args.raw_data_path
    tokenized_data_path = args.tokenized_data_path
    raw = args.raw  # whether to build the dataset from scratch
    epochs = args.epochs
    batch_size = args.batch_size
    lr = args.lr
    warmup_steps = args.warmup_steps
    log_step = args.log_step
    stride = args.stride
    gradient_accumulation = args.gradient_accumulation
    fp16 = args.fp16  # do not enable on GPUs without half-precision support
    fp16_opt_level = args.fp16_opt_level
    max_grad_norm = args.max_grad_norm
    num_pieces = args.num_pieces
    min_length = args.min_length
    output_dir = args.output_dir
    tb_writer = SummaryWriter(log_dir=args.writer_dir)

    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    if raw:
        print('building files')
        build_files(data_path=raw_data_path, tokenized_data_path=tokenized_data_path,
                    num_pieces=num_pieces, full_tokenizer=full_tokenizer,
                    min_length=min_length)
        print('files built')

    if not args.pretrained_model:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel(config=model_config)
    else:
        model = pytorch_transformers.modeling_gpt2.GPT2LMHeadModel.from_pretrained(args.pretrained_model)
    model.train()
    model.to(device)

    num_parameters = 0
    parameters = model.parameters()
    for parameter in parameters:
        num_parameters += parameter.numel()
    print('number of parameters: {}'.format(num_parameters))

    multi_gpu = False
    full_len = 0
    print('calculating total steps')
    for i in tqdm(range(num_pieces)):
        with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
            full_len += len([int(item) for item in f.read().strip().split()])
    total_steps = int(full_len / stride * epochs / batch_size / gradient_accumulation)
    print('total steps = {}'.format(total_steps))

    optimizer = pytorch_transformers.AdamW(model.parameters(), lr=lr, correct_bias=True)
    scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer,
                                                          warmup_steps=warmup_steps,
                                                          t_total=total_steps)
    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=fp16_opt_level)

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = DataParallel(model)
        multi_gpu = True

    print('starting training')
    overall_step = 0
    running_loss = 0
    for epoch in range(epochs):
        print('epoch {}'.format(epoch + 1))
        now = datetime.now()
        print('time: {}'.format(now))
        x = np.linspace(0, num_pieces - 1, num_pieces, dtype=np.int32)
        random.shuffle(x)
        piece_num = 0
        for i in x:
            with open(tokenized_data_path + 'tokenized_train_{}.txt'.format(i), 'r') as f:
                line = f.read().strip()
            tokens = line.split()
            tokens = [int(token) for token in tokens]
            start_point = 0
            samples = []
            while start_point < len(tokens) - n_ctx:
                samples.append(tokens[start_point: start_point + n_ctx])
                start_point += stride
            random.shuffle(samples)
            for step in range(len(samples) // batch_size):  # drop last
                # prepare data
                batch = samples[step * batch_size: (step + 1) * batch_size]
                batch_labels = []
                batch_inputs = []
                for ids in batch:
                    int_ids_for_labels = [int(x) for x in ids]
                    int_ids_for_inputs = [int(x) for x in ids]
                    batch_labels.append(int_ids_for_labels)
                    batch_inputs.append(int_ids_for_inputs)
                batch_labels = torch.tensor(batch_labels).long().to(device)
                batch_inputs = torch.tensor(batch_inputs).long().to(device)

                # forward pass
                outputs = model.forward(input_ids=batch_inputs, labels=batch_labels)
                loss, logits = outputs[:2]

                # get loss
                if multi_gpu:
                    loss = loss.mean()
                if gradient_accumulation > 1:
                    loss = loss / gradient_accumulation

                # loss backward
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

                # optimizer step
                if (step + 1) % gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    optimizer.zero_grad()
                    scheduler.step()
                    overall_step += 1
                    if (overall_step + 1) % log_step == 0:
                        tb_writer.add_scalar('loss', loss.item(), overall_step)
                        print('now time: {}:{}. Step {} of piece {} of epoch {}, loss {}'.format(
                            datetime.now().hour, datetime.now().minute,
                            (step + 1) // gradient_accumulation, piece_num, epoch + 1,
                            running_loss * gradient_accumulation / log_step))
                        running_loss = 0
            piece_num += 1

        print('saving model for epoch {}'.format(epoch + 1))
        if not os.path.exists(output_dir + 'model_epoch{}'.format(epoch + 1)):
            os.mkdir(output_dir + 'model_epoch{}'.format(epoch + 1))
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir + 'model_epoch{}'.format(epoch + 1))
        # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
        # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))
        print('epoch {} finished'.format(epoch + 1))
        then = datetime.now()
        print('time: {}'.format(then))
        print('time for one epoch: {}'.format(then - now))

    print('training finished')
    if not os.path.exists(output_dir + 'final_model'):
        os.mkdir(output_dir + 'final_model')
    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(output_dir + 'final_model')
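# The sampling loop above slides a window of n_ctx tokens over each tokenized
# piece with step `stride`; when stride < n_ctx the samples overlap, matching
# the total_steps estimate of roughly full_len / stride windows per epoch.
# A small sketch of that slicing with toy numbers (n_ctx and stride here are
# illustrative, not the script's defaults):
tokens = list(range(10))   # stands in for a tokenized piece
n_ctx, stride = 4, 2

samples = []
start_point = 0
while start_point < len(tokens) - n_ctx:
    samples.append(tokens[start_point: start_point + n_ctx])
    start_point += stride

# overlapping windows: [0..3], [2..5], [4..7]
print(samples)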
# Hyperparameters
with open(opt.hyp) as f:
    hyp = yaml.load(f, Loader=yaml.FullLoader)  # load hyps
    if 'box' not in hyp:
        warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' %
             (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120'))
        hyp['box'] = hyp.pop('giou')

# Train
logger.info(opt)
if not opt.evolve:
    tb_writer = None  # init loggers
    if opt.global_rank in [-1, 0]:
        logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/')
        tb_writer = SummaryWriter(opt.save_dir)  # Tensorboard
    train(hyp, opt, device, tb_writer, wandb)

# Evolve hyperparameters (optional)
else:
    # Hyperparameter evolution metadata (mutation scale 0-1, lower_limit, upper_limit)
    meta = {'lr0': (1, 1e-5, 1e-1),  # initial learning rate (SGD=1E-2, Adam=1E-3)
            'lrf': (1, 0.01, 1.0),  # final OneCycleLR learning rate (lr0 * lrf)
            'momentum': (0.3, 0.6, 0.98),  # SGD momentum/Adam beta1
            'weight_decay': (1, 0.0, 0.001),  # optimizer weight decay
            'warmup_epochs': (1, 0.0, 5.0),  # warmup epochs (fractions ok)
            'warmup_momentum': (1, 0.0, 0.95),  # warmup initial momentum
            'warmup_bias_lr': (1, 0.0, 0.2),  # warmup initial bias lr
            'box': (1, 0.02, 0.2),  # box loss gain
            'cls': (1, 0.2, 4.0),  # cls loss gain
            'cls_pw': (1, 0.5, 2.0),  # cls BCELoss positive_weight
# load dataset
train_dataset = datasets.MNIST(root='dataset/', train=True,
                               transform=transforms.ToTensor(), download=True)
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)

model = CNN(in_channels=in_channels, num_classes=num_classes)
model.to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0)

writer = SummaryWriter('runs/MNIST/traingout_tensorboard')
step = 0

for epoch in range(epochs):
    losses = []
    accuracies = []
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(device)
        targets = targets.to(device)

        scores = model(data)
        loss = criterion(scores, targets)
        losses.append(loss.item())

        # backward
        optimizer.zero_grad()
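# The loop above accumulates `losses` and `accuracies` for TensorBoard but the
# excerpt cuts off before the logging. A hedged sketch of how a running batch
# accuracy is typically computed from `scores` and written with the writer;
# variable names follow the snippet, the scalar tags are assumptions:
_, predictions = scores.max(dim=1)
num_correct = (predictions == targets).sum()
running_acc = float(num_correct) / float(data.shape[0])
accuracies.append(running_acc)

writer.add_scalar('Training loss', loss.item(), global_step=step)
writer.add_scalar('Training accuracy', running_acc, global_step=step)
step += 1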
# ============================ step 3/5 loss function ============================
criterion = nn.CrossEntropyLoss()  # choose the loss function

# ============================ step 4/5 optimizer ============================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)  # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # learning-rate decay policy

# ============================ step 5/5 training ============================
train_curve = list()
valid_curve = list()
iter_count = 0

# build the SummaryWriter
writer = SummaryWriter(comment='test_your_comment', filename_suffix="_test_your_filename_suffix")

for epoch in range(MAX_EPOCH):
    loss_mean = 0.
    correct = 0.
    total = 0.

    net.train()
    for i, data in enumerate(train_loader):
        iter_count += 1

        # forward
        inputs, labels = data
        outputs = net(inputs)
print("\n\tLoad Complete !") #load pretrained model and reset fully connected layer model = resnet50(pretrained = True) num_features = model.fc.in_features model.fc = nn.Linear(num_features, 2) model.cuda() # Observe that all parameters are being optimized criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=momentum) # Decay LR by a factor of 0.1 every 7 epochs exp_lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1) #start training summary = SummaryWriter() training_accuracy = [] validation_accuracy = [] for epoch in range(num_epochs): # training set -- perform model training epoch_training_loss = 0.0 num_batches = 0 for batch_num, training_batch in enumerate(train_loader): # 'enumerate' is a super helpful function # split training data into inputs and labels inputs, labels = training_batch # 'training_batch' is a list # wrap data in 'Variable' inputs, labels = torch.autograd.Variable(inputs.cuda()), torch.autograd.Variable(labels.cuda()) # Make gradients zero for parameters 'W', 'b' optimizer.zero_grad()
parser.add_argument('--prefix', type=str, default='', help='log prefix')
args = parser.parse_args()

print("\n==> Arguments passed in were:\n")
for key, value in args.__dict__.items():
    print(f"{key}: {value}")
print("")

torch.manual_seed(args.seed)
np.random.seed(args.seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

log_dir = f'logs/paper/{args.prefix}{args.optim}-{args.initial_lr}-seed{args.seed}/'
writer = SummaryWriter(log_dir=log_dir)
os.mkdir(log_dir + 'samples')

#########################################
#### Prepare data #######################
#########################################
print("\n==> Downloading CIFAR-10 dataset...\n")
transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])  # normalizes pixels to be in range (-1,1)
trainset = torchvision.datasets.CIFAR10(root='./data',
parser.add_argument("--model_path", default="") parser.add_argument("--texture", default="") if __name__ == '__main__': opt = parser.parse_args() # =========================================================== # Set train dataset & test dataset # =========================================================== opt.main_path, main_filename = mkdir_path(opt,os.path.basename(__file__)) UPSCALE_FACTOR = opt.upscale_factor NUM_EPOCHS = opt.num_epochs opt.GPU_IN_USE = torch.cuda.is_available() opt.device = torch.device('cuda' if opt.GPU_IN_USE else 'cpu') zipDir(os.getcwd(), opt.main_path+"code.zip") writer = SummaryWriter(opt.log_path) print('===> Loading datasets') train_loader = create_data_loader(opt) #create_old_loader(args) #create_data_loader(args) val_loader = create_test_loader(opt) print('===> Construct network') netG = SKINET(in_channels=3, out_channels=3, nf=64, scale_factor=1).to(opt.device) # netG = Generator(UPSCALE_FACTOR) netD = Discriminator() generator_criterion = GeneratorLoss() bce_loss = torch.nn.BCELoss() print('# generator parameters:', sum(param.numel() for param in netG.parameters())) print('# discriminator parameters:', sum(param.numel() for param in netD.parameters())) # print(netG) # print(netD)
if 'box' not in hyp:
    warn('Compatibility: %s missing "box" which was renamed from "giou" in %s' %
         (opt.hyp, 'https://github.com/ultralytics/yolov5/pull/1120'))
    hyp['box'] = hyp.pop('giou')

# Train
logger.info(opt)
if not opt.evolve:
    tb_writer, wandb = None, None  # init loggers
    if opt.global_rank in [-1, 0]:
        # Tensorboard
        logger.info(f'Start Tensorboard with "tensorboard --logdir {opt.project}", view at http://localhost:6006/')
        tb_writer = SummaryWriter(opt.save_dir)  # runs/train/exp
        # W&B
        try:
            import wandb
            assert os.environ.get('WANDB_DISABLED') != 'true'
        except (ImportError, AssertionError):
            logger.info("Install Weights & Biases for experiment logging via 'pip install wandb' (recommended)")
            wandb = None
    train(hyp, opt, device, tb_writer, wandb)

# Evolve hyperparameters (optional)
else:
trainset = trainset[:cutoff]
if p.shuffle_dataset:
    trainset = trainset.shuffle()
n_features = trainset.get(0).x.shape[1]

print('Setting up model...')
model = MultiScaleFeaStNet(4, heads=p.heads).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learn_rate, weight_decay=p.weight_decay)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
#                                                        factor=p.lr_decay,
#                                                        patience=p.patience)
writer = SummaryWriter(
    comment='model:{}_lr:{}_lr_decay:{}_shuffle:{}_seed:{}'.format(
        p.version, learn_rate, p.lr_decay, p.shuffle_dataset, p.random_seed))
# axes = [0, 1, 2]
max_roc_auc = 0

# ---- Training ----
print('Training...')
for epoch in range(1, epochs + 1):
    train_loader = DataLoader(trainset, shuffle=p.shuffle_dataset, batch_size=p.batch_size)
    val_loader = DataLoader(validset, shuffle=False, batch_size=p.test_batch_size)
def train(args):
    ## Set up training parameters
    mode = args.mode
    train_continue = args.train_continue

    lr = args.lr
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    data_dir = args.data_dir
    ckpt_dir = args.ckpt_dir
    log_dir = args.log_dir
    result_dir = args.result_dir

    task = args.task
    opts = [args.opts[0], np.asarray(args.opts[1:]).astype(np.float64)]

    ny = args.ny
    nx = args.nx
    nch = args.nch
    nker = args.nker

    wgt_cycle = args.wgt_cycle
    wgt_ident = args.wgt_ident
    norm = args.norm

    network = args.network
    learning_type = args.learning_type

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("mode: %s" % mode)
    print("norm: %s" % norm)
    print("learning rate: %.4e" % lr)
    print("batch size: %d" % batch_size)
    print("number of epoch: %d" % num_epoch)
    print("task: %s" % task)
    print("opts: %s" % opts)
    print("network: %s" % network)
    print("learning type: %s" % learning_type)
    print("data dir: %s" % data_dir)
    print("ckpt dir: %s" % ckpt_dir)
    print("log dir: %s" % log_dir)
    print("result dir: %s" % result_dir)
    print("device: %s" % device)

    ## Create directories
    result_dir_train = os.path.join(result_dir, 'train')
    if not os.path.exists(result_dir_train):
        os.makedirs(os.path.join(result_dir_train, 'png', 'a2b'))
        os.makedirs(os.path.join(result_dir_train, 'png', 'b2a'))

    ## Train the network
    if mode == 'train':
        transform_train = transforms.Compose([
            Resize(shape=(286, 286, nch)),
            RandomCrop((ny, nx)),
            Normalization(mean=MEAN, std=STD)
        ])

        dataset_train = Dataset(data_dir=os.path.join(data_dir, 'train'),
                                transform=transform_train, task=task, data_type='both')
        loader_train = DataLoader(dataset_train, batch_size=batch_size,
                                  shuffle=True, num_workers=NUM_WORKER)

        # set up other auxiliary variables
        num_data_train = len(dataset_train)
        num_batch_train = np.ceil(num_data_train / batch_size)

    ## Create the networks
    if network == "CycleGAN":
        netG_a2b = CycleGAN(in_channels=nch, out_channels=nch, nker=nker, norm=norm, nblk=9).to(device)
        netG_b2a = CycleGAN(in_channels=nch, out_channels=nch, nker=nker, norm=norm, nblk=9).to(device)
        netD_a = Discriminator(in_channels=nch, out_channels=1, nker=nker, norm=norm).to(device)
        netD_b = Discriminator(in_channels=nch, out_channels=1, nker=nker, norm=norm).to(device)

        init_weights(netG_a2b, init_type='normal', init_gain=0.02)
        init_weights(netG_b2a, init_type='normal', init_gain=0.02)
        init_weights(netD_a, init_type='normal', init_gain=0.02)
        init_weights(netD_b, init_type='normal', init_gain=0.02)

    ## Define the loss functions
    fn_cycle = nn.L1Loss().to(device)
    fn_gan = nn.BCELoss().to(device)
    fn_ident = nn.L1Loss().to(device)

    ## Set up the optimizers
    optimG = torch.optim.Adam(itertools.chain(netG_a2b.parameters(), netG_b2a.parameters()),
                              lr=lr, betas=(0.5, 0.999))
    optimD = torch.optim.Adam(itertools.chain(netD_a.parameters(), netD_b.parameters()),
                              lr=lr, betas=(0.5, 0.999))

    ## Set up other auxiliary functions
    fn_tonumpy = lambda x: x.to('cpu').detach().numpy().transpose(0, 2, 3, 1)
    fn_denorm = lambda x: (x * STD) + MEAN
    cmap = None

    ## Set up the SummaryWriter for Tensorboard
    writer_train = SummaryWriter(log_dir=os.path.join(log_dir, 'train'))

    ## Train the networks
    st_epoch = 0

    # TRAIN MODE
    if mode == 'train':
        if train_continue == "on":
            netG_a2b, netG_b2a, \
            netD_a, netD_b, \
            optimG, optimD, st_epoch = load(ckpt_dir=ckpt_dir,
                                            netG_a2b=netG_a2b, netG_b2a=netG_b2a,
                                            netD_a=netD_a, netD_b=netD_b,
                                            optimG=optimG, optimD=optimD)

        for epoch in range(st_epoch + 1, num_epoch + 1):
            netG_a2b.train()
            netG_b2a.train()
            netD_a.train()
            netD_b.train()

            loss_G_a2b_train = []
            loss_G_b2a_train = []
            loss_D_a_train = []
            loss_D_b_train = []
            loss_cycle_a_train = []
            loss_cycle_b_train = []
            loss_ident_a_train = []
            loss_ident_b_train = []

            for batch, data in enumerate(loader_train, 1):
                input_a = data['data_a'].to(device)
                input_b = data['data_b'].to(device)

                # forward netG
                output_b = netG_a2b(input_a)
                output_a = netG_b2a(input_b)

                recon_b = netG_a2b(output_a)
                recon_a = netG_b2a(output_b)

                # backward netD
                set_requires_grad([netD_a, netD_b], True)
                optimD.zero_grad()

                # backward netD_a
                pred_real_a = netD_a(input_a)
                pred_fake_a = netD_a(output_a.detach())

                loss_D_a_real = fn_gan(pred_real_a, torch.ones_like(pred_real_a))
                loss_D_a_fake = fn_gan(pred_fake_a, torch.zeros_like(pred_fake_a))
                loss_D_a = 0.5 * (loss_D_a_real + loss_D_a_fake)

                # backward netD_b
                pred_real_b = netD_b(input_b)
                pred_fake_b = netD_b(output_b.detach())

                loss_D_b_real = fn_gan(pred_real_b, torch.ones_like(pred_real_b))
                loss_D_b_fake = fn_gan(pred_fake_b, torch.zeros_like(pred_fake_b))
                loss_D_b = 0.5 * (loss_D_b_real + loss_D_b_fake)

                loss_D = loss_D_a + loss_D_b
                loss_D.backward()
                optimD.step()

                # backward netG
                set_requires_grad([netD_a, netD_b], False)
                optimG.zero_grad()

                pred_fake_a = netD_a(output_a)
                pred_fake_b = netD_b(output_b)

                loss_G_a2b = fn_gan(pred_fake_a, torch.ones_like(pred_fake_a))
                loss_G_b2a = fn_gan(pred_fake_b, torch.ones_like(pred_fake_b))

                loss_cycle_a = fn_cycle(input_a, recon_a)
                loss_cycle_b = fn_cycle(input_b, recon_b)

                ident_a = netG_b2a(input_a)
                ident_b = netG_a2b(input_b)

                loss_ident_a = fn_ident(input_a, ident_a)
                loss_ident_b = fn_ident(input_b, ident_b)

                loss_G = (loss_G_a2b + loss_G_b2a) + \
                         wgt_cycle * (loss_cycle_a + loss_cycle_b) + \
                         wgt_cycle * wgt_ident * (loss_ident_a + loss_ident_b)

                loss_G.backward()
                optimG.step()

                # accumulate the losses
                loss_G_a2b_train += [loss_G_a2b.item()]
                loss_G_b2a_train += [loss_G_b2a.item()]
                loss_D_a_train += [loss_D_a.item()]
                loss_D_b_train += [loss_D_b.item()]
                loss_cycle_a_train += [loss_cycle_a.item()]
                loss_cycle_b_train += [loss_cycle_b.item()]
                loss_ident_a_train += [loss_ident_a.item()]
                loss_ident_b_train += [loss_ident_b.item()]

                print("TRAIN: EPOCH %04d / %04d | BATCH %04d / %04d | "
                      "GEN a2b %.4f b2a %.4f | "
                      "DISC a %.4f b %.4f | "
                      "CYCLE a %.4f b %.4f | "
                      "IDENT a %.4f b %.4f | " %
                      (epoch, num_epoch, batch, num_batch_train,
                       np.mean(loss_G_a2b_train), np.mean(loss_G_b2a_train),
                       np.mean(loss_D_a_train), np.mean(loss_D_b_train),
                       np.mean(loss_cycle_a_train), np.mean(loss_cycle_b_train),
                       np.mean(loss_ident_a_train), np.mean(loss_ident_b_train)))

                if batch % 20 == 0:
                    # save images to Tensorboard
                    input_a = fn_tonumpy(fn_denorm(input_a)).squeeze()
                    input_b = fn_tonumpy(fn_denorm(input_b)).squeeze()
                    output_a = fn_tonumpy(fn_denorm(output_a)).squeeze()
                    output_b = fn_tonumpy(fn_denorm(output_b)).squeeze()

                    input_a = np.clip(input_a, a_min=0, a_max=1)
                    input_b = np.clip(input_b, a_min=0, a_max=1)
                    output_a = np.clip(output_a, a_min=0, a_max=1)
                    output_b = np.clip(output_b, a_min=0, a_max=1)

                    id = num_batch_train * (epoch - 1) + batch

                    plt.imsave(os.path.join(result_dir_train, 'png', 'a2b', '%04d_input_a.png' % id),
                               input_a[0], cmap=cmap)
                    plt.imsave(os.path.join(result_dir_train, 'png', 'a2b', '%04d_output_b.png' % id),
                               output_b[0], cmap=cmap)
                    plt.imsave(os.path.join(result_dir_train, 'png', 'b2a', '%04d_input_b.png' % id),
                               input_b[0], cmap=cmap)
                    plt.imsave(os.path.join(result_dir_train, 'png', 'b2a', '%04d_output_a.png' % id),
                               output_a[0], cmap=cmap)

                    writer_train.add_image('input_a', input_a, id, dataformats='NHWC')
                    writer_train.add_image('input_b', input_b, id, dataformats='NHWC')
                    writer_train.add_image('output_a', output_a, id, dataformats='NHWC')
                    writer_train.add_image('output_b', output_b, id, dataformats='NHWC')

            writer_train.add_scalar('loss_G_a2b', np.mean(loss_G_a2b_train), epoch)
            writer_train.add_scalar('loss_G_b2a', np.mean(loss_G_b2a_train), epoch)
            writer_train.add_scalar('loss_D_a', np.mean(loss_D_a_train), epoch)
            writer_train.add_scalar('loss_D_b', np.mean(loss_D_b_train), epoch)
            writer_train.add_scalar('loss_cycle_a', np.mean(loss_cycle_a_train), epoch)
            writer_train.add_scalar('loss_cycle_b', np.mean(loss_cycle_b_train), epoch)
            writer_train.add_scalar('loss_ident_a', np.mean(loss_ident_a_train), epoch)
            writer_train.add_scalar('loss_ident_b', np.mean(loss_ident_b_train), epoch)

            if epoch % 2 == 0 or epoch == num_epoch:
                save(ckpt_dir=ckpt_dir, epoch=epoch,
                     netG_a2b=netG_a2b, netG_b2a=netG_b2a,
                     netD_a=netD_a, netD_b=netD_b,
                     optimG=optimG, optimD=optimD)

        writer_train.close()
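# set_requires_grad is used above to freeze the discriminators while the
# generators update, but it is not defined in this excerpt. A minimal sketch
# matching how it is called here -- modeled on the common CycleGAN utility;
# this version is an assumption, not necessarily this repo's code:
def set_requires_grad(nets, requires_grad=False):
    # freeze/unfreeze all parameters of one or more networks
    if not isinstance(nets, list):
        nets = [nets]
    for net in nets:
        if net is not None:
            for param in net.parameters():
                param.requires_grad = requires_grad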
D_net.load_state_dict(checkpoint['D_state_dict'])

# update the step_idx
if LOAD_NET:
    step_idx = common.find_stepidx(load_fileName, "-", r"\.")
else:
    step_idx = 0

# create the target net (stable)
tgt_D_net = common.TargetNet(D_net)

# define the net_processor
net_processor = common.GANPreprocessor(G_net, D_net, tgt_D_net.target_model)

# define the writer
writer = SummaryWriter(log_dir="../runs/GAN/" + dt_string, comment="GAN_stock_trading")

with common.gan_lossTracker(writer, stop_loss=np.inf, mean_size=1000) as loss_tracker:
    while True:
        step_idx += 1
        net_processor.train_mode(batch_size=BATCH_SIZE)

        # generate the training set
        X_v, K_v, x_v, k_v = train_container.generate_batch(BATCH_SIZE)
        input_real = data.D_preprocess(X_v, K_v, x_v, k_v)

        # train D by input_real
        D_W = D_net(input_real)

        # train D for input_fake
        optimizerD.zero_grad()
        x_v_, k_v_ = G_net(X_v, K_v)
# Model
load_model = False
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 1
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter("runs/Loss_plot")
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size,
                      hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(
    input_size_decoder,
batch_size = 32

# Model hyperparameters
src_vocab_size = len(lithuanian.vocab)
trg_vocab_size = len(english.vocab)
embedding_size = 256
num_heads = 8
num_encoder_layers = 3
num_decoder_layers = 3
dropout = 0.10
max_len = 100
forward_expansion = 4
src_pad_idx = english.vocab.stoi["<pad>"]

# Tensorboard to get nice loss plot
writer = SummaryWriter("runs/loss_plot")
step = 0
step_valid = 0

# build into torchtext iterators
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=batch_size,
    sort_within_batch=True,
    sort_key=lambda x: len(x.src),
    device=device,
)

# Initiate the transformer model
model = Transformer(
    embedding_size,
"WARNING: You have a CUDA device, so you should probably run with using cuda" ) is_data_parallel = False if isinstance(config['cuda']['gpu_id'], list): is_data_parallel = True cuda_str = 'cuda:' + str(config['cuda']['gpu_id'][0]) elif isinstance(config['cuda']['gpu_id'], int): cuda_str = 'cuda:' + str(config['cuda']['gpu_id']) else: raise ValueError('Check out gpu id in config') device = torch.device(cuda_str if config['cuda']['using_cuda'] else "cpu") # tensorboard summary_writer = SummaryWriter(os.path.join(config['model']['exp_path'], 'log')) # Data target_classes = utils.read_txt(config['params']['classes']) num_classes = len(target_classes) img_size = config['params']['image_size'].split('x') img_size = (int(img_size[0]), int(img_size[1])) print('==> Preparing data..') bbox_params = A.BboxParams(format='pascal_voc', min_visibility=0.3) train_transforms = A.Compose( [ A.Resize(height=img_size[0], width=img_size[1], p=1.0), A.HorizontalFlip(p=0.5), # A.OneOf([ # A.Sequential([
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained, int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for e in train_iterator:
        if args.local_rank != -1:
            train_dataloader.sampler.set_epoch(e)
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart"]:
                del inputs["token_type_ids"]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update({
                        "langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
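# The t_total computed in train() above drives the linear warmup schedule:
# optimizer steps per epoch are len(dataloader) // gradient_accumulation_steps,
# multiplied by num_train_epochs. A tiny worked check with illustrative numbers:
len_dataloader = 1000                 # batches per epoch (illustrative)
gradient_accumulation_steps = 4
num_train_epochs = 3

steps_per_epoch = len_dataloader // gradient_accumulation_steps   # 250 optimizer steps
t_total = steps_per_epoch * num_train_epochs                      # 750
assert t_total == 750
# with --max_steps set instead, num_train_epochs is derived as
# max_steps // steps_per_epoch + 1, mirroring the branch above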
def _set_writer(self, **kwargs):
    if not kwargs:
        kwargs = {'log_dir': os.path.join(self.workdir, 'tensorboard')}
    self.summary_writer_path = kwargs['log_dir']
    self.writer = SummaryWriter(**kwargs)
storage_client = storage.Client()
bucket_name = opt.job_dir[:-1] if opt.job_dir.endswith('/') else opt.job_dir
bucket_name = bucket_name.replace('gs://', '')
bucket = storage_client.bucket(bucket_name)

def upload_file(src: str):
    blob = bucket.blob(os.path.basename(src))
    blob.upload_from_filename(src)

# ------------
# Tensorboard configuration
# ------------
writer = SummaryWriter(opt.job_dir)

# ----------
# Training
# ----------
def sample_image(writer: SummaryWriter, samples_per_class: int, iterations: int):
    z = torch.randn(samples_per_class * n_classes, opt.latent_dim).to(device)
    labels = np.array([num for num in range(samples_per_class) for _ in range(n_classes)])
    labels = torch.Tensor(labels).to(device)
    gen_imgs = generator(z, labels)
    writer.add_images('gan_grid', gen_imgs, iterations)
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
"batch_size": BATCH_SIZE, "layer_size": LAYER_SIZE, "nStep": nstep, "gamma": GAMMA, "tau": TAU, "learningRate": LR, "epsilon": EPS, "updateEvery": UPDATE_EVERY, "nUpdate": NUPDATES } np.random.seed(seed) env = BipedalWalker() now = datetime.now() writer = SummaryWriter('logdir/' + now.strftime("%Y%m%d-%H%M%S") + "/") writer.add_hparams(paramDict, {}) env.seed(seed) action_size = env.action_space.shape[0] state_size = env.observation_space.shape[0] agent = DQN_Agent(state_size=state_size, action_size=action_size, layer_size=LAYER_SIZE, BATCH_SIZE=BATCH_SIZE, BUFFER_SIZE=BUFFER_SIZE, PER=per, LR=LR, EPS=EPS, Nstep=nstep,
    # tail of the train_params dict (its opening lines precede this excerpt)
    'pin_memory': True
}
val_params = {
    'batch_size': args.batch,
    'shuffle': False,
    'num_workers': 20,
    'pin_memory': True
}
num_epochs = args.epochs
save_path = os.path.join(args.save_root, args.expt)
utils.create_dirs(save_path)

## tensorboard summary logger
writer = SummaryWriter(log_dir=os.path.join(save_path, 'logs'))

## configure runtime logging
logging.basicConfig(level=logging.INFO,
                    filename=os.path.join(save_path, 'logs', 'logfile.log'),
                    format='%(asctime)s - %(message)s', filemode='w')
# logger = logging.getLogger()  # .setLevel(logging.INFO)
console = logging.StreamHandler()
console.setLevel(logging.INFO)
console.setFormatter(logging.Formatter('%(asctime)s - %(message)s'))
logging.getLogger('').addHandler(console)
logging.info(args)

## dataloaders using hdf5 file
data_path = None
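# A hedged sketch of how the parameter dicts above are meant to be used
# (`train_data`/`val_data` stand for the HDF5-backed Dataset objects this
# script builds from data_path below; the names are assumptions):
#
#   train_loader = DataLoader(train_data, **train_params)
#   val_loader = DataLoader(val_data, **val_params)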
def main():
    #### options
    parser = argparse.ArgumentParser()
    parser.add_argument('-opt', type=str, help='Path to option YAML file.')
    parser.add_argument('--launcher', choices=['none', 'pytorch'],
                        default='none', help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    opt = option.parse(args.opt, is_train=True)

    #### distributed training settings
    if args.launcher == 'none':  # disabled distributed training
        opt['dist'] = False
        rank = -1
        print('Disabled distributed training.')
    else:
        opt['dist'] = True
        init_dist()
        world_size = torch.distributed.get_world_size()
        rank = torch.distributed.get_rank()

    #### loading resume state if exists
    if opt['path'].get('resume_state', None):
        # distributed resuming: all load into default GPU
        device_id = torch.cuda.current_device()
        resume_state = torch.load(
            opt['path']['resume_state'],
            map_location=lambda storage, loc: storage.cuda(device_id))
        option.check_resume(opt, resume_state['iter'])  # check resume options
    else:
        resume_state = None

    #### mkdir and loggers
    if rank <= 0:  # normal training (rank -1) OR distributed training (rank 0)
        if resume_state is None:
            util.mkdir_and_rename(
                opt['path']['experiments_root'])  # rename experiment folder if exists
            util.mkdirs((path for key, path in opt['path'].items()
                         if not key == 'experiments_root'
                         and 'pretrain_model' not in key and 'resume' not in key))

        # config loggers. Before it, the log will not work
        util.setup_logger('base', opt['path']['log'], 'train_' + opt['name'],
                          level=logging.INFO, screen=True, tofile=True)
        logger = logging.getLogger('base')
        logger.info(option.dict2str(opt))
        # tensorboard logger
        if opt['use_tb_logger'] and 'debug' not in opt['name']:
            version = float(torch.__version__[0:3])
            if version >= 1.1:  # PyTorch 1.1
                from torch.utils.tensorboard import SummaryWriter
            else:
                logger.info('You are using PyTorch {}. '
                            'Tensorboard will use [tensorboardX]'.format(version))
                from tensorboardX import SummaryWriter
            tb_logger = SummaryWriter(log_dir='../tb_logger/' + opt['name'])
    else:
        util.setup_logger('base', opt['path']['log'], 'train',
                          level=logging.INFO, screen=True)
        logger = logging.getLogger('base')

    # convert to NoneDict, which returns None for missing keys
    opt = option.dict_to_nonedict(opt)

    #### random seed
    seed = opt['train']['manual_seed']
    if seed is None:
        seed = random.randint(1, 10000)
    if rank <= 0:
        logger.info('Random seed: {}'.format(seed))
    util.set_random_seed(seed)

    torch.backends.cudnn.benchmark = True
    # torch.backends.cudnn.deterministic = True

    #### create train and val dataloader
    dataset_ratio = 200  # enlarge the size of each epoch
    for phase, dataset_opt in opt['datasets'].items():
        if phase == 'train':
            train_set = create_dataset(dataset_opt)
            train_size = int(math.ceil(len(train_set) / dataset_opt['batch_size']))
            total_iters = int(opt['train']['niter'])
            total_epochs = int(math.ceil(total_iters / train_size))
            if dataset_opt['use_text']:
                from lib.datasets.dataset import LmdbDataset as genTextDataset
                print("Start loading the IIIT5K dataset!")
                text_dataset = genTextDataset(dataset_opt['dataset_dir'],
                                              dataset_opt['voc_type'],
                                              dataset_opt['max_len'],
                                              dataset_opt['num_sample'])
                print("Loaded the IIIT5K dataset successfully!")
                # imgH/imgW/keep_ratio are assumed to be set from the options
                # earlier in the original script.
                text_dataloader = DataLoader(
                    text_dataset,
                    batch_size=dataset_opt['batch_size'],
                    num_workers=dataset_opt['n_workers'],  # was `number_workers`, not a valid DataLoader argument
                    shuffle=True,
                    pin_memory=True,
                    drop_last=True,
                    collate_fn=AlignCollate(imgH=height, imgW=width, keep_ratio=keep_ratio))
            if opt['dist']:
                train_sampler = DistIterSampler(train_set, world_size, rank, dataset_ratio)
                total_epochs = int(math.ceil(total_iters / (train_size * dataset_ratio)))
            else:
                train_sampler = None
            train_loader = create_dataloader(train_set, dataset_opt, opt, train_sampler)
            if rank <= 0:
                logger.info('Number of train images: {:,d}, iters: {:,d}'.format(
                    len(train_set), train_size))
                logger.info('Total epochs needed: {:d} for iters {:,d}'.format(
                    total_epochs, total_iters))
        elif phase == 'val':
            val_set = create_dataset(dataset_opt)
            val_loader = create_dataloader(val_set, dataset_opt, opt, None)
            if rank <= 0:
                logger.info('Number of val images in [{:s}]: {:d}'.format(
                    dataset_opt['name'], len(val_set)))
        else:
            raise NotImplementedError('Phase [{:s}] is not recognized.'.format(phase))
    assert train_loader is not None
    assert len(train_loader) == len(text_dataloader), \
        "text gt size should be the same as the training dataset"

    #### create model
    model = create_model(opt, text_dataset)

    #### resume training
    if resume_state:
        logger.info('Resuming training from epoch: {}, iter: {}.'.format(
            resume_state['epoch'], resume_state['iter']))
        start_epoch = resume_state['epoch']
        current_step = resume_state['iter']
        model.resume_training(resume_state)  # handle optimizers and schedulers
    else:
        current_step = 0
        start_epoch = 0

    #### training
    logger.info('Start training from epoch: {:d}, iter: {:d}'.format(
        start_epoch, current_step))
    for epoch in range(start_epoch, total_epochs + 1):
        if opt['dist']:
            train_sampler.set_epoch(epoch)
        for _, (train_data, input_dict) in enumerate(zip(train_loader, text_dataloader)):
            current_step += 1
            if current_step > total_iters:
                break
            #### update learning rate
            model.update_learning_rate(current_step, warmup_iter=opt['train']['warmup_iter'])

            #### training
            model.feed_data(train_data)
            model.optimize_parameters(current_step)

            #### log
            if current_step % opt['logger']['print_freq'] == 0:
                logs = model.get_current_log()
                message = '[epoch:{:3d}, iter:{:8,d}, lr:('.format(epoch, current_step)
                for v in model.get_current_learning_rate():
                    message += '{:.3e},'.format(v)
                message += ')] '
                for k, v in logs.items():
                    message += '{:s}: {:.4e} '.format(k, v)
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        if rank <= 0:
                            tb_logger.add_scalar(k, v, current_step)
                if rank <= 0:
                    logger.info(message)
            #### validation
            if opt['datasets'].get('val', None) and current_step % opt['train']['val_freq'] == 0:
                if opt['model'] in ['sr', 'srgan'] and rank <= 0:  # image restoration validation
                    # does not support multi-GPU validation
                    pbar = util.ProgressBar(len(val_loader))
                    avg_psnr = 0.
                    idx = 0
                    for val_data in val_loader:
                        idx += 1
                        img_name = os.path.splitext(os.path.basename(val_data['LQ_path'][0]))[0]
                        img_dir = os.path.join(opt['path']['val_images'], img_name)
                        util.mkdir(img_dir)

                        model.feed_data(val_data)
                        model.test()

                        visuals = model.get_current_visuals()
                        sr_img = util.tensor2img(visuals['rlt'])  # uint8
                        gt_img = util.tensor2img(visuals['GT'])  # uint8

                        # Save SR images for reference
                        save_img_path = os.path.join(
                            img_dir, '{:s}_{:d}.png'.format(img_name, current_step))
                        util.save_img(sr_img, save_img_path)

                        # calculate PSNR
                        sr_img, gt_img = util.crop_border([sr_img, gt_img], opt['scale'])
                        avg_psnr += util.calculate_psnr(sr_img, gt_img)
                        pbar.update('Test {}'.format(img_name))

                    avg_psnr = avg_psnr / idx

                    # log
                    logger.info('# Validation # PSNR: {:.4e}'.format(avg_psnr))
                    # tensorboard logger
                    if opt['use_tb_logger'] and 'debug' not in opt['name']:
                        tb_logger.add_scalar('psnr', avg_psnr, current_step)
                else:  # video restoration validation
                    if opt['dist']:
                        # multi-GPU testing
                        psnr_rlt = {}  # with border and center frames
                        if rank == 0:
                            pbar = util.ProgressBar(len(val_set))
                        for idx in range(rank, len(val_set), world_size):
                            val_data = val_set[idx]
                            val_data['LQs'].unsqueeze_(0)
                            val_data['GT'].unsqueeze_(0)
                            folder = val_data['folder']
                            idx_d, max_idx = val_data['idx'].split('/')
                            idx_d, max_idx = int(idx_d), int(max_idx)
                            if psnr_rlt.get(folder, None) is None:
                                psnr_rlt[folder] = torch.zeros(max_idx, dtype=torch.float32,
                                                               device='cuda')
                            model.feed_data(val_data)
                            model.test()
                            visuals = model.get_current_visuals()
                            rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                            gt_img = util.tensor2img(visuals['GT'])  # uint8
                            # calculate PSNR
                            psnr_rlt[folder][idx_d] = util.calculate_psnr(rlt_img, gt_img)

                            if rank == 0:
                                for _ in range(world_size):
                                    pbar.update('Test {} - {}/{}'.format(folder, idx_d, max_idx))
                        # collect data
                        for _, v in psnr_rlt.items():
                            dist.reduce(v, 0)
                        dist.barrier()

                        if rank == 0:
                            psnr_rlt_avg = {}
                            psnr_total_avg = 0.
                            for k, v in psnr_rlt.items():
                                psnr_rlt_avg[k] = torch.mean(v).cpu().item()
                                psnr_total_avg += psnr_rlt_avg[k]
                            psnr_total_avg /= len(psnr_rlt)
                            log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg)
                            for k, v in psnr_rlt_avg.items():
                                log_s += ' {}: {:.4e}'.format(k, v)
                            logger.info(log_s)
                            if opt['use_tb_logger'] and 'debug' not in opt['name']:
                                tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step)
                                for k, v in psnr_rlt_avg.items():
                                    tb_logger.add_scalar(k, v, current_step)
                    else:
                        pbar = util.ProgressBar(len(val_loader))
                        psnr_rlt = {}  # with border and center frames
                        psnr_rlt_avg = {}
                        psnr_total_avg = 0.
                        for val_data in val_loader:
                            folder = val_data['folder'][0]
                            idx_d = val_data['idx'].item()
                            # border = val_data['border'].item()
                            if psnr_rlt.get(folder, None) is None:
                                psnr_rlt[folder] = []

                            model.feed_data(val_data)
                            model.test()
                            visuals = model.get_current_visuals()
                            rlt_img = util.tensor2img(visuals['rlt'])  # uint8
                            gt_img = util.tensor2img(visuals['GT'])  # uint8

                            # calculate PSNR
                            psnr = util.calculate_psnr(rlt_img, gt_img)
                            psnr_rlt[folder].append(psnr)
                            pbar.update('Test {} - {}'.format(folder, idx_d))
                        for k, v in psnr_rlt.items():
                            psnr_rlt_avg[k] = sum(v) / len(v)
                            psnr_total_avg += psnr_rlt_avg[k]
                        psnr_total_avg /= len(psnr_rlt)
                        log_s = '# Validation # PSNR: {:.4e}:'.format(psnr_total_avg)
                        for k, v in psnr_rlt_avg.items():
                            log_s += ' {}: {:.4e}'.format(k, v)
                        logger.info(log_s)
                        if opt['use_tb_logger'] and 'debug' not in opt['name']:
                            tb_logger.add_scalar('psnr_avg', psnr_total_avg, current_step)
                            for k, v in psnr_rlt_avg.items():
                                tb_logger.add_scalar(k, v, current_step)

            #### save models and training states
            if current_step % opt['logger']['save_checkpoint_freq'] == 0:
                if rank <= 0:
                    logger.info('Saving models and training states.')
                    model.save(current_step)
                    model.save_training_state(epoch, current_step)

    if rank <= 0:
        logger.info('Saving the final model.')
        model.save('latest')
        logger.info('End of training.')
        tb_logger.close()
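# For reference, util.calculate_psnr above computes PSNR between two uint8
# images. A self-contained sketch of that computation (not the project's
# actual helper):
import numpy as np


def psnr_uint8(img1: np.ndarray, img2: np.ndarray) -> float:
    # MSE in float64 to avoid uint8 overflow, then
    # PSNR = 10 * log10(MAX^2 / MSE) with MAX = 255 for 8-bit images.
    mse = np.mean((img1.astype(np.float64) - img2.astype(np.float64)) ** 2)
    if mse == 0:
        return float('inf')
    return 10 * np.log10(255.0 ** 2 / mse)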
        else:
            self.early_stop_count += 1

    def evaluate(self):
        pass


def main():
    parser = argparse.ArgumentParser(description='Parse the config path')
    parser.add_argument(
        "-c", "--config", dest="path", default='./configs/train.json',
        help='The path to the config file. e.g. python train.py --config configs/dc_config.json')
    config = parser.parse_args()
    with open(config.path) as f:
        args = json.load(f)
    args = AttrDict(args)
    t = trainer(args)
    t.run()


if __name__ == "__main__":
    writer = SummaryWriter("./tensorboard/speakerbeam")
    main()
    writer.close()