def experiments(train_loader, test_loader, norm_type, l1_factor, l2_factor, dropout, epochs):
    """Train and evaluate a model for `epochs` epochs with a OneCycleLR schedule.

    Args:
        train_loader / test_loader: data loaders for the two splits.
        norm_type: normalization variant forwarded to m.Net.
        l1_factor: L1 regularization weight forwarded to the train step.
        l2_factor: L2 weight decay passed to the SGD optimizer.
        dropout: dropout rate forwarded to m.Net.
        epochs: number of training epochs.

    Returns:
        ((train_accuracy, train_losses, test_accuracy, test_losses), model)
        where the four lists are filled in-place by trn.train / tst.test.
    """
    train_losses = []
    test_losses = []
    train_accuracy = []
    test_accuracy = []
    # Run on GPU when available.
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    model = m.Net(norm_type, dropout).to(device)
    optimizer = optim.SGD(model.parameters(), lr=0.015, momentum=0.7,
                          weight_decay=l2_factor)
    # One-cycle LR schedule; presumably stepped per batch inside trn.train
    # (the scheduler is handed to it) — TODO confirm.
    scheduler = OneCycleLR(optimizer, max_lr=0.015, epochs=epochs,
                           steps_per_epoch=len(train_loader))
    # FIX: removed the redundant `epochs = epochs` self-assignment.
    for epoch in range(1, epochs + 1):
        print(f'Epoch {epoch}:')
        trn.train(model, device, train_loader, optimizer, epoch,
                  train_accuracy, train_losses, l1_factor, scheduler)
        tst.test(model, device, test_loader, test_accuracy, test_losses)
    return (train_accuracy, train_losses, test_accuracy, test_losses), model
def main(opt):
    """Build bAbI train/test loaders, construct a GGNN, and run the epoch loop."""
    # True selects the training split, False the test split of the question.
    train_dataset = bAbIDataset(opt.dataroot, opt.question_id, True)
    test_dataset = bAbIDataset(opt.dataroot, opt.question_id, False)
    train_dataloader = bAbIDataloader(train_dataset,
                                      batch_size=opt.batchSize,
                                      shuffle=True,
                                      num_workers=2)
    test_dataloader = bAbIDataloader(test_dataset,
                                     batch_size=opt.batchSize,
                                     shuffle=False,
                                     num_workers=2)

    # Graph shape info the model needs, taken from the training data.
    opt.annotation_dim = 1  # for bAbI
    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = GGNN(opt)
    print(net)

    criterion = nn.CrossEntropyLoss()
    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)

    for epoch in range(opt.niter):
        train(epoch, train_dataloader, net, criterion, optimizer, opt)
        test(test_dataloader, net, criterion, optimizer, opt)
def main():
    """Build the experiment directory tree, then run training or testing per FLAGS.

    FIX: converted Python-2 `print` statements to function calls — the rest of
    this file uses f-strings and other py3-only syntax, so the old form was a
    SyntaxError under the interpreter the file actually targets.
    """
    exp_path = CHK_DIR + FLAGS.name
    build_dir_tree(exp_path)
    if FLAGS.train:
        print('Starting training...')
        train()
    else:
        print('Starting testing...')
        test()
def main(_):
    """Select the Atari level set and action set from FLAGS, then train or test."""
    tf.logging.set_verbosity(tf.logging.INFO)
    # Default: the full 18-action Atari action space.
    action_set = list(range(18))
    mode, multi = FLAGS.mode, FLAGS.multi_task
    if multi == 1 and mode == 'train':
        level_names = atari_utils.ATARI_GAMES.keys()
    elif multi == 1 and mode == 'test':
        level_names = atari_utils.ATARI_GAMES.values()
    else:
        # Single-game run: restrict the action set to the chosen level.
        level_names = [FLAGS.level_name]
        action_set = atari_env.get_action_set(FLAGS.level_name)
    (train if mode == 'train' else test)(action_set, level_names)
def main(opt):
    """Train/validate/test the ST-GGNN on the BA dataset, then run inference.

    Writes per-epoch losses to OutputDir/loss.csv and reloads the
    early-stopping checkpoint before final inference over all samples.
    """
    def _loader(ds, shuffle, drop_last):
        # All loaders share the batch size and worker count from opt.
        return BADataloader(ds, batch_size=opt.batchSize, shuffle=shuffle,
                            num_workers=opt.workers, drop_last=drop_last)

    # Split flags are (train, valid, test); all-False selects every sample.
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = _loader(train_dataset, True, True)
    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = _loader(valid_dataset, True, True)
    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = _loader(test_dataset, True, True)
    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = _loader(all_dataset, False, False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    net = STGGNN(opt, kernel_size=2, n_blocks=1,
                 state_dim_bottleneck=opt.state_dim,
                 annotation_dim_bottleneck=opt.annotation_dim)
    net.double()
    print(net)

    criterion = nn.BCELoss()
    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    os.makedirs(OutputDir, exist_ok=True)

    history = {'train_loss': [], 'valid_loss': [], 'test_loss': []}
    for epoch in range(opt.niter):
        history['train_loss'].append(
            train(epoch, train_dataloader, net, criterion, optimizer, opt))
        current_valid = valid(valid_dataloader, net, criterion, opt)
        history['valid_loss'].append(current_valid)
        history['test_loss'].append(test(test_dataloader, net, criterion, opt))

        # Early stopping monitors validation loss and checkpoints into OutputDir.
        early_stopping(current_valid, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    epochs_done = len(history['train_loss'])
    pd.DataFrame({'epoch': list(range(1, epochs_done + 1)), **history}).to_csv(
        OutputDir + '/loss.csv', index=False)

    net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    inference(all_dataloader, net, criterion, opt, OutputDir)
def main(opt):
    """Train/validate/test an EGCN on the BA dataset, then run inference.

    Side effects: writes per-epoch losses to ``OutputDir/loss.csv``; relies on
    EarlyStopping to checkpoint into ``OutputDir``.
    """
    # Dataset split flags are (train, valid, test); all-False selects every sample.
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = BADataloader(train_dataset, batch_size=opt.batchSize, \
        shuffle=True, num_workers=opt.workers, drop_last=True)

    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = BADataloader(valid_dataset, batch_size=opt.batchSize, \
        shuffle=True, num_workers=opt.workers, drop_last=True)

    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = BADataloader(test_dataset, batch_size=opt.batchSize, \
        shuffle=True, num_workers=opt.workers, drop_last=True)

    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = BADataloader(all_dataset, batch_size=opt.batchSize, \
        shuffle=False, num_workers=opt.workers, drop_last=False)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    # NOTE(review): `gcn_args` comes from module scope, not from `opt` — confirm
    # it is configured consistently with opt before this call.
    net = EGCN(gcn_args, activation = torch.nn.RReLU(), device = opt.device)
    print(net)

    criterion = nn.MSELoss()
    #criterion = nn.CosineSimilarity(dim=-1, eps=1e-6)

    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)

    os.makedirs(OutputDir, exist_ok=True)
    train_loss_ls = []
    valid_loss_ls = []
    test_loss_ls = []
    for epoch in range(0, opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer, opt)
        valid_loss = valid(valid_dataloader, net, criterion, opt)
        test_loss = test(test_dataloader, net, criterion, opt)

        train_loss_ls.append(train_loss)
        valid_loss_ls.append(valid_loss)
        test_loss_ls.append(test_loss)

        # Early stopping monitors validation loss and checkpoints into OutputDir.
        early_stopping(valid_loss, net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    df = pd.DataFrame({'epoch':[i for i in range(1, len(train_loss_ls)+1)],
                       'train_loss': train_loss_ls,
                       'valid_loss': valid_loss_ls,
                       'test_loss': test_loss_ls})
    df.to_csv(OutputDir + '/loss.csv', index=False)

    #net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    # NOTE(review): this rebinds `net` to whatever torch.load returns.  If
    # EarlyStopping saved a state_dict (as the commented-out line suggests),
    # the result is an OrderedDict, not a model — confirm what checkpoint.pt
    # actually contains.
    net = torch.load(OutputDir + '/checkpoint.pt')
    inference(all_dataloader, net, criterion, opt, OutputDir)
def run_experiment(self):
    """Run a full train/test experiment driven by self.config.

    Reads model/training hyper-parameters from the config, builds the model,
    optimizer and OneCycleLR scheduler, then alternates trn.train / tst.test
    for the configured number of epochs.

    Returns:
        ((train_accuracy, train_losses, test_accuracy, test_losses), model)
        where the lists are populated in-place by the train/test helpers.
    """
    dropout = self.config['model_params']['dropout']
    epochs = self.config['training_params']['epochs']
    l2_factor = self.config['training_params']['l2_factor']
    l1_factor = self.config['training_params']['l1_factor']
    # BUG FIX: the original used `F.nll_loss()`, which invokes the functional
    # immediately with no tensors and raises TypeError.  Pass the callable
    # itself; like the nn.CrossEntropyLoss instance, it is invoked later with
    # (output, target) by trn.train / tst.test.
    criterion = nn.CrossEntropyLoss() if self.config[
        'criterion'] == 'CrossEntropyLoss' else F.nll_loss
    opt_func = optim.Adam if self.config['optimizer'][
        'type'] == 'optim.Adam' else optim.SGD
    lr = self.config['optimizer']['args']['lr']
    grad_clip = 0.1  # fixed gradient-clipping threshold handed to trn.train
    train_losses = []
    test_losses = []
    train_accuracy = []
    test_accuracy = []
    lrs = []  # presumably per-step LRs collected by trn.train — TODO confirm
    #device = self.set_device()
    model = m.Net(dropout).to(self.device)
    # optimizer = optim.SGD(model.parameters(), lr=0.02, momentum=0.7,weight_decay=l2_factor)
    optimizer = opt_func(model.parameters(), lr=lr, weight_decay=l2_factor)
    #scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True,mode='max')
    scheduler = OneCycleLR(optimizer,
                           max_lr=lr,
                           epochs=epochs,
                           steps_per_epoch=len(self.dataset.train_loader))
    for epoch in range(1, epochs + 1):
        print(f'Epoch {epoch}:')
        trn.train(model, self.device, self.dataset.train_loader, optimizer,
                  epoch, train_accuracy, train_losses, l1_factor, scheduler,
                  criterion, lrs, grad_clip)
        tst.test(model, self.device, self.dataset.test_loader, test_accuracy,
                 test_losses, criterion)
    # if epoch > 20:
    #     scheduler.step(test_accuracy[-1])
    return (train_accuracy, train_losses, test_accuracy, test_losses), model
def testing(lane_agent, test_image, step, loss):
    """Run the lane agent on one image in eval mode and save the rendered result.

    The agent is switched back to training mode before returning.
    """
    lane_agent.evaluate_mode()
    batch = np.array([test_image])
    _, _, rendered = test.test(lane_agent, batch)
    out_path = f'test_result/result_{step}_{loss}.png'
    cv2.imwrite(out_path, rendered[0])
    lane_agent.training_mode()
def main(opt):
    """Train a GGNN on the ABox dataset with accuracy-based early stopping.

    Saves the whole model whenever test accuracy improves and stops after 15
    consecutive epochs without matching the best accuracy.

    FIX: removed `acc_last_iter`, which was assigned once and never read.
    """
    train_dataset = ABoxDataset(opt.dataroot, True)
    train_dataloader = ABoxDataloader(train_dataset, batch_size=opt.batchSize, \
        shuffle=True, num_workers=opt.workers)
    opt.annotation_dim = train_dataset.annotation_dim
    # An example of accessing A using dataloader and dataset
    # for idx, (annotation, A, target, data_idx) in enumerate(train_dataloader):
    #     print('index', data_idx)
    #     A = [train_dataset.all_data[1][i] for i in data_idx]
    test_dataset = ABoxDataset(opt.dataroot, False)
    test_dataloader = ABoxDataloader(test_dataset, batch_size=opt.batchSize, \
        shuffle=False, num_workers=opt.workers)

    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    # times 2 because the graph is directed (one edge type per direction)
    net = GGNN(train_dataset.n_node, train_dataset.n_edge_types * 2, opt)
    net.double()

    criterion = nn.BCELoss()
    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)

    best_acc = 0.0   # best test accuracy achieved so far
    num_of_dec = 0   # consecutive epochs below best_acc (early-stop counter)
    for epoch in range(0, opt.niter):
        if num_of_dec >= 15:
            print("Early stop! The accuracy has been dropped for 15 iterations!")
            break
        train(epoch, train_dataloader, train_dataset, net, criterion,
              optimizer, opt)
        correct = test(test_dataloader, test_dataset, net, criterion, opt)
        acc = float(correct) / float(len(test_dataset))
        if acc > best_acc:
            best_acc = acc
            print("Best accuracy by far: ", best_acc)
            torch.save(net, './' + fileName + str(opt.n_steps) + '_model.pth')
        # After the update above, acc >= best_acc iff this epoch matched or
        # improved the best, so the counter also resets on ties (as before).
        if acc >= best_acc:
            num_of_dec = 0
        else:
            num_of_dec += 1
    print("The best accuracy achieved by far: ", best_acc)
def main(opt):
    """Train and test the FNN baseline, logging losses/gains and wall time to CSVs."""
    def _append_row(path, row):
        # Append one CSV row, keeping '\n' line endings on every platform.
        with open(path, 'a') as f:
            csv.writer(f, lineterminator='\n').writerow(row)

    train_dataset = Dataset(opt.dataroot, True)
    train_dataloader = Dataloader(train_dataset, batch_size=opt.batchSize, \
        shuffle=False, num_workers=2)
    test_dataset = Dataset(opt.dataroot, False)
    test_dataloader = Dataloader(test_dataset, batch_size=opt.batchSize, \
        shuffle=False, num_workers=2)

    net = FNN(d=opt.d, n=opt.n)
    net.double()
    print(net)

    criterion = nn.CosineSimilarity(dim=2)
    optimizer = optim.Adam(net.parameters(), lr=opt.lr)

    # CSV headers for the per-epoch logs appended by train()/test().
    _append_row('train.csv',
                ["train_loss", "train_gain", "baseline_loss", "baseline_gain"])
    _append_row('test.csv',
                ["test_loss", "test_gain", "baseline_loss", "baseline_gain"])

    start = time.time()
    for epoch in range(opt.niter):
        train(epoch, train_dataloader, net, criterion, optimizer, opt)
        test(test_dataloader, net, criterion, optimizer, opt)
    _append_row('time.csv', ["学習時間", time.time() - start])
def run(opt):
    """Run one bAbI experiment.

    Returns the final epoch's train loss, test loss, the accuracy fraction
    (numerator, denominator) and the elapsed wall time in seconds.
    """
    start_time = time.time()
    opt.dataroot = 'babi_data/%s/train/%d_graphs.txt' % (opt.processed_path,
                                                         opt.task_id)
    print(opt)

    train_dataset = bAbIDataset(opt.dataroot, opt.question_id, True,
                                opt.train_size)
    train_dataloader = bAbIDataloader(train_dataset,
                                      batch_size=opt.batchSize,
                                      shuffle=True,
                                      num_workers=2)
    test_dataset = bAbIDataset(opt.dataroot, opt.question_id, False,
                               opt.train_size)
    test_dataloader = bAbIDataloader(test_dataset,
                                     batch_size=opt.batchSize,
                                     shuffle=False,
                                     num_workers=2)

    opt.annotation_dim = 1  # for bAbI
    opt.n_edge_types = train_dataset.n_edge_types
    opt.n_node = train_dataset.n_node

    # Pick the network variant; both run in double precision.
    net = GGNN(opt) if opt.net == 'GGNN' else Graph_OurConvNet(opt)
    net.double()
    print(net)

    criterion = nn.CrossEntropyLoss()
    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)

    for epoch in range(opt.niter):
        train_loss = train(epoch, train_dataloader, net, criterion, optimizer,
                           opt)
        test_loss, numerator, denominator = test(test_dataloader, net,
                                                 criterion, optimizer, opt)
    return train_loss, test_loss, numerator, denominator, time.time() - start_time
def test_result():
    """Flask view that runs model evaluation on an uploaded image set.

    GET  -> renders the loading page, echoing back the request parameters.
    POST -> runs the test pipeline, computes metrics, registers the run under
            a fresh UUID in TESTS, and returns ``{'done': True, 'uid': ...}``.
    """
    if request.method == 'GET':
        dir_path = request.args.get('dir_path')
        dir_name = request.args.get('dir_name')
        jsonl = request.args.get('jsonl')
        return render_template('test_loading.html',
                               dir_name=dir_name,
                               dir_path=dir_path,
                               jsonl=jsonl)
    if request.method == 'POST':
        arguments = json.loads(request.data.decode("UTF-8"))
        dir_name, dir_path, jsonl = arguments.get('dir_name'), arguments.get(
            'dir_path'), arguments.get('jsonl')
        dir_name = dir_name.replace('.zip', '')
        data = test(root=dir_path, img_zip=dir_name, annotations=jsonl)
        output = get_formatted_data(
            **data,
            threshold=request.args.get('threshold', 0.5),
        )
        img_dir = os.path.join(dir_path, dir_name)
        root = os.path.dirname(__file__)
        # SECURITY(review): the command below is assembled from
        # request-supplied paths and executed via a shell with a HARD-CODED
        # root password — shell-injectable and a credential leak.  It should be
        # replaced with shutil.copytree / subprocess.run([...], shell=False)
        # and proper privilege handling.  Left functionally unchanged here,
        # flagged only.
        command = f"cp -r {os.path.join(os.path.dirname(root), img_dir)} {os.path.join(root, 'static')}"
        os.popen("sudo -S %s" % command, 'w').write('Root2018!\n')
        y_true, y_pred, pretty_y = get_formatted_test_data(
            pred=output, true_json='meme_data/dev.jsonl')
        metrics = get_metrics(y_pred, y_true)
        # BUG FIX: the original called os.listdir() on a path it had just
        # verified does NOT exist, which always raises FileNotFoundError.
        # Creating the directory is the evident intent.
        if not os.path.exists(os.path.join(root, 'static', dir_name)):
            os.makedirs(os.path.join(root, 'static', dir_name))
        uid = str(uuid.uuid4())
        TESTS[uid] = dict(dir_name=dir_name,
                          image_names=[],
                          predictions=pretty_y,
                          metrics=metrics)
        return {'done': True, 'uid': uid}
def main():
    """Entry point: parse the model choice, validate argument combinations, set
    up wandb/environments/PPO, run the training loop with periodic logging and
    checkpointing, and optionally run the final test phase.

    NOTE(review): relies on a module-level `device` — confirm it is defined at
    import time.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name',
                        type=str,
                        default='coinrun',
                        help='name of the environment to train on.')
    parser.add_argument(
        '--model',
        type=str,
        default='ppo',
        help='the model to use for training. {ppo, ibac, ibac_sni, dist_match}'
    )
    args, rest_args = parser.parse_known_args()
    env_name = args.env_name
    model = args.model

    # --- ARGUMENTS ---
    # Each model name has its own parser for the remaining CLI args.
    if model == 'ppo':
        args = args_ppo.get_args(rest_args)
    elif model == 'ibac':
        args = args_ibac.get_args(rest_args)
    elif model == 'ibac_sni':
        args = args_ibac_sni.get_args(rest_args)
    elif model == 'dist_match':
        args = args_dist_match.get_args(rest_args)
    else:
        raise NotImplementedError

    # place other args back into argparse.Namespace
    args.env_name = env_name
    args.model = model
    # Processes are split into train/val partitions only when num_val_envs > 0.
    args.num_train_envs = args.num_processes - args.num_val_envs if args.num_val_envs > 0 else args.num_processes

    # warnings
    # NOTE(review): the validation branches below are chained with `elif` off
    # the deterministic-execution branch, so they are skipped whenever
    # --deterministic_execution is set — confirm this is intended.
    if args.deterministic_execution:
        print('Envoking deterministic code execution.')
        if torch.backends.cudnn.enabled:
            warnings.warn('Running with deterministic CUDNN.')
        if args.num_processes > 1:
            raise RuntimeError(
                'If you want fully deterministic code, run it with num_processes=1.'
                'Warning: This will slow things down and might break A2C if '
                'policy_num_steps < env._max_episode_steps.')
    elif args.num_val_envs > 0 and (args.num_val_envs >= args.num_processes
                                    or not args.percentage_levels_train < 1.0):
        raise ValueError(
            'If --args.num_val_envs>0 then you must also have'
            '--num_val_envs < --num_processes and 0 < --percentage_levels_train < 1.'
        )
    elif args.num_val_envs > 0 and not args.use_dist_matching and args.dist_matching_coef != 0:
        raise ValueError(
            'If --num_val_envs>0 and --use_dist_matching=False then you must also have'
            '--dist_matching_coef=0.')
    elif args.use_dist_matching and not args.num_val_envs > 0:
        raise ValueError(
            'If --use_dist_matching=True then you must also have'
            '0 < --num_val_envs < --num_processes and 0 < --percentage_levels_train < 1.'
        )
    elif args.analyse_rep and not args.use_bottleneck:
        raise ValueError('If --analyse_rep=True then you must also have'
                         '--use_bottleneck=True.')

    # --- TRAINING ---

    print("Setting up wandb logging.")

    # Weights & Biases logger
    if args.run_name is None:
        # make run name as {env_name}_{TIME}
        now = datetime.datetime.now().strftime('_%d-%m_%H:%M:%S')
        args.run_name = args.env_name + '_' + args.algo + now
    # initialise wandb
    wandb.init(project=args.proj_name,
               name=args.run_name,
               group=args.group_name,
               config=args,
               monitor_gym=False)
    # save wandb dir path
    args.run_dir = wandb.run.dir
    # make directory for saving models
    save_dir = os.path.join(wandb.run.dir, 'models')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # set random seed of random, torch and numpy
    utl.set_global_seed(args.seed, args.deterministic_execution)

    # initialise environments for training
    print("Setting up Environments.")
    if args.num_val_envs > 0:
        # Carve the level range into train/val partitions.
        train_num_levels = int(args.train_num_levels *
                               args.percentage_levels_train)
        val_start_level = args.train_start_level + train_num_levels
        val_num_levels = args.train_num_levels - train_num_levels
        train_envs = make_vec_envs(env_name=args.env_name,
                                   start_level=args.train_start_level,
                                   num_levels=train_num_levels,
                                   distribution_mode=args.distribution_mode,
                                   paint_vel_info=args.paint_vel_info,
                                   num_processes=args.num_train_envs,
                                   num_frame_stack=args.num_frame_stack,
                                   device=device)
        val_envs = make_vec_envs(env_name=args.env_name,
                                 start_level=val_start_level,
                                 num_levels=val_num_levels,
                                 distribution_mode=args.distribution_mode,
                                 paint_vel_info=args.paint_vel_info,
                                 num_processes=args.num_val_envs,
                                 num_frame_stack=args.num_frame_stack,
                                 device=device)
    else:
        train_envs = make_vec_envs(env_name=args.env_name,
                                   start_level=args.train_start_level,
                                   num_levels=args.train_num_levels,
                                   distribution_mode=args.distribution_mode,
                                   paint_vel_info=args.paint_vel_info,
                                   num_processes=args.num_processes,
                                   num_frame_stack=args.num_frame_stack,
                                   device=device)

    # initialise environments for evaluation
    # (start_level=0, num_levels=0 evaluates over the unrestricted level set)
    eval_envs = make_vec_envs(env_name=args.env_name,
                              start_level=0,
                              num_levels=0,
                              distribution_mode=args.distribution_mode,
                              paint_vel_info=args.paint_vel_info,
                              num_processes=args.num_processes,
                              num_frame_stack=args.num_frame_stack,
                              device=device)
    _ = eval_envs.reset()

    # initialise environments for analysing the representation
    if args.analyse_rep:
        analyse_rep_train1_envs, analyse_rep_train2_envs, analyse_rep_val_envs, analyse_rep_test_envs = make_rep_analysis_envs(
            args, device)

    print("Setting up Actor-Critic model and Training algorithm.")

    # initialise policy network
    actor_critic = ACModel(obs_shape=train_envs.observation_space.shape,
                           action_space=train_envs.action_space,
                           hidden_size=args.hidden_size,
                           use_bottleneck=args.use_bottleneck,
                           sni_type=args.sni_type).to(device)

    # initialise policy training algorithm
    if args.algo == 'ppo':
        policy = PPO(actor_critic=actor_critic,
                     ppo_epoch=args.policy_ppo_epoch,
                     num_mini_batch=args.policy_num_mini_batch,
                     clip_param=args.policy_clip_param,
                     value_loss_coef=args.policy_value_loss_coef,
                     entropy_coef=args.policy_entropy_coef,
                     max_grad_norm=args.policy_max_grad_norm,
                     lr=args.policy_lr,
                     eps=args.policy_eps,
                     vib_coef=args.vib_coef,
                     sni_coef=args.sni_coef,
                     use_dist_matching=args.use_dist_matching,
                     dist_matching_loss=args.dist_matching_loss,
                     dist_matching_coef=args.dist_matching_coef,
                     num_train_envs=args.num_train_envs,
                     num_val_envs=args.num_val_envs)
    else:
        raise NotImplementedError

    # initialise rollout storage for the policy training algorithm
    rollouts = RolloutStorage(num_steps=args.policy_num_steps,
                              num_processes=args.num_processes,
                              obs_shape=train_envs.observation_space.shape,
                              action_space=train_envs.action_space)

    # count number of frames and updates
    frames = 0
    iter_idx = 0

    # update wandb args
    wandb.config.update(args)
    # wandb.watch(actor_critic, log="all") # to log gradients of actor-critic network

    update_start_time = time.time()
    # reset environments
    if args.num_val_envs > 0:
        obs = torch.cat([train_envs.reset(),
                         val_envs.reset()])  # obs.shape = (n_envs,C,H,W)
    else:
        obs = train_envs.reset()  # obs.shape = (n_envs,C,H,W)

    # insert initial observation to rollout storage
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # initialise buffer for calculating mean episodic returns
    train_episode_info_buf = deque(maxlen=10)
    val_episode_info_buf = deque(maxlen=10)

    # calculate number of updates
    # number of frames ÷ number of policy steps before update ÷ number of processes
    args.num_batch = args.num_processes * args.policy_num_steps
    args.num_updates = int(args.num_frames) // args.num_batch

    print("Training beginning.")
    print("Number of updates: ", args.num_updates)
    for iter_idx in range(args.num_updates):
        print("Iter: ", iter_idx)

        # put actor-critic into train mode
        actor_critic.train()

        # rollout policy to collect num_batch of experience and place in storage
        for step in range(args.policy_num_steps):

            # sample actions from policy
            with torch.no_grad():
                value, action, action_log_prob, _ = actor_critic.act(
                    rollouts.obs[step])

            # observe rewards and next obs
            if args.num_val_envs > 0:
                # Train/val envs each step with their own slice of the actions.
                obs, reward, done, infos = train_envs.step(
                    action[:args.num_train_envs, :])
                val_obs, val_reward, val_done, val_infos = val_envs.step(
                    action[args.num_train_envs:, :])
                obs = torch.cat([obs, val_obs])
                reward = torch.cat([reward, val_reward])
                done, val_done = list(done), list(val_done)
                done.extend(val_done)
                infos.extend(val_infos)
            else:
                obs, reward, done, infos = train_envs.step(action)

            # log episode info if episode finished
            for i, info in enumerate(infos):
                if i < args.num_train_envs and 'episode' in info.keys():
                    train_episode_info_buf.append(info['episode'])
                elif i >= args.num_train_envs and 'episode' in info.keys():
                    val_episode_info_buf.append(info['episode'])

            # create mask for episode ends
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done]).to(device)

            # add experience to storage
            rollouts.insert(obs, reward, action, value, action_log_prob,
                            masks)

            frames += args.num_processes

        # --- UPDATE ---

        # bootstrap next value prediction
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1]).detach()

        # compute returns for current rollouts
        rollouts.compute_returns(next_value, args.policy_gamma,
                                 args.policy_gae_lambda)

        # update actor-critic using policy gradient algo
        total_loss, value_loss, action_loss, dist_entropy, vib_kl, dist_matching_loss = policy.update(
            rollouts)

        # clean up storage after update
        rollouts.after_update()

        # --- LOGGING ---

        if iter_idx % args.log_interval == 0 or iter_idx == args.num_updates - 1:

            # --- EVALUATION ---
            eval_episode_info_buf = utl_eval.evaluate(
                eval_envs=eval_envs, actor_critic=actor_critic, device=device)

            # --- ANALYSE REPRESENTATION ---
            if args.analyse_rep:
                rep_measures = utl_rep.analyse_rep(
                    args=args,
                    train1_envs=analyse_rep_train1_envs,
                    train2_envs=analyse_rep_train2_envs,
                    val_envs=analyse_rep_val_envs,
                    test_envs=analyse_rep_test_envs,
                    actor_critic=actor_critic,
                    device=device)

            # get stats for run
            update_end_time = time.time()
            num_interval_updates = 1 if iter_idx == 0 else args.log_interval
            fps = num_interval_updates * (
                args.num_processes * args.policy_num_steps) / (
                    update_end_time - update_start_time)
            update_start_time = update_end_time
            # Calculates if value function is a good predicator of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = utl_math.explained_variance(utl.sf01(rollouts.value_preds),
                                             utl.sf01(rollouts.returns))

            wandb.log(
                {
                    'misc/timesteps':
                    frames,
                    'misc/fps':
                    fps,
                    'misc/explained_variance':
                    float(ev),
                    'losses/total_loss':
                    total_loss,
                    'losses/value_loss':
                    value_loss,
                    'losses/action_loss':
                    action_loss,
                    'losses/dist_entropy':
                    dist_entropy,
                    'train/mean_episodic_return':
                    utl_math.safe_mean([
                        episode_info['r']
                        for episode_info in train_episode_info_buf
                    ]),
                    'train/mean_episodic_length':
                    utl_math.safe_mean([
                        episode_info['l']
                        for episode_info in train_episode_info_buf
                    ]),
                    'eval/mean_episodic_return':
                    utl_math.safe_mean([
                        episode_info['r']
                        for episode_info in eval_episode_info_buf
                    ]),
                    'eval/mean_episodic_length':
                    utl_math.safe_mean([
                        episode_info['l']
                        for episode_info in eval_episode_info_buf
                    ])
                },
                step=iter_idx)
            if args.use_bottleneck:
                wandb.log({'losses/vib_kl': vib_kl}, step=iter_idx)
            if args.num_val_envs > 0:
                wandb.log(
                    {
                        'losses/dist_matching_loss':
                        dist_matching_loss,
                        'val/mean_episodic_return':
                        utl_math.safe_mean([
                            episode_info['r']
                            for episode_info in val_episode_info_buf
                        ]),
                        'val/mean_episodic_length':
                        utl_math.safe_mean([
                            episode_info['l']
                            for episode_info in val_episode_info_buf
                        ])
                    },
                    step=iter_idx)
            if args.analyse_rep:
                wandb.log(
                    {
                        "analysis/" + key: val
                        for key, val in rep_measures.items()
                    },
                    step=iter_idx)

        # --- SAVE MODEL ---

        # save for every interval-th episode or for the last epoch
        if iter_idx != 0 and (iter_idx % args.save_interval == 0
                              or iter_idx == args.num_updates - 1):
            print("Saving Actor-Critic Model.")
            torch.save(actor_critic.state_dict(),
                       os.path.join(save_dir, "policy{0}.pt".format(iter_idx)))

    # close envs
    train_envs.close()
    eval_envs.close()

    # --- TEST ---

    if args.test:
        print("Testing beginning.")
        episodic_return, latents_z = utl_test.test(args=args,
                                                   actor_critic=actor_critic,
                                                   device=device)

        # save returns from train and test levels to analyse using interactive mode
        train_levels = torch.arange(
            args.train_start_level,
            args.train_start_level + args.train_num_levels)
        for i, level in enumerate(train_levels):
            wandb.log({
                'test/train_levels': level,
                'test/train_returns': episodic_return[0][i]
            })
        test_levels = torch.arange(
            args.test_start_level,
            args.test_start_level + args.test_num_levels)
        for i, level in enumerate(test_levels):
            wandb.log({
                'test/test_levels': level,
                'test/test_returns': episodic_return[1][i]
            })

        # log returns from test envs
        wandb.run.summary["train_mean_episodic_return"] = utl_math.safe_mean(
            episodic_return[0])
        wandb.run.summary["test_mean_episodic_return"] = utl_math.safe_mean(
            episodic_return[1])

        # plot latent representation
        if args.plot_pca:
            print("Plotting PCA of Latent Representation.")
            utl_rep.pca(args, latents_z)
# Select the model class for the configured architecture; the lookup keeps
# instantiation lazy, so only the chosen model is ever constructed.
arch_to_class = {
    "encdec": EncDec,
    "hred": HRED,
    "vhred": VHRED,
    "vhcr": VHCR,
}
arch = hparams["model_arc"]
if arch not in arch_to_class:
    raise ValueError("Unknown model architecture!")
model = arch_to_class[arch](hparams, n_words, itfloss_weight,
                            fix_embedding).cuda()

# Restore weights when resuming from a checkpoint.
if checkpoint:
    model.load_state_dict(checkpoint["model"])
print("Model built and ready to go!")

if mode == "train":
    print("Training model...")
    run_epochs(hparams, model, dataset, valid_dataset, model_pre, valid_every,
               save_every, checkpoint, pretrained)
elif mode == "inference":
    print("Inference utterances...")
    inf_path = os.path.join(os.path.dirname(checkpoint_path),
                            "inf." + os.path.basename(checkpoint_path))
    test(hparams, model, dataset, inf_path)
elif mode == "chat":
    print("Chatting with bot...")
    chat(hparams, model, vocab)
else:
    raise ValueError("Unknown mode!")
print("Done")
def main(opt):
    """Train/validate/test the something-v2 model with best-checkpoint tracking.

    Creates seed-keyed log/checkpoint directories, builds the three data
    loaders, trains for opt.niter epochs with StepLR decay, keeps the best
    checkpoint by validation accuracy, and finally reports test accuracy at
    ten observation ratios.
    """
    # Directory layout: <resume>/<seed>/ for checkpoints, <logroot>/<seed>/ for logs.
    if not os.path.exists(opt.resume):
        os.makedirs(opt.resume)
    if not os.path.exists(opt.logroot):
        os.makedirs(opt.logroot)
    log_dir_name = str(opt.manualSeed) + '/'
    log_path = os.path.join(opt.logroot, log_dir_name)
    opt.resume = os.path.join(opt.resume, log_dir_name)
    if not os.path.exists(log_path):
        os.makedirs(log_path)
    #log_file_name = log_path + 'ucf_log_st.txt'
    #log_file_name = opt.logroot + 'ucf_log_st_'+str(opt.manualSeed)+'.txt'
    log_file_name = opt.logroot + 'something_log_v4.1_' + str(
        opt.manualSeed) + '.txt'
    with open(log_file_name, 'a+') as file:
        file.write('manualSeed is %d \n' % opt.manualSeed)
    paths = config.Paths()
    # NOTE(review): data-list paths are hard-coded to a single machine.
    train_datalist = "/home/mcislab/zhaojw/AAAI/sth_train_list.txt"
    val_datalist = "/home/mcislab/zhaojw/AAAI/sth_val_list.txt"
    test_datalist = "/home/mcislab/zhaojw/AAAI/sth_test_list.txt"
    #test_datalist = '/home/mcislab/wangruiqi/IJCV2019/data/newsomething-check.txt'
    #opt.resume = os.path.join(opt.resume,log_dir_name)
    train_dataset = dataset(train_datalist, paths.sthv2_final, opt)
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=opt.batch_size,
                                  shuffle=True,
                                  num_workers=opt.workers,
                                  drop_last=False)
    val_dataset = dataset(val_datalist, paths.sthv2_final, opt)
    val_dataloader = DataLoader(val_dataset,
                                batch_size=opt.batch_size,
                                shuffle=False,
                                num_workers=opt.workers,
                                drop_last=False)
    test_dataset = dataset(test_datalist, paths.sthv2_final, opt)
    test_dataloader = DataLoader(test_dataset,
                                 batch_size=opt.batch_size,
                                 shuffle=False,
                                 num_workers=opt.workers,
                                 drop_last=False)
    model = sthv2_model.Model(opt)
    '''
    if opt.show:
        show(model)
        exit()
    '''
    optimizer = optim.SGD(model.parameters(), lr=opt.lr, momentum=opt.momentum)
    # LR is multiplied by 0.9 every 100 epochs.
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,
                                                step_size=100,
                                                gamma=0.9)
    criterion1 = nn.CrossEntropyLoss()
    criterion2 = nn.NLLLoss()
    if opt.cuda:
        model.cuda()
        #criterion.cuda(opt.device_id)
        criterion1.cuda()
        criterion2.cuda()
    '''
    if opt.epoch != 0:
        if os.path.exists('./models/hmdb_split1/'+checkpoint_model_name):
            model.load_state_dict(torch.load('./models/hmdb_split1/' + checkpoint_model_name))
        else:
            print('model not found')
            exit()
    '''
    #Lin commented on Sept. 2nd
    #model.double()
    writer = SummaryWriter(log_dir=os.path.join(log_path, 'runs/'))

    # For training
    sum_test_acc = []
    best_acc = 0.
    #epoch_errors = list()
    # NOTE(review): avg_epoch_error / best_epoch_error are initialised here but
    # never updated in this function.
    avg_epoch_error = np.inf
    best_epoch_error = np.inf
    '''
    #haha, output Acc for each class
    test_load_dir = opt.resume
    #test_load_dir = '/home/mcislab/linhanxi/IJCV19_Experiments/sth_scale/something_scale5_M/ckpnothresh/ours'
    model.load_state_dict(torch.load(os.path.join(test_load_dir, 'model_best.pth'))['state_dict'])
    if opt.featdir:
        model.feat_mode()
    test_acc, output = test(0,test_dataloader, model, criterion1, criterion2, opt, writer, test_load_dir, is_test=True)
    exit()
    '''
    print("Test once to get a baseline.")
    loaded_checkpoint = utils.load_best_checkpoint(opt, model, optimizer)
    if loaded_checkpoint:
        opt, model, optimizer = loaded_checkpoint
    # Epoch tag 51 — presumably a sentinel marking the baseline run; confirm.
    test_acc, output = test(51,
                            test_dataloader,
                            model,
                            criterion1,
                            criterion2,
                            opt,
                            writer,
                            log_file_name,
                            is_test=True)
    tmp_test_acc = np.mean(test_acc)
    if tmp_test_acc > best_acc:
        best_acc = tmp_test_acc

    print("Start to train.....")
    for epoch_i in range(opt.epoch, opt.niter):
        # NOTE(review): scheduler.step() runs before train(); since PyTorch 1.1
        # the recommended order is after the optimizer steps — confirm the
        # intended schedule.
        scheduler.step()
        train(epoch_i, train_dataloader, model, criterion1, criterion2,
              optimizer, opt, writer, log_file_name)
        #val_acc, val_out, val_error =test(valid_loader, model, criterion1,criterion2, opt, log_file_name, is_test=False)
        # Lin changed according to 'sth_pre_abl1' on Sept. 3rd
        test_acc, output = val(epoch_i,
                               val_dataloader,
                               model,
                               criterion1,
                               criterion2,
                               opt,
                               writer,
                               log_file_name,
                               is_test=True)
        #test_acc,_ = test(test_dataloader, model, criterion1, criterion2, opt, log_file_name, is_test=True)
        tmp_test_acc = np.mean(test_acc)
        sum_test_acc.append(test_acc)
        # Track best validation accuracy; save_checkpoint keeps the best model.
        if tmp_test_acc > best_acc:
            is_best = True
            best_acc = tmp_test_acc
        else:
            is_best = False
        utils.save_checkpoint(
            {
                'epoch': epoch_i,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            },
            is_best=is_best,
            directory=opt.resume)
        print("A training epoch finished!")

    #epoch_i =33
    # For testing
    print("Training finished.Start to test.")
    loaded_checkpoint = utils.load_best_checkpoint(opt, model, optimizer)
    if loaded_checkpoint:
        opt, model, optimizer = loaded_checkpoint
    # Lin changed according to 'sth_pre_abl1' on Sept. 3rd
    test_acc, output = test(epoch_i,
                            test_dataloader,
                            model,
                            criterion1,
                            criterion2,
                            opt,
                            writer,
                            log_file_name,
                            is_test=True)
    #test_acc,output = test(test_dataloader, model, criterion1,criterion2, opt, log_file_name, is_test=True)
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print("ratio=0.1, test Accuracy: %.2f " % (100. * test_acc[0][0]))
    print("ratio=0.2, test Accuracy: %.2f " % (100. * test_acc[0][1]))
    print("ratio=0.3, test Accuracy: %.2f " % (100. * test_acc[0][2]))
    print("ratio=0.4, test Accuracy: %.2f " % (100. * test_acc[0][3]))
    print("ratio=0.5, test Accuracy: %.2f " % (100. * test_acc[0][4]))
    print("ratio=0.6, test Accuracy: %.2f " % (100. * test_acc[0][5]))
    print("ratio=0.7, test Accuracy: %.2f " % (100. * test_acc[0][6]))
    print("ratio=0.8, test Accuracy: %.2f " % (100. * test_acc[0][7]))
    print("ratio=0.9, test Accuracy: %.2f " % (100. * test_acc[0][8]))
    print("ratio=1.0, test Accuracy: %.2f " % (100. * test_acc[0][9]))
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
def main(opt):
    """Train/validate/test a PointNet on the BA dataset, then run inference.

    Per-epoch losses go to OutputDir/loss.csv; the early-stopping checkpoint
    is reloaded before inference over the full dataset.
    """
    def _make_loader(ds, shuffle, drop_last):
        return BADataloader(ds, batch_size=opt.batchSize, shuffle=shuffle,
                            num_workers=opt.workers, drop_last=drop_last)

    # Split flags are (train, valid, test); all-False selects every sample.
    train_dataset = BADataset(opt.dataroot, opt.L, True, False, False)
    train_dataloader = _make_loader(train_dataset, True, True)
    valid_dataset = BADataset(opt.dataroot, opt.L, False, True, False)
    valid_dataloader = _make_loader(valid_dataset, True, True)
    test_dataset = BADataset(opt.dataroot, opt.L, False, False, True)
    test_dataloader = _make_loader(test_dataset, True, True)
    all_dataset = BADataset(opt.dataroot, opt.L, False, False, False)
    all_dataloader = _make_loader(all_dataset, False, False)

    net = PointNet(d0=opt.d0, d1=opt.d1, d2=opt.d2, d3=opt.d3,
                   d4=opt.d4, d5=opt.d5, d6=opt.d6)
    net.double()
    print(net)

    criterion = nn.CosineSimilarity(dim=1)
    if opt.cuda:
        net.cuda()
        criterion.cuda()

    optimizer = optim.Adam(net.parameters(), lr=opt.lr)
    early_stopping = EarlyStopping(patience=opt.patience, verbose=True)
    os.makedirs(OutputDir, exist_ok=True)

    train_curve, valid_curve, test_curve = [], [], []
    for epoch in range(opt.niter):
        train_curve.append(
            train(epoch, train_dataloader, net, criterion, optimizer, opt))
        valid_curve.append(valid(valid_dataloader, net, criterion, opt))
        test_curve.append(test(test_dataloader, net, criterion, opt))

        # Early stopping tracks validation loss and checkpoints into OutputDir.
        early_stopping(valid_curve[-1], net, OutputDir)
        if early_stopping.early_stop:
            print("Early stopping")
            break

    pd.DataFrame({
        'epoch': list(range(1, len(train_curve) + 1)),
        'train_loss': train_curve,
        'valid_loss': valid_curve,
        'test_loss': test_curve
    }).to_csv(OutputDir + '/loss.csv', index=False)

    net.load_state_dict(torch.load(OutputDir + '/checkpoint.pt'))
    inference(all_dataloader, net, opt, OutputDir)
def main():
    """Train (and optionally test) a PPO agent on a procgen env, with optional
    AUP (Attainable Utility Preservation) intrinsic-reward shaping.

    Parses ``--env_name`` / ``--model`` first, hands the remaining CLI args to
    the matching args module, then runs the full rollout / update / log / save
    loop under Weights & Biases. Relies on a module-level ``device`` and the
    project imports (utl, utl_eval, utl_test, utl_math, wandb, ...).

    Fix: ``args.max_num_q_aux`` was assigned ``os.listdir(...)`` (a list),
    which made ``range(0, args.max_num_q_aux)`` raise TypeError; it now uses
    ``len(os.listdir(...))`` as the surrounding comment intends.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--env_name', type=str, default='coinrun', help='name of the environment to train on.')
    parser.add_argument('--model', type=str, default='ppo', help='the model to use for training. {ppo, ppo_aup}')
    args, rest_args = parser.parse_known_args()
    env_name = args.env_name
    model = args.model

    # --- ARGUMENTS ---
    # The chosen model decides which full argument set parses the rest.
    if model == 'ppo':
        args = args_ppo.get_args(rest_args)
    elif model == 'ppo_aup':
        args = args_ppo_aup.get_args(rest_args)
    else:
        raise NotImplementedError

    # place other args back into argparse.Namespace
    args.env_name = env_name
    args.model = model

    # warnings
    if args.deterministic_execution:
        print('Envoking deterministic code execution.')
        if torch.backends.cudnn.enabled:
            warnings.warn('Running with deterministic CUDNN.')
        if args.num_processes > 1:
            raise RuntimeError(
                'If you want fully deterministic code, run it with num_processes=1.'
                'Warning: This will slow things down and might break A2C if '
                'policy_num_steps < env._max_episode_steps.')

    # --- TRAINING ---
    print("Setting up wandb logging.")

    # Weights & Biases logger
    if args.run_name is None:
        # make run name as {env_name}_{TIME}
        now = datetime.datetime.now().strftime('_%d-%m_%H:%M:%S')
        args.run_name = args.env_name + '_' + args.algo + now
    # initialise wandb
    wandb.init(project=args.proj_name,
               name=args.run_name,
               group=args.group_name,
               config=args,
               monitor_gym=False)
    # save wandb dir path
    args.run_dir = wandb.run.dir
    # make directory for saving models
    save_dir = os.path.join(wandb.run.dir, 'models')
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    # set random seed of random, torch and numpy
    utl.set_global_seed(args.seed, args.deterministic_execution)

    print("Setting up Environments.")
    # initialise environments for training
    train_envs = make_vec_envs(env_name=args.env_name,
                               start_level=args.train_start_level,
                               num_levels=args.train_num_levels,
                               distribution_mode=args.distribution_mode,
                               paint_vel_info=args.paint_vel_info,
                               num_processes=args.num_processes,
                               num_frame_stack=args.num_frame_stack,
                               device=device)
    # initialise environments for evaluation (start_level=0/num_levels=0 = full distribution)
    eval_envs = make_vec_envs(env_name=args.env_name,
                              start_level=0,
                              num_levels=0,
                              distribution_mode=args.distribution_mode,
                              paint_vel_info=args.paint_vel_info,
                              num_processes=args.num_processes,
                              num_frame_stack=args.num_frame_stack,
                              device=device)
    _ = eval_envs.reset()

    print("Setting up Actor-Critic model and Training algorithm.")
    # initialise policy network
    actor_critic = ACModel(obs_shape=train_envs.observation_space.shape,
                           action_space=train_envs.action_space,
                           hidden_size=args.hidden_size).to(device)

    # initialise policy training algorithm
    if args.algo == 'ppo':
        policy = PPO(actor_critic=actor_critic,
                     ppo_epoch=args.policy_ppo_epoch,
                     num_mini_batch=args.policy_num_mini_batch,
                     clip_param=args.policy_clip_param,
                     value_loss_coef=args.policy_value_loss_coef,
                     entropy_coef=args.policy_entropy_coef,
                     max_grad_norm=args.policy_max_grad_norm,
                     lr=args.policy_lr,
                     eps=args.policy_eps)
    else:
        raise NotImplementedError

    # initialise rollout storage for the policy training algorithm
    rollouts = RolloutStorage(num_steps=args.policy_num_steps,
                              num_processes=args.num_processes,
                              obs_shape=train_envs.observation_space.shape,
                              action_space=train_envs.action_space)

    # initialise Q_aux function(s) for AUP
    if args.use_aup:
        print("Initialising Q_aux models.")
        q_aux = [
            QModel(obs_shape=train_envs.observation_space.shape,
                   action_space=train_envs.action_space,
                   hidden_size=args.hidden_size).to(device)
            for _ in range(args.num_q_aux)
        ]
        if args.num_q_aux == 1:
            # load weights to model
            path = args.q_aux_dir + "0.pt"
            q_aux[0].load_state_dict(torch.load(path))
            q_aux[0].eval()
        else:
            # get max number of q_aux functions to choose from
            # FIX: os.listdir returns a list; range() needs its length.
            args.max_num_q_aux = len(os.listdir(args.q_aux_dir))
            q_aux_models = random.sample(list(range(0, args.max_num_q_aux)),
                                         args.num_q_aux)
            # load weights to models
            for i, model in enumerate(q_aux):
                path = args.q_aux_dir + str(q_aux_models[i]) + ".pt"
                model.load_state_dict(torch.load(path))
                model.eval()

    # count number of frames and updates
    frames = 0
    iter_idx = 0

    # update wandb args
    wandb.config.update(args)

    update_start_time = time.time()
    # reset environments
    obs = train_envs.reset()  # obs.shape = (num_processes,C,H,W)
    # insert initial observation to rollout storage
    rollouts.obs[0].copy_(obs)
    rollouts.to(device)

    # initialise buffer for calculating mean episodic returns
    episode_info_buf = deque(maxlen=10)

    # calculate number of updates
    # number of frames ÷ number of policy steps before update ÷ number of processes
    args.num_batch = args.num_processes * args.policy_num_steps
    args.num_updates = int(args.num_frames) // args.num_batch

    # define AUP coefficient (multiplicative schedule from aup_coef_start to aup_coef_end)
    if args.use_aup:
        aup_coef = args.aup_coef_start
        aup_linear_increase_val = math.exp(
            math.log(args.aup_coef_end / args.aup_coef_start) / args.num_updates)

    print("Training beginning.")
    print("Number of updates: ", args.num_updates)
    for iter_idx in range(args.num_updates):
        print("Iter: ", iter_idx)

        # put actor-critic into train mode
        actor_critic.train()

        if args.use_aup:
            aup_measures = defaultdict(list)

        # rollout policy to collect num_batch of experience and place in storage
        for step in range(args.policy_num_steps):

            # sample actions from policy
            with torch.no_grad():
                value, action, action_log_prob = actor_critic.act(
                    rollouts.obs[step])

            # observe rewards and next obs
            obs, reward, done, infos = train_envs.step(action)

            # calculate AUP reward
            if args.use_aup:
                intrinsic_reward = torch.zeros_like(reward)
                with torch.no_grad():
                    for model in q_aux:
                        # get action-values
                        action_values = model.get_action_value(
                            rollouts.obs[step])
                        # get action-value for action taken
                        action_value = torch.sum(
                            action_values * torch.nn.functional.one_hot(
                                action, num_classes=train_envs.action_space.n).squeeze(
                                    dim=1),
                            dim=1)
                        # calculate the penalty (|Q(s,a) - Q(s, no-op)|; index 4 is the no-op action)
                        intrinsic_reward += torch.abs(
                            action_value.unsqueeze(dim=1) -
                            action_values[:, 4].unsqueeze(dim=1))
                intrinsic_reward /= args.num_q_aux
                # add intrinsic reward to the extrinsic reward
                reward -= aup_coef * intrinsic_reward
                # log the intrinsic reward from the first env.
                aup_measures['intrinsic_reward'].append(aup_coef * intrinsic_reward[0, 0])
                if done[0] and infos[0]['prev_level_complete'] == 1:
                    aup_measures['episode_complete'].append(2)
                elif done[0] and infos[0]['prev_level_complete'] == 0:
                    aup_measures['episode_complete'].append(1)
                else:
                    aup_measures['episode_complete'].append(0)

            # log episode info if episode finished
            for info in infos:
                if 'episode' in info.keys():
                    episode_info_buf.append(info['episode'])

            # create mask for episode ends
            masks = torch.FloatTensor([[0.0] if done_ else [1.0]
                                       for done_ in done]).to(device)

            # add experience to storage
            rollouts.insert(obs, reward, action, value, action_log_prob, masks)

            frames += args.num_processes

        # increase aup coefficient after every update (multiplicative)
        if args.use_aup:
            aup_coef *= aup_linear_increase_val

        # --- UPDATE ---

        # bootstrap next value prediction
        with torch.no_grad():
            next_value = actor_critic.get_value(rollouts.obs[-1]).detach()

        # compute returns for current rollouts
        rollouts.compute_returns(next_value, args.policy_gamma,
                                 args.policy_gae_lambda)

        # update actor-critic using policy training algorithm
        total_loss, value_loss, action_loss, dist_entropy = policy.update(
            rollouts)

        # clean up storage after update
        rollouts.after_update()

        # --- LOGGING ---

        if iter_idx % args.log_interval == 0 or iter_idx == args.num_updates - 1:

            # --- EVALUATION ---
            eval_episode_info_buf = utl_eval.evaluate(
                eval_envs=eval_envs, actor_critic=actor_critic, device=device)

            # get stats for run
            update_end_time = time.time()
            num_interval_updates = 1 if iter_idx == 0 else args.log_interval
            fps = num_interval_updates * (
                args.num_processes * args.policy_num_steps) / (update_end_time -
                                                               update_start_time)
            update_start_time = update_end_time
            # calculates whether the value function is a good predicator of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = utl_math.explained_variance(utl.sf01(rollouts.value_preds),
                                             utl.sf01(rollouts.returns))

            if args.use_aup:
                step = frames - args.num_processes * args.policy_num_steps
                for i in range(args.policy_num_steps):
                    wandb.log(
                        {
                            'aup/intrinsic_reward': aup_measures['intrinsic_reward'][i],
                            'aup/episode_complete': aup_measures['episode_complete'][i]
                        },
                        step=step)
                    step += args.num_processes

            wandb.log(
                {
                    'misc/timesteps': frames,
                    'misc/fps': fps,
                    'misc/explained_variance': float(ev),
                    'losses/total_loss': total_loss,
                    'losses/value_loss': value_loss,
                    'losses/action_loss': action_loss,
                    'losses/dist_entropy': dist_entropy,
                    'train/mean_episodic_return':
                    utl_math.safe_mean([
                        episode_info['r'] for episode_info in episode_info_buf
                    ]),
                    'train/mean_episodic_length':
                    utl_math.safe_mean([
                        episode_info['l'] for episode_info in episode_info_buf
                    ]),
                    'eval/mean_episodic_return':
                    utl_math.safe_mean([
                        episode_info['r'] for episode_info in eval_episode_info_buf
                    ]),
                    'eval/mean_episodic_length':
                    utl_math.safe_mean([
                        episode_info['l'] for episode_info in eval_episode_info_buf
                    ])
                },
                step=frames)

        # --- SAVE MODEL ---

        # save for every interval-th episode or for the last epoch
        if iter_idx != 0 and (iter_idx % args.save_interval == 0
                              or iter_idx == args.num_updates - 1):
            print("Saving Actor-Critic Model.")
            torch.save(actor_critic.state_dict(),
                       os.path.join(save_dir, "policy{0}.pt".format(iter_idx)))

    # close envs
    train_envs.close()
    eval_envs.close()

    # --- TEST ---

    if args.test:
        print("Testing beginning.")
        episodic_return = utl_test.test(args=args,
                                        actor_critic=actor_critic,
                                        device=device)

        # save returns from train and test levels to analyse using interactive mode
        train_levels = torch.arange(
            args.train_start_level,
            args.train_start_level + args.train_num_levels)
        for i, level in enumerate(train_levels):
            wandb.log({
                'test/train_levels': level,
                'test/train_returns': episodic_return[0][i]
            })
        test_levels = torch.arange(
            args.test_start_level,
            args.test_start_level + args.test_num_levels)
        for i, level in enumerate(test_levels):
            wandb.log({
                'test/test_levels': level,
                'test/test_returns': episodic_return[1][i]
            })

        # log returns from test envs
        wandb.run.summary["train_mean_episodic_return"] = utl_math.safe_mean(
            episodic_return[0])
        wandb.run.summary["test_mean_episodic_return"] = utl_math.safe_mean(
            episodic_return[1])
]) if args.imshow == True: train_dataset = selfData(args.train_img, args.train_lab, transforms) train_loader = DataLoader(train_dataset, batch_size = 64, shuffle = True, num_workers = 0, drop_last= False) imgs, labels = train_loader.__iter__().__next__() imshow(train_loader) if args.model == 'mAlexNet': net = mAlexNet().to(device) elif args.model == 'AlexNet': net = AlexNet().to(device) criterion = nn.CrossEntropyLoss() if args.path == '': train(args.epochs, args.train_img, args.train_lab, transforms, net, criterion) PATH = './model.pth' torch.save(net.state_dict(), PATH) if args.model == 'mAlexNet': net = mAlexNet().to(device) elif args.model == 'AlexNet': net = AlexNet().to(device) net.load_state_dict(torch.load(PATH)) else: PATH = args.path if args.model == 'mAlexNet': net = mAlexNet().to(device) elif args.model == 'AlexNet': net = AlexNet().to(device) net.load_state_dict(torch.load(PATH)) accuracy = test(args.test_img, args.test_lab, transforms, net) print("\nThe accuracy of training on '{}' and testing on '{}' is {:.3f}.".format(args.train_lab.split('.')[0], args.test_lab.split('.')[0], accuracy))
def train():
    """Full training loop for the joint MDENet (MiDaS-depth + YOLO) model.

    Reads configuration from module-level globals (``opt``, ``hyp``, ``device``,
    ``wdir``, ``results_file``, ``yolo_props``, ``freeze``, ``alpha``,
    ``mixed_precision``, ``tb_writer``) — TODO confirm all are defined at module
    scope in the enclosing file. Returns the last ``results`` tuple
    ('P', 'R', 'mAP', 'F1', val GIoU, val Objectness, val Classification).
    """
    cfg = opt.cfg
    data = opt.data
    epochs = opt.epochs  # 500200 batches at bs 64, 117263 images = 273 epochs
    batch_size = opt.batch_size
    accumulate = opt.accumulate  # effective bs = batch_size * accumulate = 16 * 4 = 64
    weights = opt.weights  # initial training weights
    imgsz_min, imgsz_max, imgsz_test = opt.img_size  # img sizes (min, max, test)

    # Image Sizes
    gs = 64  # (pixels) grid size
    assert math.fmod(
        imgsz_min,
        gs) == 0, '--img-size %g must be a %g-multiple' % (imgsz_min, gs)
    opt.multi_scale |= imgsz_min != imgsz_max  # multi if different (min, max)
    if opt.multi_scale:
        if imgsz_min == imgsz_max:
            # widen the size range around the single requested size
            imgsz_min //= 1.5
            imgsz_max //= 0.667
        grid_min, grid_max = imgsz_min // gs, imgsz_max // gs
        imgsz_min, imgsz_max = grid_min * gs, grid_max * gs
    img_size = imgsz_max  # initialize with max size

    # Configure run
    init_seeds()
    data_dict = parse_data_cfg(data)
    train_path = data_dict['train']
    test_path = data_dict['valid']
    nc = 1 if opt.single_cls else int(
        data_dict['classes'])  # number of classes
    hyp['cls'] *= nc / 80  # update coco-tuned hyp['cls'] to current dataset

    # Remove previous results
    for f in glob.glob('*_batch*.png') + glob.glob(results_file):
        os.remove(f)

    # Initialize model
    model = MDENet(path=weights, yolo_props=yolo_props,
                   freeze=freeze).to(device)

    # print(model)

    # Optimizer: three parameter groups so weight decay only hits conv weights.
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in dict(model.named_parameters()).items():
        if '.bias' in k:
            pg2 += [v]  # biases
        elif 'Conv2d.weight' in k:
            pg1 += [v]  # apply weight_decay
        else:
            pg0 += [v]  # all else

    if opt.adam:
        # hyp['lr0'] *= 0.1  # reduce lr (i.e. SGD=5E-3, Adam=5E-4)
        optimizer = optim.Adam(pg0, lr=hyp['lr0'])
        # optimizer = AdaBound(pg0, lr=hyp['lr0'], final_lr=0.1)
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)
    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    del pg0, pg1, pg2

    start_epoch = 0
    best_fitness = 0.0
    #attempt_download(weights)
    # NOTE(review): checkpoint/darknet-weight resume logic below was disabled
    # by wrapping it in a string literal; kept verbatim for reference.
    """
    if weights.endswith('.pt'):  # pytorch format
        # possible weights are '*.pt', 'yolov3-spp.pt', 'yolov3-tiny.pt' etc.
        chkpt = torch.load(weights, map_location=device)

        # load model
        try:
            chkpt['model'] = {k: v for k, v in chkpt['model'].items() if model.state_dict()[k].numel() == v.numel()}
            model.load_state_dict(chkpt['model'], strict=False)
        except KeyError as e:
            s = "%s is not compatible with %s. Specify --weights '' or specify a --cfg compatible with %s. " \
                "See https://github.com/ultralytics/yolov3/issues/657" % (opt.weights, opt.cfg, opt.weights)
            raise KeyError(s) from e

        # load optimizer
        if chkpt['optimizer'] is not None:
            optimizer.load_state_dict(chkpt['optimizer'])
            best_fitness = chkpt['best_fitness']

        # load results
        if chkpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(chkpt['training_results'])  # write results.txt

        start_epoch = chkpt['epoch'] + 1
        del chkpt

    elif len(weights) > 0:  # darknet format
        # possible weights are '*.weights', 'yolov3-tiny.conv.15', 'darknet53.conv.74' etc.
        load_darknet_weights(model, weights)
    """

    # Mixed precision training https://github.com/NVIDIA/apex
    if mixed_precision:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)

    # Scheduler https://github.com/ultralytics/yolov3/issues/238
    lf = lambda x: (
        ((1 + math.cos(x * math.pi / epochs)) / 2
         )**1.0) * 0.95 + 0.05  # cosine https://arxiv.org/pdf/1812.01187.pdf
    scheduler = lr_scheduler.LambdaLR(optimizer,
                                      lr_lambda=lf,
                                      last_epoch=start_epoch - 1)
    # scheduler = lr_scheduler.MultiStepLR(optimizer, [round(epochs * x) for x in [0.8, 0.9]], 0.1, start_epoch - 1)

    # Plot lr schedule
    # y = []
    # for _ in range(epochs):
    #     scheduler.step()
    #     y.append(optimizer.param_groups[0]['lr'])
    # plt.plot(y, '.-', label='LambdaLR')
    # plt.xlabel('epoch')
    # plt.ylabel('LR')
    # plt.tight_layout()
    # plt.savefig('LR.png', dpi=300)

    # Initialize distributed training (single-node NCCL setup)
    if device.type != 'cpu' and torch.cuda.device_count(
    ) > 1 and torch.distributed.is_available():
        dist.init_process_group(
            backend='nccl',  # 'distributed backend'
            init_method=
            'tcp://127.0.0.1:9999',  # distributed training init method
            world_size=1,  # number of nodes for distributed training
            rank=0)  # distributed training node rank
        model = torch.nn.parallel.DistributedDataParallel(
            model, find_unused_parameters=True)
        model.yolo_layers = model.module.yolo_layers  # move yolo layer indices to top level

    # Dataset
    print("train_path: ", train_path)
    print("img_size", img_size)
    print("batch_size", batch_size)
    print("hyp", hyp)
    print("opt.rect", opt.rect)
    print("opt.cache_images", opt.cache_images)
    print("opt.single_cls", opt.single_cls)
    dataset = LoadImagesAndLabels(
        train_path,
        img_size,
        batch_size,
        augment=not freeze["yolo"],
        hyp=hyp,  # augmentation hyperparameters
        rect=opt.rect,  # rectangular training
        cache_images=opt.cache_images,
        single_cls=opt.single_cls)

    # Dataloader
    batch_size = min(batch_size, len(dataset))
    print("Dataloader batch_size", batch_size)
    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0,
              8])  # number of workers
    print("nw", nw)
    print("opt.rect", opt.rect)
    print("dataset.collate_fn", dataset.collate_fn)
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=nw,
        shuffle=True,  # Shuffle=True unless rectangular training is used
        pin_memory=True,
        collate_fn=dataset.collate_fn)

    # Testloader
    testloader = torch.utils.data.DataLoader(LoadImagesAndLabels(
        test_path,
        imgsz_test,
        batch_size,
        hyp=hyp,
        rect=True,
        cache_images=opt.cache_images,
        single_cls=opt.single_cls),
                                             batch_size=batch_size,
                                             num_workers=nw,
                                             pin_memory=True,
                                             collate_fn=dataset.collate_fn)

    # Model parameters
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights

    # Model EMA
    ema = torch_utils.ModelEMA(model)

    # Start training
    nb = len(dataloader)  # number of batches
    n_burn = max(3 * nb,
                 500)  # burn-in iterations, max(3 epochs, 500 iterations)
    maps = np.zeros(nc)  # mAP per class
    # torch.autograd.set_detect_anomaly(True)
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    t0 = time.time()
    # freeze the pretrained (MiDaS) backbone for the whole run
    for param in model.pretrained.parameters():
        param.requires_grad = False
    print('Image sizes %g - %g train, %g test' %
          (imgsz_min, imgsz_max, imgsz_test))
    print('Using %g dataloader workers' % nw)
    print('Starting training for %g epochs...' % epochs)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            w = model.class_weights.cpu().numpy() * (1 -
                                                     maps)**2  # class weights
            image_weights = labels_to_image_weights(dataset.labels,
                                                    nc=nc,
                                                    class_weights=w)
            dataset.indices = random.choices(range(dataset.n),
                                             weights=image_weights,
                                             k=dataset.n)  # rand weighted idx

        mloss = torch.zeros(5).to(device)  # mean losses
        print(('\n' + '%10s' * 9) %
              ('Epoch', 'gpu_mem', 'GIoU', 'obj', 'cls', 'l_depth', 'total',
               'targets', 'img_size'))
        pbar = tqdm(enumerate(dataloader), total=nb)  # progress bar
        for i, (
                imgs, targets, paths, _, dp_imgs, pln_imgs
        ) in pbar:  # batch -------------------------------------------------------------
            #print("imgs:", len(imgs))
            #print("targets:", targets)
            #print("paths:", paths)
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device).float(
            ) / 255.0  # uint8 to float32, 0 - 255 to 0.0 - 1.0
            targets = targets.to(device)

            # Burn-in: ramp lr/momentum/giou-ratio over the first n_burn iterations
            if ni <= n_burn * 2:
                model.gr = np.interp(
                    ni, [0, n_burn * 2],
                    [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                if ni == n_burn:  # burnin complete
                    print_model_biases(model)

                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, [0, n_burn],
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, [0, n_burn],
                                                  [0.9, hyp['momentum']])

            # Multi-Scale training
            if opt.multi_scale:
                if ni / accumulate % 1 == 0:  # adjust img_size (67% - 150%) every 1 batch
                    img_size = random.randrange(grid_min, grid_max + 1) * gs
                sf = img_size / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to 32-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            #print("model.training:", model.training)
            # Run model: returns depth map (midas_out) and detection outputs (yolo_out)
            midas_out, yolo_out = model(imgs)
            #showimg(imgs[0].detach().cpu())
            #showimg(midas_out[0].detach().cpu())
            """
            print("midas_out", midas_out.shape)
            print(len(yolo_out))
            print("yolo_out_0", yolo_out[0].shape)
            print("yolo_out_1", yolo_out[1].shape)
            print("yolo_out_2", yolo_out[2].shape)
            """

            # Compute loss (joint detection + depth loss, weighted by alpha)
            loss, loss_items = compute_loss(yolo_out, targets, midas_out,
                                            dp_imgs, alpha, model)
            if not torch.isfinite(loss):
                print('WARNING: non-finite loss, ending training ', loss_items)
                return results

            # Scale loss by nominal batch_size of 64
            loss *= batch_size / 64

            # Compute gradient
            if mixed_precision:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            # Optimize accumulated gradient
            if ni % accumulate == 0:
                optimizer.step()
                optimizer.zero_grad()
                ema.update(model)

            # Print batch results
            #print("loss_items", loss_items)
            mloss = (mloss * i + loss_items) / (i + 1)  # update mean losses
            #print("mloss", mloss)
            mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9
                             if torch.cuda.is_available() else 0)  # (GB)
            s = ('%10s' * 2 + '%10.3g' * 7) % ('%g/%g' %
                                               (epoch, epochs - 1), mem,
                                               *mloss, len(targets), img_size)
            pbar.set_description(s)

            # Plot images with bounding boxes
            if ni < 1:
                f = 'train_batch%g.png' % i  # filename
                plot_images(imgs=imgs, targets=targets, paths=paths, fname=f)
                if tb_writer:
                    tb_writer.add_image(f,
                                        cv2.imread(f)[:, :, ::-1],
                                        dataformats='HWC')
                    # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Update scheduler
        scheduler.step()

        # Process epoch results
        ema.update_attr(model)
        final_epoch = epoch + 1 == epochs
        if not opt.notest or final_epoch:  # Calculate mAP
            is_coco = any([
                x in data
                for x in ['coco.data', 'coco2014.data', 'coco2017.data']
            ]) and model.nc == 80
            results, maps = test.test(
                cfg,
                data,
                batch_size=batch_size,
                img_size=imgsz_test,
                model=ema.ema,
                #model=model,
                save_json=final_epoch and is_coco,
                single_cls=opt.single_cls,
                dataloader=testloader)

        # Write epoch results
        with open(results_file, 'a') as f:
            f.write(
                s + '%10.3g' * 8 % results +
                '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls, ldepth)
        if len(opt.name) and opt.bucket:
            os.system('gsutil cp results.txt gs://%s/results/results%s.txt' %
                      (opt.bucket, opt.name))

        # Write Tensorboard results
        if tb_writer:
            tags = [
                'train/giou_loss', 'train/obj_loss', 'train/cls_loss',
                'metrics/precision', 'metrics/recall', 'metrics/mAP_0.5',
                'metrics/F1', 'val/giou_loss', 'val/obj_loss', 'val/cls_loss'
            ]
            for x, tag in zip(list(mloss[:-1]) + list(results), tags):
                tb_writer.add_scalar(tag, x, epoch)

        # Update best mAP
        fi = fitness(np.array(results).reshape(
            1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
        if fi > best_fitness:
            best_fitness = fi

        # Save training results
        save = (not opt.nosave) or (final_epoch and not opt.evolve)
        if save:
            with open(results_file, 'r') as f:
                # Create checkpoint
                chkpt = {
                    'epoch':
                    epoch,
                    'best_fitness':
                    best_fitness,
                    'training_results':
                    f.read(),
                    'model':
                    ema.ema.module.state_dict()
                    if hasattr(model, 'module') else ema.ema.state_dict(),
                    'optimizer':
                    None if final_epoch else optimizer.state_dict()
                }

            # Save last checkpoint
            torch.save(chkpt, last)

            # Save best checkpoint
            if (best_fitness == fi) and not final_epoch:
                torch.save(chkpt, best)

            # Save backup every 10 epochs (optional)
            # if epoch > 0 and epoch % 10 == 0:
            #     torch.save(chkpt, wdir + 'backup%g.pt' % epoch)

            # Delete checkpoint
            del chkpt

        # end epoch ----------------------------------------------------------------------------------------------------

    # end training
    n = opt.name
    if len(n):
        n = '_' + n if not n.isnumeric() else n
        fresults, flast, fbest = 'results%s.txt' % n, wdir + 'last%s.pt' % n, wdir + 'best%s.pt' % n
        for f1, f2 in zip([wdir + 'last.pt', wdir + 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                ispt = f2.endswith('.pt')  # is *.pt
                strip_optimizer(f2) if ispt else None  # strip optimizer
                os.system('gsutil cp %s gs://%s/weights' % (
                    f2,
                    opt.bucket)) if opt.bucket and ispt else None  # upload

    if not opt.evolve:
        plot_results()  # save as results.png
    print('%g epochs completed in %.3f hours.\n' %
          (epoch - start_epoch + 1, (time.time() - t0) / 3600))
    dist.destroy_process_group() if torch.cuda.device_count() > 1 else None
    torch.cuda.empty_cache()
    return results
def train(hyp, opt, device, tb_writer=None):
    """YOLOv5-style training loop with AMP, EMA, optional DP/DDP and wandb-free
    tensorboard logging.

    Args:
        hyp: hyperparameter dict (lr0, lrf, momentum, weight_decay, cls, ...);
             mutated in place (weight_decay and cls are rescaled).
        opt: parsed CLI options namespace.
        device: torch.device to train on.
        tb_writer: optional TensorBoard SummaryWriter; also determines log_dir.

    Returns:
        The final ``results`` tuple
        ('P', 'R', 'mAP', 'F1', val GIoU, val Objectness, val Classification).
    """
    #logger.info(f'Hyperparameters {hyp}')
    log_dir = Path(tb_writer.log_dir) if tb_writer else Path(
        opt.logdir) / 'evolve'  # logging directory
    wdir = log_dir / 'weights'  # weights directory
    os.makedirs(wdir, exist_ok=True)
    last = wdir / 'last.pt'
    best = wdir / 'best.pt'
    results_file = str(log_dir / 'results.txt')
    epochs, batch_size, total_batch_size, weights, rank = \
        opt.epochs, opt.batch_size, opt.total_batch_size, opt.weights, opt.global_rank

    # Save run settings
    with open(log_dir / 'hyp.yaml', 'w') as f:
        yaml.dump(hyp, f, sort_keys=False)
    with open(log_dir / 'opt.yaml', 'w') as f:
        yaml.dump(vars(opt), f, sort_keys=False)

    # Configure
    cuda = device.type != 'cpu'
    init_seeds(2 + rank)
    with open(opt.data) as f:
        data_dict = yaml.load(f, Loader=yaml.FullLoader)  # data dict
    with torch_distributed_zero_first(rank):
        check_dataset(data_dict)  # check
    train_path = data_dict['train']
    test_path = data_dict['val']
    nc, names = (1, ['item']) if opt.single_cls else (int(
        data_dict['nc']), data_dict['names'])  # number classes, names
    assert len(names) == nc, '%g names found for nc=%g dataset in %s' % (
        len(names), nc, opt.data)  # check

    # Model
    pretrained = weights.endswith('.pt')
    if pretrained:
        with torch_distributed_zero_first(rank):
            download(weights)  # download if not found locally
        ckpt = torch.load(weights, map_location=device)  # load checkpoint
        model = Model(opt.cfg or ckpt['model'].yaml, ch=3,
                      nc=nc).to(device)  # create
        exclude = ['anchor'] if opt.cfg else []  # exclude keys
        state_dict = ckpt['model'].float().state_dict()  # to FP32
        state_dict = intersect_dicts(state_dict,
                                     model.state_dict(),
                                     exclude=exclude)  # intersect
        model.load_state_dict(state_dict, strict=False)  # load
        #logger.info('Transferred %g/%g items from %s' % (len(state_dict), len(model.state_dict()), weights))  # report
    else:
        model = Model(opt.cfg, ch=3, nc=nc).to(device)  # create

    # Freeze (no-op by default: the list holds only an empty string)
    freeze = [
        '',
    ]  # parameter names to freeze (full or partial)
    if any(freeze):
        for k, v in model.named_parameters():
            if any(x in k for x in freeze):
                print('freezing %s' % k)
                v.requires_grad = False

    # Optimizer
    nbs = 64  # nominal batch size
    accumulate = max(round(nbs / total_batch_size),
                     1)  # accumulate loss before optimizing
    hyp['weight_decay'] *= total_batch_size * accumulate / nbs  # scale weight_decay

    # Three parameter groups so weight decay only applies to non-BN weights.
    pg0, pg1, pg2 = [], [], []  # optimizer parameter groups
    for k, v in model.named_parameters():
        v.requires_grad = True
        if '.bias' in k:
            pg2.append(v)  # biases
        elif '.weight' in k and '.bn' not in k:
            pg1.append(v)  # apply weight decay
        else:
            pg0.append(v)  # all else

    if opt.adam:
        optimizer = optim.Adam(pg0,
                               lr=hyp['lr0'],
                               betas=(hyp['momentum'],
                                      0.999))  # adjust beta1 to momentum
    else:
        optimizer = optim.SGD(pg0,
                              lr=hyp['lr0'],
                              momentum=hyp['momentum'],
                              nesterov=True)

    optimizer.add_param_group({
        'params': pg1,
        'weight_decay': hyp['weight_decay']
    })  # add pg1 with weight_decay
    optimizer.add_param_group({'params': pg2})  # add pg2 (biases)
    #logger.info('Optimizer groups: %g .bias, %g conv.weight, %g other' % (len(pg2), len(pg1), len(pg0)))
    del pg0, pg1, pg2

    # Cosine annealing from lr0 down to lr0*lrf over `epochs`.
    lf = lambda x: ((1 + math.cos(x * math.pi / epochs)) / 2) * (1 - hyp[
        'lrf']) + hyp['lrf']  # cosine
    scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    # Resume
    start_epoch, best_fitness = 0, 0.0
    if pretrained:
        # Optimizer
        if ckpt['optimizer'] is not None:
            optimizer.load_state_dict(ckpt['optimizer'])
            best_fitness = ckpt['best_fitness']

        # Results
        if ckpt.get('training_results') is not None:
            with open(results_file, 'w') as file:
                file.write(ckpt['training_results'])  # write results.txt

        # Epochs
        start_epoch = ckpt['epoch'] + 1
        if opt.resume:
            assert start_epoch > 0, '%s training to %g epochs is finished, nothing to resume.' % (
                weights, epochs)
            shutil.copytree(wdir,
                            wdir.parent /
                            f'weights_backup_epoch{start_epoch - 1}'
                            )  # save previous weights
        if epochs < start_epoch:
            #logger.info('%s has been trained for %g epochs. Fine-tuning for %g additional epochs.' %
            #            (weights, ckpt['epoch'], epochs))
            epochs += ckpt['epoch']  # finetune additional epochs

        del ckpt, state_dict

    # Image sizes
    gs = int(max(model.stride))  # grid size (max stride)
    imgsz, imgsz_test = [check_img_size(x, gs) for x in opt.img_size
                         ]  # verify imgsz are gs-multiples

    # DP mode
    if cuda and rank == -1 and torch.cuda.device_count() > 1:
        model = torch.nn.DataParallel(model)

    # SyncBatchNorm
    if opt.sync_bn and cuda and rank != -1:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model).to(device)
        #logger.info('Using SyncBatchNorm()')

    # Exponential moving average (kept only on the master/single process)
    ema = ModelEMA(model) if rank in [-1, 0] else None

    # DDP mode
    if cuda and rank != -1:
        model = DDP(model,
                    device_ids=[opt.local_rank],
                    output_device=(opt.local_rank))

    # Trainloader
    dataloader, dataset = create_dataloader(train_path,
                                            imgsz,
                                            batch_size,
                                            gs,
                                            opt,
                                            hyp=hyp,
                                            augment=True,
                                            cache=opt.cache_images,
                                            rect=opt.rect,
                                            rank=rank,
                                            world_size=opt.world_size,
                                            workers=opt.workers)
    mlc = np.concatenate(dataset.labels, 0)[:, 0].max()  # max label class
    nb = len(dataloader)  # number of batches
    assert mlc < nc, 'Label class %g exceeds nc=%g in %s. Possible class labels are 0-%g' % (
        mlc, nc, opt.data, nc - 1)

    # Testloader
    if rank in [-1, 0]:
        ema.updates = start_epoch * nb // accumulate  # set EMA updates
        testloader = create_dataloader(
            test_path,
            imgsz_test,
            total_batch_size,
            gs,
            opt,
            hyp=hyp,
            augment=False,
            cache=opt.cache_images,
            rect=True,
            rank=-1,
            world_size=opt.world_size,
            workers=opt.workers)[0]  # only runs on process 0

    # Model parameters
    hyp['cls'] *= nc / 80.  # scale coco-tuned hyp['cls'] to current dataset
    model.nc = nc  # attach number of classes to model
    model.hyp = hyp  # attach hyperparameters to model
    model.gr = 1.0  # giou loss ratio (obj_loss = 1.0 or giou)
    model.class_weights = labels_to_class_weights(dataset.labels, nc).to(
        device)  # attach class weights
    model.names = names

    # Class frequency
    if rank in [-1, 0]:
        labels = np.concatenate(dataset.labels, 0)
        c = torch.tensor(labels[:, 0])  # classes
        # cf = torch.bincount(c.long(), minlength=nc) + 1.
        # model._initialize_biases(cf.to(device))
        plot_labels(labels, save_dir=log_dir)
        if tb_writer:
            # tb_writer.add_hparams(hyp, {})  # causes duplicate https://github.com/ultralytics/yolov5/pull/384
            tb_writer.add_histogram('classes', c, 0)

        # Check anchors
        if not opt.noautoanchor:
            check_anchors(dataset,
                          model=model,
                          thr=hyp['anchor_t'],
                          imgsz=imgsz)

    # Start training
    t0 = time.time()
    nw = max(3 * nb,
             1e3)  # number of warmup iterations, max(3 epochs, 1k iterations)
    # nw = min(nw, (epochs - start_epoch) / 2 * nb)  # limit warmup to < 1/2 of training
    maps = np.zeros(nc)  # mAP per class
    results = (
        0, 0, 0, 0, 0, 0, 0
    )  # 'P', 'R', 'mAP', 'F1', 'val GIoU', 'val Objectness', 'val Classification'
    scheduler.last_epoch = start_epoch - 1  # do not move
    scaler = amp.GradScaler(enabled=cuda)
    #logger.info('Image sizes %g train, %g test' % (imgsz, imgsz_test))
    #logger.info('Using %g dataloader workers' % dataloader.num_workers)
    logger.info('++Training for %g epochs' % epochs)
    # torch.autograd.set_detect_anomaly(True)
    for epoch in range(
            start_epoch, epochs
    ):  # epoch ------------------------------------------------------------------
        print('>>Epoch {} of {}'.format(epoch + 1, epochs))
        model.train()

        # Update image weights (optional)
        if dataset.image_weights:
            # Generate indices
            if rank in [-1, 0]:
                w = model.class_weights.cpu().numpy() * (
                    1 - maps)**2  # class weights
                image_weights = labels_to_image_weights(dataset.labels,
                                                        nc=nc,
                                                        class_weights=w)
                dataset.indices = random.choices(
                    range(dataset.n), weights=image_weights,
                    k=dataset.n)  # rand weighted idx
            # Broadcast if DDP
            if rank != -1:
                indices = torch.zeros([dataset.n], dtype=torch.int)
                if rank == 0:
                    indices[:] = torch.tensor(dataset.indices,
                                              dtype=torch.int)
                dist.broadcast(indices, 0)
                if rank != 0:
                    dataset.indices = indices.cpu().numpy()

        # Update mosaic border
        # b = int(random.uniform(0.25 * imgsz, 0.75 * imgsz + gs) // gs * gs)
        # dataset.mosaic_border = [b - imgsz, -b]  # height, width borders

        mloss = torch.zeros(4, device=device)  # mean losses
        if rank != -1:
            dataloader.sampler.set_epoch(epoch)
        pbar = enumerate(dataloader)
        #logger.info(('\n' + '%10s' * 2) % ('Epoch', 'giou_loss'))
        if rank in [-1, 0]:
            pbar = tqdm(pbar, total=nb)  # progress bar
        optimizer.zero_grad()
        for i, (
                imgs, targets, paths, _
        ) in pbar:  # batch -------------------------------------------------------------
            ni = i + nb * epoch  # number integrated batches (since train start)
            imgs = imgs.to(device, non_blocking=True).float(
            ) / 255.0  # uint8 to float32, 0-255 to 0.0-1.0

            # Warmup
            if ni <= nw:
                xi = [0, nw]  # x interp
                # model.gr = np.interp(ni, xi, [0.0, 1.0])  # giou loss ratio (obj_loss = 1.0 or giou)
                accumulate = max(
                    1,
                    np.interp(ni, xi, [1, nbs / total_batch_size]).round())
                for j, x in enumerate(optimizer.param_groups):
                    # bias lr falls from 0.1 to lr0, all other lrs rise from 0.0 to lr0
                    x['lr'] = np.interp(
                        ni, xi,
                        [0.1 if j == 2 else 0.0, x['initial_lr'] * lf(epoch)])
                    if 'momentum' in x:
                        x['momentum'] = np.interp(ni, xi,
                                                  [0.9, hyp['momentum']])

            # Multi-scale
            if opt.multi_scale:
                sz = random.randrange(imgsz * 0.5,
                                      imgsz * 1.5 + gs) // gs * gs  # size
                sf = sz / max(imgs.shape[2:])  # scale factor
                if sf != 1:
                    ns = [math.ceil(x * sf / gs) * gs for x in imgs.shape[2:]
                          ]  # new shape (stretched to gs-multiple)
                    imgs = F.interpolate(imgs,
                                         size=ns,
                                         mode='bilinear',
                                         align_corners=False)

            # Forward
            with amp.autocast(enabled=cuda):
                pred = model(imgs)  # forward
                loss, loss_items = compute_loss(
                    pred, targets.to(device),
                    model)  # loss scaled by batch_size
                if rank != -1:
                    loss *= opt.world_size  # gradient averaged between devices in DDP mode

            # Backward
            scaler.scale(loss).backward()

            # Optimize
            if ni % accumulate == 0:
                scaler.step(optimizer)  # optimizer.step
                scaler.update()
                optimizer.zero_grad()
                if ema:
                    ema.update(model)

            # Print
            if rank in [-1, 0]:
                mloss = (mloss * i + loss_items) / (i + 1
                                                    )  # update mean losses
                mem = '%.3gG' % (torch.cuda.memory_reserved() / 1E9
                                 if torch.cuda.is_available() else 0)  # (GB)
                s = ('%10s' * 2 + '%10.4g' * 3) % ('%g/%g' %
                                                   (epoch,
                                                    epochs - 1), *mloss)
                #print()
                pbar.set_description('>>GIOU_loss : {}'.format(
                    mloss.mean().item()))

                # Plot
                if ni < 3:
                    f = str(log_dir /
                            ('train_batch%g.jpg' % ni))  # filename
                    result = plot_images(images=imgs,
                                         targets=targets,
                                         paths=paths,
                                         fname=f)
                    if tb_writer and result is not None:
                        tb_writer.add_image(f,
                                            result,
                                            dataformats='HWC',
                                            global_step=epoch)
                        # tb_writer.add_graph(model, imgs)  # add model to tensorboard

            # end batch ------------------------------------------------------------------------------------------------

        # Scheduler
        lr = [x['lr'] for x in optimizer.param_groups]  # for tensorboard
        scheduler.step()

        # DDP process 0 or single-GPU
        if rank in [-1, 0]:
            # mAP
            if ema:
                ema.update_attr(
                    model,
                    include=['yaml', 'nc', 'hyp', 'gr', 'names', 'stride'])
            final_epoch = epoch + 1 == epochs
            if not opt.notest or final_epoch:  # Calculate mAP
                results, maps, times = test.test(opt.data,
                                                 batch_size=total_batch_size,
                                                 imgsz=imgsz_test,
                                                 model=ema.ema,
                                                 single_cls=opt.single_cls,
                                                 dataloader=testloader,
                                                 save_dir=log_dir)

            # Write
            with open(results_file, 'a') as f:
                f.write(s + '%10.4g' * 7 % results +
                        '\n')  # P, R, mAP, F1, test_losses=(GIoU, obj, cls)
            if len(opt.name) and opt.bucket:
                os.system('gsutil cp %s gs://%s/results/results%s.txt' %
                          (results_file, opt.bucket, opt.name))

            # Tensorboard
            if tb_writer:
                tags = [
                    'train/giou_loss',
                    'train/obj_loss',
                    'train/cls_loss',  # train loss
                    'metrics/precision',
                    'metrics/recall',
                    'metrics/mAP_0.5',
                    'metrics/mAP_0.5:0.95',
                    'val/giou_loss',
                    'val/obj_loss',
                    'val/cls_loss',  # val loss
                    'x/lr0',
                    'x/lr1',
                    'x/lr2'
                ]  # params
                for x, tag in zip(list(mloss[:-1]) + list(results) + lr,
                                  tags):
                    tb_writer.add_scalar(tag, x, epoch)

            # Update best mAP
            fi = fitness(np.array(results).reshape(
                1, -1))  # fitness_i = weighted combination of [P, R, mAP, F1]
            if fi > best_fitness:
                best_fitness = fi

            # Save model
            save = (not opt.nosave) or (final_epoch and not opt.evolve)
            if save:
                with open(results_file, 'r') as f:  # create checkpoint
                    ckpt = {
                        'epoch': epoch,
                        'best_fitness': best_fitness,
                        'training_results': f.read(),
                        'model': ema.ema,
                        'optimizer':
                        None if final_epoch else optimizer.state_dict()
                    }

                # Save last, best and delete
                torch.save(ckpt, last)
                if best_fitness == fi:
                    torch.save(ckpt, best)
                del ckpt

        # end epoch ----------------------------------------------------------------------------------------------------
    # end training

    if rank in [-1, 0]:
        # Strip optimizers
        n = ('_' if len(opt.name) and not opt.name.isnumeric() else
             '') + opt.name
        fresults, flast, fbest = 'results%s.txt' % n, wdir / f'last{n}.pt', wdir / f'best{n}.pt'
        for f1, f2 in zip([wdir / 'last.pt', wdir / 'best.pt', 'results.txt'],
                          [flast, fbest, fresults]):
            if os.path.exists(f1):
                os.rename(f1, f2)  # rename
                if str(f2).endswith('.pt'):  # is *.pt
                    strip_optimizer(f2)  # strip optimizer
                    os.system(
                        'gsutil cp %s gs://%s/weights' %
                        (f2, opt.bucket)) if opt.bucket else None  # upload
        # Finish
        if not opt.evolve:
            pass
            #plot_results(save_dir=log_dir)  # save as results.png
        #logger.info('%g epochs completed in %.3f hours.\n' % (epoch - start_epoch + 1, (time.time() - t0) / 3600))

    dist.destroy_process_group() if rank not in [-1, 0] else None
    torch.cuda.empty_cache()
    return results
{'params': model.fc_concat.parameters(), 'lr': opt.lr}, ], lr=1, momentum=0.9, weight_decay=1e-5) #model = nn.DataParallel(model, device_ids=[0,1,2]) #model.features.requires_grad = False '''待会打印出来模型看看有没有这个features''' criterion = nn.NLLLoss() ''' optimizer = optim.SGD([ {'params': model.sample_128.parameters(), 'lr': opt.lr}, {'params': model.sample_256.parameters(), 'lr': opt.lr}, # 这样可以在不同的层使用不同的学习率,实现差异化 {'params': model.fc_concat.parameters(), 'lr': opt.lr}, ], lr=1, momentum=0.9, weight_decay=1e-5) ''' writer = SummaryWriter(log_dir='avg_loss') '''我想把log_dir='绝对路径'就各种报错''' for epoch in range(1, 81): avg_train_loss = train(epoch, model, criterion, optimizer, trainloader, device) writer.add_scalar('avg_train_loss_per_epoch', avg_train_loss, epoch) if epoch % 5 == 0: avg_test_loss, accuracy = test(model, criterion, testloader, device) #print(test(model, criterion, testloader, device)) writer.add_scalar('avg_test_loss', avg_test_loss, int(epoch/5)) writer.add_scalar('test_accuracy', accuracy, int(epoch/5)) if epoch % 40 == 0: adjust_learning_rate(optimizer) writer.close() torch.save(model.state_dict(), 'firststep_aircarft.pth')
"--freeze-pattern", type=str, default="", help="regex pattern string, fix matched param when training") parser.add_argument( "--load-solver", type=str, default=None, help="YAML file that contains all args of an experiment, " "if this option presents, program will read all args from this " "yaml file. This mechanism is designed for reproducible experiments.") parser.add_argument( "--save-solver", type=str, default=None, help="the prefix to dump all args for current experiment, default is " "[symbol_name].[iterator_name].[timestamp].yaml") parser.add_argument( "--freeze-pattern", type=str, default="", help="regex pattern string, fix matched param when training") # TODO: divide the parse_args into two phase, parse to yaml, parse yaml to args # TODO: add support for run a yaml config args = parse_args(parser) if args.test: test(args) else: fit(args)
def main() -> int:
    """Entry point: parse the run mode and dispatch to train / test / demo.

    Reads all hyperparameters from ``config.yml`` in the working directory.

    Returns:
        Process exit code (0 on success), so callers may use
        ``sys.exit(main())``.  (The original implicitly returned ``None``
        despite the ``-> int`` annotation.)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--mode',
        help='Select mode',
        choices=['train', 'test', 'demo'],
        default='train',
    )
    args = parser.parse_args()

    # Use a context manager so the config file handle is closed
    # deterministically (the original `yaml.safe_load(open(...))` leaked it).
    with open("config.yml") as config_file:
        config = yaml.safe_load(config_file)

    # The model is constructed identically whether or not a checkpoint is
    # loaded; only the optional `load_model` call differs, so build once.
    model = DQN(
        in_channels=config['IN_CHANNELS'],
        out_dim=config['OUT_DIM'],
    )
    if config['LOAD_MODEL']:
        # LOAD_MODEL doubles as a truthy flag and the checkpoint name.
        model.load_model(config['LOAD_MODEL'])

    if args.mode == 'test':
        test(
            device=config['DEVICE'],
            n_games=config['TEST_GAMES'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    elif args.mode == 'demo':
        demo(
            device=config['DEVICE'],
            model=model,
            frame_skipping=config['FRAME_SKIPPING'],
        )
    else:
        # 'train' — argparse `choices` guarantees no other value reaches here.
        memory = ReplayMemory(capacity=config['N'])

        optimizer_name = config['OPTIMIZER']
        if optimizer_name == 'adam':
            optimizer = torch.optim.Adam(lr=config['LEARNING_RATE'],
                                         betas=(0.9, 0.999),
                                         eps=1e-8,
                                         amsgrad=False,
                                         params=model.model.parameters())
        elif optimizer_name == 'sgd':
            optimizer = torch.optim.SGD(lr=config['LEARNING_RATE'],
                                        momentum=0.9,
                                        params=model.model.parameters())
        else:
            raise ValueError(f'Unknown optimizer name: {optimizer_name}')

        # Comet ML experiment tracking; requires COMET_ML_API_KEY in the
        # environment (KeyError here is deliberate — fail fast before training).
        experiment = Experiment(
            api_key=os.environ['COMET_ML_API_KEY'],
            project_name=config['COMET_ML_PROJECT_NAME'],
            workspace=config['COMET_ML_WORKSPACE'],
        )
        experiment.set_name(config['COMET_ML_NAME'])
        experiment.add_tag(config['COMET_ML_TAG'])
        experiment.log_parameters({
            'n_games': config['M'],
            'minibatch_size': config['MINIBATCH_SIZE'],
            'eps': config['EPS'],
            'eps_n_frames': config['EPS_N_FRAMES'],
            'gamma': config['GAMMA'],
            'frame_skipping': config['FRAME_SKIPPING'],
            'save_model_every': config['SAVE_MODEL_EVERY']
        })
        experiment.set_model_graph(str(model.model))

        train(
            device=config['DEVICE'],
            n_games=config['M'],
            memory=memory,
            optimizer=optimizer,
            model=model,
            experiment=experiment,
            minibatch_size=config['MINIBATCH_SIZE'],
            eps=config['EPS'],
            eps_n_frames=config['EPS_N_FRAMES'],
            gamma=config['GAMMA'],
            frame_skipping=config['FRAME_SKIPPING'],
            update_model_target_every=config['UPDATE_MODEL_TARGET_EVERY'],
            save_model_every=config['SAVE_MODEL_EVERY'],
            save_model_as=config['SAVE_MODEL_AS'],
            save_average_metrics_every=config['SAVE_AVERAGE_METRICS_EVERY'],
        )

    # Explicit success code to honour the declared `-> int` return type.
    return 0