def test_func(args, shared_model, env_conf, datasets=None, tests=None, shared_dict=None):
    ptitle('Valid agent')

    if args.valid_gpu < 0:
        gpu_id = args.gpu_ids[-1]
    else:
        gpu_id = args.valid_gpu
    env_conf["env_gpu"] = gpu_id

    if not args.deploy:
        log = {}
        logger = Logger(args.log_dir)

        create_dir(args.log_dir + "models/")
        create_dir(args.log_dir + "tifs/")
        create_dir(args.log_dir + "tifs_test/")

        os.system("cp *.py " + args.log_dir)
        os.system("cp *.sh " + args.log_dir)
        os.system("cp models/*.py " + args.log_dir + "models/")

        setup_logger('{}_log'.format(args.env),
                     r'{0}{1}_log'.format(args.log_dir, args.env))
        log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
        d_args = vars(args)
        env_conf_log = env_conf

    if tests is not None:
        if args.testlbl:
            test_env = EM_env(tests[0], env_conf, type="test", gt_lbl_list=tests[1])
        else:
            test_env = EM_env(tests[0], env_conf, type="test")

    if not args.deploy:
        for k in d_args.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
        for k in env_conf_log.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, env_conf_log[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env_conf["observation_shape"],
                             args.features, atrous_rates=args.atr_rate,
                             num_actions=2, split=args.data_channel,
                             gpu_id=gpu_id, multi=args.multi)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    if not args.deploy:
        create_dir(args.save_model_dir)

    recent_episode_scores = ScalaTracker(100)
    recent_FgBgDice = ScalaTracker(100)
    recent_bestDice = ScalaTracker(100)
    recent_diffFG = ScalaTracker(100)
    recent_MUCov = ScalaTracker(100)
    recent_MWCov = ScalaTracker(100)
    recent_AvgFP = ScalaTracker(100)
    recent_AvgFN = ScalaTracker(100)
    recent_rand_i = ScalaTracker(100)

    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    # ----------------------------- Deploy / Inference -----------------------------
    if args.deploy:
        with torch.cuda.device(gpu_id):
            player.model.load_state_dict(shared_model.state_dict())

        if len(tests) == 4:
            inference(args, None, player.model, tests[0], test_env, gpu_id,
                      player.env.rng, len(tests[0]), tests[3])
        else:
            inference(args, None, player.model, tests[0], test_env, gpu_id,
                      player.env.rng, len(tests[0]))
        return
    # --------------------------- End Deploy / Inference ---------------------------

    merge_ratios = []
    split_ratios = []

    if args.wctrl == "s2m":
        schedule = args.wctrl_schedule

        delta = (shared_dict['spl_w'] - shared_dict['mer_w']) / (2 * len(schedule))

        mer_w_delta = delta
        mer_w_var = shared_dict['mer_w']
        mer_w_scheduler = Scheduler(mer_w_var, schedule, mer_w_delta)

        split_delta = -delta / len(args.out_radius)
        split_var = shared_dict['spl_w'] / len(args.out_radius)
        spl_w_scheduler = Scheduler(split_var, schedule, split_delta)

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward.mean()
        renderlist.append(player.env.render())

        if player.done:
            flag = True
            num_tests += 1

            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            log['{}_log'.format(args.env)].info(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores.push(reward_sum)

            if args.save_max and recent_episode_scores.mean() >= max_score:
                max_score = recent_episode_scores.mean()
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       str(num_tests)))

            if num_tests % args.log_period == 0:
                if tests is not None and not args.DEBUG:
                    inference(args, logger, player.model, tests[0], test_env,
                              gpu_id, player.env.rng, num_tests)

                if np.max(env.lbl) != 0 and np.max(env.gt_lbl) != 0:
                    bestDice, FgBgDice, diffFG, MWCov, MUCov, AvgFP, AvgFN, rand_i = \
                        evaluate(args, player.env)
                    recent_FgBgDice.push(FgBgDice)
                    recent_diffFG.push(abs(diffFG))
                    recent_bestDice.push(bestDice)
                    recent_MWCov.push(MWCov)
                    recent_MUCov.push(MUCov)
                    recent_AvgFP.push(AvgFP)
                    recent_AvgFN.push(AvgFN)
                    recent_rand_i.push(rand_i)

                    log_info = {
                        "bestDice": recent_bestDice.mean(),
                        "FgBgDice": recent_FgBgDice.mean(),
                        "diffFG": recent_diffFG.mean(),
                        "MWCov": recent_MWCov.mean(),
                        "MUCov": recent_MUCov.mean(),
                        "AvgFP": recent_AvgFP.mean(),
                        "AvgFN": recent_AvgFN.mean(),
                        "rand_i": recent_rand_i.mean()
                    }

                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)
                else:
                    bestDice, FgBgDice, diffFG = 0, 0, 0
                    MWCov, MUCov, AvgFP, AvgFN = 0, 0, 0, 0
                    rand_i = 0

                print("----------------------VALID SET--------------------------")
                print(args.env)
                print("bestDice:", bestDice, "FgBgDice:", FgBgDice,
                      "diffFG:", diffFG, "MWCov:", MWCov, "MUCov:", MUCov,
                      "AvgFP:", AvgFP, "AvgFN:", AvgFN, "rand_i:", rand_i)
                print("Log test #:", num_tests)
                print("rewards: ", player.reward.mean())
                print("sum rewards: ", reward_sum)
                print("#gt_values:", len(np.unique(player.env.gt_lbl)))
                print("values:")
                values = player.env.unique()
                print(np.concatenate([values[0][None], values[1][None]], 0))
                print("------------------------------------------------")

                log_img = np.concatenate(renderlist[::-1], 0)

                if "3D" not in args.data:
                    # Pad the per-step probability maps to a fixed episode length
                    # so every logged strip has the same width.
                    for i in range(3):
                        player.probs.insert(0, np.zeros_like(player.probs[0]))
                    while len(player.probs) - 3 < args.max_episode_length:
                        player.probs.append(np.zeros_like(player.probs[0]))
                    probslist = [np.repeat(np.expand_dims(prob, -1), 3, -1)
                                 for prob in player.probs]
                    probslist = np.concatenate(probslist, 1)
                    probslist = (probslist * 256).astype(np.uint8, copy=False)
                    print(probslist.shape, log_img.shape)
                    log_img = np.concatenate([probslist, log_img], 0)

                log_info = {"valid_sample": log_img}
                print(log_img.shape)

                io.imsave(args.log_dir + "tifs/" + str(num_tests) + "_sample.tif",
                          log_img.astype(np.uint8))
                io.imsave(args.log_dir + "tifs/" + str(num_tests) + "_pred.tif",
                          player.env.lbl.astype(np.uint8))
                io.imsave(args.log_dir + "tifs/" + str(num_tests) + "_gt.tif",
                          player.env.gt_lbl.astype(np.int32))

                if args.seg_scale:
                    log_info["scaler"] = player.env.scaler

                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

            if not args.deploy:
                log_info = {
                    'mean_valid_reward': reward_mean,
                    '100_mean_reward': recent_episode_scores.mean(),
                    'split_ratio': player.env.split_ratio_sum.sum() /
                                   np.count_nonzero(player.env.gt_lbl),
                    'merge_ratio': player.env.merge_ratio_sum.sum() /
                                   np.count_nonzero(player.env.gt_lbl),
                }

                if args.wctrl == 's2m':
                    log_info.update({
                        'mer_w': mer_w_scheduler.value(),
                        'spl_w': spl_w_scheduler.value() * len(args.out_radius),
                    })

                merge_ratios.append(player.env.merge_ratio_sum.sum() /
                                    np.count_nonzero(player.env.gt_lbl))
                split_ratios.append(player.env.split_ratio_sum.sum() /
                                    np.count_nonzero(player.env.gt_lbl))

                print("split ratio: ", np.max(player.env.split_ratio_sum),
                      np.min(player.env.split_ratio_sum))
                print("merge ratio: ", np.max(player.env.merge_ratio_sum),
                      np.min(player.env.merge_ratio_sum))
                print("merge ratio: ", merge_ratios)
                print("split ratio: ", split_ratios)

                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, num_tests)

            renderlist = []
            reward_sum = 0
            player.eps_len = 0

            if args.wctrl == "s2m":
                shared_dict["spl_w"] = spl_w_scheduler.next()
                shared_dict["mer_w"] = mer_w_scheduler.next()
                player.env.config["spl_w"] = shared_dict["spl_w"]
                player.env.config["mer_w"] = shared_dict["mer_w"]

            player.clear_actions()
            state = player.env.reset(player.model, gpu_id)
            renderlist.append(player.env.render())

            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf, datasets=None):
    ptitle('Training Agent: {}'.format(rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir)
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id

    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    if "EM_env" in args.env:
        raw, lbl, prob, gt_lbl = datasets
        env = EM_env(raw, lbl, prob, env_conf, 'train', gt_lbl)
    else:
        env = Voronoi_env(env_conf)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)

    # env.seed(args.seed + rank)

    if not args.continuous:
        player = Agent(None, env, args, None)
    else:
        player = Agent_continuous(None, env, args, None)
    player.gpu_id = gpu_id
    if not args.continuous:
        player.model = A3Clstm(env.observation_space.shape,
                               env_conf["num_action"], args.hidden_feat)
    else:
        player.model = A3Clstm_continuous(env.observation_space.shape,
                                          env_conf["num_action"], args.hidden_feat)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    old_score = player.env.old_score
    final_score = 0

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
        mean_log_prob = 0

    while True:
        # Sync the local model with the shared parameters before each rollout.
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0

            if rank == 0:
                if 0 <= (train_step % args.train_log_period) < args.max_episode_length:
                    print("train: step", train_step, "\teps_reward", eps_reward,
                          "\timprovement", final_score - old_score)
                old_score = player.env.old_score
                pinned_eps_reward = eps_reward
                eps_reward = 0
                mean_log_prob = 0

            # Reset the LSTM hidden state at episode boundaries.
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.cx = Variable(torch.zeros(1, args.hidden_feat).cuda())
                    player.hx = Variable(torch.zeros(1, args.hidden_feat).cuda())
            else:
                player.cx = Variable(torch.zeros(1, args.hidden_feat))
                player.hx = Variable(torch.zeros(1, args.hidden_feat))
        else:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward += player.reward
                mean_log_prob += player.log_probs[-1] / env_conf["T"]
            if player.done:
                break

        if player.done:
            final_score = player.env.old_score
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            # Bootstrap the return from the value head of the current state.
            if not args.continuous:
                value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                            (player.hx, player.cx)))
            else:
                value, _, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                               (player.hx, player.cx)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        # Generalized Advantage Estimation over the collected rollout.
        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            delta_t = player.values[i + 1].data * args.gamma + player.rewards[i] - \
                player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            if not args.continuous:
                policy_loss = policy_loss - \
                    player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]
            else:
                policy_loss = policy_loss - \
                    player.log_probs[i].sum() * Variable(gae) - \
                    0.01 * player.entropies[i].sum()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: advantage': advantage,
                    'train: eps reward': pinned_eps_reward,
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
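# ---------------------------------------------------------------------------
# The reversed loop above implements Generalized Advantage Estimation (GAE):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = delta_t + (gamma * tau) * A_{t+1}
# A minimal standalone sketch on plain Python floats (illustrative only, no
# tensors or autograd), showing the same recursion the training loop unrolls:
# ---------------------------------------------------------------------------
def gae_sketch(rewards, values, gamma=0.99, tau=1.0):
    """`values` must have len(rewards) + 1 entries; the last one bootstraps R."""
    advantages = [0.0] * len(rewards)
    gae = 0.0
    for i in reversed(range(len(rewards))):
        delta_t = rewards[i] + gamma * values[i + 1] - values[i]
        gae = gae * gamma * tau + delta_t
        advantages[i] = gae
    return advantages

# Example: gae_sketch([1.0, 0.0, 1.0], [0.5, 0.4, 0.3, 0.0]) returns the
# per-step advantages that weight -log_prob in the policy loss above.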
def test(args, shared_model, env_conf, datasets):
    ptitle('Test agent')
    gpu_id = args.gpu_ids[-1]

    log = {}
    logger = Logger(args.log_dir)
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    raw, gt_lbl = datasets
    env = EM_env(raw, gt_lbl, env_conf)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    # player.model = A3Clstm(env.observation_space.shape, env_conf["num_action"], args.hidden_feat)
    player.model = SimpleCNN(env.observation_space.shape, env_conf["num_action"])

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()

    flag = True
    create_dir(args.save_model_dir)

    recent_episode_scores = []
    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward
        renderlist.append(player.env.render())

        if player.done:
            flag = True
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests
            log['{}_log'.format(args.env)].info(
                "Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            # Keep a sliding window of the last 200 episode scores.
            recent_episode_scores += [reward_sum]
            if len(recent_episode_scores) > 200:
                recent_episode_scores.pop(0)

            if args.save_max and np.mean(recent_episode_scores) >= max_score:
                max_score = np.mean(recent_episode_scores)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       args.env + '_' + str(num_tests)))

            if num_tests % args.log_period == 0:
                print("------------------------------------------------")
                print("Log test #:", num_tests)
                print("Prob: ")
                for i in range(player.env.agent_out_shape[1]):
                    for j in range(player.env.agent_out_shape[2]):
                        print("{:.3f}\t".format(player.prob_cpu[0, i, j]), end='')
                    print()
                print("Actions :", player.actions)
                print("Actions transformed: ")
                print(player.actions_explained)
                print("rewards: ", player.rewards)
                print("sum rewards: ", reward_sum)
                print("------------------------------------------------")

                log_img = np.concatenate(renderlist, 0)
                log_info = {"test: training_sample": log_img}
                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

                log_info = {'test: mean_reward': reward_mean}
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, num_tests)

            renderlist = []
            reward_sum = 0
            player.eps_len = 0

            time.sleep(30)

            player.clear_actions()
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train_func(rank, args, shared_model, optimizer, env_conf,
               datasets=None, shared_dict=None):
    if args.deploy:
        return
    ptitle('Train {0}'.format(rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir[:-1] + '_losses/')
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id

    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    raw_list, gt_lbl_list = datasets
    env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list,
                 seed=args.seed + rank)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env.observation_space.shape,
                             args.features, atrous_rates=args.atr_rate,
                             num_actions=2, split=args.data_channel,
                             gpu_id=gpu_id, multi=args.multi)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0

            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_reward.mean()
                    eps_reward = 0

            if args.lstm_feats:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        player.cx, player.hx = player.model.lstm.init_hidden(
                            batch_size=1, use_cuda=True)
                else:
                    player.cx, player.hx = player.model.lstm.init_hidden(
                        batch_size=1, use_cuda=False)
        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            if rank < args.lbl_agents:
                player.action_train(use_lbl=True)
            else:
                player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_reward.mean()
            if player.done:
                break

        if player.done:
            state = player.env.reset(player.model, gpu_id)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        # Pixel-wise value map: R has one entry per output location.
        if "3D" in args.data:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                            env_conf["size"][2])
        else:
            R = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])
        if args.lowres:
            R = torch.zeros(1, 1, env_conf["size"][0] // 2, env_conf["size"][1] // 2)

        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                            (player.hx, player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0

        if "3D" in args.data:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1],
                              env_conf["size"][2])
        else:
            gae = torch.zeros(1, 1, env_conf["size"][0], env_conf["size"][1])
        if args.rew_drop:
            keep_map = torch.tensor(player.env.keep_map)
        if args.lowres:
            gae = torch.zeros(1, 1, env_conf["size"][0] // 2, env_conf["size"][1] // 2)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
                if args.rew_drop:
                    keep_map = keep_map.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])

            R = args.gamma * R + reward_i
            advantage = R - player.values[i]
            delta_t = player.values[i + 1].data * args.gamma + reward_i - \
                player.values[i].data
            if args.rew_drop:
                # keep_map masks out pixels whose rewards are dropped.
                value_loss = value_loss + (0.5 * advantage * advantage * keep_map).mean()
            else:
                value_loss = value_loss + (0.5 * advantage * advantage).mean()
            gae = gae * args.gamma * args.tau + delta_t

            if args.noisy:
                policy_loss = policy_loss - \
                    (player.log_probs[i] * Variable(gae)).mean()
            else:
                if args.rew_drop:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae) * keep_map).mean() - \
                        (args.entropy_alpha * player.entropies[i] * keep_map).mean()
                else:
                    policy_loss = policy_loss - \
                        (player.log_probs[i] * Variable(gae)).mean() - \
                        (args.entropy_alpha * player.entropies[i]).mean()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if args.wctrl == "s2m":
            player.env.config["spl_w"] = shared_dict["spl_w"]
            player.env.config["mer_w"] = shared_dict["mer_w"]

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: eps reward': pinned_eps_reward,
                }
                if "EX" in args.model:
                    # NOTE: cell_prob_loss must be provided by the "EX" model
                    # code path; it is not defined in this function.
                    log_info["cell_prob_loss"] = cell_prob_loss
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
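# ---------------------------------------------------------------------------
# `ensure_shared_grads` is called at the end of every training loop here but
# defined elsewhere in the repo. A sketch of the conventional A3C helper
# (copy each worker's gradients into the shared model's .grad slots); this
# exact body is an assumption about the real implementation:
# ---------------------------------------------------------------------------
def ensure_shared_grads_sketch(model, shared_model, gpu=False):
    for param, shared_param in zip(model.parameters(), shared_model.parameters()):
        if shared_param.grad is not None and not gpu:
            return  # another worker already wrote gradients this step
        if not gpu:
            shared_param._grad = param.grad
        else:
            # The shared model lives on the CPU; move GPU gradients over.
            shared_param._grad = param.grad.cpu()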
def test(args, shared_model, env_conf, datasets=None, hasLbl=True):
    if hasLbl:
        ptitle('Valid agent')
    else:
        ptitle("Test agent")

    gpu_id = args.gpu_ids[-1]
    env_conf["env_gpu"] = gpu_id

    log = {}
    logger = Logger(args.log_dir)
    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    if hasLbl:
        for k in d_args.keys():
            log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    if "EM_env" in args.env:
        raw_list, gt_lbl_list = datasets
        env = EM_env(raw_list, env_conf, type="train", gt_lbl_list=gt_lbl_list)
    else:
        env = Voronoi_env(env_conf)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id

    if args.model == "UNet":
        player.model = UNet(env.observation_space.shape[0], args.features, 2)
    elif args.model == "FusionNetLstm":
        player.model = FusionNetLstm(env.observation_space.shape, args.features,
                                     2, args.hidden_feat)
    elif args.model == "FusionNet":
        player.model = FusionNet(env.observation_space.shape[0], args.features, 2)
    elif args.model == "UNetLstm":
        player.model = UNetLstm(env.observation_space.shape, args.features,
                                2, args.hidden_feat)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    create_dir(args.save_model_dir)

    recent_episode_scores = []
    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward.mean()
        renderlist.append(player.env.render())

        if player.done:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            if hasLbl:
                log['{}_log'.format(args.env)].info(
                    "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                    .format(
                        time.strftime("%Hh %Mm %Ss",
                                      time.gmtime(time.time() - start_time)),
                        reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores += [reward_sum]
            if len(recent_episode_scores) > 200:
                recent_episode_scores.pop(0)

            if args.save_max and np.mean(recent_episode_scores) >= max_score:
                max_score = np.mean(recent_episode_scores)
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       args.env + '_' + str(num_tests)))

            if num_tests % args.log_period == 0:
                if hasLbl:
                    print("----------------------VALID SET--------------------------")
                    print("Log test #:", num_tests)
                    print("rewards: ", player.reward.mean())
                    print("sum rewards: ", reward_sum)
                    print("------------------------------------------------")

                log_img = np.concatenate(renderlist, 0)

                if hasLbl:
                    log_info = {"valid_sample": log_img}
                else:
                    log_info = {"test_sample": log_img}

                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

                if hasLbl:
                    log_info = {'mean_valid_reward': reward_mean}
                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)

            renderlist = []
            reward_sum = 0
            player.eps_len = 0

            player.clear_actions()
            state = player.env.reset()
            renderlist.append(player.env.render())

            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train_func(rank, args, shared_model, optimizer, env_conf, datasets):
    if args.deploy:
        return
    ptitle('Train {0}'.format(rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir[:-1] + '_losses/')
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id

    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    env = Debug_env(datasets, env_conf, seed=args.seed + rank)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id

    # Number of output channels per action: RGB (3), RGBA (4), or alpha only (1).
    nChan = 3
    if args.is3D:
        nChan = 4
    if args.alpha_only:
        nChan = 1

    # Net3D only when both the environment and the observations are 3D.
    if args.is3D and args.obs3D:
        player.model = get_model(args, "Net3D", input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)
    else:
        player.model = get_model(args, "ENet", input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_rewards.mean()
                    eps_reward = 0

        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_rewards.mean()
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        if not args.alpha_only:
            if not args.is3D:
                R = torch.zeros(1, 1, args.num_actions * 3)
            else:
                R = torch.zeros(1, 1, args.num_actions * 4)
        else:
            R = torch.zeros(1, 1, args.num_actions)

        if not player.done:
            value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0

        if not args.alpha_only:
            if not args.is3D:
                gae = torch.zeros(1, 1, args.num_actions * 3)
            else:
                gae = torch.zeros(1, 1, args.num_actions * 4)
        else:
            gae = torch.zeros(1, 1, args.num_actions)

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])

            R = args.gamma * R + reward_i
            advantage = R - player.values[i]
            value_loss = value_loss + (0.5 * advantage * advantage).mean()
            delta_t = player.values[i + 1].data * args.gamma + reward_i - \
                player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i] * Variable(gae)).mean() - \
                (args.entropy_alpha * player.entropies[i]).mean()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if rank == 0:
            train_step += 1
            # Log every (args.log_period * 10) steps.
            if train_step % (args.log_period * 10) == 0 and train_step > 0:
                log_info = {
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: eps reward': pinned_eps_reward,
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
def test(args, shared_model, env_conf):
    ptitle('Valid agent')

    if args.valid_gpu < 0:
        gpu_id = args.gpu_ids[-1]
    else:
        gpu_id = args.valid_gpu
    env_conf["env_gpu"] = gpu_id

    log = {}
    logger = Logger(args.log_dir)

    create_dir(args.log_dir + "models/")
    os.system("cp *.sh " + args.log_dir)
    os.system("cp *.py " + args.log_dir)
    os.system("cp models/models.py " + args.log_dir + "models/")
    os.system("cp models/basic_modules.py " + args.log_dir + "models/")

    setup_logger('{}_log'.format(args.env),
                 r'{0}{1}_log'.format(args.log_dir, args.env))
    log['{}_log'.format(args.env)] = logging.getLogger('{}_log'.format(args.env))
    d_args = vars(args)
    env_conf_log = env_conf

    for k in d_args.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, d_args[k]))
    for k in env_conf_log.keys():
        log['{}_log'.format(args.env)].info('{0}: {1}'.format(k, env_conf_log[k]))

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    env = database_env(env_conf, seed=0, dstype="test")
    env.max_step = 900

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None, gpu_id)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env_conf["observation_shape"],
                             args.features, env_conf["num_actions"],
                             gpu_id=0, lstm_feats=args.lstm_feats)

    with torch.cuda.device(gpu_id):
        player.model = player.model.cuda()

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    create_dir(args.save_model_dir)

    recent_episode_scores = ScalaTracker(100)
    max_score = 0

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward.mean()

        if player.done:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            log['{}_log'.format(args.env)].info(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores.push(reward_sum)

            if args.save_max and recent_episode_scores.mean() >= max_score:
                max_score = recent_episode_scores.mean()
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       'best_model_' + args.env))

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       args.env + '_' + str(num_tests)))

            if num_tests % args.log_period == 0:
                print("------------------------------------------------")
                print(args.env)
                print("Log test #:", num_tests)
                print("sum rewards: ", player.env.sum_reward)
                print("action_history\n", player.env.action_his)
                print()
                print("------------------------------------------------")

                log_info = {
                    'mean_reward': reward_mean,
                    '100_mean_reward': recent_episode_scores.mean()
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, num_tests)

            reward_sum = 0
            player.eps_len = 0
            player.clear_actions()
            state = player.env.reset()

            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
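# ---------------------------------------------------------------------------
# `ScalaTracker(100)` above keeps a running window of the last N scalar values
# (episode scores, metrics). A sketch matching the push()/mean() usage in
# these test functions; the real class lives elsewhere in the repo, so this
# body is an assumption:
# ---------------------------------------------------------------------------
from collections import deque


class ScalaTrackerSketch:
    def __init__(self, maxlen):
        # Old entries fall off automatically once the window is full.
        self.values = deque(maxlen=maxlen)

    def push(self, value):
        self.values.append(value)

    def mean(self):
        return sum(self.values) / len(self.values) if self.values else 0.0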
def train(rank, args, shared_model, optimizer, env_conf):
    ptitle('Train {0}: {1}'.format(args.env, rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir + '_losses/')
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    env_conf["env_gpu"] = gpu_id

    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    env = database_env(env_conf, seed=0)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)

    player = Agent(None, env, args, None, gpu_id)
    player.gpu_id = gpu_id
    player.model = get_model(args, args.model, env_conf["observation_shape"],
                             args.features, env_conf["num_actions"],
                             gpu_id=0, lstm_feats=args.lstm_feats)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                if train_step > 0:
                    pinned_eps_reward = player.env.sum_reward
                    eps_reward = 0
            if args.lstm_feats:
                player.cx, player.hx = init_linear_lstm(args.lstm_feats, gpu_id)
        elif args.lstm_feats:
            player.cx = Variable(player.cx.data)
            player.hx = Variable(player.hx.data)

        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward = player.env.sum_reward
            if player.done:
                break

        if player.done:
            if rank == 0:
                if train_step % args.train_log_period == 0 and train_step > 0:
                    print("train: step", train_step, "\teps_reward", eps_reward)
                    # print("rewards: ", player.env.rewards)
                    # print("actions: ", player.actions)
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1, 1, 1)
        if not player.done:
            if args.lstm_feats:
                value, _, _ = player.model((Variable(player.state.unsqueeze(0)),
                                            (player.hx, player.cx)))
            else:
                value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1, 1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    reward_i = torch.tensor(player.rewards[i]).cuda()
            else:
                reward_i = torch.tensor(player.rewards[i])

            R = args.gamma * R + reward_i
            advantage = R - player.values[i]
            value_loss = value_loss + (0.5 * advantage * advantage).mean()
            delta_t = player.values[i + 1].data * args.gamma + reward_i - \
                player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                (player.log_probs[i] * Variable(gae)).mean() - \
                (args.entropy_alpha * player.entropies[i]).mean()

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0 and train_step > 0:
                log_info = {
                    'sum_loss': sum_loss,
                    'value_loss': value_loss,
                    'policy_loss': policy_loss,
                    'advantage': advantage,
                    'train eps reward': pinned_eps_reward,
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
def test_func(args, shared_model, env_conf, datasets):
    ptitle('Valid agent')
    gpu_id = args.gpu_ids[-1]
    env_conf["env_gpu"] = gpu_id

    if not args.deploy:
        logger = Logger(args.log_dir)

        saved_src_dir = args.log_dir + "/src/"
        create_dir(saved_src_dir)
        os.system("cp *.py " + saved_src_dir)
        os.system("cp -r Models " + saved_src_dir)
        os.system("cp -r run_scripts " + saved_src_dir)
        os.system("cp -r Utils " + saved_src_dir)

    torch.manual_seed(args.seed)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed)

    env = Debug_env(datasets, env_conf)

    reward_sum = 0
    start_time = time.time()
    num_tests = 0
    reward_total_sum = 0

    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id

    # Number of output channels per action: RGB (3), RGBA (4), or alpha only (1).
    nChan = 3
    if args.is3D:
        nChan = 4
    if args.alpha_only:
        nChan = 1

    # Net3D only when both the environment and the observations are 3D.
    if args.is3D and args.obs3D:
        player.model = get_model(args, "Net3D", input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)
    else:
        player.model = get_model(args, "ENet", input_shape=env_conf["obs_shape"],
                                 num_actions=args.num_actions * nChan)

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()

    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.model = player.model.cuda()
            player.state = player.state.cuda()
    player.model.eval()

    flag = True
    if not args.deploy:
        create_dir(args.save_model_dir)

    recent_episode_scores = ScalaTracker(100)
    recent_rand_i = ScalaTracker(100)

    renderlist = []
    renderlist.append(player.env.render())
    max_score = 0

    if args.deploy:
        deploy(args, shared_model, player, gpu_id)
        exit()

    while True:
        if flag:
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.model.load_state_dict(shared_model.state_dict())
            else:
                player.model.load_state_dict(shared_model.state_dict())
            player.model.eval()
            flag = False

        player.action_test()
        reward_sum += player.reward.mean()
        renderlist.append(player.env.render())

        if player.done:
            flag = True
            num_tests += 1
            reward_total_sum += reward_sum
            reward_mean = reward_total_sum / num_tests

            print(
                "VALID: Time {0}, episode reward {1}, num tests {4}, episode length {2}, reward mean {3:.4f}"
                .format(
                    time.strftime("%Hh %Mm %Ss",
                                  time.gmtime(time.time() - start_time)),
                    reward_sum, player.eps_len, reward_mean, num_tests))

            recent_episode_scores.push(reward_sum)

            if num_tests % args.save_period == 0:
                if gpu_id >= 0:
                    with torch.cuda.device(gpu_id):
                        state_to_save = player.model.state_dict()
                        torch.save(state_to_save,
                                   '{0}{1}.dat'.format(args.save_model_dir,
                                                       str(num_tests)))

            if num_tests % args.log_period == 0:
                print("----------------------VALID SET--------------------------")
                print(args.env)
                print("Log test #:", num_tests)
                print("rewards: ", player.reward.mean())
                print("sum rewards: ", reward_sum)
                log_rewards = [int(rew * 100) for rew in player.env.sum_rewards]
                print("rewards:", log_rewards)
                print("action: ", player.env.actions)
                print("reward history: ", player.env.rewards)
                print("------------------------------------------------")

                log_img = np.concatenate(renderlist, 0)
                log_info = {"valid_sample": log_img}
                for tag, img in log_info.items():
                    img = img[None]
                    logger.image_summary(tag, img, num_tests)

                if not args.deploy:
                    log_info = {
                        'mean_valid_reward': reward_mean,
                        '100_mean_reward': recent_episode_scores.mean(),
                    }
                    for tag, value in log_info.items():
                        logger.scalar_summary(tag, value, num_tests)

                if args.save_sample:
                    deploy_list = player.env.deploy
                    print(len(deploy_list))
                    for stepi, (vol, ref_img, lut, _) in enumerate(deploy_list):
                        io.imsave(args.log_dir + "/" + str(num_tests) +
                                  "_vol_" + str(stepi) + ".tif", vol)
                        io.imsave(args.log_dir + "/" + str(num_tests) +
                                  "_ref_" + str(stepi) + ".tif", ref_img)

                        # Plot the RGBA transfer-function lookup table.
                        plt.figure(figsize=(10, 10))
                        plt.plot(range(256), lut[..., 2], 'b')
                        plt.plot(range(256), lut[..., 1], 'g')
                        plt.plot(range(256), lut[..., 0], 'r')
                        plt.plot(range(256), lut[..., 3], 'gray')
                        plt.ylabel('Mapping value')
                        plt.xlabel('Voxel intensity')
                        plt.title("Transfer function visualization")
                        plt.savefig("Ref_LUT" + "_" + str(num_tests) + "_" +
                                    str(stepi) + ".png")

            renderlist = []
            reward_sum = 0
            player.eps_len = 0
            player.clear_actions()
            state = player.env.reset()
            renderlist.append(player.env.render())

            time.sleep(15)
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()
def train(rank, args, shared_model, optimizer, env_conf, datasets):
    ptitle('Training Agent: {}'.format(rank))
    print('Start training agent: ', rank)

    if rank == 0:
        logger = Logger(args.log_dir)
        train_step = 0

    gpu_id = args.gpu_ids[rank % len(args.gpu_ids)]
    torch.manual_seed(args.seed + rank)
    if gpu_id >= 0:
        torch.cuda.manual_seed(args.seed + rank)

    raw, gt_lbl = datasets
    env = EM_env(raw, gt_lbl, env_conf)

    if optimizer is None:
        if args.optimizer == 'RMSprop':
            optimizer = optim.RMSprop(shared_model.parameters(), lr=args.lr)
        if args.optimizer == 'Adam':
            optimizer = optim.Adam(shared_model.parameters(), lr=args.lr,
                                   amsgrad=args.amsgrad)

    gamma = torch.tensor(1.0)
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            gamma = gamma.cuda()

    # env.seed(args.seed + rank)
    player = Agent(None, env, args, None)
    player.gpu_id = gpu_id
    # player.model = A3Clstm(env.observation_space.shape, env_conf["num_action"], args.hidden_feat)
    player.model = SimpleCNN(env.observation_space.shape, env_conf["num_action"])

    player.state = player.env.reset()
    player.state = torch.from_numpy(player.state).float()
    if gpu_id >= 0:
        with torch.cuda.device(gpu_id):
            player.state = player.state.cuda()
            player.model = player.model.cuda()
    player.model.train()

    if rank == 0:
        eps_reward = 0
        pinned_eps_reward = 0
        mean_log_prob = 0

    while True:
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                player.model.load_state_dict(shared_model.state_dict())
        else:
            player.model.load_state_dict(shared_model.state_dict())

        if player.done:
            player.eps_len = 0
            if rank == 0:
                pinned_eps_reward = eps_reward
                eps_reward = 0
                mean_log_prob = 0

        for step in range(args.num_steps):
            player.action_train()
            if rank == 0:
                eps_reward += player.reward
            if player.done:
                break

        if player.done:
            state = player.env.reset()
            player.state = torch.from_numpy(state).float()
            if gpu_id >= 0:
                with torch.cuda.device(gpu_id):
                    player.state = player.state.cuda()

        R = torch.zeros(1, 1)
        if not player.done:
            value, _ = player.model(Variable(player.state.unsqueeze(0)))
            R = value.data

        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                R = R.cuda()

        player.values.append(Variable(R))
        policy_loss = 0
        value_loss = 0
        gae = torch.zeros(1, 1)
        if gpu_id >= 0:
            with torch.cuda.device(gpu_id):
                gae = gae.cuda()
        R = Variable(R)

        for i in reversed(range(len(player.rewards))):
            R = args.gamma * R + player.rewards[i]
            advantage = R - player.values[i]
            value_loss = value_loss + 0.5 * advantage.pow(2)
            # Generalized Advantage Estimation
            delta_t = player.values[i + 1].data * args.gamma + player.rewards[i] - \
                player.values[i].data
            gae = gae * args.gamma * args.tau + delta_t
            policy_loss = policy_loss - \
                player.log_probs[i] * Variable(gae) - 0.01 * player.entropies[i]

        player.model.zero_grad()
        sum_loss = (policy_loss + value_loss)
        sum_loss.backward()
        ensure_shared_grads(player.model, shared_model, gpu=gpu_id >= 0)
        optimizer.step()
        player.clear_actions()

        if rank == 0:
            train_step += 1
            if train_step % args.log_period == 0:
                log_info = {
                    'train: sum_loss': sum_loss,
                    'train: value_loss': value_loss,
                    'train: policy_loss': policy_loss,
                    'train: advantage': advantage,
                    'train: eps reward': pinned_eps_reward,
                }
                for tag, value in log_info.items():
                    logger.scalar_summary(tag, value, train_step)
def train(model, args, name):
    optimizer = optim.Adam(model.parameters(), lr=1e-4)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.999)
    gpu_id = args.gpu_id

    if args.loss == 'WBCE':
        loss_func = Losses.weighted_binary_cross_entropy
    else:
        loss_func = nn.BCELoss()

    print('Prepare dataset ...')
    train_data, test_data = prepare_dataset(model, args)
    print('Finish preparing dataset, start training')

    logger = Logger('log_dir/' + name + '/')
    save_path = 'checkpoints/' + name + '/'
    create_dir(save_path)

    i_iter = 0
    for i_epoch in range(10000000):
        epoch_loss = 0
        for i_batch, sample in enumerate(train_data):
            if i_batch == len(train_data):
                break
            with torch.cuda.device(gpu_id):
                raw_t = torch.tensor(sample['raw'], dtype=torch.float32).cuda() / 255.0
                target_t = torch.tensor(sample['lbl'], dtype=torch.float32).cuda() / 255.0
                pred_t = model(raw_t)

                if args.loss == 'weighted':
                    if args.weights is not None:
                        weights = args.weights
                    else:
                        # Balance the two classes by the foreground ratio of the target.
                        EPS = 1e-5
                        neg_weight = torch.sum(target_t) / (1.0 * np.prod(target_t.shape)) + EPS
                        weights = [neg_weight, 1 - neg_weight]
                    loss = loss_func(pred_t, target_t, weights)
                else:
                    loss = loss_func(pred_t, target_t)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                epoch_loss += loss.item() / len(train_data)
                lr_scheduler.step()
                i_iter += 1

        print('type: {}\tTrain Epoch: {} \tLoss: {:.6f}'.format(
            args.type, i_epoch, epoch_loss))

        info = {'loss': epoch_loss, 'learning_rate': lr_scheduler.get_lr()[0]}
        for tag, value in info.items():
            logger.scalar_summary(tag, value, i_iter)
        visual_log(raw_t, target_t, pred_t, logger, i_iter, 'train')

        if (i_epoch + 1) % LOG_PERIOD == 0:
            raw_t, pred_t = eval(test_data, loss_func, hasTarget=False,
                                 model=model, gpu_id=gpu_id)
            visual_log(raw_t, None, pred_t, logger, i_iter, 'test', hasTarget=False)

        if i_epoch % SAVE_PERIOD == 0:
            torch.save({
                'i_iter': i_iter,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict()
            }, save_path + 'checkpoint_' + str(i_iter) + '.pth.tar')
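# ---------------------------------------------------------------------------
# `Losses.weighted_binary_cross_entropy` is imported from elsewhere in the
# repo; train() calls it as loss_func(pred, target, weights) with
# weights = [neg_weight, pos_weight]. A minimal sketch consistent with that
# call signature (an assumption about the real implementation, not this
# repo's code):
# ---------------------------------------------------------------------------
import torch


def weighted_binary_cross_entropy_sketch(pred, target, weights, eps=1e-7):
    # Clamp predictions away from 0 and 1 so the logs stay finite.
    pred = pred.clamp(eps, 1.0 - eps)
    # weights[1] scales the positive (foreground) term,
    # weights[0] scales the negative (background) term.
    loss = weights[1] * (target * torch.log(pred)) + \
        weights[0] * ((1.0 - target) * torch.log(1.0 - pred))
    return torch.neg(torch.mean(loss))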