# Module-level imports assumed by this excerpt (they are most likely already
# present at the top of the original file):
from collections import deque
from copy import deepcopy

import numpy as np
import torch
import torch.multiprocessing as mp

from osim.env import RunEnv

# Repo-local names: the DDPG agent class, the shared_adam module, the color
# print helpers (prCyan, prGreen, prLightPurple) and the evaluation helpers
# (test, test_new_state_dict) are assumed to come from this project's modules.


def train(rank, args, ns, best_result, actor_optim, critic_optim, shared_model, debug=True):
    torch.manual_seed(args.seed + rank)
    env = RunEnv(visualize=False)
    env.seed(args.seed + rank)
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]

    # With --use_more_states the agent sees a stack of past observations,
    # so its input dimension grows by a factor of num_states.
    if args.use_more_states:
        agent = DDPG(nb_states * args.num_states, nb_actions, args)
    else:
        agent = DDPG(nb_states, nb_actions, args)
    if args.shared:
        agent.add_optim(actor_optim, critic_optim)
    agent.train()
    if args.load_weights:
        agent.load_weights("weights")
    agent.is_training = True

    step = episode = episode_steps = 0
    observation = None
    done = True
    episode_reward = 0.
    observations = None
    last_reward = -10

    while True:
        if args.shared:
            # Start every iteration from the current shared parameters.
            agent.load_state_dict(shared_model.state_dict())

        if observation is None:  # start of a new episode
            observation = deepcopy(env.reset())
            agent.reset(observation)
            if args.use_more_states:
                # Keep the last 2**num_states frames; the stacking below picks
                # frames at exponentially spaced offsets (2**i - 1).
                observations = deque(
                    [observation for _ in range(2**args.num_states)],
                    maxlen=2**args.num_states)
                # observations = deque([observation for _ in range(args.num_states)], args.num_states)

        # Pick an action: random exploration during warmup, policy afterwards.
        if step <= args.warmup:
            action = agent.random_action()
        elif args.use_more_states:
            # See the stack_observations sketch after this function.
            cur_observations = [list(observations)[2**i - 1]
                                for i in range(args.num_states)]
            action = agent.select_action(
                np.concatenate(cur_observations).ravel().tolist())
        else:
            action = agent.select_action(observation)

        observation2, reward, done, info = env.step(action)
        observation = deepcopy(observation2)
        if args.use_more_states and observation is not None:
            observations.appendleft(observation)

        # Store the transition in the replay buffer.
        if args.use_more_states:
            cur_observations = [list(observations)[2**i - 1]
                                for i in range(args.num_states)]
            agent.observe(
                reward, np.concatenate(cur_observations).ravel().tolist(), done)
        else:
            agent.observe(reward, observation, done)

        if step > args.warmup:
            for _ in range(5):
                agent.update_policy(shared_model, args)

        step += 1
        episode_steps += 1
        episode_reward += reward

        if done:
            # Append a dummy terminal transition so the memory closes the episode.
            if args.use_more_states:
                cur_observations = [list(observations)[2**i - 1]
                                    for i in range(args.num_states)]
                flat = np.concatenate(cur_observations).ravel().tolist()
                agent.memory.append(flat, agent.select_action(flat), 0., False)
            else:
                agent.memory.append(
                    observation, agent.select_action(observation), 0., False)

            if step > args.warmup and best_result.value < episode_reward \
                    and episode_reward > last_reward:
                # This worker beat the global best: publish its weights.
                best_model = ns.best_model
                best_model.load_state_dict(agent.state_dict())
                if debug:
                    prLightPurple(
                        "best reward: {:.3f} current reward: {:.3f} "
                        "updated best model from agent {}".format(
                            best_model.best_reward, episode_reward, rank))
                best_model.best_reward = episode_reward
                agent.best_reward = episode_reward
                ns.best_model = best_model
                best_result.value = episode_reward
                last_reward = best_result.value
            elif step > args.warmup and episode % 10 == 0 and episode > 0 \
                    and args.update_train_agents > 0 \
                    and best_result.value > episode_reward \
                    and best_result.value > agent.best_reward:
                # This worker lags behind: try adopting the global best weights,
                # but only if they also evaluate well in this worker's env.
                best_model = ns.best_model
                test_agent = deepcopy(agent)
                test_agent.load_state_dict(best_model.state_dict())
                if test_new_state_dict(test_agent, episode_reward, env,
                                       args.update_train_agents,
                                       use_more_states=args.use_more_states,
                                       num_states=args.num_states):
                    agent = test_agent
                    agent.best_reward = best_model.best_reward
                    if debug:
                        prGreen("best result {:.3f} updated agent {}".format(
                            best_model.best_reward, rank))
                    last_reward = best_model.best_reward

            observation = None
            observations = None
            if debug:
                prCyan('agent_{:02d} ep:{} ep_steps:{} reward:{:.3f}'.format(
                    rank, episode, episode_steps, episode_reward))
            episode_steps = 0
            episode += 1
            episode_reward = 0.
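
# A minimal sketch (hypothetical helper, not part of the original module) of
# the observation-stacking pattern repeated inside train() when
# --use_more_states is set: from the last 2**num_states frames it picks the
# ones at exponentially spaced offsets 2**i - 1 (the newest frame, then 1, 3,
# 7, ... steps back) and flattens them into a single feature vector.
def stack_observations(observations, num_states):
    frames = list(observations)
    picked = [frames[2**i - 1] for i in range(num_states)]
    return np.concatenate(picked).ravel().tolist()
# Usage would mirror the inline code above, e.g.:
#   action = agent.select_action(stack_observations(observations, args.num_states))
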
else:
    ns.best_model = DDPG(nb_states, nb_actions, args)
    shared_model = DDPG(nb_states, nb_actions, args)
    if args.load_weights:
        shared_model.load_weights("weights")
        ns.best_model.load_weights("weights")

    actor_optim = critic_optim = None
    if args.shared:
        actor_optim = shared_adam.SharedAdam(
            shared_model.actor.parameters(), lr=args.rate)
        # The critic optimizer is built over the critic's parameters (not the
        # actor's, as the flattened original had it by copy-paste).
        critic_optim = shared_adam.SharedAdam(
            shared_model.critic.parameters(), lr=args.rate)
        shared_model.add_optim(actor_optim, critic_optim)
        actor_optim.share_memory()
        critic_optim.share_memory()
    shared_model.share_memory()

    processes = []
    best_result = mp.Value('f', -10)

    # One evaluation process (it gets rank == num_processes) plus
    # num_processes training workers.
    p = mp.Process(target=test,
                   args=(args.num_processes, args, ns, best_result))
    p.start()
    processes.append(p)
    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,