Example #1
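# Assumed imports for this excerpt (not shown in the original listing; module
# paths for the project-specific pieces are guesses):
#   import numpy as np
#   import torch
#   from collections import deque
#   from copy import deepcopy
#   RunEnv comes from the osim-rl package; DDPG, test_new_state_dict and the
#   prCyan/prGreen/prLightPurple print helpers come from the surrounding project.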
def train(rank,
          args,
          ns,
          best_result,
          actor_optim,
          critic_optim,
          shared_model,
          debug=True):
    torch.manual_seed(args.seed + rank)

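    # Each worker creates its own simulation environment; offsetting the seed
    # by the worker's rank decorrelates exploration across processes.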
    env = RunEnv(False)
    env.seed(args.seed + rank)

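    # Derive state/action dimensions from the environment; with
    # args.use_more_states the actor input is a concatenation of
    # args.num_states past observations instead of a single one.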
    nb_states = env.observation_space.shape[0]
    nb_actions = env.action_space.shape[0]
    if args.use_more_states:
        agent = DDPG(nb_states * args.num_states, nb_actions, args)
    else:
        agent = DDPG(nb_states, nb_actions, args)

    if args.shared:
        agent.add_optim(actor_optim, critic_optim)

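    # Switch the agent to training mode, optionally warm-start from saved
    # weights, and initialise step/episode bookkeeping.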
    agent.train()
    if args.load_weights:
        agent.load_weights("weights")
    agent.is_training = True
    step = episode = episode_steps = 0
    observation = None
    done = True
    episode_reward = 0.
    observations = None
    last_reward = -10
    while True:
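        # In shared mode, sync this worker's parameters with the shared model
        # before acting, so training continues from the latest global weights.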
        if args.shared:
            agent.load_state_dict(shared_model.state_dict())

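        # New episode: reset the environment and, when stacking states, refill
        # the history deque with copies of the initial observation.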
        if observation is None:
            observation = deepcopy(env.reset())
            agent.reset(observation)
            if args.use_more_states:
                observations = deque(
                    list(observation for i in range(2**args.num_states)),
                    2**args.num_states)
                # observations = deque(list(observation for i in range(args.num_states)), args.num_states)

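        # Act randomly during warmup to seed the replay buffer; afterwards
        # query the policy.  The stacked input picks observations at
        # exponentially spaced offsets (indices 2**i - 1) from the history.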
        if step <= args.warmup:
            action = agent.random_action()
        elif args.use_more_states:
            cur_observations = list()
            for i in range(args.num_states):
                cur_observations.append(list(observations)[2**i - 1])
            action = agent.select_action(
                np.concatenate(list(cur_observations)).ravel().tolist())
        else:
            action = agent.select_action(observation)

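        # Step the environment and push the new observation onto the history.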
        observation2, reward, done, info = env.step(action)
        observation = deepcopy(observation2)
        if observation is not None and args.use_more_states:
            observations.appendleft(observation)

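        # Record the transition (reward, next state, done) in the replay
        # memory, again using the stacked representation when requested.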
        if args.use_more_states:
            cur_observations = list()
            for i in range(args.num_states):
                cur_observations.append(list(observations)[2**i - 1])
            agent.observe(
                reward,
                np.concatenate(list(cur_observations)).ravel().tolist(), done)
        else:
            agent.observe(reward, observation, done)

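        # After warmup, run several policy/critic updates per environment step
        # against the shared model.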
        if step > args.warmup:
            for i in range(5):
                agent.update_policy(shared_model, args)

        step += 1
        episode_steps += 1
        episode_reward += reward

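        # End of episode: push one final entry for the terminal observation
        # into the replay memory, then handle the shared best model below.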
        if done:
            if args.use_more_states:
                cur_observations = list()
                for i in range(args.num_states):
                    cur_observations.append(list(observations)[2**i - 1])
                agent.memory.append(
                    np.concatenate(list(cur_observations)).ravel().tolist(),
                    agent.select_action(
                        np.concatenate(
                            list(cur_observations)).ravel().tolist()), 0.,
                    False)
            else:
                agent.memory.append(observation,
                                    agent.select_action(observation), 0.,
                                    False)

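            # If this worker beat the global best reward, publish its weights
            # to the shared namespace; otherwise, every 10 episodes, try
            # adopting the current best model and keep it only if it also
            # passes test_new_state_dict on this worker's environment.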
            if step > args.warmup and best_result.value < episode_reward \
                    and episode_reward > last_reward:
                best_model = ns.best_model
                best_model.load_state_dict(agent.state_dict())
                if debug:
                    prLightPurple(
                        "best reward: {:.3f} current reward: {:.3f} updated best model from agent {}"
                        .format(best_model.best_reward, episode_reward, rank))
                best_model.best_reward = episode_reward
                agent.best_reward = episode_reward
                ns.best_model = best_model
                best_result.value = episode_reward
                last_reward = best_result.value
            elif step > args.warmup and episode % 10 == 0 and episode > 0 and args.update_train_agents > 0 \
                    and best_result.value > episode_reward and best_result.value > agent.best_reward:
                best_model = ns.best_model
                test_agent = deepcopy(agent)
                test_agent.load_state_dict(best_model.state_dict())
                if test_new_state_dict(test_agent,
                                       episode_reward,
                                       env,
                                       args.update_train_agents,
                                       use_more_states=args.use_more_states,
                                       num_states=args.num_states):
                    agent = test_agent
                    agent.best_reward = best_model.best_reward
                    if debug:
                        prGreen("best result {:.3f} updated agent {}".format(
                            best_model.best_reward, rank))
                    last_reward = best_model.best_reward

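            # Reset per-episode state before the next rollout.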
            observation = None
            observations = None
            if debug:
                prCyan('agent_{:02d} ep:{} ep_steps:{} reward:{:.3f} '.format(
                    rank, episode, episode_steps, episode_reward))
            episode_steps = 0
            episode += 1
            episode_reward = 0.
Example #2
    else:
        ns.best_model = DDPG(nb_states, nb_actions, args)
        shared_model = DDPG(nb_states, nb_actions, args)

    if args.load_weights:
        shared_model.load_weights("weights")
        ns.best_model.load_weights("weights")

    actor_optim = critic_optim = None

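    # In shared mode, both optimizers live in shared memory so that every
    # worker process updates the same Adam statistics (A3C-style shared optimizer).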
    if args.shared:
        actor_optim = shared_adam.SharedAdam(shared_model.actor.parameters(),
                                             lr=args.rate)
        critic_optim = shared_adam.SharedAdam(shared_model.critic.parameters(),
                                              lr=args.rate)
        shared_model.add_optim(actor_optim, critic_optim)

        actor_optim.share_memory()
        critic_optim.share_memory()
        shared_model.share_memory()

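    # Launch one evaluation process plus args.num_processes training workers;
    # they communicate through the namespace `ns` and the shared best_result.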
    processes = []
    best_result = mp.Value('f', -10)

    p = mp.Process(target=test,
                   args=(args.num_processes, args, ns, best_result))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,