Ejemplo n.º 1
0
def _train(args):
    """Train a CNN-based DDPG policy and save it to ``args.model_dir``.

    The loop collects one transition per iteration into a replay buffer.
    At the end of every episode the policy is trained, and every
    ``args.eval_freq`` timesteps (checked at episode boundaries) it is
    evaluated; the evaluation history is written to
    ``./results/rewards.npz``.

    Args:
        args: Namespace-like object. Attributes read here: ``model_dir``,
            ``seed``, ``replay_buffer_max_size``, ``max_timesteps``,
            ``batch_size``, ``discount``, ``tau``, ``eval_freq``,
            ``save_models``, ``start_timesteps``, ``expl_noise`` and
            ``env_timesteps``.
    """
    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs("./results", exist_ok=True)
    os.makedirs(args.model_dir, exist_ok=True)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = NormalizeWrapper(env)
    env = ImgWrapper(env)  # to make the images from 160x120x3 into 3x160x120
    env = ActionWrapper(env)
    env = DtRewardWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Initialize policy
    policy = DDPG(state_dim, action_dim, max_action, net_type="cnn")
    replay_buffer = ReplayBuffer(args.replay_buffer_max_size)
    print("Initialized DDPG")

    # Evaluate untrained policy
    evaluations = [evaluate_policy(env, policy)]

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    # Starting with done=True forces an environment reset (which also
    # initialises obs and episode_reward) on the first iteration.
    done = True
    episode_reward = None
    reward = 0
    episode_timesteps = 0

    print("Starting training")
    while total_timesteps < args.max_timesteps:

        print("timestep: {} | reward: {}".format(total_timesteps, reward))

        if done:
            # Skip the end-of-episode work on the very first iteration,
            # where no episode has been played yet.
            if total_timesteps != 0:
                print(
                    ("Total T: %d Episode Num: %d Episode T: %d Reward: %f") %
                    (total_timesteps, episode_num, episode_timesteps,
                     episode_reward))
                # NOTE(review): the second argument is presumably the number
                # of training iterations -- confirm against DDPG.train.
                policy.train(replay_buffer, episode_timesteps, args.batch_size,
                             args.discount, args.tau)

                # Evaluate episode
                if timesteps_since_eval >= args.eval_freq:
                    timesteps_since_eval %= args.eval_freq
                    evaluations.append(evaluate_policy(env, policy))
                    print("rewards at time {}: {}".format(
                        total_timesteps, evaluations[-1]))

                    if args.save_models:
                        policy.save(filename='{}_{}'.format(
                            'ddpg', total_timesteps),
                                    directory=args.model_dir)
                    np.savez("./results/rewards.npz", evaluations)

            # Reset environment
            obs = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < args.start_timesteps:
            action = env.action_space.sample()
        else:
            action = policy.predict(np.array(obs))
            if args.expl_noise != 0:
                action = (action + np.random.normal(
                    0, args.expl_noise, size=env.action_space.shape[0])).clip(
                        env.action_space.low, env.action_space.high)

        # Perform action
        new_obs, reward, done, _ = env.step(action)

        # Time-limit handling. episode_timesteps has not been incremented
        # for this step yet, so this is the env_timesteps-th step of the
        # episode when episode_timesteps + 1 reaches the limit. The episode
        # is cut here, but the stored done flag stays 0 so that hitting the
        # time limit is not learned as a terminal state. (Previously the cut
        # happened one step later and that extra transition was stored as
        # terminal.)
        timed_out = episode_timesteps + 1 >= args.env_timesteps
        if timed_out:
            done = True

        done_bool = 0 if timed_out else float(done)
        episode_reward += reward

        # Store data in replay buffer
        replay_buffer.add(obs, new_obs, action, reward, done_bool)

        obs = new_obs

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    print("Training done, about to save..")
    policy.save(filename='ddpg', directory=args.model_dir)
    print("Finished saving..should return now!")
Ejemplo n.º 2
0
def _train(args):
    """Train the policy selected by ``args.policy`` and log its progress.

    One transition is collected per loop iteration and the policy is
    updated as soon as the replay buffer holds at least one batch. At the
    end of each episode the training reward and its moving average are
    logged; every ``args.eval_freq`` timesteps (checked at episode
    boundaries) the policy is evaluated, the reward histories are written
    as CSV files under ``reinforcement/pytorch/results`` and, if the
    evaluation reward is the best seen so far, the policy and the training
    state are saved to ``args.model_dir``.

    Args:
        args: Namespace-like object. Attributes read here: ``model_dir``,
            ``seed``, ``policy``, ``per``, ``gradclip``,
            ``replay_buffer_max_size``, ``batch_size``, ``max_timesteps``,
            ``load_initial_policy``, ``start_timesteps``, ``expl_noise``,
            ``discount``, ``tau``, ``env_timesteps`` and ``eval_freq``.
            ``start_timesteps`` is overwritten with 0 when
            ``load_initial_policy`` is set.

    Raises:
        ValueError: If ``args.policy`` is not a key of ``policies``.
    """
    results_dir = "reinforcement/pytorch/results"

    # exist_ok avoids the check-then-create race of os.path.exists().
    os.makedirs("./results", exist_ok=True)
    os.makedirs(args.model_dir, exist_ok=True)
    # The CSV files below are written here, so make sure it exists too;
    # np.savetxt does not create missing parent directories.
    os.makedirs(results_dir, exist_ok=True)

    # Launch the env with our helper function
    env = launch_env()
    print("Initialized environment")

    # Wrappers
    env = ResizeWrapper(env)
    env = GrayscaleWrapper(env)
    env = NormalizeWrapper(env)
    env = FrameStack(env, 4)
    env = DtRewardWrapper(env)
    env = ActionWrapper(env)
    print("Initialized Wrappers")

    # Set seeds
    seed(args.seed)

    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])

    # Init training data
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    episode_reward = 0
    episode_timesteps = 0

    # Window size passed to moving_average()
    avg_episodes = 100

    # Keep track of the best reward over time
    best_reward = -np.inf

    # Keep track of train_rewards
    train_rewards = []

    # Actions taken during the current episode (used to print their mean)
    episode_actions = []

    # To keep track of moving averages
    moving_avgs = []

    # Summary writer for tensorboard
    writer = SummaryWriter(log_dir="reinforcement/pytorch/runs")

    # Everything below runs under try/finally so the writer is closed (and
    # its pending events flushed) even if training raises.
    try:
        # Initialize policy
        if args.policy not in policies:
            raise ValueError(
                "Policy {} is not available, chose one of : {}".format(
                    args.policy, list(policies.keys())))

        policy = policies[args.policy](state_dim, action_dim, max_action,
                                       args.per, args.gradclip)

        # Evaluate untrained policy
        evaluations = [evaluate_policy(env, policy)]
        moving_avgs.append(evaluations[0])

        writer.add_scalar("Timesteps/EvaluationReward", evaluations[0],
                          total_timesteps)

        ## Initialize ReplayBuffer
        if args.per:
            print("Training with Prioritized Experience Reply")
            replay_buffer = PrioritizedReplayBuffer(
                args.replay_buffer_max_size,
                args.batch_size,
                args.seed,
                initial_beta=0.5,
                delta_beta=2 / args.max_timesteps,
            )
        else:
            replay_buffer = ReplayBuffer(args.replay_buffer_max_size,
                                         args.batch_size, args.seed)

        # Load previous policy
        if args.load_initial_policy:

            # Disable random start steps
            args.start_timesteps = 0

            # Load training data
            checkpoint = load_training_state(args.model_dir,
                                             args.policy + "_training")

            evaluations = checkpoint["evaluations"]
            total_timesteps = checkpoint["total_timesteps"]
            train_rewards = checkpoint["train_rewards"]
            episode_num = checkpoint["episode_num"]
            best_reward = checkpoint["best_reward"]
            moving_avgs = checkpoint["moving_avgs"]

            # Load policy
            policy.load(args.model_dir, args.policy)

        print("Starting training")

        obs = env.reset()

        while total_timesteps < args.max_timesteps:

            # Select action
            if total_timesteps < args.start_timesteps:
                action = env.action_space.sample()
            else:
                action = policy.predict(np.array(obs))
                action = add_noise(action, args.expl_noise,
                                   env.action_space.low,
                                   env.action_space.high)

            episode_actions.append(action)

            # Perform action
            new_obs, reward, done, _ = env.step(action)

            # Update episode reward
            episode_reward += reward

            # Store data in replay buffer. This happens before the
            # time-limit check below, so a truncated episode is stored with
            # the done flag reported by the environment.
            replay_buffer.add(obs, action, reward, new_obs, float(done))

            # Update network
            if len(replay_buffer) >= args.batch_size:
                policy.update(replay_buffer, args.discount, args.tau)

            # Update env
            obs = new_obs
            episode_timesteps += 1
            total_timesteps += 1
            timesteps_since_eval += 1

            if episode_timesteps >= args.env_timesteps:
                done = True

            if not done:
                continue

            # ---- End of episode ----
            # Computed once; the print indexes two action components.
            mean_actions = np.mean(np.array(episode_actions), axis=0)

            # Note: moving_avgs[-1] here is the value from before this
            # episode; the updated average is appended right after.
            print((
                "Total T: %d Episode Num: %d \nMean actions: %.2f %.2f Episode T: %d Reward: %.1f Moving Average: %.1f"
            ) % (
                total_timesteps,
                episode_num,
                mean_actions[0],
                mean_actions[1],
                episode_timesteps,
                episode_reward,
                moving_avgs[-1],
            ))

            train_rewards.append(episode_reward)
            moving_avgs.append(moving_average(train_rewards, avg_episodes))

            writer.add_scalar("Timesteps/Rewards", episode_reward,
                              total_timesteps)
            writer.add_scalar("Timesteps/MovingAverage", moving_avgs[-1],
                              total_timesteps)

            # Evaluate episode
            if timesteps_since_eval >= args.eval_freq:

                timesteps_since_eval %= args.eval_freq
                eval_reward = evaluate_policy(env, policy)
                evaluations.append(eval_reward)

                writer.add_scalar("Timesteps/EvaluationReward", eval_reward,
                                  total_timesteps)

                print(
                    "\n-+-+-+-+-+-+-+-+-+-+ Evaluation reward at time {}: {} +-+-+-+-+-+-+-+-+-+-"
                    .format(total_timesteps, eval_reward))

                # Dump the three reward histories as CSV files.
                for prefix, history in (
                    ("eval_rewards", evaluations),
                    ("train_rewards", train_rewards),
                    ("moving_averages", moving_avgs),
                ):
                    np.savetxt(
                        "{}/{}_{}.csv".format(results_dir, prefix,
                                              args.policy),
                        np.array(history),
                        delimiter=",",
                    )

                # Save the policy according to the best reward over training
                if eval_reward > best_reward:
                    best_reward = eval_reward
                    policy.save(args.model_dir, args.policy)
                    save_training_state(
                        args.model_dir,
                        args.policy + "_training",
                        best_reward,
                        total_timesteps,
                        evaluations,
                        train_rewards,
                        episode_num,
                        moving_avgs,
                    )

                    print(
                        "-+-+-+-+-+-+-+-+-+-+ Model saved +-+-+-+-+-+-+-+-+-+-\n"
                    )

            # Reset environment
            episode_actions = []

            obs = env.reset()
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1
    finally:
        writer.close()

    print("Finished..should return now!")