Beispiel #1
0
def eval(cfg, saved_model_path=SAVED_MODEL_PATH):
    print('start to eval ! \n')
    env = NormalizedActions(gym.make("Pendulum-v0"))
    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    agent = DDPG(n_states,
                 n_actions,
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    agent.load_model(saved_model_path + 'checkpoint.pth')
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/eval/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.eval_eps + 1):
        state = env.reset()  # reset环境状态
        ep_reward = 0
        for i_step in range(1, cfg.eval_steps + 1):
            action = agent.select_action(state)  # 根据当前环境state选择action
            next_state, reward, done, _ = env.step(action)  # 更新环境参数
            ep_reward += reward
            state = next_state  # 跳转到下一个状态
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step, 'done: ', done)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        # 计算滑动窗口的reward
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    '''存储reward等相关结果'''
    if not os.path.exists(RESULT_PATH):  # 检测是否存在文件夹
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_eval.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_eval.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_eval.npy', ep_steps)
Beispiel #2
0
def train(cfg):
    print('Start to train ! \n')
    env = NormalizedActions(gym.make("Pendulum-v0"))

    # 增加action噪声
    ou_noise = OUNoise(env.action_space)

    n_states = env.observation_space.shape[0]
    n_actions = env.action_space.shape[0]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    agent = DDPG(n_states,
                 n_actions,
                 device="cpu",
                 critic_lr=1e-3,
                 actor_lr=1e-4,
                 gamma=0.99,
                 soft_tau=1e-2,
                 memory_capacity=100000,
                 batch_size=128)
    rewards = []
    moving_average_rewards = []
    ep_steps = []
    log_dir = os.path.split(
        os.path.abspath(__file__))[0] + "/logs/train/" + SEQUENCE
    writer = SummaryWriter(log_dir)
    for i_episode in range(1, cfg.train_eps + 1):
        state = env.reset()
        ou_noise.reset()
        ep_reward = 0
        for i_step in range(1, cfg.train_steps + 1):
            action = agent.select_action(state)
            action = ou_noise.get_action(action,
                                         i_step)  # 即paper中的random process
            next_state, reward, done, _ = env.step(action)
            ep_reward += reward
            agent.memory.push(state, action, reward, next_state, done)
            agent.update()
            state = next_state
            if done:
                break
        print('Episode:', i_episode, ' Reward: %i' % int(ep_reward),
              'n_steps:', i_step)
        ep_steps.append(i_step)
        rewards.append(ep_reward)
        if i_episode == 1:
            moving_average_rewards.append(ep_reward)
        else:
            moving_average_rewards.append(0.9 * moving_average_rewards[-1] +
                                          0.1 * ep_reward)
        writer.add_scalars('rewards', {
            'raw': rewards[-1],
            'moving_average': moving_average_rewards[-1]
        }, i_episode)
        writer.add_scalar('steps_of_each_episode', ep_steps[-1], i_episode)
    writer.close()
    print('Complete training!')
    ''' 保存模型 '''
    if not os.path.exists(SAVED_MODEL_PATH):  # 检测是否存在文件夹
        os.mkdir(SAVED_MODEL_PATH)
    agent.save_model(SAVED_MODEL_PATH + 'checkpoint.pth')
    '''存储reward等相关结果'''
    if not os.path.exists(RESULT_PATH):  # 检测是否存在文件夹
        os.mkdir(RESULT_PATH)
    np.save(RESULT_PATH + 'rewards_train.npy', rewards)
    np.save(RESULT_PATH + 'moving_average_rewards_train.npy',
            moving_average_rewards)
    np.save(RESULT_PATH + 'steps_train.npy', ep_steps)
Beispiel #3
0
def train_agent(args, param):
    """

    Args:
    """

    # create CNN convert the [1,3,84,84] to [1, 200]

    use_gym = False
    # in case seed experements
    args.seed = param
    now = datetime.now()
    dt_string = now.strftime("%d_%m_%Y_%H:%M:%S")
    #args.repeat_opt = repeat_opt
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    pathname = str(args.locexp) + "/" + str(args.env_name) + '-agent-' + str(
        args.policy)
    pathname += "_batch_size_" + str(args.batch_size)
    pathname += '_update_freq: ' + str(
        args.target_update_freq) + "num_q_target_" + str(
            args.num_q_target) + "_seed_" + str(args.seed)
    pathname += "_actor_300_200"
    text = "Star_training target_update_freq: {}  num_q_target: {}  use device {} ".format(
        args.target_update_freq, args.num_q_target, args.device)
    print(pathname, text)
    write_into_file(pathname, text)
    arg_text = str(args)
    write_into_file(pathname, arg_text)
    tensorboard_name = str(args.locexp) + '/runs/' + pathname
    writer = SummaryWriter(tensorboard_name)

    if use_gym:
        env = gym.make(args.env_name)
        env.seed(args.seed)
        state_dim = env.observation_space.shape[0]
        action_dim = env.action_space.shape[0]
        max_action = float(env.action_space.high[0])
        args.max_episode_steps = env._max_episode_steps
    else:
        size = 84
        env = suite.make(
            args.env_name,
            has_renderer=False,
            use_camera_obs=True,
            ignore_done=True,
            has_offscreen_renderer=True,
            camera_height=size,
            camera_width=size,
            render_collision_mesh=False,
            render_visual_mesh=True,
            camera_name='agentview',
            use_object_obs=False,
            camera_depth=True,
            reward_shaping=True,
        )

    state_dim = 200
    print("State dim, ", state_dim)
    action_dim = env.dof
    print("action_dim ", action_dim)
    max_action = 1
    args.max_episode_steps = 200

    policy = DDPG(state_dim, action_dim, max_action, args)
    file_name = str(args.locexp) + "/pytorch_models/{}".format(args.env_name)
    obs_shape = (3, 84, 84)
    action_shape = (action_dim, )
    print("obs", obs_shape)
    print("act", action_shape)
    replay_buffer = ReplayBuffer(obs_shape, action_shape,
                                 int(args.buffer_size), args.image_pad,
                                 args.device)
    save_env_vid = False
    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True
    t0 = time.time()
    scores_window = deque(maxlen=100)
    episode_reward = 0
    evaluations = []
    tb_update_counter = 0
    while total_timesteps < args.max_timesteps:
        tb_update_counter += 1
        # If the episode is done
        if done:
            episode_num += 1
            #env.seed(random.randint(0, 100))
            scores_window.append(episode_reward)
            average_mean = np.mean(scores_window)
            if tb_update_counter > args.tensorboard_freq:
                print("Write tensorboard")
                tb_update_counter = 0
                writer.add_scalar('Reward', episode_reward, total_timesteps)
                writer.add_scalar('Reward mean ', average_mean,
                                  total_timesteps)
                writer.flush()
            # If we are not at the very beginning, we start the training process of the model
            if total_timesteps != 0:
                text = "Total Timesteps: {} Episode Num: {} ".format(
                    total_timesteps, episode_num)
                text += "Episode steps {} ".format(episode_timesteps)
                text += "Reward: {:.2f}  Average Re: {:.2f} Time: {}".format(
                    episode_reward, np.mean(scores_window),
                    time_format(time.time() - t0))

                print(text)
                write_into_file(pathname, text)
                #policy.train(replay_buffer, writer, episode_timesteps)
            # We evaluate the episode and we save the policy
            if total_timesteps > args.start_timesteps:
                policy.train(replay_buffer, writer, 200)
            if timesteps_since_eval >= args.eval_freq:
                timesteps_since_eval %= args.eval_freq
                evaluations.append(
                    evaluate_policy(policy, writer, total_timesteps, args,
                                    env))
                torch.manual_seed(args.seed)
                np.random.seed(args.seed)
                save_model = file_name + '-{}reward_{:.2f}-agent{}'.format(
                    episode_num, evaluations[-1], args.policy)
                policy.save(save_model)
            # When the training step is done, we reset the state of the environment
            if use_gym:
                obs = env.reset()
            else:
                state = env.reset()
                obs, state_buffer = stacked_frames(state, size, args, policy)

            # Set the Done to False
            done = False
            # Set rewards and episode timesteps to zero
            episode_reward = 0
            episode_timesteps = 0
        # Before 10000 timesteps, we play random actions
        if total_timesteps < args.start_timesteps:
            if use_gym:
                action = env.action_space.sample()
            else:
                action = np.random.randn(env.dof)
        else:  # After 10000 timesteps, we switch to the model
            if use_gym:
                action = policy.select_action(np.array(obs))
                # If the explore_noise parameter is not 0, we add noise to the action and we clip it
                if args.expl_noise != 0:
                    action = (action + np.random.normal(
                        0, args.expl_noise,
                        size=env.action_space.shape[0])).clip(
                            env.action_space.low, env.action_space.high)
            else:
                action = (policy.select_action(np.array(obs)) +
                          np.random.normal(
                              0, max_action * args.expl_noise,
                              size=action_dim)).clip(-max_action, max_action)

        if total_timesteps % args.target_update_freq == 0:
            if args.policy == "TD3_ad":
                policy.hardupdate()
        # The agent performs the action in the environment, then reaches the next state and receives the reward
        new_obs, reward, done, _ = env.step(action)
        done = float(done)
        if not use_gym:
            new_obs, state_buffer = create_next_obs(new_obs, size, args,
                                                    state_buffer, policy)
        # We check if the episode is done
        #done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float(done)
        done_bool = 0 if episode_timesteps + 1 == args.max_episode_steps else float(
            done)
        if not use_gym:
            if episode_timesteps + 1 == args.max_episode_steps:
                done = True
        # We increase the total reward
        reward = reward * args.reward_scalling
        episode_reward += reward
        # We store the new transition into the Experience Replay memory (ReplayBuffer)
        if args.debug:
            print("add to buffer next_obs ", obs.shape)
            print("add to bufferobs ", new_obs.shape)
        replay_buffer.add(obs, action, reward, new_obs, done, done_bool)
        # We update the state, the episode timestep, the total timesteps, and the timesteps since the evaluation of the policy
        obs = new_obs
        if total_timesteps > args.start_timesteps:
            policy.train(replay_buffer, writer, 0)
        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # We add the last policy evaluation to our list of evaluations and we save our model
    evaluations.append(
        evaluate_policy(policy, writer, total_timesteps, args, episode_num))