Example #1
def test():
    # trained model directory
    directory = "./preTrained"
    filename = "ddpg"
    # initialize DDPG agent
    agent = DDPG(
        state_dim=env.observation_space.shape[0] - 1,  # state_dim=env.observation_space.shape[0]
        action_dim=env.action_space.shape[0],
        action_bounds=env.action_space.high[0],
        lr=0)

    # load trained agent
    assert os.path.exists(directory), "Trained model does not exist; run train.py first."
    agent.load(directory, filename)
    for epoch in range(max_episode):
        # reset environment
        state = env.reset()
        done = False
        rewards = 0

        while not done:
            action = agent.select_action(state)
            # perform one step update on pendulum
            next_state, reward, done, _ = env.step(action)
            # go to next state
            state = next_state
            rewards += reward
            # render environment
            env.render()
        print("Episode:{:2d}, Rewards:{:3f}".format(epoch, rewards))
Example #2
def get_policy(arglist, kwargs, max_action):
    # Initialize policy
    if arglist.policy == "td3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = arglist.policy_noise * max_action
        kwargs["noise_clip"] = arglist.noise_clip * max_action
        kwargs["policy_freq"] = arglist.policy_freq
        policy = TD3.TD3(**kwargs)
    elif arglist.policy == "OurDDPG":
        policy = OurDDPG.DDPG(**kwargs)
    elif arglist.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)
    elif arglist.policy == "adv":
        kwargs["alpha"] = arglist.alpha
        kwargs["adv_epsilon"] = arglist.adv_epsilon
        kwargs["logdir"] = f"./tensorboard/{arglist.policy}_{arglist.env}_{arglist.seed}/"
        policy = TD3_adv2.TD3(**kwargs)
    else:
        raise NotImplementedError
    return policy
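
get_policy expects an argparse-style arglist plus a partially built kwargs dict. A hedged sketch of the wiring (the attribute names follow the accesses above; the default values and the env setup are assumptions):

import argparse

import gym

parser = argparse.ArgumentParser()
parser.add_argument("--policy", default="td3")  # "td3" | "OurDDPG" | "DDPG" | "adv"
parser.add_argument("--env", default="Pendulum-v0")             # assumed default
parser.add_argument("--seed", type=int, default=0)
parser.add_argument("--policy_noise", type=float, default=0.2)  # assumed defaults
parser.add_argument("--noise_clip", type=float, default=0.5)
parser.add_argument("--policy_freq", type=int, default=2)
parser.add_argument("--alpha", type=float, default=0.1)         # used only by "adv"
parser.add_argument("--adv_epsilon", type=float, default=0.1)
arglist = parser.parse_args()

env = gym.make(arglist.env)
max_action = float(env.action_space.high[0])
kwargs = {
    "state_dim": env.observation_space.shape[0],
    "action_dim": env.action_space.shape[0],
    "max_action": max_action,
}
policy = get_policy(arglist, kwargs, max_action)
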
Example #3
def train(env_name, start_episodes, num_episodes, gamma, tau, noise_std,
          batch_size, eval_freq, seed):
    """ Main training loop
    Args:
        env_name: OpenAI Gym environment name, e.g. 'Hopper-v1'
        start_episodes: how many episodes purely random policy is run for
        num_episodes: maximum number of episodes to run
        gamma: reward discount factor
        tau: target network update rate
        noise_std: std of the Gaussian exploration noise added to actions
        batch_size: number of episodes per policy training batch
        eval_freq: number of training batches between evaluations
        seed: random seed for all modules with randomness
    """
    # set seeds
    set_global_seed(seed)
    # configure log
    configure_log_info(env_name, seed)

    # create env
    env = gym.make(env_name)
    env.seed(seed)  # set env seed
    obs_dim = env.observation_space.shape[0]
    obs_dim += 1  # add 1 to obs dimension for time step feature (see run_episode())
    act_dim = env.action_space.shape[0]

    # create actor and target actor
    actor = Actor(obs_dim, act_dim, float(env.action_space.high[0])).to(device)
    target_actor = Actor(obs_dim, act_dim,
                         float(env.action_space.high[0])).to(device)

    # create critic and target critic
    critic = Critic(obs_dim, act_dim).to(device)
    target_critic = Critic(obs_dim, act_dim).to(device)

    # create DDPG agent from the networks and their targets
    agent = DDPG(actor, critic, target_actor, target_critic, noise_std, gamma,
                 tau)
    agent.align_target()

    # create replay_buffer
    replay_buffer = ReplayBuffer()
    # run a few episodes of untrained policy to initialize scaler and fill in replay buffer
    run_policy(env,
               agent,
               replay_buffer,
               mode="random",
               episodes=start_episodes)

    num_iteration = num_episodes // eval_freq
    current_episodes = 0
    current_steps = 0
    for iteration in range(num_iteration):
        # train models
        for i in range(eval_freq):
            # sample transitions
            train_returns, total_steps = run_policy(env,
                                                    agent,
                                                    replay_buffer,
                                                    mode="train",
                                                    episodes=batch_size)
            current_episodes += batch_size
            current_steps += total_steps
            logger.info('[train] average return:{0}, std return: {1}'.format(
                np.mean(train_returns), np.std(train_returns)))
            # train
            num_epoch = total_steps // batch_size
            for e in range(num_epoch):
                observation, action, reward, next_obs, done = replay_buffer.sample()
                agent.update(observation, action, reward, next_obs, done)
        # test models
        num_test_episodes = 10
        returns, _ = run_policy(env,
                                agent,
                                replay_buffer,
                                mode="test",
                                episodes=num_test_episodes)
        avg_return = np.mean(returns)
        std_return = np.std(returns)
        logger.record_tabular('iteration', iteration)
        logger.record_tabular('episodes', current_episodes)
        logger.record_tabular('steps', current_steps)
        logger.record_tabular('avg_return', avg_return)
        logger.record_tabular('std_return', std_return)
        logger.dump_tabular()
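
agent.align_target() and the tau argument imply the standard DDPG soft (Polyak) target update. A minimal sketch of that update, as an assumption about the agent's internals rather than the repo's actual code:

def soft_update(target_net, net, tau):
    # target <- tau * online + (1 - tau) * target, applied parameter-wise
    for t_param, param in zip(target_net.parameters(), net.parameters()):
        t_param.data.copy_(tau * param.data + (1.0 - tau) * t_param.data)
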
        "state_dim": state_dim,
        "action_dim": action_dim,
        "max_action": max_action,
        "discount": args.discount,
        "tau": args.tau,
    }

    # Initialize policy
    if args.policy == "TD3":
        # Target policy smoothing is scaled wrt the action scale
        kwargs["policy_noise"] = args.policy_noise
        kwargs["noise_clip"] = args.noise_clip
        kwargs["policy_freq"] = args.policy_freq
        policy = TD3.TD3(**kwargs)
    elif args.policy == "DDPG":
        policy = DDPG.DDPG(**kwargs)

    if args.load_model != "":
        policy_file = file_name if args.load_model == "default" else args.load_model
        policy.load(f"./checkpoint/{policy_file}")

    replay_buffer = ReplayBuffer(state_dim, action_dim)

    # Evaluate untrained policy
    evaluations = []
    # evaluations = [eval_policy(policy, env, args.seed, group_name)]

    # state, done = env.reset(group_name), False
    episode_reward = 0
    episode_Rsim = 0
    episode_Robs = 0
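
The commented-out evaluation above references an eval_policy helper that the excerpt does not show. A hypothetical sketch (the group_name reset argument mirrors the commented env.reset call; everything here is an assumption):

import numpy as np

def eval_policy(policy, env, seed, group_name, eval_episodes=10):
    # Run the deterministic policy for a few episodes and average the return.
    avg_reward = 0.0
    for _ in range(eval_episodes):
        state, done = env.reset(group_name), False
        while not done:
            action = policy.select_action(np.array(state))
            state, reward, done, _ = env.step(action)
            avg_reward += reward
    return avg_reward / eval_episodes
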
Example #5
from models import DDPG

model = DDPG("BotGym-v0")
#model = DDPG("Pendulum-v0")

model.train(RENDER=True, MAX_EPISODES=10000000)
Example #6
def train():
    # save trained model under preTrained directory
    directory = "./preTrained"
    filename = "ddpg"
    # set epsilon exploration rate and decay rate
    epsilon = 0.2
    eps_min = 1e-3
    eps_decay = 2e-3
    gaussian_exploration_noise = 0.2
    # set learning rate and batch size
    lr = 1e-3
    batch_size = 128
    # initialize replay memory
    replay_buffer = ReplayBuffer(max_size=int(5e4))
    # rewards for each episode / for plot
    rewards = np.zeros(max_episode)
    # initialize DDPG agent
    agent = DDPG(
        state_dim=env.observation_space.shape[0] - 1,  # state_dim=env.observation_space.shape[0]
        action_dim=env.action_space.shape[0],
        action_bounds=env.action_space.high[0],
        lr=lr)

    for epoch in range(max_episode):
        # reset environment
        state = env.reset()
        done = False
        # epsilon decay
        epsilon = max(eps_min, epsilon - eps_decay)

        while not done:
            if np.random.random_sample() > epsilon:
                action = agent.select_action(state)
                # add Gaussian exploration noise
                action = action + np.random.normal(0, gaussian_exploration_noise)
            else:
                action = np.random.uniform(env.action_space.low[0],
                                           env.action_space.high[0],
                                           size=(1,))
            # perform one step update on pendulum
            next_state, reward, done, _ = env.step(action)
            env.render()
            replay_buffer.add((state, action, reward, next_state, done))

            # go to next state
            state = next_state
            # store rewards
            rewards[epoch] += reward
            # update the DDPG agent sampled on replay buffer and n_iter times
            agent.update(buffer=replay_buffer,
                         n_iter=10,
                         batch_size=batch_size)

        if rewards[epoch] > -1.0:
            print("task solved!\n")
            # create the save directory if needed, then save the trained agent
            if not os.path.exists(directory):
                os.mkdir(directory)
            agent.save(directory, filename)

        # print rewards of current episode
        if epoch % 10 == 0:
            print('train epoch:', epoch, 'rewards:', rewards[epoch])

    return rewards
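
The ReplayBuffer class used in Examples #3 and #6 is not shown. A minimal sketch of a compatible FIFO buffer (an assumption; in Example #6 agent.update() presumably calls sample(batch_size) internally, while Example #3 calls sample() with its own default):

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Minimal FIFO transition buffer (a sketch, not the original class)."""

    def __init__(self, max_size=int(5e4)):
        self.buffer = deque(maxlen=int(max_size))

    def add(self, transition):
        # transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size=128):
        batch = random.sample(self.buffer, min(batch_size, len(self.buffer)))
        states, actions, rewards, next_states, dones = map(np.asarray, zip(*batch))
        return states, actions, rewards, next_states, dones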