Example #1
def main(env_id, dim_latent, render, num_process, lr_p, lr_v, gamma, polyak,
         target_action_noise_std, target_action_noise_clip, explore_size,
         memory_size, step_per_iter, batch_size, min_update_step, update_step,
         max_iter, eval_iter, save_iter, action_noise, policy_update_delay,
         model_path, log_path, seed):
    base_dir = log_path + env_id + "/TD3_encoder_exp{}".format(seed)
    writer = SummaryWriter(base_dir)

    td3 = TD3(env_id,
              dim_latent=dim_latent,
              render=render,
              num_process=num_process,
              memory_size=memory_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              polyak=polyak,
              target_action_noise_std=target_action_noise_std,
              target_action_noise_clip=target_action_noise_clip,
              explore_size=explore_size,
              step_per_iter=step_per_iter,
              batch_size=batch_size,
              min_update_step=min_update_step,
              update_step=update_step,
              action_noise=action_noise,
              policy_update_delay=policy_update_delay,
              seed=seed,
              model_path='trained_models')

    for i_iter in range(1, 6):

        td3.eval(i_iter, render=True)

        torch.cuda.empty_cache()
Example #2
def main(env_id, render, num_process, lr_p, lr_v, gamma, polyak, target_action_noise_std, target_action_noise_clip,
         explore_size, memory_size, step_per_iter, batch_size, min_update_step, update_step, test_epochs,
         action_noise, policy_update_delay, model_path, seed):
    td3 = TD3(env_id,
              render=render,
              num_process=num_process,
              memory_size=memory_size,
              lr_p=lr_p,
              lr_v=lr_v,
              gamma=gamma,
              polyak=polyak,
              target_action_noise_std=target_action_noise_std,
              target_action_noise_clip=target_action_noise_clip,
              explore_size=explore_size,
              step_per_iter=step_per_iter,
              batch_size=batch_size,
              min_update_step=min_update_step,
              update_step=update_step,
              action_noise=action_noise,
              policy_update_delay=policy_update_delay,
              seed=seed,
              model_path=model_path
              )

    for i_iter in range(1, test_epochs + 1):
        td3.eval(i_iter)
Example #3
def get_td3_agent(*, d_state, d_action, discount, device, value_tau,
                  value_loss, policy_lr, value_lr, policy_n_units,
                  value_n_units, policy_n_layers, value_n_layers,
                  policy_activation, value_activation, agent_grad_clip,
                  td3_policy_delay, tdg_error_weight, td_error_weight,
                  td3_expl_noise):
    return TD3(d_state=d_state,
               d_action=d_action,
               device=device,
               gamma=discount,
               tau=value_tau,
               value_loss=value_loss,
               policy_lr=policy_lr,
               value_lr=value_lr,
               policy_n_layers=policy_n_layers,
               value_n_layers=value_n_layers,
               value_n_units=value_n_units,
               policy_n_units=policy_n_units,
               policy_activation=policy_activation,
               value_activation=value_activation,
               grad_clip=agent_grad_clip,
               policy_delay=td3_policy_delay,
               tdg_error_weight=tdg_error_weight,
               td_error_weight=td_error_weight,
               expl_noise=td3_expl_noise)
Example #4
def main():
    value_function_1 = Sequential(Linear(in_features=4, out_features=128),
                                  ReLU(),
                                  Linear(in_features=128, out_features=128),
                                  ReLU(),
                                  Linear(in_features=128, out_features=128),
                                  ReLU(),
                                  Linear(in_features=128, out_features=1)).to(
                                      torch.device("cuda:0"))

    value_function_2 = Sequential(Linear(in_features=4, out_features=128),
                                  ReLU(),
                                  Linear(in_features=128, out_features=128),
                                  ReLU(),
                                  Linear(in_features=128, out_features=128),
                                  ReLU(),
                                  Linear(in_features=128, out_features=1)).to(
                                      torch.device("cuda:0"))

    policy_function = Sequential(Linear(in_features=3, out_features=128),
                                 ReLU(),
                                 Linear(in_features=128, out_features=128),
                                 ReLU(),
                                 Linear(in_features=128, out_features=128),
                                 ReLU(), Linear(in_features=128,
                                                out_features=1)).to(
                                                    torch.device("cuda:0"))

    optimizer_value_1 = Adam(params=value_function_1.parameters(), lr=0.0003)
    optimizer_value_2 = Adam(params=value_function_2.parameters(), lr=0.0003)
    optimizer_policy = Adam(params=policy_function.parameters(), lr=0.0003)

    agent = TD3(value_net_1=value_function_1,
                value_net_2=value_function_2,
                policy_net=policy_function,
                optimizer_value_net_1=optimizer_value_1,
                optimizer_value_net_2=optimizer_value_2,
                optimizer_policy_net=optimizer_policy,
                lr_scheduler_value_net_1=None,
                lr_scheduler_value_net_2=None,
                lr_scheduler_policy_net=None,
                gamma=0.99,
                noise_std_f=lambda x: 0.1,
                target_policy_smoothing_std=0.2,
                target_policy_smoothing_bound=0.5,
                policy_update_frequency=2,
                tau=0.005,
                min_action=-2,
                max_action=2,
                replay_buffer_size=10000,
                replay_batch_size=64,
                start_training_at=1000,
                device=torch.device("cuda:0"),
                verbose=True)

    run_td3(agent, render=True)
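
Example #4 wires the twin value networks up by hand: each critic takes 4 inputs (the 3-dimensional Pendulum state concatenated with the 1-dimensional action, matching the Pendulum-v0 environment used by run_td3 in Example #9), and TD3 needs two of them because it bootstraps from the minimum of the two target critics to curb Q-value overestimation. A minimal sketch of that clipped double-Q target, assuming hypothetical actor_target, critic_1_target and critic_2_target modules that are not part of the example above:

import torch

def clipped_double_q_target(reward, next_state, done,
                            actor_target, critic_1_target, critic_2_target,
                            gamma=0.99):
    # Hypothetical helper: TD3 bootstraps from the element-wise minimum
    # of the two target critics to reduce overestimation bias.
    with torch.no_grad():
        next_action = actor_target(next_state)
        q_input = torch.cat([next_state, next_action], dim=-1)
        q_min = torch.min(critic_1_target(q_input), critic_2_target(q_input))
        return reward + gamma * (1.0 - done) * q_min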
Example #5
def main(env_name, seed, hyper_params, eval_episodes=10):
    env = gym.make(env_name)

    state_dim = sum(list(env.observation_space.shape))
    action_dim = sum(list(env.action_space.shape))
    action_max = float(env.action_space.high[0])

    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    kwargs = {
        'device': device,
        'state_dim': state_dim,
        'action_dim': action_dim,
        'action_max': action_max,
        'gamma': hyper_params['gamma'],
        'tau': hyper_params['tau'],
        'lr': hyper_params['lr'],
        'policy_noise': hyper_params['policy_noise'] * action_max,
        'noise_clip': hyper_params['noise_clip'] * action_max,
        'policy_freq': hyper_params['policy_freq']
    }

    agent = TD3(**kwargs)

    file_dir = os.path.abspath(os.path.dirname(__file__))
    save_dir = os.path.join(file_dir, 'results', env_name, 'seed' + str(seed),
                            'learned_model')
    agent.load(save_dir)

    env.seed(seed + 100)

    episode_rewards = []
    for _ in range(eval_episodes):
        state = env.reset()
        done = False
        sum_rewards = 0
        while not done:
            env.render()
            action = agent.rollout_actor.deterministic_action(state)
            next_state, reward, done, _ = env.step(action)
            sum_rewards += reward
            state = next_state
        episode_rewards.append(sum_rewards)
        print(
            f'Episode: {len(episode_rewards)} Sum Rewards: {sum_rewards:.3f}')

    avg_reward = np.mean(episode_rewards)
    print('\n---------------------------------------')
    print(f'Evaluation over {eval_episodes} episodes: {avg_reward:.3f}')
    print('---------------------------------------')
Example #6
def start_training(args):
    env = build_env(args)

    td3 = TD3(state_dim=env.observation_space.shape[0],
              action_num=env.action_space.shape[0],
              lr=args.learning_rate,
              batch_size=args.batch_size,
              device=args.gpu)
    load_params(td3, args)

    run_training_loop(env, td3, args)

    env.close()
Example #7
def start_test_run(args):
    env = build_env(args)

    td3 = TD3(state_dim=env.observation_space.shape[0],
              action_num=env.action_space.shape[0],
              lr=args.learning_rate,
              batch_size=args.batch_size,
              device=args.gpu)
    load_params(td3, args)

    rewards = td3.evaluate_policy(env, render=True, save_video=args.save_video)
    print('rewards: ', rewards)
    mean = np.mean(rewards)
    median = np.median(rewards)

    print('mean: {mean}, median: {median}'.format(mean=mean, median=median))

    env.close()
Example #8
def get_algorithm(*argv, **kwargs):

    if args.algorithm == 'pg':
        return PG(*argv, **kwargs)
    if args.algorithm == 'ddpg':
        return DDPG(*argv, **kwargs)
    if args.algorithm == 'td3':
        return TD3(*argv, **kwargs)
    if args.algorithm == 'rbi':
        return RBI(*argv, **kwargs)
    if args.algorithm == 'drbi':
        return DRBI(*argv, **kwargs)
    if args.algorithm == 'ppo':
        return PPO(*argv, **kwargs)
    if args.algorithm == 'sacq':
        return SACQ(*argv, **kwargs)
    if args.algorithm == 'sspg':
        return SSPG(*argv, **kwargs)
    raise NotImplementedError
Example #9
def run_td3(agent: TD3, render: bool = True):
    env = gym.make("Pendulum-v0")
    draw = env.render if render else lambda:...

    # Train forever.
    while True:
        next_state = env.reset()
        reward = 0
        done = False
        ret = 0
        while True:
            action = agent.train_step(state=next_state.flatten(),
                                      reward=reward,
                                      episode_ended=done)
            if done:
                break
            next_state, reward, done, info = env.step(action)
            ret += reward
            draw()
Example #10
 min_epsilon = 0.1
 EXPLORE = 200
 BUFFER_SIZE = 100000
 RANDOM_SEED = 51234
 MINIBATCH_SIZE = 64  # 32 # 5
 with tf.Session() as sess:
     np.random.seed(RANDOM_SEED)
     tf.set_random_seed(RANDOM_SEED)
     env = gym.make(ENV_NAME)
     state_dim = np.size(env.reset())  #2 #env.observation_space.shape[0]
     action_dim = 1  #env.action_space.shape[0]
     ddpg = TD3(sess,
                state_dim,
                action_dim,
                max_action,
                min_action,
                ACTOR_LEARNING_RATE,
                CRITIC_LEARNING_RATE,
                TAU,
                RANDOM_SEED,
                device=DEVICE)
     sess.run(tf.global_variables_initializer())
     ddpg.load()
     replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)
     ruido = OUNoise(action_dim, mu=0.0)
     llegadas = 0
     init_state = np.zeros(state_dim)
     irradiancias = list(
         [1000.]
     )  #list([1000., 500., 1000., 500., 900., 600., 800., 400., 100.]) #irradiancias = list([1000., 1000., 800., 700.]) #list([100., 200., 300., 400., 500., 600., 700., 800., 900., 1000])
     temperaturas = list(
         [25.]
Example #11
    drone.load_level(params["level_name"])

    return drone


################################################################################
# Exec
env = init_drone_env(params)
env.start_race(params["race_tier"])
env.initialize_drone()
env.takeoff_with_moveOnSpline()
env.get_ground_truth_gate_poses()
print([[en.position.x_val, en.position.y_val, en.position.z_val]
       for en in env.gate_poses_ground_truth])
raise

state_dim = env.observation_space[0]
action_dim = env.action_space[0]
max_action = float(env.action_high)

total_timesteps = 0
timesteps_since_eval = 0
episode_num = 0
done = True
writer = SummaryWriter(log_dir="./logs")

policy = TD3(state_dim, action_dim, max_action, env)

eval(policy, env, writer, params)

time.sleep(2.0)
Example #12
            'rl_method': args.rl_method,
            'delayed_reward_threshold': args.delayed_reward_threshold,
            'net': args.net,
            'num_steps': args.num_steps,
            'lr': args.lr,
            'output_path': output_path,
            'reuse_models': args.reuse_models
        }

        # Start reinforcement learning
        learner = None
        common_params.update({
            'stock_code': stock_code,
            'chart_data': chart_data,
            'training_data': training_data,
            'min_trading_unit': min_trading_unit,
            'max_trading_unit': max_trading_unit
        })
        if args.rl_method == 'td3':
            learner = TD3(
                **{
                    **common_params, 'value_network_path': value_network_path,
                    'policy_network_path': policy_network_path
                })
        if learner is not None:
            learner.run(balance=args.balance,
                        num_epoches=args.num_epoches,
                        discount_factor=args.discount_factor,
                        start_epsilon=args.start_epsilon,
                        learning=args.learning)
Example #13
                    help='number of simulation steps per update (default: 1)')
parser.add_argument('--dir', default="runs",
                    help='logging directory in which to create the folder containing tensorboard and logging files')
parser.add_argument('--cuda', action="store_true",
                    help='run on CUDA (default: False)')
args = parser.parse_args()

env = gym.make(args.env)

env.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
torch.backends.cudnn.deterministic=True

agent = TD3(env.observation_space.shape[0], env.action_space.shape[0], env.action_space, args)
LOG_DIR = '{}/{}_TD3_{}'.format(args.dir, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"), args.env)
writer = SummaryWriter(logdir=LOG_DIR)
LOG = Logger(LOG_DIR)
LOG.create("q_values")
LOG.create("estimated_r")
LOG.create("test_reward")
LOG.create("train_reward")

total_numsteps = 0
for i_episode in itertools.count(1):

    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()
Example #14
                   hidden_dim=crit_hid_dim,
                   output_non_linearity=crit_out_non_linear)

# Agent
lr = 3e-5
gamma = 0.99
tau = 0.01
policy_freq = 2
rb_max_size = 1e6
rb_batch_size = 64

agent = TD3(actor,
            critic,
            reward_fun,
            gamma=gamma,
            tau=tau,
            policy_freq=policy_freq,
            max_buffer_size=rb_max_size,
            batch_size=rb_batch_size,
            lr=lr)

# Training
show = False

train_agent(agent,
            desc,
            file_name,
            runs,
            episodes,
            time_steps,
            test_episodes,
Example #15
episode_length = deque(maxlen=10)

kwargs = {
    "state_dim": state_dim,
    "action_dim": action_dim,
    "max_action": max_action,
    "discount": args.discount,
    "tau": args.tau,
    "policy": args.policy
}

# Target policy smoothing is scaled wrt the action scale
kwargs["policy_noise"] = args.policy_noise * max_action
kwargs["noise_clip"] = args.noise_clip * max_action
kwargs["policy_freq"] = args.policy_freq
policy = TD3.TD3(**kwargs)

replay_buffer = ReplayBuffer(state_dim, action_dim, max_size=int(1e5))

# Evaluate untrained policy
evaluations = [eval_policy(policy, args.env_name, args.seed)]

state, done = env.reset(), False
episode_reward = 0
episode_timesteps = 0
episode_num = 0

for t in range(int(args.max_timesteps)):

    episode_timesteps += 1
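
In Example #15, policy_noise and noise_clip are multiplied by max_action because TD3's target policy smoothing is defined relative to the action scale, as the comment above notes. A minimal sketch of how such pre-scaled values are typically applied when forming the target action, using a hypothetical actor_target network rather than this repository's internals:

import torch

def smoothed_target_action(next_state, actor_target, max_action, policy_noise, noise_clip):
    # Hypothetical helper: policy_noise and noise_clip are assumed to be
    # pre-scaled by max_action, as in the kwargs above.
    with torch.no_grad():
        next_action = actor_target(next_state)
        noise = (torch.randn_like(next_action) * policy_noise).clamp(-noise_clip, noise_clip)
        return (next_action + noise).clamp(-max_action, max_action)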
Example #16
import gym

if __name__ == '__main__':
    env_id = 'Pendulum-v0'  #Pendulum-v0, MountainCarContinuous-v0
    env = gym.make(env_id)
    agent = TD3(env,
                h_layers=[64, 64],
                seed=0,
                steps_per_epoch=4000,
                epochs=10,
                max_ep_len=1000,
                batch_size=100,
                start_steps=10000,
                update_after=1000,
                update_every=50,
                replay_size=int(1e5),
                gamma=0.99,
                polyak=0.995,
                lr_a=1e-3,
                lr_c=1e-3,
                act_noise=0.1,
                target_noise=0.2,
                noise_clip=0.5,
                policy_delay=2,
                save_freq=1,
                save_path='./checkpoints/')

    # training
    agent.train()

    #test
Example #17
def main(env_name, low_list, high_list):
    sess = tf.Session()
    K.set_session(sess)

    # Define environment
    env = gym.make(env_name)

    td3 = TD3(env,
              sess,
              low_action_bound_list=low_list,
              high_action_bound_list=high_list)

    # Main loop
    num_episodes = 2000
    max_episode_len = 1000

    scores_deque = deque(maxlen=50)
    for i in range(num_episodes):
        total_reward = 0

        current_state = env.reset()

        for step in range(max_episode_len):
            current_state = current_state.reshape((1, td3.state_dim))
            action = td3.act(i, current_state)
            if td3.action_dim == 1:
                action = action.reshape((1, td3.action_dim))
            elif td3.action_dim > 1:
                action = action.reshape((1, td3.action_dim))[0]

            next_state, reward, done, info = env.step(action)
            next_state = next_state.reshape((1, td3.state_dim))
            total_reward += reward

            td3.replay_buffer.add(current_state, action, reward, next_state,
                                  done)
            current_state = next_state

            td3.train_critic()

            # Delayed training for policy
            if (step % 2) == 0:
                td3.train_actor()
                td3.update_target_models()

            if done:
                break

        scores_deque.append(total_reward)
        score_average = np.mean(scores_deque)

        print('Episode {}, Reward {}, Avg reward:{}'.format(
            i, total_reward, score_average))

        if score_average >= -300:

            td3.actor_model.save_weights('model_{}.h5'.format(env_name))

            # Display when finished
            current_state = env.reset()
            for step in range(1000):
                env.render()
                current_state = current_state.reshape((1, td3.state_dim))
                action = td3.act(i, current_state)
                if td3.action_dim == 1:
                    action = action.reshape((1, td3.action_dim))
                elif td3.action_dim > 1:
                    action = action.reshape((1, td3.action_dim))[0]

                next_state, reward, done, info = env.step(action)
                next_state = next_state.reshape((1, td3.state_dim))

                current_state = next_state

                if done:
                    break

            break
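
Example #17 delays the policy update by training the actor and target models only on every second environment step (step % 2 == 0) while the critic trains on every step. Many TD3 implementations count critic updates instead of environment steps; a minimal sketch of that counter pattern, with hypothetical train_critic, train_actor and update_targets callables:

policy_delay = 2
total_updates = 0

def delayed_update(train_critic, train_actor, update_targets):
    # Hypothetical sketch: the critic trains on every call, while the actor
    # and target networks are refreshed only every policy_delay critic updates.
    global total_updates
    total_updates += 1
    train_critic()
    if total_updates % policy_delay == 0:
        train_actor()
        update_targets()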
Example #18
# Add this config line so that right-clicking does not put a red point
Config.set('input', 'mouse', 'mouse,multitouch_on_demand')
Config.set('graphics', 'resizable', False)
Config.set('graphics', 'width', '1429')
Config.set('graphics', 'height', '660')

# Introducing last_x and last_y, used to keep the last point in memory when we draw the sand on the map
last_x = 0
last_y = 0
n_points = 0
length = 0
max_action = 45

# Getting our AI, which we call "brain", containing the neural network that represents our Q-function
#brain = Dqn(5,3,0.9)
brain = TD3(5, 1, max_action)
#action2rotation = [0,5,-5]
replay_buffer = ReplayBuffer()
last_reward = 0
scores = []
im = CoreImage("./images/MASK1.png")

# Initializing the map
first_update = True


def init():
    global sand
    global goal_x
    global goal_y
    global first_update
Example #19
 def __init__(self,
         policy_primitive_learning_rate, policy_movement_learning_rate, policy_model_arch,
         critic_learning_rate, critic_model_arch,
         target_smoothing_stddev, tau, exploration_prob,
         state_size, action_size, goal_size, n_simulations,
         movement_exploration_prob_ratio,
         policy_bottleneck_size, policy_default_layer_size, critic_default_layer_size):
     self.movement_exploration_prob_ratio = movement_exploration_prob_ratio
     full_policy_model = keras.models.model_from_yaml(
         policy_model_arch.pretty(resolve=True),
         custom_objects=custom_objects
     )
     if not isinstance(full_policy_model.layers[-1], NormalNoise):
         raise ValueError("Last layer of the policy must be of type NormalNoise")
     noise_layers_indices = [
         i for i, layer in enumerate(full_policy_model.layers)
         if isinstance(layer, NormalNoise)
     ]
     if len(noise_layers_indices) > 2:
         raise ValueError("More than 2 NormalNoise layers have been found in the policy")
     self.has_movement_primitive = len(noise_layers_indices) == 2
     if self.has_movement_primitive:
         primitive_policy_model = keras.models.Sequential(
             full_policy_model.layers[
             :noise_layers_indices[0] + 1
         ])
         movement_policy_model = keras.models.Sequential(
             full_policy_model.layers[
             noise_layers_indices[0] + 1:noise_layers_indices[1] + 1
         ])
         self.primitive_size = primitive_policy_model.layers[-2].units
         self.primitive_td3 = TD3(
             policy_learning_rate=policy_primitive_learning_rate,
             policy_model=primitive_policy_model,
             critic_learning_rate=critic_learning_rate,
             critic_model=keras.models.model_from_yaml(
                 critic_model_arch.pretty(resolve=True),
                 custom_objects=custom_objects
             ),
             target_smoothing_stddev=target_smoothing_stddev,
             tau=tau,
             policy_state_size=state_size + goal_size,
             critic_state_size=state_size + goal_size,
             action_size=self.primitive_size,
             n_simulations=n_simulations,
         )
         self.movement_td3 = TD3(
             policy_learning_rate=policy_movement_learning_rate,
             policy_model=movement_policy_model,
             critic_learning_rate=critic_learning_rate,
             critic_model=keras.models.model_from_yaml(
                 critic_model_arch.pretty(resolve=True),
                 custom_objects=custom_objects
             ),
             target_smoothing_stddev=target_smoothing_stddev,
             tau=tau,
             policy_state_size=self.primitive_size,
             critic_state_size=state_size + goal_size,
             action_size=int(action_size),
             n_simulations=n_simulations,
         )
     else:
         movement_policy_model = full_policy_model
         self.primitive_td3 = None
         self.primitive_size = None
         self.movement_td3 = TD3(
             policy_learning_rate=policy_movement_learning_rate,
             policy_model=movement_policy_model,
             critic_learning_rate=critic_learning_rate,
             critic_model=keras.models.model_from_yaml(
                 critic_model_arch.pretty(resolve=True),
                 custom_objects=custom_objects
             ),
             target_smoothing_stddev=target_smoothing_stddev,
             tau=tau,
             policy_state_size=state_size + goal_size,
             critic_state_size=state_size + goal_size,
             action_size=int(action_size),
             n_simulations=n_simulations,
         )
     self.n_simulations = n_simulations
     self.exploration_prob = exploration_prob
Example #20
    else:
        raise Exception('Unknown env')

    obs_size, act_size = env.observation_space.shape[
        0], env.action_space.shape[0]

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    env.seed(args.seed)
    env.action_space.seed(args.seed)
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # if args.agent == 'td3':
    agent = TD3(device, obs_size, act_size)
    train(agent, env, n_episodes=100, n_random_episodes=10)
    plt.plot(episode_list, reward_list, label='td3')
    #plt.savefig('ddpg_stoch2.png')
    # elif args.agent == 'ddpg':
    np.save('td3_episode_list.npy', episode_list)
    np.save('td3_reward_list.npy', reward_list)

    episode_list = []
    reward_list = []
    agent = DDPG(device, obs_size, act_size)
    train(agent, env, n_episodes=100, n_random_episodes=10)
    plt.plot(episode_list, reward_list, label='ddpg')

    plt.legend()
    plt.savefig('ddpg_stoch2.png')
Example #21
def main(env_name, seed, hyper_params):
    env = gym.make(env_name)

    state_dim = sum(list(env.observation_space.shape))
    action_dim = sum(list(env.action_space.shape))
    action_max = float(env.action_space.high[0])

    torch.manual_seed(seed)
    np.random.seed(seed)
    env.seed(seed)
    env.action_space.np_random.seed(seed)

    device = torch.device('cuda' if hyper_params['use_cuda'] else 'cpu')

    kwargs = {
        'device': device,
        'state_dim': state_dim,
        'action_dim': action_dim,
        'action_max': action_max,
        'gamma': hyper_params['gamma'],
        'tau': hyper_params['tau'],
        'lr': hyper_params['lr'],
        'policy_noise': hyper_params['policy_noise'] * action_max,
        'noise_clip': hyper_params['noise_clip'] * action_max,
        'exploration_noise': hyper_params['exploration_noise'] * action_max,
        'policy_freq': hyper_params['policy_freq']
    }

    agent = TD3(**kwargs)
    replay_buffer = ReplayBuffer(state_dim, action_dim, device, max_size=int(1e6))

    file_dir = os.path.abspath(os.path.dirname(__file__))
    save_dir = os.path.join(
        file_dir,
        'results',
        env_name,
        'seed' + str(seed)
    )
    os.makedirs(save_dir, exist_ok=True)

    evals = [eval_policy(agent.rollout_actor, env_name, seed)]

    state = env.reset()
    episode_reward = 0
    episode_time_step = 0
    episode_num = 0

    episode_start = time.time()
    for t in range(hyper_params['max_time_step']):
        episode_time_step += 1

        if t < hyper_params['initial_time_step']:
            action = env.action_space.sample()
        else:
            action = agent.rollout_actor.select_action(state)

        next_state, reward, done, _ = env.step(action)
        done_buffer = done if episode_time_step < env._max_episode_steps else False
        replay_buffer.add(state, next_state, action, reward, done_buffer)

        state = next_state
        episode_reward += reward

        if t >= hyper_params['initial_time_step']:
            agent.train(replay_buffer, batch_size=hyper_params['batch_size'])

        if done:
            print(f'Total T: {t + 1} Episode Num: {episode_num + 1} Reward: {episode_reward:.3f}',
                  f'(Frame/sec {episode_time_step / (time.time() - episode_start):.3f})')
            # Reset environment
            state = env.reset()
            episode_reward = 0
            episode_time_step = 0
            episode_num += 1
            episode_start = time.time()

        if (t + 1) % hyper_params['eval_freq'] == 0:
            test_start = time.time()
            # test policy
            evals.append(eval_policy(agent.rollout_actor, env_name, seed))
            test_time = time.time() - test_start
            episode_start += test_time

    evals = np.array(evals)
    np.savetxt(os.path.join(save_dir, 'Episode_Rewards.txt'), evals)
    plt.figure()
    time_step = np.arange(len(evals)) * hyper_params['eval_freq']
    plt.plot(time_step, evals)
    plt.xlabel('Time Steps')
    plt.ylabel('Episode Rewards')
    plt.grid()
    file_name = 'Episode_Rewards.png'
    file_path = os.path.join(save_dir, file_name)
    plt.savefig(file_path)
    plt.close()

    model_path = os.path.join(save_dir, 'learned_model')
    os.makedirs(model_path, exist_ok=True)
    agent.save(model_path)
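
The line done_buffer = done if episode_time_step < env._max_episode_steps else False in Example #21 masks the terminal flag when an episode ends only because the Gym time limit was reached, so the critic keeps bootstrapping through the truncation instead of treating it as a true terminal state. A minimal sketch of the effect on the critic target, with generic names rather than this repository's API:

def critic_target(reward, next_q_min, done_flag, gamma=0.99):
    # done_flag is 0.0 for a time-limit truncation, so the bootstrap term
    # gamma * min(Q1', Q2') is kept; it is 1.0 only for a true terminal state.
    return reward + gamma * (1.0 - done_flag) * next_q_min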
Example #22
def main():
    env = gym.make('BipedalWalker-v3')

    # set seed for reproducible results
    seed = 1
    env.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)

    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    buffer_size = 1000000
    batch_size = 100
    noise = 0.1

    # Uncomment to use the GPU; errors may occur if the GPU is no longer supported.
    # device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    device = torch.device("cpu")

    policy = TD3(state_dim, action_dim, max_action, env, device)

    try:
        print("Loading previous model")
        policy.load()
    except Exception as e:
        print('No previous model to load. Training from scratch.')

    buffer = ExperienceReplay(buffer_size, batch_size, device)

    save_score = 400
    episodes = 650
    timesteps = 2000

    best_reward = -1 * sys.maxsize
    scores_over_episodes = []

    for episode in range(episodes):
        avg_reward = 0
        state = env.reset()
        for i in range(timesteps):
            # Same as the TD3, select an action and add noise:
            action = policy.select_action(state) + np.random.normal(
                0, max_action * noise, size=action_dim)
            action = action.clip(env.action_space.low, env.action_space.high)
            # Make an action.
            next_state, reward, done, _ = env.step(action)
            buffer.store_transition(state, action, reward, next_state, done)
            state = next_state
            avg_reward += reward
            env.render()
            if (len(buffer) > batch_size):
                policy.train(buffer, i)
            if (done or i > timesteps):
                scores_over_episodes.append(avg_reward)
                print('Episode ', episode, 'finished with reward:', avg_reward)
                print('Finished at timestep ', i)
                break

        if (np.mean(scores_over_episodes[-50:]) > save_score):
            print('Saving agent- past 50 scores gave better avg than ',
                  save_score)
            best_reward = np.mean(scores_over_episodes[-50:])
            save_score = best_reward
            policy.save()
            break  # Saved agent. Break out of episodes and end, 400 is pretty good.

        if (episode >= 0 and avg_reward > best_reward):
            print(
                'Saving agent- score for this episode was better than best-known score..'
            )
            best_reward = avg_reward
            policy.save()  # Save current policy + optimizer

    fig = plt.figure()
    plt.plot(np.arange(1, len(scores_over_episodes) + 1), scores_over_episodes)
    plt.ylabel('Score')
    plt.xlabel('Episode #')
    plt.show()
Example #23
    # action noise
    if args.ou_noise:
        a_noise = OrnsteinUhlenbeckProcess(action_dim,
                                           mu=args.ou_mu,
                                           theta=args.ou_theta,
                                           sigma=args.ou_sigma)
    else:
        a_noise = GaussianNoise(action_dim, sigma=args.gauss_sigma)

    for run in range(7, 8):  #args.nbRuns):
        memory = Memory(args.mem_size, state_dim, action_dim)
        # agent
        if args.use_td3:
            print("RUNNING : TD3")
            #TD3
            agent = TD3(state_dim, action_dim, max_action, memory, args)
        else:
            print("RUNNING : DDPG")
            #DDPG
            agent = DDPG(state_dim, action_dim, max_action, memory, args)

        if args.mode == 'train':
            train(run,
                  n_episodes=args.n_episodes,
                  output=args.output,
                  debug=args.debug,
                  render=False)  # hard-coded change

        else:
            raise RuntimeError('undefined mode {}'.format(args.mode))
Example #24
from tqdm import trange
from IPython.display import clear_output

from td3 import TD3
import tensorflow as tf
from cpprb import ReplayBuffer, PrioritizedReplayBuffer

BUFFER_SIZE = int(1e5)
STATE_DIM = 5
ACTION_DIM = 1
BATCH_SIZE = 256

env = suite.load(domain_name='cartpole', task_name='swingup')
action_spec = env.action_spec()

agent = TD3(STATE_DIM, ACTION_DIM, max_action=action_spec.maximum)
print('Running on ', agent.device)

rb = ReplayBuffer(BUFFER_SIZE, {"obs": {"shape": (STATE_DIM,)},
                               "act": {"shape": ACTION_DIM},
                               "rew": {},
                               "next_obs": {"shape": (STATE_DIM,)},
                               "done": {}})

n_episodes=3; max_t=1e3; print_every=2
scores_deque = deque(maxlen=print_every)
scores = []

for i_episode in trange(1, int(n_episodes)+1):
    
    time_step = env.reset()