Example #1
 def __init__(self, actor_state_size, actor_action_size, critic_state_size, critic_action_size, **kwargs):
   
   if 'filename' in kwargs.keys(): 
     data= torch.load(kwargs['filename'])
     self.config= data["config"]
     self.scores= data["scores"]
   elif 'config' in kwargs.keys():
     self.config= kwargs['config']
     data= {}
     self.scores= []
   else:
     raise OSError('DDPG: no configuration parameter in class init')
     
       
   self.actor_state_size = actor_state_size
   self.actor_action_size = actor_action_size
   self.critic_state_size = critic_state_size
   self.critic_action_size = critic_action_size
   memory_size = self.config.get("memory_size", 100000)
   actor_lr = self.config.get("actor_lr", 1e-3)
   critic_lr = self.config.get("critic_lr", 1e-3)
   self.batch_size = self.config.get("batch_size", 256)
   self.discount = self.config.get("discount", 0.9)
   sigma = self.config.get("sigma", 0.2)
   self.tau= self.config.get("tau", 0.001)
   self.seed = self.config.get("seed", 0)
   self.action_noise= self.config.get("action_noise", "No")
   self.critic_l2_reg= self.config.get("critic_l2_reg", 0.0)
   random.seed(self.seed)
   torch.manual_seed(self.seed)
   
   param_noise= False
   if self.action_noise== "Param": param_noise= True
   
   self.actor = Actor(actor_state_size, actor_action_size, nodes= self.config["actor_nodes"], seed= self.seed, param_noise= param_noise).to(device)
   self.critic = Critic(critic_state_size, critic_action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device)
   self.targetActor = Actor(actor_state_size, actor_action_size, nodes= self.config["actor_nodes"], seed= self.seed, param_noise= param_noise).to(device)
   self.targetCritic = Critic(critic_state_size, critic_action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device)
   # Initialize parameters
   self.hard_update(self.actor, self.targetActor)
   self.hard_update(self.critic, self.targetCritic)
       
   self.actor_optimizer = optim.Adam(self.actor.parameters(), lr= actor_lr)
   self.critic_optimizer = optim.Adam(self.critic.parameters(), lr= critic_lr, weight_decay= self.critic_l2_reg)
    self.criticLoss = nn.MSELoss()  # alternative: nn.SmoothL1Loss()
   
    self.noise = NoNoise()  # default: no exploration noise
   if self.action_noise== "OU":
     self.noise = OUNoise(np.zeros(actor_action_size), sigma= sigma)
   elif self.action_noise== "No":
     self.noise = NoNoise()
   elif self.action_noise== "Normal":
     self.noise = NormalActionNoise(np.zeros(actor_action_size), sigma= sigma)
     
   self.memory = Memory(memory_size, self.batch_size, self.seed)
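Example #1 selects between three noise helpers (NoNoise, NormalActionNoise, OUNoise) that the snippet does not define. A minimal sketch of what such helpers might look like, assuming only the constructor and call signatures used above, is:

import numpy as np

class NoNoise:
    """No exploration noise: always returns zero."""
    def __call__(self):
        return 0.0
    def reset(self):
        pass

class NormalActionNoise:
    """Uncorrelated Gaussian noise, one sample per action dimension."""
    def __init__(self, mu, sigma=0.2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma = sigma
    def __call__(self):
        return np.random.normal(self.mu, self.sigma)
    def reset(self):
        pass

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""
    def __init__(self, mu, sigma=0.2, theta=0.15, dt=1e-2):
        self.mu = np.asarray(mu, dtype=np.float64)
        self.sigma, self.theta, self.dt = sigma, theta, dt
        self.reset()
    def __call__(self):
        self.x = (self.x
                  + self.theta * (self.mu - self.x) * self.dt
                  + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        return self.x
    def reset(self):
        self.x = np.zeros_like(self.mu)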
Example #2
 def __init__(self, **kwargs):
   
   if 'filename' in kwargs.keys(): 
     data= torch.load(kwargs['filename'])
     self.config= data["config"]
     self.scores= data["scores"]
   elif 'config' in kwargs.keys():
     self.config= kwargs['config']
     data= {}
     self.scores= []
   else:
     raise OSError('DDPG: no configuration parameter in class init')
     
       
   self.state_size = self.config["state_size"]
   self.action_size = self.config["action_size"]
   memory_size = self.config["memory_size"]
   actor_lr = self.config["actor_lr"]
   critic_lr = self.config["critic_lr"]
   self.batch_size = self.config["batch_size"]
   self.discount = self.config["discount"]
   sigma = self.config["sigma"] if self.config["sigma"] else 0.2
   self.tau= self.config["tau"]
   self.seed = self.config["seed"] if self.config["seed"] else 0
   self.action_noise= self.config["action_noise"] if self.config["action_noise"] else "No"
   self.critic_l2_reg= self.config["critic_l2_reg"] if self.config["critic_l2_reg"] else 0.0
   random.seed(self.seed)
   torch.manual_seed(self.seed)
   
   self.actor = Actor(self.state_size, self.action_size, nodes= self.config["actor_nodes"], seed= self.seed).to(device)
   if 'actor' in data.keys(): self.actor.load_state_dict(data['actor'])
   self.critic = Critic(self.state_size, self.action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device)
   self.targetActor = Actor(self.state_size, self.action_size, nodes= self.config["actor_nodes"], seed= self.seed).to(device)
   self.targetCritic = Critic(self.state_size, self.action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device)
   # Initialize parameters
   self.hard_update(self.actor, self.targetActor)
   self.hard_update(self.critic, self.targetCritic)
   
   self.actor_optimizer = optim.Adam(self.actor.parameters(), lr= actor_lr)
   self.critic_optimizer = optim.Adam(self.critic.parameters(), lr= critic_lr, weight_decay= self.critic_l2_reg)
   self.criticLoss = nn.MSELoss()
   self.noise= None
   if self.action_noise== "OU":
     self.noise = OUNoise(np.zeros(self.action_size), sigma= sigma)
   elif self.action_noise== "No":
     self.noise = NoNoise()
   elif self.action_noise== "Normal":
     self.noise = NormalActionNoise(np.zeros(self.action_size), sigma= sigma)
     
     
   self.memory = ReplayBuffer(self.action_size, memory_size, self.batch_size, self.seed)
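Both constructors call self.hard_update to copy the online networks into their targets and later rely on tau for soft updates; the helpers themselves are not shown. A sketch under the assumption that the source network is passed first (the argument order is reversed in Example #9, so treat this as illustrative only):

import torch

def hard_update(source: torch.nn.Module, target: torch.nn.Module) -> None:
    """Copy every parameter of the source network into the target network."""
    target.load_state_dict(source.state_dict())

def soft_update(source: torch.nn.Module, target: torch.nn.Module, tau: float) -> None:
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)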
Example #3
def eval_from_checkpoint(env, config, run=0):
    """
    Restore a trained TD3/DDPG agent from a checkpoint, record an evaluation run, and close the environment.
    """

    experience = ReplayBuffer()
    noise = NormalActionNoise(0, 0.1, size=env.action_space.shape[0])

    # initialize
    agent = TD3DDPG(env, config, experience, action_noise=noise, run=run)
    agent.initialize()

    agent.restore()
    agent.record()
    agent.close()
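Note that Examples #3 and #6 construct the action noise as NormalActionNoise(0, 0.1, size=env.action_space.shape[0]), i.e. scalar mean and standard deviation plus an explicit size, unlike the vector-valued (mu, sigma) form used elsewhere in this page. A hedged sketch of that variant:

import numpy as np

class NormalActionNoise:
    """Gaussian action noise with scalar mean/std and an explicit sample size."""
    def __init__(self, mu: float, sigma: float, size: int = 1):
        self.mu, self.sigma, self.size = mu, sigma, size
    def __call__(self):
        return np.random.normal(self.mu, self.sigma, size=self.size)
    def reset(self):
        pass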
Example #4
def ddpg(episode, breaking_step, reward_name):
    env = gym.make('AntPyBulletEnv-v0')
    cumulus_steps = 0
    episode_steps = 0

    # randomly initialize critics and actor with weights and biases
    q1 = CriticNN()
    q1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    q2 = CriticNN()
    q2.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    mu = ActorNN(env.action_space.shape[0])
    mu.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # initialize target networks
    q1_target = CriticNN()
    q1_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    q2_target = CriticNN()
    q2_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    mu_target = ActorNN(env.action_space.shape[0])
    mu_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    q1_target, q2_target, mu_target = update_network_parameters(
        q1, q1_target, q2, q2_target, mu, mu_target, 0.005)

    # initialize replay buffer (actor/critic start training only once batch_size transitions are stored)
    replay_buffer = ReplayBuffer(1000000, env.observation_space.shape[0],
                                 env.action_space.shape[0])

    performance = []
    avg_return = []
    time_step_reward = []
    avg_time_step_reward = []
    a_c = 0
    b_c = 0
    c_c = 0
    d_c = 0
    e_c = 0
    f_c = 0
    for e in range(episode):

        # receive initial observation state s1 (observation = s1)
        # env.render()
        observation = env.reset()
        state = tf.convert_to_tensor([observation], dtype=tf.float32)

        max_steps = 1000
        min_action = env.action_space.low[0]
        max_action = env.action_space.high[0]
        update_frequency = 2
        learn_count = 0
        score = 0
        for i in range(max_steps):

            # select an action a_t = mu(state) + noise
            noise = NormalActionNoise(0, 0.1)
            action = mu(state) + np.random.normal(noise.mean, noise.sigma)
            proto_tensor = tf.make_tensor_proto(action)
            action = tf.make_ndarray(proto_tensor)
            action = action[0]

            # execute action a_t and observe reward, and next state
            next_state, reward, done, _ = env.step(action)

            # store transition in replay buffer
            replay_buffer.store_transition(state, action, reward, next_state,
                                           done)

            # if there are enough transitions in the replay buffer
            batch_size = 100
            if replay_buffer.mem_cntr >= batch_size:

                # sample a random mini-batch of batch_size=100 transitions
                buff_state, buff_action, buff_reward, buff_next_state, buff_done = replay_buffer.sample_buffer(
                    batch_size)

                states = tf.convert_to_tensor(buff_state, dtype=tf.float32)
                next_states = tf.convert_to_tensor(buff_next_state,
                                                   dtype=tf.float32)
                rewards = tf.convert_to_tensor(buff_reward, dtype=tf.float32)
                actions = tf.convert_to_tensor(buff_action, dtype=tf.float32)

                # train critics
                with tf.GradientTape(persistent=True) as tape:

                    # calculate which actions target_actor chooses and add noise
                    target_actions = mu_target(next_states) + tf.clip_by_value(
                        np.random.normal(scale=0.2), -0.5, 0.5)
                    target_actions = tf.clip_by_value(target_actions,
                                                      min_action, max_action)

                    # calculate next_q_values of the critic by feeding the next state and from actor chosen actions
                    next_critic_value1 = tf.squeeze(
                        q1_target(next_states, target_actions), 1)
                    next_critic_value2 = tf.squeeze(
                        q2_target(next_states, target_actions), 1)

                    # calculate q values of critic actual state
                    critic_value1 = tf.squeeze(q1(states, actions), 1)
                    critic_value2 = tf.squeeze(q2(states, actions), 1)

                    # use smaller q value from the 2 critics
                    next_critic_value = tf.math.minimum(
                        next_critic_value1, next_critic_value2)

                    # calculate target values: yt = rt + gamma * q_target(s_t+1, mu_target(s_t+1)); with t = time step
                    y = rewards + 0.99 * next_critic_value * (1 - buff_done)

                    # calculate the loss between critic and target_critic
                    critic1_loss = keras.losses.MSE(y, critic_value1)
                    critic2_loss = keras.losses.MSE(y, critic_value2)

                # update critics by minimizing the loss (critic_loss) using the Adam optimizer
                critic1_network_gradient = tape.gradient(
                    critic1_loss, q1.trainable_variables)
                critic2_network_gradient = tape.gradient(
                    critic2_loss, q2.trainable_variables)
                q1.optimizer.apply_gradients(
                    zip(critic1_network_gradient, q1.trainable_variables))
                q2.optimizer.apply_gradients(
                    zip(critic2_network_gradient, q2.trainable_variables))

                learn_count += 1

                # train actor
                if learn_count % update_frequency == 0:
                    with tf.GradientTape() as tape:
                        new_policy_actions = mu(states)
                        # minus sign: gradient ascent on Q implemented as descent on -Q
                        actor_loss = -q1(states, new_policy_actions)
                        actor_loss = tf.math.reduce_mean(actor_loss)

                    # update the actor policy using the sampled policy gradient
                    actor_network_gradient = tape.gradient(
                        actor_loss, mu.trainable_variables)
                    mu.optimizer.apply_gradients(
                        zip(actor_network_gradient, mu.trainable_variables))

                    # update the target networks
                    update_network_parameters(q1, q1_target, q2, q2_target, mu,
                                              mu_target, 0.005)

            time_step_reward.append(reward)
            avg_time_step_reward_short = np.mean(time_step_reward[-50:])
            avg_time_step_reward.append(avg_time_step_reward_short)
            if done:
                performance.append(score)
                avg_reward = np.mean(performance[-50:])
                avg_return.append(avg_reward)
                cumulus_steps += i
                print(
                    "episode: {}/{}, score: {}, avg_score: {}, ep_steps: {}, cumulus_steps: {}"
                    .format(e, episode, score, avg_reward, i, cumulus_steps))

                if 10000 < cumulus_steps < 11000 and a_c == 0:
                    a_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                if 150000 < cumulus_steps < 151000 and b_c == 0:
                    b_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                # if 350000 < cumulus_steps < 351000 and c_c == 0:
                #     c_c = 1
                #     if not os.path.exists("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}".format(reward_name)):
                #         os.mkdir("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}".format(reward_name))
                #     mu.save_weights("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5".format(reward_name, cumulus_steps))
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}".format(reward_name, cumulus_steps), avg_return)
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}".format(reward_name, cumulus_steps), time_step_reward)
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}".format(reward_name, cumulus_steps), performance)
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}".format(reward_name, cumulus_steps), avg_time_step_reward)

                if 550000 < cumulus_steps < 551000 and d_c == 0:
                    d_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                if 750000 < cumulus_steps < 751000 and e_c == 0:
                    e_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                if 1000000 < cumulus_steps < 1001000 and f_c == 0:
                    f_c = 1
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)
                break

            score += reward
            state = tf.convert_to_tensor([next_state], dtype=tf.float32)

        # stop learning after certain time steps
        if cumulus_steps > breaking_step:
            break

    return avg_return, mu, performance, time_step_reward, avg_time_step_reward
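The ddpg() function above relies on update_network_parameters to initialize and then soft-update the target networks, but the helper is not shown. A minimal sketch consistent with the call sites (Keras models, Polyak rate tau), offered as an assumption rather than the original implementation:

def update_network_parameters(q1, q1_target, q2, q2_target, mu, mu_target, tau):
    """Polyak-average each online Keras network into its target: target <- tau*online + (1-tau)*target."""
    for online, target in ((q1, q1_target), (q2, q2_target), (mu, mu_target)):
        target.set_weights([tau * w + (1.0 - tau) * tw
                            for w, tw in zip(online.get_weights(), target.get_weights())])
    return q1_target, q2_target, mu_target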
Example #5
def learn(
        env,
        seed=None,
        total_timesteps=None,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_epoch_cycles=20,
        nb_rollout_steps=100,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        nb_save_epochs=None,
        batch_size=64,  # per MPI worker
        tau=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        eval_env=None,
        load_path=None,
        save_dir=None,
        param_noise_adaption_interval=50,
        **network_kwargs):

    set_global_seeds(seed)

    if total_timesteps is not None:
        assert nb_epochs is None
        nb_epochs = int(total_timesteps) // (nb_epoch_cycles *
                                             nb_rollout_steps)
    else:
        nb_epochs = 500

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    network_spec = [{
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': 'tanh',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    vnetwork_spec = [{
        'layer_type': 'concat',
        'nodes_in': ['action_movement', 'observation_self'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': '',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    network = DdpgPolicy(scope="ddpg",
                         ob_space=env.observation_space,
                         ac_space=env.action_space,
                         network_spec=network_spec,
                         v_network_spec=vnetwork_spec,
                         stochastic=False,
                         reuse=False,
                         build_act=True,
                         trainable_vars=None,
                         not_trainable_vars=None,
                         gaussian_fixed_var=False,
                         weight_decay=0.0,
                         ema_beta=0.99999,
                         normalize_observations=normalize_observations,
                         normalize_returns=normalize_returns,
                         observation_range=observation_range)

    target_network = DdpgPolicy(scope="target",
                                ob_space=env.observation_space,
                                ac_space=env.action_space,
                                network_spec=network_spec,
                                v_network_spec=vnetwork_spec,
                                stochastic=False,
                                reuse=False,
                                build_act=True,
                                trainable_vars=None,
                                not_trainable_vars=None,
                                gaussian_fixed_var=False,
                                weight_decay=0.0,
                                ema_beta=0.99999,
                                normalize_observations=normalize_observations,
                                normalize_returns=normalize_returns,
                                observation_range=observation_range)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(mu=np.zeros(act_size),
                                                        sigma=float(stddev) *
                                                        np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = DDPG(network,
                 target_network,
                 memory,
                 env.observation_space,
                 env.action_space,
                 gamma=gamma,
                 tau=tau,
                 normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size,
                 action_noise=action_noise,
                 param_noise=param_noise,
                 critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr,
                 critic_lr=critic_lr,
                 enable_popart=popart,
                 clip_norm=clip_norm,
                 reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path != None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            if nenvs > 1:
                # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each
                # of the environments, so resetting here instead
                agent.reset()
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q, _, _ = agent.step(obs,
                                             apply_noise=True,
                                             compute_Q=True)

                # Execute next action.
                if rank == 0 and render:
                    env.render()

                # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch
                for k, v in action.items():
                    action[k] *= max_action

                nenvs_actions = []
                for i in range(nenvs):
                    nenv_action = {
                        'action_movement':
                        action['action_movement'][i * n_agents:(i + 1) *
                                                  n_agents]
                    }
                    nenvs_actions.append(nenv_action)
                new_obs, r, done, info = env.step(
                    nenvs_actions
                )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                # note these outputs are batched from vecenv

                t += 1
                if rank == 0 and render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(
                    obs, action, r, new_obs, done
                )  #the batched data will be unrolled in memory.py's append.

                obs = new_obs

                for d in range(len(done)):
                    if done[d]:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward[d])
                        episode_rewards_history.append(episode_reward[d])
                        epoch_episode_steps.append(episode_step[d])
                        episode_reward[d] = 0.
                        episode_step[d] = 0
                        epoch_episodes += 1
                        episodes += 1
                        if nenvs == 1:
                            agent.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            epoch_adaptive_distances = []
            for t_train in range(nb_train_steps):
                # Adapt param noise, if necessary.
                if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0:
                    distance = agent.adapt_param_noise()
                    epoch_adaptive_distances.append(distance)

                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                nenvs_eval = eval_obs.shape[0]
                eval_episode_reward = np.zeros(nenvs_eval, dtype=np.float32)
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q, _, _ = agent.step(eval_obs,
                                                           apply_noise=False,
                                                           compute_Q=True)
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action
                    )  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r

                    eval_qs.append(eval_q)
                    for d in range(len(eval_done)):
                        if eval_done[d]:
                            eval_episode_rewards.append(eval_episode_reward[d])
                            eval_episode_rewards_history.append(
                                eval_episode_reward[d])
                            eval_episode_reward[d] = 0.0

        if MPI is not None:
            mpi_size = MPI.COMM_WORLD.Get_size()
        else:
            mpi_size = 1

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        stats = agent.get_stats()
        combined_stats = stats.copy()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_std'] = np.std(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(
            episode_rewards_history)
        combined_stats['rollout/return_history_std'] = np.std(
            episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['train/param_noise_distance'] = np.mean(
            epoch_adaptive_distances)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        # Evaluation statistics.
        if eval_env is not None:
            combined_stats['eval/return'] = eval_episode_rewards
            combined_stats['eval/return_history'] = np.mean(
                eval_episode_rewards_history)
            combined_stats['eval/Q'] = eval_qs
            combined_stats['eval/episodes'] = len(eval_episode_rewards)

        def as_scalar(x):
            if isinstance(x, np.ndarray):
                assert x.size == 1
                return x[0]
            elif np.isscalar(x):
                return x
            else:
                raise ValueError('expected scalar, got %s' % x)

        combined_stats_sums = np.array(
            [np.array(x).flatten()[0] for x in combined_stats.values()])
        if MPI is not None:
            combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums)

        combined_stats = {
            k: v / mpi_size
            for (k, v) in zip(combined_stats.keys(), combined_stats_sums)
        }

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])

        if rank == 0:
            logger.dump_tabular()
        logger.info('')
        logdir = logger.get_dir()
        if rank == 0 and logdir:
            if hasattr(env, 'get_state'):
                with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                    pickle.dump(env.get_state(), f)
            if eval_env and hasattr(eval_env, 'get_state'):
                with open(os.path.join(logdir, 'eval_env_state.pkl'),
                          'wb') as f:
                    pickle.dump(eval_env.get_state(), f)

        if nb_save_epochs != None and (epoch + 1) % nb_save_epochs == 0:
            if save_dir == None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % epoch)
            print('Saving to', savepath)
            saver(savepath)

    return agent
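The noise_type argument in this learn() (and in Example #8) is a comma-separated list of "<kind>_<stddev>" tokens. A small illustration of the strings the parsing loop above accepts:

# 'none'                          -> no extra noise
# 'adaptive-param_0.2'            -> AdaptiveParamNoiseSpec(0.2, 0.2)
# 'normal_0.1'                    -> NormalActionNoise per action key, sigma 0.1
# 'ou_0.2'                        -> OrnsteinUhlenbeckActionNoise, sigma 0.2
# 'adaptive-param_0.2,normal_0.1' -> parameter noise plus Gaussian action noise
for token in 'adaptive-param_0.2,normal_0.1'.split(','):
    kind, stddev = token.strip().rsplit('_', 1)
    print(kind, float(stddev))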
Example #6
def learn(env,
          config,
          num_episodes=5000,
          num_eval_final=50,
          batch_size=100,
          seed=7,
          run=0,
          record=False):
    """
    Run the TD3/DDPG training procedure on env and return the collected statistics.
    """

    experience = ReplayBuffer()
    noise = NormalActionNoise(0, 0.1, size=env.action_space.shape[0])

    config.batch_size = batch_size

    # initialize
    agent = TD3DDPG(env, config, experience, action_noise=noise, run=run)
    agent.initialize()

    # record one game at the beginning
    # if agent.config.record:
    #     agent.record()
    # model

    # Evaluate untrained policy
    agent.evaluate_policy()

    total_timesteps = 0
    timesteps_since_eval = 0
    episode_num = 0
    done = True

    stats = {}
    stats["episode_rewards"] = []
    stats["evaluation_rewards"] = []
    stats["grad_norms"] = []  # TODO
    stats["cummulative_timesteps"] = []
    stats["episode_timesteps"] = []
    stats["eval_episode_timesteps"] = []
    stats["eval_cummulative_timesteps"] = []
    stats["num_episodes"] = num_episodes
    stats["num_eval_final"] = num_eval_final
    stats["seed"] = seed
    stats["agent"] = agent.agent_name
    stats["env"] = config.env_name

    while episode_num < num_episodes:  #total_timesteps < config.max_timesteps:

        if done:

            if total_timesteps != 0:
                print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".
                      format(total_timesteps, episode_num, episode_timesteps,
                             episode_reward))
                stats = agent.train(episode_timesteps, stats=stats)
                stats["episode_rewards"].append(episode_reward)
                stats["cummulative_timesteps"].append(total_timesteps)
                stats["episode_timesteps"].append(episode_timesteps)

            # # Evaluate episode
            # if timesteps_since_eval >= config.eval_freq:
            #     timesteps_since_eval %= config.eval_freq
            #     agent.evaluate_policy()

            # Reset environment
            observation = env.reset()
            done = False
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Select action randomly or according to policy
        if total_timesteps < config.start_timesteps:
            action = env.action_space.sample()
        else:
            action, _ = agent.act(np.array(observation),
                                  apply_noise=True,
                                  compute_q=False)
            action = np.squeeze(action, axis=1)

        # Perform action
        new_observation, reward, done, _ = env.step(action)
        # Mask terminations caused purely by the env time limit
        done_bool = False if episode_timesteps + 1 == env._max_episode_steps else done
        episode_reward += reward

        # Store data in replay buffer
        agent.add_experience(observation, action, reward, new_observation,
                             done_bool)

        observation = new_observation

        episode_timesteps += 1
        total_timesteps += 1
        timesteps_since_eval += 1

    # Final evaluation
    rewards, eval_cummulative_timesteps, eval_episode_timesteps = agent.evaluate_policy(
        eval_episodes=num_eval_final)
    stats["evaluation_rewards"] = rewards
    stats["eval_cummulative_timesteps"] = eval_cummulative_timesteps
    stats["eval_episode_timesteps"] = eval_episode_timesteps

    agent.save_model()
    if record:
        agent.record()
    agent.close()

    return stats
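A hedged usage sketch for the learn() above; the gym id and the get_td3_config helper are placeholders introduced for illustration, not part of the original snippet:

import gym
import numpy as np

env = gym.make('Pendulum-v1')                 # any continuous-action task
config = get_td3_config('Pendulum-v1')        # hypothetical repo-specific config loader
stats = learn(env, config, num_episodes=1000, batch_size=100, seed=7, record=False)
print('mean evaluation reward:', np.mean(stats['evaluation_rewards']))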
Example #7
def run(env_id, seed, noise_type, layer_norm, evaluation, memory_limit,
        **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    print("rank: %d" % (rank))
    env = gym.make(env_id)
    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))

    if evaluation and rank == 0:
        eval_env = gym.make(env_id)
        eval_env = bench.Monitor(eval_env,
                                 os.path.join(logger.get_dir(), "gym_eval"))
        env = bench.Monitor(env, None)
    else:
        eval_env = None

    action_noise = None
    param_noise = None
    nb_actions = env.action_space.shape[-1]

    for current_noise_type in noise_type.split(","):
        current_noise_type = current_noise_type.strip()
        if current_noise_type == "none":
            pass
        elif "adaptive-param" in current_noise_type:
            _, stddev = current_noise_type.split("_")
            param_noise = AdaptiveParamNoiseSpec(
                initial_stddev=float(stddev),
                desired_action_stddev=float(stddev))
        elif "normal" in current_noise_type:
            _, stddev = current_noise_type.split("_")
            action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                             sigma=float(stddev) *
                                             np.ones(nb_actions))
        elif "ou" in current_noise_type.split("_"):
            _, stddev = current_noise_type.split("_")
            action_noise = OrnsteinUhlenbeckActionNoise(
                mu=np.zeros(nb_actions),
                sigma=float(stddev) * np.ones(nb_actions))
        else:
            raise RuntimeError(
                'unknown noise type "{}"'.format(current_noise_type))
    print(type(memory_limit), memory_limit)
    memory = Memory(limit=int(memory_limit),
                    action_shape=env.action_space.shape,
                    observation_shape=env.observation_space.shape)
    critic = Critic(layer_norm=layer_norm)
    actor = Actor(nb_actions, layer_norm=layer_norm)

    seed = seed + 1000000 * rank
    logger.info("rank {} : seed={}, logdir={}".format(rank, seed,
                                                      logger.get_dir()))
    tf.reset_default_graph()
    set_global_seeds(seed)
    env.seed(seed)
    if eval_env is not None:
        eval_env.seed(seed)

    if rank == 0:
        start_time = time.time()

    if option == 1:
        training.train(env=env,
                       eval_env=eval_env,
                       param_noise=param_noise,
                       action_noise=action_noise,
                       actor=actor,
                       critic=critic,
                       memory=memory,
                       **kwargs)
    elif option == 2:
        training_reward_shaping.train(env=env,
                                      eval_env=eval_env,
                                      param_noise=param_noise,
                                      action_noise=action_noise,
                                      actor=actor,
                                      critic=critic,
                                      memory=memory,
                                      **kwargs)
    env.close()
    if eval_env is not None:
        eval_env.close()
    if rank == 0:
        logger.info("total runtime: {}s".format(time.time() - start_time))
Example #8
def learn(
        env,
        seed=None,
        total_timesteps=1e6,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_rollout_steps=100,
        max_ep_len=250,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        start_steps=10000,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        nb_log_steps=None,
        nb_save_steps=None,
        batch_size=64,  # per MPI worker
        polyak=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        eval_env=None,
        load_path=None,
        save_dir=None,
        **network_kwargs):

    set_global_seeds(seed)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    network_spec = [{
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': 'tanh',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    vnetwork_spec = [{
        'layer_type': 'concat',
        'nodes_in': ['action_movement', 'observation_self'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': '',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    network = Td3Policy(scope="td3",
                        ob_space=env.observation_space,
                        ac_space=env.action_space,
                        network_spec=network_spec,
                        v_network_spec=vnetwork_spec,
                        stochastic=False,
                        reuse=False,
                        build_act=True,
                        trainable_vars=None,
                        not_trainable_vars=None,
                        gaussian_fixed_var=False,
                        weight_decay=0.0,
                        ema_beta=0.99999,
                        normalize_observations=normalize_observations,
                        normalize_returns=normalize_returns,
                        observation_range=observation_range,
                        action_range=action_range,
                        target_noise=target_noise,
                        noise_clip=noise_clip)

    target_network = Td3Policy(scope="target",
                               ob_space=env.observation_space,
                               ac_space=env.action_space,
                               network_spec=network_spec,
                               v_network_spec=vnetwork_spec,
                               stochastic=False,
                               reuse=False,
                               build_act=True,
                               trainable_vars=None,
                               not_trainable_vars=None,
                               gaussian_fixed_var=False,
                               weight_decay=0.0,
                               ema_beta=0.99999,
                               normalize_observations=normalize_observations,
                               normalize_returns=normalize_returns,
                               observation_range=observation_range,
                               action_range=action_range,
                               target_noise=target_noise,
                               noise_clip=noise_clip,
                               isTarget=True)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(mu=np.zeros(act_size),
                                                        sigma=float(stddev) *
                                                        np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = TD3(env,
                network,
                target_network,
                memory,
                env.action_space,
                env.observation_space,
                steps_per_epoch=nb_rollout_steps,
                epochs=nb_epochs,
                gamma=gamma,
                polyak=polyak,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                batch_size=batch_size,
                start_steps=start_steps,
                action_noise=action_noise,
                target_noise=target_noise,
                noise_clip=noise_clip,
                policy_delay=policy_delay)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path != None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for t in range(int(total_timesteps)):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) * n_agents]
                }
                nenvs_actions.append(nenv_action)
        else:
            action, q = env.action_space.sample(), None
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) *
                                              n_agents][0]
                }
                nenvs_actions.append(nenv_action)

        new_obs, r, done, info = env.step(nenvs_actions)

        episode_reward += r
        episode_step += 1

        for d in range(len(done)):
            # Treat time-limit cutoffs as non-terminal for bootstrapping
            done[d] = False if episode_step[d] == max_ep_len else done[d]

        epoch_actions.append(action)
        epoch_qs.append(q)
        agent.store_transition(
            obs, action, r, new_obs,
            done)  #the batched data will be unrolled in memory.py's append.

        obs = new_obs

        for d in range(len(done)):
            if done[d]:
                # Episode done.
                epoch_episode_rewards.append(episode_reward[d])
                episode_rewards_history.append(episode_reward[d])
                epoch_episode_steps.append(episode_step[d])
                episode_reward[d] = 0.
                episode_step[d] = 0
                epoch_episodes += 1
                episodes += 1
                if nenvs == 1:
                    agent.reset()

        episode_actor_losses = []
        episode_critic_losses = []
        episode_critic = []
        episode_critic_twin = []
        if any(done) or (episode_step[0] == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(episode_step[0]):
                critic_loss, critic, critic_twin, actor_loss = agent.train(
                    episode_step[0])

                episode_critic_losses.append(critic_loss)
                episode_critic.append(critic)
                episode_critic_twin.append(critic_twin)
                if actor_loss is not None:
                    episode_actor_losses.append(actor_loss)

            obs, r, done, episode_reward, episode_step = env.reset(
            ), 0, False, np.zeros((nenvs, n_agents),
                                  dtype=np.float32), np.zeros(nenvs, dtype=int)

        if (t + 1) % nb_log_steps == 0:
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(
                epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['train/loss_actor'] = np.mean(episode_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(
                episode_critic_losses)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()])
            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        if nb_save_steps != None and (t + 1) % nb_save_steps == 0:
            if save_dir == None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % t)
            print('Saving to', savepath)
            saver(savepath)

    return agent
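Example #8 passes target_noise and noise_clip to the policies, which is TD3's target-policy smoothing. A minimal numpy sketch of that idea (not the repo's actual implementation):

import numpy as np

def smoothed_target_action(mu_target, next_obs, target_noise=0.2, noise_clip=0.5,
                           act_low=-1.0, act_high=1.0):
    """Add clipped Gaussian noise to the target policy's action before computing the target Q."""
    a = np.asarray(mu_target(next_obs))
    eps = np.clip(np.random.normal(0.0, target_noise, size=a.shape), -noise_clip, noise_clip)
    return np.clip(a + eps, act_low, act_high)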
Example #9
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()
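When ob_norm is enabled, the constructor above builds an OnlineMeanStd observation normalizer that the snippet does not define. A sketch using the standard parallel mean/variance update, offered as an assumption about the repo's implementation:

import numpy as np

class OnlineMeanStd:
    """Running mean/std of observations, merged batch by batch."""
    def __init__(self, shape=(1, 1)):
        self.mean = np.zeros(shape)
        self.var = np.ones(shape)
        self.count = 1e-4
    def update(self, x):
        x = np.asarray(x, dtype=np.float64).reshape(-1, self.mean.shape[-1])
        batch_mean, batch_var, n = x.mean(axis=0), x.var(axis=0), x.shape[0]
        delta = batch_mean - self.mean
        total = self.count + n
        self.mean = self.mean + delta * n / total
        m2 = self.var * self.count + batch_var * n + delta ** 2 * self.count * n / total
        self.var, self.count = m2 / total, total
    def normalize(self, x):
        return (x - self.mean) / np.sqrt(self.var + 1e-8)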
Example #10
    eval_env = gym.make(args.env)
    #eval_env.seed(args.seed+100)
    if logger.get_dir():
        eval_env = bench.Monitor(
            eval_env, os.path.join(logger.get_dir(), "eval.monitor.json"))

    max_timesteps = train_env.spec.timestep_limit

    # set noise type
    current_noise_type = args.noise_type.strip()
    nb_actions = train_env.action_space.shape[0]
    if 'normal' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        action_noise.reset()
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=float(stddev) *
                                                    np.ones(nb_actions))
        action_noise.reset()
    else:
        raise RuntimeError(
            'unknown noise type "{}"'.format(current_noise_type))

    episode_rewards = []
    if 'Sparse' in train_env.spec.id:
        sparse = True