Example 1
def ddpg(episode, breaking_step, reward_name):
    env = gym.make('AntPyBulletEnv-v0')
    cumulus_steps = 0
    episode_steps = 0

    # randomly initialize critics and actor with weights and biases
    q1 = CriticNN()
    q1.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    q2 = CriticNN()
    q2.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    mu = ActorNN(env.action_space.shape[0])
    mu.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    # initialize target networks
    q1_target = CriticNN()
    q1_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')
    q2_target = CriticNN()
    q2_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    mu_target = ActorNN(env.action_space.shape[0])
    mu_target.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

    q1_target, q2_target, mu_target = update_network_parameters(
        q1, q1_target, q2, q2_target, mu, mu_target, 0.005)

    # initialize replay buffer (actor and critic only train once a full batch can be sampled)
    replay_buffer = ReplayBuffer(1000000, env.observation_space.shape[0],
                                 env.action_space.shape[0])

    performance = []
    avg_return = []
    time_step_reward = []
    avg_time_step_reward = []
    a_c = 0
    b_c = 0
    c_c = 0
    d_c = 0
    e_c = 0
    f_c = 0
    for e in range(episode):

        # receive initial observation state s1 (observation = s1)
        # env.render()
        observation = env.reset()
        state = tf.convert_to_tensor([observation], dtype=tf.float32)

        max_steps = 1000
        min_action = env.action_space.low[0]
        max_action = env.action_space.high[0]
        update_frequency = 2
        learn_count = 0
        score = 0
        for i in range(max_steps):

            # select an action a_t = mu(state) + noise
            noise = NormalActionNoise(0, 0.1)
            if cumulus_steps < 900:
                action = env.action_space.sample()
            else:
                action = mu(state) + np.random.normal(noise.mean, noise.sigma)
                proto_tensor = tf.make_tensor_proto(action)
                action = tf.make_ndarray(proto_tensor)
                action = action[0]

            # execute action a_t and observe reward, and next state
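            # zero out action components 2 and 3 (appears to be an experiment-specific joint constraint)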
            action[2] = 0
            action[3] = 0
            next_state, reward, done, _ = env.step(action)
            reward_list = env.env.rewards
            z_pos = env.env.robot.body_xyz[2]
            fwp = reward_list[1]
            if fwp > 0:
                reward = reward + fwp * z_pos

            # store transition in replay buffer
            replay_buffer.store_transition(state, action, reward, next_state,
                                           done)

            # if there are enough transitions in the replay buffer
            batch_size = 100
            if replay_buffer.mem_cntr >= batch_size:

                # sample a random mini-batch of batch_size transitions
                buff_state, buff_action, buff_reward, buff_next_state, buff_done = replay_buffer.sample_buffer(
                    batch_size)

                states = tf.convert_to_tensor(buff_state, dtype=tf.float32)
                next_states = tf.convert_to_tensor(buff_next_state,
                                                   dtype=tf.float32)
                rewards = tf.convert_to_tensor(buff_reward, dtype=tf.float32)
                actions = tf.convert_to_tensor(buff_action, dtype=tf.float32)

                # train critics
                with tf.GradientTape(persistent=True) as tape:

                    # calculate which actions target_actor chooses and add noise
                    target_actions = mu_target(next_states) + tf.clip_by_value(
                        np.random.normal(scale=0.2), -0.5, 0.5)
                    target_actions = tf.clip_by_value(target_actions,
                                                      min_action, max_action)

                    # calculate next_q_values of the critic by feeding the next state and from actor chosen actions
                    next_critic_value1 = tf.squeeze(
                        q1_target(next_states, target_actions), 1)
                    next_critic_value2 = tf.squeeze(
                        q2_target(next_states, target_actions), 1)

                    # calculate q values of critic actual state
                    critic_value1 = tf.squeeze(q1(states, actions), 1)
                    critic_value2 = tf.squeeze(q2(states, actions), 1)

                    # use smaller q value from the 2 critics
                    next_critic_value = tf.math.minimum(
                        next_critic_value1, next_critic_value2)

                    # calculate target values: yt = rt + gamma * q_target(s_t+1, mu_target(s_t+1)); with t = time step
                    y = rewards + 0.99 * next_critic_value * (1 - buff_done)

                    # calculate the loss between critic and target_critic
                    critic1_loss = keras.losses.MSE(y, critic_value1)
                    critic2_loss = keras.losses.MSE(y, critic_value2)

                # update critics by minimizing the loss (critic_loss) with the Adam optimizer
                critic1_network_gradient = tape.gradient(
                    critic1_loss, q1.trainable_variables)
                critic2_network_gradient = tape.gradient(
                    critic2_loss, q2.trainable_variables)
                q1.optimizer.apply_gradients(
                    zip(critic1_network_gradient, q1.trainable_variables))
                q2.optimizer.apply_gradients(
                    zip(critic2_network_gradient, q2.trainable_variables))

                learn_count += 1

                # train actor
                if learn_count % update_frequency == 0:
                    with tf.GradientTape() as tape:
                        new_policy_actions = mu(states)
                        # negate Q so that minimizing this loss performs gradient ascent on Q
                        actor_loss = -q1(states, new_policy_actions)
                        actor_loss = tf.math.reduce_mean(actor_loss)

                    # update the actor policy using the sampled policy gradient
                    actor_network_gradient = tape.gradient(
                        actor_loss, mu.trainable_variables)
                    mu.optimizer.apply_gradients(
                        zip(actor_network_gradient, mu.trainable_variables))

                    # update the target networks
                    q1_target, q2_target, mu_target = update_network_parameters(
                        q1, q1_target, q2, q2_target, mu, mu_target, 0.005)

            score += reward
            time_step_reward.append(reward)
            avg_time_step_reward_short = np.mean(time_step_reward[-50:])
            avg_time_step_reward.append(avg_time_step_reward_short)
            if done:
                performance.append(score)
                avg_reward = np.mean(performance[-50:])
                avg_return.append(avg_reward)
                cumulus_steps += i
                print(
                    "episode: {}/{}, score: {}, avg_score: {}, ep_steps: {}, cumulus_steps: {}"
                    .format(e, episode, score, avg_reward, i, cumulus_steps))

                if 10000 < cumulus_steps < 11000 and a_c == 0:
                    a_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                if 150000 < cumulus_steps < 151000 and b_c == 0:
                    b_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                # if 350000 < cumulus_steps < 351000 and c_c == 0:
                #     c_c = 1
                #     if not os.path.exists("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}".format(reward_name)):
                #         os.mkdir("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}".format(reward_name))
                #     mu.save_weights("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5".format(reward_name, cumulus_steps))
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}".format(reward_name, cumulus_steps), avg_return)
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}".format(reward_name, cumulus_steps), time_step_reward)
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}".format(reward_name, cumulus_steps), performance)
                #     np.save("/home/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}".format(reward_name, cumulus_steps), avg_time_step_reward)

                if 550000 < cumulus_steps < 551000 and d_c == 0:
                    d_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                if 750000 < cumulus_steps < 751000 and e_c == 0:
                    e_c = 1
                    if not os.path.exists(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name)):
                        os.mkdir(
                            "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}"
                            .format(reward_name))
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)

                if 1000000 < cumulus_steps < 1001000 and f_c == 0:
                    f_c = 1
                    mu.save_weights(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/mu{}.h5"
                        .format(reward_name, cumulus_steps))
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_return{}"
                        .format(reward_name, cumulus_steps), avg_return)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/time_step_reward{}"
                        .format(reward_name, cumulus_steps), time_step_reward)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/performance{}"
                        .format(reward_name, cumulus_steps), performance)
                    np.save(
                        "/var/tmp/ga53cov/Bachelor_Arbeit/BA/Models/Ant_v2/{}/avg_time_step_reward{}"
                        .format(reward_name,
                                cumulus_steps), avg_time_step_reward)
                break

            state = tf.convert_to_tensor([next_state], dtype=tf.float32)

        # stop learning after certain time steps
        if cumulus_steps > breaking_step:
            break

    return avg_return, mu, performance, time_step_reward, avg_time_step_reward
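Note on Example 1: the code relies on an update_network_parameters helper that is not shown. Below is a minimal sketch of what such a helper might look like, assuming it performs a Polyak (soft) update of the Keras target networks with rate tau and returns them; with tau=0.005 the very first call only nudges the targets toward the online networks, so a hard copy at initialization would pass tau=1.0.

def update_network_parameters(q1, q1_target, q2, q2_target, mu, mu_target, tau):
    # soft update: theta_target <- tau * theta + (1 - tau) * theta_target
    for net, target in ((q1, q1_target), (q2, q2_target), (mu, mu_target)):
        target.set_weights([tau * w + (1.0 - tau) * w_t
                            for w, w_t in zip(net.get_weights(), target.get_weights())])
    return q1_target, q2_target, mu_target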
Example 2
def learn(
        env,
        seed=None,
        total_timesteps=1e6,
        nb_epochs=None,  # with default settings, perform 1M steps total
        nb_rollout_steps=100,
        max_ep_len=250,
        reward_scale=1.0,
        render=False,
        render_eval=False,
        noise_type='adaptive-param_0.2',
        normalize_returns=False,
        normalize_observations=True,
        critic_l2_reg=1e-2,
        actor_lr=1e-4,
        critic_lr=1e-3,
        popart=False,
        gamma=0.99,
        clip_norm=None,
        start_steps=10000,
        nb_train_steps=50,  # per epoch cycle and MPI worker,
        nb_eval_steps=100,
        nb_log_steps=None,
        nb_save_steps=None,
        batch_size=64,  # per MPI worker
        polyak=0.01,
        action_range=(-250.0, 250.0),
        observation_range=(-5.0, 5.0),
        target_noise=0.2,
        noise_clip=0.5,
        policy_delay=2,
        eval_env=None,
        load_path=None,
        save_dir=None,
        **network_kwargs):

    set_global_seeds(seed)

    if MPI is not None:
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        rank = 0

    memory = Memory(limit=int(1e6))

    network_spec = [{
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': 'tanh',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    vnetwork_spec = [{
        'layer_type': 'concat',
        'nodes_in': ['action_movement', 'observation_self'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(256),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(128),
        'activation': 'relu',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }, {
        'layer_type': 'dense',
        'units': int(1),
        'activation': '',
        'nodes_in': ['main'],
        'nodes_out': ['main']
    }]

    network = Td3Policy(scope="td3",
                        ob_space=env.observation_space,
                        ac_space=env.action_space,
                        network_spec=network_spec,
                        v_network_spec=vnetwork_spec,
                        stochastic=False,
                        reuse=False,
                        build_act=True,
                        trainable_vars=None,
                        not_trainable_vars=None,
                        gaussian_fixed_var=False,
                        weight_decay=0.0,
                        ema_beta=0.99999,
                        normalize_observations=normalize_observations,
                        normalize_returns=normalize_returns,
                        observation_range=observation_range,
                        action_range=action_range,
                        target_noise=target_noise,
                        noise_clip=noise_clip)

    target_network = Td3Policy(scope="target",
                               ob_space=env.observation_space,
                               ac_space=env.action_space,
                               network_spec=network_spec,
                               v_network_spec=vnetwork_spec,
                               stochastic=False,
                               reuse=False,
                               build_act=True,
                               trainable_vars=None,
                               not_trainable_vars=None,
                               gaussian_fixed_var=False,
                               weight_decay=0.0,
                               ema_beta=0.99999,
                               normalize_observations=normalize_observations,
                               normalize_returns=normalize_returns,
                               observation_range=observation_range,
                               action_range=action_range,
                               target_noise=target_noise,
                               noise_clip=noise_clip,
                               isTarget=True)

    action_noise = None
    param_noise = None
    if noise_type is not None:
        for current_noise_type in noise_type.split(','):
            current_noise_type = current_noise_type.strip()
            if current_noise_type == 'none':
                pass
            elif 'adaptive-param' in current_noise_type:
                _, stddev = current_noise_type.split('_')
                param_noise = AdaptiveParamNoiseSpec(
                    initial_stddev=float(stddev),
                    desired_action_stddev=float(stddev))
            elif 'normal' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = NormalActionNoise(mu=np.zeros(act_size),
                                                        sigma=float(stddev) *
                                                        np.ones(act_size))
            elif 'ou' in current_noise_type:
                action_noise = dict()
                for k, v in env.action_space.spaces.items():
                    act_size = v.spaces[0].shape[-1]
                    _, stddev = current_noise_type.split('_')
                    action_noise[k] = OrnsteinUhlenbeckActionNoise(
                        mu=np.zeros(act_size),
                        sigma=float(stddev) * np.ones(act_size))
            else:
                raise RuntimeError(
                    'unknown noise type "{}"'.format(current_noise_type))

    max_action = action_range[1]
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))

    agent = TD3(env,
                network,
                target_network,
                memory,
                env.action_space,
                env.observation_space,
                steps_per_epoch=nb_rollout_steps,
                epochs=nb_epochs,
                gamma=gamma,
                polyak=polyak,
                actor_lr=actor_lr,
                critic_lr=critic_lr,
                batch_size=batch_size,
                start_steps=start_steps,
                action_noise=action_noise,
                target_noise=target_noise,
                noise_clip=noise_clip,
                policy_delay=policy_delay)

    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    sess = U.get_session()

    saver = functools.partial(save_variables, sess=sess)
    loader = functools.partial(load_variables, sess=sess)
    if load_path is not None:
        loader(load_path)

    # Prepare everything.
    agent.initialize(sess)
    sess.graph.finalize()

    agent.reset()
    obs = env.reset()
    if eval_env is not None:
        eval_obs = eval_env.reset()
    nenvs = env.num_envs
    n_agents = obs['observation_self'].shape[0]

    episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)  #vector
    episode_step = np.zeros(nenvs, dtype=int)  # vector
    episodes = 0  #scalar
    t = 0  # scalar

    epoch = 0

    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0

    for t in range(int(total_timesteps)):
        """
        Until start_steps have elapsed, randomly sample actions
        from a uniform distribution for better exploration. Afterwards, 
        use the learned policy (with some noise, via act_noise). 
        """
        if t > start_steps:
            action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True)
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) * n_agents]
                }
                nenvs_actions.append(nenv_action)
        else:
            action, q = env.action_space.sample(), None
            nenvs_actions = []
            for i in range(nenvs):
                nenv_action = {
                    'action_movement':
                    action['action_movement'][i * n_agents:(i + 1) *
                                              n_agents][0]
                }
                nenvs_actions.append(nenv_action)

        new_obs, r, done, info = env.step(nenvs_actions)

        episode_reward += r
        episode_step += 1

        for d in range(len(done)):
            done[d] = False if episode_step[d] == max_ep_len else done[d]

        epoch_actions.append(action)
        epoch_qs.append(q)
        agent.store_transition(
            obs, action, r, new_obs,
            done)  #the batched data will be unrolled in memory.py's append.

        obs = new_obs

        for d in range(len(done)):
            if done[d]:
                # Episode done.
                epoch_episode_rewards.append(episode_reward[d])
                episode_rewards_history.append(episode_reward[d])
                epoch_episode_steps.append(episode_step[d])
                episode_reward[d] = 0.
                episode_step[d] = 0
                epoch_episodes += 1
                episodes += 1
                if nenvs == 1:
                    agent.reset()

        episode_actor_losses = []
        episode_critic_losses = []
        episode_critic = []
        episode_critic_twin = []
        if any(done) or (episode_step[0] == max_ep_len):
            """
            Perform all TD3 updates at the end of the trajectory
            (in accordance with source code of TD3 published by
            original authors).
            """
            for j in range(episode_step[0]):
                critic_loss, critic, critic_twin, actor_loss = agent.train(
                    episode_step[0])

                episode_critic_losses.append(critic_loss)
                episode_critic.append(critic)
                episode_critic_twin.append(critic_twin)
                if actor_loss is not None:
                    episode_actor_losses.append(actor_loss)

            obs = env.reset()
            r = 0
            done = False
            episode_reward = np.zeros((nenvs, n_agents), dtype=np.float32)
            episode_step = np.zeros(nenvs, dtype=int)

        if nb_log_steps is not None and (t + 1) % nb_log_steps == 0:
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_std'] = np.std(
                epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(
                episode_rewards_history)
            combined_stats['rollout/return_history_std'] = np.std(
                episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(
                epoch_episode_steps)
            combined_stats['train/loss_actor'] = np.mean(episode_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(
                episode_critic_losses)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(
                duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)

            combined_stats_sums = np.array(
                [np.array(x).flatten()[0] for x in combined_stats.values()])
            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t
            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            if rank == 0:
                logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'),
                              'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'),
                              'wb') as f:
                        pickle.dump(eval_env.get_state(), f)

        if nb_save_steps is not None and (t + 1) % nb_save_steps == 0:
            if save_dir is None:
                checkdir = osp.join(logger.get_dir(), 'checkpoints')
            else:
                checkdir = osp.join(save_dir, 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % t)
            print('Saving to', savepath)
            saver(savepath)

    return agent
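Note on Example 2: target_noise, noise_clip, and policy_delay are the TD3-specific knobs, but the actual update is hidden inside agent.train(). Below is a rough, framework-agnostic sketch (not the code of the TD3 class used above) of how those parameters typically enter the target computation, with the target networks passed in as plain callables:

import numpy as np

def td3_target(rewards, dones, next_obs, actor_target, q1_target, q2_target,
               gamma=0.99, target_noise=0.2, noise_clip=0.5, act_limit=1.0):
    # target-policy smoothing: add clipped Gaussian noise to the target action
    next_act = actor_target(next_obs)
    noise = np.clip(np.random.normal(0.0, target_noise, size=next_act.shape),
                    -noise_clip, noise_clip)
    next_act = np.clip(next_act + noise, -act_limit, act_limit)
    # clipped double-Q: bootstrap from the smaller of the two target critics
    q_next = np.minimum(q1_target(next_obs, next_act), q2_target(next_obs, next_act))
    return rewards + gamma * (1.0 - dones) * q_next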
Example 3
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()
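Note on Example 3: every hyperparameter comes from an args object, typically built with argparse. Below is a hypothetical, minimal way to assemble such an object for experimentation; the attribute names mirror what the constructor reads, while all values are illustrative only:

from types import SimpleNamespace

args = SimpleNamespace(
    num_iters=100000, random_prob=0.1, tau=0.001, reward_scale=1.0, gamma=0.99,
    log_interval=10, save_interval=100, rollout_steps=100, batch_size=128,
    train_steps=40, warmup_iter=50, max_grad_norm=None, her=True, k_future=4,
    save_dir='./output', pretrain_dir=None, resume=False, test=False,
    resume_step=None, hid1_dim=256, hid2_dim=256, hid3_dim=128,
    init_method='uniform', actor_lr=1e-4, critic_lr=1e-3,
    critic_weight_decay=0.0, noise_type='gaussian', normal_noise_std=0.1,
    ou_noise_std=0.2, uniform_noise_low=-0.1, uniform_noise_high=0.1,
    max_noise_dec_step=0.0, memory_limit=int(1e6), ob_norm=True)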
Example 4
class DDPG:
    def __init__(self, env, args):
        ob_space = env.observation_space
        goal_dim = env.goal_dim
        ob_dim = ob_space.shape[0]
        self.ob_dim = ob_dim
        self.ac_dim = ac_dim = 7
        self.goal_dim = goal_dim
        self.num_iters = args.num_iters
        self.random_prob = args.random_prob
        self.tau = args.tau
        self.reward_scale = args.reward_scale
        self.gamma = args.gamma

        self.log_interval = args.log_interval
        self.save_interval = args.save_interval
        self.rollout_steps = args.rollout_steps
        self.env = env
        self.batch_size = args.batch_size
        self.train_steps = args.train_steps
        self.closest_dist = np.inf
        self.warmup_iter = args.warmup_iter
        self.max_grad_norm = args.max_grad_norm
        self.use_her = args.her
        self.k_future = args.k_future
        self.model_dir = os.path.join(args.save_dir, 'model')
        self.pretrain_dir = args.pretrain_dir
        os.makedirs(self.model_dir, exist_ok=True)
        self.global_step = 0
        self.actor = Actor(ob_dim=ob_dim,
                           act_dim=ac_dim,
                           hid1_dim=args.hid1_dim,
                           hid2_dim=args.hid2_dim,
                           hid3_dim=args.hid3_dim,
                           init_method=args.init_method)
        self.critic = Critic(ob_dim=ob_dim,
                             act_dim=ac_dim,
                             hid1_dim=args.hid1_dim,
                             hid2_dim=args.hid2_dim,
                             hid3_dim=args.hid3_dim,
                             init_method=args.init_method)
        if args.resume or args.test or args.pretrain_dir is not None:
            self.load_model(args.resume_step, pretrain_dir=args.pretrain_dir)
        if not args.test:
            self.actor_target = Actor(ob_dim=ob_dim,
                                      act_dim=ac_dim,
                                      hid1_dim=args.hid1_dim,
                                      hid2_dim=args.hid2_dim,
                                      hid3_dim=args.hid3_dim,
                                      init_method=args.init_method)
            self.critic_target = Critic(ob_dim=ob_dim,
                                        act_dim=ac_dim,
                                        hid1_dim=args.hid1_dim,
                                        hid2_dim=args.hid2_dim,
                                        hid3_dim=args.hid3_dim,
                                        init_method=args.init_method)
            self.actor_optim = self.construct_optim(self.actor,
                                                    lr=args.actor_lr)
            cri_w_decay = args.critic_weight_decay
            self.critic_optim = self.construct_optim(self.critic,
                                                     lr=args.critic_lr,
                                                     weight_decay=cri_w_decay)
            self.hard_update(self.actor_target, self.actor)
            self.hard_update(self.critic_target, self.critic)

            self.actor_target.eval()
            self.critic_target.eval()
            if args.noise_type == 'ou_noise':
                mu = np.zeros(ac_dim)
                sigma = float(args.ou_noise_std) * np.ones(ac_dim)
                self.action_noise = OrnsteinUhlenbeckActionNoise(mu=mu,
                                                                 sigma=sigma)
            elif args.noise_type == 'uniform':
                low_limit = args.uniform_noise_low
                high_limit = args.uniform_noise_high
                dec_step = args.max_noise_dec_step
                self.action_noise = UniformNoise(low_limit=low_limit,
                                                 high_limit=high_limit,
                                                 dec_step=dec_step)

            elif args.noise_type == 'gaussian':
                mu = np.zeros(ac_dim)
                sigma = args.normal_noise_std * np.ones(ac_dim)
                self.action_noise = NormalActionNoise(mu=mu, sigma=sigma)

            self.memory = Memory(limit=int(args.memory_limit),
                                 action_shape=(int(ac_dim), ),
                                 observation_shape=(int(ob_dim), ))
            self.critic_loss = nn.MSELoss()
            self.ob_norm = args.ob_norm
            if self.ob_norm:
                self.obs_oms = OnlineMeanStd(shape=(1, ob_dim))
            else:
                self.obs_oms = None

        self.cuda()

    def test(self, render=False, record=True, slow_t=0):
        dist, succ_rate = self.rollout(render=render,
                                       record=record,
                                       slow_t=slow_t)
        print('Final step distance: ', dist)

    def train(self):
        self.net_mode(train=True)
        tfirststart = time.time()
        epoch_episode_rewards = deque(maxlen=1)
        epoch_episode_steps = deque(maxlen=1)
        total_rollout_steps = 0
        for epoch in range(self.global_step, self.num_iters):
            episode_reward = 0
            episode_step = 0
            self.action_noise.reset()
            obs = self.env.reset()
            obs = obs[0]
            epoch_actor_losses = []
            epoch_critic_losses = []
            if self.use_her:
                ep_experi = {
                    'obs': [],
                    'act': [],
                    'reward': [],
                    'new_obs': [],
                    'ach_goals': [],
                    'done': []
                }
            for t_rollout in range(self.rollout_steps):
                total_rollout_steps += 1
                ran = np.random.random(1)[0]
                if self.pretrain_dir is None and epoch < self.warmup_iter or \
                        ran < self.random_prob:
                    act = self.random_action().flatten()
                else:
                    act = self.policy(obs).flatten()
                new_obs, r, done, info = self.env.step(act)
                ach_goals = new_obs[1].copy()
                new_obs = new_obs[0].copy()
                episode_reward += r
                episode_step += 1
                self.memory.append(obs, act, r * self.reward_scale, new_obs,
                                   ach_goals, done)
                if self.use_her:
                    ep_experi['obs'].append(obs)
                    ep_experi['act'].append(act)
                    ep_experi['reward'].append(r * self.reward_scale)
                    ep_experi['new_obs'].append(new_obs)
                    ep_experi['ach_goals'].append(ach_goals)
                    ep_experi['done'].append(done)
                if self.ob_norm:
                    self.obs_oms.update(new_obs)
                obs = new_obs
            epoch_episode_rewards.append(episode_reward)
            epoch_episode_steps.append(episode_step)
            if self.use_her:
                for t in range(episode_step - self.k_future):
                    ob = ep_experi['obs'][t]
                    act = ep_experi['act'][t]
                    new_ob = ep_experi['new_obs'][t]
                    ach_goal = ep_experi['ach_goals'][t]
                    k_futures = np.random.choice(np.arange(
                        t + 1, episode_step),
                                                 self.k_future - 1,
                                                 replace=False)
                    k_futures = np.concatenate((np.array([t]), k_futures))
                    for future in k_futures:
                        new_goal = ep_experi['ach_goals'][future]
                        her_ob = np.concatenate(
                            (ob[:-self.goal_dim], new_goal), axis=0)
                        her_new_ob = np.concatenate(
                            (new_ob[:-self.goal_dim], new_goal), axis=0)
                        res = self.env.cal_reward(ach_goal.copy(), new_goal,
                                                  act)
                        her_reward, _, done = res
                        self.memory.append(her_ob, act,
                                           her_reward * self.reward_scale,
                                           her_new_ob, ach_goal.copy(), done)
            self.global_step += 1
            if epoch >= self.warmup_iter:
                for t_train in range(self.train_steps):
                    act_loss, cri_loss = self.train_net()
                    epoch_critic_losses.append(cri_loss)
                    epoch_actor_losses.append(act_loss)

            if epoch % self.log_interval == 0:
                tnow = time.time()
                stats = {}
                if self.ob_norm:
                    stats['ob_oms_mean'] = safemean(self.obs_oms.mean.numpy())
                    stats['ob_oms_std'] = safemean(self.obs_oms.std.numpy())
                stats['total_rollout_steps'] = total_rollout_steps
                stats['rollout/return'] = safemean(
                    [rew for rew in epoch_episode_rewards])
                stats['rollout/ep_steps'] = safemean(
                    [l for l in epoch_episode_steps])
                if epoch >= self.warmup_iter:
                    stats['actor_loss'] = np.mean(epoch_actor_losses)
                    stats['critic_loss'] = np.mean(epoch_critic_losses)
                stats['epoch'] = epoch
                stats['actor_lr'] = self.actor_optim.param_groups[0]['lr']
                stats['critic_lr'] = self.critic_optim.param_groups[0]['lr']
                stats['time_elapsed'] = tnow - tfirststart
                for name, value in stats.items():
                    logger.logkv(name, value)
                logger.dumpkvs()
            if (epoch == 0 or epoch >= self.warmup_iter) and \
                    self.save_interval and\
                    epoch % self.save_interval == 0 and \
                    logger.get_dir():
                mean_final_dist, succ_rate = self.rollout()
                logger.logkv('epoch', epoch)
                logger.logkv('test/total_rollout_steps', total_rollout_steps)
                logger.logkv('test/mean_final_dist', mean_final_dist)
                logger.logkv('test/succ_rate', succ_rate)

                tra_mean_dist, tra_succ_rate = self.rollout(train_test=True)
                logger.logkv('train/mean_final_dist', tra_mean_dist)
                logger.logkv('train/succ_rate', tra_succ_rate)

                # self.log_model_weights()
                logger.dumpkvs()
                if mean_final_dist < self.closest_dist:
                    self.closest_dist = mean_final_dist
                    is_best = True
                else:
                    is_best = False
                self.save_model(is_best=is_best, step=self.global_step)

    def train_net(self):
        batch_data = self.memory.sample(batch_size=self.batch_size)
        for key, value in batch_data.items():
            batch_data[key] = torch.from_numpy(value)
        obs0_t = batch_data['obs0']
        obs1_t = batch_data['obs1']
        obs0_t = self.normalize(obs0_t, self.obs_oms)
        obs1_t = self.normalize(obs1_t, self.obs_oms)
        obs0 = Variable(obs0_t).float().cuda()
        with torch.no_grad():
            vol_obs1 = Variable(obs1_t).float().cuda()

        rewards = Variable(batch_data['rewards']).float().cuda()
        actions = Variable(batch_data['actions']).float().cuda()
        terminals = Variable(batch_data['terminals1']).float().cuda()

        cri_q_val = self.critic(obs0, actions)
        with torch.no_grad():
            target_net_act = self.actor_target(vol_obs1)
            target_net_q_val = self.critic_target(vol_obs1, target_net_act)
            # target_net_q_val.volatile = False
            target_q_label = rewards
            target_q_label += self.gamma * target_net_q_val * (1 - terminals)
            target_q_label = target_q_label.detach()

        self.actor.zero_grad()
        self.critic.zero_grad()
        cri_loss = self.critic_loss(cri_q_val, target_q_label)
        cri_loss.backward()
        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           self.max_grad_norm)
        self.critic_optim.step()

        self.critic.zero_grad()
        self.actor.zero_grad()
        net_act = self.actor(obs0)
        net_q_val = self.critic(obs0, net_act)
        act_loss = -net_q_val.mean()
        act_loss.backward()

        if self.max_grad_norm is not None:
            torch.nn.utils.clip_grad_norm_(self.actor.parameters(),
                                           self.max_grad_norm)
        self.actor_optim.step()

        self.soft_update(self.actor_target, self.actor, self.tau)
        self.soft_update(self.critic_target, self.critic, self.tau)
        return act_loss.cpu().data.numpy(), cri_loss.cpu().data.numpy()

    def normalize(self, x, stats):
        if stats is None:
            return x
        return (x - stats.mean) / stats.std

    def denormalize(self, x, stats):
        if stats is None:
            return x
        return x * stats.std + stats.mean

    def net_mode(self, train=True):
        if train:
            self.actor.train()
            self.critic.train()
        else:
            self.actor.eval()
            self.critic.eval()

    def load_model(self, step=None, pretrain_dir=None):
        model_dir = self.model_dir
        if pretrain_dir is not None:
            ckpt_file = os.path.join(self.pretrain_dir, 'model_best.pth')
        else:
            if step is None:
                ckpt_file = os.path.join(model_dir, 'model_best.pth')
            else:
                ckpt_file = os.path.join(model_dir,
                                         'ckpt_{:08d}.pth'.format(step))
        if not os.path.isfile(ckpt_file):
            raise ValueError("No checkpoint found at '{}'".format(ckpt_file))
        mutils.print_yellow('Loading checkpoint {}'.format(ckpt_file))
        checkpoint = torch.load(ckpt_file)
        if pretrain_dir is not None:
            actor_dict = self.actor.state_dict()
            critic_dict = self.critic.state_dict()
            actor_pretrained_dict = {
                k: v
                for k, v in checkpoint['actor_state_dict'].items()
                if k in actor_dict
            }
            critic_pretrained_dict = {
                k: v
                for k, v in checkpoint['critic_state_dict'].items()
                if k in critic_dict
            }
            actor_dict.update(actor_pretrained_dict)
            critic_dict.update(critic_pretrained_dict)
            self.actor.load_state_dict(actor_dict)
            self.critic.load_state_dict(critic_dict)
            self.global_step = 0
        else:
            self.actor.load_state_dict(checkpoint['actor_state_dict'])
            self.critic.load_state_dict(checkpoint['critic_state_dict'])
            self.global_step = checkpoint['global_step']
        if step is None:
            mutils.print_yellow('Checkpoint step: {}'
                                ''.format(checkpoint['ckpt_step']))

        self.warmup_iter += self.global_step
        mutils.print_yellow('Checkpoint loaded...')

    def save_model(self, is_best, step=None):
        if step is None:
            step = self.global_step
        ckpt_file = os.path.join(self.model_dir,
                                 'ckpt_{:08d}.pth'.format(step))
        data_to_save = {
            'ckpt_step': step,
            'global_step': self.global_step,
            'actor_state_dict': self.actor.state_dict(),
            'actor_optimizer': self.actor_optim.state_dict(),
            'critic_state_dict': self.critic.state_dict(),
            'critic_optimizer': self.critic_optim.state_dict()
        }

        mutils.print_yellow('Saving checkpoint: %s' % ckpt_file)
        torch.save(data_to_save, ckpt_file)
        if is_best:
            torch.save(data_to_save,
                       os.path.join(self.model_dir, 'model_best.pth'))

    def rollout(self, train_test=False, render=False, record=False, slow_t=0):
        test_conditions = self.env.train_test_conditions \
            if train_test else self.env.test_conditions
        done_num = 0
        final_dist = []
        episode_length = []
        for idx in range(test_conditions):
            if train_test:
                obs = self.env.train_test_reset(cond=idx)
            else:
                obs = self.env.test_reset(cond=idx)
            for t_rollout in range(self.rollout_steps):
                obs = obs[0].copy()
                act = self.policy(obs, stochastic=False).flatten()
                obs, r, done, info = self.env.step(act)
                if render:
                    self.env.render()
                    if slow_t > 0:
                        time.sleep(slow_t)
                if done:
                    done_num += 1
                    break
            if record:
                print('dist: ', info['dist'])
            final_dist.append(info['dist'])
            episode_length.append(t_rollout)
        final_dist = np.array(final_dist)
        mean_final_dist = np.mean(final_dist)
        succ_rate = done_num / float(test_conditions)
        if record:
            with open('./test_data.json', 'w') as f:
                json.dump(final_dist.tolist(), f)

            print('\nDist statistics:')
            print("Minimum: {0:9.4f} Maximum: {1:9.4f}"
                  "".format(np.min(final_dist), np.max(final_dist)))
            print("Mean: {0:9.4f}".format(mean_final_dist))
            print("Standard Deviation: {0:9.4f}".format(np.std(final_dist)))
            print("Median: {0:9.4f}".format(np.median(final_dist)))
            print("First quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 25)))
            print("Third quartile: {0:9.4f}"
                  "".format(np.percentile(final_dist, 75)))
            print('Success rate:', succ_rate)
        if render:
            while True:
                self.env.render()
        return mean_final_dist, succ_rate

    def log_model_weights(self):
        for name, param in self.actor.named_parameters():
            logger.logkv('actor/' + name, param.clone().cpu().data.numpy())
        for name, param in self.actor_target.named_parameters():
            logger.logkv('actor_target/' + name,
                         param.clone().cpu().data.numpy())
        for name, param in self.critic.named_parameters():
            logger.logkv('critic/' + name, param.clone().cpu().data.numpy())
        for name, param in self.critic_target.named_parameters():
            logger.logkv('critic_target/' + name,
                         param.clone().cpu().data.numpy())

    def random_action(self):
        act = np.random.uniform(-1., 1., self.ac_dim)
        return act

    def policy(self, obs, stochastic=True):
        self.actor.eval()
        ob = Variable(torch.from_numpy(obs)).float().cuda().view(1, -1)
        act = self.actor(ob)
        act = act.cpu().data.numpy()
        if stochastic:
            act = self.action_noise(act)
        self.actor.train()
        return act

    def cuda(self):
        self.critic.cuda()
        self.actor.cuda()
        if hasattr(self, 'critic_target'):
            self.critic_target.cuda()
            self.actor_target.cuda()
            self.critic_loss.cuda()

    def construct_optim(self, net, lr, weight_decay=None):
        if weight_decay is None:
            weight_decay = 0
        params = mutils.add_weight_decay([net], weight_decay=weight_decay)
        optimizer = optim.Adam(params, lr=lr, weight_decay=weight_decay)
        return optimizer

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(),
                                       source.parameters()):
            target_param.data.copy_(param.data)
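Note on Example 4: the block guarded by self.use_her implements hindsight experience replay with the "future" strategy, relabelling stored transitions with goals that were actually achieved later in the same episode. The sketch below shows the same idea in isolation; compute_reward stands in for the environment's reward function (env.cal_reward above), and observations are assumed to end with the goal vector, as in the class:

import numpy as np

def her_future_relabel(episode, goal_dim, k_future, compute_reward):
    # episode is a dict of per-step lists: 'obs', 'act', 'new_obs', 'ach_goals'
    T = len(episode['obs'])
    for t in range(T - k_future):
        future_idx = np.random.choice(np.arange(t + 1, T), k_future - 1, replace=False)
        for f in np.concatenate(([t], future_idx)):
            new_goal = episode['ach_goals'][f]
            her_obs = np.concatenate((episode['obs'][t][:-goal_dim], new_goal))
            her_new_obs = np.concatenate((episode['new_obs'][t][:-goal_dim], new_goal))
            reward, done = compute_reward(episode['ach_goals'][t], new_goal,
                                          episode['act'][t])
            yield her_obs, episode['act'][t], reward, her_new_obs, done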
Example 5
class DDPG:
  def __init__(self, actor_state_size, actor_action_size, critic_state_size, critic_action_size, **kwargs):
    
    if 'filename' in kwargs.keys(): 
      data= torch.load(kwargs['filename'])
      self.config= data["config"]
      self.scores= data["scores"]
    elif 'config' in kwargs.keys():
      self.config= kwargs['config']
      data= {}
      self.scores= []
    else:
      raise ValueError('DDPG: neither a "filename" nor a "config" keyword argument was given')
      
        
    self.actor_state_size = actor_state_size
    self.actor_action_size = actor_action_size
    self.critic_state_size = critic_state_size
    self.critic_action_size = critic_action_size
    memory_size = self.config.get("memory_size", 100000)
    actor_lr = self.config.get("actor_lr", 1e-3)
    critic_lr = self.config.get("critic_lr", 1e-3)
    self.batch_size = self.config.get("batch_size", 256)
    self.discount = self.config.get("discount", 0.9)
    sigma = self.config.get("sigma", 0.2)
    self.tau= self.config.get("tau", 0.001)
    self.seed = self.config.get("seed", 0)
    self.action_noise= self.config.get("action_noise", "No")
    self.critic_l2_reg= self.config.get("critic_l2_reg", 0.0)
    random.seed(self.seed)
    torch.manual_seed(self.seed)
    
    param_noise= False
    if self.action_noise== "Param": param_noise= True
    
    self.actor = Actor(actor_state_size, actor_action_size, nodes= self.config["actor_nodes"], seed= self.seed, param_noise= param_noise).to(device)
    self.critic = Critic(critic_state_size, critic_action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device)
    self.targetActor = Actor(actor_state_size, actor_action_size, nodes= self.config["actor_nodes"], seed= self.seed, param_noise= param_noise).to(device)
    self.targetCritic = Critic(critic_state_size, critic_action_size, nodes= self.config["critic_nodes"], seed= self.seed).to(device)
    # Initialize parameters
    self.hard_update(self.actor, self.targetActor)
    self.hard_update(self.critic, self.targetCritic)
        
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr= actor_lr)
    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr= critic_lr, weight_decay= self.critic_l2_reg)
    self.criticLoss = nn.MSELoss()  #nn.SmoothL1Loss()
    #self.criticLoss = nn.SmoothL1Loss()
    
    #self.noise= None
    self.noise = NoNoise()
    if self.action_noise== "OU":
      self.noise = OUNoise(np.zeros(actor_action_size), sigma= sigma)
    elif self.action_noise== "No":
      self.noise = NoNoise()
    elif self.action_noise== "Normal":
      self.noise = NormalActionNoise(np.zeros(actor_action_size), sigma= sigma)
      
    self.memory = Memory(memory_size, self.batch_size, self.seed)
    
    
  def hard_update(self, source, target):
      for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)
    
  def soft_update(self, local_model, target_model, tau):
    """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
    for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
      target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)  
    
  def act(self, state, add_noise= True):
    """Returns actions for given state as per current policy."""
    self.actor.resample()
    #state = torch.from_numpy(state).float().to(device)
    #state= torch.FloatTensor(state).view(1, -1).to(device)
    #state= torch.FloatTensor(state).unsqueeze(0).to(device)
    state= torch.FloatTensor(state).to(device)
    if len(state.size())== 1:
      state= state.unsqueeze(0)
    
    self.actor.eval()
    with torch.no_grad():
      action = self.actor(state).cpu().data.numpy()
    self.actor.train()
    
    if add_noise and self.noise:
        action += self.noise()
    return np.clip(action, -1, 1)
    
  def step(self, state, action, reward, next_state, done):
    """Save experience in replay memory, and use random sample from buffer to learn."""
    self.memory.add((state, action, reward, next_state, done))
    if len(self.memory) >= self.batch_size:
      self.learn()
      
  def learn_critic(self, states, actions, rewards, next_states, dones, actions_next):
    # ---------------------------- update critic ---------------------------- #
    # Get predicted next-state actions and Q values from target models
    #actions_next = self.targetActor(next_states)
    Q_targets_next = self.targetCritic(next_states, actions_next)
    # Compute Q targets for current states (y_i)
    Q_targets = rewards + (self.discount * Q_targets_next * (1 - dones))
    Q_targets = Q_targets.detach()
    # Compute critic loss
    Q_expected = self.critic(states, actions)
    critic_loss = self.criticLoss(Q_expected, Q_targets)
    # Minimize the loss
    self.critic_optimizer.zero_grad()
    critic_loss.backward()
    self.critic_optimizer.step()
    # ----------------------- update target networks ----------------------- #
    self.soft_update(self.critic, self.targetCritic, self.tau)
    
  #def learn_actor(self, states, actions, rewards, next_states, dones, actions_pred):
  def learn_actor(self, states, actions_pred):
    # ---------------------------- update actor ---------------------------- #
    # Compute actor loss
    #actions_pred = self.actor(states)
    actor_loss = -self.critic(states, actions_pred).mean()
    # Minimize the loss
    self.actor_optimizer.zero_grad()
    actor_loss.backward()
    self.actor_optimizer.step()
    # ----------------------- update target networks ----------------------- #
    self.soft_update(self.actor, self.targetActor, self.tau) 
      
  def learn(self):
    states, actions, rewards, next_states, dones = self.memory.sample()
    
    self.learn_critic(states, actions, rewards, next_states, dones, self.targetActor(next_states))
    self.learn_actor( states, self.actor(states))
    
  def reset(self):
    self.noise.reset()
      
  def update(self, score= None):
    if score is not None: self.scores.append(score)
      
  def save(self, filename= None):
    data= {"config": self.config, "actor": self.actor.state_dict(), "scores": self.scores,}
    if not filename:
      filename= self.__class__.__name__+ '_'+ datetime.now().strftime("%Y-%m-%d_%H:%M:%S")+ '.data'
    torch.save(data, filename)
    torch.save(self.actor.state_dict(), "last_actor.pth")
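Note on Example 5: below is a hypothetical training loop for the class above, assuming a Gym-style continuous-control environment and that the Actor, Critic, Memory, and noise classes the constructor needs are importable; the environment id and config values are illustrative only:

import gym

env = gym.make('Pendulum-v0')
obs_size = env.observation_space.shape[0]
act_size = env.action_space.shape[0]
config = {"actor_nodes": (256, 128), "critic_nodes": (256, 128),
          "batch_size": 128, "discount": 0.99, "tau": 1e-3,
          "action_noise": "Normal", "sigma": 0.2, "seed": 0}
agent = DDPG(obs_size, act_size, obs_size, act_size, config=config)

for episode in range(200):
    state, score = env.reset(), 0.0
    agent.reset()
    for t in range(1000):
        action = agent.act(state).flatten()
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)  # store and learn
        state, score = next_state, score + reward
        if done:
            break
    agent.update(score)  # record the episode return
agent.save()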
Example 6
    eval_env = gym.make(args.env)
    #eval_env.seed(args.seed+100)
    if logger.get_dir():
        eval_env = bench.Monitor(
            eval_env, os.path.join(logger.get_dir(), "eval.monitor.json"))

    max_timesteps = train_env.spec.timestep_limit

    # set noise type
    current_noise_type = args.noise_type.strip()
    nb_actions = train_env.action_space.shape[0]
    if 'normal' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = NormalActionNoise(mu=np.zeros(nb_actions),
                                         sigma=float(stddev) *
                                         np.ones(nb_actions))
        action_noise.reset()
    elif 'ou' in current_noise_type:
        _, stddev = current_noise_type.split('_')
        action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions),
                                                    sigma=float(stddev) *
                                                    np.ones(nb_actions))
        action_noise.reset()
    else:
        raise RuntimeError(
            'unknown noise type "{}"'.format(current_noise_type))

    episode_rewards = []
    if 'Sparse' in train_env.spec.id:
        sparse = True
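Note on Example 6 (and the page as a whole): every example constructs its exploration noise through the same small interface, a callable object with a reset() method, in the style of baselines.ddpg.noise (attribute names vary slightly, e.g. Example 1 reads .mean instead of .mu). Below is a minimal sketch of that interface for reference, closely following the baselines implementation:

import numpy as np

class NormalActionNoise:
    def __init__(self, mu, sigma):
        self.mu, self.sigma = mu, sigma

    def __call__(self):
        return np.random.normal(self.mu, self.sigma)

    def reset(self):
        pass  # stateless


class OrnsteinUhlenbeckActionNoise:
    def __init__(self, mu, sigma, theta=0.15, dt=1e-2, x0=None):
        self.mu, self.sigma, self.theta, self.dt, self.x0 = mu, sigma, theta, dt, x0
        self.reset()

    def __call__(self):
        # temporally correlated noise: x_next = x + theta*(mu - x)*dt + sigma*sqrt(dt)*N(0, 1)
        x = (self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt +
             self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape))
        self.x_prev = x
        return x

    def reset(self):
        self.x_prev = self.x0 if self.x0 is not None else np.zeros_like(self.mu)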