Code Example #1
File: simple.py  Project: princeton-vl/PackIt
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DeepQ cannot output a gym.spaces.Box action space."
            assert issubclass(self.policy, DeepQPolicy), "Error: the input policy for the DeepQ model must be " \
                                                         "an instance of DeepQPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, _ = deepq.build_train(
                    q_func=self.policy,
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess)

                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()
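
The callables stored here are consumed later by the model's training loop. A minimal sketch of that pattern follows; it is hypothetical and not taken from the project: obs, epsilon, env, and replay_buffer are placeholder names, the argument list of self._train_step simply mirrors the standalone usage in Code Example #6 (it can differ between stable-baselines versions), and the sess keyword is assumed to be accepted the same way self.update_target accepts it above.

    # Hedged sketch (hypothetical): one interaction/update step driving the callables
    # produced by setup_model. Placeholder names: obs, epsilon, env, replay_buffer.
    action = self.act(obs[None], update_eps=epsilon)[0]        # epsilon-greedy action selection
    new_obs, reward, done, _ = env.step(action)
    replay_buffer.add(obs, action, reward, new_obs, float(done))
    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
    self._train_step(obses_t, actions, rewards, obses_tp1, dones,
                     np.ones_like(rewards), sess=self.sess)    # one TD update, uniform weights
    self.update_target(sess=self.sess)                         # hard-sync the target network periodically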
Code Example #2
    def setup_model(self):

        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling; a short
            # illustration of this wrapping follows the example), unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()

            with self.graph.as_default():
                self.set_random_seed(self.seed)
                self.sess = tf_util.make_session(num_cpu=self.n_cpu_tf_sess,
                                                 graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = deepq.build_train(
                    q_func=partial(self.policy, **self.policy_kwargs),
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess,
                    full_tensorboard_log=self.full_tensorboard_log,
                    double_q=self.double_q)
                self.proba_step = self.step_model.proba_step
                self.params = tf_util.get_trainable_vars("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()
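
A brief, hypothetical illustration of the functools.partial wrapping handled above: pre-binding keyword arguments (for example disabling dueling, as the comment suggests) leaves the wrapped class reachable through .func for the issubclass check, while build_train can still call the result like the policy class itself. SomeDQNPolicy is a stand-in name, not a real stable-baselines class.

    from functools import partial

    # Stand-in class for illustration only; a real DQNPolicy subclass would be used instead.
    class SomeDQNPolicy:
        def __init__(self, *args, dueling=True, **kwargs):
            self.dueling = dueling

    policy = partial(SomeDQNPolicy, dueling=False)   # pre-bind a keyword argument
    assert policy.func is SomeDQNPolicy              # .func exposes the wrapped class
    assert policy().dueling is False                 # the bound argument is applied on every call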
Code Example #3
    def setup_model(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf_util.make_session(graph=self.graph)

            # https://github.com/hill-a/stable-baselines/blob/master/stable_baselines/deepq/build_graph.py
            self.act, self.train_step, self.update_target, self.step_model = deepq.build_train(
                q_func=self.policy,
                ob_space=self.env.observation_space,
                ac_space=self.env.action_space,
                optimizer=tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate),
                gamma=self.gamma,
                # grad_norm_clipping=1,
                sess=self.sess)
            self.params = find_trainable_variables('deepq')

            tf_util.initialize(self.sess)
            self.update_target(sess=self.sess)
            self.summary = tf.summary.merge_all()
Code Example #4
    def setup_model(self):
        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate)
                # optimizer = tf.contrib.opt.NadamOptimizer(learning_rate=self.learning_rate)
                # optimizer = tf.train.MomentumOptimizer(learning_rate=1e-3, momentum=0.9, use_nesterov=True)

                self.act, self._train_step, self.update_target, self.step_model = deepq.build_train(
                    q_func=self.policy,
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess)
                self.proba_step = self.step_model.proba_step
                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()
Code Example #5
    def setup_model(self):
        with SetVerbosity(self.verbose):

            assert isinstance(self.action_space, gym.spaces.Discrete), \
                "Error: DeepQ cannot output a {} action space, only spaces.Discrete is supported."\
                .format(self.action_space)

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                # capture the shape outside the closure so that the env object is not serialized
                # by cloudpickle when serializing make_obs_ph
                observation_space = self.observation_space

                def make_obs_ph(name):
                    """
                    makes the observation placeholder

                    :param name: (str) the placeholder name
                    :return: (TensorFlow Tensor) the placeholder
                    """
                    return ObservationInput(observation_space, name=name)

                self.act, self._train_step, self.update_target, _ = deepq.build_train(
                    make_obs_ph=make_obs_ph,
                    q_func=self.policy,
                    num_actions=self.action_space.n,
                    optimizer=tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate),
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise)

                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)
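
Note that this snippet appears to target the older, OpenAI Baselines-style build_train signature (make_obs_ph and num_actions instead of the ob_space, ac_space, and sess arguments used in the other examples on this page); the session is therefore passed explicitly only to tf_util.initialize and update_target.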
Code Example #6
def main(args):
    """
    Train a DQN agent on cartpole env
    :param args: (Parsed Arguments) the input arguments
    """
    with tf_utils.make_session(8) as sess:
        # Create the environment
        env = gym.make("CartPole-v0")
        # Create all the functions necessary to train the model
        act, train, update_target, _ = deepq.build_train(
            q_func=CustomPolicy,
            ob_space=env.observation_space,
            ac_space=env.action_space,
            optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
            sess=sess)
        # Create the replay buffer
        replay_buffer = ReplayBuffer(50000)
        # Create the schedule for exploration starting from 1 (every action is random) down to
        # 0.02 (98% of actions are selected according to values predicted by the model).
        exploration = LinearSchedule(schedule_timesteps=10000,
                                     initial_p=1.0,
                                     final_p=0.02)

        # Initialize the parameters and copy them to the target network.
        tf_utils.initialize()
        update_target()

        episode_rewards = [0.0]
        obs = env.reset()
        for step in itertools.count():
            # Take action and update exploration to the newest value
            action = act(obs[None], update_eps=exploration.value(step))[0]
            new_obs, rew, done, _ = env.step(action)
            # Store transition in the replay buffer.
            replay_buffer.add(obs, action, rew, new_obs, float(done))
            obs = new_obs

            episode_rewards[-1] += rew
            if done:
                obs = env.reset()
                episode_rewards.append(0)

            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(
                    float(np.mean(episode_rewards[-101:-1])), 1)

            is_solved = step > 100 and mean_100ep_reward >= 200

            if args.no_render and step > args.max_timesteps:
                break

            if is_solved:
                if args.no_render:
                    break
                # Show off the result
                env.render()
            else:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if step > 1000:
                    obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(
                        32)
                    train(obses_t, actions, rewards, obses_tp1, dones,
                          np.ones_like(rewards))
                # Update target network periodically.
                if step % 1000 == 0:
                    update_target()

            if done and len(episode_rewards) % 10 == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", len(episode_rewards))
                logger.record_tabular("mean episode reward", mean_100ep_reward)
                logger.record_tabular("% time spent exploring",
                                      int(100 * exploration.value(step)))
                logger.dump_tabular()
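
main() expects parsed arguments that expose no_render and max_timesteps. A hypothetical entry point that supplies them is sketched below; the flag names and defaults are assumptions, not taken from the original script.

    import argparse

    if __name__ == '__main__':
        parser = argparse.ArgumentParser(description="Train a DQN agent on CartPole-v0")
        # Flag names and defaults are assumptions for illustration only.
        parser.add_argument('--no-render', action='store_true', default=False,
                            help="disable rendering and stop once the step budget is exceeded")
        parser.add_argument('--max-timesteps', type=int, default=int(1e5),
                            help="step budget used when --no-render is set")
        main(parser.parse_args())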
Code Example #7
    def exec(self):
        """
        Train a DQN agent on cartpole env
        :param args: (Parsed Arguments) the input arguments
        """
        with tf_utils.make_session(8) as sess:
            # Create the environment
            env = self.env
            # Create all the functions necessary to train the model
            act, train, update_target, _ = deepq.build_train(
                q_func=CustomPolicy,
                ob_space=env.observation_space,
                ac_space=env.action_space,
                optimizer=tf.train.AdamOptimizer(learning_rate=5e-4),
                sess=sess,
                double_q=False,
            )
            # Create the replay buffer
            replay_buffer = ReplayBuffer(50000)
            # Exploration decays from 1 (every action is random) down to 0.02 (98% of actions are
            # selected according to values predicted by the model); the decay itself is applied
            # inside the loop via self.linear_decay or self.rb_decay_epsilon.
            solved_yet = False
            is_solved = False
            steps_so_far = 0
            # Initialize the parameters and copy them to the target network.
            tf_utils.initialize()
            update_target()

            episode_rewards = [0.0]
            obs = env.reset()
            for i in trange(self.episode_count):

                step = 0
                done = False
                while not done:
                    step += 1
                    steps_so_far += 1

                    if not self.mode_rbed:
                        self.linear_decay(step=steps_so_far)
                    # Take action and update exploration to the newest value
                    action = act(obs[None], update_eps=self.epsilon)[0]
                    new_obs, rew, done, _ = env.step(action)
                    # Store transition in the replay buffer.
                    replay_buffer.add(obs, action, rew, new_obs, float(done))
                    obs = new_obs

                    episode_rewards[-1] += rew

                    if done:
                        obs = env.reset()

                        last_reward = episode_rewards[-1]

                        if self.mode_rbed:
                            self.rb_decay_epsilon(current_reward=last_reward)

                        if len(episode_rewards[-101:-1]) == 0:
                            mean_100ep_reward = sum(episode_rewards)/100
                        else:
                            mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

                        # is_solved = step > 100 and mean_100ep_reward >= self.env_target

                        # log epsilon
                        self.ex.log_scalar(self.VAL_EPSILON, self.epsilon)

                        # log reward
                        self.ex.log_scalar(self.VAL_REWARD, last_reward)

                        # log average reward
                        self.ex.log_scalar(self.VAL_AVG100, mean_100ep_reward)

                        # log solved at
                        if mean_100ep_reward >= self.env_target and (not solved_yet):
                            solved_yet = True
                            self.ex.log_scalar(self.VAL_SOLVEDAT, i)

                        # For next run
                        episode_rewards.append(0)

                    # Do not train further once solved. Keeping consistent with the original scheme
                    if not solved_yet:
                        # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                        if steps_so_far > 1000:
                            obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32)
                            train(
                                obses_t,
                                actions,
                                rewards,
                                obses_tp1,
                                dones,
                                np.ones_like(rewards))
                        # Update target network periodically.
                        if steps_so_far % 1000 == 0:
                            update_target()
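
self.linear_decay and self.epsilon are used above but not shown. A hypothetical linear decay consistent with the 1 to 0.02 schedule described in the comment is sketched below; the parameter names and defaults are assumptions.

    # Hypothetical sketch of the epsilon decay assumed by the loop above.
    def linear_decay(self, step, initial_p=1.0, final_p=0.02, schedule_timesteps=10000):
        fraction = min(float(step) / schedule_timesteps, 1.0)
        self.epsilon = initial_p + fraction * (final_p - initial_p)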
Code Example #8
File: DQN_metric.py  Project: do-not-be-hasty/RL
    def setup_model(self):
        with SetVerbosity(self.verbose):
            assert not isinstance(self.action_space, gym.spaces.Box), \
                "Error: DQN cannot output a gym.spaces.Box action space."

            # If the policy is wrapped in functools.partial (e.g. to disable dueling),
            # unwrap it to check the class type
            if isinstance(self.policy, partial):
                test_policy = self.policy.func
            else:
                test_policy = self.policy
            assert issubclass(test_policy, DQNPolicy), "Error: the input policy for the DQN model must be " \
                                                       "an instance of DQNPolicy."

            self.graph = tf.Graph()
            with self.graph.as_default():
                self.sess = tf_util.make_session(graph=self.graph)

                optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate)

                self.act, self._train_step, self.update_target, self.step_model = deepq.build_train(
                    q_func=self.policy,
                    ob_space=self.observation_space,
                    ac_space=self.action_space,
                    optimizer=optimizer,
                    gamma=self.gamma,
                    grad_norm_clipping=10,
                    param_noise=self.param_noise,
                    sess=self.sess
                )
                self.proba_step = self.step_model.proba_step
                self.params = find_trainable_variables("deepq")

                # Initialize the parameters and copy them to the target network.
                tf_util.initialize(self.sess)
                self.update_target(sess=self.sess)

                self.summary = tf.summary.merge_all()

            # TODO metric

            self.model = tf.keras.models.Sequential([
                tf.keras.layers.Flatten(input_shape=self.observation_space.shape),
                tf.keras.layers.Dense(256, activation=tf.nn.relu),
                tf.keras.layers.Dense(1, activation=tf.nn.relu)
            ])

            self.model.compile(optimizer='adam',
                               loss='mean_squared_error',
                               metrics=['mean_absolute_error'])

            def mtr_train_naive(obses_beg, obses_step, obses_fin, dist):
                data = np.concatenate([obses_beg, obses_fin], axis=1)
                self.model.fit(x=data, y=dist, verbose=0)

            def mtr_train_step(obses_beg, obses_step, obses_fin, dist):
                data_step = np.concatenate([obses_step, obses_fin], axis=1)
                pred = self.model.predict(data_step, verbose=0).flatten() + 1

                data = np.concatenate([obses_beg, obses_fin], axis=1)
                y = np.minimum(dist, pred*self.mtr_weight + dist*(1-self.mtr_weight))
                # print(obses_beg[0], obses_step[0], obses_fin[0], dist[0], pred[0], y[0])
                self.model.fit(x=data, y=y, verbose=0)

            def mtr_predict(data):
                return self.model.predict(data, verbose=0)

            self.mtr_train = mtr_train_step
            self.mtr_predict = mtr_predict
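
A short, hypothetical usage sketch for the metric head attached above: queries use the same concatenated (start, goal) observation layout that mtr_train_step builds for training. obses_beg and obses_fin are placeholder batches of start and goal observations.

    # Hypothetical usage sketch; obses_beg and obses_fin are placeholders.
    query = np.concatenate([obses_beg, obses_fin], axis=1)   # same layout as in mtr_train_step
    estimated_steps = self.mtr_predict(query).flatten()      # predicted distance (in steps) per pair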