def test_lstm_example():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    # create vectorized environment
    venv = DummyVecEnv(
        [lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # build policy based on lstm network with 128 units
        policy = policies.build_policy(venv, models.lstm(128))(nbatch=1,
                                                               nsteps=1)

        # initialize tensorflow variables
        sess.run(tf.global_variables_initializer())

        # prepare environment variables
        ob = venv.reset()
        state = policy.initial_state
        done = [False]
        step_counter = 0

        # run a single episode until the end (i.e. until done)
        while True:
            action, _, state, _ = policy.step(ob, S=state, M=done)
            ob, reward, done, _ = venv.step(action)
            step_counter += 1
            if done:
                break

        assert step_counter > 5
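
# For contrast, a feed-forward policy built through the same baselines API
# (e.g. models.mlp()) needs no recurrent state or done-mask bookkeeping.
# This is a minimal sketch; the mlp network choice is an assumption, not from the source.
def test_mlp_example_sketch():
    import tensorflow as tf
    from baselines.common import policies, models, cmd_util
    from baselines.common.vec_env.dummy_vec_env import DummyVecEnv

    venv = DummyVecEnv(
        [lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)])

    with tf.Session() as sess:
        # feed-forward (mlp) policy; its initial_state is None
        policy = policies.build_policy(venv, models.mlp())(nbatch=1, nsteps=1)
        sess.run(tf.global_variables_initializer())

        ob = venv.reset()
        # no S (state) or M (mask) arguments are needed for a non-recurrent policy
        action, value, _, _ = policy.step(ob)
        ob, reward, done, _ = venv.step(action)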
    @classmethod
    def load_policy(cls, policy_dict_path, tf_generator, network_config=None):
        """
        For when we only need to load a policy for the forward pass. For instance, to run on the robot from
        a checkpointed policy.
        """
        from tensorflow.python.framework import ops
        ops.reset_default_graph()  # we need to destroy the default graph before re_init or checkpoint won't restore.
        pol_dict = pickle.load(open(policy_dict_path, "rb"))
        tf_map = tf_generator(dim_input=pol_dict['deg_obs'],
                              dim_output=pol_dict['deg_action'],
                              batch_size=1,
                              network_config=network_config)

        sess = tf.Session()
        init_op = tf.initialize_all_variables()
        sess.run(init_op)
        saver = tf.train.Saver()
        check_file = pol_dict['checkpoint_path_tf']
        saver.restore(sess, check_file)

        device_string = pol_dict['device_string']

        cls_init = cls(pol_dict['deg_action'], tf_map.get_input_tensor(),
                       tf_map.get_output_op(), np.zeros((1, )), sess,
                       device_string)
        cls_init.chol_pol_covar = pol_dict['chol_pol_covar']
        cls_init.scale = pol_dict['scale']
        cls_init.bias = pol_dict['bias']
        cls_init.x_idx = pol_dict['x_idx']
        return cls_init
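
# Hypothetical call site for the classmethod above, assuming it is TfPolicy.load_policy
# from the GPS codebase; the import paths, network constructor (tf_network) and pickle
# path below are assumptions for illustration only.
# from gps.algorithm.policy.tf_policy import TfPolicy
# from gps.algorithm.policy_opt.tf_model_example import tf_network
#
# policy = TfPolicy.load_policy('policy_dict.pkl', tf_network)
# action = policy.act(x, obs, t, noise)  # query the restored policy for the forward pass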
Example #3
def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS):
    np.random.seed(0)
    np_random.seed(0)

    env = DummyVecEnv([env_fn])

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True)).as_default():
        tf.set_random_seed(0)

        model = learn_fn(env)

        sum_rew = 0
        done = True

        for i in range(n_trials):
            if done:
                obs = env.reset()
                state = model.initial_state

            if state is not None:
                a, v, state, _ = model.step(obs, S=state, M=[False])
            else:
                a, v, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(a)
            sum_rew += float(rew)

        print("Reward in {} trials is {}".format(n_trials, sum_rew))
        assert sum_rew > min_reward_fraction * n_trials, \
            'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials)
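
# A sketch of how this helper is typically driven; the environment id and learner
# (baselines' ppo2 on CartPole) are illustrative choices, not taken from the source.
def simple_test_usage_sketch():
    import gym
    from baselines.ppo2 import ppo2

    def env_fn():
        return gym.make('CartPole-v0')

    def learn_fn(env):
        # any learn_fn(env) returning a model with .step() and .initial_state works
        return ppo2.learn(network='mlp', env=env, total_timesteps=30000)

    simple_test(env_fn, learn_fn, min_reward_fraction=0.5)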
Example #4
    def __init__(self, hyperparams, dO, dU):
        config = copy.deepcopy(POLICY_OPT_TF)
        config.update(hyperparams)

        PolicyOpt.__init__(self, config, dO, dU)

        tf.set_random_seed(self._hyperparams['random_seed'])

        self.tf_iter = 0
        self.batch_size = self._hyperparams['batch_size']
        self.device_string = "/cpu:0"
        if self._hyperparams['use_gpu'] == 1:
            self.gpu_device = self._hyperparams['gpu_id']
            self.device_string = "/gpu:" + str(self.gpu_device)
        self.act_op = None  # mu_hat
        self.feat_op = None  # features
        self.loss_scalar = None
        self.obs_tensor = None
        self.precision_tensor = None
        self.action_tensor = None  # mu true
        self.solver = None
        self.feat_vals = None
        self.init_network()
        self.init_solver()
        self.var = self._hyperparams['init_var'] * np.ones(dU)
        self.sess = tf.Session()
        self.policy = TfPolicy(
            dU,
            self.obs_tensor,
            self.act_op,
            self.feat_op,
            np.zeros(dU),
            self.sess,
            self.device_string,
            copy_param_scope=self._hyperparams['copy_param_scope'])

        # List of indices for state (vector) data and image (tensor) data in observation.
        self.x_idx, self.img_idx, i = [], [], 0
        if 'obs_image_data' not in self._hyperparams['network_params']:
            self._hyperparams['network_params'].update({'obs_image_data': []})
        for sensor in self._hyperparams['network_params']['obs_include']:
            dim = self._hyperparams['network_params']['sensor_dims'][sensor]
            if sensor in self._hyperparams['network_params']['obs_image_data']:
                self.img_idx = self.img_idx + list(range(i, i + dim))
            else:
                self.x_idx = self.x_idx + list(range(i, i + dim))
            i += dim
        init_op = tf.initialize_all_variables()
        self.sess.run(init_op)
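
# For reference, a sketch of the hyperparameter dict this constructor reads,
# built only from the keys accessed above; all values are illustrative.
example_policy_opt_hyperparams = {
    'random_seed': 1,
    'batch_size': 25,
    'use_gpu': 0,              # set to 1 and add 'gpu_id' to run on the GPU
    'init_var': 0.1,
    'copy_param_scope': 'conv_params',
    'network_params': {
        'obs_include': ['joint_angles', 'joint_velocities'],   # sensor names illustrative
        'sensor_dims': {'joint_angles': 7, 'joint_velocities': 7},
        # 'obs_image_data' is optional; it defaults to [] above
    },
}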
    @staticmethod
    def load_act(path):
        with open(path, "rb") as f:
            model_data, act_params = cloudpickle.load(f)
        act = deepq.build_act(**act_params)
        sess = tf.Session()
        sess.__enter__()
        with tempfile.TemporaryDirectory() as td:
            arc_path = os.path.join(td, "packed.zip")
            with open(arc_path, "wb") as f:
                f.write(model_data)

            zipfile.ZipFile(arc_path, 'r', zipfile.ZIP_DEFLATED).extractall(td)
            load_variables(os.path.join(td, "model"))

        return ActWrapper(act, act_params)
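
# Hypothetical usage of the restored actor, assuming the method above is
# ActWrapper.load_act from baselines' deepq; the model path and env are illustrative.
def load_act_usage_sketch():
    import gym

    act = ActWrapper.load_act('cartpole_model.pkl')  # illustrative path
    env = gym.make('CartPole-v0')
    obs, done = env.reset(), False
    while not done:
        # deepq act functions expect a batch dimension on the observation
        obs, reward, done, _ = env.step(act(obs[None])[0])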
Example #6
def reward_per_episode_test(env_fn,
                            learn_fn,
                            min_avg_reward,
                            n_trials=N_EPISODES):
    env = DummyVecEnv([env_fn])

    with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(
            allow_soft_placement=True)).as_default():
        model = learn_fn(env)

        observations, actions, rewards = rollout(env, model, n_trials)
        rewards = [sum(r) for r in rewards]

        avg_rew = sum(rewards) / n_trials
        print("Average reward in {} episodes is {}".format(n_trials, avg_rew))
        assert avg_rew > min_avg_reward, \
            'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward)
def main(args):
    for ite in range(int(args['trial_num'])):
        print('Trial Number:', ite)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        config = tf.ConfigProto(gpu_options=gpu_options,
                                log_device_placement=False)

        with tf.Session(config=config) as sess:

            if args['change_seed']:
                #rand_seed = 10 * ite
                rand_seed = np.random.randint(1, 1000, size=1)
            else:
                rand_seed = 0
            env = gym.make(args['env'])

            np.random.seed(int(args['random_seed']) + int(rand_seed))
            tf.set_random_seed(int(args['random_seed']) + int(rand_seed))
            env.seed(int(args['random_seed']) + int(rand_seed))

            env_test = gym.make(args['env'])
            env_test.seed(int(args['random_seed']) + int(rand_seed))

            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.shape[0]
            print('action_space.shape', env.action_space.shape)
            print('observation_space.shape', env.observation_space.shape)
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            #print(env.action_space.high)
            #print(env.action_space.low)
            assert (env.action_space.high[0] == -env.action_space.low[0])

            agent = TD3(sess,
                        env,
                        state_dim,
                        action_dim,
                        action_bound,
                        int(args['minibatch_size']),
                        tau=float(args['tau']),
                        actor_lr=float(args['actor_lr']),
                        critic_lr=float(args['critic_lr']),
                        gamma=float(args['gamma']),
                        hidden_dim=np.asarray(args['hidden_dim']))

            # actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_dim))

            if args['use_gym_monitor']:
                if not args['render_env']:
                    env = wrappers.Monitor(env,
                                           args['monitor_dir'],
                                           video_callable=False,
                                           force=True)
                else:
                    env = wrappers.Monitor(
                        env,
                        args['monitor_dir'],
                        video_callable=lambda episode_id: episode_id % 50 == 0,
                        force=True)

            step_R_i = train(sess, env, env_test, args, agent)

            result_path = "./results/trials/"
            try:
                import pathlib
                pathlib.Path(result_path).mkdir(parents=True, exist_ok=True)
            except Exception:
                print(
                    "The results directory does not exist and could not be created; trial results will not be saved."
                )

            result_filename = args['result_file'] + '_' + args[
                'env'] + '_trial_idx_' + str(int(args['trial_idx'])) + '.txt'

            if args['overwrite_result'] and ite == 0:
                np.savetxt(result_filename, np.asarray(step_R_i))
            else:
                data = np.loadtxt(result_filename, dtype=float)
                data_new = np.vstack((data, np.asarray(step_R_i)))
                np.savetxt(result_filename, data_new)

            if args['use_gym_monitor']:
                env.monitor.close()
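
# The `args` mapping consumed by main() above looks like parsed command-line arguments.
# Below is a hypothetical parser producing exactly the keys read by main(); the flag
# names mirror those keys, while all defaults are illustrative assumptions.
def build_args_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='TD3 training trials')
    parser.add_argument('--env', default='Pendulum-v0')
    parser.add_argument('--random-seed', type=int, default=1234)
    parser.add_argument('--change-seed', action='store_true')
    parser.add_argument('--trial-num', type=int, default=1)
    parser.add_argument('--trial-idx', type=int, default=0)
    parser.add_argument('--minibatch-size', type=int, default=100)
    parser.add_argument('--tau', type=float, default=0.005)
    parser.add_argument('--actor-lr', type=float, default=1e-3)
    parser.add_argument('--critic-lr', type=float, default=1e-3)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--hidden-dim', nargs='+', type=int, default=[400, 300])
    parser.add_argument('--use-gym-monitor', action='store_true')
    parser.add_argument('--render-env', action='store_true')
    parser.add_argument('--monitor-dir', default='./results/gym_monitor')
    parser.add_argument('--result-file', default='trial_rewards')
    parser.add_argument('--overwrite-result', action='store_true')
    # argparse converts dashes to underscores, so vars() yields the keys main() expects
    return vars(parser.parse_args())

# main(build_args_sketch())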
Example #8
def main(args):
    result_name = 'TD3_' + args['env'] + '_trial_idx_' + str(
        int(args['trial_idx']))

    for ite in range(int(args['trial_num'])):
        print('Trial Number:', ite)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
        config = tf.ConfigProto(gpu_options=gpu_options,
                                log_device_placement=False)

        with tf.Session(config=config) as sess:

            if args['change_seed']:
                rand_seed = 10 * ite
            else:
                rand_seed = 0

            np.random.seed(int(args['random_seed']) + int(rand_seed))
            tf.set_random_seed(int(args['random_seed']) + int(rand_seed))
            env = gym.make(args['env'])
            env.seed(int(args['random_seed']) + int(rand_seed))

            if args['save_video']:
                try:
                    import pathlib
                    pathlib.Path("./Video/" + args['env']).mkdir(parents=True,
                                                                 exist_ok=True)
                    video_relative_path = "./Video/" + args['env'] + "/"

                    ## To save video of the first episode
                    env = gym.wrappers.Monitor(
                        env,
                        video_relative_path,
                        video_callable=lambda episode_id: episode_id == 0,
                        force=True)
                    ## To save video of every episodes
                    # env_test = gym.wrappers.Monitor(env_test, video_relative_path, \
                    #    video_callable=lambda episode_id: episode_id%1==0, force =True)
                except Exception:
                    print(
                        "Cannot create video directories. Video will not be saved."
                    )

            state_dim = env.observation_space.shape[0]
            action_dim = env.action_space.shape[0]
            action_bound = env.action_space.high
            # Ensure action bound is symmetric
            assert (env.action_space.high[0] == -env.action_space.low[0])

            if args['method_name'] == 'TD3':
                from TD3_keras_agent import TD3
                agent = TD3(
                    sess,
                    env,
                    state_dim,
                    action_dim,
                    action_bound,
                    int(args['minibatch_size']),
                    tau=float(args['tau']),
                    actor_lr=float(args['actor_lr']),
                    critic_lr=float(args['critic_lr']),
                    gamma=float(args['gamma']),
                    hidden_dim=np.asarray(args['hidden_dim']),
                )

            agent.load_model(iteration=int(args['load_model_iter']),
                             expname=result_name)

            # if args['use_gym_monitor']:
            #     if not args['render_env']:
            #         env = wrappers.Monitor(
            #                 env, args['monitor_dir'], video_callable=False, force=True)
            #     else:
            #         env = wrappers.Monitor(env, args['monitor_dir'], video_callable=lambda episode_id: episode_id==0, force=True)

            test(sess, env, args, agent, result_name)