    def run(self):
        """Run evaluation episodes with the saved model."""
        assert callable(self.env_maker)
        env = self.env_maker()

        # setup policy
        self.policy_type = self.policy_type.lower()
        if self.policy_type == 'stochastic':
            if discrete_action(env.action_space):
                self.policy = SoftmaxPolicy()
            elif continuous_action(env.action_space):
                self.policy = GaussianPolicy(low=env.action_space.low,
                                             high=env.action_space.high)
            else:
                raise TypeError('Type of action_space not valid')
        elif self.policy_type == 'greedy':
            if not discrete_action(env.action_space):
                raise TypeError('greedy policy supports only discrete action.')
            self.policy = EpsGreedyPolicy(self.policy_eps)
        else:
            raise ValueError('policy type {} invalid.'.format(self.policy_type))

        # load model
        saved_model = self.do_load_model()
        net = self.net_cls()
        net.set_model(saved_model)

        # global_variables_initializer will re-initialize net.weights
        # and so we need to sync to saved_weights
        saved_weights = saved_model.get_weights()
        sess = tf.Session()
        net.set_session(sess)
        sess.run(tf.global_variables_initializer())
        net.set_sync_weights(saved_weights)
        net.sync()

        # evaluation
        all_total_rewards = []
        for _ in range(self.num_episodes):
            state = env.reset()
            self.render_env_at_timestep(env)
            total_rewards = 0.0
            while True:
                state = self.state_to_input(state)
                action_values = net.action_values([state])[0]
                action = self.policy.select_action(action_values)
                print('action:', action)
                state, reward, done, info = env.step(action)
                self.render_env_at_timestep(env)
                total_rewards += reward
                if done:
                    break
            if self.render_end:
                env.render()
            all_total_rewards.append(total_rewards)
            self.print('episode reward: {}'.format(total_rewards))
        average_reward = sum(all_total_rewards) / len(all_total_rewards)
        self.print('average episode reward: {}'.format(average_reward))
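
Both examples dispatch on the type of env.action_space through discrete_action and continuous_action, helpers imported from elsewhere in the package. A minimal sketch of how such checks could look on top of gym's space classes is given below; these bodies are an assumption for illustration, not the package's actual implementation.

# Sketch of the action-space helpers used above (assumed gym-based);
# the real package may implement them differently.
from gym import spaces

def discrete_action(action_space):
    # a finite set of integer actions, e.g. Discrete(n)
    return isinstance(action_space, spaces.Discrete)

def continuous_action(action_space):
    # real-valued actions bounded by low/high, e.g. Box(low, high, shape)
    return isinstance(action_space, spaces.Box)
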
Example #2
    def worker(self, wid):
        """Run a worker process."""
        assert callable(self.env_maker)
        env = self.env_maker()

        # determine action mode from env.action_space
        if discrete_action(env.action_space):
            self.action_mode = 'discrete'
            self.action_dim = env.action_space.n
        elif continuous_action(env.action_space):
            self.action_mode = 'continuous'
            self.action_dim = env.action_space.shape[0]  # size of the action vector
            self.action_low = env.action_space.low
            self.action_high = env.action_space.high
        else:
            raise TypeError('Invalid type of env.action_space')

        self.is_master = wid == 0
        if self.is_master and self.save_dir is not None:
            env_name = 'UnknownEnv-v0' if env.spec is None else env.spec.id
            self.output = self.get_output_dir(env_name)
        else:
            self.output = None

        # ports, cluster, and server
        cluster_list = ['{}:{}'.format(LOCALHOST, p) for p in self.port_list]
        cluster = tf.train.ClusterSpec({JOBNAME: cluster_list})
        tf.train.Server(cluster, job_name=JOBNAME, task_index=wid)
        self.print('Starting server #{}'.format(wid))

        self.setup_algorithm()

        # global/local devices
        worker_dev = '/job:{}/task:{}/cpu:0'.format(JOBNAME, wid)
        rep_dev = tf.train.replica_device_setter(worker_device=worker_dev,
                                                 cluster=cluster)

        self.setup_nets(worker_dev, rep_dev, env)
        if self.replay_type is not None:
            replay_kwargs = {**REPLAY_KWARGS, **self.replay_kwargs}
            if self.is_master:
                self.print_kwargs(replay_kwargs, 'Replay memory arguments')
            if self.replay_type == 'uniform':
                self.replay = Replay(**replay_kwargs)
            elif self.replay_type == 'prioritized':
                self.replay = PriorityReplay(**replay_kwargs)
            else:
                message = 'replay type {} invalid'.format(self.replay_type)
                raise ValueError(message)

        # begin tensorflow session, build async RL agent and train
        port = self.port_list[wid]
        with tf.Session('grpc://{}:{}'.format(LOCALHOST, port)) as sess:
            sess.run(tf.global_variables_initializer())
            self.set_session(sess)

            # train the agent
            self.train_on_env(env)

        if self.num_parallel > 1:
            # signal that this worker has finished training
            self.event_finished.set()
            if self.is_master:
                # the master never returns on its own; it stays alive here
                # until it is terminated externally
                while True:
                    time.sleep(1)
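
worker() assumes one process per task index: each wid starts its own tf.train.Server on a dedicated port from port_list, only the master writes output, and event_finished lets a finished worker signal the launcher while the master keeps looping. A rough sketch of such a launcher is shown below; the trainer object and the process handling are assumptions inferred from how worker() uses these attributes, not the package's actual entry point.

# Hypothetical launcher for the worker() method above; `trainer` and the
# process-handling details are assumptions, not part of the source.
import multiprocessing

def launch(trainer):
    trainer.event_finished = multiprocessing.Event()
    procs = [multiprocessing.Process(target=trainer.worker, args=(wid,))
             for wid in range(trainer.num_parallel)]
    for p in procs:
        p.start()
    if trainer.num_parallel > 1:
        # a worker sets the event when its training loop ends; the master
        # then sleeps forever, so terminate the remaining processes
        trainer.event_finished.wait()
        for p in procs:
            p.terminate()
    for p in procs:
        p.join()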