def main(args):
    logging.debug('Configuration: {}'.format(args))

    network_creator, env_creator = get_network_and_environment_creator(args)

    learner = PAACLearner(network_creator, env_creator, args, SolowRunner,
                          SolowStateProcessor())

    setup_kill_signal_handler(learner)

    logging.info('Starting training')
    learner.train()
    logging.info('Finished training')
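
Example 1's main() delegates clean shutdown to setup_kill_signal_handler, which the listing does not define. A minimal sketch (assuming the learner exposes a stop-like method, which is not shown here) could be:

import logging
import signal

def setup_kill_signal_handler(learner):
    # Sketch only: learner.stop() is an assumed interface; the real
    # PAACLearner may use a different flag or method name.
    def handler(signum, frame):
        logging.info('Received signal {}, stopping learner'.format(signum))
        learner.stop()

    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)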
Example 2
    def testEvalOnce(self):
        pe = PolicyMonitor(env=make_env(),
                           state_processor=SolowStateProcessor(),
                           global_policy_net=self.global_policy_net,
                           summary_writer=self.summary_writer,
                           num_actions=self.num_actions,
                           input_size=self.input_size,
                           temporal_size=self.temporal_size)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())
            total_reward, episode_length, rewards = pe.eval_once(sess)
            self.assertGreater(episode_length, 10)
Example 3
    def __init__(self,
                 name,
                 env,
                 policy_net,
                 value_net,
                 shared_layer,
                 global_counter,
                 discount_factor=0.99,
                 summary_writer=None,
                 max_global_steps=None):
        # Delegate to the base worker, fixing the scale at 100. and
        # plugging in the Solow-specific state processor.
        super(SolowWorker,
              self).__init__(name, env, policy_net, value_net, shared_layer,
                             global_counter, discount_factor, summary_writer,
                             max_global_steps, 100., SolowStateProcessor())
Example 4
    def test_policy_monitor_worker_equal(self):
        # The worker and the policy monitor roll out the same seeded
        # environment greedily, so their first ten rewards should match.
        global_counter = itertools.count()
        worker_env = make_env()
        worker_env.seed(1692)
        worker = SolowWorker(
            'test_worker',
            env=worker_env,
            policy_net=self.global_policy_net,
            value_net=None,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            global_counter=global_counter,
        )

        env = make_env()
        pe = PolicyMonitor(env=env,
                           state_processor=SolowStateProcessor(),
                           global_policy_net=self.global_policy_net,
                           summary_writer=self.summary_writer,
                           num_actions=self.num_actions,
                           input_size=self.input_size,
                           temporal_size=self.temporal_size)

        with self.test_session() as sess:
            sess.run(tf.global_variables_initializer())

            worker.state = worker_env.reset()
            worker.history.append(worker.process_state(worker.state))

            sess.run(worker.copy_params_op)

            transitions = worker.run_n_steps(10, sess, stochastic=False)
            worker_rewards = [t.reward for t in transitions[0]]

            pe.env = make_env()
            pe.env.seed(1692)
            pe.policy_net = worker.policy_net
            total_reward, episode_length, rewards = pe.eval_once(sess)
            monitor_rewards = rewards[:10]

        np.testing.assert_almost_equal(monitor_rewards,
                                       worker_rewards,
                                       decimal=4)
Example 5
    def __init__(self,
                 name,
                 env,
                 policy_net,
                 value_net,
                 shared_layer,
                 global_counter,
                 discount_factor=0.99,
                 summary_writer=None,
                 max_global_steps=None,
                 scale=1.,
                 ub=0.99,
                 lb=0.01,
                 n_grid=51):
        super(GridSolowWorker,
              self).__init__(name, env, policy_net, value_net, shared_layer,
                             global_counter, discount_factor, summary_writer,
                             max_global_steps, scale, SolowStateProcessor())
        # Map each discrete action index to a point on an evenly spaced
        # grid of n_grid values between lb and ub.
        self.idx_to_grid = dict(enumerate(np.linspace(lb, ub, n_grid)))
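
As a quick check of the idx_to_grid mapping above: with the defaults lb=0.01, ub=0.99, n_grid=51, index 0 maps to the lower bound, the last index to the upper bound, and the middle index to the midpoint of the grid.

import numpy as np

idx_to_grid = dict(enumerate(np.linspace(0.01, 0.99, 51)))
print(idx_to_grid[0])    # 0.01 (lb)
print(idx_to_grid[25])   # ~0.5 (grid midpoint)
print(idx_to_grid[50])   # 0.99 (ub)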
Example 6
    def __init__(self, id, emulators, variables, queue, barrier):
        super().__init__(id, emulators, variables, queue, barrier)
        self.state_processor = SolowStateProcessor()
Example 7
    # NOTE: this snippet begins mid-file in the source listing; the loop
    # header and the first SolowWorker arguments below are reconstructed
    # from context and may differ from the original.
    for worker_id in range(NUM_WORKERS):
        worker_summary_writer = summary_writer if worker_id == 0 else None  # assumed
        worker = SolowWorker(
            name='worker_{}'.format(worker_id),
            env=make_env(),
            policy_net=policy_net,
            value_net=value_net,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            global_counter=global_counter,
            discount_factor=0.99,
            summary_writer=worker_summary_writer,
            max_global_steps=FLAGS.max_global_steps)
        workers.append(worker)

    saver = tf.train.Saver(keep_checkpoint_every_n_hours=2.0, max_to_keep=10)

    # Used to periodically evaluate the policy, checkpoint the model,
    # and write episode rewards to TensorBoard
    pe = PolicyMonitor(env=make_eval_env(p, q),
                       global_policy_net=policy_net,
                       state_processor=SolowStateProcessor(),
                       summary_writer=summary_writer,
                       saver=saver,
                       num_actions=NUM_ACTIONS,
                       input_size=INPUT_SIZE,
                       temporal_size=TEMPORAL_SIZE)

with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())
    coord = tf.train.Coordinator()

    # Load a previous checkpoint if it exists
    latest_checkpoint = tf.train.latest_checkpoint(CHECKPOINT_DIR)
    if latest_checkpoint:
        print("Loading model checkpoint: {}".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)