def setUp(self):
    super(TickerTraderWorkerTests, self).setUp()
    self.discount_factor = 0.99
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.global_counter = itertools.count()
    self.batch_size = 16
    self.num_assets = 2
    self.num_actions = 3
    self.input_size = 1 + self.num_assets * 3  # cash + (quantity, price, vol) * n_assets
    self.temporal_size = self.num_assets * 2
    self.T = 10
    with tf.variable_scope("global"):
        self.global_policy_net = DiscreteAndContPolicyEstimator(
            self.num_assets, static_size=self.input_size,
            temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True))
        self.global_value_net = ValueEstimator(
            static_size=self.input_size, temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            reuse=True, num_actions=self.num_actions)
    self.shared_layer = lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True)
def setUp(self):
    super(GridWorkerTests, self).setUp()
    self.discount_factor = 0.99
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.global_counter = itertools.count()
    self.batch_size = 16
    self.num_outputs = 1
    self.num_choices = 3
    self.input_size = 2
    self.temporal_size = 2
    self.T = 10
    with tf.variable_scope("global"):
        self.global_policy_net = DiscretePolicyEstimator(
            self.num_outputs, self.num_choices,
            static_size=self.input_size, temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True))
        self.global_value_net = ValueEstimator(
            static_size=self.input_size, temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            reuse=True)
    self.shared_layer = lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True)
def setUp(self):
    super(PolicyMonitorTest, self).setUp()
    self.batch_size = 16
    self.num_actions = 1
    self.input_size = 2
    self.temporal_size = 2
    self.T = 10
    self.env = make_env()
    self.global_step = tf.Variable(0, name="global_step", trainable=False)
    self.summary_writer = tf.summary.FileWriter(tempfile.mkdtemp())
    with tf.variable_scope("global"):
        self.global_policy_net = GaussianPolicyEstimator(
            self.num_actions, static_size=self.input_size,
            temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True))
        self.global_value_net = ValueEstimator(
            static_size=self.input_size, temporal_size=self.temporal_size,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            reuse=True)
def predict_test(self):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    estimator = ValueEstimator(
        static_size=self.input_size, temporal_size=self.temporal_size,
        shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        learning_rate=1e-3)
    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        # Run feeds
        feed_dict = {
            estimator.states: self.states,
            estimator.history: self.temporal_states,
            estimator.targets: self.targets,
        }
        losses = []
        for _ in range(1000):
            loss = sess.run(estimator.loss, feed_dict)
            pred = sess.run(estimator.predictions, feed_dict)
            grads_ = sess.run(grads, feed_dict)
            grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
            _ = sess.run(estimator.train_op, grad_feed_dict)
            losses.append(loss)

        # Assertions
        self.assertLess(loss, 1e-1)
        self.assertGreater(loss, 0.)
        self.assertEqual(pred['logits'].shape, (self.batch_size,))
        self.assertLess(losses[-1], losses[0])
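# A note on the training idiom above (added sketch, not from the original
# tests): `estimator.train_op` is driven by feeding previously fetched
# gradient values back in through `feed_dict`, which mirrors the A3C flow of
# computing gradients in one step and applying them in another. A minimal
# self-contained TF1 sketch of the same fetch-then-feed round trip, with
# hypothetical variable names:
import tensorflow as tf

x = tf.Variable(2.0)
loss = tf.square(x - 5.0)
optimizer = tf.train.GradientDescentOptimizer(0.1)
grads_and_vars = optimizer.compute_gradients(loss, [x])
grads = [g for g, _ in grads_and_vars]
train_op = optimizer.apply_gradients(grads_and_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    grad_vals = sess.run(grads)  # fetch the symbolic gradients...
    # ...then feed the concrete values back so apply_gradients consumes them
    sess.run(train_op, feed_dict={g: v for g, v in zip(grads, grad_vals)})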
def learn_policy_test(self):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    estimator = GaussianPolicyEstimator(
        self.num_actions, static_size=self.input_size,
        temporal_size=self.temporal_size,
        shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        learning_rate=1e-3, seed=1692)
    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        # Run feeds
        for _ in range(1000):
            feed_dict = {
                estimator.states: self.states,
                estimator.history: self.temporal_states,
                estimator.advantages: np.ones_like(self.advantage),
                estimator.actions: self.actions,
            }
            pred = sess.run(estimator.predictions, feed_dict)
            grads_ = sess.run(grads, feed_dict)
            grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
            _ = sess.run(estimator.train_op, grad_feed_dict)

        self.assertLess(np.mean(np.abs(pred['mu'] - self.actions)), 0.1)
def gaussian_predict_test(self):
    global_step = tf.Variable(0, name='global_step', trainable=False)
    estimator = GaussianPolicyEstimator(
        self.num_actions, static_size=self.input_size,
        temporal_size=self.temporal_size,
        shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True))
    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        # Run feeds
        losses = []
        for _ in range(10):
            feed_dict = {
                estimator.states: self.states,
                estimator.history: self.temporal_states,
                estimator.advantages: self.advantage,
                estimator.actions: self.actions,
            }
            loss = sess.run(estimator.loss, feed_dict)
            losses.append(loss)
            pred = sess.run(estimator.predictions, feed_dict)
            grads_ = sess.run(grads, feed_dict)
            grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
            _ = sess.run(estimator.train_op, grad_feed_dict)

        # Assertions
        self.assertLess(losses[-1], losses[0])
        np.testing.assert_array_less(0., pred['sigma'])
        self.assertEqual(pred['mu'].shape[1], self.num_actions)
        self.assertEqual(pred['sigma'].shape[1], self.num_actions)
def learn_policy_test(self):
    tf.Variable(0, name='global_step', trainable=False)
    estimator = DiscreteAndContPolicyEstimator(
        self.n_assets, static_size=self.input_size,
        temporal_size=self.temporal_size,
        shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        seed=1692, learning_rate=1e-3)

    def all_idx(idx, axis):
        grid = np.ogrid[tuple(map(slice, idx.shape))]
        grid.insert(axis, idx)
        return tuple(grid)

    grads = [g for g, _ in estimator.grads_and_vars]

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())

        # Run feeds
        for _ in range(1000):
            feed_dict = {
                estimator.states: self.states,
                estimator.history: self.temporal_states,
                estimator.advantages: np.ones_like(self.advantage),
                estimator.actions: self.actions,
                estimator.discrete_actions: self.discrete_actions,
            }
            pred = sess.run(estimator.predictions, feed_dict)
            grads_ = sess.run(grads, feed_dict)
            grad_feed_dict = {k: v for k, v in zip(grads, grads_)}
            _ = sess.run(estimator.train_op, grad_feed_dict)

        # Index the 3D outputs with the 2D array of discrete choices.
        prob_optimal_choice = pred['probs'][all_idx(self.discrete_actions, 2)]
        cont_action_optimal_choice = pred['mu'][all_idx(self.discrete_actions, 2)]

        self.assertLess(0.9, prob_optimal_choice.mean())
        self.assertLess(
            np.mean(np.abs(cont_action_optimal_choice - self.actions)), 0.2)
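# Standalone illustration of the `all_idx` helper above (added example, not
# from the original tests): it builds an index tuple so a 2D array of choices
# can select entries along axis 2 of a 3D (batch, asset, choice) array.
import numpy as np

def all_idx(idx, axis):
    grid = np.ogrid[tuple(map(slice, idx.shape))]
    grid.insert(axis, idx)
    return tuple(grid)

probs = np.arange(24.0).reshape(2, 3, 4)    # (batch, asset, choice)
choices = np.array([[0, 1, 3], [2, 2, 0]])  # (batch, asset)
picked = probs[all_idx(choices, 2)]         # picked[b, a] == probs[b, a, choices[b, a]]
assert picked.shape == (2, 3)
assert picked[0, 2] == probs[0, 2, 3]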
def policy_monitor_worker_equal(self):
    global_counter = itertools.count()
    worker_env = make_env()
    worker_env.seed(1692)
    worker = SolowWorker(
        'test_worker',
        env=worker_env,
        policy_net=self.global_policy_net,
        value_net=None,
        shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        global_counter=global_counter,
    )
    env = make_env()
    pe = PolicyMonitor(
        env=env,
        state_processor=SolowStateProcessor(),
        global_policy_net=self.global_policy_net,
        summary_writer=self.summary_writer,
        num_actions=self.num_actions,
        input_size=self.input_size,
        temporal_size=self.temporal_size)

    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        worker.state = worker_env.reset()
        worker.history.append(worker.process_state(worker.state))
        sess.run(worker.copy_params_op)
        transitions = worker.run_n_steps(10, sess, stochastic=False)
        worker_rewards = [t.reward for t in transitions[0]]

        pe.env = make_env()
        pe.env.seed(1692)
        pe.policy_net = worker.policy_net
        total_reward, episode_length, rewards = pe.eval_once(sess)
        monitor_rewards = rewards[:10]

        np.testing.assert_almost_equal(monitor_rewards, worker_rewards, decimal=4)
q = 1
register_solow_env(p, q)

with tf.device("/cpu:0"):
    # Keeps track of the number of updates we've performed
    global_step = tf.Variable(0, name="global_step", trainable=False)

    # Global policy and value nets
    with tf.variable_scope("global"):
        policy_net = GaussianPolicyEstimator(
            NUM_ACTIONS, static_size=INPUT_SIZE, temporal_size=TEMPORAL_SIZE,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
        )
        value_net = ValueEstimator(
            static_size=INPUT_SIZE, temporal_size=TEMPORAL_SIZE,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            reuse=True,
            scale=100.,
        )

    # Global step iterator
    global_counter = itertools.count()

    # Create worker graphs
    workers = []
    for worker_id in range(NUM_WORKERS):
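        # The snippet ends mid-loop. A plausible loop body, inferred from the
        # SolowWorker signature used in the tests above -- an assumption, not
        # the original source:
        worker = SolowWorker(
            "worker_{}".format(worker_id),
            env=make_env(),
            policy_net=policy_net,
            value_net=value_net,
            shared_layer=lambda x_t, x: rnn_graph_lstm(x_t, x, 32, 1, True),
            global_counter=global_counter,
        )
        workers.append(worker)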