def _reset(self):
     self._state = 0
     self.last_call_thread_id = threading.current_thread().ident
     return ts.restart([self._state],
                       batch_size=1,
                       reward_spec=self._reward_spec)
  def testSparseObs(self, batch_size, actions_from_reward_layer):
    obs_spec = {
        'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
        'per_arm': {
            'name': tensor_spec.TensorSpec((3,), tf.string),
            'fruit': tensor_spec.TensorSpec((3,), tf.string)
        }
    }
    columns_a = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'name', ['bob', 'george', 'wanda']))
    columns_b = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'fruit', ['banana', 'kiwi', 'pear']))
    columns_c = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'sport', ['bridge', 'chess', 'snooker']))

    dummy_net = arm_network.create_feed_forward_common_tower_network(
        obs_spec,
        global_layers=(3, 4, 5),
        arm_layers=(3, 2),
        common_layers=(4, 3),
        output_dim=self._encoding_dim,
        global_preprocessing_combiner=(tf.compat.v2.keras.layers.DenseFeatures(
            [columns_c])),
        arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
            [columns_a, columns_b]))
    time_step_spec = ts.time_step_spec(obs_spec)
    reward_layer = get_per_arm_reward_layer(encoding_dim=self._encoding_dim)
    policy = neural_linucb_policy.NeuralLinUCBPolicy(
        dummy_net,
        self._encoding_dim,
        reward_layer,
        actions_from_reward_layer=tf.constant(
            actions_from_reward_layer, dtype=tf.bool),
        cov_matrix=self._a[0:1],
        data_vector=self._b[0:1],
        num_samples=self._num_samples_per_arm[0:1],
        epsilon_greedy=0.0,
        time_step_spec=time_step_spec,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
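    # Batch of 2 observations: a scalar global 'sport' feature plus per-arm
    # 'name' and 'fruit' string features for 3 arms each.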
    observations = {
        'global': {
            'sport': tf.constant(['snooker', 'chess'])
        },
        'per_arm': {
            'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
            'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
        }
    }

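    # ts.restart produces the initial (StepType.FIRST) time step for the
    # batched observation; reward and discount take their default values.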
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate([tf.compat.v1.global_variables_initializer(),
                   tf.compat.v1.tables_initializer()])
    action = self.evaluate(action_step.action)
    self.assertAllEqual(action.shape, [2])
    p_info = self.evaluate(action_step.info)
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
    self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
    self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
    first_action = action[0]
    first_arm_name_feature = observations[
        bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
    self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                        first_arm_name_feature[first_action])
Example #3
 def _reset(self):
     self.set_state(self._initial_state)
     self._episode_ended = False
     return ts.restart(self._to_observation())
 def testDistributionRaisesNotImplementedError(self):
     mock_tf_py_policy = tf_py_policy.TFPyPolicy(self._get_mock_py_policy())
     observation = tf.ones([5], tf.float32)
     time_step = ts.restart(observation)
     with self.assertRaises(NotImplementedError):
         mock_tf_py_policy.distribution(time_step=time_step)
Example #5
 def testRestartIsFirst(self):
     observation = tf.constant(-1)
     time_step = ts.restart(observation)
     is_first = time_step.is_first()
     self.assertEqual(True, self.evaluate(is_first))
Example #6
 def _time_step(self):
   return ts.restart(tf.constant([1, 2], dtype=tf.float32))
Example #7
 def _reset(self):
   if self._current_time_step and self._current_time_step.is_last():
     self._episodes += 1
     self._steps = 0
   return ts.restart(self._get_observation())
 def setUp(self):
     super(RandomPyPolicyTest, self).setUp()
     self._time_step_spec = time_step.time_step_spec(
         observation_spec=array_spec.ArraySpec((1, ), np.int32))
     self._time_step = time_step.restart(observation=np.array([1]))
Example #9
 def _reset(self):
   self._count = np.array(0, dtype=np.int32)
   return ts.restart(self._count.copy())
Example #10
 def setUp(self):
   super(ScriptedPyPolicyTest, self).setUp()
   self._obs_spec = array_spec.ArraySpec((), np.int32, 'obs')
   self._time_step_spec = ts.time_step_spec(self._obs_spec)
   self._time_step = ts.restart(observation=1)  # pytype: disable=wrong-arg-types
 def _reset(self):
     self._state = np.zeros(2, dtype=np.int32)
     self._counter = 0
     self._done = False
     return ts.restart(self._state)
  def testPerArmRewardsSparseObs(self):
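    # This test relies on eager execution; skip it when running in graph mode.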
    if not tf.executing_eagerly():
      return
    tf.compat.v1.set_random_seed(3000)
    obs_spec = {
        'global': {'sport': tensor_spec.TensorSpec((), tf.string)},
        'per_arm': {
            'name': tensor_spec.TensorSpec((3,), tf.string),
            'fruit': tensor_spec.TensorSpec((3,), tf.string)
        }
    }
    columns_a = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'name', ['bob', 'george', 'wanda']))
    columns_b = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'fruit', ['banana', 'kiwi', 'pear']))
    columns_c = tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            'sport', ['bridge', 'chess', 'snooker']))

    reward_network = (
        global_and_arm_feature_network.create_feed_forward_common_tower_network(
            observation_spec=obs_spec,
            global_layers=(4, 3, 2),
            arm_layers=(6, 5, 4),
            common_layers=(7, 6, 5),
            global_preprocessing_combiner=(
                tf.compat.v2.keras.layers.DenseFeatures([columns_c])),
            arm_preprocessing_combiner=tf.compat.v2.keras.layers.DenseFeatures(
                [columns_a, columns_b])))

    time_step_spec = ts.time_step_spec(obs_spec)
    action_spec = tensor_spec.BoundedTensorSpec((), tf.int32, 0, 2)
    policy = greedy_reward_policy.GreedyRewardPredictionPolicy(
        time_step_spec,
        action_spec,
        reward_network=reward_network,
        accepts_per_arm_features=True,
        emit_policy_info=('predicted_rewards_mean',))
    observations = {
        'global': {
            'sport': tf.constant(['snooker', 'chess'])
        },
        'per_arm': {
            'name':
                tf.constant([['george', 'george', 'george'],
                             ['bob', 'bob', 'bob']]),
            'fruit':
                tf.constant([['banana', 'banana', 'banana'],
                             ['kiwi', 'kiwi', 'kiwi']])
        }
    }

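    # Build a batched FIRST time step from the two observations above.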
    time_step = ts.restart(observations, batch_size=2)
    action_step = policy.action(time_step, seed=1)
    self.assertEqual(action_step.action.shape.as_list(), [2])
    self.assertEqual(action_step.action.dtype, tf.int32)
    # Initialize all variables
    self.evaluate(tf.compat.v1.global_variables_initializer())
    action = self.evaluate(action_step.action)
    self.assertAllEqual(action.shape, [2])
    p_info = self.evaluate(action_step.info)
    self.assertAllEqual(p_info.predicted_rewards_mean.shape, [2, 3])
    self.assertAllEqual(p_info.chosen_arm_features['name'].shape, [2])
    self.assertAllEqual(p_info.chosen_arm_features['fruit'].shape, [2])
    first_action = action[0]
    first_arm_name_feature = observations[
        bandit_spec_utils.PER_ARM_FEATURE_KEY]['name'][0]
    self.assertAllEqual(p_info.chosen_arm_features['name'][0],
                        first_arm_name_feature[first_action])
Example #13
    def _reset(self):
        observations = self._parallel_env.reset()
        self._dones = [False] * self._num_envs

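        # One batched FIRST time step covering all parallel environments.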
        timesteps = ts.restart(observations, batch_size=self._num_envs)
        return timesteps
 def _reset(self):
     self._state = 0
     self.resets += 1
     self.last_call_thread_id = threading.current_thread().ident
     return ts.restart([self._state])
Example #15
 def _reset(self) -> ts.TimeStep:
     self._pasture_engine.reset()
     return ts.restart(self._pasture_engine.state())
    def testGetEpochLoss(self):
        agent = ppo_agent.PPOAgent(
            self._time_step_spec,
            self._action_spec,
            tf.compat.v1.train.AdamOptimizer(),
            actor_net=DummyActorNet(self._obs_spec, self._action_spec),
            value_net=DummyValueNet(self._obs_spec),
            normalize_observations=False,
            normalize_rewards=False,
            value_pred_loss_coef=1.0,
            policy_l2_reg=1e-4,
            value_function_l2_reg=1e-4,
            entropy_regularization=0.1,
            importance_ratio_clipping=10,
        )
        observations = tf.constant([[1, 2], [3, 4], [1, 2], [3, 4]],
                                   dtype=tf.float32)
        time_steps = ts.restart(observations, batch_size=2)
        actions = tf.constant([[0], [1], [0], [1]], dtype=tf.float32)
        returns = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
        sample_action_log_probs = tf.constant([0.9, 0.3, 0.9, 0.3],
                                              dtype=tf.float32)
        advantages = tf.constant([1.9, 1.0, 1.9, 1.0], dtype=tf.float32)
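        # The zero entries in weights mask out the last two of the four timesteps.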
        weights = tf.constant([1.0, 1.0, 0.0, 0.0], dtype=tf.float32)
        sample_action_distribution_parameters = {
            'loc': tf.constant([[9.0], [15.0], [9.0], [15.0]],
                               dtype=tf.float32),
            'scale': tf.constant([[8.0], [12.0], [8.0], [12.0]],
                                 dtype=tf.float32),
        }
        train_step = tf.compat.v1.train.get_or_create_global_step()

        loss_info = agent.get_epoch_loss(time_steps,
                                         actions,
                                         sample_action_log_probs,
                                         returns,
                                         advantages,
                                         sample_action_distribution_parameters,
                                         weights,
                                         train_step,
                                         debug_summaries=False)

        self.evaluate(tf.compat.v1.global_variables_initializer())
        total_loss, extra_loss_info = self.evaluate(loss_info)
        (policy_gradient_loss, value_estimation_loss, l2_regularization_loss,
         entropy_reg_loss, kl_penalty_loss) = extra_loss_info

        # Check that the loss values are as expected. The factor of 2/4 arises
        # because four timesteps were included in the data but two were masked
        # out: the reduce_mean inside each loss divides by 4, while the
        # hand-computed loss values cover only the first two timesteps.
        expected_pg_loss = -0.0164646133 * 2 / 4
        expected_ve_loss = 123.205 * 2 / 4
        expected_l2_loss = 1e-4 * 12 * 2 / 4
        expected_ent_loss = -0.370111 * 2 / 4
        expected_kl_penalty_loss = 0.0
        self.assertAllClose(expected_pg_loss + expected_ve_loss +
                            expected_l2_loss + expected_ent_loss +
                            expected_kl_penalty_loss,
                            total_loss,
                            atol=0.001,
                            rtol=0.001)
        self.assertAllClose(expected_pg_loss, policy_gradient_loss)
        self.assertAllClose(expected_ve_loss, value_estimation_loss)
        self.assertAllClose(expected_l2_loss,
                            l2_regularization_loss,
                            atol=0.001,
                            rtol=0.001)
        self.assertAllClose(expected_ent_loss, entropy_reg_loss)
        self.assertAllClose(expected_kl_penalty_loss, kl_penalty_loss)
Example #17
 def _reset(self):
     self._state = np.array(self.env.reset(), dtype=np.float32)
     self._episode_ended = False
     return ts.restart(self._state)
Example #18
 def _reset(self):
     self._state = 0
     return ts.restart(self._state)
Example #19
 def _reset(self):
   self._state = np.int32(0)
   return ts.restart(self._state)
Example #20
 def _reset(self):
     self._game.reset()
     return timeStep.restart(self._game.game_state())
Example #21
 def _reset(self):
   if self._current_time_step and self._current_time_step.is_last():
     self._episodes += 1
   self._current_step = np.array(0, dtype=self._dtype)
   return ts.restart(self._get_observation())
 def _reset(self):
     self.env.reset()
     self._state = self.env.encoded_state()
     self._episode_ended = False
     return ts.restart(self._state)
Example #23
 def _reset(self):
     self._done = False
     self._ready_state = self._env.reset()
     self._prev_time_steps = [[None for _ in g] for g in self._ready_state]
     self._prev_actions = [[None for _ in g] for g in self._ready_state]
     return ts.restart(self._ready_state[0][0])
 def _reset(self):
     self.game.game_reset()
     self._episode_ended = False
     return ts.restart(self.obs)
Example #25
 def testRestartIsFirst(self):
     observation = -1
     time_step = ts.restart(observation)
     self.assertTrue(time_step.is_first())
 def _reset(self):
     self._state = self.job_info.get_observation()
     self._episode_ended = False
     self.step_count = 0
     self.assigned_job = []
     return ts.restart(np.array(self._state, dtype=np.float32))
Example #27
 def _reset(self):
     self.reset_board()
     return ts.restart(self._state)
 def _reset(self):
   """Resets the wrapper."""
   self._state = np.array(self._env.reset(), dtype=np.float32)
   self._episode_ended = False
   return ts.restart(self._state)
Example #29
 def _reset(self):
     self.prev_bx = 0.0
     self.prev_bdy = 0.0
     return ts.restart(self.env.reset())
import random

import tensorflow as tf  # needed for tf.constant below
from tf_agents.trajectories import time_step as ts

from Trade import Trade

# dados1, dados2 (market data) and saved_policy are assumed to be defined
# earlier in the notebook this fragment comes from.
trader = Trade()

stop = -500   # stop-loss threshold
gain = 500    # take-profit threshold
trader.reset()
action = 0
for i in range(len(dados1)):
    compra, venda, neg, ficha, comprado, vendido, recompensa = trader.agente(
        dados1.values[i], action, stop, gain, 0)
    # print('state: ', dados2.values[i])
    observations = tf.constant([[dados2.values[i]]])
    # Wrap the observation in a FIRST time step and query the saved policy.
    time_step = ts.restart(observations, batch_size=1)
    action2 = saved_policy.action(time_step)
    # time_step = ts.transition(observations, 1)
    # action2 = agent.policy.action(time_step)
    action = action2.action.numpy()[0]

    print(i, '------------------')
    print('action: ', action)
    print('bought (long): ', comprado)
    print('sold (short): ', vendido)
    print('reward: ', recompensa)

    print('time_step reward: ', time_step.reward.numpy(), ' action: ', action2.action.numpy()[0])

print(sum(neg.ganhofinal))