Example 1
def test_ClassifierEnv():
    """Tests imbDRL.environments.classifierenv.ClassifierEnv."""
    X = np.arange(10, dtype=np.float32)
    y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=np.int32)

    env = ClassifierEnv(X, y, 0.2)
    validate_py_environment(env, episodes=5)
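For orientation before the remaining examples: validate_py_environment drives the environment for the requested number of episodes with a random policy and checks every returned TimeStep against the declared specs. The following minimal, self-contained sketch is not taken from any of the examples in this list; the class name, bounds, and termination rule are illustrative only, but it shows the PyEnvironment contract that the validator exercises.

import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class MinimalCountingEnv(py_environment.PyEnvironment):
    """Counts steps and terminates after 10; names and bounds are illustrative."""

    def __init__(self):
        super().__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, maximum=10, name='observation')
        self._count = 0
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._count = 0
        self._episode_ended = False
        return ts.restart(np.array([self._count], dtype=np.int32))

    def _step(self, action):
        if self._episode_ended:
            # The previous action ended the episode; ignore this one and restart.
            return self.reset()
        self._count += 1
        obs = np.array([self._count], dtype=np.int32)
        if self._count >= 10:
            self._episode_ended = True
            return ts.termination(obs, reward=1.0)
        return ts.transition(obs, reward=0.0, discount=1.0)


utils.validate_py_environment(MinimalCountingEnv(), episodes=5)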
Example 2
    def __init__(self, game):

        # set game
        self._game = game

        # set action range
        self.action_count = param.CAM_COUNT * param.MOVE_OPTIONS
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(),
            dtype=np.int32,
            minimum=0,
            maximum=self.action_count - 1,
            name='action')

        # set observation range
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(self._game._n_states, ),
            dtype=np.float32,
            minimum=param.OBS_SPEC_MIN,
            maximum=param.OBS_SPEC_MAX,
            name='observation')

        # create action dictionary
        self.create_action_dict()

        # make sure the environment is okay
        utils.validate_py_environment(self, episodes=5)
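The __init__ above only stores the specs. For utils.validate_py_environment(self, episodes=5) to succeed, the surrounding class (assumed here to subclass py_environment.PyEnvironment) must also expose them through the standard accessors, along with _reset and _step implementations that are not part of this excerpt. A sketch of the accessor boilerplate:

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec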
Example 3
    def test_motion_primitives_concat_state():
        params = ParameterServer(
            filename="modules/runtime/tests/data/highway_merging.json")
        scenario_generation = UniformVehicleDistribution(num_scenarios=3,
                                                         random_seed=0,
                                                         params=params)
        state_observer = StateConcatenation(params=params)
        action_wrapper = MotionPrimitives(params=params)
        evaluator = GoalReached(params=params)
        viewer = MPViewer(params=params,
                          x_range=[-30, 30],
                          y_range=[-20, 40],
                          follow_agent_id=True)  # use_world_bounds=True

        runtimerl = RuntimeRL(action_wrapper=action_wrapper,
                              nn_observer=state_observer,
                              evaluator=evaluator,
                              step_time=0.05,
                              viewer=viewer,
                              scenario_generator=scenario_generation)

        tfa_env = TFAWrapper(runtimerl)
        _ = tfa_env.reset()
        utils.validate_py_environment(tfa_env, episodes=5)
        _ = tf_py_environment.TFPyEnvironment(tfa_env)
Example 4
def main():
    env = slime_env()
    #o0 = env.reset()
    #o1 = env.step(0)

    # Check that the environment class works correctly
    utils.validate_py_environment(env, episodes=5)
Example 5
    def test_tfa_runtime():
        params = ParameterServer(
            filename="tests/data/deterministic_scenario_test.json")
        scenario_generation = DeterministicScenarioGeneration(num_scenarios=3,
                                                              random_seed=0,
                                                              params=params)
        state_observer = ClosestAgentsObserver(params=params)
        action_wrapper = DynamicModel(params=params)
        evaluator = GoalReached(params=params)
        viewer = MPViewer(params=params,
                          x_range=[-30, 30],
                          y_range=[-20, 40],
                          follow_agent_id=True)  # use_world_bounds=True

        runtimerl = RuntimeRL(action_wrapper=action_wrapper,
                              observer=state_observer,
                              evaluator=evaluator,
                              step_time=0.05,
                              viewer=viewer,
                              scenario_generator=scenario_generation)

        tfa_env = TFAWrapper(runtimerl)
        _ = tfa_env.reset()

        utils.validate_py_environment(tfa_env, episodes=5)
        _ = tf_py_environment.TFPyEnvironment(tfa_env)
Example 6
def validate_environment():
    validate_env = SpinQubitEnv(0.1,
                                sigmax(),
                                basis(2, 0),
                                1,
                                .1)
    utils.validate_py_environment(validate_env, episodes=5)
Example 7
def test_jumping():
  """Test jumping environment."""
  env = JumpingEnvironment(**params)
  validate_py_environment(env, episodes=10)
  policy = RandomPyPolicy(time_step_spec=None, action_spec=env.action_spec())
  filepath = os.path.join(configs.TEMP_DIR, 'test_jumping.mp4')
  episode_as_video(env, policy, filepath=filepath)
  assert glob.glob(filepath.split('.')[0] + '*')
Example 8
def test_multi_monster():
  """Test multi-monster environment."""
  env = MultiMonsterEnvironment(n_monsters=3, **params)
  validate_py_environment(env, episodes=10)
  policy = RandomPyPolicy(time_step_spec=None, action_spec=env.action_spec())
  filepath = os.path.join(configs.TEMP_DIR, 'test_multi.mp4')
  episode_as_video(env, policy, filepath=filepath)
  assert glob.glob(filepath.split('.')[0] + '*')
Example 9
  def testValidateOk(self):
    env = get_mock_env(self._action_spec, self._observation_spec, None)
    rng = np.random.RandomState()

    sample_fn = lambda: array_spec.sample_spec_nest(env.observation_spec(), rng)

    def step(unused_time_step):
      if rng.rand() < 0.10:
        return ts.termination(sample_fn(), 0.0)  # pytype: disable=wrong-arg-types
      else:
        return ts.transition(sample_fn(), 1.0)  # pytype: disable=wrong-arg-types

    env.step = step
    env.reset = lambda: ts.restart(sample_fn())

    utils.validate_py_environment(env, episodes=2)
Example 10
  def testValidateBoundedSpecDistinctBounds(self):
    observation_spec = array_spec.BoundedArraySpec((3,), np.int32,
                                                   [-10, -5, -2], [10, 5, 2])
    env = get_mock_env(self._action_spec, observation_spec, None)
    rng = np.random.RandomState()
    sample_fn = lambda: array_spec.sample_spec_nest(env.observation_spec(), rng)

    def step(unused_time_step):
      if rng.rand() < 0.10:
        return ts.termination(sample_fn(), 0.0)  # pytype: disable=wrong-arg-types
      else:
        return ts.transition(sample_fn(), 1.0)  # pytype: disable=wrong-arg-types

    env.step = step
    env.reset = lambda: ts.restart(sample_fn())
    utils.validate_py_environment(env, episodes=1)
Example 11
def test_environment(py_env, observe_action, terminate_action):
    """
	Helper function which tests out a metamdp environment. If this runs without crashing, it
	is likely that the environment does not contain egregious bugs, at least the inputs/outputs 
	are likely to match the required action and observation specs. Of course, the transition logic of the 
	environment may still be messed up.
	"""
    print('ObservationSpec:', py_env.observation_spec())
    print('ActionSpec:', py_env.action_spec())

    time_step = py_env.reset()
    cumulative_reward = time_step.reward
    print(cumulative_reward)

    for a in [observe_action] * 10 + [terminate_action]:
        time_step = py_env.step(a)
        cumulative_reward += time_step.reward
        print(cumulative_reward)
    # The loop above computes the cumulative reward for a single episode in which the
    # agent takes the 'observe' action 10 times and then terminates.

    print('Final Reward = ', cumulative_reward)
    utils.validate_py_environment(py_env)
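A hypothetical call site for this helper; MyMetaMDPEnv and the action values 0 and 1 are placeholders, not names taken from the snippet above:

py_env = MyMetaMDPEnv()  # placeholder environment class
test_environment(py_env, observe_action=0, terminate_action=1)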
Example 12
        elif action == 0:
            new_card = np.random.randint(1, 11)
            self._state += new_card
        else:
            raise ValueError('`action` should be 0 or 1.')

        if self._episode_ended or self._state >= 21:
            reward = self._state - 21 if self._state <= 21 else -21
            return ts.termination(np.array([self._state], dtype=np.int32), reward)
        else:
            return ts.transition(
                np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)


environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)

get_new_card_action = 0
end_round_action = 1

environment = CardGameEnv()
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward

for _ in range(3):
    time_step = environment.step(get_new_card_action)
    print(time_step)
    cumulative_reward += time_step.reward

time_step = environment.step(end_round_action)
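The excerpt stops here. Following the same tallying pattern used a few lines above, the round's reward would typically be finished off like this (a sketch, not necessarily the original continuation):

print(time_step)
cumulative_reward += time_step.reward
print('Final Reward = ', cumulative_reward)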
Example 13
  def testValidateWithBatchSize(self):
    batch_size = 2
    obs_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
    env = random_py_environment.RandomPyEnvironment(
        obs_spec, batch_size=batch_size)
    utils.validate_py_environment(env)
Example 14
def test_PointMass2DEnv():
    env = PointMass2DEnv()
    utils.validate_py_environment(env, episodes=2)
Example 15
    def testEnvRegistered(self):
        env = suite_dm_control.load('ball_in_cup', 'catch')
        self.assertIsInstance(env, py_environment.Base)

        utils.validate_py_environment(env)
Example 16
    def __init__(
        self,
        alphabet: str,
        starting_seq: str,
        model: flexs.Model,
        max_num_steps: int,
    ):  # pylint: disable=W0231
        """
        Initialize PPO agent environment.

        Based on this tutorial:
        https://www.mikulskibartosz.name/how-to-create-an-environment-for-a-tensorflow-agent

        Args:
            alphabet: Usually UCGA.
            starting_seq: The sequence that is initially mutated when the
                environment is created.
            model: Landscape or model which evaluates
                each sequence.
            max_num_steps: Maximum number of steps before
                episode is forced to terminate. Usually the
                `model_queries_per_batch`.

        """
        self.alphabet = alphabet

        # model/model/measurements
        self.model = model
        self.previous_fitness = -float("inf")

        # sequence
        self.seq = starting_seq
        self._state = {
            "sequence":
            string_to_one_hot(self.seq, self.alphabet).astype(np.float32),
            "fitness":
            self.model.get_fitness([starting_seq]).astype(np.float32),
        }
        self.episode_seqs = set()  # the sequences seen in the current episode
        self.measured_sequences = {}

        # tf_agents environment
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(1, ),
            dtype=np.integer,
            minimum=0,
            maximum=len(self.seq) * len(self.alphabet) - 1,
            name="action",
        )
        self._observation_spec = {
            "sequence":
            array_spec.BoundedArraySpec(
                shape=(len(self.seq), len(self.alphabet)),
                dtype=np.float32,
                minimum=0,
                maximum=1,
            ),
            "fitness":
            array_spec.ArraySpec(shape=(1, ), dtype=np.float32),
        }

        self.num_steps = 0
        self.max_num_steps = max_num_steps

        validate_py_environment(self, episodes=1)
Example 17
  def test_validate_specs(self):
    env = test_envs.CountingEnv(steps_per_episode=15)
    env_utils.validate_py_environment(env, episodes=10)
Example 18
from tf_agents.environments import utils
from tf_agents.environments import wrappers
import tensorflow as tf
import numpy as np

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.specs import array_spec
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

environment = DualGoalMaze()
stats_env = wrappers.RunStats(environment)

utils.validate_py_environment(stats_env, episodes=5)

time_step = stats_env.reset()
rewards = []
steps = []
num_episodes = 5

for _ in range(num_episodes):
    episode_reward = 0
    episode_steps = 0
    while not time_step.is_last():
        action = np.random.randint(0, 4)
        time_step = stats_env.step(action)
        episode_steps += 1
        episode_reward += time_step.reward
    rewards.append(episode_reward)
    steps.append(episode_steps)
    # Start the next episode from a fresh reset.
    time_step = stats_env.reset()
Example 19
  def testValidateNotATimeStep(self):
    env = get_mock_env(self._action_spec, self._observation_spec, None)

    with self.assertRaises(ValueError):
      utils.validate_py_environment(env, episodes=1)
Example 20
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()

        # Check if the move is valid and reward accordingly.
        if 0 <= action <= 8:
            if self.isSpotEmpty(action):
                self._grid[action] = self.mark
                reward = self.calcReward()
                print("agent goes in spot {} for reward {}".format(action, reward))
            else:  # Punish for picking a spot that has already been picked.
                reward = -10
            self._episode_ended = True
        else:
            raise ValueError('`action` should be 0 - 8.')

        if not self.isGridFull():
            self.takeOppTurn()
        else:
            self._episode_ended = True

        if self._episode_ended:
            return ts.termination(self._grid, reward)
        else:
            return ts.transition(self._grid, reward=2, discount=1.0)


print("poop")
env = TicTacToeEnv()
print(env._grid)
utils.validate_py_environment(env, episodes=1)
Example 21
  def testValidateOutOfBounds(self):
    env = get_mock_env(self._action_spec, self._observation_spec,
                       ts.restart(np.array([-11], dtype=np.int32)))

    with self.assertRaisesRegexp(ValueError, "does not match expected"):
      utils.validate_py_environment(env, episodes=1)
Example 22
def test_environment():
    """Test environment using built-in validate tool."""
    environment = LanceEnvironment()
    utils.validate_py_environment(environment, episodes=5)
    print('Test successful.')
Example 23
import numpy as np
import tensorflow as tf
import rospy
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from arm_pyenv import ArmEnv
# source devel/setup.bash
# roslaunch arm_bringup sim_bringup.launch world:=empty
rospy.init_node("test")
tf.compat.v1.enable_v2_behavior()

environment = ArmEnv()

timed_env = wrappers.TimeLimit(
    environment,
    900
)
utils.validate_py_environment(timed_env, episodes=5)
print('action_spec:', environment.action_spec())
print('time_step_spec:', environment.time_step_spec())
print('time_step_spec.observation:', environment.time_step_spec().observation)
print('time_step_spec.step_type:', environment.time_step_spec().step_type)
print('time_step_spec.discount:', environment.time_step_spec().discount)
print('time_step_spec.reward:', environment.time_step_spec().reward)

Example 24
    def check_valid(self):
        utils.validate_py_environment(self, episodes=5)
        print('OK')
Example 25
  def testValidateWrongDTypeAndShape(self):
    env = get_mock_env(self._action_spec, self._observation_spec,
                       ts.restart(np.array([0, 1], dtype=np.int64)))

    with self.assertRaisesRegexp(ValueError, "does not match expected"):
      utils.validate_py_environment(env, episodes=1)
Example 26
def test_Quadcopter3DEnv():
    env = Quadcopter3DEnv()
    utils.validate_py_environment(env, episodes=2)
Example 27
import tensorflow as tf

from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# from evn_chamberModel import EnvChamberModel
from env_ChamberModel_standalone import EnvChamberModel

tf.compat.v1.enable_v2_behavior()

#################
# RL environment Setup#
#################
# FabModel = EnvChamberModel()
FabModel = EnvChamberModel(wafer=10, discount=0.99)
FabModel_1 = EnvChamberModel(wafer=10, discount=0.99)

utils.validate_py_environment(FabModel, episodes=5)
train_tf_env = tf_py_environment.TFPyEnvironment(FabModel)
eval_tf_env = tf_py_environment.TFPyEnvironment(FabModel_1)
print('Observation Spec:')
print(train_tf_env.observation_spec())
# print('Reward Spec:')
# print(tf_env_FabModel.time_step_spec().reward)
print('Action Spec:')
print(train_tf_env.action_spec())

#################
# DQN Agent Setup#
#################

# Hyperparameters
num_iterations = 100000  # @param {type:"integer"}
Example 28
def try_hparams(hparams):

    # Initialize train and eval environments
    environment = GameEnv()
    utils.validate_py_environment(environment, episodes=5)
    train_py_env = GameEnv()
    eval_py_env = GameEnv()
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Initialize the QNetwork
    fc_layer_params = (hparams['layer1_count'], hparams['layer2_count'])  # hparams['layer3_count']
    q_net = q_network.QNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params)

    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    #optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate, rho=0.9, momentum=0.95, epsilon=1e-07)

    train_step_counter = tf.Variable(0)

    # Initialize the DQN Agent
    agent = dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        n_step_update=td_sample_size,
        target_update_period=nn_update_frequency,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter)

    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_max_length)

    # Collect some data using a totally random policy
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())
    collect_data(train_env, random_policy, replay_buffer, steps=initial_collect_steps)

    # Convert replay buffer to dataset
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3, 
        sample_batch_size=batch_size, 
        num_steps=td_sample_size+1).prefetch(3)

    iterator = iter(dataset)

    agent.train = common.function(agent.train)
    # Reset the train step
    agent.train_step_counter.assign(0)

    # Evaluate the agent's policy, the random policy, and the optimal policy once before training
    optimal_return = 0 #solve_perfectly(eval_env,num_eval_episodes)
    random_return = compute_avg_return(eval_env,random_policy,num_eval_episodes)
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    
    returns = [avg_return] # returns will contain all average returns of the agent during training
    global losses
    epsilon = start_epsilon
    epsilon_step = (start_epsilon-end_epsilon) / epsilon_anneal_steps

    for _ in range(num_iterations):
      # Reduce epsilon
      epsilon = max(epsilon - epsilon_step, end_epsilon)
        
      # Collect a few steps using the epsilon greedy policy and save to the replay buffer.
      for _ in range(collect_steps_per_iteration):
        collect_step(train_env, EpsilonGreedyPolicy(agent.policy, epsilon), replay_buffer)

      # Sample a batch of data from the buffer and update the agent's network.
      experience, unused_info = next(iterator)
      train_loss = agent.train(experience).loss

      step = agent.train_step_counter.numpy()

      if step % log_interval == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))
        losses.append(train_loss)
        
      if step % eval_interval == 0:
        avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes, True)
        print('step = {0}: Average Return = {1} Optimal policy = {2} Random policy = {3}'.format(step, avg_return, optimal_return,random_return))
        returns.append(avg_return)
    
    #plt.plot(losses)
    #plt.show()
    #plt.cla()
    return returns
Example 29
from ai.PastureEngine import PastureEngine
from ai.PastureEnvironment import PastureEnvironment
from pasture.animal.sheep.Sheep import Sheep
from pasture.animal.shepherd.Shepherd import Shepherd

sheep_list = [Sheep(2, 2, 2), Sheep(4, 4, 2)]
shepherd_list = [Shepherd(6, 6)]

pasture_engine = PastureEngine(size=8,
                               starting_shepherds_list=shepherd_list,
                               starting_sheep_list=sheep_list,
                               target=(1, 1))

pasture_environment = PastureEnvironment(pasture_engine)
utils.validate_py_environment(pasture_environment, episodes=5)

pasture_env_wrapped = wrappers.TimeLimit(pasture_environment, duration=15)
print(pasture_env_wrapped)

train_tf_env = tf_py_environment.TFPyEnvironment(pasture_env_wrapped)
print(train_tf_env)
eval_tf_env = tf_py_environment.TFPyEnvironment(pasture_env_wrapped)
print(eval_tf_env)

fc_layer_params = [32, 64, 128]

q_net = q_network.QNetwork(
    train_tf_env.observation_spec(),  # input
    train_tf_env.action_spec(),  # output
    fc_layer_params=fc_layer_params)  # fully connected layers
Example 30
    def __init__(
        self,
        landscape: flexs.Landscape,
        rounds: int,
        sequences_batch_size: int,
        model_queries_per_batch: int,
        starting_sequence: str,
        alphabet: str,
        log_file: Optional[str] = None,
        model: Optional[flexs.Model] = None,
        num_experiment_rounds: int = 10,
        num_model_rounds: int = 1,
    ):
        """
        Args:
            num_experiment_rounds: Number of experiment-based rounds to run. This
                defaults to 10, the same as the number of sequence-proposal rounds.
            num_model_rounds: Number of model-based rounds to run.

        """
        tf.config.run_functions_eagerly(False)

        name = f"DynaPPO_Agent_{num_experiment_rounds}_{num_model_rounds}"

        if model is None:
            model = DynaPPOEnsemble(
                len(starting_sequence),
                alphabet,
            )
            model.train(
                s_utils.generate_random_sequences(len(starting_sequence), 10,
                                                  alphabet),
                [0] * 10,
            )

        super().__init__(
            model,
            name,
            rounds,
            sequences_batch_size,
            model_queries_per_batch,
            starting_sequence,
            log_file,
        )

        self.alphabet = alphabet
        self.num_experiment_rounds = num_experiment_rounds
        self.num_model_rounds = num_model_rounds

        env = DynaPPOEnvMut(
            alphabet=self.alphabet,
            starting_seq=starting_sequence,
            model=model,
            landscape=landscape,
            max_num_steps=model_queries_per_batch,
        )
        validate_py_environment(env, episodes=1)
        self.tf_env = tf_py_environment.TFPyEnvironment(env)

        encoder_layer = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
        actor_net = actor_distribution_network.ActorDistributionNetwork(
            self.tf_env.observation_spec(),
            self.tf_env.action_spec(),
            preprocessing_combiner=encoder_layer,
            fc_layer_params=[128],
        )
        value_net = value_network.ValueNetwork(
            self.tf_env.observation_spec(),
            preprocessing_combiner=encoder_layer,
            fc_layer_params=[128],
        )

        self.agent = ppo_agent.PPOAgent(
            self.tf_env.time_step_spec(),
            self.tf_env.action_spec(),
            optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
            actor_net=actor_net,
            value_net=value_net,
            num_epochs=10,
            summarize_grads_and_vars=False,
        )
        self.agent.initialize()