def test_ClassifierEnv():
    """Tests imbDRL.environments.classifierenv.ClassifierEnv."""
    X = np.arange(10, dtype=np.float32)
    y = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1], dtype=np.int32)
    env = ClassifierEnv(X, y, 0.2)
    validate_py_environment(env, episodes=5)
def __init__(self, game):
    # set game
    self._game = game
    # set action range
    self.action_count = param.CAM_COUNT * param.MOVE_OPTIONS
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0,
        maximum=self.action_count - 1, name='action')
    # set observation range
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(self._game._n_states,), dtype=np.float32,
        minimum=param.OBS_SPEC_MIN, maximum=param.OBS_SPEC_MAX,
        name='observation')
    # create action dictionary
    self.create_action_dict()
    # make sure the environment is okay
    utils.validate_py_environment(self, episodes=5)
def test_motion_primitives_concat_state():
    params = ParameterServer(
        filename="modules/runtime/tests/data/highway_merging.json")
    scenario_generation = UniformVehicleDistribution(num_scenarios=3,
                                                     random_seed=0,
                                                     params=params)
    state_observer = StateConcatenation(params=params)
    action_wrapper = MotionPrimitives(params=params)
    evaluator = GoalReached(params=params)
    viewer = MPViewer(params=params,
                      x_range=[-30, 30],
                      y_range=[-20, 40],
                      follow_agent_id=True)  # use_world_bounds=True
    runtimerl = RuntimeRL(action_wrapper=action_wrapper,
                          nn_observer=state_observer,
                          evaluator=evaluator,
                          step_time=0.05,
                          viewer=viewer,
                          scenario_generator=scenario_generation)
    tfa_env = TFAWrapper(runtimerl)
    _ = tfa_env.reset()
    utils.validate_py_environment(tfa_env, episodes=5)
    _ = tf_py_environment.TFPyEnvironment(tfa_env)
def main():
    env = slime_env()
    # o0 = env.reset()
    # o1 = env.step(0)
    # Check that the class works OK
    utils.validate_py_environment(env, episodes=5)
def test_tfa_runtime():
    params = ParameterServer(
        filename="tests/data/deterministic_scenario_test.json")
    scenario_generation = DeterministicScenarioGeneration(num_scenarios=3,
                                                          random_seed=0,
                                                          params=params)
    state_observer = ClosestAgentsObserver(params=params)
    action_wrapper = DynamicModel(params=params)
    evaluator = GoalReached(params=params)
    viewer = MPViewer(params=params,
                      x_range=[-30, 30],
                      y_range=[-20, 40],
                      follow_agent_id=True)  # use_world_bounds=True
    runtimerl = RuntimeRL(action_wrapper=action_wrapper,
                          observer=state_observer,
                          evaluator=evaluator,
                          step_time=0.05,
                          viewer=viewer,
                          scenario_generator=scenario_generation)
    tfa_env = TFAWrapper(runtimerl)
    _ = tfa_env.reset()
    utils.validate_py_environment(tfa_env, episodes=5)
    _ = tf_py_environment.TFPyEnvironment(tfa_env)
def validate_environment():
    validate_env = SpinQubitEnv(0.1, sigmax(), basis(2, 0), 1, .1)
    utils.validate_py_environment(validate_env, episodes=5)
def test_jumping():
    """Test jumping environment."""
    env = JumpingEnvironment(**params)
    validate_py_environment(env, episodes=10)
    policy = RandomPyPolicy(time_step_spec=None, action_spec=env.action_spec())
    filepath = os.path.join(configs.TEMP_DIR, 'test_jumping.mp4')
    episode_as_video(env, policy, filepath=filepath)
    assert glob.glob(filepath.split('.')[0] + '*')
def test_multi_monster():
    """Test multi-monster environment."""
    env = MultiMonsterEnvironment(n_monsters=3, **params)
    validate_py_environment(env, episodes=10)
    policy = RandomPyPolicy(time_step_spec=None, action_spec=env.action_spec())
    filepath = os.path.join(configs.TEMP_DIR, 'test_multi.mp4')
    episode_as_video(env, policy, filepath=filepath)
    assert glob.glob(filepath.split('.')[0] + '*')
def testValidateOk(self):
    env = get_mock_env(self._action_spec, self._observation_spec, None)
    rng = np.random.RandomState()
    sample_fn = lambda: array_spec.sample_spec_nest(env.observation_spec(), rng)

    def step(unused_time_step):
        if rng.rand() < 0.10:
            return ts.termination(sample_fn(), 0.0)  # pytype: disable=wrong-arg-types
        else:
            return ts.transition(sample_fn(), 1.0)  # pytype: disable=wrong-arg-types

    env.step = step
    env.reset = lambda: ts.restart(sample_fn())
    utils.validate_py_environment(env, episodes=2)
def testValidateBoundedSpecDistinctBounds(self):
    observation_spec = array_spec.BoundedArraySpec((3,), np.int32,
                                                   [-10, -5, -2], [10, 5, 2])
    env = get_mock_env(self._action_spec, observation_spec, None)
    rng = np.random.RandomState()
    sample_fn = lambda: array_spec.sample_spec_nest(env.observation_spec(), rng)

    def step(unused_time_step):
        if rng.rand() < 0.10:
            return ts.termination(sample_fn(), 0.0)  # pytype: disable=wrong-arg-types
        else:
            return ts.transition(sample_fn(), 1.0)  # pytype: disable=wrong-arg-types

    env.step = step
    env.reset = lambda: ts.restart(sample_fn())
    utils.validate_py_environment(env, episodes=1)
def test_environment(py_env, observe_action, terminate_action):
    """Helper function that exercises a meta-MDP environment.

    If this runs without crashing, the environment is unlikely to contain
    egregious bugs: at minimum, its inputs and outputs match the required
    action and observation specs. The transition logic of the environment may,
    of course, still be wrong.
    """
    print('ObservationSpec:', py_env.observation_spec())
    print('ActionSpec:', py_env.action_spec())
    time_step = py_env.reset()
    cumulative_reward = time_step.reward
    print(cumulative_reward)
    # Compute the reward over a single episode in which the agent takes the
    # `observe` action 10 times and then terminates.
    for a in [observe_action] * 10 + [terminate_action]:
        time_step = py_env.step(a)
        cumulative_reward += time_step.reward
        print(cumulative_reward)
    print('Final Reward = ', cumulative_reward)
    utils.validate_py_environment(py_env)
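# Minimal usage sketch for the helper above. MetaMDPEnv, OBSERVE and TERMINATE
# are hypothetical placeholders for whatever environment class and action
# encoding the project actually defines.
OBSERVE, TERMINATE = 0, 1
py_env = MetaMDPEnv()  # hypothetical constructor
test_environment(py_env, observe_action=OBSERVE, terminate_action=TERMINATE)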
    elif action == 0:
        new_card = np.random.randint(1, 11)
        self._state += new_card
    else:
        raise ValueError('`action` should be 0 or 1.')

    if self._episode_ended or self._state >= 21:
        reward = self._state - 21 if self._state <= 21 else -21
        return ts.termination(np.array([self._state], dtype=np.int32), reward)
    else:
        return ts.transition(
            np.array([self._state], dtype=np.int32), reward=0.0, discount=1.0)


environment = CardGameEnv()
utils.validate_py_environment(environment, episodes=5)

get_new_card_action = 0
end_round_action = 1

environment = CardGameEnv()
time_step = environment.reset()
print(time_step)
cumulative_reward = time_step.reward

for _ in range(3):
    time_step = environment.step(get_new_card_action)
    print(time_step)
    cumulative_reward += time_step.reward

time_step = environment.step(end_round_action)
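# The fragment above shows only the tail of CardGameEnv._step. For context, a
# minimal self-contained PyEnvironment that passes validate_py_environment is
# sketched below. CoinFlipEnv and its reward scheme are hypothetical and only
# illustrate the spec/_reset/_step structure; they are not part of the code above.
import numpy as np

from tf_agents.environments import py_environment, utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class CoinFlipEnv(py_environment.PyEnvironment):
    """Toy environment: action 0 flips a coin, action 1 ends the episode."""

    def __init__(self):
        super().__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=0, maximum=1, name='observation')
        self._last_flip = 0
        self._episode_ended = False

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        self._last_flip = 0
        self._episode_ended = False
        return ts.restart(np.array([self._last_flip], dtype=np.int32))

    def _step(self, action):
        if self._episode_ended:
            # The previous step ended the episode; start a fresh one.
            return self.reset()
        if action == 1:
            self._episode_ended = True
        elif action == 0:
            self._last_flip = int(np.random.randint(0, 2))
        else:
            raise ValueError('`action` should be 0 or 1.')
        obs = np.array([self._last_flip], dtype=np.int32)
        if self._episode_ended:
            return ts.termination(obs, reward=0.0)
        return ts.transition(obs, reward=float(self._last_flip), discount=1.0)


utils.validate_py_environment(CoinFlipEnv(), episodes=5)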
def testValidateWithBatchSize(self):
    batch_size = 2
    obs_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
    env = random_py_environment.RandomPyEnvironment(
        obs_spec, batch_size=batch_size)
    utils.validate_py_environment(env)
def test_PointMass2DEnv():
    env = PointMass2DEnv()
    utils.validate_py_environment(env, episodes=2)
def testEnvRegistered(self):
    env = suite_dm_control.load('ball_in_cup', 'catch')
    self.assertIsInstance(env, py_environment.Base)
    utils.validate_py_environment(env)
def __init__(
    self,
    alphabet: str,
    starting_seq: str,
    model: flexs.Model,
    max_num_steps: int,
):  # pylint: disable=W0231
    """Initialize PPO agent environment.

    Based on this tutorial:
    https://www.mikulskibartosz.name/how-to-create-an-environment-for-a-tensorflow-agent

    Args:
        alphabet: Usually UCGA.
        starting_seq: When initializing the environment, the sequence which is
            initially mutated.
        model: Landscape or model which evaluates each sequence.
        max_num_steps: Maximum number of steps before the episode is forced to
            terminate. Usually the `model_queries_per_batch`.
    """
    self.alphabet = alphabet

    # model / measurements
    self.model = model
    self.previous_fitness = -float("inf")

    # sequence
    self.seq = starting_seq
    self._state = {
        "sequence": string_to_one_hot(self.seq, self.alphabet).astype(np.float32),
        "fitness": self.model.get_fitness([starting_seq]).astype(np.float32),
    }
    self.episode_seqs = set()  # the sequences seen in the current episode
    self.measured_sequences = {}

    # tf_agents environment
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(1,),
        dtype=np.integer,
        minimum=0,
        maximum=len(self.seq) * len(self.alphabet) - 1,
        name="action",
    )
    self._observation_spec = {
        "sequence": array_spec.BoundedArraySpec(
            shape=(len(self.seq), len(self.alphabet)),
            dtype=np.float32,
            minimum=0,
            maximum=1,
        ),
        "fitness": array_spec.ArraySpec(shape=(1,), dtype=np.float32),
    }

    self.num_steps = 0
    self.max_num_steps = max_num_steps

    validate_py_environment(self, episodes=1)
def test_validate_specs(self):
    env = test_envs.CountingEnv(steps_per_episode=15)
    env_utils.validate_py_environment(env, episodes=10)
from tf_agents.environments import utils
from tf_agents.environments import wrappers
import tensorflow as tf
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.specs import array_spec
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

environment = DualGoalMaze()
stats_env = wrappers.RunStats(environment)
utils.validate_py_environment(stats_env, episodes=5)

time_step = stats_env.reset()
rewards = []
steps = []
num_episodes = 5

for _ in range(num_episodes):
    episode_reward = 0
    episode_steps = 0
    while not time_step.is_last():
        action = np.random.randint(0, 4)
        time_step = stats_env.step(action)
        episode_steps += 1
        episode_reward += time_step.reward
    rewards.append(episode_reward)
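# Note: as written, the loop above only accumulates reward for the first
# episode, because time_step stays terminal once that episode ends and the
# inner while-loop is then skipped (the snippet may simply be truncated before
# a reset). A common pattern is to reset at the top of each episode; a sketch
# using the same stats_env:
for _ in range(num_episodes):
    time_step = stats_env.reset()
    episode_reward = 0
    episode_steps = 0
    while not time_step.is_last():
        action = np.random.randint(0, 4)
        time_step = stats_env.step(action)
        episode_steps += 1
        episode_reward += time_step.reward
    rewards.append(episode_reward)
    steps.append(episode_steps)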
def testValidateNotATimeStep(self):
    env = get_mock_env(self._action_spec, self._observation_spec, None)
    with self.assertRaises(ValueError):
        utils.validate_py_environment(env, episodes=1)
        # ... start a new episode.
        return self.reset()

    # check if the move is valid and reward accordingly
    if 0 <= action <= 8:
        if self.isSpotEmpty(action):
            self._grid[action] = self.mark
            reward = self.calcReward()
            print("agent goes in spot {} for reward {}".format(action, reward))
        else:
            # punish for picking a spot that has already been picked
            reward = -10
            self._episode_ended = True
    else:
        raise ValueError('`action` should be 0 - 8.')

    if not self.isGridFull():
        self.takeOppTurn()
    else:
        self._episode_ended = True

    if self._episode_ended:
        return ts.termination(self._grid, reward)
    else:
        return ts.transition(self._grid, reward=2, discount=1.0)


print("poop")
env = TicTacToeEnv()
print(env._grid)
utils.validate_py_environment(env, episodes=1)
def testValidateOutOfBounds(self):
    env = get_mock_env(self._action_spec, self._observation_spec,
                       ts.restart(np.array([-11], dtype=np.int32)))
    with self.assertRaisesRegexp(ValueError, "does not match expected"):
        utils.validate_py_environment(env, episodes=1)
def test_environment():
    """Test environment using built-in validate tool."""
    environment = LanceEnvironment()
    utils.validate_py_environment(environment, episodes=5)
    print('Test successful.')
import numpy as np
import rospy
import tensorflow as tf

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

from arm_pyenv import ArmEnv

# source devel/setup.bash
# roslaunch arm_bringup sim_bringup.launch world:=empty

rospy.init_node("test")
tf.compat.v1.enable_v2_behavior()

environment = ArmEnv()
timed_env = wrappers.TimeLimit(environment, 900)
utils.validate_py_environment(timed_env, episodes=5)

print('action_spec:', environment.action_spec())
print('time_step_spec:', environment.time_step_spec())
print('time_step_spec.observation:', environment.time_step_spec().observation)
print('time_step_spec.step_type:', environment.time_step_spec().step_type)
print('time_step_spec.discount:', environment.time_step_spec().discount)
print('time_step_spec.reward:', environment.time_step_spec().reward)
def check_valid(self):
    utils.validate_py_environment(self, episodes=5)
    print('OK')
def testValidateWrongDTypeAndShape(self):
    env = get_mock_env(self._action_spec, self._observation_spec,
                       ts.restart(np.array([0, 1], dtype=np.int64)))
    with self.assertRaisesRegexp(ValueError, "does not match expected"):
        utils.validate_py_environment(env, episodes=1)
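# The two mock-based tests above depend on get_mock_env from their test suite.
# As a self-contained illustration of the same failure mode, the hypothetical
# BrokenEnv below declares int32 observations but emits int64 ones, so
# utils.validate_py_environment raises a ValueError ("does not match expected").
# This is a sketch, not part of the original suite.
import numpy as np

from tf_agents.environments import py_environment, utils
from tf_agents.specs import array_spec
from tf_agents.trajectories import time_step as ts


class BrokenEnv(py_environment.PyEnvironment):
    """Declares int32 observations but deliberately emits int64 ones."""

    def __init__(self):
        super().__init__()
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1,), dtype=np.int32, minimum=-10, maximum=10, name='observation')

    def action_spec(self):
        return self._action_spec

    def observation_spec(self):
        return self._observation_spec

    def _reset(self):
        return ts.restart(np.array([0], dtype=np.int64))  # wrong dtype on purpose

    def _step(self, action):
        return ts.termination(np.array([0], dtype=np.int64), reward=0.0)


try:
    utils.validate_py_environment(BrokenEnv(), episodes=1)
except ValueError as err:
    print('Validation failed as expected:', err)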
def test_Quadcopter3DEnv():
    env = Quadcopter3DEnv()
    utils.validate_py_environment(env, episodes=2)
from tf_agents.trajectories import trajectory
from tf_agents.utils import common

# from evn_chamberModel import EnvChamberModel
from env_ChamberModel_standalone import EnvChamberModel

tf.compat.v1.enable_v2_behavior()

##########################
# RL environment setup   #
##########################
# FabModel = EnvChamberModel()
FabModel = EnvChamberModel(wafer=10, discount=0.99)
FabModel_1 = EnvChamberModel(wafer=10, discount=0.99)
utils.validate_py_environment(FabModel, episodes=5)

train_tf_env = tf_py_environment.TFPyEnvironment(FabModel)
eval_tf_env = tf_py_environment.TFPyEnvironment(FabModel_1)

print('Observation Spec:')
print(train_tf_env.observation_spec())
# print('Reward Spec:')
# print(tf_env_FabModel.time_step_spec().reward)
print('Action Spec:')
print(train_tf_env.action_spec())

##########################
# DQN agent setup        #
##########################
# Hyperparameters
num_iterations = 100000  # @param {type:"integer"}
def try_hparams(hparams):
    # Initialize train and eval environments
    environment = GameEnv()
    utils.validate_py_environment(environment, episodes=5)
    train_py_env = GameEnv()
    eval_py_env = GameEnv()
    train_env = tf_py_environment.TFPyEnvironment(train_py_env)
    eval_env = tf_py_environment.TFPyEnvironment(eval_py_env)

    # Initialize the QNetwork
    fc_layer_params = (hparams['layer1_count'], hparams['layer2_count'],)  # hparams['layer3_count'],)
    q_net = q_network.QNetwork(
        train_env.observation_spec(),
        train_env.action_spec(),
        fc_layer_params=fc_layer_params)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
    # optimizer = tf.keras.optimizers.RMSprop(learning_rate=learning_rate, rho=0.9, momentum=0.95, epsilon=1e-07)
    train_step_counter = tf.Variable(0)

    # Initialize the DQN agent
    agent = dqn_agent.DqnAgent(
        train_env.time_step_spec(),
        train_env.action_spec(),
        q_network=q_net,
        optimizer=optimizer,
        n_step_update=td_sample_size,
        target_update_period=nn_update_frequency,
        td_errors_loss_fn=common.element_wise_squared_loss,
        train_step_counter=train_step_counter)
    agent.initialize()

    replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
        data_spec=agent.collect_data_spec,
        batch_size=train_env.batch_size,
        max_length=replay_buffer_max_length)

    # Collect some data using a totally random policy
    random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(),
                                                    train_env.action_spec())
    collect_data(train_env, random_policy, replay_buffer, steps=initial_collect_steps)

    # Convert replay buffer to dataset
    dataset = replay_buffer.as_dataset(
        num_parallel_calls=3,
        sample_batch_size=batch_size,
        num_steps=td_sample_size + 1).prefetch(3)
    iterator = iter(dataset)

    agent.train = common.function(agent.train)

    # Reset the train step
    agent.train_step_counter.assign(0)

    # Evaluate the agent's policy, the random policy and the optimal policy once before training
    optimal_return = 0  # solve_perfectly(eval_env, num_eval_episodes)
    random_return = compute_avg_return(eval_env, random_policy, num_eval_episodes)
    avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes)
    # returns will contain all average returns of the agent during training
    returns = [avg_return]

    global losses
    epsilon = start_epsilon
    epsilon_step = (start_epsilon - end_epsilon) / epsilon_anneal_steps

    for _ in range(num_iterations):
        # Reduce epsilon
        epsilon = max(epsilon - epsilon_step, end_epsilon)

        # Collect a few steps using the epsilon-greedy policy and save to the replay buffer.
        for _ in range(collect_steps_per_iteration):
            collect_step(train_env, EpsilonGreedyPolicy(agent.policy, epsilon), replay_buffer)

        # Sample a batch of data from the buffer and update the agent's network.
        experience, unused_info = next(iterator)
        train_loss = agent.train(experience).loss

        step = agent.train_step_counter.numpy()

        if step % log_interval == 0:
            print('step = {0}: loss = {1}'.format(step, train_loss))
            losses.append(train_loss)

        if step % eval_interval == 0:
            avg_return = compute_avg_return(eval_env, agent.policy, num_eval_episodes, True)
            print('step = {0}: Average Return = {1} Optimal policy = {2} Random policy = {3}'.format(
                step, avg_return, optimal_return, random_return))
            returns.append(avg_return)

    # plt.plot(losses)
    # plt.show()
    # plt.cla()
    return returns
from ai.PastureEngine import PastureEngine
from ai.PastureEnvironment import PastureEnvironment
from pasture.animal.sheep.Sheep import Sheep
from pasture.animal.shepherd.Shepherd import Shepherd

sheep_list = [Sheep(2, 2, 2), Sheep(4, 4, 2)]
shepherd_list = [Shepherd(6, 6)]
pasture_engine = PastureEngine(size=8,
                               starting_shepherds_list=shepherd_list,
                               starting_sheep_list=sheep_list,
                               target=(1, 1))
pasture_environment = PastureEnvironment(pasture_engine)
utils.validate_py_environment(pasture_environment, episodes=5)

pasture_env_wrapped = wrappers.TimeLimit(pasture_environment, duration=15)
print(pasture_env_wrapped)
train_tf_env = tf_py_environment.TFPyEnvironment(pasture_env_wrapped)
print(train_tf_env)
eval_tf_env = tf_py_environment.TFPyEnvironment(pasture_env_wrapped)
print(eval_tf_env)

fc_layer_params = [32, 64, 128]
q_net = q_network.QNetwork(
    train_tf_env.observation_spec(),   # input
    train_tf_env.action_spec(),        # output
    fc_layer_params=fc_layer_params)   # layers
def __init__(
    self,
    landscape: flexs.Landscape,
    rounds: int,
    sequences_batch_size: int,
    model_queries_per_batch: int,
    starting_sequence: str,
    alphabet: str,
    log_file: Optional[str] = None,
    model: Optional[flexs.Model] = None,
    num_experiment_rounds: int = 10,
    num_model_rounds: int = 1,
):
    """
    Args:
        num_experiment_rounds: Number of experiment-based rounds to run. This
            is set to 10 by default, the same as the number of sequence
            proposal rounds run.
        num_model_rounds: Number of model-based rounds to run.
    """
    tf.config.run_functions_eagerly(False)

    name = f"DynaPPO_Agent_{num_experiment_rounds}_{num_model_rounds}"

    if model is None:
        model = DynaPPOEnsemble(
            len(starting_sequence),
            alphabet,
        )
        model.train(
            s_utils.generate_random_sequences(len(starting_sequence), 10, alphabet),
            [0] * 10,
        )

    super().__init__(
        model,
        name,
        rounds,
        sequences_batch_size,
        model_queries_per_batch,
        starting_sequence,
        log_file,
    )

    self.alphabet = alphabet
    self.num_experiment_rounds = num_experiment_rounds
    self.num_model_rounds = num_model_rounds

    env = DynaPPOEnvMut(
        alphabet=self.alphabet,
        starting_seq=starting_sequence,
        model=model,
        landscape=landscape,
        max_num_steps=model_queries_per_batch,
    )
    validate_py_environment(env, episodes=1)
    self.tf_env = tf_py_environment.TFPyEnvironment(env)

    encoder_layer = tf.keras.layers.Lambda(lambda obs: obs["sequence"])
    actor_net = actor_distribution_network.ActorDistributionNetwork(
        self.tf_env.observation_spec(),
        self.tf_env.action_spec(),
        preprocessing_combiner=encoder_layer,
        fc_layer_params=[128],
    )
    value_net = value_network.ValueNetwork(
        self.tf_env.observation_spec(),
        preprocessing_combiner=encoder_layer,
        fc_layer_params=[128],
    )

    self.agent = ppo_agent.PPOAgent(
        self.tf_env.time_step_spec(),
        self.tf_env.action_spec(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        actor_net=actor_net,
        value_net=value_net,
        num_epochs=10,
        summarize_grads_and_vars=False,
    )
    self.agent.initialize()