def gym_continuous_pool_setup():
    """
    :return: (exp_spec, playground, poolmanager, timestepsampleOne, timestepsampleTwo,
              samplebatch, trajectoriespool, env, initial_observation)
    :rtype: (ExperimentSpec, GymPlayground, PoolManager, TimestepSample, TimestepSample,
             SampleBatch, TrajectoriesPool, Any, Any)
    """
    exp_spec = ExperimentSpec(batch_size_in_ts=20, max_epoch=2,
                              theta_nn_hidden_layer_topology=(2, 2),
                              environment_name='LunarLanderContinuous-v2')
    exp_spec.set_experiment_spec({'pool_capacity': 100})
    playground = GymPlayground(exp_spec.prefered_environment)

    poolmanager = PoolManager(exp_spec, playground)
    timestepsampleOne = TimestepSample(container_id=1, playground=playground)
    timestepsampleTwo = TimestepSample(container_id=2, playground=playground)
    samplebatch = SampleBatch(batch_size=exp_spec.batch_size_in_ts, playground=playground)
    trajectoriespool = TrajectoriesPool(capacity=exp_spec['pool_capacity'],
                                        batch_size=exp_spec.batch_size_in_ts,
                                        playground=playground)

    env = playground.env
    initial_observation = env.reset()

    yield (exp_spec, playground, poolmanager, timestepsampleOne, timestepsampleTwo,
           samplebatch, trajectoriespool, env, initial_observation)
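
# Hedged usage sketch (not from the repository): how a test might consume the
# fixture above. Assumes `gym_continuous_pool_setup` is registered as a pytest
# fixture; the unpacking order mirrors the yielded tuple exactly.
def test_gym_continuous_pool_setup_sketch(gym_continuous_pool_setup):
    (exp_spec, playground, poolmanager, timestepsample_one, timestepsample_two,
     samplebatch, trajectoriespool, env, initial_observation) = gym_continuous_pool_setup
    # `pool_capacity` was set through set_experiment_spec() in the fixture
    assert exp_spec['pool_capacity'] == 100
    assert initial_observation is not None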
def __init__(self, experiment_spec: ExperimentSpec, playground: GymPlayground, discounted: bool = True):
    self._exp_spec = experiment_spec
    self._playground_spec = playground.get_environment_spec()
    self.discounted = discounted

    # (nice to have) todo:refactor --> to a speed-optimized version:
    #   - init each list to the max env trajectory length, filled with 0
    #   - write collected observations at a running index instead of appending
    #   Check benchmark and proof of concept:
    #   - exploration_and_benchmarking/data_structure/data_structure_prof_of_concept_benchmark.py
    #   - exploration_and_benchmarking/data_structure/list_vs_array_data_structure_benchmark.py
    self.observations = []
    self.actions = []
    self.rewards = []
    self.q_values = None
    self.theReturn = None
    self.lenght = None

    # Internal state
    # (nice to have) todo:refactor --> using the namedtuple InternalState:
    self._step_count_since_begining_of_training = 0
    self._trj_collected = 0
    self._q_values_computed = False
    self._trj_pending_pop = False
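
# Hedged sketch of the speed optimization the todo above points at (an
# assumption, not the repository implementation): preallocate fixed-size
# buffers once and write at a running index instead of appending to lists.
import numpy as np

class _PreallocatedTrajectoryBufferSketch:
    def __init__(self, max_trj_length: int, obs_dim: int):
        # allocate once, filled with 0, as the todo suggests
        self.observations = np.zeros((max_trj_length, obs_dim))
        self.rewards = np.zeros(max_trj_length)
        self._write_idx = 0

    def collect(self, obs, rew) -> None:
        # O(1) write at the running index, no list resize
        self.observations[self._write_idx] = obs
        self.rewards[self._write_idx] = rew
        self._write_idx += 1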
def __init__(self, exp_spec: ExperimentSpec, agent_root_dir: str = None):
    """
    Build the agent computation graph

    :param exp_spec: Experiment specification covering NN and algorithm training
                     hyperparameters plus some environment detail
    :type exp_spec: ExperimentSpec
    :param agent_root_dir: The agent root directory
    :type agent_root_dir: str
    """
    if agent_root_dir is not None:
        self.agent_root_dir = agent_root_dir
    else:
        self._use_hardcoded_agent_root_directory()

    self.exp_spec = exp_spec

    # Pass the harder-environment coefficient through only when the experiment
    # spec defines it
    try:
        hec = exp_spec['harderEnvCoeficient']
        self.playground = GymPlayground(environment_name=exp_spec.prefered_environment,
                                        harderEnvCoeficient=hec)
    except KeyError:
        self.playground = GymPlayground(environment_name=exp_spec.prefered_environment)

    """ ---- Init computation graph ---- """
    # required placeholders for the Agent.play() method
    self.obs_t_ph = None
    self.policy_pi = None

    self._build_computation_graph()

    not_implemented_msg = "must be set by _build_computation_graph()"
    assert self.obs_t_ph is not None, "self.obs_t_ph {}".format(not_implemented_msg)
    assert self.policy_pi is not None, "self.policy_pi {}".format(not_implemented_msg)

    """ ---- Setup parameters saving ---- """
    self.saver = tf_cv1.train.Saver()
    self.writer = None
    self.this_run_dir = None
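
# Hedged sketch (an assumption, not repository code): the smallest subclass
# satisfying the contract asserted above -- `_build_computation_graph()` must
# set `self.obs_t_ph` and `self.policy_pi` before the constructor's asserts
# run. The `Agent` base name, the observation width 8, and the dense head are
# hypothetical choices for illustration only.
import tensorflow as tf
tf_cv1 = tf.compat.v1

class _MinimalAgentSketch(Agent):
    def _build_computation_graph(self):
        self.obs_t_ph = tf_cv1.placeholder(tf_cv1.float32, shape=(None, 8),
                                           name='obs_t_ph')
        self.policy_pi = tf_cv1.layers.dense(self.obs_t_ph, units=2,
                                             name='policy_pi')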
def gym_discrete_setup():
    exp_spec = ExperimentSpec(batch_size_in_ts=1000, max_epoch=2,
                              theta_nn_hidden_layer_topology=(2, 2))
    playground = GymPlayground('LunarLander-v2')

    trajectory_collector = TrajectoryCollector(exp_spec, playground)
    uni_batch_collector = UniformBatchCollector(capacity=exp_spec.batch_size_in_ts)

    env = playground.env
    initial_observation = env.reset()

    yield exp_spec, playground, trajectory_collector, uni_batch_collector, env, initial_observation
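
# Hedged sketch of the collect loop this fixture typically drives (the method
# names `is_full`, `collect`, and `pop_trajectory_and_reset` are hypothetical
# illustrations, not the repository API): step the env, feed the trajectory
# collector, and hand finished trajectories to the uniform batch collector.
def _collect_loop_sketch(env, trajectory_collector, uni_batch_collector):
    obs_t = env.reset()
    while not uni_batch_collector.is_full():                       # hypothetical
        act_t = env.action_space.sample()
        obs_t_prime, rew_t, done_t, _ = env.step(act_t)
        trajectory_collector.collect(obs_t, act_t, rew_t)          # hypothetical
        obs_t = obs_t_prime
        if done_t:
            trj = trajectory_collector.pop_trajectory_and_reset()  # hypothetical
            uni_batch_collector.collect(trj)                       # hypothetical
            obs_t = env.reset()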
# coding=utf-8
from blocAndTools.buildingbloc import GymPlayground, gym_environment_reward_assesment
from blocAndTools.rl_vocabulary import rl_name

vocab = rl_name()

playgroundLunarLanderContinuous = GymPlayground('LunarLanderContinuous-v2')
playgroundMountainCarContinuous = GymPlayground('MountainCarContinuous-v0')
playgroundBipedalWalkerContinuous = GymPlayground('BipedalWalker-v2')
playgroundBipedalWalkerHardcoreContinuous = GymPlayground('BipedalWalkerHardcore-v2')
playgroundPendulum = GymPlayground('Pendulum-v0')
playgroundCartPole = GymPlayground('CartPole-v1')

"""
Average reward over 5000 samples in environment:
    CartPole:                         1.0
    Pendulum:                        -6.194059069237778
    MountainCarContinuous:           -0.03379479089447222
    LunarLanderContinuous:           -1.8451261821262512
    BipedalWalkerContinuous:         -0.09389189287451959
    BipedalWalkerHardcoreContinuous: -0.21095661603674906

Average reward over 10000 samples in environment:
    CartPole:                         1.0
    Pendulum:                        -6.392986420889935
    MountainCarContinuous:           -0.033452527858401936
    LunarLanderContinuous:           -2.1619796013042705
    BipedalWalkerContinuous:         -0.09298591752622176
    BipedalWalkerHardcoreContinuous: -0.056385564141322744

Average reward over 30000 samples in environment:
"""
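
# Hedged sketch (an assumption about how the averages in the docstring above
# were produced; the call form `gym_environment_reward_assesment(playground,
# sample_count)` is hypothetical since the real signature is not shown here):
if __name__ == '__main__':
    for name, pg in [('CartPole', playgroundCartPole),
                     ('Pendulum', playgroundPendulum),
                     ('LunarLanderContinuous', playgroundLunarLanderContinuous)]:
        for nb_sample in (5000, 10000, 30000):
            avg = gym_environment_reward_assesment(pg, nb_sample)  # hypothetical call form
            print("{} over {} samples: {}".format(name, nb_sample, avg))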
def test_PoolManager_PRODUCE_MINIBATCH(gym_continuous_pool_setup):
    # region ::Type hint bloc ...
    exp_spec: ExperimentSpec
    playground: GymPlayground
    poolmanager: PoolManager
    timestepsampleOne: TimestepSample
    timestepsampleTwo: TimestepSample
    samplebatch: SampleBatch
    trajectoriespool: TrajectoriesPool
    env: Union[TimeLimit, Any]
    # endregion

    (_, _, _, _, _, _, _, env, initial_observation) = gym_continuous_pool_setup

    # Rebuild the components with a small pool so the capacity is easy to reason about
    POOL_CAPACITY_4 = 4
    exp_spec = ExperimentSpec(batch_size_in_ts=2, max_epoch=2,
                              theta_nn_hidden_layer_topology=(2, 2),
                              environment_name='LunarLanderContinuous-v2')
    exp_spec.set_experiment_spec({'pool_capacity': POOL_CAPACITY_4})
    playground = GymPlayground(exp_spec.prefered_environment)
    poolmanager = PoolManager(exp_spec, playground)

    timestepsample1 = TimestepSample(container_id=1, playground=playground)
    timestepsample2 = TimestepSample(container_id=2, playground=playground)
    timestepsample3 = TimestepSample(container_id=3, playground=playground)
    timestepsample4 = TimestepSample(container_id=4, playground=playground)
    tss_collection = [timestepsample1, timestepsample2, timestepsample3, timestepsample4]

    # Step the environment POOL_CAPACITY_4 times and fill the pool
    obs_t = initial_observation
    tss: TimestepSample
    for tss in tss_collection:
        act_t, obs_t_prime, rew_t, done_t = step_foward_and_collect(env, obs_t, poolmanager)
        tss.replace(obs_t=obs_t, act_t=act_t, obs_t_prime=obs_t_prime,
                    rew_t=rew_t, done_t=done_t)
        obs_t = obs_t_prime

    assert poolmanager.timestep_collected_so_far() == POOL_CAPACITY_4

    # Two independent draws from the pool should differ
    minibatch_list1 = poolmanager._trajectories_pool.sample_from_pool_as_list()
    minibatch_list2 = poolmanager._trajectories_pool.sample_from_pool_as_list()
    print(minibatch_list1)
    print(minibatch_list2)
    assert minibatch_list1 != minibatch_list2

    # Sanity check: two independent random.sample draws should also differ
    popu = [idx for idx in range(10)]
    popu_sample1 = random.sample(popu, 3)
    popu_sample2 = random.sample(popu, 3)
    assert popu_sample1 != popu_sample2

    minibatch1: SampleBatch = poolmanager.sample_from_pool()
    minibatch2: SampleBatch = poolmanager.sample_from_pool()
    print(minibatch1)
    print(minibatch2)

    # Each minibatch should hold exactly batch_size_in_ts (2) of the collected samples
    flag1 = [tss in minibatch1 for tss in tss_collection]
    flag2 = [tss in minibatch2 for tss in tss_collection]
    assert sum(flag1) == 2
    assert sum(flag2) == 2
    assert minibatch1 != minibatch2
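
# Hedged note (a sketch, not repository code): the inequality asserts above are
# probabilistic -- two independent random draws can occasionally coincide and
# fail the test spuriously. Seeding the RNG makes such sampling deterministic:
import random

def _seeded_sampling_is_reproducible_sketch():
    popu = list(range(10))
    random.seed(42)
    first = random.sample(popu, 3)
    random.seed(42)
    second = random.sample(popu, 3)
    assert first == second  # identical seed ==> identical draw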