Example #1
    def worker(runs, env_cfg, task, param):
        environment = Maze(**env_cfg)
        agent_program = AGENT_PROGRAM[task[0]](environment.act_spec,
                                               environment.obs_spec, **task[1])
        agent = core.Agent(agent_program, lambda env:
                           (env.STATE_IDX[env.obs], env.reward),
                           lambda action, env: env.step(action))

        steps = []
        for _ in range(task[2]['episodes']):
            core.Run(agent, environment).start()

            # Record the length of this episode, then reset both the learner
            # and the environment before the next one.
            steps.append(environment.nstep)

            agent_program.reset()
            environment.reset()

        key = {
            **env_cfg,
            'runs': runs,
            'alg': task[0],
            **task[1],
            **task[2],
        }
        SharedMem.dump(Testbed.key_for('episodes', **key), param['run'], steps)
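A minimal sketch of how a worker with this signature might be fanned out over independent runs, assuming `worker` is importable as a plain function or static method; the import path, the `ENV_CFG` keys, the algorithm name, and the task tuple below are hypothetical placeholders, not the library's actual configuration:

from multiprocessing import Pool

from testbed import worker  # hypothetical import path

RUNS = 4
ENV_CFG = {'size': 9}  # hypothetical Maze kwargs
TASKS = [('q_learning', {'alpha': 0.5}, {'episodes': 100})]  # hypothetical task

if __name__ == '__main__':
    with Pool() as pool:
        # One (runs, env_cfg, task, param) tuple per independent run.
        pool.starmap(worker, [(RUNS, ENV_CFG, task, {'run': run})
                              for task in TASKS
                              for run in range(RUNS)])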
Example #2
    def worker(runs, env_cfg, task, param):
        environment = Maze(**env_cfg)
        agent_program = AGENT_PROGRAM[task[0]](environment.act_spec,
                                               environment.obs_spec, **task[1])
        agent = core.Agent(agent_program, lambda env:
                           (env.STATE_IDX[env.obs], env.reward),
                           lambda action, env: env.step(action))

        # Keep launching episodes until the environment has accumulated the
        # requested total number of steps.
        while environment.steps_cnt < task[2]['steps']:
            core.Run(agent, environment).start()

            agent_program.reset()
            environment.reset()

        key = {
            **env_cfg,
            'runs': runs,
            'alg': task[0],
            **task[1],
            **task[2],
        }
        SharedMem.dump(Testbed.key_for('rewards', **key), param['run'],
                       environment.rewards[:task[2]['steps']])
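Examples 1 and 2 differ only in their stopping criterion: the first runs a fixed number of episodes and records each episode's length (`environment.nstep`), while the second keeps launching episodes until a total step budget (`task[2]['steps']`) has been spent and records the per-step rewards, truncated to exactly that budget.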
Example #3
def test_runs_in_mountain_car_environment():
    environment = MountainCar()
    agent_program = SemiGradientSarsa(
        environment.act_spec,
        environment.obs_spec,
        alpha=0.5,
        epsilon=0.0,
        gamma=1.0)
    agent = core.Agent(
        agent_program,
        lambda env: (env.obs, env.reward, env.done()),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
Example #4
def test_true_sarsa_lambda_runs_in_mountain_car_environment():
    environment = MountainCar()
    agent_program = TrueOnlineSarsaLambda(
        environment.act_spec,
        environment.obs_spec,
        alpha=0.2,
        epsilon=0.0,
        gamma=1.0,
        lmbda=0.9)
    agent = core.Agent(
        agent_program,
        lambda env: (env.obs, env.reward, env.done()),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
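Examples 3 and 4 share identical wiring; only the agent program and its hyperparameters differ. The two lambdas handed to `core.Agent` define the contract with the environment: the first turns it into an `(observation, reward, done)` percept, the second applies a chosen action. Below is a minimal sketch of an environment satisfying that contract, with attribute and method names inferred from these examples rather than taken from the library's documentation:

class CountdownEnv:
    """Toy environment exposing what the percept/action lambdas expect."""

    def __init__(self, start=10):
        self.start = start
        self.reset()

    def reset(self):
        self.obs = self.start  # current observation
        self.reward = 0.0      # reward from the last transition

    def done(self):
        return self.obs <= 0

    def step(self, action):
        self.obs -= 1       # any action moves one step toward termination
        self.reward = -1.0  # constant step cost

It would be wired up exactly like the tests above: `core.Agent(program, lambda env: (env.obs, env.reward, env.done()), lambda action, env: env.step(action))`, then `core.Run(agent, CountdownEnv()).start()`.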
Example #5
    def test_collects_reward_from_each_step(self):
        arms = 10
        eps = 0.1
        environment = NArmedBanditEnv(10, arms)
        agent = core.Agent(
            epsilongreedy.SampleAverage(
                act_spec=core.Spec([core.Space(shape=(arms,))]),
                obs_spec=core.Spec([core.Space(shape=(arms,))]),
                epsilon=eps), lambda env: (env.last_action, env.reward),
            lambda action, env: env.step(action))

        core.Run(agent, environment).start()
        assert len(environment.all_rewards) == 10
        assert len(environment.optimal_actions) == 10
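Here the specs are built by hand with `core.Spec` and `core.Space` instead of being taken from the environment, and, per the test's name, the assertions confirm that the bandit logged one reward and one optimal-action flag for every step of the run.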
Example #6
def test_nstep_sarsa():
    environment = WindyGridWorld()
    agent_program = OnPolicyNStepSarsa(
        environment.act_spec,
        environment.obs_spec,
        n=2,
        alpha=0.5,
        epsilon=0.1,
        gamma=1.0)
    agent = core.Agent(
        agent_program,
        lambda env: (env.STATE_IDX[env.obs], env.reward, env.done()),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
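As in Example 1 (and Example 10 below), the percept lambda runs the raw grid observation through `env.STATE_IDX`, so the tabular agent sees a discrete state index rather than the observation itself.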
Example #7
def test_actor_critic_runs_in_mountain_car_environment():
    environment = MountainCar()
    agent_program = ActorCriticLambda(
        environment.act_spec,
        environment.obs_spec,
        alpha_w=0.1,
        alpha_theta=0.01,
        gamma=1.0,
        lambda_w=0.9,
        lambda_theta=0.9)
    agent = core.Agent(
        agent_program,
        lambda env: (env.obs, env.reward, env.done()),
        lambda action, env: env.step(action))
    core.Run(agent, environment).start()
Example #8
    def worker(runs, env_cfg, task, param):
        environment = NArmedBanditEnv(**env_cfg)
        agent = core.Agent(
            AGENT_PROGRAM[task[0]](environment.act_spec, environment.obs_spec,
                                   **task[1]), lambda env:
            (env.last_action, env.reward),
            lambda action, env: env.step(action))

        core.Run(agent, environment).start()

        key = {**env_cfg, 'runs': runs, 'alg': task[0], **task[1]}
        SharedMem.dump(Testbed.key_for('rewards', **key), param['run'],
                       environment.all_rewards)
        SharedMem.dump(Testbed.key_for('actions', **key), param['run'],
                       environment.optimal_actions)
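Unlike the Maze workers above, a single `core.Run` here covers the entire bandit experiment, so no episode loop or reset is needed; the full reward and optimal-action traces are then dumped to shared memory under keys derived from the merged configuration.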
Example #9
    def worker(runs, env_cfg, task, param):
        environment = RaceTrack(**env_cfg)
        agent = core.Agent(
            AGENT_PROGRAM[task[0]](
                environment.act_spec,
                environment.obs_spec,
                **task[1]),
            lambda env: (env.obs, -1),  # flat -1 reward for every step
            lambda action, env: env.step(action))

        core.Run(agent, environment).start()

        key = {**env_cfg, 'runs': runs, 'alg': task[0], **task[1]}
        states, actions, rewards = environment.episode()
        run = param['run']
        SharedMem.dump(Testbed.key_for('states', **key), run, states)
        SharedMem.dump(Testbed.key_for('rewards', **key), run, rewards)
        SharedMem.dump(Testbed.key_for('actions', **key), run, actions)
Example #10
    def run(self):
        environment = WindyGridWorld(stochastic=self.stochastic)
        agent_program = OnPolicyNStepSarsa(environment.act_spec,
                                           environment.obs_spec,
                                           n=self.n,
                                           alpha=self.alpha,
                                           epsilon=self.epsilon,
                                           gamma=self.gamma)
        agent = core.Agent(
            agent_program, lambda env:
            (env.STATE_IDX[env.obs], env.reward, env.done()),
            lambda action, env: env.step(action))
        for _ in tqdm.tqdm(range(self.runs)):
            core.Run(agent, environment).start()

            # Record the episode length, then reset the learner and the
            # environment before the next run.
            self.steps.append(environment.nstep)

            agent_program.reset()
            environment.reset()
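A hypothetical harness around this `run()` method, with constructor fields inferred from the attributes the method reads; the default values are illustrative, not the original experiment's settings:

class Experiment:
    def __init__(self, runs=10, n=4, alpha=0.5, epsilon=0.1,
                 gamma=1.0, stochastic=False):
        self.runs = runs              # independent training runs
        self.n = n                    # n-step Sarsa lookahead
        self.alpha = alpha            # step size
        self.epsilon = epsilon        # exploration rate
        self.gamma = gamma            # discount factor
        self.stochastic = stochastic  # stochastic wind in the grid world
        self.steps = []               # episode lengths collected by run()

    # run() as shown above would be defined here.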