def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the gym environment")
    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=50000,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--max-timesteps',
                        type=int,
                        default=2000 * 60,
                        help="Maximum number of timesteps per episode")
    # parser.add_argument('-m', '--monitor', help="Save results to this directory")
    # parser.add_argument('-ms', '--monitor-safe', action='store_true', default=False, help="Do not overwrite previous results")
    # parser.add_argument('-mv', '--monitor-video', type=int, default=0, help="Save video every x steps (0 = disabled)")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    environment = OpenAIUniverse(args.gym_id)
    environment.configure(remotes=1)

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(states=environment.states,
                                        actions=environment.actions,
                                        network=network_spec))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.load_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    report_episodes = max(1, args.episodes // 1000)
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {} after {} timesteps. Steps Per Second {}".
                format(r.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))
    runner.run(args.episodes,
               args.max_timesteps,
               episode_finished=episode_finished)
    runner.close()
    logger.info(
        "Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    # if args.monitor:
    #     environment.gym.monitor.close()
    environment.close()
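
For reference, the files passed via -a/--agent-config and -n/--network-spec above are plain JSON. The sketch below shows, as Python literals, roughly what they might contain for an older TensorForce 0.x release; the agent keys ("type": "ppo_agent", "step_optimizer", "discount") are assumptions about that API rather than taken from this script, while the layer-list network format matches the inline spec used in a later example.

# Hypothetical contents of the -a / --agent-config file (TensorForce 0.x style; exact keys vary by version).
agent_config = {
    "type": "ppo_agent",                                        # agent class to construct
    "step_optimizer": {"type": "adam", "learning_rate": 1e-3},  # assumed optimizer spec
    "discount": 0.99                                            # plus whatever other hyperparameters the agent accepts
}

# Hypothetical contents of the -n / --network-spec file: a list of layer definitions.
network_spec = [
    {"type": "dense", "size": 32, "activation": "relu"},
    {"type": "dense", "size": 32, "activation": "relu"}
]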
Example #2
def episode_finished(r):
    # Episode-end callback: refresh the plot and periodically save the model (every ~30 episodes).
    global iter
    global modelSaves
    plt.pause(0.01)

    if (iter == 30):
        iter = 0
        agent.save_model('longlongNoNorma/dense_mix')
        modelSaves = modelSaves + 1
    else:
        iter = iter + 1

    return True


# Start learning
runner.run(episodes=7000,
           max_episode_timesteps=(candles.candle_nums + 100),
           episode_finished=episode_finished)

#runner.run(episodes=1, max_episode_timesteps=(candles.candle_nums + 100), episode_finished=episode_finished, deterministic=True)

# Print statistics
print(
    "Learning finished. Total episodes: {ep}. Average reward of last 100 episodes: {ar}."
    .format(ep=runner.episode, ar=np.mean(runner.episode_rewards[-100:])))

print(env.pair_currency)
print(env.base_currency)

runner.close()
Example #3
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize',
                        action='store_true',
                        default=False,
                        help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")
    parser.add_argument(
        '--job',
        type=str,
        default=None,
        help="For distributed mode: The job type of this agent.")
    parser.add_argument(
        '--task',
        type=int,
        default=0,
        help="For distributed mode: The task index of this agent.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(gym_id=args.gym_id,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video,
                            visualize=args.visualize)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    # TEST
    agent["execution"] = dict(
        type="distributed",
        distributed_spec=dict(
            job=args.job,
            task_index=args.task,
            # parameter_server=(args.job == "ps"),
            cluster_spec=dict(ps=["192.168.2.107:22222"],
                              worker=["192.168.2.107:22223"
                                      ]))) if args.job else None
    # END: TEST

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(
                                states=environment.states,
                                actions=environment.actions,
                                network=network,
                            ))
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError(
                    "Cannot save agent to dir {} ({})".format(save_dir, e))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)

        return True

    runner.run(num_timesteps=args.timesteps,
               num_episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
Example #4
def main():

    args = make_args_parser()
    # print_config(args)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    logger.addHandler(logging.StreamHandler(sys.stdout))

    # # Temporary for quick access
    # args.groups = 1
    # args.run_all = False
    #
    # args.episodes = 501
    # args.save_episodes = 3500
    # args.testing = False
    # args.target_group = 7
    # args.restore_agent = True
    #
    # args.save_agent = "./saved_model/" + path

    # input_path = "./saved_model/" + "V3/group7-1000/"
    # path = "V3/group7-1500/"
    # output_path = "./outputs/" + path

    # ~~~~~~~~~~~~~~~~~ Setting up the Model ~~~~~~~~~~~~~~~~~ #

    # Initialize environment (tensorforce's template)
    memory = {}
    environment = ReJoin(
        args.phase,
        args.query,
        args.episodes,
        args.groups,
        memory,
        args.mode,
        args.target_group,
        args.run_all
    )

    if args.agent_config is not None:
        with open(args.agent_config, "r") as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, "r") as fp:
            network_spec = json.load(fp=fp)
    else:
        raise TensorForceError("No network configuration provided.")

    # Set up the PPO Agent
    agent = Agent.from_spec(
        spec=agent_config,
        kwargs=dict(
            states=environment.states, actions=environment.actions, network=network_spec,
            variable_noise=0.5
        ),
    )

    if args.restore_agent != "":
        agent.restore_model(directory=args.restore_agent)

    runner = Runner(agent=agent, environment=environment)
    # ~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~~~~~ ~~~~~~~~~~~~~~~~~ #

    report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:

            if args.save_agent != "" and args.testing is False and r.episode == args.save_episodes:
                save_dir = os.path.dirname(args.save_agent)
                if not os.path.isdir(save_dir):
                    try:
                        os.mkdir(save_dir, 0o755)
                    except OSError as e:
                        raise OSError("Cannot save agent to dir {} ({})".format(save_dir, e))

                r.agent.save_model(
                    directory=args.save_agent, append_timestep=True
                )

            logger.info(
                "Episode {ep} reward: {r}".format(ep=r.episode, r=r.episode_rewards[-1])
            )
            logger.info(
                "Average of last 100 rewards: {}\n".format(
                    sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))
                )
            )
        return True

    logger.info(
        "Starting {agent} for Environment '{env}'".format(agent=agent, env=environment)
    )

    # Start training or testing
    runner.run(
        episodes=args.episodes,
        max_episode_timesteps=args.max_timesteps,
        episode_finished=episode_finished,
        deterministic=args.testing,
    )

    runner.close()
    logger.info("Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    def find_convergence(eps):
        last = eps[-1]
        for i in range(1, len(eps)):
            if eps[i * -1] != last:
                print("Converged at episode:", len(eps) - i + 2)
                return True

    find_convergence(runner.episode_rewards)
    # plt.figure(1)
    # plt.hist(runner.episode_rewards)
    #
    # plt.figure(2)
    # plt.plot(runner.episode_rewards, "b.", MarkerSize=2)

    if not os.path.exists(args.outputs):
        os.makedirs(args.outputs)
    # Plot recorded costs over all episodes
    # print(memory)
    i = 2
    for file, val in memory.items():
        i += 1
        plt.figure(i)

        postgres_estimate = val["postgres_cost"]
        costs = np.array(val["costs"])
        max_val = max(costs)
        min_val = min(costs)
        plt.xlabel("episode")
        plt.ylabel("cost")
        plt.title(file)
        plt.scatter(
            np.arange(len(costs)),
            costs,
            c="g",
            alpha=0.5,
            marker=r"$\ast$",
            label="Cost",
        )
        plt.legend(loc="upper right")
        plt.scatter(
            0,
            [min_val],
            c="r",
            alpha=1,
            marker=r"$\heartsuit$",
            s=200,
            label="min cost observed=" + str(min_val),
        )
        plt.scatter(
            0,
            [max_val],
            c="b",
            alpha=1,
            marker=r"$\times$",
            s=200,
            label="max cost observed=" + str(max_val),
        )
        plt.legend(loc="upper right")
        plt.scatter(
            0,
            [postgres_estimate],
            c="c",
            alpha=1,
            marker=r"$\star$",
            s=200,
            label="postgreSQL estimate=" + str(postgres_estimate),
        )
        plt.legend(loc="upper right")

        plt.savefig(args.outputs + file + ".png")

    plt.show(block=True)
Example #5
def main():
    parser = argparse.ArgumentParser(description='Playground Flags.')
    parser.add_argument('--game',
                        default='pommerman',
                        help='Game to choose.')
    parser.add_argument('--config',
                        default='ffa_v0',
                        help='Configuration to execute.')
    parser.add_argument('--agents',
                        default='tensorforce::ppo,test::agents.SimpleAgent,test::agents.SimpleAgent,test::agents.SimpleAgent',
                        help='Comma delineated list of agent types and docker locations to run the agents.')
    parser.add_argument('--record_dir',
                        help="Directory to record the PNGs of the game. Doesn't record if None.")
    args = parser.parse_args()

    config = utility.AttrDict(getattr(configs, args.config)())
    _agents = []
    for agent_id, agent_info in enumerate(args.agents.split(",")):
        agent = config.agent(agent_id, config.game_type)
        agent_type, agent_control = agent_info.split("::")
        assert agent_type in ["player", "random", "docker", "test", "tensorforce"]
        if agent_type == "player":
            assert agent_control in ["arrows"]
            on_key_press, on_key_release = utility.get_key_control(agent_control)
            agent = agents.PlayerAgent(
                agent, utility.KEY_INPUT, on_key_press=on_key_press, on_key_release=on_key_release)
        elif agent_type == "random":
            agent = agents.RandomAgent(agent)
        elif agent_type == "docker":
            agent = agents.DockerAgent(
                agent,
                docker_image=agent_control,
                docker_client=client,
                port=agent_id+1000)
        elif agent_type == "test":
            agent = eval(agent_control)(agent)
        elif agent_type == "tensorforce":
            agent = agents.TensorForceAgent(agent, algorithm=agent_control)
            training_agent = agent
        _agents.append(agent)

    gym.envs.registration.register(
        id=config.env_id,
        entry_point=config.env_entry_point,
        kwargs=config.env_kwargs
    )
    env = config.env(**config.env_kwargs)
    env.set_agents(_agents)
    env.set_training_agent(training_agent.agent_id)
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, _agents))
    wrapped_env = WrappedEnv(env, visualize=True)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps, runner.episode_times)

    try:
        runner.close()
    except AttributeError as e:
        pass
Example #6
def main():
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(
        logging.Formatter(
            "%(asctime)s [%(threadName)-12.12s] [%(levelname)-5.5s]  %(message)s"
        ))
    logger.addHandler(console_handler)

    parser = argparse.ArgumentParser()

    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")

    args = parser.parse_args()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    if network_spec is not None and network_spec[0]['type'] == 'conv2d':
        agent_config['states_preprocessing'] = [{
            'type': 'expand_dims',
            'axis': -1
        }]
    else:
        agent_config['states_preprocessing'] = [{'type': 'flatten'}]

    logger.info("Start training")

    environment = Game2048()

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(
                                states=environment.states,
                                actions=environment.actions,
                                network=network_spec,
                            ))

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    def episode_finished(r):
        if r.episode % 100 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
                .format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Episode timesteps: {}".format(r.episode_timestep))
            logger.info("Episode largest tile: {}".format(
                r.environment.largest_tile))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    runner.run(timesteps=6000000,
               episodes=1000,
               max_episode_timesteps=10000,
               deterministic=False,
               episode_finished=episode_finished)

    terminal = False
    state = environment.reset()
    while not terminal:
        action = agent.act(state)
        state, terminal, reward = environment.execute(action)
    environment.print_state()

    runner.close()
Example #7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-i',
                        '--import-modules',
                        help="Import module(s) required for environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('--visualize',
                        action='store_true',
                        default=False,
                        help="Enable OpenAI Gym's visualization")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")
    parser.add_argument('-te',
                        '--test',
                        action='store_true',
                        default=False,
                        help="Test agent without learning.")
    parser.add_argument(
        '-sl',
        '--sleep',
        type=float,
        default=None,
        help=
        "Slow down simulation by sleeping for x seconds (fractions allowed).")
    parser.add_argument(
        '--job',
        type=str,
        default=None,
        help="For distributed mode: The job type of this agent.")
    parser.add_argument(
        '--task',
        type=int,
        default=0,
        help="For distributed mode: The task index of this agent.")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    environment = OpenAIGym(gym_id=args.gym_id,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video,
                            visualize=args.visualize)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
        agent = Agent.from_spec(spec=agent,
                                kwargs=dict(states=environment.states,
                                            actions=environment.actions,
                                            network=network))
    else:
        logger.info("No network configuration provided.")
        agent = Agent.from_spec(spec=agent,
                                kwargs=dict(states=environment.states,
                                            actions=environment.actions))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError as e:
                raise OSError(
                    "Cannot save agent to dir {} ({})".format(save_dir, e))

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        if args.save and args.save_episodes is not None and not r.episode % args.save_episodes:
            logger.info("Saving agent to {}".format(args.save))
            r.agent.save_model(args.save)

        return True

    runner.run(num_timesteps=args.timesteps,
               num_episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished,
               testing=args.test,
               sleep=args.sleep)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
Example #8
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--gym_id',
                        default='CartPole-v0',
                        help="Name of the OpenAI Gym Environment")
    parser.add_argument('-a',
                        '--agent',
                        type=str,
                        default='PPO',
                        help="Agent to train.")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=20,
                        help="Number of episodes to train for.")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps to train for.")
    parser.add_argument(
        '-nv',
        '--novisualize',
        action='store_false',
        default=True,
        help="Don't visualize training (will speed up training)")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument(
        '-d',
        '--deterministic',
        action='store_true',
        default=False,
        help="Choose deterministically and don't use random actions.")
    parser.add_argument(
        '-l',
        '--load',
        help="Load pretrained agent from this particular directory.")
    parser.add_argument(
        '-nm',
        '--num-episodes-to-test',
        type=int,
        default=10,
        help="Number of episodes to test the loaded policy for.")
    parser.add_argument('-x',
                        '--exp',
                        type=str,
                        default='exp_test_delete_this',
                        help="Name of experiment for logging/saving weights.")
    parser.add_argument('--monitor',
                        default='./logs/',
                        help="Save results and logs to this directory.")
    parser.add_argument('--save',
                        default='./weights/',
                        help="Save trained model to this directory.")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results.")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled).")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs.")

    args = parser.parse_args()

    # Load the required agent from custom module
    logger.info('Loading {} Agent/Network'.format(args.agent))

    if args.agent.lower() == 'ddpg':
        from modules.custom_agents import DDPG_Agent_Network
        agent, network = DDPG_Agent_Network()
    elif args.agent.lower() == 'naf':
        from modules.custom_agents import NAF_Agent_Network
        agent, network = NAF_Agent_Network()
    elif args.agent.lower() == 'trpo':
        from modules.custom_agents import TRPO_Agent_Network
        agent, network = TRPO_Agent_Network()
    elif args.agent.lower() == 'ppo':
        from modules.custom_agents import PPO_Agent_Network
        agent, network = PPO_Agent_Network()
    elif args.agent.lower() == 'vpg':
        from modules.custom_agents import VPG_Agent_Network
        agent, network = VPG_Agent_Network()

    logfilepath = os.path.join(args.monitor, args.agent, args.exp)

    if not args.load:
        logger.info('Creating logging folder {}'.format(logfilepath))
        os.system('mkdir -p {}'.format(logfilepath))

    env = OpenAIGym(gym_id=args.gym_id,
                    monitor=logfilepath,
                    monitor_safe=args.monitor_safe,
                    monitor_video=args.monitor_video,
                    visualize=args.novisualize)

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(
                                states=env.states,
                                actions=env.actions,
                                network=network,
                            ))

    if args.load:
        logger.info("Testing pre-trained model!")
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)
        logger.info('Loaded pre-trained model weights!')
        logger.info('Starting testing process!')
        env = gym.make(args.gym_id)
        for _i in range(args.num_episodes_to_test):
            logger.info('Episode: {}'.format(_i))
            s = env.reset()
            done = False
            while not done:
                env.render()
                action = agent.act(s)
                s, r, done, _ = env.step(action)
            # TODO: Make a logger here similar to episode_end()
        return

    def episode_finished(r, id):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=env, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 1

    logger.info("Starting {agent} for Environment '{env}'".format(agent=agent,
                                                                  env=env))

    runner.run(num_timesteps=args.timesteps,
               num_episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
    filepath = os.path.join(args.save, args.agent, args.exp)
    logger.info('Creating directory {}'.format(filepath))
    os.system('mkdir -p {}'.format(filepath))  # recursive mkdir
    logger.info("Saving trained model to {}!".format(filepath))
    filepath = agent.save_model(os.path.join(filepath, 'model'),
                                append_timestep=False)
    logger.info("Saved trained model as: {}".format(filepath))

    runner.close()
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-P',
        '--port',
        default=6025,
        help=
        "Port on which the UE4 Game listens on for incoming RL-client connections"
    )
    parser.add_argument('-H',
                        '--host',
                        default=None,
                        help="Hostname of the UE4 Game (default: localhost)")
    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")
    parser.add_argument('-R',
                        '--random-test-run',
                        action="store_true",
                        help="Do a quick random test run on the env")

    args = parser.parse_args()

    # logging.basicConfig(filename="logfile.txt", level=logging.INFO)
    logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)

    # We have to connect this remote env to get the specs.
    # We also discretize axis-mappings b/c we will use a deep q-network.
    # Use num_ticks==6 to match Nature paper by Mnih et al.
    # ("human cannot press fire button with more than 10Hz", dt=1/60)
    # TODO: Need to build in capturing and concat'ing last 4 images (plus 8-bit conversion!) into 1 input state signal.
    # TODO: Use pre-processor for that.
    environment = UE4Environment(host=args.host,
                                 port=args.port,
                                 connect=True,
                                 discretize_actions=True,
                                 num_ticks=6)
    environment.seed(200)

    # Do a quick random test-run with image capture of the first n images -> then exit after 1000 steps.
    if args.random_test_run:
        # Reset the env.
        s = environment.reset()
        img_format = "RGB" if len(environment.states["shape"]) == 3 else "L"
        img = Image.fromarray(s, img_format)
        # Save first received image as a sanity-check.
        img.save("reset.png")
        for i in range(1000):
            s, is_terminal, r = environment.execute(action=random.choice(
                range(environment.actions["num_actions"])))
            if i < 10:
                img = Image.fromarray(s, img_format)
                img.save("{:03d}.png".format(i))
            logging.debug("i={} r={} term={}".format(i, r, is_terminal))
            if is_terminal:
                environment.reset()
        quit()

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network_spec = json.load(fp=fp)
    else:
        network_spec = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(states=environment.states,
                                        actions=environment.actions,
                                        network=network_spec))
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.global_timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {} after {} timesteps. SPS={}".format(
                    r.global_episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    runner.run(timesteps=args.timesteps,
               episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
Example #10
class Ezpeezy():
    """
    This class is used to encompass all user behavior and interactions.
    ...
    Attributes
    ----------
    _env : tensorforce.Environment
        custom environment used to define hyperparameter space and reward functions
    _agent : tensorforce.Agent
        dqn agent used to optimize the reward function defined in the environment
    _runner : tensorforce.Runner
        used to handle the training job of the agent
    
    Methods
    -------
    set_k_folds(n_folds, pick_random=None)
        Specifies to the environment what sort of cross-validation data 
        configuration to use.
    train_on_data(X_train, y_train, X_valid=None, y_valid=None)
        Specifies to the environment what data to use for training.
    get_history()
        Returns the history of the agent including the configurations it has already
        tested.
    run(num_episodes)
        Begins using the agent to discover the actions required to optimize the
        environment's reward.
    """
    def __init__(self,
                 config,
                 model_fn,
                 model_type='sklearn',
                 model_train_batch_size=256,
                 model_train_epochs=75,
                 exploration=0.9,
                 exploration_decay_rate=0.8,
                 monitor_metric='val_loss',
                 opt='max',
                 starting_tol=-0.01,
                 tol_decay=0.5,
                 deepqn_lr=1e-15):
        """
        Parameters
        ----------
        config : dict
            a dictionary representing the configuration of the hyperparameter space.
            keys are the hyperparameter names, while the values describe the range of
            the parameter space and its type
        model_fn : function
            function that returns the model you want to optimize
        model_type : string
            "sklearn" to signify that the passed in model_fn is of the sklearn library,
			or "keras" to signify that the passed in model_fn is made from the keras library
        model_train_batch_size : int
            the batch size to use when training your model
        model_train_epochs : int
            number of epochs to train your model for on each iteration
        exploration : float
            the agent's exploration value
        exploration_decay_rate : float
            the agent's exploration value's decay rate (uses exponential decay)
        monitor_metric : None or string or function
            the metric you would like to optimize in your model - a string in the case of
            model_type == 'keras', a function if model_type == 'sklearn', or None to use
            the .score(X, y) method of the sklearn classifier

            if function, defined to take in y_true, y_pred and return numeric type
        opt : string
            the optimization direction of the given monitor_metric
        starting_tol : int/float
            the amount by which you would like your metric to improve at each
            training step; if it does not, the agent's episode ends
        tol_decay : int/float
            at each training step in the episode, decrease the tolerance by this value
        deepqn_lr : float
            learning rate to use for the DQN
        """

        self._env = CustomEnvironment(
            config,
            model_train_epoch=model_train_epochs,
            model_train_batch_size=model_train_batch_size,
            model_fn=model_fn,
            model_type=model_type,
            monitor_metric=monitor_metric,
            opt=opt,
            starting_tol=starting_tol,
            tol_decay=tol_decay)
        self._agent = DeepQNetwork(
            states=self._env.states(),
            actions=self._env.actions(),
            max_episode_timesteps=self._env.max_episode_timesteps(),
            memory=60,
            batch_size=3,
            exploration=dict(type='decaying',
                             unit='timesteps',
                             decay='exponential',
                             initial_value=exploration,
                             decay_steps=100000,
                             decay_rate=exploration_decay_rate),
            discount=dict(type='decaying',
                          unit='timesteps',
                          decay='exponential',
                          initial_value=0.7,
                          decay_steps=100000,
                          decay_rate=0.5),
            learning_rate=deepqn_lr)

        self.runner = Runner(agent=self._agent, environment=self._env)

    def set_k_folds(self, n_folds, pick_random=None):
        """
        Specifies to the environment what sort of cross-validation data 
        configuration to use.

        Parameters
        ----------
        n_folds : int
            the number of folds to divide your dataset into using k-fold 
            cross-validation
        pick_random : int/None
            if set to an int, randomly select pick_random of the n_folds to use
            for training your model
        """
        assert isinstance(n_folds, int), 'n_folds must be an int'
        assert pick_random is None or (isinstance(pick_random, int) and pick_random < n_folds), \
            "pick_random must be an int less than n_folds or None"

        self._env.set_k_folds(n_folds, pick_random)

    def train_on_data(self, X_train, y_train, X_valid=None, y_valid=None):
        """
        Specifies to the environment what data to use for training.

        Parameters
        ----------
        X_train : iterable
            data used to train your model
        y_train : iterable
            labels used to train your model
        X_valid : iterable/None
            data used to validate your model unless using k-fold CV
        y_valid : iterable/None
            labels used to validate your model unless using k-fold CV
        """
        self._env.train_on_data(X_train, y_train, X_valid, y_valid)

    def get_history(self):
        """
        Returns the history of the agent including the configurations it has already
        tested.

        Returns
        -------
        pd.Dataframe
            Dataframe representing each absolute time step with its episode, configuration
            and monitored metric
        """
        return self._env.get_history()

    def run(self, num_episodes):
        """
        Begins using the agent to discover the actions required to optimize the
        environment's reward.
        
        Parameters
        ----------
        num_episodes : int
            number of episodes to try your agent for on your environment
        
        Prints
        ------
        the best parameters for your goal.
        """
        self._env.reset_history()
        self._env.set_num_episodes(num_episodes)
        self.runner.run(num_episodes=num_episodes)
        print('Best parameters are:')
        print(self._env.get_best_params())
        self.runner.close()
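
A minimal usage sketch of the Ezpeezy class above. The hyperparameter-space config format and the way model_fn receives its parameters are assumptions here (they are defined by CustomEnvironment, which is not shown), so treat the snippet as illustrative only.

from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

# Hypothetical hyperparameter-space config; the real key/range format is defined by CustomEnvironment.
config = {
    'n_estimators': {'type': 'int', 'range': [10, 200]},
    'max_depth': {'type': 'int', 'range': [2, 16]},
}

def model_fn(n_estimators, max_depth):
    # Returns the sklearn model whose hyperparameters the agent tunes (assumed call convention).
    return RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth)

X, y = load_iris(return_X_y=True)

ez = Ezpeezy(config, model_fn, model_type='sklearn', monitor_metric=None, opt='max')
ez.set_k_folds(5, pick_random=3)   # 5-fold CV, train on 3 randomly chosen folds
ez.train_on_data(X, y)             # no explicit validation split when using k-fold CV
ez.run(num_episodes=20)            # prints the best parameters found
history = ez.get_history()         # DataFrame of tested configurations and their metrics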
Example #11
def main():
    # Print all possible environments in the Pommerman registry
    # Instantiate the environment
    DETERMINISTIC = False
    VISUALIZE = False

    if args.test:
        DETERMINISTIC = True
        VISUALIZE = True

    config = ffa_competition_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    # Create a Proximal Policy Optimization agent
    with open('ppo.json', 'r') as fp:
        agent = json.load(fp=fp)

    with open('mlp2_lstm_network.json', 'r') as fp:
        network = json.load(fp=fp)

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=dict(type='float', shape=env.observation_space.shape),
            actions=dict(type='int', num_actions=env.action_space.n),
            network=network
        )
    )

    # Add 3 random agents
    agents = []
    for agent_id in range(3):
        agents.append(SimpleAgent(config["agent"](agent_id, config["game_type"])))

    # Add TensorforceAgent
    agent_id += 1
    agents.append(TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    # Instantiate and run the environment for 5 episodes.
    if VISUALIZE:
        wrapped_env = WrappedEnv(env, True)
    else:
        wrapped_env = WrappedEnv(env)

    runner = Runner(agent=agent, environment=wrapped_env)

    rewards = []
    episodes = []
    def episode_finished(r):
        nonlocal episodes
        nonlocal rewards
        print("Finished episode {ep} after {ts} timesteps (reward: {reward})".format(ep=r.episode, ts=r.episode_timestep,
                                                                             reward=r.episode_rewards[-1]))
        if r.episode % 1000 == 0:
            agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
            try:
                prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
                prev_len = len(prev_data[0])
                prev_data[0].extend(rewards)
                rewards = []
                prev_data[1].extend(episodes)
                episodes = []
                pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
            except (OSError, IOError) as e:
                pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))
        if r.episode_rewards[-1] >= 5:
            print()
            print()
            print()
            print("WINNER WINNER CHICKEN DINNER")
        episodes.append(r.episode)
        rewards.append(r.episode_rewards[-1])
        return True

    # Restore, Train, and Save Model
    if args.test or args.resume: # If test, change settings and restore model
        agent.restore_model('./','PPO_K_someS_500batch_biggerreward_99dis')
    runner.run(episodes=EPISODES, max_episode_timesteps=2000, episode_finished=episode_finished, deterministic=False)

    if not args.test:
        agent.save_model(('./{}').format(EXPERIMENT_NAME), False)
    print("Stats: ", runner.episode_rewards[-5:], runner.episode_timesteps[-5:])

    #Dump reward values
    try:
        prev_data = pickle.load(open(EXPERIMENT_NAME, "rb"))
        prev_len = len(prev_data[0])
        prev_data[0].extend(rewards)
        prev_data[1].extend(episodes)
        print(episodes)
        pickle.dump(prev_data, open(EXPERIMENT_NAME, "wb"))
    except (OSError, IOError) as e:
        pickle.dump([rewards, episodes], open(EXPERIMENT_NAME, "wb"))

    try:
        runner.close()
    except AttributeError as e:
        pass
Example #12
def main():
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game",
                        default="pommerman",
                        help="Game to choose.")
    parser.add_argument("--config",
                        default="PommeFFA-v0",
                        help="Configuration to execute. See env_ids in "
                        "configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                        "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker "
                        "locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars "
                        "to pass to Docker. This is only for the Docker Agent."
                        " An example is '0:foo=bar:baz=lar,3:foo=lam', which "
                        "would send two arguments to Docker Agent 0 and one to"
                        " Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir",
                        default=None,
                        help="Directory to record the PNGs of the game. "
                        "Doesn't record if None.")
    parser.add_argument("--record_json_dir",
                        default=None,
                        help="Directory to record the JSON representations of "
                        "the game. Doesn't record if None.")
    parser.add_argument("--render",
                        default=True,
                        help="Whether to render or not. Defaults to True.")
    parser.add_argument("--game_state_file",
                        default=None,
                        help="File from which to load game state. Defaults to "
                        "None.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    #       this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id+1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        if type(agent) == TensorForceAgent:
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps,
          runner.episode_times)

    try:
        runner.close()
    except AttributeError as e:
        pass
Example #13
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument('-a', '--agent-config', help="Agent configuration file")
    args = parser.parse_args()

    #From quickstart on docs
    #Network as list of layers
    #This is from mlp2_embedding_network.json
    network_spec = [
        {
            "type": "dense",
            "size":  32
#            "activation": "relu"
        },
        {
            "type": "dense",
            "size": 32
#            "activation": "relu"
        }
    ]
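
    # Hedged aside (assumption): the referenced mlp2_embedding_network.json presumably
    # holds the same layer list in JSON form, e.g.
    #   [{"type": "dense", "size": 32}, {"type": "dense", "size": 32}]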

    DATAPATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    observedFile = os.path.join(DATAPATH,r"prnio.int")
    infoFile = os.path.join(DATAPATH,r"prnio.cfl")

    environment = PycrysfmlEnvironment(observedFile, infoFile)

    #get agent configuration
    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    agent = Agent.from_spec(
            spec=agent_config,
            kwargs=dict(
                states=environment.states,
                actions=environment.actions,
                network=network_spec,
            )
        )

    # Use this line to restore a pre-trained agent
    #agent.restore_model(file="/mnt/storage/deepQmodel_chisq")

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    rewardsLog = []
    steps = []

    def episode_finished(r):

        if r.episode % 10 == 0:
            rewardsLog.append(r.episode_rewards[-1])
            steps.append(r.episode)

        if r.episode % 50 == 0:
            sps = r.timestep / (time.time() - r.start_time)
            with open("/mnt/storage/trainingLog", "a") as file:
                file.write("Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}\n".format(ep=r.episode,
                                                                                                          ts=r.timestep,
                                                                                                          sps=sps))
                file.write("Episode reward: {}\n".format(r.episode_rewards[-1]))
                file.write("Episode timesteps: {}\n".format(r.episode_timestep))
                file.write("Average of last 500 rewards: {}\n".format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
                file.write("Average of last 100 rewards: {}\n".format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))

            agent.save_model(directory="/mnt/storage/deepQmodel_simpleA_stdreward", append_timestep=False)

        return True

    runner.run(
        timesteps=60000000,
        episodes=5000,
        max_episode_timesteps=1000,
        deterministic=False,
        episode_finished=episode_finished
    )

    #graph rewards
    plt.scatter(steps, rewardsLog)
    plt.savefig('/mnt/storage/rewardLog_simpleA_stdreward.png')

    runner.close()
Example #14
    def compute(self, config_id, config, budget, working_directory):
        if self.environment.max_episode_timesteps() is None:
            min_capacity = 1000 + config['batch_size']
        else:
            min_capacity = self.environment.max_episode_timesteps() + config['batch_size']
        max_capacity = 100000
        capacity = min(
            max_capacity,
            max(min_capacity, config['memory'] * config['batch_size']))
        frequency = max(16, int(config['frequency'] * config['batch_size']))

        if config['ratio_based'] == 'yes':
            ratio_based = True
            clipping_value = config['clipping_value']
        else:
            ratio_based = False
            clipping_value = 0.0

        if config['baseline'] == 'no':
            baseline_policy = None
            baseline_objective = None
            baseline_optimizer = None
            estimate_horizon = False
            estimate_terminal = False
            estimate_advantage = False
        else:
            estimate_horizon = 'early'
            estimate_terminal = True
            estimate_advantage = (config['estimate_advantage'] == 'yes')
            if config['baseline'] == 'same-policy':
                baseline_policy = None
                baseline_objective = None
                baseline_optimizer = None
            elif config['baseline'] == 'auto':
                # other modes, shared network/policy etc !!!
                baseline_policy = dict(
                    network=dict(type='auto', internal_rnn=False))
                baseline_objective = dict(type='state_value',
                                          huber_loss=0.0,
                                          early_reduce=False)
                baseline_optimizer = dict(
                    type='adam',
                    learning_rate=config['baseline_learning_rate'])
            else:
                assert False

        if config['entropy_regularization'] < 3e-5:  # yes/no better
            entropy_regularization = 0.0
        else:
            entropy_regularization = config['entropy_regularization']

        agent = dict(
            agent='tensorforce',
            policy=dict(network=dict(type='auto', internal_rnn=False)),
            memory=dict(type='replay', capacity=capacity),
            update=dict(unit='timesteps',
                        batch_size=config['batch_size'],
                        frequency=frequency),
            optimizer=dict(type='adam', learning_rate=config['learning_rate']),
            objective=dict(type='policy_gradient',
                           ratio_based=ratio_based,
                           clipping_value=clipping_value,
                           early_reduce=False),
            reward_estimation=dict(horizon=config['horizon'],
                                   discount=config['discount'],
                                   estimate_horizon=estimate_horizon,
                                   estimate_actions=False,
                                   estimate_terminal=estimate_terminal,
                                   estimate_advantage=estimate_advantage),
            baseline_policy=baseline_policy,
            baseline_objective=baseline_objective,
            baseline_optimizer=baseline_optimizer,
            preprocessing=None,
            l2_regularization=0.0,
            entropy_regularization=entropy_regularization)

        # num_episodes = list()
        final_reward = list()
        max_reward = list()
        rewards = list()

        for n in range(round(budget)):
            runner = Runner(agent=agent, environment=self.environment)

            # performance_threshold = runner.environment.max_episode_timesteps() - agent['reward_estimation']['horizon']

            # def callback(r, p):
            #     return True

            runner.run(num_episodes=500, use_tqdm=False)
            runner.close()

            # num_episodes.append(len(runner.episode_rewards))
            final_reward.append(
                float(np.mean(runner.episode_rewards[-20:], axis=0)))
            average_rewards = [
                float(np.mean(runner.episode_rewards[n:n + 20], axis=0))
                for n in range(len(runner.episode_rewards) - 20)
            ]
            max_reward.append(float(np.amax(average_rewards, axis=0)))
            rewards.append(list(runner.episode_rewards))

        # mean_num_episodes = float(np.mean(num_episodes, axis=0))
        mean_final_reward = float(np.mean(final_reward, axis=0))
        mean_max_reward = float(np.mean(max_reward, axis=0))
        # loss = mean_num_episodes - mean_final_reward - mean_max_reward
        loss = -mean_final_reward - mean_max_reward

        return dict(loss=loss, info=dict(rewards=rewards))
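
    # Hedged note (assumption, not from the source): the compute(config_id, config, budget,
    # working_directory) signature matches the hpbandster Worker interface, so this method
    # would typically live on a Worker subclass driven by a BOHB optimizer, with `config`
    # sampled from a ConfigSpace and `budget` used as the number of training repetitions.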
Example #15
def main():
    env = OpenAIGym("P3DX-v0")

    agent = DQNAgent(
        states=dict(type='float', shape=(80,80,4)),
        actions=dict(type='int', num_actions=7),
        network= [
                dict(
                    type="conv2d",
                    size=16,
                    window=[8,8],
                    stride=4,
                    activation="relu"
                ),
                dict(
                    type="conv2d",
                    size=32,
                    window=[4,4],
                    stride=2,
                    activation="relu"
                ),
                dict(
                    type="flatten"
                ),
                dict(
                    type="dense",
                    size=256
                )
        ],
        actions_exploration = dict(
            type="epsilon_decay",
            initial_epsilon=1.0,
            final_epsilon=0.1,
            timesteps=1000
        ),
        memory=dict(
                type="replay",
                capacity=1000,
                include_next_states=True
        ),
        update_mode = dict(
            unit="timesteps",
            batch_size=16,
            frequency=4
        ),
        discount = 0.99,
        entropy_regularization = None,
        double_q_model = True,
        optimizer = dict(
            type="adam",
            learning_rate=1e-4
        )
    )

    
    try:
        agent.restore_model(directory="data/", file="data-117246")
        print("Found data!")
    except Exception:
        print("Can't load data")

    SAVE_INTERVAL = 10
    def episode_finished(r):
        #print(r.episode)
        if r.episode % SAVE_INTERVAL == 0:
            print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode + 1, ts=r.timestep + 1))
            print("Episode reward: {}".format(r.episode_rewards[-1]))
            print("Average of last {} rewards: {}\n".format(SAVE_INTERVAL, np.mean(r.episode_rewards[-SAVE_INTERVAL:])))

            r.agent.save_model(directory="data/data", append_timestep=True)

            with open("reward_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                for reward in r.episode_rewards[-SAVE_INTERVAL:]:
                    writer.writerow([r.episode, reward])
        
            with open("episode_history.csv", "a") as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([r.episode, r.timestep])
        '''
        with open("individual_reward_history.csv", "a") as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow([r.episode, r.episode_rewards[-1]])
        '''
        return True

    runner = Runner(
        agent = agent,  # Agent object
        environment = env  # Environment object
    )

    max_episodes  = 10000
    max_timesteps = 50000000
    runner.run(max_timesteps, max_episodes, episode_finished=episode_finished)

    runner.close()
Example #16
def main():
    '''CLI interface to bootstrap training'''
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game", default="pommerman", help="Game to choose.")
    parser.add_argument("--config",
                        default="PommeFFACompetition-v0",
                        help="Configuration to execute. See env_ids in "
                        "configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                        "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker "
                        "locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars "
                        "to pass to Docker. This is only for the Docker Agent."
                        " An example is '0:foo=bar:baz=lar,3:foo=lam', which "
                        "would send two arguments to Docker Agent 0 and one to"
                        " Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir",
                        default=None,
                        help="Directory to record the PNGs of the game. "
                        "Doesn't record if None.")
    parser.add_argument("--record_json_dir",
                        default=None,
                        help="Directory to record the JSON representations of "
                        "the game. Doesn't record if None.")
    parser.add_argument("--render",
                        default=False,
                        action='store_true',
                        help="Whether to render or not. Defaults to False.")
    parser.add_argument("--game_state_file",
                        default=None,
                        help="File from which to load game state. Defaults to "
                        "None.")
    parser.add_argument("--model_save_dir",
                        default="./ppo_model/model",
                        help="Directory to save the learnt models.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    #       this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id + 1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        # if type(agent) == TensorForceAgent:
        if agent.trainable:
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)

    num_epi = 200000
    vis_epi = 100
    max_reward = -10.0
    for i in range(num_epi // vis_epi):
        runner.run(episodes=vis_epi, max_episode_timesteps=2000)
        m_reward = np.mean(runner.episode_rewards[-vis_epi:])
        m_step = np.mean(runner.episode_timesteps[-vis_epi:])
        m_time = np.mean(runner.episode_times[-vis_epi:])
        print("[Iter %s]: %.3f %.3f %.3f" % (i, m_reward, m_step, m_time))
        sys.stdout.flush()

        if m_reward > max_reward:
            max_reward = m_reward
            agent.save_model(args.model_save_dir, False)
            print("[Save] max_reward=%s" % (max_reward))

    try:
        runner.close()
    except AttributeError as e:
        pass
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="Id of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n', '--network', default=None, help="Network specification file")
    parser.add_argument('-e', '--episodes', type=int, default=None, help="Number of episodes")
    parser.add_argument('-t', '--timesteps', type=int, default=None, help="Number of timesteps")
    parser.add_argument('-m', '--max-episode-timesteps', type=int, default=None, help="Maximum number of timesteps per episode")
    parser.add_argument('-d', '--deterministic', action='store_true', help="Choose actions deterministically")
    parser.add_argument('-M', '--mode', choices=('tmux', 'child'), default='tmux', help="Starter mode")
    parser.add_argument('-W', '--num-workers', type=int, default=1, help="Number of worker agents")
    parser.add_argument('-C', '--child', action='store_true', help="Child process")
    parser.add_argument('-P', '--parameter-server', action='store_true', help="Parameter server")
    parser.add_argument('-I', '--task-index', type=int, default=0, help="Task index")
    parser.add_argument('-K', '--kill', action='store_true', help="Kill runners")
    parser.add_argument('-L', '--logdir', default='logs_async', help="Log directory")
    parser.add_argument('-D', '--debug', action='store_true', help="Show debug outputs")

    args = parser.parse_args()

    session_name = 'OpenAI-' + args.gym_id
    shell = '/bin/bash'

    kill_cmds = [
        "kill $( lsof -i:12222-{} -t ) > /dev/null 2>&1".format(12222 + args.num_workers),
        "tmux kill-session -t {}".format(session_name),
    ]
    if args.kill:
        os.system("\n".join(kill_cmds))
        return 0

    if not args.child:
        # start up child processes
        target_script = os.path.abspath(inspect.stack()[0][1])

        def wrap_cmd(session, name, cmd):
            if isinstance(cmd, list):
                cmd = ' '.join(shlex_quote(str(arg)) for arg in cmd)
            if args.mode == 'tmux':
                return 'tmux send-keys -t {}:{} {} Enter'.format(session, name, shlex_quote(cmd))
            elif args.mode == 'child':
                return '{} > {}/{}.{}.out 2>&1 & echo kill $! >> {}/kill.sh'.format(
                    cmd, args.logdir, session, name, args.logdir
                )

        def build_cmd(ps, index):
            cmd_args = [
                'CUDA_VISIBLE_DEVICES=',
                sys.executable, target_script,
                args.gym_id,
                '--agent', os.path.join(os.getcwd(), args.agent),
                '--network', os.path.join(os.getcwd(), args.network),
                '--num-workers', args.num_workers,
                '--child',
                '--task-index', index
            ]
            if args.episodes is not None:
                cmd_args.append('--episodes')
                cmd_args.append(args.episodes)
            if args.timesteps is not None:
                cmd_args.append('--timesteps')
                cmd_args.append(args.timesteps)
            if args.max_episode_timesteps is not None:
                cmd_args.append('--max-episode-timesteps')
                cmd_args.append(args.max_episode_timesteps)
            if args.deterministic:
                cmd_args.append('--deterministic')
            if ps:
                cmd_args.append('--parameter-server')
            if args.debug:
                cmd_args.append('--debug')
            return cmd_args

        if args.mode == 'tmux':
            cmds = kill_cmds + ['tmux new-session -d -s {} -n ps'.format(session_name)]
        elif args.mode == 'child':
            cmds = ['mkdir -p {}'.format(args.logdir),
                    'rm -f {}/kill.sh'.format(args.logdir),
                    'echo "#/bin/bash" > {}/kill.sh'.format(args.logdir),
                    'chmod +x {}/kill.sh'.format(args.logdir)]

        cmds.append(wrap_cmd(session_name, 'ps', build_cmd(ps=True, index=0)))

        for i in xrange(args.num_workers):
            name = 'worker{}'.format(i)
            if args.mode == 'tmux':
                cmds.append('tmux new-window -t {} -n {} -d {}'.format(session_name, name, shell))
            cmds.append(wrap_cmd(session_name, name, build_cmd(ps=False, index=i)))

        # add one PS call
        # cmds.append('tmux new-window -t {} -n ps -d {}'.format(session_name, shell))

        print("\n".join(cmds))

        os.system("\n".join(cmds))

        return 0

    ps_hosts = ['127.0.0.1:{}'.format(12222)]
    worker_hosts = []
    port = 12223
    for _ in range(args.num_workers):
        worker_hosts.append('127.0.0.1:{}'.format(port))
        port += 1
    cluster = {'ps': ps_hosts, 'worker': worker_hosts}
    cluster_spec = tf.train.ClusterSpec(cluster)

    environment = OpenAIGym(args.gym_id)

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)  # log_levels[agent.log_level])

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    if args.parameter_server:
        agent['device'] = '/job:ps/task:{}'.format(args.task_index)  # '/cpu:0'
    else:
        agent['device'] = '/job:worker/task:{}'.format(args.task_index)  # '/cpu:0'

    agent['distributed'] = dict(
        cluster_spec=cluster_spec,
        task_index=args.task_index,
        parameter_server=args.parameter_server
    )

    agent = Agent.from_spec(
        spec=agent,
        kwargs=dict(
            states=environment.states,
            actions=environment.actions,
            network=network
        )
    )

    logger.info("Starting distributed agent for OpenAI Gym '{gym_id}'".format(gym_id=args.gym_id))
    logger.info("Config:")
    logger.info(agent)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1
    )

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info("Finished episode {} after overall {} timesteps. Steps Per Second {}".format(
                r.agent.episode,
                r.agent.timestep,
                steps_per_second)
            )
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        return True

    runner.run(
        timesteps=args.timesteps,
        episodes=args.episodes,
        max_episode_timesteps=args.max_episode_timesteps,
        deterministic=args.deterministic,
        episode_finished=episode_finished
    )
    runner.close()
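
# Hedged usage sketch (the script and config file names are hypothetical): launched without
# --child, the script spawns one parameter server plus -W workers, e.g.
#   python openai_gym_async.py CartPole-v0 -a configs/ppo.json -n configs/mlp2_network.json -W 2 -M tmux
# which creates a tmux session named 'OpenAI-CartPole-v0' with the parameter server on port
# 12222 and workers from 12223 upwards; -K tears the session down again.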
Example #18
    def compute(self, config_id, config, budget, working_directory):
        if self.environment.max_episode_timesteps() is None:
            min_capacity = 1000 + config['batch_size']
        else:
            min_capacity = self.environment.max_episode_timesteps() + config['batch_size']
        max_capacity = 100000
        capacity = min(max_capacity, max(min_capacity, config['memory'] * config['batch_size']))
        frequency = max(4, int(config['frequency'] * config['batch_size']))

        if config['baseline'] == 'no':
            baseline_policy = None
            baseline_objective = None
            baseline_optimizer = None
            estimate_horizon = False
            estimate_terminal = False
            estimate_advantage = False
        else:
            estimate_horizon = 'late'
            estimate_advantage = (config['estimate_advantage'] == 'yes')
            if config['baseline'] == 'same-policy':
                baseline_policy = None
                baseline_objective = None
                baseline_optimizer = None
            elif config['baseline'] == 'auto':
                # other modes, shared network/policy etc !!!
                baseline_policy = dict(network=dict(type='auto', internal_rnn=False))
                baseline_objective = dict(
                    type='value', value='state', huber_loss=0.0, early_reduce=False
                )
                baseline_optimizer = dict(
                    type='adam', learning_rate=config['baseline_learning_rate']
                )
            else:
                assert False

        if config['l2_regularization'] < 3e-5:  # yes/no better
            l2_regularization = 0.0
        else:
            l2_regularization = config['l2_regularization']

        if config['entropy_regularization'] < 3e-5:  # yes/no better
            entropy_regularization = 0.0
        else:
            entropy_regularization = config['entropy_regularization']

        # Set agent configuration according to configspace
        print("### Set agent configuration according to configspace")
        agent = dict(
            agent='tensorforce',
            policy=dict(network=dict(type='auto', internal_rnn=False)),
            memory=dict(type='replay', capacity=capacity),  # replay, recent
            update=dict(unit='timesteps', batch_size=config['batch_size'], frequency=frequency),  # timesteps, episode
            optimizer=dict(type='adam', learning_rate=config['learning_rate']),
            objective=dict(
                type='policy_gradient', ratio_based=True, clipping_value=0.1, 
                early_reduce=False
            ),
            reward_estimation=dict(
                horizon=config['horizon'], discount=config['discount'],
                estimate_horizon=estimate_horizon, estimate_actions=False,
                estimate_terminal=False, estimate_advantage=estimate_advantage
            ),
            baseline_policy=baseline_policy, baseline_objective=baseline_objective,
            baseline_optimizer=baseline_optimizer,
            preprocessing=None,
            l2_regularization=l2_regularization, entropy_regularization=entropy_regularization
        )

        # Set state representation according to configspace
        print("### Set state representation according to configspace")

        # Example state configurations to evaluate
        config_state = None
        if config['state'] == 0:
            config_state = []
        elif config['state'] == 1:
            config_state = ['bin_buffer_fill']
        elif config['state'] == 2:
            config_state = ['bin_buffer_fill', 'distance_to_action']
        elif config['state'] == 3:
            config_state = ['bin_buffer_fill', 'distance_to_action', 'bin_machine_failure']
        elif config['state'] == 4:
            config_state = ['bin_buffer_fill', 'distance_to_action', 'bin_machine_failure', 'order_waiting_time']


        self.environment.environment.parameters.update({'TRANSP_AGENT_STATE': config_state})
        self.environment.environment.parameters.update({'TRANSP_AGENT_REWARD': config['reward']})
        #self.environment.environment.parameters.update({'TRANSP_AGENT_REWARD_INVALID_ACTION': config['reward_invalid']})
        #self.environment.environment.parameters.update({'TRANSP_AGENT_REWARD_OBJECTIVE_WEIGHTS': config['reward_weighted']})
        self.environment.environment.parameters.update({'TRANSP_AGENT_MAX_INVALID_ACTIONS': config['max_invalid_actions']})
        self.environment.environment.parameters.update({'TRANSP_AGENT_WAITING_TIME_ACTION': config['waiting_if_invalid_actions']})

        # num_episodes = list()
        final_reward = list()
        max_reward = list()
        rewards = list()

        for n in range(round(budget)):
            runner = Runner(agent=agent, environment=self.environment)
            #runner = Runner(agent='config/ppo2.json', environment=self.environment)

            # performance_threshold = runner.environment.max_episode_timesteps() - agent['reward_estimation']['horizon']

            # def callback(r, p):
            #     return True

            runner.run(num_episodes=NUM_EPISODES, use_tqdm=False)
            runner.close()

            # num_episodes.append(len(runner.episode_rewards))
            final_reward.append(float(np.mean(runner.episode_rewards[-20:], axis=0)))
            average_rewards = [
                float(np.mean(runner.episode_rewards[n: n + 20], axis=0))
                for n in range(len(runner.episode_rewards) - 20)
            ]
            max_reward.append(float(np.amax(average_rewards, axis=0)))
            rewards.append(list(runner.episode_rewards))

        # mean_num_episodes = float(np.mean(num_episodes, axis=0))
        mean_final_reward = float(np.mean(final_reward, axis=0))
        mean_max_reward = float(np.mean(max_reward, axis=0))
        # loss = mean_num_episodes - mean_final_reward - mean_max_reward
        loss = -mean_final_reward - mean_max_reward

        return dict(loss=loss, info=dict(rewards=rewards))
    def test_blogpost_introduction_runner(self):
        from .minimal_test import MinimalTest
        from tensorforce.agents import DQNAgent
        from tensorforce.execution import Runner

        environment = MinimalTest(specification={'int': ()})

        network_spec = [dict(type='dense', size=32)]

        agent = DQNAgent(states=environment.states,
                         actions=environment.actions,
                         network=network_spec,
                         memory=dict(type='replay',
                                     include_next_states=True,
                                     capacity=100),
                         target_sync_frequency=50)
        runner = Runner(agent=agent, environment=environment)

        def episode_finished(runner):
            if runner.episode % 100 == 0:
                print(sum(runner.episode_rewards[-100:]) / 100)
            return runner.episode < 100 \
                or not all(reward >= 1.0 for reward in runner.episode_rewards[-100:])

        # runner.run(episodes=1000, episode_finished=episode_finished)
        runner.run(episodes=10, episode_finished=episode_finished)  # Only 10 episodes for this test
        runner.close()

        ### Code block: next
        agent = DQNAgent(states=environment.states,
                         actions=environment.actions,
                         network=network_spec,
                         memory=dict(type='replay',
                                     include_next_states=True,
                                     capacity=100),
                         target_sync_frequency=50)

        # max_episodes = 1000
        max_episodes = 10  # Only 10 episodes for this test
        max_timesteps = 2000

        episode = 0
        episode_rewards = list()

        while True:
            state = environment.reset()
            agent.reset()

            timestep = 0
            episode_reward = 0
            while True:
                action = agent.act(states=state)
                state, terminal, reward = environment.execute(action=action)
                agent.observe(terminal=terminal, reward=reward)

                timestep += 1
                episode_reward += reward

                if terminal or timestep == max_timesteps:
                    break

            episode += 1
            episode_rewards.append(episode_reward)

            if all(reward >= 1.0 for reward in
                   episode_rewards[-100:]) or episode == max_episodes:
                break

        agent.close()
        environment.close()
class TensorforceTradingStrategy(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with Tensorforce."""
    def __init__(self,
                 environment: TradingEnvironment,
                 agent_spec: Dict = None,
                 network_spec: Dict = None,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A specification dictionary for the `Tensorforce` agent.
            network_spec: A specification dictionary for the `Tensorforce` agent's model network.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._environment = environment

        self._max_episode_timesteps = kwargs.get('max_episode_timesteps', None)

        if agent_spec and network_spec:
            self._agent_spec = agent_spec
            self._network_spec = network_spec

            self._agent = Agent.from_spec(spec=agent_spec,
                                          kwargs=dict(
                                              network=network_spec,
                                              states=environment.states,
                                              actions=environment.actions))

            self._runner = Runner(agent=self._agent, environment=environment)

    @property
    def agent(self) -> Agent:
        """A Tensorforce `Agent` instance that will learn the strategy."""
        return self._agent

    @property
    def max_episode_timesteps(self) -> int:
        """The maximum timesteps per episode."""
        return self._max_episode_timesteps

    @max_episode_timesteps.setter
    def max_episode_timesteps(self, max_episode_timesteps: int):
        self._max_episode_timesteps = max_episode_timesteps

    def restore_agent(self, path: str, model_path: str = None):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
                The `.json` file extension will be automatically appended if not provided.
            model_path (optional): The `str` path of the file or directory the agent checkpoint is stored in.
                If not provided, the `model_path` will default to `{path_without_dot_json}/agent`.
        """
        path_with_ext = path if path.endswith('.json') else f'{path}.json'

        with open(path_with_ext) as json_file:
            spec = json.load(json_file)

            self._agent_spec = spec['agent']
            self._network_spec = spec['network']

        self._agent = Agent.from_spec(spec=self._agent_spec,
                                      kwargs=dict(
                                          network=self._network_spec,
                                          states=self._environment.states,
                                          actions=self._environment.actions))

        path_without_ext = path_with_ext.replace('.json', '')
        model_path = model_path or f'{path_without_ext}/agent'

        self._agent.restore_model(file=model_path)

        self._runner = Runner(agent=self._agent, environment=self._environment)

    def save_agent(self,
                   path: str,
                   model_path: str = None,
                   append_timestep: bool = False):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
                The `.json` file extension will be automatically appended if not provided.
            model_path (optional): The `str` path of the directory to store the agent checkpoints in.
                If not provided, the `model_path` will default to `{path_without_dot_json}/agent`.
            append_timestep: Whether the timestep should be appended to filename to prevent overwriting previous models.
                Defaults to `False`.
        """
        path_with_ext = path if path.endswith('.json') else f'{path}.json'

        spec = {'agent': self._agent_spec, 'network': self._network_spec}

        with open(path_with_ext, 'w') as json_file:
            json.dump(spec, json_file)

        path_without_ext = path_with_ext.replace('.json', '')
        model_path = model_path or f'{path_without_ext}/agent'

        if not os.path.exists(model_path):
            os.makedirs(model_path)

        self._agent.save_model(directory=model_path, append_timestep=append_timestep)

    def _finished_episode_cb(self, runner: Runner) -> bool:
        n_episodes = runner.episode
        n_timesteps = runner.episode_timestep
        avg_reward = np.mean(runner.episode_rewards)

        print(f"Finished episode {n_episodes} after {n_timesteps} timesteps.")
        print(f"Average episode reward: {avg_reward})")

        return True

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(
        self,
        steps: int = None,
        episodes: int = None,
        should_train: bool = False,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        testing = not should_train

        self._runner.run(testing=testing,
                         num_timesteps=steps,
                         num_episodes=episodes,
                         max_episode_timesteps=self._max_episode_timesteps,
                         episode_finished=self._finished_episode_cb)

        self._runner.close()

        n_episodes = self._runner.episode
        n_timesteps = self._runner.timestep
        avg_reward = np.mean(self._runner.episode_rewards)

        print("Finished running strategy.")
        print(f"Total episodes: {n_episodes} ({n_timesteps} timesteps).")
        print(f"Average reward: {avg_reward}.")
Example #21
def main():
    parser = argparse.ArgumentParser(description='Tensorforce runner')
    parser.add_argument(
        'agent', help='Agent (configuration JSON file, name, or library module)'
    )
    parser.add_argument(
        'environment',
        help='Environment (name, configuration JSON file, or library module)'
    )
    # Agent arguments
    parser.add_argument(
        '-n', '--network', type=str, default=None,
        help='Network (configuration JSON file, name, or library module)'
    )
    # Environment arguments
    parser.add_argument(
        '-l', '--level', type=str, default=None,
        help='Level or game id, like `CartPole-v1`, if supported'
    )
    parser.add_argument(
        '--visualize', action='store_true',
        help='Visualize agent--environment interaction, if supported'
    )
    parser.add_argument(
        '-i', '--import-modules', type=str, default=None,
        help='Import comma-separated modules required for environment'
    )
    # Runner arguments
    parser.add_argument('-t', '--timesteps', type=int, default=None, help='Number of timesteps')
    parser.add_argument('-e', '--episodes', type=int, default=None, help='Number of episodes')
    parser.add_argument(
        '-m', '--max-episode-timesteps', type=int, default=None,
        help='Maximum number of timesteps per episode'
    )
    parser.add_argument(
        '--mean-horizon', type=int, default=10,
        help='Number of timesteps/episodes for mean reward computation'
    )
    parser.add_argument('-v', '--evaluation', action='store_true', help='Evaluation mode')
    parser.add_argument(
        '-s', '--save-best-agent', action='store_true', help='Save best-performing agent'
    )
    # Logging arguments
    parser.add_argument('-r', '--repeat', type=int, default=1, help='Number of repetitions')
    parser.add_argument(
        '-p', '--path', type=str, default=None,
        help='Logging path, directory plus filename without extension'
    )
    parser.add_argument('--seaborn', action='store_true', help='Use seaborn')
    args = parser.parse_args()

    if args.import_modules is not None:
        for module in args.import_modules.split(','):
            importlib.import_module(name=module)

    if args.path is None:
        callback = None

    else:
        assert os.path.splitext(args.path)[1] == ''
        assert args.episodes is not None and args.visualize is not None
        rewards = [list() for _ in range(args.episodes)]
        timesteps = [list() for _ in range(args.episodes)]
        seconds = [list() for _ in range(args.episodes)]
        agent_seconds = [list() for _ in range(args.episodes)]

        def callback(r):
            rewards[r.episode - 1].append(r.episode_reward)
            timesteps[r.episode - 1].append(r.episode_timestep)
            seconds[r.episode - 1].append(r.episode_second)
            agent_seconds[r.episode - 1].append(r.episode_agent_second)
            return True

    if args.visualize:
        if args.level is None:
            environment = Environment.create(environment=args.environment, visualize=True)
        else:
            environment = Environment.create(
                environment=args.environment, level=args.level, visualize=True
            )

    else:
        if args.level is None:
            environment = Environment.create(environment=args.environment)
        else:
            environment = Environment.create(environment=args.environment, level=args.level)

    for _ in range(args.repeat):
        agent_kwargs = dict()
        if args.network is not None:
            agent_kwargs['network'] = args.network
        if args.max_episode_timesteps is not None:
            assert environment.max_episode_timesteps() is None or \
                environment.max_episode_timesteps() == args.max_episode_timesteps
            agent_kwargs['max_episode_timesteps'] = args.max_episode_timesteps
        agent = Agent.create(agent=args.agent, environment=environment, **agent_kwargs)

        runner = Runner(agent=agent, environment=environment)
        runner.run(
            num_timesteps=args.timesteps, num_episodes=args.episodes,
            max_episode_timesteps=args.max_episode_timesteps, callback=callback,
            mean_horizon=args.mean_horizon, evaluation=args.evaluation
            # save_best_model=args.save_best_model
        )
        runner.close()

    if args.path is not None:
        if not os.path.isdir(os.path.split(args.path)[0]):
            os.makedirs(os.path.split(args.path)[0], exist_ok=True)

        with open(args.path + '.json', 'w') as filehandle:
            filehandle.write(
                json.dumps(dict(
                    rewards=rewards, timesteps=timesteps, seconds=seconds,
                    agent_seconds=agent_seconds
                ))
            )

        if args.seaborn:
            import seaborn as sns
            sns.set()

        xs = np.arange(len(rewards))
        min_rewards = np.amin(rewards, axis=1)
        max_rewards = np.amax(rewards, axis=1)
        median_rewards = np.median(rewards, axis=1)
        plt.plot(xs, median_rewards, color='green', linewidth=2.0)
        plt.fill_between(xs, min_rewards, max_rewards, color='green', alpha=0.4)
        plt.xlabel('episodes')
        plt.ylabel('reward')
        plt.savefig(fname=(args.path + '.png'))
class TensorforceTradingStrategy(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with Tensorforce."""
    def __init__(self,
                 environment: 'TradingEnvironment',
                 agent_spec: any,
                 save_best_agent: bool = False,
                 **kwargs):
        """
        Arguments:
            environment: A `TradingEnvironment` instance for the agent to trade within.
            agent_spec: A `Tensorforce` agent or agent specification.
            save_best_agent (optional): Whether the runner should automatically save the best-performing agent.
            kwargs (optional): Optional keyword arguments to adjust the strategy.
        """
        self._max_episode_timesteps = kwargs.get('max_episode_timesteps',
                                                 False)

        self._environment = Environment.create(
            environment='gym',
            level=environment,
            max_episode_timesteps=self._max_episode_timesteps)

        self._agent = Agent.create(agent=agent_spec,
                                   environment=self._environment)

        self._runner = Runner(agent=self._agent,
                              environment=self._environment,
                              save_best_agent=save_best_agent)

    @property
    def agent(self) -> Agent:
        """A Tensorforce `Agent` instance that will learn the strategy."""
        return self._agent

    @property
    def max_episode_timesteps(self) -> int:
        """The maximum timesteps per episode."""
        return self._max_episode_timesteps

    @max_episode_timesteps.setter
    def max_episode_timesteps(self, max_episode_timesteps: int):
        self._max_episode_timesteps = max_episode_timesteps

    def restore_agent(self, directory: str, filename: str = None):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            directory: The `str` path of the directory the agent checkpoint is stored in.
            filename (optional): The `str` path of the file the agent specification is stored in.
                The `.json` file extension will be automatically appended if not provided.
        """
        self._agent = Agent.load(directory, filename=filename)

        self._runner = Runner(agent=self._agent, environment=self._environment)

    def save_agent(self,
                   directory: str,
                   filename: str = None,
                   append_timestep: bool = False):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            directory: The `str` path of the directory the agent checkpoint is stored in.
            filename (optional): The `str` path of the file the agent specification is stored in.
                The `.json` file extension will be automatically appended if not provided.
            append_timestep: Whether the timestep should be appended to filename to prevent overwriting previous models.
                Defaults to `False`.
        """
        self._agent.save(directory=directory,
                         filename=filename,
                         append_timestep=append_timestep)

    def _finished_episode_cb(self, runner: Runner) -> bool:
        n_episodes = runner.episodes
        n_timesteps = runner.episode_timesteps
        avg_reward = np.mean(runner.episode_rewards)

        print("Finished episode {} after {} timesteps.".format(
            n_episodes, n_timesteps))
        print("Average episode reward: {})".format(avg_reward))

        return True

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def run(
        self,
        steps: int = None,
        episodes: int = None,
        evaluation: bool = False,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        self._runner.run(evaluation=evaluation,
                         num_timesteps=steps,
                         num_episodes=episodes,
                         callback=episode_callback)

        n_episodes = self._runner.episodes
        n_timesteps = self._runner.timesteps
        avg_reward = np.mean(self._runner.episode_rewards)

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(
            n_episodes, n_timesteps))
        print("Average reward: {}.".format(avg_reward))

        self._runner.close()

        return self._environment.environment._exchange._performance
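
# Hedged usage sketch (values are hypothetical): with this Tensorforce 2.x variant the agent
# is created via Agent.create and checkpoints round-trip through save_agent/restore_agent:
#   strategy = TensorforceTradingStrategy(environment=env,
#                                         agent_spec=dict(agent='ppo', batch_size=10),
#                                         max_episode_timesteps=2000)
#   strategy.run(episodes=10)
#   strategy.save_agent(directory='agents/ppo_strategy')
#   strategy.restore_agent(directory='agents/ppo_strategy')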
Example #23
def main(max_timesteps):
    max_episodes = None
    #max_timesteps = 86400000000*days

    env = real_adapter(pong)

    network_spec = [
        #dict(type='flatten'),
        dict(type='dense', size=11, activation='tanh'),
        #dict(type='dense', size=20, activation='tanh'),
        #dict(type='dense', size=32, activation='tanh'),
    ]

    exploration = dict(type='epsilon_decay', timesteps=max_timesteps)

    summarizer = dict(
        directory="./models/"+str(datetime.now()).replace(' ', ''),
        steps=10000,
        seconds=None,
        labels=[
            #'rewards',
            #'actions',
            'inputs',
            'gradients',
            'configuration',
        ],
        meta_dict=dict(
            description='July 2: Trying 11 node hidden layer.',
            layers=str(network_spec),
            timesteps=max_timesteps,
            exploration=exploration,
        ),
    )

    agent = NAFAgent(
        states=env.states,
        actions=env.actions,
        network=network_spec,
        #actions_exploration=exploration,
        #summarizer=summarizer,
        #batch_size=64
    )

    runner = Runner(agent, env)

    report_episodes = 1

    #global prev
    global prev
    prev = 0

    def episode_finished(r):
        global prev
        if r.episode % report_episodes == 0:
            #print("Finished episode {ep} after {ts} timesteps".format(ep=r.episode, ts=r.timestep-prev))
            #print("Episode reward: {}".format(r.episode_rewards[-1]))
            print(r.episode_rewards[-1])
        prev = r.timestep
        #print("Average of last 100 rewards: {}".format(sum(r.episode_rewards[-100:]) / 100))
        return True

    print("Starting {agent} for Environment '{env}'".format(agent=agent, env=env))

    runner.run(num_episodes=max_episodes, num_timesteps=max_timesteps, max_episode_timesteps=None, episode_finished=episode_finished)

    agent.save_model(directory='./results/NAF/'+str(datetime.now()).replace(' ', '')+'/model')

    runner.close()

    print("Learning finished. Total episodes: {ep}".format(ep=runner.episode))
Example #24
class UnittestBase(object):
    """
    Unit-test base class.
    """

    # Unittest
    num_updates = None
    num_episodes = None
    num_timesteps = None

    # Environment
    timestep_range = (1, 5)
    states = dict(bool_state=dict(type='bool', shape=(1, )),
                  int_state=dict(type='int', shape=(2, ), num_values=4),
                  float_state=dict(type='float', shape=(1, 1, 2)),
                  bounded_state=dict(type='float',
                                     shape=(),
                                     min_value=-0.5,
                                     max_value=0.5))
    actions = dict(bool_action=dict(type='bool', shape=(1, )),
                   int_action=dict(type='int', shape=(2, ), num_values=4),
                   float_action=dict(type='float', shape=(1, 1)),
                   bounded_action=dict(type='float',
                                       shape=(2, ),
                                       min_value=-0.5,
                                       max_value=0.5))

    # Exclude action types
    exclude_bool_action = False
    exclude_int_action = False
    exclude_float_action = False
    exclude_bounded_action = False

    # Agent
    agent = dict(update=4,
                 network=dict(type='auto', size=8, internal_rnn=2),
                 objective='policy_gradient',
                 reward_estimation=dict(horizon=2))

    # Tensorforce config
    require_observe = False
    require_all = False

    def start_tests(self, name=None):
        """
        Start unit-test method.
        """
        if name is None:
            sys.stdout.write('\n{} {}: '.format(
                datetime.now().strftime('%H:%M:%S'),
                self.__class__.__name__[4:]))
        else:
            sys.stdout.write('\n{} {} ({}): '.format(
                datetime.now().strftime('%H:%M:%S'),
                self.__class__.__name__[4:], name))
        sys.stdout.flush()

    def finished_test(self, assertion=None):
        """
        Finished unit-test.
        """
        if assertion is None:
            assertion = True
        else:
            self.assertTrue(expr=assertion)
        if assertion:
            sys.stdout.write('.')
            sys.stdout.flush()

    def prepare(self,
                environment=None,
                timestep_range=None,
                states=None,
                actions=None,
                exclude_bool_action=False,
                exclude_int_action=False,
                exclude_float_action=False,
                exclude_bounded_action=False,
                require_observe=False,
                require_all=False,
                **agent):
        """
        Generic unit-test preparation.
        """
        Layer.layers = None

        if environment is None:
            if states is None:
                states = deepcopy(self.__class__.states)

            if actions is None:
                actions = deepcopy(self.__class__.actions)
                if exclude_bool_action or self.__class__.exclude_bool_action:
                    actions.pop('bool_action')
                if exclude_int_action or self.__class__.exclude_int_action:
                    actions.pop('int_action')
                if exclude_float_action or self.__class__.exclude_float_action:
                    actions.pop('float_action')
                if exclude_bounded_action or self.__class__.exclude_bounded_action:
                    actions.pop('bounded_action')

            if timestep_range is None:
                timestep_range = self.__class__.timestep_range

            environment = UnittestEnvironment(
                states=states,
                actions=actions,
                timestep_range=timestep_range,
            )

        elif timestep_range is not None:
            raise TensorforceError.unexpected()

        environment = Environment.create(environment=environment)

        for key, value in self.__class__.agent.items():
            if key not in agent:
                agent[key] = value

        if self.__class__.require_all or require_all:
            config = None
        elif self.__class__.require_observe or require_observe:
            config = dict(api_functions=['reset', 'act', 'observe'])
        else:
            config = dict(api_functions=['reset', 'act'])

        agent = Agent.create(agent=agent,
                             environment=environment,
                             config=config)

        return agent, environment

    def unittest(self,
                 num_updates=None,
                 num_episodes=None,
                 num_timesteps=None,
                 environment=None,
                 timestep_range=None,
                 states=None,
                 actions=None,
                 exclude_bool_action=False,
                 exclude_int_action=False,
                 exclude_float_action=False,
                 exclude_bounded_action=False,
                 require_observe=False,
                 require_all=False,
                 **agent):
        """
        Generic unit-test.
        """
        agent, environment = self.prepare(
            environment=environment,
            timestep_range=timestep_range,
            states=states,
            actions=actions,
            exclude_bool_action=exclude_bool_action,
            exclude_int_action=exclude_int_action,
            exclude_float_action=exclude_float_action,
            exclude_bounded_action=exclude_bounded_action,
            require_observe=require_observe,
            require_all=require_all,
            **agent)

        self.runner = Runner(agent=agent, environment=environment)

        assert (num_updates is not None) + (num_episodes is not None) + \
            (num_timesteps is not None) <= 1
        if num_updates is None and num_episodes is None and num_timesteps is None:
            num_updates = self.__class__.num_updates
            num_episodes = self.__class__.num_episodes
            num_timesteps = self.__class__.num_timesteps
        if num_updates is None and num_episodes is None and num_timesteps is None:
            num_updates = 2
        assert (num_updates is not None) + (num_episodes is not None) + \
            (num_timesteps is not None) == 1

        evaluation = not any([
            require_all, require_observe, self.__class__.require_all,
            self.__class__.require_observe
        ])
        self.runner.run(num_episodes=num_episodes,
                        num_timesteps=num_timesteps,
                        num_updates=num_updates,
                        max_episode_timesteps=agent.max_episode_timesteps,
                        use_tqdm=False,
                        evaluation=evaluation)
        self.runner.close()

        self.finished_test()
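
# Hedged sketch (assumed usage, not from the source): a concrete test case would mix this
# base class into unittest.TestCase and call self.unittest() with any agent overrides, e.g.
#   class TestExample(UnittestBase, unittest.TestCase):
#       def test_example(self):
#           self.start_tests(name='example')
#           self.unittest(num_updates=2)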
Example #25
def main(args):
    version = 'v1'
    episodes = args.episodes
    visualize = args.visualize

    config = ffa_v0_fast_env()
    env = Pomme(**config["env_kwargs"])
    env.seed(0)

    agent = PPOAgent(
        states=dict(type='float', shape=(11, 11, 12)),
        actions=dict(type='int', num_actions=env.action_space.n),
        network=[
            # (9, 9, 12)
            dict(type='conv2d', size=12, window=3, stride=1),
            # (7, 7, 8)
            dict(type='conv2d', size=8, window=3, stride=1),
            # (5, 5, 4)
            dict(type='conv2d', size=4, window=3, stride=1),
            # (100)
            dict(type='flatten'),
            dict(type='dense', size=64, activation='relu'),
            dict(type='dense', size=16, activation='relu'),
        ],
        batching_capacity=1000,
        step_optimizer=dict(type='adam', learning_rate=1e-4))

    if os.path.exists(os.path.join('models', version, 'checkpoint')):
        agent.restore_model(directory=os.path.join('models', version))

    agents = []
    for agent_id in range(3):
        # agents.append(RandomAgent(config["agent"](agent_id, config["game_type"])))
        # agents.append(StoppingAgent(config["agent"](agent_id, config["game_type"])))
        agents.append(
            SimpleAgent(config["agent"](agent_id, config["game_type"])))

    agent_id += 1  # the training agent takes the next id after the three SimpleAgents
    agents.append(
        TensorforceAgent(config["agent"](agent_id, config["game_type"])))
    env.set_agents(agents)
    env.set_training_agent(agents[-1].agent_id)
    env.set_init_game_state(None)

    wrapped_env = WrappedEnv(env, agent, visualize)
    runner = Runner(agent=agent, environment=wrapped_env)

    try:
        runner.run(episodes=episodes, max_episode_timesteps=100)
    finally:
        agent.save_model(directory=os.path.join('models', version, 'agent'))

    win_count = len(
        list(filter(lambda reward: reward == 1, runner.episode_rewards)))
    print('Stats: ')
    print(f'  runner.episode_rewards = {runner.episode_rewards}')
    print(f'  win count = {win_count}')

    runner.close()
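
# `args` is expected to come from an argparse parser defined alongside this
# script; a hypothetical invocation, assuming --episodes and --visualize flags:
#
#     python train_pommerman_ppo.py --episodes 1000 --visualize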
Example #26
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('gym_id', help="ID of the Gym environment")
    parser.add_argument('-a', '--agent', help="Agent configuration file")
    parser.add_argument('-n',
                        '--network',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=None,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=None,
                        help="Number of timesteps")
    parser.add_argument('-m',
                        '--max-episode-timesteps',
                        type=int,
                        default=None,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-d',
                        '--deterministic',
                        action='store_true',
                        default=False,
                        help="Choose actions deterministically")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('--monitor', help="Save results to this directory")
    parser.add_argument('--monitor-safe',
                        action='store_true',
                        default=False,
                        help="Do not overwrite previous results")
    parser.add_argument('--monitor-video',
                        type=int,
                        default=0,
                        help="Save video every x steps (0 = disabled)")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)

    logger = logging.getLogger(__file__)
    logger.setLevel(logging.INFO)

    environment = OpenAIGym(gym_id=args.gym_id,
                            monitor=args.monitor,
                            monitor_safe=args.monitor_safe,
                            monitor_video=args.monitor_video)

    if args.agent is not None:
        with open(args.agent, 'r') as fp:
            agent = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network is not None:
        with open(args.network, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent,
                            kwargs=dict(states=environment.states,
                                        actions=environment.actions,
                                        network=network))
    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    if args.debug:  # TODO: Timestep-based reporting
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}"
                .format(r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) /
                min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) /
                min(100, len(r.episode_rewards))))
        return True

    runner.run(timesteps=args.timesteps,
               episodes=args.episodes,
               max_episode_timesteps=args.max_episode_timesteps,
               deterministic=args.deterministic,
               episode_finished=episode_finished)
    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))
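
# The --agent flag expects a JSON agent specification; an illustrative sketch,
# mirroring the PPO parameters used elsewhere in these examples (keys and
# values are examples only, not an authoritative schema):
#
#     {
#         "type": "ppo_agent",
#         "batching_capacity": 1000,
#         "step_optimizer": {"type": "adam", "learning_rate": 1e-4}
#     }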
Example #27
def main():
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game", default="pommerman", help="Game to choose.")
    parser.add_argument("--config",
                        default="PommeFFA-v0",
                        help="Configuration to execute. See env_ids in "
                        "configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                        "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker "
                        "locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars "
                        "to pass to Docker. This is only for the Docker Agent."
                        " An example is '0:foo=bar:baz=lar,3:foo=lam', which "
                        "would send two arguments to Docker Agent 0 and one to"
                        " Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir",
                        default=None,
                        help="Directory to record the PNGs of the game. "
                        "Doesn't record if None.")
    parser.add_argument("--record_json_dir",
                        default=None,
                        help="Directory to record the JSON representations of "
                        "the game. Doesn't record if None.")
    parser.add_argument("--render",
                        default=True,
                        help="Whether to render or not. Defaults to True.")
    parser.add_argument("--game_state_file",
                        default=None,
                        help="File from which to load game state. Defaults to "
                        "None.")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    #       this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id + 1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        if type(agent) == TensorForceAgent:
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=10, max_episode_timesteps=2000)
    print("Stats: ", runner.episode_rewards, runner.episode_timesteps,
          runner.episode_times)

    try:
        runner.close()
    except AttributeError:
        pass
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('--mode', help="ID of the game mode")
    parser.add_argument('--hide',
                        dest='hide',
                        action='store_const',
                        const=True,
                        default=False,
                        help="Hide output window")
    parser.add_argument('-a',
                        '--agent-config',
                        help="Agent configuration file")
    parser.add_argument('-n',
                        '--network-spec',
                        default=None,
                        help="Network specification file")
    parser.add_argument('-e',
                        '--episodes',
                        type=int,
                        default=50000,
                        help="Number of episodes")
    parser.add_argument('-t',
                        '--max-timesteps',
                        type=int,
                        default=2000,
                        help="Maximum number of timesteps per episode")
    parser.add_argument('-s', '--save', help="Save agent to this dir")
    parser.add_argument('-se',
                        '--save-episodes',
                        type=int,
                        default=100,
                        help="Save agent every x episodes")
    parser.add_argument('-l', '--load', help="Load agent from this dir")
    parser.add_argument('-D',
                        '--debug',
                        action='store_true',
                        default=False,
                        help="Show debug outputs")

    args = parser.parse_args()

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)  # TODO: make the log level configurable

    environment = MazeExplorer(mode_id=args.mode, visible=not args.hide)

    if args.agent_config is not None:
        with open(args.agent_config, 'r') as fp:
            agent_config = json.load(fp=fp)
    else:
        raise TensorForceError("No agent configuration provided.")

    if args.network_spec is not None:
        with open(args.network_spec, 'r') as fp:
            network = json.load(fp=fp)
    else:
        network = None
        logger.info("No network configuration provided.")

    agent = Agent.from_spec(spec=agent_config,
                            kwargs=dict(states=environment.states,
                                        actions=environment.actions,
                                        network=network))

    if args.load:
        load_dir = os.path.dirname(args.load)
        if not os.path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(
                    load_dir))
        agent.restore_model(args.load)

    if args.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent_config)

    if args.save:
        save_dir = os.path.dirname(args.save)
        if not os.path.isdir(save_dir):
            try:
                os.mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {} ()".format(save_dir))

    runner = Runner(agent=agent, environment=environment, repeat_actions=1)

    report_episodes = args.episodes // 1000
    if args.debug:
        report_episodes = 1

    def episode_finished(r):
        if r.episode % report_episodes == 0:
            sps = r.timestep / (time.time() - r.start_time)
            logger.info(
                "Finished episode {ep} after {ts} timesteps. Steps Per Second {sps}"
                .format(ep=r.episode, ts=r.timestep, sps=sps))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {}".format(
                sum(r.episode_rewards[-500:]) / 500))
            logger.info("Average of last 100 rewards: {}".format(
                sum(r.episode_rewards[-100:]) / 100))
        return True

    logger.info("Starting {agent} for Environment '{env}'".format(
        agent=agent, env=environment))
    runner.run(args.episodes,
               args.max_timesteps,
               episode_finished=episode_finished)
    runner.close()
    logger.info(
        "Learning finished. Total episodes: {ep}".format(ep=runner.episode))

    environment.close()
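
# The --network-spec file is a JSON list of layer dictionaries, mirroring the
# inline network specs used elsewhere in these examples; an illustrative sketch:
#
#     [
#         {"type": "flatten"},
#         {"type": "dense", "size": 32},
#         {"type": "dense", "size": 32}
#     ]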
def main(argv):
    logging_basicConfig(level=INFO)
    logger = getLogger(__file__)
    logger.setLevel(INFO)
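
    # FLAGS (monitor, load, save, timesteps, etc.) is assumed to be defined
    # elsewhere in this script, e.g. via absl.flags or tf.app.flags.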

    environment = OpenAIGym(
        gym_id='MoveToBeacon-bbueno5000-v0',
        monitor=FLAGS.monitor,
        monitor_safe=FLAGS.monitor_safe,
        monitor_video=FLAGS.monitor_video,
        visualize=FLAGS.visualize)

    # if FLAGS.agent_config is not None:
    #     with open(FLAGS.agent_config, 'r') as fp:
    #         agent_config = json.load(fp=fp)
    # else:
    #     raise TensorForceError(
    #         "No agent configuration provided.")

    # if FLAGS.network is not None:
    #     with open(FLAGS.network, 'r') as fp:
    #         network = json.load(fp=fp)
    # else:
    #     network = None
    #     logger.info(
    #         "No network configuration provided.")

    network_spec = [
        dict(type='flatten'),
        dict(type='dense', size=32),
        dict(type='dense', size=32)
        ]

    agent = PPOAgent(
        states=environment.states,
        actions=environment.actions,
        network=network_spec
        )

    if FLAGS.load:
        load_dir = path.dirname(FLAGS.load)
        if not path.isdir(load_dir):
            raise OSError(
                "Could not load agent from {}: No such directory.".format(load_dir))
        agent.restore_model(FLAGS.load)

    if FLAGS.save:
        save_dir = path.dirname(FLAGS.save)
        if not path.isdir(save_dir):
            try:
                mkdir(save_dir, 0o755)
            except OSError:
                raise OSError(
                    "Cannot save agent to dir {} ()".format(save_dir))

    if FLAGS.debug:
        logger.info("-" * 16)
        logger.info("Configuration:")
        logger.info(agent)

    runner = Runner(
        agent=agent,
        environment=environment,
        repeat_actions=1)

    if FLAGS.debug:
        report_episodes = 1
    else:
        report_episodes = 100

    logger.info(
        "Starting {agent} for Environment {env}".format(
            agent=agent, env=environment))

    def episode_finished(r, id_):
        if r.episode % report_episodes == 0:
            steps_per_second = r.timestep / (time() - r.start_time)
            logger.info("Finished episode {:d} after {:d} timesteps. Steps Per Second {:0.2f}".format(
                r.agent.episode, r.episode_timestep, steps_per_second))
            logger.info("Episode reward: {}".format(r.episode_rewards[-1]))
            logger.info("Average of last 500 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-500:]) / min(500, len(r.episode_rewards))))
            logger.info("Average of last 100 rewards: {:0.2f}".format(
                sum(r.episode_rewards[-100:]) / min(100, len(r.episode_rewards))))
        if FLAGS.save and FLAGS.save_episodes is not None and not r.episode % FLAGS.save_episodes:
            logger.info("Saving agent to {}".format(FLAGS.save))
            r.agent.save_model(FLAGS.save)
        return True

    runner.run(
        num_timesteps=FLAGS.timesteps,
        num_episodes=FLAGS.num_episodes,
        max_episode_timesteps=FLAGS.max_episode_timesteps,
        deterministic=FLAGS.deterministic,
        episode_finished=episode_finished,
        testing=FLAGS.test,
        sleep=FLAGS.sleep)

    runner.close()

    logger.info("Learning completed.")
    logger.info("Total episodes: {ep}".format(ep=runner.agent.episode))
def main():
    '''CLI interface to bootstrap training'''
    parser = argparse.ArgumentParser(description="Playground Flags.")
    parser.add_argument("--game", default="pommerman", help="Game to choose.")
    parser.add_argument("--config",
                        default="PommeFFACompetition-v0",
                        help="Configuration to execute. See env_ids in "
                        "configs.py for options.")
    parser.add_argument("--agents",
                        default="tensorforce::ppo,test::agents.SimpleAgent,"
                        "test::agents.SimpleAgent,test::agents.SimpleAgent",
                        help="Comma delineated list of agent types and docker "
                        "locations to run the agents.")
    parser.add_argument("--agent_env_vars",
                        help="Comma delineated list of agent environment vars "
                        "to pass to Docker. This is only for the Docker Agent."
                        " An example is '0:foo=bar:baz=lar,3:foo=lam', which "
                        "would send two arguments to Docker Agent 0 and one to"
                        " Docker Agent 3.",
                        default="")
    parser.add_argument("--record_pngs_dir",
                        default=None,
                        help="Directory to record the PNGs of the game. "
                        "Doesn't record if None.")
    parser.add_argument("--record_json_dir",
                        default=None,
                        help="Directory to record the JSON representations of "
                        "the game. Doesn't record if None.")
    parser.add_argument("--render",
                        default=False,
                        action='store_true',
                        help="Whether to render or not. Defaults to False.")
    parser.add_argument("--game_state_file",
                        default=None,
                        help="File from which to load game state. Defaults to "
                        "None.")
    parser.add_argument("--checkpoint",
                        default="models/ppo",
                        help="Directory where checkpoint file stored to.")
    parser.add_argument("--num_of_episodes",
                        default="10",
                        help="Number of episodes")
    parser.add_argument("--max_timesteps",
                        default="2000",
                        help="Number of steps")
    args = parser.parse_args()

    config = args.config
    record_pngs_dir = args.record_pngs_dir
    record_json_dir = args.record_json_dir
    agent_env_vars = args.agent_env_vars
    game_state_file = args.game_state_file
    checkpoint = args.checkpoint
    num_of_episodes = int(args.num_of_episodes)
    max_timesteps = int(args.max_timesteps)

    # TODO: After https://github.com/MultiAgentLearning/playground/pull/40
    #       this is still missing the docker_env_dict parsing for the agents.
    agents = [
        helpers.make_agent_from_string(agent_string, agent_id + 1000)
        for agent_id, agent_string in enumerate(args.agents.split(","))
    ]

    env = make(config, agents, game_state_file)
    training_agent = None

    for agent in agents:
        if type(agent) == TensorForceAgent:
            training_agent = agent
            env.set_training_agent(agent.agent_id)
            break

    if args.record_pngs_dir:
        assert not os.path.isdir(args.record_pngs_dir)
        os.makedirs(args.record_pngs_dir)
    if args.record_json_dir:
        assert not os.path.isdir(args.record_json_dir)
        os.makedirs(args.record_json_dir)

    # Create a Proximal Policy Optimization agent
    agent = training_agent.initialize(env)

    atexit.register(functools.partial(clean_up_agents, agents))
    wrapped_env = WrappedEnv(env, visualize=args.render)
    runner = Runner(agent=agent, environment=wrapped_env)
    runner.run(episodes=num_of_episodes, max_episode_timesteps=max_timesteps)
    print("Stats: ", runner.episode_rewards[-30:], runner.episode_timesteps,
          runner.episode_times)

    agent.save_model(checkpoint)

    rewards = runner.episode_rewards
    win = rewards.count(1)
    lose = rewards.count(-1)
    draw = rewards.count(0)
    total = win + lose + draw
    ratio = round((win / total) * 100.0, 2)
    print("Results ({}%) = Win({}), Lose({}), Draw({})".format(
        ratio, win, lose, draw))
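    # For example, 6 wins, 3 losses and 1 draw over 10 episodes gives a win
    # ratio of 60.0%.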
    try:
        runner.close()
    except AttributeError:
        pass
def main():
    # SET BASIC PARAMETERS
    start_time = time.time()
    random_seed = 21
    agent_save_period = 500
    visualize_period = 1
    run_number = 965

    load_agent = False
    agent_filename = '371-P33-27-PPO-2000'
    to_visualize = False

    # Set logging level
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # if args.import_modules is not None:
    #    for module in args.import_modules.split(','):
    #        importlib.import_module(name=module)

    environment = Environment.create(environment='gym',
                                     level='EnvTestContinuousR-v2',
                                     visualize=to_visualize)

    # Set random seed for environment
    environment.environment.env.seed(random_seed)
    environment.environment.env.set_reward(3)
    environment.environment.env.set_random(3)
    environment.environment.env.set_reward_scale(6)

    # Initialize Agent-Network-Model objects

    with open(
            'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\examples\\configs\\ppo-new3.json',
            'r') as fp:
        agentSpec = json.load(fp=fp)

    with open(
            'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\examples\\configs\\mlp2_network-new.json',
            'r') as fp:
        network = json.load(fp=fp)

    # agentSpec['update_mode'].update(batch_size=24)
    # agentSpec['update_mode'].update(frequency=24)
    #agentSpec['baseline']['sizes'] = [512,512]
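    # Hyperparameter overrides applied in place to the JSON spec loaded above,
    # before the agent is created from it below.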
    agentSpec['optimization_steps'] = 9
    agentSpec['network']['layers'][0]['size'] = 128
    agentSpec['network']['layers'][1]['size'] = 129
    agentSpec['critic_network']['layers'][0]['size'] = 126
    agentSpec['critic_network']['layers'][1]['size'] = 127
    agentSpec['batch_size'] = 13
    agentSpec['subsampling_fraction'] = 0.8
    agentSpec['critic_optimizer']['num_steps'] = 11
    agentSpec['likelihood_ratio_clipping'] = 0.2

    # network[0].update(size=512)
    # network[1].update(size=512)
    # agentSpec['network']['layers'] = network
    # agentSpec['critic_network']['layers'] = network
    agent = Agent.create(
        max_episode_timesteps=3000,
        agent=agentSpec,
        environment=environment,
        seed=random_seed
        # kwargs=dict(
        #     states=environment.states,
        #     actions=environment.actions,
        #     network=network,
        #     #random_seed=random_seed
    )

    agent.initialize()
    # print("Agent memory ", agent.memory['capacity'])
    # print("Agent baseline steps", agent.baseline_optimizer['num_steps'])
    # print("Agent optimizer steps", agent.optimizer['num_steps'])

    if load_agent:
        agent.restore_model(
            directory=
            'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave',
            file=agent_filename)

    runner = Runner(agent=agent, environment=environment)

    # logger.info("Starting {agent} for Environment '{env}'".format(agent=agent, env=environment))

    # Naming variables
    nNum = str(run_number).zfill(3)
    task = environment.environment.env.task
    if task == 'LIFT':
        nTask = 'L'
    else:
        nTask = 'P'
    nReward = environment.environment.env.reward_level
    nRandom = environment.environment.env.rand_level
    nSeed = str(random_seed).zfill(2)
    nAlg = 'PPO'

    nName = ("{}-{}{}{}-{}-{}".format(nNum, nTask, nReward, nRandom, nSeed,
                                      nAlg))
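    # e.g. something like "965-P33-21-PPO", matching the naming pattern of the
    # agent_filename ('371-P33-27-PPO-2000') defined above.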

    def episode_finished(r, id_=None):

        # if r.episode == 1:
        # r.agent.restore_model('C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave')

        save_period = 5
        if r.episodes % visualize_period == 0:
            if to_visualize:
                environment.visualize = True  # Set to true to visualize
        else:
            environment.visualize = False

        if r.episodes % save_period == 0:
            with open(
                    'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\{}.csv'
                    .format(nName), 'a+') as csv:
                for reward in r.episode_rewards[-save_period:]:
                    csv.write("{:2.2f}\n".format(reward))
                # print("\nSaving, yo!")

        if r.episodes == 1 or (r.episodes % agent_save_period == 0):
            logger.info("\nSaving agent to {} at episode {}".format(
                'C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave\\{}'
                .format(nName), r.episodes))
            # r.agent.save(
            #     directory='C:\\Users\\genia\\Source\\Repos\\Box2dEnv\\Box2dEnv\\saves\\modelSave\\{}{}'.format(nName, r.episodes),
            #     append_timestep=False)

        return True

    def episode_finish(r, id_=None):
        print(r)

    runner.run(
        num_episodes=2000,
        num_timesteps=10000000,
        max_episode_timesteps=500,
        num_repeat_actions=1,
        # Callback
        callback=episode_finished,
        callback_episode_frequency=1,
        callback_timestep_frequency=None,
        # Tqdm
        use_tqdm=True,
        mean_horizon=100,
        # Evaluation
        evaluation=False,
        evaluation_callback=None,
        evaluation_frequency=None,
        max_evaluation_timesteps=None,
        num_evaluation_iterations=0)

    runner.close()

    logger.info("Learning finished. Total episodes: {ep}".format(
        ep=runner.agent.episode))