Example no. 1
    def test(self, model_epoch: int = 0, should_render: bool = True):
        train_provider, test_provider = self.data_provider.split_data_train_test(self.train_split_percentage)

        del train_provider

        test_env = SubprocVecEnv([make_env(test_provider, i) for i in range(self.n_envs)])

        model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=test_env)

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        state = None
        obs, done, rewards = test_env.reset(), [False], []
        while not all(done):
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = test_env.step(action)

            rewards.append(reward)

            if should_render and self.n_envs == 1:
                test_env.render(mode='human')

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${np.sum(rewards):.2f}')
Example no. 2
def main():
    #env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example no. 3
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        raise FileNotFoundError(f'log_dir does not exist: {log_dir}')

    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        raise ValueError(f'Unknown model_name: {model_name}')
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    # print(env.observation_space)
    # cv2.imshow('test', RGB2BGR(obs[0]))
    # cv2.waitKey(0)
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        # print('attention', np.array(attention).shape)
        for i, attention in enumerate(attentions):
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, [10] * attention.shape[0], axis=0)
            attention = np.repeat(attention, [10] * attention.shape[1], axis=1)
            attention = attention * 255
            attentions_img.append(attention)
            # print(np.sum(attention))
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        # break
        env.render()
    return model
Example no. 4
def test():
    # Parallel environments
    n_cpu = 4
    env = SubprocVecEnv([lambda: RSEnv() for i in range(n_cpu)])

    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=600000, log_interval=10)

    model.save("sba2c")

    env = TestRSEnv()
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
    env.close()
Example no. 5
def main():
    agent_data = pd.read_csv('../output_EURUSD_M1_/agentData.csv')
    agent_data = agent_data.drop(agent_data.columns[0], axis=1)
    agent_data = agent_data.astype('float32')

    env = SubprocVecEnv([lambda: ForexTradingEnv(agent_data)] * 10, )
    #env = DummyVecEnv([lambda: ForexTradingEnv(agent_data)], )

    #    model = DQN(CustomDQNPolicy, env, gamma=0.95, verbose=1, tensorboard_log = "./tensorboard", entcoeff=0.005, adam_epsilon = 1e-6)

    import tensorflow as tf
    from TenorboardCallbacks import TensorboardCallback
    checkpoint_callback = CheckpointCallback(save_freq=1000000,
                                             save_path='./models/',
                                             name_prefix='ppo2')

    for curr in [1]:
        model = PPO2(PPO2Policy_Basic,
                     env,
                     verbose=1,
                     tensorboard_log="./tensorboard",
                     vf_coef=1e-7,
                     ent_coef=1e-4,
                     n_steps=512,
                     gamma=0.99)
        #model = PPO2.load("5_days_model/ppo2_999000000_steps.zip", policy=PPO2Policy_Basic, env = env,verbose=1, tensorboard_log = "./tensorboard")

        model.learn(total_timesteps=10000000000,
                    log_interval=10000000,
                    callback=CallbackList(
                        [TensorboardCallback(env), checkpoint_callback]))
        model.save(model_fileName)

    obs = env.reset()
    for i in range(2000000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if i % 1 == 0:
            env.render()
        if any(done):
            break
def run_baseline_ppo2(env_name, n_cpu=4, train=True):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import PPO2

    if train:
        # multiprocess environment
        env = SubprocVecEnv([lambda: gym.make(env_name) for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=100000)
        model.save("checkpoints/ppo2_" + env_name)
    else:
        from stable_baselines.common.vec_env import DummyVecEnv
        env = DummyVecEnv([lambda: gym.make(env_name)])
        model = PPO2.load("checkpoints/ppo2_" + env_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)
class BitmexTradingStrategySBL(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with stable-baselines.

    Arguments:
        environments: An instance of a trading environments for the agent to trade within.
        model: The RL model to create the agent with.
            Defaults to DQN.
        policy: The RL policy to train the agent's model with.
            Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """
    def __init__(self,
                 environment: BitmexEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 policy_kwargs: any = {},
                 n_env: int = 1,
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs
        self._policy_kwargs = policy_kwargs
        self._n_env = n_env

        self.environment = environment
        self._agent = self._model(policy,
                                  self._environment,
                                  **self._model_kwargs,
                                  policy_kwargs=self._policy_kwargs)

    @property
    def environment(self) -> 'BitmexEnvironment':
        """A `BitmexEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'BitmexEnvironment'):
        envs = [lambda: environment for _ in range(self._n_env)]

        if self._n_env == 1:
            self._environment = DummyVecEnv(envs)
        else:
            self._environment = SubprocVecEnv(envs)

    def restore_agent(self, path: str, custom_objects: any = {}):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._custom_objects = custom_objects
        self._agent = self._model.load(path,
                                       env=self._environment,
                                       custom_objects=self._custom_objects,
                                       **self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def _train_callback(self, _locals, _globals):
        # performance = self._environment.performance
        #
        # if self._episode_callback and self._environment.done():
        #     self._episode_callback(performance)

        return True

    def train(
        self,
        steps: int = None,
        episodes: int = None,
        render_mode: str = None,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        if steps is None:
            raise ValueError(
                'You must set the number of `steps` to train the strategy.')

        self._agent.learn(steps, callback=self._train_callback)

        return True

    def test(
        self,
        steps: int = None,
        episodes=None,
        render_mode: str = None,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to test the strategy.'
            )

        steps_completed, episodes_completed, average_reward = 0, 0, 0
        obs, state, dones = self._environment.reset(), None, [False]
        performance = {}

        while (steps is not None and
               (steps == 0 or steps_completed < steps)) or (
                   episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            # actions, state = self._agent.predict(obs)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            # incremental running mean of the first environment's reward
            average_reward += (rewards[0] - average_reward) / steps_completed

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(
                exchange_performance) > 0 else performance
            if render_mode is not None:
                self._environment.render(mode=render_mode)

            if dones[0]:
                if episode_callback is not None and not episode_callback(
                        performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(
            episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
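
A minimal usage sketch for the strategy class above, assuming `BitmexEnvironment` and `BitmexTradingStrategySBL` are importable from the surrounding project; the import of PPO2 is standard stable-baselines, but the `BitmexEnvironment()` constructor call and the file paths used here are hypothetical, not part of the original source.

from stable_baselines import PPO2

# Hypothetical wiring: construct an environment and wrap it in the strategy.
environment = BitmexEnvironment()              # assumed constructor; real arguments unknown
strategy = BitmexTradingStrategySBL(environment,
                                    model=PPO2,
                                    policy='MlpPolicy',
                                    model_kwargs={'verbose': 1},
                                    n_env=1)    # n_env=1 wraps the env in a DummyVecEnv

strategy.train(steps=10000)                    # learn for 10k timesteps
strategy.save_agent('agents/bitmex_ppo2')      # serialize the trained agent

# Later: restore the agent and evaluate it for 1000 steps, rendering each step.
strategy.restore_agent('agents/bitmex_ppo2')
performance = strategy.test(steps=1000, render_mode='human')
print(performance)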
policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: BaseEnv() for i in range(4)])
    env = VecFrameStack(env, 3)

    model = PPO2(get_policy(policy),
                 env,
                 verbose=0,
                 nminibatches=1,
                 tensorboard_log=tensorboard_folder)
    model.learn(total_timesteps=100000000, tb_log_name='PPO2' + model_tag)

    model.save(model_folder + "PPO2" + model_tag)
    del model
    model = PPO2.load(model_folder + "PPO2" + model_tag)

    done = [False]
    states = None
    obs = env.reset()

    # `done` is an array with one flag per env, so reduce it with any()
    while not any(done):
        action, states = model.predict(obs, states)
        obs, _, done, info = env.step(action)
        env.render()
Example no. 9
def make_env(env_id, rank, seed=0):
    """Utility function for a multiprocessed env: returns a thunk that creates and seeds the env."""
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    # Stable Baselines provides you with make_vec_env() helper
    # which does exactly the previous steps for you:
    # env = make_vec_env(env_id, n_envs=num_cpu, seed=0)

    model = ACKTR(MlpPolicy,
                  env,
                  verbose=1,
                  tensorboard_log="./a2c_cartpole_tensorboard/")
    model.learn(total_timesteps=250000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render(mode='rgb_array')
def main(args):
    log_dir = args.log_path if (args.log_path is not None) else \
        "/tmp/stable_baselines_" + time.strftime('%Y-%m-%d-%H-%M-%S')
    configure_logger(log_dir)

    set_global_seeds(args.seed)

    n_cpu = get_num_workers(args.env) if not args.play else 1
    env_kwargs = get_env_kwargs(args.env, args.random_ratio, args.sequential,
                                args.reward_type, args.n_object,
                                args.curriculum)

    def make_thunk(rank):
        return lambda: make_env(env_id=args.env,
                                rank=rank,
                                log_dir=log_dir,
                                flatten_dict=True,
                                kwargs=env_kwargs)

    env = SubprocVecEnv([make_thunk(i) for i in range(n_cpu)])

    eval_env_kwargs = env_kwargs.copy()
    eval_env_kwargs['random_ratio'] = 0.0
    if "use_cu" in eval_env_kwargs:
        eval_env_kwargs['use_cu'] = False
    eval_env = make_env(env_id=args.env,
                        rank=0,
                        flatten_dict=True,
                        kwargs=eval_env_kwargs)
    print(eval_env)
    if not args.play:
        os.makedirs(log_dir, exist_ok=True)
        train_kwargs = get_train_kwargs("ppo",
                                        args,
                                        parsed_action_noise=None,
                                        eval_env=eval_env)

        # policy = 'MlpPolicy'
        from utils.attention_policy import AttentionPolicy
        register_policy('AttentionPolicy', AttentionPolicy)
        policy_kwargs = get_policy_kwargs("ppo", args)
        print(policy_kwargs)

        model = PPO2(args.policy,
                     env,
                     verbose=1,
                     nminibatches=32,
                     lam=0.95,
                     noptepochs=10,
                     ent_coef=0.01,
                     learning_rate=3e-4,
                     cliprange=0.2,
                     policy_kwargs=policy_kwargs,
                     **train_kwargs)
        print(model.get_parameter_list())

        def callback(_locals, _globals):
            num_update = _locals["update"]
            if 'FetchStack' in args.env:
                mean_eval_reward = stack_eval_model(eval_env, _locals["self"])
            else:
                mean_eval_reward = eval_model(eval_env, _locals["self"])
            log_eval(num_update, mean_eval_reward)
            if num_update % 10 == 0:
                model_path = os.path.join(log_dir,
                                          'model_' + str(num_update // 10))
                model.save(model_path)
                print('model saved to', model_path)
            return True

        model.learn(total_timesteps=int(args.num_timesteps),
                    callback=callback,
                    seed=args.seed,
                    log_interval=1)
        model.save(os.path.join(log_dir, 'final'))

    else:
        assert args.load_path is not None
        model = PPO2.load(args.load_path)
        fig, ax = plt.subplots(1, 1, figsize=(8, 8))
        obs = env.reset()
        goal_dim = env.get_attr('goal')[0].shape[0]
        if 'FetchStack' in args.env:
            while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                    env.get_attr('task_mode')[0] != 1:
                obs = env.reset()
        elif 'FetchPush' in args.env:
            while not (1.25 < obs[0][6] < 1.33 and obs[0][7] < 0.61
                       and 0.7 < obs[0][4] < 0.8):
                obs = env.reset()
            env.env_method('set_goal', np.array([1.2, 0.75, 0.425, 1, 0]))
            obs = env.env_method('get_obs')
            obs[0] = np.concatenate([
                obs[0][key]
                for key in ['observation', 'achieved_goal', 'desired_goal']
            ])
        else:
            while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                obs = env.reset()
        print('achieved_goal', obs[0][-2 * goal_dim:-goal_dim], 'goal',
              obs[0][-goal_dim:])
        episode_reward = 0.0
        num_episode = 0
        frame_idx = 0
        images = []
        if 'max_episode_steps' not in env_kwargs.keys():
            env_kwargs['max_episode_steps'] = 100
        for i in range(env_kwargs['max_episode_steps'] * 10):
            img = env.render(mode='rgb_array')
            ax.cla()
            ax.imshow(img)
            if env.get_attr('goal')[0].shape[0] <= 3:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx))
            else:
                ax.set_title('episode ' + str(num_episode) + ', frame ' +
                             str(frame_idx) + ', goal idx ' +
                             str(np.argmax(env.get_attr('goal')[0][3:])))
                if 'FetchStack' in args.env:
                    tasks = ['pick and place', 'stack']
                    ax.set_title('episode ' + str(num_episode) + ', frame ' +
                                 str(frame_idx) + ', task: ' +
                                 tasks[np.argmax(obs[0][-2 * goal_dim - 2:-2 *
                                                        goal_dim])])
            images.append(img)
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            frame_idx += 1
            if not args.export_video:
                plt.pause(0.1)
            else:
                plt.imsave(
                    os.path.join(os.path.dirname(args.load_path),
                                 'tempimg%d.png' % i), img)
            if done:
                print('episode_reward', episode_reward)
                if 'FetchStack' in args.env:
                    while env.get_attr('current_nobject')[0] != env.get_attr('n_object')[0] or \
                            env.get_attr('task_mode')[0] != 1:
                        obs = env.reset()
                else:
                    while np.argmax(obs[0][-goal_dim + 3:]) != 0:
                        obs = env.reset()
                print('goal', obs[0][-goal_dim:])
                episode_reward = 0.0
                frame_idx = 0
                num_episode += 1
                if num_episode >= 10:
                    break
        if args.export_video:
            os.system('ffmpeg -r 5 -start_number 0 -i ' +
                      os.path.dirname(args.load_path) +
                      '/tempimg%d.png -c:v libx264 -pix_fmt yuv420p ' +
                      os.path.join(os.path.dirname(args.load_path), args.env +
                                   '.mp4'))
            for i in range(env_kwargs['max_episode_steps'] * 10):
                try:
                    os.remove(
                        os.path.join(os.path.dirname(args.load_path),
                                     'tempimg' + str(i) + '.png'))
                except OSError:
                    pass
Example no. 11

if __name__ == '__main__':
    env_id = "TaxiDummy-v01"

    DATA_PATH = os.path.join(os.environ['ALLDATA_PATH'], "macaoFiles",
                             "taxi_env_dummy")
    if os.path.isdir(DATA_PATH):
        shutil.rmtree(DATA_PATH)
    os.makedirs(DATA_PATH)

    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=100000)

    obs = env.reset()
    images = []
    img = env.render(mode="rgb_array")
    images.append(img)
    for _ in range(70):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        images.append(env.render(mode="rgb_array"))
    imageio.mimwrite(os.path.join(DATA_PATH, 'taxi_dummy_a2c.gif'),
                     [np.array(img) for img in images],
                     format="GIF-PIL",
                     fps=5)
Example no. 12
if __name__ == "__main__":
    # Parallel environments
    if parallel:
        env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    else:
        env = DummyVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = PPO2(CnnPolicy, env, verbose=1, tensorboard_log=log_dir)
    model.learn(total_timesteps=max_steps)
    model.save(log_dir + "ppo_minigrid")
    del model  # remove to demonstrate saving and loading

    model = PPO2.load(log_dir + "ppo_minigrid")
    env = make_env(env_id, 0)()

    mean_reward, std_reward = evaluate_policy(model, env)
    print("Mean Reward: {}, std_dev: {}".format(mean_reward, std_reward))
    demo = input("Watch model? (q to quit)")
    if demo != "q":
        for _ in range(1000):
            obs = env.reset()
            t = 0
            while t < 200:
                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                print("Action: {}\tTimestep: {}".format(action, t))
                env.render(mode='human')
                t += 1
                if done:
                    break
Example no. 13
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import A2C

n_cpu = 4  # use 4 parallel workers
env = SubprocVecEnv([lambda: gym.make('CartPole-v1') for i in range(n_cpu)])

model = A2C(MlpPolicy, env, verbose=1)  # A2C with an MlpPolicy
model.learn(total_timesteps=25000)  # train

obs = env.reset()
while True:
    action, _states = model.predict(obs)  # predict an action
    obs, rewards, dones, info = env.step(action)  # take one step in the game
    env.render()  # display