Example no. 1
def generate(parameter_distribution,
             num_episodes,
             env_update_fn,
             filepath=None,
             n_cpu=6):
    env_name = 'CartPole-v1'
    model_dir = os.path.join(os.getcwd(), 'models')
    model_path = os.path.join(model_dir, 'ppo2_' + env_name + '.pkl')

    os.makedirs(model_dir, exist_ok=True)
    if filepath:  # filepath defaults to None, so only create its directory when one is given
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

    def make_env(env_name):
        env = gym.make(env_name)
        return env

    env = SubprocVecEnv([lambda: make_env(env_name) for i in range(n_cpu)])

    try:
        model = PPO2.load(model_path)
    except Exception:  # no saved model yet (or it failed to load); train one from scratch
        trainer = CartPoleTrainer(env)
        model = trainer.train(model_path)

    obs = env.reset()

    # The vectorized env was only needed for training; collect transitions in a single env.
    env = make_env(env_name)

    states, actions, next_states, parameters, steps = [], [], [], [], []

    for ep in range(num_episodes):
        obs = env.reset()
        params = parameter_distribution()
        env_update_fn(env.unwrapped, params)

        done = False
        step = 0
        while not done:
            action, _states = model.predict(obs)
            states.append(obs)
            actions.append([action])
            obs, reward, done, info = env.step(action)
            next_states.append(obs)
            parameters.append(params)
            steps.append(step)
            step += 1

    data = {
        'states': np.array(states),
        'actions': np.array(actions),
        'next_states': np.array(next_states),
        'parameters': np.array(parameters),
        'steps': np.array(steps)
    }
    if filepath:
        print('filepath: ', filepath)
        with open(filepath, 'wb') as f:
            np.save(f, data)  # pass the file object so the data is written exactly to `filepath`

    return data
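A hedged usage sketch for the generator above: `parameter_distribution` and `env_update_fn` are not defined in the snippet, so the CartPole pole-length randomization below is only an assumption about what they might look like.

# Hypothetical helpers for Example no. 1; only the call signature is taken from the snippet.
import numpy as np

def parameter_distribution():
    # Sample a random (half) pole length for CartPole.
    return np.random.uniform(0.3, 0.7, size=1)

def env_update_fn(unwrapped_env, params):
    # CartPole's unwrapped env exposes `length`; other envs need their own setters.
    unwrapped_env.length = float(params[0])

data = generate(parameter_distribution,
                num_episodes=10,
                env_update_fn=env_update_fn,
                filepath='data/cartpole_transitions.npy')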
Example no. 2
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)
    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss
    # A trained LSTM model cannot change the number of envs,
    # so the observation is reshaped by padding it with dummy data.
    dummy_obss = np.zeros((n_cpu, 64, 64, 4))
    env = SubprocVecEnv([make_env(env_name, 0, seed)])
    model = PPO2.load(load_file, verbose=1)
    obss = env.reset()
    obss = padding_obss(obss, dummy_obss)
    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1]
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss)
        # env.render() # dummy
        if dones[0]:
            rewards_buf.append(infos[0]['episode']['r'])
            steps_buf.append(infos[0]['episode']['l'])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf), np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
            obss = env.reset()
            obss = padding_obss(obss, dummy_obss)
    env.close()
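Examples no. 2 and no. 8 call a make_env(env_name, rank, seed) helper that is not shown. A minimal sketch, assuming the usual stable-baselines rank/seed pattern (the original helper may differ):

import gym
from stable_baselines.common import set_global_seeds

def make_env(env_name, rank, seed=0):
    def _init():
        env = gym.make(env_name)
        env.seed(seed + rank)
        return env
    set_global_seeds(seed)
    return _init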
Example no. 3
    def optimize_params(self,
                        trial,
                        n_prune_evals_per_trial: int = 2,
                        n_tests_per_eval: int = 1):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)
        train_provider, validation_provider = train_provider.split_data_train_test(
            self.train_split_percentage)

        del test_provider

        train_env = SubprocVecEnv(
            [make_env(train_provider, i) for i in range(1)])
        validation_env = SubprocVecEnv(
            [make_env(validation_provider, i) for i in range(1)])

        model_params = self.optimize_agent_params(trial)
        model = self.Model(self.Policy,
                           train_env,
                           verbose=self.model_verbose,
                           nminibatches=1,
                           tensorboard_log=self.tensorboard_path,
                           **model_params)

        last_reward = -np.finfo(np.float16).max
        n_steps_per_eval = int(
            len(train_provider.data_frame) / n_prune_evals_per_trial)

        for eval_idx in range(n_prune_evals_per_trial):
            try:
                model.learn(n_steps_per_eval)
            except AssertionError:
                raise

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            state = None
            obs = validation_env.reset()
            while n_episodes < n_tests_per_eval:
                action, state = model.predict(obs, state=state)
                obs, reward, done, _ = validation_env.step(action)
                reward_sum += reward

                if all(done):
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = validation_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward
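A hedged sketch of driving optimize_params with an Optuna study; `trader` stands in for whatever object (not shown in the snippet) exposes the method and its data provider, and the pruner choice is an assumption.

import optuna

study = optuna.create_study(pruner=optuna.pruners.MedianPruner())
study.optimize(trader.optimize_params, n_trials=20)  # default direction is minimize; the method returns the negated reward
print(study.best_params)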
Example no. 4
class SimulatorModel(object):
    def __init__(self, _make_env_func, parallel_agents):
        """
        This class instantiates a dynamics model based on the pybullet simulator
        (i.e. it simulates exactly the result of the actions); it can be used
        for reward tuning, verifying tasks, etc.

        :param _make_env_func: (func) a function that, when called, returns a gym
                                      environment.
        :param parallel_agents: (int) number of parallel agents to simulate
                                      to evaluate the actions.
        """
        self.parallel_agents = parallel_agents
        self.envs = SubprocVecEnv(
            [_make_env_func() for i in range(self.parallel_agents)])
        return

    def evaluate_trajectories(self, action_sequences):
        """
        A function to be called to evaluate the action sequences and return
        the corresponding reward for each sequence.

        :param action_sequences: (nd.array) actions to be evaluated
                                            (number of sequences, horizon length)
        :return: (nd.array) sum of rewards for each action sequence.
        """
        horizon_length = action_sequences.shape[1]
        num_of_particles = action_sequences.shape[0]
        rewards = np.zeros([num_of_particles])
        assert ((float(num_of_particles) / self.parallel_agents).is_integer())
        for j in range(0, num_of_particles, self.parallel_agents):
            self.envs.reset()
            total_reward = np.zeros([self.parallel_agents])
            for k in range(horizon_length):
                actions = action_sequences[j:j + self.parallel_agents, k]
                task_observations, current_reward, done, info = \
                    self.envs.step(actions)
                total_reward += current_reward
            rewards[j:j + self.parallel_agents] = total_reward
        return rewards

    def end_sim(self):
        """
        Closes the environments that were used for simulation.
        :return:
        """
        self.envs.close()
        return
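A hedged usage sketch for SimulatorModel. Note that the constructor passes _make_env_func() straight to SubprocVecEnv, which expects callables, so the factory below returns an env-constructing function; make_task_env is a hypothetical builder for the pybullet task.

import numpy as np

def make_env_factory():
    def _init():
        return make_task_env()  # hypothetical: builds the pybullet task env
    return _init

sim = SimulatorModel(_make_env_func=make_env_factory, parallel_agents=5)
sequences = np.random.uniform(-1.0, 1.0, size=(10, 20))  # (number of sequences, horizon length)
returns = sim.evaluate_trajectories(sequences)           # one summed reward per sequence
sim.end_sim()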
Example no. 5
    def test(self, model_epoch: int = 0, should_render: bool = True):
        train_provider, test_provider = self.data_provider.split_data_train_test(self.train_split_percentage)

        del train_provider

        test_env = SubprocVecEnv([make_env(test_provider, i) for i in range(self.n_envs)])

        model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=test_env)

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        state = None
        obs, done, rewards = test_env.reset(), [False], []
        while not all(done):
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = test_env.step(action)

            rewards.append(reward)

            if should_render and self.n_envs == 1:
                test_env.render(mode='human')

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${np.sum(rewards):.2f}')
Example no. 6
def main():
    #env = SubprocVecEnv([(lambda i=i: SwocGym(i+1, GameServicePath, i, fieldWidth=10, fieldHeight=10, saveEpisode=True)) for i in range(1)])
    env = SubprocVecEnv([
        (lambda i=i: MazeGym(mazeWidth=10, mazeHeight=10, nrWallsToRemove=10))
        for i in range(1)
    ])
    try:
        model = PPO2("MlpPolicy",
                     env,
                     verbose=1,
                     tensorboard_log='/home/ralph/swoc2019/log')
        if SaveFile.exists():
            print('loading...')
            model.load_parameters(SaveFile)
        else:
            print('Warning: No save file loaded')

        print('evaluating...', end='')
        obs = env.reset()
        totalRewards = None
        for i in range(100):
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            totalRewards = totalRewards + rewards if totalRewards is not None else rewards
            env.render()
            sleep(0.2)
        print(f'mean reward: {np.mean(totalRewards)}')

    except KeyboardInterrupt:
        print('closing...')
    finally:
        env.close()
    print('closed')
Example no. 7
def test(model_name, env_name, num_cpu, log_dir):
    env = SubprocVecEnv([
        make_football_env(env_name, i, log_dir, useMonitor=False)
        for i in range(num_cpu)
    ])
    # env = Monitor(env, log_dir, allow_early_resets=True)
    model = get_model(model_name, env, log_dir)

    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    from matplotlib import pyplot as plt
    show_num = 1
    while True:
        action, _states = model.predict(obs)
        # obs, rewards, done, info = env.step([int(input('action:'))]*num_cpu)
        obs, rewards, done, info = env.step(action)
        img = obs[show_num, :, :, :]
        fig = plt.figure(0)
        plt.clf()
        plt.imshow(img / 255)
        fig.canvas.draw()

        # env.render()
        plt.pause(0.000001)
Example no. 8
def play(env_name, seed, load_file, total_timesteps, n_cpu):
    np.set_printoptions(precision=5)
    def padding_obss(obss, dummy_obss):
        dummy_obss[0, :, :, :] = obss
        return dummy_obss
    # In GUI mode the number of envs is reduced to 1 to limit the number of GUI windows,
    # but a trained LSTM model cannot change the number of envs,
    # so the observation is reshaped by padding it with dummy data.
    isGUI = env_name.find('GUI') != -1
    dummy_obss = np.zeros((n_cpu, 64, 64, 4)) if isGUI else None
    env = SubprocVecEnv([make_env(env_name, i, seed) for i in range(1 if isGUI else n_cpu)])
    model = PPO2.load(load_file, verbose=1)
    obss = env.reset()
    obss = padding_obss(obss, dummy_obss) if isGUI else obss
    rewards_buf = []
    steps_buf = []
    # TODO: single
    for i in range(total_timesteps):
        actions, _states = model.predict(obss)
        actions = actions[0:1] if isGUI else actions
        obss, rewards, dones, infos = env.step(actions)
        obss = padding_obss(obss, dummy_obss) if isGUI else obss
        # env.render() # dummy
        if dones.any():
            rewards_buf.extend([ info['episode']['r'] for info in infos if 'episode' in info ])
            steps_buf.extend([ info['episode']['l'] for info in infos if 'episode' in info ])
            line = np.array([np.mean(rewards_buf), np.std(rewards_buf), np.mean(steps_buf), np.std(steps_buf)])
            print(len(rewards_buf), line)
    env.close()
Example no. 9
def test(test_data, model_location):
    # Using a different environment to test the model
    env_test = SubprocVecEnv(
        [lambda: ExchangeEnv.ExchangeEnv(test_data, 10000, 0)])
    model = PPO2.load(model_location)
    obs = env_test.reset()
    done = False

    price_history = []
    portfolio_value = []

    while not done:
        action, _states = model.predict(obs)
        obs, rewards, done, _ = env_test.step(action)

        # Appending the current time steps highest bid
        price_history.append(obs[0][0][0])

        # Appending current portfolio value
        portfolio_value.append(rewards[0])

    with open("price_history.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(price_history)

    with open("portfolio_value.txt", "w") as f:
        writer = csv.writer(f)
        writer.writerow(portfolio_value)
Example no. 10
def main(mode="train"):

    n_cpu = 2
    env = SubprocVecEnv(
        [lambda: gym.make('balancebot-continuum-v0') for i in range(n_cpu)])

    if mode == "train":
        model = ppo2(policy=MlpPolicy,
                     env=env,
                     learning_rate=1e-3,
                     verbose=0,
                     full_tensorboard_log=False,
                     tensorboard_log="./ppo2_balancebot_tensorboard")

        model.learn(total_timesteps=100000, callback=callback)
        print("Saving model to ppo2_balance_continuum.pkl")
        model.save("ppo2_balance_continuum.pkl")

        del model  # remove to demonstrate saving and loading

    if mode == "test":
        model = ppo2.load("ppo2_balance_continuum.pkl")

        obs = env.reset()
        done = [False, False]
        # env.set_done(5000)
        while not all(done):
            action, _states = model.predict(obs)
            obs, rewards, done, info = env.step(action)
            # env.render()
            print(obs)
Example no. 11
def main():
    num_envs = 20
    num_players_per_env = 2
    envs = [makeEnv for i in range(num_envs)]
    actionSpace = makeEnv().action_space
    env = SubprocVecEnv(envs)
    env.reset()
    gameFinished = [False] * num_envs
    while not all(gameFinished):
        inputs = []
        for game in range(num_envs):
            if not gameFinished[game]: inputs.append([actionSpace.sample() for player in range(num_players_per_env)])
            else: inputs.append([0] * num_players_per_env)

        _, _, done, info = env.step(inputs)
        
        gameFinished = [gameElem or doneElem for gameElem, doneElem in list(zip(gameFinished, done))]
        time.sleep(.0001)
Example no. 12
def attention_render(model_name, env_name, num_cpu, log_dir):
    if not os.path.exists(log_dir):
        raise FileNotFoundError('log_dir does not exist')

    env_id = env_name + 'NoFrameskip-v4'
    env = SubprocVecEnv([make_env(env_id, i, log_dir) for i in range(num_cpu)])
    # env = Monitor(env, log_dir, allow_early_resets=True)

    if model_name == 'A2C_Attention':
        model = A2C(AttentionPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C_Attention2':
        model = A2C(Attention2Policy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    elif model_name == 'A2C':
        model = A2C(LstmPolicy,
                    env,
                    verbose=1,
                    tensorboard_log=log_dir + 'tensorboard/')
    else:
        model = None
    model = model.load(log_dir + model_name + '_' + env_name, env=env)

    obs = env.reset()
    # print(env.observation_space)
    # cv2.imshow('test', RGB2BGR(obs[0]))
    # cv2.waitKey(0)
    while True:
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        attentions = model.get_attention(obs, _states, done)[0]
        attentions_img = []
        # print('attention', np.array(attention).shape)
        for i, attention in enumerate(attentions):
            attention = np.array(attention)
            attention = np.reshape(attention, [
                env.observation_space.shape[0] // 10,
                env.observation_space.shape[1] // 10, 1
            ])
            attention = np.repeat(attention, [10] * attention.shape[0], axis=0)
            attention = np.repeat(attention, [10] * attention.shape[1], axis=1)
            attention = attention * 255
            attentions_img.append(attention)
            # print(np.sum(attention))
        attentions = tile_images(attentions_img)
        cv2.imshow('attention', attentions)
        cv2.waitKey(1)
        # break
        env.render()
    return model
Example no. 13
def optimize_agent(trial):
    """ Train the model and optimise
        Optuna maximises the negative log likelihood, so we
        need to negate the reward here
    """
    model_params = optimize_TD3(trial)
    env = SubprocVecEnv([
        lambda: NormalizeActionWrapper(LearningRocket(visualize=False))
        for i in range(n_cpu)
    ])

    n_actions = env.action_space.shape[-1]
    action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                     sigma=0.1 * np.ones(n_actions))

    model = TD3(MlpPolicy,
                env,
                action_noise=action_noise,
                policy_kwargs=dict(layers=[400, 300]))
    model.learn(50000)

    rewards = []
    n_episodes, reward_sum = 0, 0.0

    obs = env.reset()
    step = 0
    while n_episodes < 4:
        step += 1
        action, _ = model.predict(obs)
        obs, reward, done, _ = env.step(action)
        reward_sum += reward
        if done.any():  # done comes back as an array from the vectorized env
            rewards.append(reward_sum)
            reward_sum = 0.0
            n_episodes += 1
            obs = env.reset()

    last_reward = np.mean(rewards)
    trial.report(-1 * last_reward, step)

    return -1 * last_reward
Example no. 14
def run():
    torch.multiprocessing.freeze_support()
    env_id = "CartPole-v1"
    num_cpu = 4  # Number of processes to use
    # Create the vectorized environment
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

    model = ACKTR(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=25000)

    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
Example no. 15
def _eval_model(model, env_id, ob_shape, num_eps, plot=False):
  test_env = SubprocVecEnv([make_env(env_id)])
  sharpe_ratios = []
  for episode in range(num_eps):
    # Padding zeros to the test env to match the shape of the training env.
    zero_completed_obs = np.zeros((NUM_CPU,) + ob_shape)
    zero_completed_obs[0, :] = test_env.reset()
    state = None
    for _ in range(L):
      action, state = model.predict(zero_completed_obs, state=state, deterministic=True)
      zero_completed_obs[0, :], reward, done, _ = test_env.env_method('step', action[0], indices=0)[0]
    sharpe_ratios.append(test_env.env_method('get_sharpe_ratio', indices=0)[0])
    if plot: test_env.env_method('render', indices=0)
  test_env.close()
  
  # Return the average sharpe ratio
  return sum(sharpe_ratios) / len(sharpe_ratios)
Example no. 16
def test():
    # Parallel environments
    n_cpu = 4
    env = SubprocVecEnv([lambda: RSEnv() for i in range(n_cpu)])

    model = A2C(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=600000, log_interval=10)

    model.save("sba2c")

    env = TestRSEnv()
    obs = env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        env.render()
    env.close()
Example no. 17
def objective(trial):
    # Hyper-parameters to adjust
    policy = trial.suggest_categorical('policy', ['MlpPolicy', 'MlpLnPolicy', 'MlpLstmPolicy', 'MlpLnLstmPolicy'])
    gamma = trial.suggest_uniform('gamma', 0.10, 1.0)
    ent_coef = trial.suggest_uniform('ent_coef', 0.01, 0.10)
    learning_rate = trial.suggest_loguniform('learning_rate', 1e-5, 1e-2)
    vf_coef = trial.suggest_uniform('vf_coef', 0.10, 1.0)
    lam = trial.suggest_uniform('lam', 0.01, 0.95)

    if policy == 'MlpPolicy':
        policy = MlpPolicy
    elif policy == 'MlpLnPolicy':
        policy = MlpLnPolicy
    elif policy == 'MlpLstmPolicy':
        policy = MlpLstmPolicy
    elif policy == 'MlpLnLstmPolicy':
        policy = MlpLnLstmPolicy

    # Train with those hyper-parameters
    n_cpu = 4
    env = SubprocVecEnv([lambda: gimbal(5, 500) for i in range(n_cpu)])
    model = PPO2(policy=policy, env=env, gamma=gamma, n_steps=100, ent_coef=ent_coef, learning_rate=learning_rate, 
                vf_coef=vf_coef, max_grad_norm=0.5, lam=lam, nminibatches=4, noptepochs=4, cliprange=0.2, 
                verbose=0, tensorboard_log=None, _init_setup_model=True, policy_kwargs=None, full_tensorboard_log=False)
    model.learn(total_timesteps=250000, callback=None, seed=None, log_interval=1, tb_log_name='PPO2', reset_num_timesteps=True)

    # Calculate worth
    env = gimbal(5, 500)
    MAX_episodes = 25
    reward_avg = 0
    for episodes in range(MAX_episodes):
        obs = env.reset()
        r = 0
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            r += rewards
            #env.render()
            if dones:
                reward_avg += r
                break
    return - (reward_avg / MAX_episodes)
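A sketch of running the search defined above; Optuna minimizes the objective by default, which is why objective() returns the negated average reward (the trial budget below is arbitrary).

import optuna

study = optuna.create_study()
study.optimize(objective, n_trials=50)
print(study.best_params)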
Example no. 18
def main():
    agent_data = pd.read_csv('../output_EURUSD_M1_/agentData.csv')
    agent_data = agent_data.drop(agent_data.columns[0], axis=1)
    agent_data = agent_data.astype('float32')

    env = SubprocVecEnv([lambda: ForexTradingEnv(agent_data)] * 10, )
    #env = DummyVecEnv([lambda: ForexTradingEnv(agent_data)], )

    #    model = DQN(CustomDQNPolicy, env, gamma=0.95, verbose=1, tensorboard_log = "./tensorboard", entcoeff=0.005, adam_epsilon = 1e-6)

    import tensorflow as tf
    from TenorboardCallbacks import TensorboardCallback
    checkpoint_callback = CheckpointCallback(save_freq=1000000,
                                             save_path='./models/',
                                             name_prefix='ppo2')

    for curr in [1]:
        model = PPO2(PPO2Policy_Basic,
                     env,
                     verbose=1,
                     tensorboard_log="./tensorboard",
                     vf_coef=1e-7,
                     ent_coef=1e-4,
                     n_steps=512,
                     gamma=0.99)
        #model = PPO2.load("5_days_model/ppo2_999000000_steps.zip", policy=PPO2Policy_Basic, env = env,verbose=1, tensorboard_log = "./tensorboard")

        model.learn(total_timesteps=10000000000,
                    log_interval=10000000,
                    callback=CallbackList(
                        [TensorboardCallback(env), checkpoint_callback]))
        model.save(model_fileName)

    obs = env.reset()
    for i in range(2000000):
        action, _states = model.predict(obs)
        obs, rewards, done, info = env.step(action)
        if i % 1 == 0:
            env.render()
        if done.any():  # done is an array when using a vectorized env
            break
Example no. 19
class Agent:
    def __init__(self):
        self.env = None
        self.model = None

    def create_env(self, game, envs, render=False, sleep=0.):
        env = gym.make(game)
        # env = FrameStack(env, 4)
        env = CustomGym(env, render=render, sleep=sleep)
        # NOTE: every worker receives (a pickled copy of) the same pre-built env instance;
        # constructing the env inside the lambda is the usual SubprocVecEnv pattern.
        self.env = SubprocVecEnv([lambda: env for i in range(envs)])

    def create_model(self):
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        with tf.Session(config=config):
            self.model = PPO2(policy=MlpPolicy,
                              env=self.env,
                              n_steps=8192,
                              nminibatches=8,
                              lam=0.95,
                              gamma=0.99,
                              noptepochs=4,
                              ent_coef=0.001,
                              learning_rate=lambda _: 2e-5,
                              cliprange=lambda _: 0.2,
                              verbose=1,
                              tensorboard_log="gym_logs")

    def train(self, timesteps, loops, name="agent"):
        for i in range(loops):
            self.model.learn(timesteps)
            self.model.save(name + str(i))

    def evaluate(self, timesteps, agent_name):
        self.model = PPO2.load(agent_name)
        obs = self.env.reset()
        for i in range(timesteps):
            action, _states = self.model.predict(obs)
            obs, rewards, dones, info = self.env.step(action)
Example no. 20
def run_baseline_ppo2(env_name, n_cpu=4, train=True):
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common.vec_env import SubprocVecEnv
    from stable_baselines import PPO2

    if train:
        # multiprocess environment
        env = SubprocVecEnv([lambda: gym.make(env_name) for i in range(n_cpu)])
        model = PPO2(MlpPolicy, env, verbose=1)
        model.learn(total_timesteps=100000)
        model.save("checkpoints/ppo2_" + env_name)
    else:
        from stable_baselines.common.vec_env import DummyVecEnv
        env = DummyVecEnv([lambda: gym.make(env_name)])
        model = PPO2.load("checkpoints/ppo2_" + env_name)

        obs = env.reset()
        while True:
            action, _states = model.predict(obs)
            obs, rewards, dones, info = env.step(action)
            env.render()
            print("state: ", obs, " reward: ", rewards, " done: ", dones, "info: ", info)
Example no. 21
class BitmexTradingStrategySBL(TradingStrategy):
    """A trading strategy capable of self tuning, training, and evaluating with stable-baselines.

    Arguments:
        environments: An instance of a trading environments for the agent to trade within.
        model: The RL model to create the agent with.
            Defaults to DQN.
        policy: The RL policy to train the agent's model with.
            Defaults to 'MlpPolicy'.
        model_kwargs: Any additional keyword arguments to adjust the model.
        kwargs: Optional keyword arguments to adjust the strategy.
    """
    def __init__(self,
                 environment: BitmexEnvironment,
                 model: BaseRLModel = DQN,
                 policy: Union[str, BasePolicy] = 'MlpPolicy',
                 model_kwargs: any = {},
                 policy_kwargs: any = {},
                 n_env: int = 1,
                 **kwargs):
        self._model = model
        self._model_kwargs = model_kwargs
        self._policy_kwargs = policy_kwargs
        self._n_env = n_env

        self.environment = environment
        self._agent = self._model(policy,
                                  self._environment,
                                  **self._model_kwargs,
                                  policy_kwargs=self._policy_kwargs)

    @property
    def environment(self) -> 'BitmexEnvironment':
        """A `BitmexEnvironment` instance for the agent to trade within."""
        return self._environment

    @environment.setter
    def environment(self, environment: 'BitmexEnvironment'):
        envs = [lambda: environment for _ in range(self._n_env)]

        if self._n_env == 1:
            self._environment = DummyVecEnv(envs)
        else:
            self._environment = SubprocVecEnv(envs)

    def restore_agent(self, path: str, custom_objects: any = {}):
        """Deserialize the strategy's learning agent from a file.

        Arguments:
            path: The `str` path of the file the agent specification is stored in.
        """
        self._custom_objects = custom_objects
        self._agent = self._model.load(path,
                                       env=self._environment,
                                       custom_objects=self._custom_objects,
                                       kwargs=self._model_kwargs)

    def save_agent(self, path: str):
        """Serialize the learning agent to a file for restoring later.

        Arguments:
            path: The `str` path of the file to store the agent specification in.
        """
        self._agent.save(path)

    def tune(self,
             steps: int = None,
             episodes: int = None,
             callback: Callable[[pd.DataFrame], bool] = None) -> pd.DataFrame:
        raise NotImplementedError

    def _train_callback(self, _locals, _globals):
        # performance = self._environment.performance
        #
        # if self._episode_callback and self._environment.done():
        #     self._episode_callback(performance)

        return True

    def train(
        self,
        steps: int = None,
        episodes: int = None,
        render_mode: str = None,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        if steps is None:
            raise ValueError(
                'You must set the number of `steps` to train the strategy.')

        self._agent.learn(steps, callback=self._train_callback)

        return True

    def test(
        self,
        steps: int = None,
        episodes=None,
        render_mode: str = None,
        episode_callback: Callable[[pd.DataFrame],
                                   bool] = None) -> pd.DataFrame:
        if steps is None and episodes is None:
            raise ValueError(
                'You must set the number of `steps` or `episodes` to test the strategy.'
            )

        steps_completed, episodes_completed, average_reward = 0, 0, 0
        obs, state, dones = self._environment.reset(), None, [False]
        performance = {}

        while (steps is not None and
               (steps == 0 or steps_completed < steps)) or (
                   episodes is not None and episodes_completed < episodes):
            actions, state = self._agent.predict(obs, state=state, mask=dones)
            # actions, state = self._agent.predict(obs)
            obs, rewards, dones, info = self._environment.step(actions)

            steps_completed += 1
            # incremental (running) mean of the per-step reward
            average_reward += (rewards[0] - average_reward) / steps_completed

            exchange_performance = info[0].get('exchange').performance
            performance = exchange_performance if len(
                exchange_performance) > 0 else performance
            if render_mode is not None:
                self._environment.render(mode=render_mode)

            if dones[0]:
                if episode_callback is not None and not episode_callback(
                        performance):
                    break

                episodes_completed += 1
                obs = self._environment.reset()

        print("Finished running strategy.")
        print("Total episodes: {} ({} timesteps).".format(
            episodes_completed, steps_completed))
        print("Average reward: {}.".format(average_reward))

        return performance
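A hedged usage sketch for the strategy class above; how BitmexEnvironment is constructed is not shown in the snippet, so the environment instance below is assumed to exist already, and the save path is illustrative.

strategy = BitmexTradingStrategySBL(environment=bitmex_environment, n_env=1)  # bitmex_environment: assumed pre-built env
strategy.train(steps=100000)
strategy.save_agent('agents/bitmex_dqn.pkl')
performance = strategy.test(steps=10000)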
Example no. 22
    env = SubprocVecEnv([make_env(x) for x in range(num_envs)],
                        start_method='forkserver')

    # env.get_valid_actions = lambda: np.array([e.get_valid_actions() for e in env.envs])
    env.get_valid_actions = lambda: np.array(
        env.env_method('get_valid_actions'))

    model = algo.MaskedPPO(CustomLSTMPolicy,
                           env,
                           verbose=1,
                           n_steps=20,
                           nminibatches=batch_size,
                           tensorboard_log="../out/meta_opt/")

    model.learn(total_timesteps=100000, log_interval=10)
    model.save('meta_optimizer')

    obs = env.reset()
    state = None
    total_rewards = 0
    done = [False for _ in range(env.num_envs)]

    for i in range(1000):
        action, _states = model.predict(obs, state=state, mask=done)
        obs, rewards, done, info = env.step(action)
        total_rewards += rewards

        # if done:
        #     break
    print(total_rewards)
Example no. 23
def run(alg,
        alg_kwargs,
        task,
        task_kwargs,
        wrappers_kwargs,
        expl_params,
        rollout,
        num_trials,
        folder,
        n_thrds,
        n_lstm,
        rerun=False,
        test_kwargs={},
        num_retrains=10,
        seed=0,
        train_mode=None,
        sl_kwargs=None):
    train_mode = train_mode or 'RL'
    env = test_env(task, kwargs=task_kwargs, num_steps=1000)
    num_timesteps = int(1000 * num_trials / (env.num_tr))
    files = glob.glob(folder + '/*model*')
    vars_ = {
        'alg': alg,
        'alg_kwargs': alg_kwargs,
        'task': task,
        'task_kwargs': task_kwargs,
        'wrappers_kwargs': wrappers_kwargs,
        'expl_params': expl_params,
        'rollout': rollout,
        'folder': folder,
        'num_trials': num_trials,
        'n_thrds': n_thrds,
        'n_lstm': n_lstm
    }
    np.savez(folder + '/params.npz', **vars_)
    if len(files) == 0 or rerun:
        if train_mode == 'RL':
            if alg == "A2C":
                from stable_baselines import A2C as algo
            elif alg == "ACER":
                from stable_baselines import ACER as algo
            elif alg == "ACKTR":
                from stable_baselines import ACKTR as algo
            elif alg == "PPO2":
                from stable_baselines import PPO2 as algo
            env = SubprocVecEnv([
                make_env(env_id=task,
                         rank=i,
                         seed=seed,
                         wrapps=wrappers_kwargs,
                         **task_kwargs) for i in range(n_thrds)
            ])
            model = algo(LstmPolicy,
                         env,
                         verbose=0,
                         n_steps=rollout,
                         n_cpu_tf_sess=n_thrds,
                         tensorboard_log=None,
                         policy_kwargs={
                             "feature_extraction": "mlp",
                             "n_lstm": n_lstm
                         },
                         **alg_kwargs)
            # this assumes 1 trial ~ 10 steps
            sv_freq = 5 * wrappers_kwargs['MonitorExtended-v0']['sv_per']
            chckpnt_cllbck = CheckpointCallback(save_freq=sv_freq,
                                                save_path=folder,
                                                name_prefix='model')
            model.learn(total_timesteps=num_timesteps, callback=chckpnt_cllbck)
            model.save(f"{folder}/model_{num_timesteps}_steps.zip")
            plotting.plot_rew_across_training(folder=folder)
        elif train_mode == 'SL':
            stps_ep = sl_kwargs['steps_per_epoch']
            wraps_sl = deepc(wrappers_kwargs)
            del wraps_sl['PassAction-v0']
            del wraps_sl['PassReward-v0']
            del wraps_sl['MonitorExtended-v0']
            env = make_env(env_id=task,
                           rank=0,
                           seed=seed,
                           wrapps=wraps_sl,
                           **task_kwargs)()
            dataset = ngym.Dataset(env,
                                   batch_size=sl_kwargs['btch_s'],
                                   seq_len=rollout,
                                   batch_first=True)
            obs_size = env.observation_space.shape[0]
            act_size = env.action_space.n
            model = define_model(seq_len=rollout,
                                 num_h=n_lstm,
                                 obs_size=obs_size,
                                 act_size=act_size,
                                 batch_size=sl_kwargs['btch_s'],
                                 stateful=sl_kwargs['stateful'],
                                 loss=sl_kwargs['loss'])
            # Train network
            data_generator = (dataset() for i in range(stps_ep))
            model.fit(data_generator, verbose=1, steps_per_epoch=stps_ep)
            model.save(f"{folder}/model_{stps_ep}_steps")

    if len(test_kwargs) != 0:
        for key in test_kwargs.keys():
            sv_folder = folder + key
            test_kwargs[key]['seed'] = seed
            if train_mode == 'RL':
                if '_all' not in key:
                    ga.get_activity(folder, alg, sv_folder, **test_kwargs[key])
                else:
                    files = glob.glob(folder + '/model_*_steps.zip')
                    for f in files:
                        model_name = os.path.basename(f)
                        sv_f = folder + key + '_' + model_name[:-4]
                        ga.get_activity(folder,
                                        alg,
                                        sv_folder=sv_f,
                                        model_name=model_name,
                                        **test_kwargs[key])

            elif train_mode == 'SL':
                stps_ep = sl_kwargs['steps_per_epoch']
                wraps_sl = deepc(wrappers_kwargs)
                wraps_sl.update(test_kwargs[key]['wrappers'])
                del wraps_sl['PassAction-v0']
                del wraps_sl['PassReward-v0']
                env = make_env(env_id=task,
                               rank=0,
                               seed=seed,
                               wrapps=wraps_sl,
                               **task_kwargs)()
                obs_size = env.observation_space.shape[0]
                act_size = env.action_space.n
                model_test = define_model(seq_len=1,
                                          batch_size=1,
                                          obs_size=obs_size,
                                          act_size=act_size,
                                          stateful=sl_kwargs['stateful'],
                                          num_h=n_lstm,
                                          loss=sl_kwargs['loss'])
                ld_f = (folder + 'model_' + str(stps_ep) +
                        '_steps').replace('//', '/')
                model_test.load_weights(ld_f)
                env.reset()
                for ind_stp in range(sl_kwargs['test_steps']):
                    obs = env.ob_now
                    obs = obs[np.newaxis]
                    obs = obs[np.newaxis]
                    action = model_test.predict(obs)
                    action = np.argmax(action, axis=-1)[0]
                    _, _, _, _ = env.step(action)
Example no. 24
import retro
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv, VecFrameStack, VecNormalize
from stable_baselines import PPO2, A2C
import numpy as np
import gym
from stable_baselines.common.callbacks import CheckpointCallback
from utils import *

if __name__ == "__main__":
    num_envs = 16  # Must use the same number of envs as trained on, but we create a single dummy env for testing.
    envs = SubprocVecEnv([make_env] * num_envs)
    envs = VecFrameStack(envs, n_stack=4)

    model = PPO2.load("./subzero_model.zip")
    model.set_env(envs)
    obs = envs.reset()
    print(obs.shape)

    # Create one env for testing
    env = DummyVecEnv([make_env])
    env = VecFrameStack(env, n_stack=4)
    obs = env.reset()

    # model.predict(test_obs) would throw an error
    # because the number of test envs differs from the number of training envs,
    # so we need to pad the observation with zeros.
    zero_completed_obs = np.zeros((num_envs, ) + envs.observation_space.shape)
    zero_completed_obs[0, :] = obs
    obs = zero_completed_obs

    state = None  # recurrent policies need the previous state passed back in
    while True:
        # Completion sketch; the original snippet is truncated at this point.
        action, state = model.predict(obs, state=state, deterministic=True)
        # The single test env expects only the first action
        new_obs, reward, done, info = env.step([action[0]])
        zero_completed_obs[0, :] = new_obs
        env.render()
Example no. 25
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []
        self.environs = [
            'SpringYardZone.Act3', 'SpringYardZone.Act2', 'GreenHillZone.Act3',
            'GreenHillZone.Act1', 'StarLightZone.Act2', 'StarLightZone.Act1',
            'MarbleZone.Act2', 'MarbleZone.Act1', 'MarbleZone.Act3',
            'ScrapBrainZone.Act2', 'LabyrinthZone.Act2', 'LabyrinthZone.Act1',
            'LabyrinthZone.Act3', 'SpringYardZone.Act1', 'GreenHillZone.Act2',
            'StarLightZone.Act3', 'ScrapBrainZone.Act1'
        ]
        self.environsv2 = ['1Player.Axel.Level1']
        self.generate_expert_traj = generate_expert_traj

    def create_envs(self, game_name, state_name, num_env):

        for i in range(num_env):
            self.env_fns.append(
                partial(make_env, game=game_name, state=state_name))
            self.env_names.append(game_name + '-' + state_name)
        self.env = SubprocVecEnv(self.env_fns)

    def train(self,
              game,
              state,
              num_e=1,
              n_timesteps=25000000,
              save='default2'):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        #self.model = PPO2.load("default2", SubprocVecEnv(self.env_fns), policy=CnnPolicy, tensorboard_log="./sonic/" )
        #self.model = PPO2(CnnPolicy, SubprocVecEnv(self.env_fns), learning_rate=1e-5, verbose=1,tensorboard_log="./sonic/" )

        self.model = PPO2(policy=CnnPolicy,
                          env=SubprocVecEnv(self.env_fns),
                          n_steps=8192,
                          nminibatches=8,
                          lam=0.95,
                          gamma=0.99,
                          noptepochs=4,
                          ent_coef=0.001,
                          learning_rate=lambda _: 2e-5,
                          cliprange=lambda _: 0.2,
                          verbose=1,
                          tensorboard_log="./sonic/")
        self.model.learn(n_timesteps)
        self.model.save(save)
        self.model.learn(n_timesteps)
        self.model.save(save + '2')
        self.model.learn(n_timesteps)
        self.model.save(save + '3')
        self.model.learn(n_timesteps)
        self.model.save(save + '4')

    def evaluate(self, game, state, num_e=1, num_steps=14400):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """

        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        self.model = PPO2.load("default2",
                               SubprocVecEnv(self.env_fns),
                               policy=CnnPolicy,
                               tensorboard_log="./sonic/")
        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for i in range(num_steps):
            # _states are only useful when using LSTM policies
            actions, _states = self.model.predict(obs)
            # # here, action, rewards and dones are arrays
            # # because we are using vectorized env
            obs, rewards, dones, info = self.env.step(actions)

            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

    # Compute mean reward
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward

    def pre_train(self):
        # Using only one expert trajectory
        # you can specify `traj_limitation=-1` for using the whole dataset
        dataset = ExpertDataset(expert_path='expert_cartpole.npz',
                                traj_limitation=1,
                                batch_size=128)

        model = PPO2('MlpPolicy', 'CartPole-v1', verbose=1)
        # Pretrain the PPO2 model
        model.pretrain(dataset, n_epochs=1000)

        # As an option, you can train the RL agent
        # model.learn(int(1e5))

        # Test the pre-trained model
        env = model.get_env()
        obs = env.reset()

        reward_sum = 0.0
        for _ in range(1000):
            action, _ = model.predict(obs)
            obs, reward, done, _ = env.step(action)
            reward_sum += reward
            env.render()
            if done:
                print(reward_sum)
                reward_sum = 0.0
                obs = env.reset()

        env.close()

    def gen_pre_train(self,
                      game,
                      state,
                      num_e=1,
                      save='default2',
                      episodes=10):
        self.create_envs(game_name=game, state_name=state, num_env=num_e)
        env = SubprocVecEnv(self.env_fns)
        self.expert_agent = "moose"
        self.generate_expert_traj(self.expert_agent,
                                  save,
                                  env,
                                  n_episodes=episodes)
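A hedged usage sketch for the wrapper above; the gym-retro game/state identifiers are only illustrative, and evaluate() still loads the hard-coded "default2" checkpoint as written, which is why the same save name is used here.

agent = PPO2_SB()
agent.train(game='SonicTheHedgehog-Genesis',
            state='GreenHillZone.Act1',
            num_e=8,
            n_timesteps=1000000,
            save='default2')
mean_reward = agent.evaluate(game='SonicTheHedgehog-Genesis',
                             state='GreenHillZone.Act1',
                             num_e=8)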
Example no. 26
    :param num_env: (int) the number of environments you wish to have in subprocesses
    :param seed: (int) the initial seed for RNG
    :param rank: (int) index of the subprocess
    """
    def _init():
        env = gym.make(env_id)
        env.seed(seed + rank)
        return env

    set_global_seeds(seed)
    return _init


if __name__ == '__main__':
    env_id = "HumanoidPyBulletEnv-v0"
    num_cpu = 1
    env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])
    model = PPO2.load("HumanoidPyBulletEnv-v0_PPO2_2020_11_3016_29_44")
    ob = env.reset()
    reward = 0

    while True:
        action, _states = model.predict(ob)
        ob, r, done, info = env.step(action)
        reward += r
        time.sleep(0.01)
        if done:
            ob = env.reset()
            print('r is {}'.format(r))
            print('Episode reward is {}'.format(reward))
            reward = 0
Example no. 27
class PPO2_SB():
    def __init__(self):
        self.love = 'Ramona'
        self.env_fns = []
        self.env_names = []

    def make_env(self, env_id, rank, seed=0):
        """
        Utility function for multiprocessed env.
    
        :param env_id: (str) the environment ID
        :param num_env: (int) the number of environments you wish to have in subprocesses
        :param seed: (int) the initial seed for RNG
        :param rank: (int) index of the subprocess
        """
        def _init():
            env = Template_Gym()
            env.seed(seed + rank)
            return env

        set_global_seeds(seed)
        return _init

    def train(self, num_e=1, n_timesteps=1000000, save='saves/agent4'):
        env_id = "default"
        num_e = 1  # Number of processes to use
        # Create the vectorized environment
        #env = DummyVecEnv([lambda: env])

        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.env = VecNormalize(self.env, norm_obs=True, norm_reward=True)
        #self.model = PPO2(policy=CnnPolicy,
        #env=SubprocVecEnv(self.env_fns),
        #n_steps=8192,
        #nminibatches=8,
        #lam=0.95,
        #gamma=0.99,
        #noptepochs=4,
        #ent_coef=0.001,
        #learning_rate=lambda _: 2e-5,
        #cliprange=lambda _: 0.2,
        #verbose=1,
        #tensorboard_log="./breakorbust")
        self.model = PPO2(CustomPolicy,
                          env=self.env,
                          verbose=0,
                          learning_rate=1e-5,
                          tensorboard_log=save)
        for i in range(10):
            self.model.learn(n_timesteps)
            self.model.save(save)

    def evaluate(self, num_env=1, num_steps=14400):
        """
        Evaluate a RL agent
        :param model: (BaseRLModel object) the RL Agent
        :param num_steps: (int) number of timesteps to evaluate it
        :return: (float) Mean reward
        """
        env_id = "default"
        num_e = 1
        self.env = SubprocVecEnv(
            [self.make_env(env_id, i) for i in range(num_e)])
        self.model = PPO2.load('saves/agent.pkl',
                               self.env,
                               policy=CustomPolicy,
                               tensorboard_log="./ppocnn/")

        episode_rewards = [[0.0] for _ in range(self.env.num_envs)]
        obs = self.env.reset()
        for i in range(num_steps):
            # _states are only useful when using LSTM policies
            actions, _states = self.model.predict(obs)
            # # here, action, rewards and dones are arrays
            # # because we are using vectorized env
            obs, rewards, dones, info = self.env.step(actions)

            # Stats
            for i in range(self.env.num_envs):
                episode_rewards[i][-1] += rewards[i]
                if dones[i]:
                    episode_rewards[i].append(0.0)

        mean_rewards = [0.0 for _ in range(self.env.num_envs)]
        n_episodes = 0
        for i in range(self.env.num_envs):
            mean_rewards[i] = np.mean(episode_rewards[i])
            n_episodes += len(episode_rewards[i])

    # Compute mean reward
        mean_reward = np.mean(mean_rewards)
        print("Mean reward:", mean_reward, "Num episodes:", n_episodes)

        return mean_reward
Example no. 28
import gym
import os

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
from stable_baselines import PPO2

from env import OsmoEnv

if __name__ == '__main__':
    env = SubprocVecEnv([lambda: OsmoEnv() for i in range(os.cpu_count())])
    # model = PPO2(MlpPolicy, env, verbose=1, learning_rate=1e-4)
    # model.learn(total_timesteps=25000)
    # model.save('PPO2_baselines')
    model = PPO2.load('PPO2_baselines')
    model.set_env(env)
    model.learning_rate = 1e-5
    model.learn(total_timesteps=30000)
    model.save('PPO2_baselines')  # keep the same name used by PPO2.load above

    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, _, done, info = env.step(action)
        else:
            print(info)
Example no. 29

# Training settings
train = False  # whether to run training
validation = True  # whether to evaluate using the trained result

#env_name = 'RoboschoolHumanoid-v1'
#env_name = 'RoboschoolWalker2d-v1'
env_name = 'Walker2DBulletEnv-v0'
num_cpu = 2  # number of CPUs used for training
learn_timesteps = 10**5  # training timesteps

ori_env = gym.make(env_name)
#env = DummyVecEnv([lambda: ori_env])
env = SubprocVecEnv([make_env(env_name, i) for i in range(num_cpu)])
env.reset()
#env.render()
#time.sleep(5)

savedir = './stable_baselines/{}/'.format(env_name)
logdir = '{}tensorboard_log/'.format(savedir)
os.makedirs(savedir, exist_ok=True)

starttime = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
# Run training
if train:
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=logdir)
    model.learn(total_timesteps=learn_timesteps)
    model.save('{}ppo2_model'.format(savedir))

endtime = datetime.now().strftime("%Y/%m/%d %H:%M:%S")
Example no. 30
if curr_idx == -1:
    model = PPO2(MlpLnLstmPolicy,
                 train_env,
                 verbose=0,
                 nminibatches=1,
                 tensorboard_log="./tensorboard",
                 **model_params)
else:
    model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' +
                      str(curr_idx) + '.pkl',
                      env=train_env)

for idx in range(curr_idx + 1, 10):
    print('[', idx, '] Training for: ', train_len, ' time steps')

    model.learn(total_timesteps=train_len)

    obs = test_env.reset()
    done, reward_sum = False, 0

    while not np.all(done):
        action, _states = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        # reward is a numpy array with one entry per env; sum them all
        reward_sum += float(np.sum(reward))

    print('[', idx, '] Total reward: ', reward_sum,
          ' (' + reward_strategy + ')')
    model.save('./agents/ppo2_' + reward_strategy + '_' + str(idx) + '.pkl')