Example #1
            verbose=1,
            learning_rate=1e-3,
            policy_kwargs={
                'layers': [64, 64],
                'reg_weight': 1e-32
            })

model.learn(total_timesteps=100000, log_interval=10)

obs, act = [], []
nb_rollouts, nb_steps = 25, 200
for n in range(nb_rollouts):
    _obs = np.empty((nb_steps, dm_obs))
    _act = np.empty((nb_steps, dm_act))

    x = env.reset()
    for t in range(nb_steps):
        u, _ = model.predict(x)
        _obs[t, :], _act[t, :] = x, u
        u = np.clip(u, -ulim, ulim)
        x, r, _, _ = env.step(u)

    obs.append(_obs)
    act.append(_act)

import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=dm_obs + dm_act, figsize=(12, 4))
for _obs, _act in zip(obs, act):
    for k, col in enumerate(ax[:-1]):
        col.plot(_obs[:, k])
Example #2
def main():
    parser = argparse.ArgumentParser(description='PPO baseline implementation')
    parser.add_argument('-e',
                        '--experiment',
                        type=str,
                        default='ppo_test',
                        help='name of experiment')
    parser.add_argument('-w',
                        '--env',
                        type=str,
                        default='Shepherd-v0',
                        help='name of gym environment')
    parser.add_argument('-m',
                        '--mode',
                        type=str,
                        default='train',
                        help='mode to run experiment')
    parser.add_argument('-p',
                        '--policy',
                        type=str,
                        default='mlp',
                        help='type of policy network')
    parser.add_argument('-t',
                        '--timesteps',
                        type=int,
                        default=10000,
                        help='number of timesteps to train')
    parser.add_argument('-d',
                        '--datapath',
                        type=str,
                        default='../data',
                        help='path to save results')
    args = parser.parse_args()

    mode = args.mode
    env_name = args.env
    policy = args.policy
    data_path = args.datapath
    timesteps = args.timesteps
    experiment = args.experiment

    exp_path = '{}/{}'.format(data_path, experiment)
    log_path = '{}/log_{}'.format(exp_path, timesteps)
    model_path = '{}/model_{}'.format(exp_path, timesteps)

    env = gym.make(env_name)
    env = shepherd_gym.wrappers.SamplerWrapper(env,
                                               demo_path='../data/curriculum',
                                               increment_freq=250)
    env = DummyVecEnv([lambda: env])

    if policy == 'mlp':
        policy_type = MlpPolicy
    else:
        policy_type = MlpLstmPolicy

    model = PPO2(policy_type,
                 env,
                 verbose=1,
                 tensorboard_log=log_path,
                 nminibatches=1)

    if mode == 'train':
        model.learn(total_timesteps=timesteps)
        model.save(model_path)
    else:
        model = PPO2.load(model_path, env=env)

    env.render()
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, _, _, _ = env.step(action)
        env.render()

    # complete simulation
    env.close()
Example #3
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from env.BitcoinTradingEnv import BitcoinTradingEnv

import pandas as pd

train_df = pd.read_csv('./datasets/bot_train_ETHBTC_700_hour.csv')
train_df = train_df.sort_values('Date')

test_df = pd.read_csv('./datasets/bot_rollout_ETHBTC_700_hour.csv')
test_df = test_df.sort_values('Date')

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = PPO2(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=5000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(mode="human", title="BTC")

test_env.close()
Example #4
def test_model_manipulation(model_class):
    """
    Test that the algorithm can be saved and loaded without issues, that the environment
    switching works, and that action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)
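        # equivalently, the new environment could be supplied directly at load time:
        # model = model_class.load("./test_model", env=env)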

        # check that predictions match those made before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
Example #5
test_df = df[train_len:]

test_env = DummyVecEnv([
    lambda: TradingEnv(test_df,
                       reward_func=reward_strategy,
                       forecast_len=int(params['forecast_len']),
                       confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) +
                  '.pkl',
                  env=test_env)

obs, done = test_env.reset(), False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = test_env.step(action)

    test_env.render(mode="human")
Example #6
def test_model_manipulation(request, model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save(model_fname)

        del model, env

        # loading
        model = model_class.load(model_fname)

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # check that predictions match those made before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC, TD3]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(
                model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(
                model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class in [DDPG, SAC, TD3]:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            actions_probas = model.action_probability(observations,
                                                      actions=actions)
            assert actions_probas.shape == (len(actions),
                                            1), actions_probas.shape
            assert np.all(actions_probas >= 0), actions_probas
            actions_logprobas = model.action_probability(observations,
                                                         actions=actions,
                                                         logp=True)
            assert np.allclose(actions_probas,
                               np.exp(actions_logprobas)), (actions_probas,
                                                            actions_logprobas)

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env

    finally:
        if os.path.exists(model_fname):
            os.remove(model_fname)
Example #7
# The replay buffer is used to store experience, because DDPG is an off-policy algorithm.
# A target network is designed to minimize the MSBE loss.
# A target policy network is used to compute an action that approximately maximizes Q_{\phi_{\text{targ}}}.
# An Ornstein-Uhlenbeck process is applied to add exploration noise during training so that DDPG policies explore better.
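# A minimal sketch (not part of the original snippet) of how the noise objects
# passed to DDPG below could be built; the sigma/stddev values are assumptions.
import numpy as np
from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise

n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.2 * np.ones(n_actions))
param_noise = AdaptiveParamNoiseSpec(initial_stddev=0.1, desired_action_stddev=0.1)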

model = DDPG(MlpPolicy,
             env,
             verbose=1,
             tau=tau,
             gamma=gamma,
             batch_size=batch_size,
             actor_lr=alr,
             critic_lr=clr,
             param_noise=param_noise,
             action_noise=action_noise)

if __name__ == '__main__':
    # train
    model.learn(total_timesteps=10000)
    model.save("DDPG_baselines")

    # play
    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, reward, done, info = env.step(action)
            # print(reward)
        print(info)
Example #8
    def test(self, model_epoch: int = 0, render_env: bool = True, render_report: bool = True, save_report: bool = False):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)

        del train_provider

        history_data = test_provider.historical_ohlcv()
        history_data["Day"] = history_data["Date"].apply(
            lambda x: time.strftime("%Y-%m-%d", time.localtime(x)))
        history_data["Day"] = pd.to_datetime(history_data["Day"])
        history_data.sort_values(
            ['Day', 'Date'], ascending=[1, 0], inplace=True)
        grouped = history_data.groupby(['Day']).head(1)
        benchmark = grouped[["Day", "Close"]]
        benchmark.set_index('Day', drop=True, inplace=True)
        benchmark = benchmark.pct_change()[1:]
        self.logger.info(f"benchmark is:\n {benchmark}")

        init_envs = DummyVecEnv([make_env(test_provider)
                                 for _ in range(self.n_envs)])

        model_path = path.join(
            'data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=init_envs)

        test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        zero_completed_obs = np.zeros(
            (self.n_envs,) + init_envs.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()

        state = None
        rewards = []

        for _ in range(len(test_provider.data_frame)):
            action, state = model.predict(zero_completed_obs, state=state)
            obs, reward, done, info = test_env.step([action[0]])

            zero_completed_obs[0, :] = obs

            rewards.append(reward)

            if render_env:
                test_env.render(mode='human')

            if done:
                net_worths = pd.DataFrame({
                    'Date': info[0]['timestamps'],
                    'Balance': info[0]['net_worths'],
                })

                net_worths.set_index('Date', drop=True, inplace=True)
                returns = net_worths.pct_change()[1:]
                self.logger.info(f"returns.Balance is:\n {returns.Balance}")

                if render_report:
                    qs.plots.snapshot(
                        returns.Balance, title='RL Trader Performance')

                if save_report:
                    reports_path = path.join(
                        'data', 'reports', f'{self.study_name}__{model_epoch}.html')
                    try:
                        qs.reports.html(
                            returns.Balance, benchmark=benchmark.Close, output=reports_path)
                    except Exception as e:
                        self.logger.debug('catch exception: %s\n' % e)

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}')
Example #9
def run_ensemble_strategy(df, unique_trade_date, rebalance_window,
                          validation_window):
    """Ensemble Strategy that combines PPO, A2C and DDPG"""
    print("============Start Ensemble Strategy============")
    # for ensemble model, it's necessary to feed the last state
    # of the previous model to the current model as the initial state
    last_state_ensemble = []

    ppo_sharpe_list = []
    ddpg_sharpe_list = []
    a2c_sharpe_list = []

    model_use = []

    # based on the analysis of the in-sample data
    #turbulence_threshold = 140
    # insample_turbulence = df[(df.datadate<20151000) & (df.datadate>=20090000)]
    # end = unique_trade_date.min()
    # start = end - 10000 # df["datadate"].min()
    # insample_turbulence = df[(df.datadate<20200831) & (df.datadate>=20200101)]
    # insample_turbulence = insample_turbulence.drop_duplicates(subset=['datadate'])
    # insample_turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, .90)

    start = time.time()
    for i in range(rebalance_window + validation_window,
                   len(unique_trade_date), rebalance_window):
        print("============================================")
        ## initial state is empty
        if i - rebalance_window - validation_window == 0:
            # initial state
            initial = True
        else:
            # previous state
            initial = False

        # Tuning turbulence index based on historical data
        # Turbulence lookback window is one quarter
        end_date_index = df.index[df["datadate"] == unique_trade_date[
            i - rebalance_window - validation_window]].to_list()[-1]
        start_date_index = end_date_index - validation_window * 30 + 1

        # historical_turbulence = df.iloc[start_date_index:(end_date_index + 1), :]
        #historical_turbulence = df[(df.datadate<unique_trade_date[i - rebalance_window - validation_window]) & (df.datadate>=(unique_trade_date[i - rebalance_window - validation_window - 63]))]

        # historical_turbulence = historical_turbulence.drop_duplicates(subset=['datadate'])
        #
        # historical_turbulence_mean = np.mean(historical_turbulence.turbulence.values)
        #
        # if historical_turbulence_mean > insample_turbulence_threshold:
        #     # if the mean of the historical data is greater than the 90% quantile of insample turbulence data
        #     # then we assume that the current market is volatile,
        #     # therefore we set the 90% quantile of insample turbulence data as the turbulence threshold
        #     # meaning the current turbulence can't exceed the 90% quantile of insample turbulence data
        #     turbulence_threshold = insample_turbulence_threshold
        # else:
        #     # if the mean of the historical data is less than the 90% quantile of insample turbulence data
        #     # then we tune up the turbulence_threshold, meaning we lower the risk
        #     turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, 1)

        turbulence_threshold = 0.0
        print("turbulence_threshold: ", turbulence_threshold)

        ############## Environment Setup starts ##############
        ## training env
        train = data_split(df,
                           start=20200101,
                           end=unique_trade_date[i - rebalance_window -
                                                 validation_window])
        print(train)
        env_train = DummyVecEnv([lambda: StockEnvTrain(train)])

        ## validation env
        validation = data_split(df,
                                start=unique_trade_date[i - rebalance_window -
                                                        validation_window],
                                end=unique_trade_date[i - rebalance_window])
        env_val = DummyVecEnv([
            lambda: StockEnvValidation(validation,
                                       turbulence_threshold=
                                       turbulence_threshold,
                                       iteration=i)
        ])
        obs_val = env_val.reset()
        ############## Environment Setup ends ##############

        ############## Training and Validation starts ##############
        print("======Model training from: ", 20200101, "to ",
              unique_trade_date[i - rebalance_window - validation_window])
        # print("training: ",len(data_split(df, start=20090000, end=test.datadate.unique()[i-rebalance_window]) ))
        # print("==============Model Training===========")
        print("======A2C Training========")
        model_a2c = train_A2C(env_train,
                              model_name="A2C_30k_dow_{}".format(i),
                              timesteps=30000)
        print("======A2C Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window],
              "to ", unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_a2c,
                       test_data=validation,
                       test_env=env_val,
                       test_obs=obs_val)
        sharpe_a2c = get_validation_sharpe(i)
        print("A2C Sharpe Ratio: ", sharpe_a2c)

        print("======PPO Training========")
        model_ppo = train_PPO(env_train,
                              model_name="PPO_100k_dow_{}".format(i),
                              timesteps=100000)
        print("======PPO Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window],
              "to ", unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ppo,
                       test_data=validation,
                       test_env=env_val,
                       test_obs=obs_val)
        sharpe_ppo = get_validation_sharpe(i)
        print("PPO Sharpe Ratio: ", sharpe_ppo)

        print("======DDPG Training========")
        model_ddpg = train_DDPG(env_train,
                                model_name="DDPG_10k_dow_{}".format(i),
                                timesteps=10000)
        #model_ddpg = train_TD3(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=20000)
        print("======DDPG Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window],
              "to ", unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ddpg,
                       test_data=validation,
                       test_env=env_val,
                       test_obs=obs_val)
        sharpe_ddpg = get_validation_sharpe(i)

        ppo_sharpe_list.append(sharpe_ppo)
        a2c_sharpe_list.append(sharpe_a2c)
        ddpg_sharpe_list.append(sharpe_ddpg)

        # Model Selection based on sharpe ratio
        if (sharpe_ppo >= sharpe_a2c) & (sharpe_ppo >= sharpe_ddpg):
            model_ensemble = model_ppo
            model_use.append('PPO')
        elif (sharpe_a2c > sharpe_ppo) & (sharpe_a2c > sharpe_ddpg):
            model_ensemble = model_a2c
            model_use.append('A2C')
        else:
            model_ensemble = model_ddpg
            model_use.append('DDPG')
        ############## Training and Validation ends ##############

        ############## Trading starts ##############
        print("======Trading from: ", unique_trade_date[i - rebalance_window],
              "to ", unique_trade_date[i])
        #print("Used Model: ", model_ensemble)
        last_state_ensemble = DRL_prediction(
            df=df,
            model=model_ensemble,
            name="ensemble",
            last_state=last_state_ensemble,
            iter_num=i,
            unique_trade_date=unique_trade_date,
            rebalance_window=rebalance_window,
            turbulence_threshold=turbulence_threshold,
            initial=initial)
        # print("============Trading Done============")
        ############## Trading ends ##############

    end = time.time()
    print("Ensemble Strategy took: ", (end - start) / 60, " minutes")
    return model_ensemble
Example #10
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list], List[List[Tuple]], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2` using the `battler` battle agent
    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: amount of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: amount of matches executed at the same time
    :return: a tuple containing (i) a tuple with the win rates of the
    first and second players, (ii) a tuple with the average mana curves
    of the first and second players, (iii) a tuple with the
    `30 * games` individual draft choices of the first and second players,
    (iv) a list of 3-tuples with the card alternatives presented to the
    players at each draft turn, (v) a tuple with the `games` decks built
    by the first and second players, and (vi) the list of per-episode
    rewards from the first player's perspective.
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler())) for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1
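        # e.g. with seed=1000, games=100 and concurrency=4 (hypothetical values),
        # the four envs are seeded with 999, 1024, 1049 and 1074, so the
        # games // concurrency episodes played in each env never share a seed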

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id]
                )

                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation)
                       for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increase amount of cards chosen with the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)
                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])

                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards
    all_rewards = [reward for rewards in episode_rewards
                   for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env in alternatives for turn in env]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the list of rewards to the first player's win rate
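    # (assuming per-episode rewards of +1 for a first-player win and -1 for a
    #  loss, the mean lies in [-1, 1] and maps linearly onto a 0-100% win rate)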
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
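# Hedged usage sketch (not from the original file): the drafter and battler
# names below are placeholders; a path ending in '.zip' would instead be
# loaded as a trained PPO2 draft agent.
# win_rates, mana_curves, choices, alternatives, decks, rewards = run_matchup(
#     drafter1='random', drafter2='trained-drafter.zip', battler='random',
#     games=100, seed=42, concurrency=4)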
Example #11
            learning_rate=args.learning_rate,
            epsilon=1e-05).minimize(totalLoss)

    trainableParams = utils.get_vars("AllTrainableParams")
    getTrainableParams = utils.flat_concat(trainableParams)
    setTrainableParams = utils.assign_params_from_flat(trainableParamsFlatten,
                                                       trainableParams)

    #tf session initialization
    init = tf.initialize_local_variables()
    init2 = tf.initialize_all_variables()
    sess.run([init, init2])

    finishedEp = 0
    evaluationNum = 0
    nextObs = env.reset()
    nextDone = 0
    epLen = 0
    epTotalRew = 0

    #algorithm
    for e in range(args.epochs):
        print("Epoch {} started".format(e))

        obs = np.zeros((args.epoch_len, inputLength))
        rewards = np.zeros((args.epoch_len, ))
        dones = np.zeros((args.epoch_len, ))
        predVals = np.zeros((args.epoch_len, ))
        actions = np.zeros((args.epoch_len, outputLength))
        sampledLogProb = np.zeros((args.epoch_len, ))
        returns = np.zeros((args.epoch_len, ))
Example #12
    def test(self,
             model_epoch: int = 0,
             render_env: bool = True,
             render_report: bool = True,
             save_report: bool = False):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)

        del train_provider

        init_envs = DummyVecEnv(
            [make_env(test_provider) for _ in range(self.n_envs)])

        model_path = path.join('data', 'agents',
                               f'{self.study_name}__{model_epoch}.pkl')
        model = self.Model.load(model_path, env=init_envs)

        test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

        self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

        zero_completed_obs = np.zeros((self.n_envs, ) +
                                      init_envs.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()

        state = None
        rewards = []

        for _ in range(len(test_provider.data_frame)):
            action, state = model.predict(zero_completed_obs, state=state)
            obs, reward, done, info = test_env.step([action[0]])

            zero_completed_obs[0, :] = obs

            rewards.append(reward)

            if render_env:
                test_env.render(mode='human')

            if done:
                net_worths = pd.DataFrame({
                    'Date': info[0]['timestamps'],
                    'Balance': info[0]['net_worths'],
                })

                net_worths.set_index('Date', drop=True, inplace=True)
                returns = net_worths.pct_change()[1:]

                if render_report:
                    qs.plots.snapshot(returns.Balance,
                                      title='RL Trader Performance')

                if save_report:
                    reports_path = path.join(
                        'data', 'reports',
                        f'{self.study_name}__{model_epoch}.html')
                    qs.reports.html(returns.Balance, file=reports_path)

        self.logger.info(
            f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}'
        )
Example #13
    def optimize_params(self,
                        trial,
                        n_prune_evals_per_trial: int = 2,
                        n_tests_per_eval: int = 1):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)
        train_provider, validation_provider = train_provider.split_data_train_test(
            self.train_split_percentage)

        del test_provider

        train_env = DummyVecEnv([lambda: TradingEnv(train_provider)])
        validation_env = DummyVecEnv([lambda: TradingEnv(validation_provider)])

        model_params = self.optimize_agent_params(trial)
        model = self.Model(self.Policy,
                           train_env,
                           verbose=self.model_verbose,
                           nminibatches=1,
                           tensorboard_log=self.tensorboard_path,
                           **model_params)

        last_reward = -np.finfo(np.float16).max
        n_steps_per_eval = int(
            len(train_provider.data_frame) / n_prune_evals_per_trial)

        for eval_idx in range(n_prune_evals_per_trial):
            try:
                model.learn(n_steps_per_eval)
            except AssertionError:
                raise

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            trades = train_env.get_attr('trades')

            if len(trades[0]) < 1:
                self.logger.info(
                    f'Pruning trial for not making any trades: {eval_idx}')
                raise optuna.structs.TrialPruned()

            state = None
            obs = validation_env.reset()
            while n_episodes < n_tests_per_eval:
                action, state = model.predict(obs, state=state)
                obs, reward, done, _ = validation_env.step([action])

                reward_sum += reward[0]

                if all(done):
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = validation_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward
Example #14
import gym
import gym_flappy_bird
from stable_baselines.common.vec_env import DummyVecEnv

env = gym.make('flappy-bird-v0')
env = DummyVecEnv([lambda: env])
env.reset()

Example #15
                                     scenario='test',
                                     gpu=not args.no_gpu)

    cb = NavrepEvalCallback(eval_env,
                            test_env_fn=test_env_fn,
                            logpath=LOGPATH,
                            savepath=MODELPATH,
                            verbose=1)
    #     model = PPO2(MlpPolicy, env, verbose=0)  #
    S = pd.read_csv(LOGPATH, index_col=0)
    cb.eval_env.episode_statistics = S
    model = PPO2.load(MODELPATH)
    model.set_env(env)
    print(S)
    model.learn(total_timesteps=TRAIN_STEPS + 1, callback=cb)
    obs = env.reset()

    model.save(MODELPATH)
    model.save(MODELPATH2)
    print("Model '{}' saved".format(MODELPATH))

    del model
    env.close()

    model = PPO2.load(MODELPATH)

    env = NavRepTrainEncodedEnv(args.backend,
                                args.encoding,
                                silent=True,
                                scenario='train')
    obs = env.reset()
Example #16
                      env=env,
                      verbose=1,
                      n_steps=49,
                      tensorboard_log="./tensorboard_keep_trade/")
    model.is_tb_set = True

    for n_epoch in range(0, 1):
        summary_writer = tf.compat.v1.summary.FileWriter(
            "./tensorboard_keep/" + "Keep_trade_test_" + str(n_epoch + 1))
        print('\x1b[6;30;42m' + '**************  Calculate epoch:', n_epoch,
              '**************' + '\x1b[0m')

        # save_s = model.learn(total_timesteps=20000, tb_log_name='Keep_learn')

        zero_completed_obs = np.zeros((n_cpu, ) + env.observation_space.shape)
        zero_completed_obs[0, :] = test_env.reset()
        state = None
        reward_sum = 0
        all_net_worth_np = np.array([])
        time_test = time.time()
        save = pd.DataFrame(columns=[
            'time', 'action', 'reward', 'profit', 'keep_price', 'keep_hodl',
            'net_worth', 'btc_price', 'eth_price'
        ])

        for i in range(200):
            action, states = model.predict(zero_completed_obs, state=state)
            obs, reward, done, info = test_env.step(action)
            zero_completed_obs[0, :] = obs
            keep_price = df_save['close_keep'][i]
            net_worth_log = info[0]['net_worth']
Example #17
def main():
    """
    the main function
    it starts a droidbot according to the arguments given in cmd line
    """

    opts = parse_args()
    import os
    if not os.path.exists(opts.apk_path):
        print("APK does not exist.")
        return
    if not opts.output_dir and opts.cv_mode:
        print("To run in CV mode, you need to specify an output dir (using -o option).")

    if opts.distributed:
        if opts.distributed == "master":
            start_mode = "master"
        else:
            start_mode = "worker"
    else:
        start_mode = "normal"

    if start_mode == "master":
        droidmaster = DroidMaster(
            app_path=opts.apk_path,
            is_emulator=opts.is_emulator,
            output_dir=opts.output_dir,
            # env_policy=opts.env_policy,
            env_policy=env_manager.POLICY_NONE,
            policy_name=opts.input_policy,
            random_input=opts.random_input,
            script_path=opts.script_path,
            event_interval=opts.interval,
            timeout=opts.timeout,
            event_count=opts.count,
            cv_mode=opts.cv_mode,
            debug_mode=opts.debug_mode,
            keep_app=opts.keep_app,
            keep_env=opts.keep_env,
            profiling_method=opts.profiling_method,
            grant_perm=opts.grant_perm,
            enable_accessibility_hard=opts.enable_accessibility_hard,
            qemu_hda=opts.qemu_hda,
            qemu_no_graphic=opts.qemu_no_graphic,
            humanoid=opts.humanoid,
            ignore_ad=opts.ignore_ad,
            replay_output=opts.replay_output)
        droidmaster.start()
    else:
        droidbot = DroidBot(
            app_path=opts.apk_path,
            device_serial=opts.device_serial,
            is_emulator=opts.is_emulator,
            output_dir=opts.output_dir,
            # env_policy=opts.env_policy,
            env_policy=env_manager.POLICY_NONE,
            policy_name=opts.input_policy,
            random_input=opts.random_input,
            script_path=opts.script_path,
            event_interval=opts.interval,
            timeout=opts.timeout,
            event_count=opts.count,
            cv_mode=opts.cv_mode,
            debug_mode=opts.debug_mode,
            keep_app=opts.keep_app,
            keep_env=opts.keep_env,
            profiling_method=opts.profiling_method,
            grant_perm=opts.grant_perm,
            enable_accessibility_hard=opts.enable_accessibility_hard,
            master=opts.master,
            humanoid=opts.humanoid,
            ignore_ad=opts.ignore_ad,
            replay_output=opts.replay_output)

        droidbot.start()

    env = DummyVecEnv([lambda: droidbot_env.DroidBotEnv(droidbot)])
    start_time = time.time()
    env.reset()

    def events_so_state(env):
        events = env.envs[0].possible_events
        state_now = env.envs[0].device.get_current_state()
        event_ids = []
        probs = []

        for i, event in enumerate(events):
            event_str = str(type(event)) + '_' + event.get_event_str(state_now)
            if event_str in event_ids:
                raise ValueError('duplicate event string: ' + event_str)
            if event:
                event_ids.append(event_str)
                probs.append(env.envs[0].events_probs[i])
        state = state_now.state_str
        probs = np.array(probs)
        return state, probs, event_ids

    state_function = {}
    num_iterations = 1000
    EPSILON = 0.1
    Q_TABLE = []
    transitions_matrix = None
    number_of_trans = []
    event_to_id = []
    max_number_of_actions = 50

    def check_state(state_id):
        nonlocal Q_TABLE
        nonlocal transitions_matrix
        nonlocal number_of_trans
        nonlocal event_to_id
        nonlocal state_function
        #print(state_id)
        if state_function.get(state_id) is None:
            if len(Q_TABLE) == 0:
                Q_TABLE = np.zeros((1, max_number_of_actions))
                transitions_matrix = np.zeros((1, max_number_of_actions, 1))
            else:
                Q_TABLE = np.concatenate([Q_TABLE, np.zeros((1, max_number_of_actions))], axis=0)
                transition_matrix_new = np.zeros((Q_TABLE.shape[0], max_number_of_actions, Q_TABLE.shape[0]))
                transition_matrix_new[:-1, :, :-1] = transitions_matrix
                transitions_matrix = transition_matrix_new
            event_to_id.append({})
            state_function[state_id] = Q_TABLE.shape[0] - 1
            Q_TABLE[-1][-1] = 1.0
            number_of_trans.append(np.zeros(max_number_of_actions))
        #print(state_function)
    state_pre, probs, event_ids = events_so_state(env)
    check_state(state_pre)
    state = state_function[state_pre]

    def make_decision(state_i, events):
        nonlocal Q_TABLE, event_to_id
        id_to_action = np.zeros((max_number_of_actions), dtype=np.int32) + 1000
        q_values = np.zeros(max_number_of_actions)
        probs_now = np.zeros(max_number_of_actions)

        for i, event in enumerate(events):
            if i == len(events) - 1:
                q_values[-1] = Q_TABLE[state_i][-1]
                id_to_action[-1] = min(len(events), max_number_of_actions) - 1
                continue
            if event_to_id[state_i].get(event) is None:
                if len(event_to_id[state_i]) >= max_number_of_actions - 1:
                    continue
                event_to_id[state_i][event] = int(len(list(event_to_id[state_i].keys())))
                Q_TABLE[state_i][event_to_id[state_i][event]] = 1.0
            q_values[event_to_id[state_i][event]] = Q_TABLE[state_i][event_to_id[state_i][event]]

            id_to_action[event_to_id[state_i][event]] = int(i)


        if np.random.rand() < EPSILON:
            action = max_number_of_actions - 1
            make_action = id_to_action[action]
        else:
            max_q = np.max(q_values)
            actions_argmax = np.arange(max_number_of_actions)[q_values >= max_q - 0.0001]
            probs_unnormed = 1/(np.arange(actions_argmax.shape[0]) + 1.)
            probs_unnormed /= np.sum(probs_unnormed)
            action = np.random.choice(actions_argmax)
            make_action = id_to_action[action]
        return action, make_action

    for i_step in np.arange(num_iterations):
        action, make_action = make_decision(state, event_ids)
        print(state, action, make_action)
        env.step([make_action])
        new_state_pre, probs, event_ids = events_so_state(env)

        check_state(new_state_pre)
        new_state = state_function[new_state_pre]

        number_of_trans[state][action] += 1
        transitions_matrix[state, action] *= (number_of_trans[state][action] - 1)
        transitions_matrix[state, action, new_state] += 1
        transitions_matrix[state, action] /= number_of_trans[state][action]
        for _ in np.arange(10):
            for i in np.arange(max_number_of_actions):
                transitions = transitions_matrix[:, i, :]
                q_target = np.array([[np.max(Q_TABLE[i])] for i in np.arange(Q_TABLE.shape[0])])
                new_q_values = np.matmul(transitions, q_target) * 0.99
                good_states = np.sum(transitions, axis=1) > 0.5
                if True in good_states:
                    Q_TABLE[good_states, i] = new_q_values[good_states, 0]
                else:
                    continue
        for i in np.arange(Q_TABLE.shape[0]):
            print(Q_TABLE[i])
        if i_step % 10 == 0:
            np.save('q_function', Q_TABLE)
            np.save('transition_function', transitions_matrix)
            with open('states.json', 'w') as f:
                json.dump(state_function, f)
        state = new_state
    1/0
    droidbot.stop()
Example #18
    parser.add_argument('--result_name', type=str, default='stabilize_highway', help='Name of saved model')
    args = parser.parse_args()
    model = run_model(args.num_cpus, args.rollout_size, args.num_steps)
    # Save the model to a desired folder and then delete it to demonstrate loading
    if not os.path.exists(os.path.realpath(os.path.expanduser('~/baseline_results'))):
        os.makedirs(os.path.realpath(os.path.expanduser('~/baseline_results')))
    path = os.path.realpath(os.path.expanduser('~/baseline_results'))
    save_path = os.path.join(path, args.result_name)
    print('Saving the trained model!')
    model.save(save_path)
    # dump the flow params
    with open(os.path.join(path, args.result_name) + '.json', 'w') as outfile:
        json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)
    del model
    del flow_params

    # Replay the result by loading the model
    print('Loading the trained model and testing it out!')
    model = PPO2.load(save_path)
    flow_params = get_flow_params(os.path.join(path, args.result_name) + '.json')
    flow_params['sim'].render = True
    env_constructor = env_constructor(params=flow_params, version=0)()
    env = DummyVecEnv([lambda: env_constructor])  # The algorithms require a vectorized environment to run
    obs = env.reset()
    reward = 0
    for i in range(flow_params['env'].horizon):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        reward += rewards
    print('the final reward is {}'.format(reward))
Example #19
File: agent.py Project: MS1908/ppo
avg_rewards_bat_list_dqn = []
avg_rewards_energy_list_dqn = []
dqn_data = []
train_time_slots = 20000
t_range = 2000
'''
    Myopic algorithm: the calculation of the next action is implemented in the environment file.
    * Myopic is simply a greedy approach. We use scipy's optimization (namely minimize_scalar)
      to find the minimum of the power function corresponding to the current state of the environment,
      and we take that action.
    * Myopic only optimizes for a single timeslot.
'''
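# A minimal sketch (assumptions only; the real computation lives in the
# environment's myopic_action_cal method) of the greedy single-timeslot
# choice described above, using scipy's bounded minimize_scalar:
from scipy.optimize import minimize_scalar

def myopic_action_sketch(power_fn, action_low, action_high):
    # power_fn is a hypothetical callable mapping an action to the power/cost
    # of the current timeslot; the bounded search returns the greedy action
    result = minimize_scalar(power_fn, bounds=(action_low, action_high),
                             method='bounded')
    return result.x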

#myopic
set_seed(rand_seed)
obs = env.reset()
for i in range(t_range):
    action = env.env_method('myopic_action_cal')
    obs, rewards, dones, info = env.step(action)
    rewards_list_myopic.append(1 / rewards)
    avg_rewards_myopic.append(np.mean(rewards_list_myopic[:]))
    t, bak, bat = env.render()
    rewards_time_list_myopic.append(t)
    avg_rewards_time_list_myopic.append(np.mean(rewards_time_list_myopic[:]))
    rewards_bak_list_myopic.append(bak)
    avg_rewards_bak_list_myopic.append(np.mean(rewards_bak_list_myopic[:]))
    rewards_bat_list_myopic.append(bat)
    avg_rewards_bat_list_myopic.append(np.mean(rewards_bat_list_myopic[:]))
    avg_rewards_energy_list_myopic.append(avg_rewards_bak_list_myopic[-1] +
                                          avg_rewards_bat_list_myopic[-1])
    myopic_data.append([