Example #1
    def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
        train_provider, test_provider = self.data_provider.split_data_train_test(
            self.train_split_percentage)
        train_provider, validation_provider = train_provider.split_data_train_test(
            self.train_split_percentage)

        del test_provider

        train_env = DummyVecEnv([lambda: TradingEnv(train_provider)])
        validation_env = DummyVecEnv([lambda: TradingEnv(validation_provider)])

        model_params = self.optimize_agent_params(trial)
        model = self.Model(self.Policy,
                           train_env,
                           verbose=self.model_verbose,
                           nminibatches=1,
                           tensorboard_log=self.tensorboard_path,
                           **model_params)

        last_reward = -np.finfo(np.float16).max
        n_steps_per_eval = int(
            len(train_provider.data_frame) / n_prune_evals_per_trial)

        for eval_idx in range(n_prune_evals_per_trial):
            model.learn(n_steps_per_eval)

            rewards = []
            n_episodes, reward_sum = 0, 0.0

            trades = train_env.get_attr('trades')

            if len(trades[0]) < 1:
                self.logger.info(
                    f'Pruning trial for not making any trades: {eval_idx}')
                raise optuna.structs.TrialPruned()

            state = None
            obs = validation_env.reset()
            while n_episodes < n_tests_per_eval:
                action, state = model.predict(obs, state=state)
                obs, reward, done, _ = validation_env.step([action])

                reward_sum += reward[0]

                if all(done):
                    rewards.append(reward_sum)
                    reward_sum = 0.0
                    n_episodes += 1
                    obs = validation_env.reset()

            last_reward = np.mean(rewards)
            trial.report(-1 * last_reward, eval_idx)

            if trial.should_prune(eval_idx):
                raise optuna.structs.TrialPruned()

        return -1 * last_reward
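
# For context, a minimal sketch of how an objective like optimize_params might be
# driven by an Optuna study. Here `strategy` (the object exposing optimize_params),
# the study name, and the trial count are assumptions, not part of the original snippet.
import optuna

study = optuna.create_study(study_name='trading_params',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(lambda trial: strategy.optimize_params(trial), n_trials=20)
print(study.best_params)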
Example #2
if __name__ == "__main__":
    cfg = parse()
    cfg_log = ConfigLog(cfg)

    # test & train
    if cfg.test:
        cfg_log.load(CFG_FILE)
        # load data
        df_train, df_test, df_rate = load_data(cfg)
        rl_returns = []
        naked_returns = []
        covered_returns = []
        delta_returns = []
        env = DummyVecEnv([lambda: HedgeEnv(df_test, df_rate, cfg)])
        T = env.get_attr('T')[0]
        model = DDPG.load(TEST_MODEL, env=env, verbose=1)
        delta = DeltaHedge()
        for i in range(cfg.test_times):
            # rl
            env.set_attr("b_rl", True)
            obs = env.reset()  # every time, create a new transaction
            naked_returns.append(naked(env))
            covered_returns.append(covered(env))
            for t in range(T):
                action, _states = model.predict(obs)
                obs, rewards, done, info = env.step(action)
                # env.render()
            rl_returns.append(env.get_attr('final_reward')[0])
            env.env_method('restart')  # only trace back to the initial state
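        # A possible summary step (not part of the original excerpt): compare the mean
        # return of each hedging strategy collected above; assumes numpy is imported as np.
        print('RL mean return:', np.mean(rl_returns))
        print('Naked mean return:', np.mean(naked_returns))
        print('Covered mean return:', np.mean(covered_returns))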
Example #3
env = DummyVecEnv([lambda: pong])

model = DQN(MlpPolicy,
            env,
            verbose=1,
            gamma=0.95,
            tensorboard_log="./MinipongLog/")
model.learn(total_timesteps=80000)
# saving the model for future use
model.save('model/DQN_model_minipong')

# you can inspect the TensorBoard logs by running this command in bash:
# tensorboard --logdir ./MinipongLog/ --host localhost

# displaying the training reward plot
cumulative_reward_per_episode = env.get_attr(
    'running_reward_list_per_episode')[0]
plt.title('Training reward per episode')
plt.xlabel('Number of episodes')
plt.ylabel('Cumulative reward sum')
plt.plot(cumulative_reward_per_episode)
plt.show()

#<---------------------------------------------------------testing---------------------------------------------------------->

pong = Minipong(level=3, size=5)
testing_env = DummyVecEnv([lambda: pong])
model = DQN.load('model/DQN_model_minipong')
number_of_episodes = 100
for _ in range(number_of_episodes):
    done = False
    state = testing_env.reset()
    # step through one episode with the trained model
    while not done:
        action, _ = model.predict(state)
        state, reward, done, _ = testing_env.step(action)
Example #4
else:
    model = PPO2(MlpPolicy, env, verbose=0, learning_rate=learning_rate)
    print(float(1e-5) == 0.00001)
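
# The callback class used below is not defined in this excerpt. A minimal sketch of
# what it might look like is given here, assuming it follows the best-model-saving
# callback from the stable-baselines docs with an extra `env` argument, and that the
# underlying environment is wrapped in a Monitor writing episode stats to `log_dir`.
import os
import numpy as np
from stable_baselines.common.callbacks import BaseCallback
from stable_baselines.results_plotter import load_results, ts2xy


class SaveOnBestTrainingRewardCallback(BaseCallback):
    def __init__(self, env, check_freq, log_dir, verbose=1):
        super(SaveOnBestTrainingRewardCallback, self).__init__(verbose)
        self.env = env
        self.check_freq = check_freq
        self.log_dir = log_dir
        self.save_path = os.path.join(log_dir, 'best_model')
        self.best_mean_reward = -np.inf

    def _on_step(self):
        if self.n_calls % self.check_freq == 0:
            # mean reward over the most recent Monitor episodes
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
                    self.model.save(self.save_path)
        return True
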
# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(env=env,
                                            check_freq=1000,
                                            log_dir=log_dir)
# Train the agent

try:
    model.learn(total_timesteps=int(time_steps), callback=callback)
    model.save(models_dir + model_name)
except KeyboardInterrupt:
    model.save(models_dir + model_name + "_abort")
finally:
    mean_episode_reward = env.get_attr('mean_episode_reward')
    print(mean_episode_reward)
    plt.plot(mean_episode_reward[0], 'r.-', label="Mean Episode Reward(100)")
    mean_episode_length = env.get_attr('mean_episode_length')
    print(mean_episode_length)
    plt.plot(mean_episode_length[0], 'g.-', label="Mean Episode Length(100)")
    plt.legend()
    plt.show()

t_steps = list(range(len(mean_episode_reward[0])))
tp = [(np.array(t_steps), np.array(mean_episode_reward[0]))]
print(tp)
results_plotter.plot_curves(tp, 'timesteps', "TITLE")
plt.show()

# env = DummyVecEnv([lambda: ProcessorEnv(taskFile='data/example.xlsx')])
Example #5
    # ====== IMPORT MODEL ======
    # fixme - should be able to import a previous model.
    modelToUse = selectFunctionAccordingToParams('model', params.get('model'))
    policyToUse = selectFunctionAccordingToParams('policy',
                                                  params.get('policy'))
    agentsDir = join(td, 'agents')

    model = modelToUse.load(join(agentsDir, 'agentFinal.pkl'), env=testEnv)

    # ===== TEST MODEL ======
    obs, done = testEnv.reset(), False
    rewards = []
    while not done:
        action, _states = model.predict(obs)

        worthHistory = testEnv.get_attr('net_worths')
        tradeHistory = testEnv.get_attr('trades')

        obs, reward, done, _ = testEnv.step(action)
        # testEnv.render(mode="human")
        rewards.append(reward)

    print('Average reward: {}'.format(sum(rewards) / len(rewards)))

    # ===== PLOTS =====
    # We only use one environment, so take the first entry of each attribute
    # (TODO: revisit why a vectorized env is needed here)
    worthHistory = worthHistory[0]
    tradeHistory = tradeHistory[0]
    print('Size of worth history: ' + str(len(worthHistory)))
    print('Size of trade history: ' + str(len(tradeHistory)))
    bitcoinPrice = test_df['Close'].values[params.get('forecast_len'):]
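
    # A possible plotting step (not part of the original excerpt): the agent's net
    # worth against the asset's closing price over the test period; assumes
    # matplotlib.pyplot is imported as plt.
    plt.plot(worthHistory, label='Agent net worth')
    plt.plot(bitcoinPrice, label='Close price')
    plt.xlabel('Test step')
    plt.legend()
    plt.show()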
Example #6
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list], List[List[Tuple]], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2` using `battler` battler
    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: amount of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: amount of matches executed at the same time
    :return: a tuple containing (i) a tuple containing the win rate of the
    first and second players, (ii) a tuple containing the average mana curves
    of the first and second players, (iii) a tuple containing the
    `30 * games` individual draft choices of the first and second players;
    (iv) a tuple of 3-uples containing the card alternatives presented to the
    players at each of the `games` episodes; and (v) a tuple containing the
    `games` decks built by the first and second players.
    """
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler())) for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id]
                )

                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation)
                       for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increase amount of cards chosen with the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)
                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])

                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards
    all_rewards = [reward for rewards in episode_rewards
                   for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env in alternatives for turn in env]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the list of rewards to the first player's win rate
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
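
# A hypothetical invocation of run_matchup; the drafter paths, agent names, and
# parameter values below are illustrative assumptions, not taken from the original code.
if __name__ == '__main__':
    win_rates, mana_curves, choices, alternatives, decks, rewards = \
        run_matchup(drafter1='drafters/first-player.zip', drafter2='random-draft',
                    battler='random-battle', games=100, seed=42, concurrency=4)
    print('First player win rate: {:.2f}%'.format(win_rates[0]))
    print('Second player win rate: {:.2f}%'.format(win_rates[1]))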