                  verbose=1, learning_rate=1e-3,
                  policy_kwargs={'layers': [64, 64], 'reg_weight': 1e-32})

model.learn(total_timesteps=100000, log_interval=10)

obs, act = [], []
nb_rollouts, nb_steps = 25, 200
for n in range(nb_rollouts):
    _obs = np.empty((nb_steps, dm_obs))
    _act = np.empty((nb_steps, dm_act))

    x = env.reset()
    for t in range(nb_steps):
        u, _ = model.predict(x)
        _obs[t, :], _act[t, :] = x, u

        u = np.clip(u, -ulim, ulim)
        x, r, _, _ = env.step(u)

    obs.append(_obs)
    act.append(_act)

import matplotlib.pyplot as plt

fig, ax = plt.subplots(nrows=1, ncols=dm_obs + dm_act, figsize=(12, 4))
for _obs, _act in zip(obs, act):
    for k, col in enumerate(ax[:-1]):
        col.plot(_obs[:, k])
def main():
    parser = argparse.ArgumentParser(description='PPO baseline implementation')
    parser.add_argument('-e', '--experiment', type=str, default='ppo_test',
                        help='name of experiment')
    parser.add_argument('-w', '--env', type=str, default='Shepherd-v0',
                        help='name of gym environment')
    parser.add_argument('-m', '--mode', type=str, default='train',
                        help='mode to run experiment')
    parser.add_argument('-p', '--policy', type=str, default='mlp',
                        help='type of policy network')
    parser.add_argument('-t', '--timesteps', type=int, default=10000,
                        help='number of timesteps to train')
    parser.add_argument('-d', '--datapath', type=str, default='../data',
                        help='path to save results')
    args = parser.parse_args()

    mode = args.mode
    env_name = args.env
    policy = args.policy
    data_path = args.datapath
    timesteps = args.timesteps
    experiment = args.experiment

    exp_path = '{}/{}'.format(data_path, experiment)
    log_path = '{}/log_{}'.format(exp_path, timesteps)
    model_path = '{}/model_{}'.format(exp_path, timesteps)

    env = gym.make(env_name)
    env = shepherd_gym.wrappers.SamplerWrapper(env,
                                               demo_path='../data/curriculum',
                                               increment_freq=250)
    env = DummyVecEnv([lambda: env])

    if policy == 'mlp':
        policy_type = MlpPolicy
    else:
        policy_type = MlpLstmPolicy

    model = PPO2(policy_type, env, verbose=1,
                 tensorboard_log=log_path, nminibatches=1)

    if mode == 'train':
        model.learn(total_timesteps=timesteps)
        model.save(model_path)
    else:
        # load() is a classmethod that returns a new model,
        # so the result must be assigned back
        model = PPO2.load(model_path, env=env)

    env.render()
    obs = env.reset()
    for _ in range(1000):
        action, _states = model.predict(obs)
        obs, _, _, _ = env.step(action)
        env.render()

    # complete simulation
    env.close()
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from env.BitcoinTradingEnv import BitcoinTradingEnv

import pandas as pd

train_df = pd.read_csv('./datasets/bot_train_ETHBTC_700_hour.csv')
train_df = train_df.sort_values('Date')

test_df = pd.read_csv('./datasets/bot_rollout_ETHBTC_700_hour.csv')
test_df = test_df.sort_values('Date')

train_env = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, serial=True)])

model = PPO2(MlpPolicy, train_env, verbose=1, tensorboard_log="./tensorboard/")
model.learn(total_timesteps=5000)

test_env = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, serial=True)])

obs = test_env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = test_env.step(action)
    test_env.render(mode="human", title="BTC")

test_env.close()
def test_model_manipulation(model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save("./test_model")

        del model, env

        # loading
        model = model_class.load("./test_model")

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env
    finally:
        if os.path.exists("./test_model"):
            os.remove("./test_model")
test_df = df[train_len:]

test_env = DummyVecEnv([
    lambda: TradingEnv(test_df,
                       reward_func=reward_strategy,
                       forecast_len=int(params['forecast_len']),
                       confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '.pkl', env=test_env)

obs, done = test_env.reset(), False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = test_env.step(action)

    test_env.render(mode="human")
def test_model_manipulation(request, model_class):
    """
    Test if the algorithm can be loaded and saved without any issues, the environment switching
    works and that the action prediction works

    :param model_class: (BaseRLModel) A model
    """
    model_fname = './test_model_{}.zip'.format(request.node.name)

    try:
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])

        # create and train
        model = model_class(policy="MlpPolicy", env=env)
        model.learn(total_timesteps=NUM_TIMESTEPS, seed=0)

        # predict and measure the acc reward
        acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            acc_reward += reward
        acc_reward = sum(acc_reward) / N_TRIALS

        # saving
        model.save(model_fname)

        del model, env

        # loading
        model = model_class.load(model_fname)

        # changing environment (note: this can be done at loading)
        env = DummyVecEnv([lambda: IdentityEnvBox(eps=0.5)])
        model.set_env(env)

        # predict the same output before saving
        loaded_acc_reward = 0
        set_global_seeds(0)
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, reward, _, _ = env.step(action)
            loaded_acc_reward += reward
        loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS

        with pytest.warns(None) as record:
            act_prob = model.action_probability(obs)

        if model_class in [DDPG, SAC, TD3]:
            # check that only one warning was raised
            assert len(record) == 1, "No warning was raised for {}".format(model_class)
            assert act_prob is None, "Error: action_probability should be None for {}".format(model_class)
        else:
            assert act_prob[0].shape == (1, 1) and act_prob[1].shape == (1, 1), \
                "Error: action_probability not returning correct shape"

        # test action probability for given (obs, action) pair
        # must return zero and raise a warning or raise an exception if not defined
        env = model.get_env()
        obs = env.reset()
        observations = np.array([obs for _ in range(10)])
        observations = np.squeeze(observations)
        observations = observations.reshape((-1, 1))
        actions = np.array([env.action_space.sample() for _ in range(10)])

        if model_class in [DDPG, SAC, TD3]:
            with pytest.raises(ValueError):
                model.action_probability(observations, actions=actions)
        else:
            actions_probas = model.action_probability(observations, actions=actions)
            assert actions_probas.shape == (len(actions), 1), actions_probas.shape
            assert np.all(actions_probas >= 0), actions_probas
            actions_logprobas = model.action_probability(observations, actions=actions, logp=True)
            assert np.allclose(actions_probas, np.exp(actions_logprobas)), (actions_probas, actions_logprobas)

        # assert <15% diff
        assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.15, \
            "Error: the prediction seems to have changed between loading and saving"

        # learn post loading
        model.learn(total_timesteps=100, seed=0)

        # validate no reset post learning
        # This test was failing from time to time for no good reason
        # other than bad luck
        # We should change this test
        # loaded_acc_reward = 0
        # set_global_seeds(0)
        # obs = env.reset()
        # for _ in range(N_TRIALS):
        #     action, _ = model.predict(obs)
        #     obs, reward, _, _ = env.step(action)
        #     loaded_acc_reward += reward
        # loaded_acc_reward = sum(loaded_acc_reward) / N_TRIALS
        # # assert <10% diff
        # assert abs(acc_reward - loaded_acc_reward) / max(acc_reward, loaded_acc_reward) < 0.1, \
        #     "Error: the prediction seems to have changed between pre learning and post learning"

        # predict new values
        obs = env.reset()
        for _ in range(N_TRIALS):
            action, _ = model.predict(obs)
            obs, _, _, _ = env.step(action)

        # Free memory
        del model, env
    finally:
        # clean up the model file saved for this test run
        if os.path.exists(model_fname):
            os.remove(model_fname)
# The replay buffer is used to store experience, because DDPG is an off-policy algorithm.
# A target network is designed to minimize the MSBE loss.
# A target policy network computes an action which approximately maximizes Q_{\phi_{\text{targ}}}.
# An Ornstein-Uhlenbeck process is applied to add exploration noise during training,
# making DDPG policies explore better.
model = DDPG(MlpPolicy, env, verbose=1, tau=tau, gamma=gamma, batch_size=batch_size,
             actor_lr=alr, critic_lr=clr,
             param_noise=param_noise, action_noise=action_noise)

if __name__ == '__main__':
    # train
    model.learn(total_timesteps=10000)
    model.save("DDPG_baselines")

    # play
    env = OsmoEnv()
    for i in range(10):
        observation = env.reset()
        done = False
        while not done:
            action, _ = model.predict(observation)
            observation, reward, done, info = env.step(action)
            # print(reward)
        print(info)
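# --- Illustrative sketch, not part of the original script ---
# The DDPG constructor above receives a pre-built `action_noise` whose creation
# is not shown. Assuming `env` exposes a Box action space, the Ornstein-Uhlenbeck
# exploration noise mentioned in the comments could be built with stable-baselines
# as follows (the mean/sigma values here are illustrative, not from the original code):
import numpy as np
from stable_baselines.ddpg.noise import OrnsteinUhlenbeckActionNoise

n_actions = env.action_space.shape[-1]
action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                            sigma=0.2 * np.ones(n_actions))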
def test(self, model_epoch: int = 0, render_env: bool = True,
         render_report: bool = True, save_report: bool = False):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)

    del train_provider

    history_data = test_provider.historical_ohlcv()
    history_data["Day"] = history_data["Date"].apply(
        lambda x: time.strftime("%Y-%m-%d", time.localtime(x)))
    history_data["Day"] = pd.to_datetime(history_data["Day"])
    history_data.sort_values(['Day', 'Date'], ascending=[1, 0], inplace=True)

    grouped = history_data.groupby(['Day']).head(1)
    benchmark = grouped[["Day", "Close"]]
    benchmark.set_index('Day', drop=True, inplace=True)
    benchmark = benchmark.pct_change()[1:]
    self.logger.info(f"benchmark is:\n {benchmark}")

    init_envs = DummyVecEnv([make_env(test_provider) for _ in range(self.n_envs)])

    model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=init_envs)

    test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    zero_completed_obs = np.zeros((self.n_envs,) + init_envs.observation_space.shape)
    zero_completed_obs[0, :] = test_env.reset()

    state = None
    rewards = []

    for _ in range(len(test_provider.data_frame)):
        action, state = model.predict(zero_completed_obs, state=state)
        obs, reward, done, info = test_env.step([action[0]])

        zero_completed_obs[0, :] = obs

        rewards.append(reward)

        if render_env:
            test_env.render(mode='human')

        if done:
            net_worths = pd.DataFrame({
                'Date': info[0]['timestamps'],
                'Balance': info[0]['net_worths'],
            })

            net_worths.set_index('Date', drop=True, inplace=True)
            returns = net_worths.pct_change()[1:]
            self.logger.info(f"returns.Balance is:\n {returns.Balance}")

            if render_report:
                qs.plots.snapshot(returns.Balance, title='RL Trader Performance')

            if save_report:
                reports_path = path.join('data', 'reports',
                                         f'{self.study_name}__{model_epoch}.html')
                try:
                    qs.reports.html(returns.Balance,
                                    benchmark=benchmark.Close,
                                    output=reports_path)
                except Exception as e:
                    self.logger.debug('catch exception: %s\n' % e)

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}')
def run_ensemble_strategy(df, unique_trade_date, rebalance_window, validation_window) -> None:
    """Ensemble Strategy that combines PPO, A2C and DDPG"""
    print("============Start Ensemble Strategy============")
    # for the ensemble model, it's necessary to feed the last state
    # of the previous model to the current model as the initial state
    last_state_ensemble = []

    ppo_sharpe_list = []
    ddpg_sharpe_list = []
    a2c_sharpe_list = []

    model_use = []

    # based on the analysis of the in-sample data
    # turbulence_threshold = 140
    # insample_turbulence = df[(df.datadate<20151000) & (df.datadate>=20090000)]
    # end = unique_trade_date.min()
    # start = end - 10000  # df["datadate"].min()
    # insample_turbulence = df[(df.datadate<20200831) & (df.datadate>=20200101)]
    # insample_turbulence = insample_turbulence.drop_duplicates(subset=['datadate'])
    # insample_turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, .90)

    start = time.time()
    for i in range(rebalance_window + validation_window, len(unique_trade_date), rebalance_window):
        print("============================================")
        ## initial state is empty
        if i - rebalance_window - validation_window == 0:
            # initial state
            initial = True
        else:
            # previous state
            initial = False

        # Tuning the turbulence index based on historical data
        # Turbulence lookback window is one quarter
        end_date_index = df.index[df["datadate"] == unique_trade_date[
            i - rebalance_window - validation_window]].to_list()[-1]
        start_date_index = end_date_index - validation_window * 30 + 1

        # historical_turbulence = df.iloc[start_date_index:(end_date_index + 1), :]
        # historical_turbulence = df[(df.datadate<unique_trade_date[i - rebalance_window - validation_window]) & (df.datadate>=(unique_trade_date[i - rebalance_window - validation_window - 63]))]
        # historical_turbulence = historical_turbulence.drop_duplicates(subset=['datadate'])
        # historical_turbulence_mean = np.mean(historical_turbulence.turbulence.values)
        #
        # if historical_turbulence_mean > insample_turbulence_threshold:
        #     # if the mean of the historical data is greater than the 90% quantile of insample turbulence data
        #     # then we assume that the current market is volatile,
        #     # therefore we set the 90% quantile of insample turbulence data as the turbulence threshold
        #     # meaning the current turbulence can't exceed the 90% quantile of insample turbulence data
        #     turbulence_threshold = insample_turbulence_threshold
        # else:
        #     # if the mean of the historical data is less than the 90% quantile of insample turbulence data
        #     # then we tune up the turbulence_threshold, meaning we lower the risk
        #     turbulence_threshold = np.quantile(insample_turbulence.turbulence.values, 1)

        turbulence_threshold = 0.0
        print("turbulence_threshold: ", turbulence_threshold)

        ############## Environment Setup starts ##############
        ## training env
        train = data_split(df, start=20200101,
                           end=unique_trade_date[i - rebalance_window - validation_window])
        print(train)
        env_train = DummyVecEnv([lambda: StockEnvTrain(train)])

        ## validation env
        validation = data_split(df,
                                start=unique_trade_date[i - rebalance_window - validation_window],
                                end=unique_trade_date[i - rebalance_window])
        env_val = DummyVecEnv([
            lambda: StockEnvValidation(validation,
                                       turbulence_threshold=turbulence_threshold,
                                       iteration=i)
        ])
        obs_val = env_val.reset()
        ############## Environment Setup ends ##############

        ############## Training and Validation starts ##############
        print("======Model training from: ", 20200101, "to ",
              unique_trade_date[i - rebalance_window - validation_window])
        # print("training: ", len(data_split(df, start=20090000, end=test.datadate.unique()[i-rebalance_window])))
        # print("==============Model Training===========")
        print("======A2C Training========")
        model_a2c = train_A2C(env_train, model_name="A2C_30k_dow_{}".format(i), timesteps=30000)
        print("======A2C Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_a2c, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_a2c = get_validation_sharpe(i)
        print("A2C Sharpe Ratio: ", sharpe_a2c)

        print("======PPO Training========")
        model_ppo = train_PPO(env_train, model_name="PPO_100k_dow_{}".format(i), timesteps=100000)
        print("======PPO Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ppo, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_ppo = get_validation_sharpe(i)
        print("PPO Sharpe Ratio: ", sharpe_ppo)

        print("======DDPG Training========")
        model_ddpg = train_DDPG(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=10000)
        # model_ddpg = train_TD3(env_train, model_name="DDPG_10k_dow_{}".format(i), timesteps=20000)
        print("======DDPG Validation from: ",
              unique_trade_date[i - rebalance_window - validation_window], "to ",
              unique_trade_date[i - rebalance_window])
        DRL_validation(model=model_ddpg, test_data=validation, test_env=env_val, test_obs=obs_val)
        sharpe_ddpg = get_validation_sharpe(i)

        ppo_sharpe_list.append(sharpe_ppo)
        a2c_sharpe_list.append(sharpe_a2c)
        ddpg_sharpe_list.append(sharpe_ddpg)

        # Model selection based on Sharpe ratio
        if (sharpe_ppo >= sharpe_a2c) & (sharpe_ppo >= sharpe_ddpg):
            model_ensemble = model_ppo
            model_use.append('PPO')
        elif (sharpe_a2c > sharpe_ppo) & (sharpe_a2c > sharpe_ddpg):
            model_ensemble = model_a2c
            model_use.append('A2C')
        else:
            model_ensemble = model_ddpg
            model_use.append('DDPG')
        ############## Training and Validation ends ##############

        ############## Trading starts ##############
        print("======Trading from: ", unique_trade_date[i - rebalance_window], "to ", unique_trade_date[i])
        # print("Used Model: ", model_ensemble)
        last_state_ensemble = DRL_prediction(df=df, model=model_ensemble, name="ensemble",
                                             last_state=last_state_ensemble, iter_num=i,
                                             unique_trade_date=unique_trade_date,
                                             rebalance_window=rebalance_window,
                                             turbulence_threshold=turbulence_threshold,
                                             initial=initial)
        # print("============Trading Done============")
        ############## Trading ends ##############

    end = time.time()
    print("Ensemble Strategy took: ", (end - start) / 60, " minutes")
    return model_ensemble
def run_matchup(drafter1: str, drafter2: str, battler: str, games: int,
                seed: int, concurrency: int) \
        -> Tuple[Tuple[float, float], Tuple[list, list], Tuple[list, list],
                 List[List[Tuple]], Tuple[list, list], List[float]]:
    """
    Run the match-up between `drafter1` and `drafter2`, using `battler` as the battle
    agent to simulate the matches.

    :param drafter1: drafter to play as first player
    :param drafter2: drafter to play as second player
    :param battler: battler to simulate the matches
    :param games: amount of matches to simulate
    :param seed: seed used to generate the matches
    :param concurrency: amount of matches executed at the same time
    :return: a tuple containing (i) a tuple containing the win rate of the first and
        second players, (ii) a tuple containing the average mana curves of the first
        and second players, (iii) a tuple containing the `30 * games` individual draft
        choices of the first and second players; (iv) a tuple of 3-uples containing
        the card alternatives presented to the players at each of the `games` episodes;
        and (v) a tuple containing the `games` decks built by the first and second players.
    """
    # parse the battle agent
    battler = agents.parse_battle_agent(battler)

    # initialize envs
    env = [lambda: LOCMDraftEnv(battle_agents=(battler(), battler()))
           for _ in range(concurrency)]

    # wrap envs in a vectorized env
    env = DummyVecEnv(env)

    for i in range(concurrency):
        # no overlap between episodes at each process
        current_seed = seed + (games // concurrency) * i
        current_seed -= 1  # resetting the env increases the seed by 1

        # set seed to env
        env.env_method('seed', current_seed, indices=[i])

    # reset the env
    env.reset()

    # initialize first player
    if drafter1.endswith('zip'):
        current_drafter = agents.RLDraftAgent(PPO2.load(drafter1))
        current_drafter.use_history = "history" in drafter1
    else:
        current_drafter = agents.parse_draft_agent(drafter1)()

    current_drafter.seed(seed)
    current_drafter.name = drafter1
    drafter1 = current_drafter

    # initialize second player
    if drafter2.endswith('zip'):
        other_drafter = agents.RLDraftAgent(PPO2.load(drafter2))
        other_drafter.use_history = "history" in drafter2
    else:
        other_drafter = agents.parse_draft_agent(drafter2)()

    other_drafter.seed(seed)
    other_drafter.name = drafter2
    drafter2 = other_drafter

    # initialize metrics
    episodes_so_far = 0
    episode_rewards = [[0.0] for _ in range(env.num_envs)]
    drafter1.mana_curve = [0 for _ in range(13)]
    drafter2.mana_curve = [0 for _ in range(13)]
    drafter1.choices = [[] for _ in range(env.num_envs)]
    drafter2.choices = [[] for _ in range(env.num_envs)]
    drafter1.decks = [[[]] for _ in range(env.num_envs)]
    drafter2.decks = [[[]] for _ in range(env.num_envs)]
    alternatives = [[] for _ in range(env.num_envs)]

    # run the episodes
    while True:
        observations = env.get_attr('state')

        # get the current agent's action for all concurrent envs
        if isinstance(current_drafter, agents.RLDraftAgent):
            all_past_choices = env.get_attr('choices')
            new_observations = []

            for i, observation in enumerate(observations):
                new_observation = encode_state_draft(
                    observation,
                    use_history=current_drafter.use_history,
                    past_choices=all_past_choices[i][observation.current_player.id]
                )
                new_observations.append(new_observation)

            actions = current_drafter.act(new_observations)
        else:
            actions = [current_drafter.act(observation) for observation in observations]

        # log chosen cards into current agent's mana curve
        for i, (action, observation) in enumerate(zip(actions, observations)):
            # get chosen index
            try:
                chosen_index = action.origin
            except AttributeError:
                chosen_index = action

            # save choice
            current_drafter.choices[i].append(chosen_index)

            # get chosen card
            chosen_card = observation.current_player.hand[chosen_index]

            # increase amount of cards chosen with the chosen card's cost
            current_drafter.mana_curve[chosen_card.cost] += 1

            # add chosen card to this episode's deck
            current_drafter.decks[i][-1].append(chosen_card.id)

            # save card alternatives
            if observation.current_player.id == PlayerOrder.FIRST:
                alternatives[i].append(tuple(map(lambda c: c.id, observation.current_player.hand)))

        # perform the action and get the outcome
        _, rewards, dones, _ = env.step(actions)

        if isinstance(current_drafter, agents.RLDraftAgent):
            current_drafter.dones = dones

        # update metrics
        for i in range(env.num_envs):
            episode_rewards[i][-1] += rewards[i]

            if dones[i]:
                episode_rewards[i].append(0.0)

                current_drafter.decks[i].append([])
                other_drafter.decks[i].append([])

                episodes_so_far += 1

        # check exiting condition
        if episodes_so_far >= games:
            break

        # swap drafters
        current_drafter, other_drafter = other_drafter, current_drafter

    # normalize mana curves
    total_choices = sum(drafter1.mana_curve)
    drafter1.mana_curve = [freq / total_choices for freq in drafter1.mana_curve]
    drafter2.mana_curve = [freq / total_choices for freq in drafter2.mana_curve]

    # join all parallel rewards
    all_rewards = [reward for rewards in episode_rewards for reward in rewards[:-1]]

    # join all parallel choices
    drafter1.choices = [c for choices in drafter1.choices for c in choices]
    drafter2.choices = [c for choices in drafter2.choices for c in choices]

    # join all parallel decks
    drafter1.decks = [deck for decks in drafter1.decks for deck in decks if deck]
    drafter2.decks = [deck for decks in drafter2.decks for deck in decks if deck]

    # join all parallel alternatives
    alternatives = [turn for env in alternatives for turn in env]

    # cap any unsolicited data from additional episodes
    all_rewards = all_rewards[:games]
    drafter1.choices = drafter1.choices[:30 * games]
    drafter2.choices = drafter2.choices[:30 * games]
    drafter1.decks = drafter1.decks[:games]
    drafter2.decks = drafter2.decks[:games]
    alternatives = alternatives[:30 * games]

    # convert the list of rewards to the first player's win rate
    win_rate = (mean(all_rewards) + 1) * 50

    return (win_rate, 100 - win_rate), \
        (drafter1.mana_curve, drafter2.mana_curve), \
        (drafter1.choices, drafter2.choices), \
        alternatives, \
        (drafter1.decks, drafter2.decks), \
        all_rewards
                               learning_rate=args.learning_rate, epsilon=1e-05).minimize(totalLoss)

trainableParams = utils.get_vars("AllTrainableParams")
getTrainableParams = utils.flat_concat(trainableParams)
setTrainableParams = utils.assign_params_from_flat(trainableParamsFlatten, trainableParams)

# tf session initialization
init = tf.initialize_local_variables()
init2 = tf.initialize_all_variables()
sess.run([init, init2])

finishedEp = 0
evaluationNum = 0
nextObs = env.reset()
nextDone = 0
epLen = 0
epTotalRew = 0

# algorithm
for e in range(args.epochs):
    print("Epoch {} started".format(e))

    obs = np.zeros((args.epoch_len, inputLength))
    rewards = np.zeros((args.epoch_len, ))
    dones = np.zeros((args.epoch_len, ))
    predVals = np.zeros((args.epoch_len, ))
    actions = np.zeros((args.epoch_len, outputLength))
    sampledLogProb = np.zeros((args.epoch_len, ))
    returns = np.zeros((args.epoch_len, ))
def test(self, model_epoch: int = 0, render_env: bool = True,
         render_report: bool = True, save_report: bool = False):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)

    del train_provider

    init_envs = DummyVecEnv([make_env(test_provider) for _ in range(self.n_envs)])

    model_path = path.join('data', 'agents', f'{self.study_name}__{model_epoch}.pkl')
    model = self.Model.load(model_path, env=init_envs)

    test_env = DummyVecEnv([make_env(test_provider) for _ in range(1)])

    self.logger.info(f'Testing model ({self.study_name}__{model_epoch})')

    zero_completed_obs = np.zeros((self.n_envs, ) + init_envs.observation_space.shape)
    zero_completed_obs[0, :] = test_env.reset()

    state = None
    rewards = []

    for _ in range(len(test_provider.data_frame)):
        action, state = model.predict(zero_completed_obs, state=state)
        obs, reward, done, info = test_env.step([action[0]])

        zero_completed_obs[0, :] = obs

        rewards.append(reward)

        if render_env:
            test_env.render(mode='human')

        if done:
            net_worths = pd.DataFrame({
                'Date': info[0]['timestamps'],
                'Balance': info[0]['net_worths'],
            })

            net_worths.set_index('Date', drop=True, inplace=True)
            returns = net_worths.pct_change()[1:]

            if render_report:
                qs.plots.snapshot(returns.Balance, title='RL Trader Performance')

            if save_report:
                reports_path = path.join('data', 'reports',
                                         f'{self.study_name}__{model_epoch}.html')
                qs.reports.html(returns.Balance, file=reports_path)

    self.logger.info(
        f'Finished testing model ({self.study_name}__{model_epoch}): ${"{:.2f}".format(np.sum(rewards))}')
def optimize_params(self, trial, n_prune_evals_per_trial: int = 2, n_tests_per_eval: int = 1):
    train_provider, test_provider = self.data_provider.split_data_train_test(
        self.train_split_percentage)
    train_provider, validation_provider = train_provider.split_data_train_test(
        self.train_split_percentage)

    del test_provider

    train_env = DummyVecEnv([lambda: TradingEnv(train_provider)])
    validation_env = DummyVecEnv([lambda: TradingEnv(validation_provider)])

    model_params = self.optimize_agent_params(trial)
    model = self.Model(self.Policy,
                       train_env,
                       verbose=self.model_verbose,
                       nminibatches=1,
                       tensorboard_log=self.tensorboard_path,
                       **model_params)

    last_reward = -np.finfo(np.float16).max
    n_steps_per_eval = int(len(train_provider.data_frame) / n_prune_evals_per_trial)

    for eval_idx in range(n_prune_evals_per_trial):
        try:
            model.learn(n_steps_per_eval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        trades = train_env.get_attr('trades')

        if len(trades[0]) < 1:
            self.logger.info(f'Pruning trial for not making any trades: {eval_idx}')
            raise optuna.structs.TrialPruned()

        state = None
        obs = validation_env.reset()
        while n_episodes < n_tests_per_eval:
            action, state = model.predict(obs, state=state)
            obs, reward, done, _ = validation_env.step([action])

            reward_sum += reward[0]

            if all(done):
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = validation_env.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
import gym
import gym_flappy_bird
from stable_baselines.common.vec_env import DummyVecEnv

env = gym.make('flappy-bird-v0')
env = DummyVecEnv([lambda: env])
env.reset()
                          scenario='test', gpu=not args.no_gpu)
cb = NavrepEvalCallback(eval_env, test_env_fn=test_env_fn,
                        logpath=LOGPATH, savepath=MODELPATH, verbose=1)
# model = PPO2(MlpPolicy, env, verbose=0)
# S = pd.read_csv(LOGPATH, index_col=0)
cb.eval_env.episode_statistics = S
model = PPO2.load(MODELPATH)
model.set_env(env)
print(S)
model.learn(total_timesteps=TRAIN_STEPS + 1, callback=cb)
obs = env.reset()

model.save(MODELPATH)
model.save(MODELPATH2)
print("Model '{}' saved".format(MODELPATH))

del model
env.close()

model = PPO2.load(MODELPATH)

env = NavRepTrainEncodedEnv(args.backend, args.encoding, silent=True, scenario='train')
obs = env.reset()
             env=env,
             verbose=1,
             n_steps=49,
             tensorboard_log="./tensorboard_keep_trade/")
model.is_tb_set = True

for n_epoch in range(0, 1):
    summary_writer = tf.compat.v1.summary.FileWriter(
        "./tensorboard_keep/" + "Keep_trade_test_" + str(n_epoch + 1))
    print('\x1b[6;30;42m' + '************** Calculate epoch:', n_epoch,
          '**************' + '\x1b[0m')
    # save_s = model.learn(total_timesteps=20000, tb_log_name='Keep_learn')

    zero_completed_obs = np.zeros((n_cpu, ) + env.observation_space.shape)
    zero_completed_obs[0, :] = test_env.reset()
    state = None
    reward_sum = 0
    all_net_worth_np = np.array([])
    time_test = time.time()
    save = pd.DataFrame(columns=[
        'time', 'action', 'reward', 'profit', 'keep_price', 'keep_hodl',
        'net_worth', 'btc_price', 'eth_price'
    ])

    for i in range(200):
        action, states = model.predict(zero_completed_obs, state=state)
        obs, reward, done, info = test_env.step(action)
        zero_completed_obs[0, :] = obs

        keep_price = df_save['close_keep'][i]
        net_worth_log = info[0]['net_worth']
def main():
    """
    the main function
    it starts a droidbot according to the arguments given in cmd line
    """
    opts = parse_args()
    import os
    if not os.path.exists(opts.apk_path):
        print("APK does not exist.")
        return
    if not opts.output_dir and opts.cv_mode:
        print("To run in CV mode, you need to specify an output dir (using -o option).")

    if opts.distributed:
        if opts.distributed == "master":
            start_mode = "master"
        else:
            start_mode = "worker"
    else:
        start_mode = "normal"

    if start_mode == "master":
        droidmaster = DroidMaster(
            app_path=opts.apk_path,
            is_emulator=opts.is_emulator,
            output_dir=opts.output_dir,
            # env_policy=opts.env_policy,
            env_policy=env_manager.POLICY_NONE,
            policy_name=opts.input_policy,
            random_input=opts.random_input,
            script_path=opts.script_path,
            event_interval=opts.interval,
            timeout=opts.timeout,
            event_count=opts.count,
            cv_mode=opts.cv_mode,
            debug_mode=opts.debug_mode,
            keep_app=opts.keep_app,
            keep_env=opts.keep_env,
            profiling_method=opts.profiling_method,
            grant_perm=opts.grant_perm,
            enable_accessibility_hard=opts.enable_accessibility_hard,
            qemu_hda=opts.qemu_hda,
            qemu_no_graphic=opts.qemu_no_graphic,
            humanoid=opts.humanoid,
            ignore_ad=opts.ignore_ad,
            replay_output=opts.replay_output)
        droidmaster.start()
    else:
        droidbot = DroidBot(
            app_path=opts.apk_path,
            device_serial=opts.device_serial,
            is_emulator=opts.is_emulator,
            output_dir=opts.output_dir,
            # env_policy=opts.env_policy,
            env_policy=env_manager.POLICY_NONE,
            policy_name=opts.input_policy,
            random_input=opts.random_input,
            script_path=opts.script_path,
            event_interval=opts.interval,
            timeout=opts.timeout,
            event_count=opts.count,
            cv_mode=opts.cv_mode,
            debug_mode=opts.debug_mode,
            keep_app=opts.keep_app,
            keep_env=opts.keep_env,
            profiling_method=opts.profiling_method,
            grant_perm=opts.grant_perm,
            enable_accessibility_hard=opts.enable_accessibility_hard,
            master=opts.master,
            humanoid=opts.humanoid,
            ignore_ad=opts.ignore_ad,
            replay_output=opts.replay_output)
        droidbot.start()

    env = DummyVecEnv([lambda: droidbot_env.DroidBotEnv(droidbot)])
    start_time = time.time()
    env.reset()

    def events_so_state(env):
        events = env.envs[0].possible_events
        state_now = env.envs[0].device.get_current_state()
        event_ids = []
        probs = []
        for i, event in enumerate(events):
            event_str = str(type(event)) + '_' + event.get_event_str(state_now)
            if event_str in event_ids:
                1 / 0
            if event:
                event_ids.append(event_str)
                probs.append(env.envs[0].events_probs[i])
        state = state_now.state_str
        probs = np.array(probs)
        return state, probs, event_ids

    state_function = {}
    num_iterations = 1000
    EPSILON = 0.1
    Q_TABLE = []
    transitions_matrix = None
    number_of_trans = []
    event_to_id = []
    max_number_of_actions = 50

    def check_state(state_id):
        nonlocal Q_TABLE
        nonlocal transitions_matrix
        nonlocal number_of_trans
        nonlocal event_to_id
        nonlocal state_function
        # print(state_id)
        if state_function.get(state_id) is None:
            if Q_TABLE == []:
                Q_TABLE = np.zeros((1, max_number_of_actions))
                transitions_matrix = np.zeros((1, max_number_of_actions, 1))
            else:
                Q_TABLE = np.concatenate([Q_TABLE, np.zeros((1, max_number_of_actions))], axis=0)
                transition_matrix_new = np.zeros((Q_TABLE.shape[0], max_number_of_actions, Q_TABLE.shape[0]))
                transition_matrix_new[:-1, :, :-1] = transitions_matrix
                transitions_matrix = transition_matrix_new
            event_to_id.append({})
            state_function[state_id] = Q_TABLE.shape[0] - 1
            Q_TABLE[-1][-1] = 1.0
            number_of_trans.append(np.zeros(max_number_of_actions))
        # print(state_function)

    state_pre, probs, event_ids = events_so_state(env)
    check_state(state_pre)
    state = state_function[state_pre]

    def make_decision(state_i, events):
        nonlocal Q_TABLE, event_to_id
        id_to_action = np.zeros((max_number_of_actions), dtype=np.int32) + 1000
        q_values = np.zeros(max_number_of_actions)
        probs_now = np.zeros(max_number_of_actions)
        for i, event in enumerate(events):
            if i == len(events) - 1:
                q_values[-1] = Q_TABLE[state_i][-1]
                id_to_action[-1] = min(len(events), max_number_of_actions) - 1
                continue
            if event_to_id[state_i].get(event) is None:
                if len(event_to_id[state_i]) >= max_number_of_actions - 1:
                    continue
                event_to_id[state_i][event] = int(len(list(event_to_id[state_i].keys())))
                Q_TABLE[state_i][event_to_id[state_i][event]] = 1.0
            q_values[event_to_id[state_i][event]] = Q_TABLE[state_i][event_to_id[state_i][event]]
            id_to_action[event_to_id[state_i][event]] = int(i)
        if np.random.rand() < EPSILON:
            action = max_number_of_actions - 1
            make_action = id_to_action[action]
        else:
            max_q = np.max(q_values)
            actions_argmax = np.arange(max_number_of_actions)[q_values >= max_q - 0.0001]
            probs_unnormed = 1 / (np.arange(actions_argmax.shape[0]) + 1.)
            probs_unnormed /= np.sum(probs_unnormed)
            action = np.random.choice(actions_argmax)
            make_action = id_to_action[action]
        return action, make_action

    for i_step in np.arange(num_iterations):
        action, make_action = make_decision(state, event_ids)
        print(state, action, make_action)
        env.step([make_action])
        new_state_pre, probs, event_ids = events_so_state(env)
        check_state(new_state_pre)
        new_state = state_function[new_state_pre]

        number_of_trans[state][action] += 1
        transitions_matrix[state, action] *= (number_of_trans[state][action] - 1)
        transitions_matrix[state, action, new_state] += 1
        transitions_matrix[state, action] /= number_of_trans[state][action]

        for _ in np.arange(10):
            for i in np.arange(max_number_of_actions):
                transitions = transitions_matrix[:, i, :]
                q_target = np.array([[np.max(Q_TABLE[i])] for i in np.arange(Q_TABLE.shape[0])])
                new_q_values = np.matmul(transitions, q_target) * 0.99
                good_states = np.sum(transitions, axis=1) > 0.5
                if True in good_states:
                    Q_TABLE[good_states, i] = new_q_values[good_states, 0]
                else:
                    continue

        for i in np.arange(Q_TABLE.shape[0]):
            print(Q_TABLE[i])

        if i_step % 10 == 0:
            np.save('q_function', Q_TABLE)
            np.save('transition_function', transitions_matrix)
            with open('states.json', 'w') as f:
                json.dump(state_function, f)

        state = new_state

    1 / 0
    droidbot.stop()
parser.add_argument('--result_name', type=str, default='stabilize_highway',
                    help='Name of saved model')
args = parser.parse_args()
model = run_model(args.num_cpus, args.rollout_size, args.num_steps)

# Save the model to a desired folder and then delete it to demonstrate loading
if not os.path.exists(os.path.realpath(os.path.expanduser('~/baseline_results'))):
    os.makedirs(os.path.realpath(os.path.expanduser('~/baseline_results')))
path = os.path.realpath(os.path.expanduser('~/baseline_results'))
save_path = os.path.join(path, args.result_name)

print('Saving the trained model!')
model.save(save_path)
# dump the flow params
with open(os.path.join(path, args.result_name) + '.json', 'w') as outfile:
    json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)

del model
del flow_params

# Replay the result by loading the model
print('Loading the trained model and testing it out!')
model = PPO2.load(save_path)
flow_params = get_flow_params(os.path.join(path, args.result_name) + '.json')
flow_params['sim'].render = True
env_constructor = env_constructor(params=flow_params, version=0)()
env = DummyVecEnv([lambda: env_constructor])  # The algorithms require a vectorized environment to run
obs = env.reset()
reward = 0
for i in range(flow_params['env'].horizon):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    reward += rewards
print('the final reward is {}'.format(reward))
avg_rewards_bat_list_dqn = []
avg_rewards_energy_list_dqn = []
dqn_data = []

train_time_slots = 20000
t_range = 2000

'''
Myopic algorithm:
The calculation to get the next action is implemented in the environment file.
* Myopic is simply a greedy approach. We use scipy's optimization (namely
  minimize_scalar) to find the minimum of the power function corresponding to
  the current state of the environment, and we take that action.
* Myopic only optimizes for a single timeslot.
'''

# myopic
set_seed(rand_seed)
obs = env.reset()
for i in range(t_range):
    action = env.env_method('myopic_action_cal')
    obs, rewards, dones, info = env.step(action)
    rewards_list_myopic.append(1 / rewards)
    avg_rewards_myopic.append(np.mean(rewards_list_myopic[:]))
    t, bak, bat = env.render()
    rewards_time_list_myopic.append(t)
    avg_rewards_time_list_myopic.append(np.mean(rewards_time_list_myopic[:]))
    rewards_bak_list_myopic.append(bak)
    avg_rewards_bak_list_myopic.append(np.mean(rewards_bak_list_myopic[:]))
    rewards_bat_list_myopic.append(bat)
    avg_rewards_bat_list_myopic.append(np.mean(rewards_bat_list_myopic[:]))
    avg_rewards_energy_list_myopic.append(avg_rewards_bak_list_myopic[-1] + avg_rewards_bat_list_myopic[-1])
    myopic_data.append([
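# --- Illustrative sketch, not part of the original code ---
# The actual `myopic_action_cal` lives in the environment file and is not shown
# above. Assuming a hypothetical single-timeslot cost function `power_cost(action)`
# determined by the current state, the greedy (myopic) choice via scipy's
# minimize_scalar described in the docstring could look roughly like this:
from scipy.optimize import minimize_scalar


def myopic_action_example(power_cost, action_low, action_high):
    # minimize the one-timeslot power cost over the feasible action range
    result = minimize_scalar(power_cost, bounds=(action_low, action_high), method='bounded')
    return result.x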