# (continued: trailing keyword arguments of a DQN constructor whose opening lines are not shown)
            learning_rate=0.3, exploration_fraction=0.2, double_q=True, verbose=0,
            tensorboard_log="gym_ouput/" + name + "/log/")
model.setup_model()

# note: in stable-baselines, load() is a classmethod that returns a new model,
# so the usual pattern is to reassign its result
if start_value > 0:
    try:
        model.load("gym_ouput/" + name + "/it" + str(start_value + 1), env=env)
        print("\n\nOBS! this is not the latest NN load point\n\n")
    except:
        try:
            model.load("gym_ouput/" + name + "/it" + str(start_value), env=env)
        except:
            print("\n\nOBS! invalid load point\n\n")

print("obs space: " + str(model.observation_space))
print("act space: " + str(model.action_space))

i = 1
while True:
    save_name = "gym_ouput/" + name + "/it" + str(i + start_value)
    model.learn(total_timesteps=int(8e3), tb_log_name="log", reset_num_timesteps=False)
    model.save(save_name)
    i += 1
    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
    return mean_episode_reward


# TEST 1
kwargs = {"double_q": False, "prioritized_replay": False, "policy_kwargs": dict(dueling=False)}
dqn_model = DQN('MlpPolicy', 'CartPole-v1', verbose=1, **kwargs)

# before training
mean_reward_before_training = evaluate(dqn_model, num_episodes=100)

# after training
dqn_model.learn(total_timesteps=10000, log_interval=10)
mean_reward = evaluate(dqn_model, num_episodes=100)
# Result: Mean reward: 228.27 Num episodes: 100

# TEST 2
# kwargs = {"double_q": False, "prioritized_replay": True, "policy_kwargs": dict(dueling=False)}
# dqn_model = DQN('MlpPolicy', 'CartPole-v1', verbose=1, **kwargs)
# # before training
# mean_reward_before_training_prioritized = evaluate(dqn_model, num_episodes=100)
# # after training
# dqn_model.learn(total_timesteps=10000, log_interval=10)
# mean_reward_prioritized = evaluate(dqn_model, num_episodes=100)
# Mean reward: 165.65 Num episodes: 100

# Test 3
# kwargs = {"double_q": False, "prioritized_replay": True, "policy_kwargs": dict(dueling=True)}
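# The `evaluate` helper whose tail appears above begins before this excerpt. A
# minimal sketch of such a helper, following the common stable-baselines tutorial
# pattern (the exact original may differ), would be:
import numpy as np

def evaluate(model, num_episodes=100):
    """Roll out the model and report its mean reward per episode."""
    env = model.get_env()
    all_episode_rewards = []
    for _ in range(num_episodes):
        episode_rewards = []
        done = False
        obs = env.reset()
        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            episode_rewards.append(reward)
        all_episode_rewards.append(sum(episode_rewards))
    mean_episode_reward = np.mean(all_episode_rewards)
    print("Mean reward:", mean_episode_reward, "Num episodes:", num_episodes)
    return mean_episode_reward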
class StrategyLearner:
    def __init__(self, metrics=[indi.pct_sma, indi.rsi], standards=[True, True], ws=[[20], [5]], log_dir='tmp/'):
        # set training params
        self.metrics = metrics
        self.standards = standards
        self.ws = ws
        # set logging directory
        if log_dir:
            self.log_dir = log_dir
            os.makedirs(self.log_dir, exist_ok=True)
        # n_steps used for callback debugging
        self.n_steps = 0

    def train(self, symbol='JPM', sd=dt.datetime(2009, 1, 1), ed=dt.datetime(2010, 12, 31),
              time_steps=int(1e5), savepath=None, should_plot=False):
        # load data and indicators
        df = self._load_data([symbol], sd, ed)
        df_met = self._get_indicators(symbol, df)
        # set environment
        self.env = Monitor(LoanEnv(df_met), self.log_dir, allow_early_resets=True)
        # train model
        self.model = DQN(MlpPolicy, self.env, prioritized_replay=True, verbose=1)
        self.model.learn(total_timesteps=time_steps, callback=self.debugcb)
        # save and plot
        if savepath is not None:
            self.model.save(savepath)
        if should_plot:
            results_plotter.plot_results([self.log_dir], time_steps, results_plotter.X_TIMESTEPS, f'DQN {symbol}')
            plt.show()

    def load_model(self, symbol='JPM', sd=dt.datetime(2009, 1, 1), ed=dt.datetime(2010, 12, 31), loadpath=None):
        # load data and indicators
        df = self._load_data([symbol], sd, ed)
        df_met = self._get_indicators(symbol, df)
        print(f'min: {df_met.min()} max: {df_met.max()}')
        # set environment
        self.env = Monitor(LoanEnv(df_met), self.log_dir, allow_early_resets=True)
        # load model
        self.model = DQN.load(loadpath, env=self.env)

    def cmp_policy(self, symbol='JPM', sd=dt.datetime(2009, 1, 1), ed=dt.datetime(2010, 12, 31),
                   sv=1e5, notional=1e3, commission=0.0, impact=0.0,
                   should_show=False, should_save=False, save_path=None, stack_plot=True):
        df_trades = self.test_policy(symbol=symbol, sd=sd, ed=ed, sv=sv, notional=notional)
        sp = msim.compute_portvals(df_trades, start_val=sv, commission=commission, impact=impact)
        bp = self.benchmark_policy(symbol, sd=sd, ed=ed, sv=sv, notional=notional,
                                   commission=commission, impact=impact)
        df_cmp = pd.concat([bp, sp], axis=1)
        labels = ['benchmark', 'learner']
        df_cmp.columns = labels
        df_cmp.benchmark /= bp.iloc[0]
        df_cmp.learner /= sp.iloc[0]
        if should_show and not stack_plot:
            pltr = Plotter()
            title = f'{symbol} Strategy'
            yax_label = 'Indexed MV'
            X = np.array([df_cmp.index for _ in labels])
            Y = df_cmp.values.T
            colors = [(1, 0, 0), (0, 1, 0)]
            pltr.plot(X, Y, labels=labels, yax_label=yax_label, title=title, colors=colors,
                      should_show=should_show, should_save=should_save, save_path=save_path)
        elif should_show and stack_plot:
            pltr = StackedPlotter()
            title = f'{symbol} Strategy'
            yax_labels = ['Indexed MV', 'Shares']
            colors = [[(1, 0, 0), (0, 1, 0)], [(0.35, 0.35, 0.35)]]
            df_pos = df_trades.cumsum()
            pltr.stacked_plot(df_cmp, df_pos, yax_labels=yax_labels, title=title, colors=colors,
                              should_show=should_show, save_path=save_path)
        return df_cmp

    def test_policy(self, symbol='JPM', sd=dt.datetime(2009, 1, 1), ed=dt.datetime(2010, 12, 31),
                    sv=1e5, notional=1e3):
        """ Tests existing policy against new data """
        # load data and indicators
        df = self._load_data([symbol], sd, ed)
        df_met = self._get_indicators(symbol, df)
        df_trades = pd.DataFrame(index=df_met.Date)
        df_trades['Shares'] = 0
        positions = np.zeros((df_trades.shape[0], ))
        # new env for testing
        env = self.model.get_env()
        obs = env.reset()
        # initial state and action
        action, _states = self.model.predict(obs)
        positions[0] = np.clip(action, -1, 1)
        obs, rewards, done, info = env.step(action)
        # pass remaining samples thru policy
        i = 1
        while True:
            action, _states = self.model.predict(obs)
            if action == LoanEnv.BUY:
                positions[i] = np.clip(positions[i - 1] + 1, -1, 1)
            elif action == LoanEnv.SELL:
                positions[i] = np.clip(positions[i - 1] - 1, -1, 1)
            else:
                raise ValueError(f'unknown action: {action}')
            obs, rewards, done, info = env.step(action)
            if done:
                break
            i += 1
        df_actions = pd.DataFrame(positions, index=df_trades.index, columns=['Shares'])
        df_actions = df_actions.diff().fillna(positions[0])
        df_trades.update(df_actions)
        df_trades *= notional
        return df_trades.rename(columns={'Shares': symbol})

    def benchmark_policy(self, symbol, sd, ed, sv, notional, commission, impact):
        # load dates and compute buy and hold portvals
        dates = self._load_data(['SPY'], sd, ed).index.get_level_values(1)
        amnts = np.zeros(dates.shape)
        amnts[0] = notional
        df_trades = pd.DataFrame(amnts, index=dates, columns=[symbol])
        vals = msim.compute_portvals(df_trades, start_val=sv, commission=commission, impact=impact)
        return vals.rename(symbol)

    def predict(self, symbol, loadpath=None, sd=dt.datetime(2018, 1, 29), ed=dt.datetime(2019, 12, 18), fwd=False):
        # update data
        dp.pull(symbol, should_save=True)
        dp.pull('SPY', should_save=True)
        # load data and add phantom SPY trading day
        df = self._load_data([symbol], sd, ed)
        if fwd:
            lastspy = df.loc['SPY'].tail(1).copy()
            lastspy.index = lastspy.index.shift(1, freq='D')
            lastspy['Symbol'] = 'SPY'
            lastspy = lastspy.reset_index().set_index(['Symbol', 'Date'])
            df = df.append(lastspy).sort_index()
        # load model and predict for test range
        self.model = DQN.load(loadpath)
        if fwd:
            chgs = np.linspace(-0.5, 0.5, num=101)
            pxs = chgs + df.loc[symbol].tail(1).copy().AdjClose.values[0]
            pxchgs = np.zeros((101, ))
            actions = np.zeros((101, ))
            for i, px in enumerate(pxs):
                last = df.loc[symbol].tail(1).copy()
                last.index = last.index.shift(1, freq='D')
                pxchgs[i] = px / last.AdjClose - 1
                last.AdjClose = px
                last.Close = px
                last['Symbol'] = symbol
                last = last.reset_index().set_index(['Symbol', 'Date'])
                df_tmp = df.append(last).sort_index()
                # predict
                df_met = self._get_indicators(symbol, df_tmp)
                ob = df_met.tail(1).drop(['Date', 'AdjClose'], axis=1)
                action, _ = self.model.predict(ob)
                actions[i] = action
            df_preds = pd.DataFrame({'Price': pxs, 'Chg': pxchgs, 'Action': actions})
            return df_preds
        else:
            df_met = self._get_indicators(symbol, df)
            ob = df_met.tail(1).drop(['Date', 'AdjClose'], axis=1)
            action, _ = self.model.predict(ob)
            return action

    def debugcb(self, _locals, _globals):
        self.n_steps += 1

    def _load_data(self, symbols, sd, ed):
        return indi.load_data(symbols, pd.date_range(sd, ed))

    def _get_indicators(self, symbol, df):
        df_pxs = pd.DataFrame(df.AdjClose)
        dinps = [df_pxs for _ in self.metrics]
        df_met = df_pxs.copy()
        for i, d, s, w in zip(self.metrics, dinps, self.standards, self.ws):
            df_met = df_met.join(i(d, window_sizes=w, standard=s), how='inner')
        df_met = df_met.loc[symbol].dropna().reset_index()
        return df_met
class Attack:
    def __init__(self, method, K=5, P=0.95):
        self.method = method
        self.K = K
        self.state_size = 2 * (self.K + 1)
        self.action_size = 2
        self.reward = []

        env_name = 'attacker-' + str(K) + '-' + str(P) + '-v0'
        self.log_dir = "/tmp/gym_attack/"
        os.makedirs(self.log_dir, exist_ok=True)
        env = gym.make(env_name)
        env = Monitor(env, self.log_dir, allow_early_resets=True)
        self.envs = DummyVecEnv([lambda: env])

        if method == 'PPO':
            self.model = PPO2(MLP_PPO, self.envs, verbose=0)
        elif method == 'DQN':
            self.model = DQN(MLP_DQN, self.envs, verbose=0)
        else:
            raise Exception("Error! Method must be 'PPO' or 'DQN'")
        print("Model Initialized !")

        self.best_mean_reward, self.n_steps = -np.inf, 0

    def callback(self, _locals, _globals):
        """
        Callback called at each step (for DQN and others) or after n steps (see ACER or PPO2)
        :param _locals: (dict)
        :param _globals: (dict)
        """
        # Print stats every 1000 calls
        if (self.n_steps + 1) % 1000 == 0:
            # Evaluate policy performance
            x, y = ts2xy(load_results(self.log_dir), 'timesteps')
            if len(x) > 0:
                mean_reward = np.mean(y[-100:])
                print(x[-1], 'timesteps')
                print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                    self.best_mean_reward, mean_reward))
                # New best model, you could save the agent here
                if mean_reward > self.best_mean_reward:
                    self.best_mean_reward = mean_reward
        self.n_steps += 1
        return True

    def learn(self, timesteps=10000):
        self.model.learn(total_timesteps=timesteps, callback=self.callback)
        print("======\nLEARNING DONE\n======")

    def save(self, filename):
        self.model.save(filename)
        print("Model saved !\n Filename:", filename)

    def load(self, filename):
        if self.method == 'PPO':
            self.model = PPO2.load(filename, policy=MLP_PPO)
        else:
            self.model = DQN.load(filename, policy=MLP_DQN)
        print("Model loaded !")

    def run(self, nb_episodes=1000):
        self.nb_episodes = nb_episodes
        for index_episode in range(nb_episodes):
            state = self.envs.reset()
            state = np.array(state)
            done = False
            steps = 0
            while not done:
                action, _states = self.model.predict(state)
                next_state, reward, done, _ = self.envs.step(action)
                next_state = np.array(next_state)
                state = next_state
                steps += 1
            if index_episode % 100 == 0:
                print("Episode {}#; \t Nb of steps: {}; \t Reward: {}.".format(index_episode, steps + 1, reward))
            if index_episode > 0:
                self.reward += [((self.reward[-1] * len(self.reward)) + reward) / (len(self.reward) + 1)]
            else:
                self.reward += [reward]
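# A minimal usage sketch for the Attack class above; the timestep budget, file name
# and episode count are illustrative assumptions, not values from the original code:
attack = Attack(method='DQN', K=5, P=0.95)  # builds and wraps 'attacker-5-0.95-v0'
attack.learn(timesteps=20000)               # trains with the monitoring callback
attack.save("dqn_attacker")                 # persists the trained model
attack.run(nb_episodes=100)                 # rolls out the policy and tracks the running average reward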
from stable_baselines.common.atari_wrappers import make_atari
from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy
from stable_baselines import DQN

env = make_atari('BreakoutNoFrameskip-v4')

model = DQN(CnnPolicy, env, verbose=1, tensorboard_log="./logs/DQN_test")
model.learn(total_timesteps=25000)
model.save("deepq_breakout")

del model  # remove to demonstrate saving and loading

model = DQN.load("deepq_breakout")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
import gym

from stable_baselines import DQN

env = gym.make("LunarLander-v2")

model = DQN("MlpPolicy", env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
model.learn(total_timesteps=int(2e5))
model.save("dqn_lunar")  # save name kept consistent with the load below

del model

model = DQN.load("dqn_lunar")

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
def learn(time_steps, file_name):
    env = gym.make('gym_snake:snake-v0')
    env = snake_wrapper(env)
    model = DQN(SnakePolicy, env, verbose=1)
    model.learn(time_steps)
    model.save(file_name)
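# Hypothetical invocation of the helper above; the timestep budget and file name
# are illustrative assumptions, not values from the original code:
learn(time_steps=int(1e5), file_name="dqn_snake")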
env = retro.make(game='Airstriker-Genesis', state='Level1')
env = AirstrikerDiscretizer(env)                     # convert the action space to a discrete space
env = CustomRewardAndDoneEnv(env)                    # modify the reward and episode-done handling
env = StochasticFrameSkip(env, n=4, stickprob=0.25)  # sticky frame skip
env = Downsample(env, 2)                             # downsampling
env = Rgb2gray(env)                                  # grayscale
env = FrameStack(env, 4)                             # frame stack
env = ScaledFloatFrame(env)                          # normalize observations
env = Monitor(env, log_dir, allow_early_resets=True)
print('Action space: ', env.action_space)
print('Observation space: ', env.observation_space)

# Set the random seeds
env.seed(0)
set_global_seeds(0)

# Create the vectorized environment
env = DummyVecEnv([lambda: env])

# Create the model
kwargs = {'double_q': False, 'prioritized_replay': True, 'policy_kwargs': dict(dueling=False)}
model = DQN('MlpPolicy', env, verbose=1, **kwargs)

# Train the model
model.learn(total_timesteps=128000, callback=callback)

# Save the model
model.save('DQN_Prioritized_Replay')
## Define the type of RL algorithm you are using.
modeltype = "DQN"

## Use this format to get the date.
basicdate = str(datetime.datetime.now().strftime("%Y%m%d_%H%M%S"))

## Stage thresholds: these are based on the mean reward values computed when evaluating the model.
## They will be switched out if a custom evaluation process is used.
stage_one_threshold = 5
stage_two_threshold = 7
stage_three_threshold = 10

print("Stage 1 Training Started")
env = plark_env_guided_reward.PlarkEnvGuidedReward(config_file_path=very_easy_config)
model = DQN('CnnPolicy', env)
model.learn(50)

logger.info('STARTING STAGE 1 INITIAL EVALUATION')
stg1_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False,
                                            render=False, callback=None, reward_threshold=None,
                                            return_episode_rewards=False)
logger.info('FINISHING STAGE 1 INITIAL EVALUATION')

stage1result = retrain(stg1_mean_reward, stage_one_threshold, 0, env, model)
logger.info("Stage One Threshold Met")

if stage1result == True:
    logger.info("Stage 2 Training Started")
    env = plark_env_guided_reward.PlarkEnvGuidedReward(config_file_path=easy_config)
    model.set_env(env)
    model.learn(50)

    logger.info('STARTING STAGE 2 INITIAL EVALUATION')
    stg2_mean_reward, n_steps = evaluate_policy(model, env, n_eval_episodes=1, deterministic=False,
                                                render=False, callback=None, reward_threshold=None,
                                                return_episode_rewards=False)
    logger.info('FINISHING STAGE 2 INITIAL EVALUATION')

    stage2result = retrain(stg2_mean_reward, stage_two_threshold, 0, env, model)
    logger.info("Stage Two Threshold Met")
env = Monitor(env, currentDirectory)

# Instantiate the agent
model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)

# Create the callback: check every 100 steps
# (note: the callback is created here but not passed to learn() in this snippet)
callback = SaveOnBestTrainingRewardCallback(check_freq=100, log_dir=currentDirectory)

# Train the agent
time_steps = 1e4
model.learn(total_timesteps=int(time_steps))

# Save the agent
model.save(currentDirectory + "/opendss_1e4")

results_plotter.plot_results([currentDirectory], time_steps, results_plotter.X_TIMESTEPS, "DQN IEEE 13-bus")
plt.show()

# del model  # delete trained model to demonstrate loading

# Load the trained agent
model = DQN.load(currentDirectory + "/opendss_1e4", env)

# Evaluate the agent
mean_reward, std_reward = evaluate_policy(model, model.get_env(),
                                          n_eval_episodes=10)  # note: the original call is truncated here; the episode count is assumed
# (continued: trailing keyword arguments of a DQN constructor whose opening lines are not shown)
    target_network_update_freq=1000,
    train_freq=4,
    exploration_final_eps=0.01,
    exploration_fraction=0.1,
    prioritized_replay_alpha=0.6,
    prioritized_replay=True,
    verbose=1,
)
print('-----------------')

# Train agent
print("Training DQN agent")
dqn_model.learn(
    total_timesteps=int(1e6),
    log_interval=10,
)

# Save the agent
save_name = 'DQN_trained_v2'
print('Saving', save_name)
dqn_model.save(save_name)
print('')
print('-----------------')

# Evaluate agent
print('Evaluating DQN...')
mean_reward, std_reward = evaluate_policy(dqn_model, env, n_eval_episodes=50)
evaluation = Evaluation('DQN', mean_reward, std_reward)
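# `Evaluation` above is defined elsewhere in this project. A minimal stand-in,
# purely as an assumption about its shape rather than the original class, could be:
from collections import namedtuple

# Hypothetical results container matching the Evaluation('DQN', mean, std) call above.
Evaluation = namedtuple('Evaluation', ['algorithm', 'mean_reward', 'std_reward'])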
# (continued: tail of an evaluate(model, num_steps) helper whose opening lines are not shown)
            episode_rewards.append(0.0)
    # Compute mean reward for the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    return mean_100ep_reward


if __name__ == '__main__':
    # Instantiate the env
    env_test = FFenv()
    env = FFenv()
    # Check the env
    check_env(env_test, warn=True)
    # Wrap it
    env = Monitor(env, filename=None, allow_early_resets=True)
    # Define the model
    model = DQN('LnMlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)
    # Random Agent, before training
    mean_reward_before_train = evaluate(model, num_steps=1000)
    # Train the agent
    num_episodes = 400
    model.learn(total_timesteps=1000 * num_episodes)
    model.save("dqn_model")
# (continued: tail of a training callback whose opening lines are not shown)
    n_steps += 1
    # Returning False will stop training early
    return True


# Create environment
env = RandomWalkEnv(max_timesteps)

# Instantiate the agent
model = DQN('MlpPolicy', env, learning_rate=1e-3, prioritized_replay=True, verbose=1)

# Train the agent
model.learn(total_timesteps=int(2e5), log_interval=100, callback=callback)

# Save the agent
model.save("dqn_lunar")

del model  # delete trained model to demonstrate loading

# Load the trained agent
model = DQN.load("dqn_lunar")

# Evaluate the agent
# mean_reward, n_steps = evaluate_policy(model, model.get_env(), n_eval_episodes=10)

# Enjoy trained agent
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
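# If the commented-out evaluation above is re-enabled, note that in recent
# stable-baselines versions evaluate_policy returns (mean_reward, std_reward),
# not a step count. A small sketch (the episode count is an arbitrary choice):
from stable_baselines.common.evaluation import evaluate_policy

mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
print("mean reward: {:.2f} +/- {:.2f}".format(mean_reward, std_reward))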
class Config:
    def __init__(self, **entries):
        self.__dict__.update(entries)


# props to object
cfg = Config(**props)

# read data from file
DATASET_PATH = "data"
file_name = os.path.join(os.path.dirname(__file__), DATASET_PATH + "/binanceOpenAi-jan.dat")
df = pd.read_csv(file_name)
df.rename(columns=lambda x: x.strip(), inplace=True)
df['avrPrice'] = df['avrPrice'].replace(to_replace=0, method='ffill')

env = TradingCoinEnv(df, cfg)
env = DummyVecEnv([lambda: env])
# check_env(env, warn=True)

model = DQN("MlpPolicy", env, verbose=2)
model = PPO2(MlpPolicy, env, verbose=1)  # note: this immediately replaces the DQN model defined above
model.learn(total_timesteps=900000)
model.save("trading3")

# continued training (fine-tuning)
'''
model = PPO2.load("./trading2")
model.learn(total_timesteps=900000)
model.save("trading3")
'''
import datetime
import os

from stable_baselines.deepq.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import DQN

from uav_enviroment.UAV_Environment import UAVEnv

if __name__ == '__main__':
    # Create log dir
    log_dir = "/tmp/uav_env/"
    load_path = './models/DQN_disc_cartesian/2018_11_05_16:58.pkl'
    experiment_name = 'DQN_disc_cartesian'
    models_dir = './models/' + experiment_name + '/'
    t = datetime.datetime.now()
    date = t.strftime('%Y_%m_%d_%H:%M')
    os.makedirs(log_dir, exist_ok=True)
    os.makedirs(models_dir, exist_ok=True)

    # Create and wrap the environment
    env = UAVEnv(continuous=False, angular_movement=False, observation_with_image=False, reset_always=True)
    env.setup(n_obstacles=2, reset_always=True, threshold_dist=20)
    env = DummyVecEnv([lambda: env])

    model = DQN(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    # model.load(load_path, env=env)
    model = model.learn(total_timesteps=150000, log_interval=50, tb_log_name=experiment_name + date)
    model.save(models_dir + date)
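# To resume from the checkpoint in `load_path` instead of training from scratch, note
# that DQN.load is a classmethod returning a new model, so its result must be assigned
# (a sketch reusing the names defined above; the timestep budget is only illustrative):
model = DQN.load(load_path, env=env)
model = model.learn(total_timesteps=150000, log_interval=50, tb_log_name=experiment_name + date)
model.save(models_dir + date)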
# (continued: trailing arguments of a checkpoint callback constructor whose opening lines are not shown)
                                        save_path='./models/checkpoints/', name_prefix=prefix)

# episode_plot_freq = n : update plots every n time steps
# update_stats_every = m: update the stats used in the plots every m episodes
# Note: update_stats_every > 1 loses some information in the plot (not in the training
# process), but improves performance during training.
plt_callback = plotcallback(episode_plot_freq=10000, update_stats_every=1, average_size=100,
                            verbose=1, plot_prefix=prefix, plot_dir="./Plots")
callbacks = CallbackList([checkpoint_callback, plt_callback])

model.learn(total_timesteps=total_timesteps, callback=callbacks)
model.save("./models/" + prefix)

# model = DQN.load("./models/7x7_4bins_2items_2binslots_1agentslots_128x64x32_1500k.zip")

# Test the trained agent
obs = env.reset()
# env.init_video()
n_steps = 300
total = 0
for step in range(n_steps):
    env.render(mode='human', sec_per_frame=0.3)
    action, _ = model.predict(obs, deterministic=True)
    print("Step {}".format(step + 1))
    print("Action: ", action)
    obs, reward, done, info = env.step(action)
    print('obs=', obs, 'reward=', reward, 'done=', done)
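# Note: the test loop above never resets the environment, so if an episode finishes
# before the 300 steps are exhausted, the remaining steps run on a terminated episode.
# A common addition inside the loop (an assumption, not part of the original script):
#
#     if done:
#         print("Episode finished after {} steps".format(step + 1))
#         obs = env.reset()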