def run(learning_steps=4300, verbose=0, n_steps=20, gamma=0.99, learning_rate=7e-4,
        ent_coef=0.01, tensorboard_log="tensorboard"):
    global inner_env
    inner_env = gym.make('gym_threshold:extended-state-semi-fixed-end-not-adapted-v0')
    env = DummyVecEnv([lambda: inner_env])

    model = ACER(MlpPolicy, env, verbose=verbose, n_steps=n_steps, gamma=gamma,
                 ent_coef=ent_coef, learning_rate=learning_rate,
                 tensorboard_log=tensorboard_log)
    # Use the script name (without ".py") as the TensorBoard run name;
    # os.path.splitext avoids the character-stripping pitfall of str.rstrip(".py").
    model.learn(total_timesteps=learning_steps,
                tb_log_name=os.path.splitext(os.path.basename(__file__))[0],
                callback=tensorboard_callback)
    env.close()
def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu):
    """
    Train an ACER model on Atari.

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
    :param lr_schedule: (str) The type of scheduler for the learning rate update
        ('linear', 'constant', 'double_linear_con', 'middle_drop' or 'double_middle_drop')
    :param num_cpu: (int) The number of CPUs to train on
    """
    env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4)
    if policy == 'cnn':
        policy_fn = CnnPolicy
    elif policy == 'lstm':
        policy_fn = CnnLstmPolicy
    else:
        warnings.warn("Policy {} not implemented".format(policy))
        return

    model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000)
    model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed)
    env.close()
    # Free memory
    del model
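# A hedged usage sketch for the train() helper above: the environment ID, timestep
# budget, seed, and CPU count below are illustrative assumptions, not values from
# the original source.
train(env_id='PongNoFrameskip-v4', num_timesteps=int(1e6), seed=0,
      policy='cnn', lr_schedule='constant', num_cpu=4)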
def acer(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = ACER(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with ACER.")
    model.learn(total_timesteps=timesteps)
    env.close()
def evaluate_policy(model, eval_data, runs_per_env: int, n_vars: int, episode_length: int,
                    display: bool, printing: bool, wrapped_env: bool = False) -> np.ndarray:
    # Accept either a loaded model or a path to a saved ACER checkpoint.
    if isinstance(model, str):
        model = ACER.load(model)

    differences = []
    for fcm in eval_data:
        target_graph = CausalGraphGenerator.create_graph_from_fcm(fcm)
        for run in range(runs_per_env):
            predicted_graph = apply_policy(model=model,
                                           test_env=fcm,
                                           n_vars=n_vars,
                                           episode_length=episode_length,
                                           display=display,
                                           env_type='Gauss',
                                           printing=printing,
                                           wrapped_env=wrapped_env)
            # Structural Hamming distance between predicted and target graph
            difference = directed_shd(predicted_graph, target_graph)
            differences.append(difference)
            print('.')

    differences = np.array(differences)
    return differences
def NewPotential(current_window, algorithm='PPO'):
    # Determine the pretrained agent
    if algorithm == 'A2C':
        model = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        model = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        model = ACKTR.load("pretrained_ACKTR")
    elif algorithm == 'ACER':
        model = ACER.load("pretrained_ACER")
    else:
        raise ValueError("%s is not a valid algorithm." % algorithm)

    if len(current_window) != model.observation_space.shape[0]:
        raise ValueError("A window of length %s does not match the model's window size."
                         % len(current_window))

    action, _states = model.predict(current_window, deterministic=False)

    # Map the discrete action index onto an evenly spaced voltage grid in [0, 1].
    voltages = np.linspace(0, 1, num=model.action_space.n)
    if 0 <= action <= model.action_space.n - 1:
        voltage = voltages[action]
    else:
        raise ValueError(
            "Received invalid action={} which is not part of the action space".format(action))
    return voltage
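# A minimal usage sketch for NewPotential(): the window length (10) is an assumption
# and must equal the pretrained model's observation_space.shape[0]; the checkpoint
# files named inside the function are expected to exist on disk.
import numpy as np

recent_window = np.zeros(10, dtype=np.float32)   # hypothetical observation window
voltage = NewPotential(recent_window, algorithm='ACER')
print("Suggested control voltage:", voltage)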
def test_action_mask_run_acer(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = ACER(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
def get_acer(vec_env=None, policy='CnnPolicy', learning_rate=7e-4, n_steps=20,
             max_grad_norm=10, lr_schedule='linear', buffer_size=5000,
             replay_start=1000) -> ACER:
    """
    Default parameter values are taken from stable_baselines/acer/acer_simple.py.
    """
    if vec_env is None:
        vec_env = create_training_env(1)
    return ACER(policy=policy,
                env=vec_env,
                gamma=0.99,
                n_steps=n_steps,
                num_procs=None,
                q_coef=0.5,
                ent_coef=0.01,
                max_grad_norm=max_grad_norm,
                learning_rate=learning_rate,
                lr_schedule=lr_schedule,
                rprop_alpha=0.99,
                rprop_epsilon=1e-5,
                buffer_size=buffer_size,
                replay_ratio=4,
                replay_start=replay_start,
                correction_term=10.0,
                trust_region=True,
                alpha=0.99,
                delta=1,
                verbose=2)
def _init_model(self):
    if not self._model_kwargs["agent"].lower() == "acer":
        raise ValueError(
            "The model_kwargs dict has to be created using args from the ACER agent "
            "as reference. Make sure the correct agent's parameters are used.")
    del self._model_kwargs["agent"]

    # Checkpoints are counted per environment step, so divide by the number of workers.
    self._callback_checkpoint_kwargs["save_freq"] = int(
        self._callback_checkpoint_kwargs["save_freq"] / self._num_cpus)

    if self._env_kwargs["extractor"] == "mlp":
        self._model = ACER(CustomMlpPolicy, self._env, **self._model_kwargs)
    else:
        self._model = ACER(CustomCnnPolicy, self._env, **self._model_kwargs)
def get_existing_model(model_path):
    print('--- Training from existing model', model_path, '---')
    # Load model
    model = ACER.load(model_path)
    return model
def loader(algo, env_name):
    if algo == 'dqn':
        return DQN.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'ppo2':
        return PPO2.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'a2c':
        return A2C.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'acer':
        return ACER.load("trained_agents/" + algo + "/" + env_name + ".pkl")
    elif algo == 'trpo':
        return TRPO.load("trained_agents/" + algo + "/" + env_name + ".pkl")
def train():
    """Trains an ACER policy."""
    env = create_env()
    model = ACER(policy=CnnPolicy,
                 env=env,
                 gamma=0.99,
                 n_steps=20,
                 num_procs=4,
                 q_coef=0.5,
                 ent_coef=0.01,
                 max_grad_norm=10,
                 learning_rate=0.0007,
                 lr_schedule='linear',
                 rprop_alpha=0.99,
                 rprop_epsilon=1e-05,
                 buffer_size=5000,
                 replay_ratio=4,
                 replay_start=1000,
                 correction_term=10.0,
                 trust_region=True,
                 alpha=0.99,
                 delta=1,
                 verbose=1,
                 tensorboard_log="./tb")
    model.learn(total_timesteps=int(1e7), callback=callback, tb_log_name="acer")
    model.save("models/pacman_acer.pkl")
def train_acer(seed):
    """
    Test ACER on the uav_env (cartesian, discrete).

    :param seed: random seed
    :return: evaluation

    ACER(policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5, ent_coef=0.01,
         max_grad_norm=10, learning_rate=0.0007, lr_schedule='linear', rprop_alpha=0.99,
         rprop_epsilon=1e-05, buffer_size=5000, replay_ratio=4, replay_start=1000,
         correction_term=10.0, trust_region=True, alpha=0.99, delta=1, verbose=0,
         tensorboard_log=None, _init_setup_model=True)
    """
    algo = 'ACER'
    num_timesteps = 3000000

    env = set_up_env(seed)

    global best_mean_reward, n_steps
    best_mean_reward, n_steps = -np.inf, 0

    model = ACER(policy=MlpPolicy, env=env, gamma=0.99, n_steps=20, num_procs=1,
                 q_coef=0.5, ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007,
                 lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05,
                 buffer_size=5000, replay_ratio=4, replay_start=1000,
                 correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
                 verbose=0,
                 tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo))

    model.learn(total_timesteps=num_timesteps, callback=callback, seed=seed,
                log_interval=500, tb_log_name="seed_{}".format(seed))

    # Reload the best checkpoint saved by the callback and evaluate it.
    model = ACER.load(log_dir + 'best_model.pkl')

    evaluation = evaluate_model(env, model, 100)

    os.makedirs('./logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo), exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()
    return evaluation
def train_ACER(env_train, model_name, timesteps=25000):
    start = time.time()
    model = ACER('MlpPolicy', env_train, verbose=0)
    model.learn(total_timesteps=timesteps)
    end = time.time()

    model.save(f"{config.TRAINED_MODEL_DIR}/{model_name}")
    print('Training time (ACER): ', (end - start) / 60, ' minutes')
    return model
def load_model(path: str, algorithm: str):
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO
    if algorithm == 'PPO2':
        return PPO2.load(path)
    if algorithm == 'DQN':
        return DQN.load(path)
    if algorithm == 'A2C':
        return A2C.load(path)
    if algorithm == 'ACER':
        return ACER.load(path)
    if algorithm == 'GAIL':
        return GAIL.load(path)
    if algorithm == 'TRPO':
        return TRPO.load(path)
    return None
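# Hedged usage sketch for load_model(): the checkpoint path and the CartPole
# environment are assumptions chosen for illustration, not artifacts of the source.
import gym

model = load_model("checkpoints/acer_cartpole.zip", "ACER")  # hypothetical path
if model is not None:
    env = gym.make("CartPole-v1")
    obs = env.reset()
    action, _states = model.predict(obs, deterministic=True)
    print("first action:", action)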
def train_acer(timesteps, name):
    env = datares_roulette
    env = DummyVecEnv([env])
    model = ACER(
        stable_baselines.common.policies.MlpPolicy,
        env,
        verbose=1,
    )
    model.learn(total_timesteps=timesteps)
    model.save(name)
    return model
def setup_game():
    playing = True
    while playing:
        games = input("Do you want to play 1, 5, 10 or 20 games? ")
        if games.replace(" ", "") == "1":
            games = 1
            playing = False
        elif games.replace(" ", "") == "5":
            games = 5
            playing = False
        elif games.replace(" ", "") == "10":
            games = 10
            playing = False
        elif games.replace(" ", "") == "20":
            games = 20
            playing = False
        else:
            print("Unrecognized, please try again!")

    playing = True
    while playing:
        AIagent = input(
            "Do you want to play against the PPO2 (p)(1), A2C (a)(2) or ACER (c)(3) agent? ")
        choice = AIagent.replace(" ", "").upper()
        if choice in ("P", "PPO2", "1"):
            AIagent = PPO2.load("models/PPO2-qiscoin-v1-10k")
            ai_name = "PPO2"
            playing = False
        elif choice in ("A", "A2C", "2"):
            AIagent = A2C.load("models/A2C-qiscoin-v1-10k")
            ai_name = "A2C"
            playing = False
        elif choice in ("C", "ACER", "3"):
            AIagent = ACER.load("models/ACER-qiscoin-v1-10k")
            ai_name = "ACER"
            playing = False
        else:
            print("Unrecognized, please try again!")

    return games, AIagent, ai_name
def record_video():
    """Record a video of a trained ACER agent."""
    model = ACER.load("models/pacman_acer.pkl", verbose=1)
    env = create_env()
    model.set_env(env)

    video_length = 3000
    env = wrap_video_env(env, name="pacman_acer", video_length=video_length, path='videos/')

    state = env.reset()
    for _ in range(video_length + 1):
        action, _states = model.predict(state)
        state, _, _, _ = env.step(action)
    print("Video recorded")
    env.close()
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines import ACER

# There already exists an environment generator
# that will make and wrap Atari environments correctly.
# Here we are also multiprocessing training (num_env=4 => 4 processes)
env = make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0)
# Frame-stacking with 4 frames
env = VecFrameStack(env, n_stack=4)

model = ACER('CnnPolicy', env, verbose=1)
model.learn(total_timesteps=25000)
# Save the trained agent
model.save("cnn_pong")

obs = env.reset()
while True:
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
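# Follow-up sketch, assuming a stable-baselines version that ships
# stable_baselines.common.evaluation: reload the "cnn_pong" checkpoint saved above
# and score it; the evaluation seed and episode count are arbitrary choices.
from stable_baselines import ACER
from stable_baselines.common.cmd_util import make_atari_env
from stable_baselines.common.vec_env import VecFrameStack
from stable_baselines.common.evaluation import evaluate_policy

# Rebuild an evaluation environment the same way the training one was built.
eval_env = VecFrameStack(make_atari_env('PongNoFrameskip-v4', num_env=1, seed=1), n_stack=4)

model = ACER.load("cnn_pong", env=eval_env)
mean_reward, std_reward = evaluate_policy(model, eval_env, n_eval_episodes=5)
print("mean reward: {:.1f} +/- {:.1f}".format(mean_reward, std_reward))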
env = CarEnv(out_dir, n_stacks=5, a_space_type=args.action)
env.next_weather()
env = Monitor(env, out_dir)

print("==========Creating model------------------")
policy = CnnPolicy
if args.model == 'trpo':
    model = TRPO(policy, env, verbose=1, timesteps_per_batch=64, tensorboard_log=out_dir)
elif args.model == 'acer':
    model = ACER(policy, env, verbose=1, n_steps=64, tensorboard_log=out_dir)
elif args.model == 'ppo':
    model = PPO2(policy, env, verbose=1, n_steps=64, tensorboard_log=out_dir)
elif args.model == 'acktr':
    model = ACKTR(policy, env, n_steps=4, verbose=1, tensorboard_log=out_dir)
elif args.model == 'ddpg':
class LoadRLModel(IStrategy):
    stoploss = -0.50
    trailing_stop = False
    ticker_interval = '5m'

    # Run "populate_indicators()" only for new candles.
    process_only_new_candles = False
    startup_candle_count: int = 20

    model = ACER.load('model')

    def informative_pairs(self):
        return []

    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
        # Momentum Indicators
        # ------------------------------------

        # ADX
        dataframe['adx'] = ta.ADX(dataframe)

        # Plus Directional Indicator / Movement
        dataframe['plus_dm'] = ta.PLUS_DM(dataframe)
        dataframe['plus_di'] = ta.PLUS_DI(dataframe)

        # Minus Directional Indicator / Movement
        dataframe['minus_dm'] = ta.MINUS_DM(dataframe)
        dataframe['minus_di'] = ta.MINUS_DI(dataframe)

        # Aroon, Aroon Oscillator
        aroon = ta.AROON(dataframe)
        dataframe['aroonup'] = aroon['aroonup']
        dataframe['aroondown'] = aroon['aroondown']
        dataframe['aroonosc'] = ta.AROONOSC(dataframe)

        # Awesome Oscillator
        dataframe['ao'] = qtpylib.awesome_oscillator(dataframe)

        # Ultimate Oscillator
        dataframe['uo'] = ta.ULTOSC(dataframe)

        # Commodity Channel Index: values [Oversold: -100, Overbought: 100]
        dataframe['cci'] = ta.CCI(dataframe)

        # RSI
        dataframe['rsi'] = ta.RSI(dataframe)

        # Inverse Fisher transform on RSI: values [-1.0, 1.0] (https://goo.gl/2JGGoy)
        rsi = 0.1 * (dataframe['rsi'] - 50)
        dataframe['fisher_rsi'] = (np.exp(2 * rsi) - 1) / (np.exp(2 * rsi) + 1)

        # Inverse Fisher transform on RSI normalized: values [0.0, 100.0] (https://goo.gl/2JGGoy)
        dataframe['fisher_rsi_norma'] = 50 * (dataframe['fisher_rsi'] + 1)

        # Stochastic Slow
        stoch = ta.STOCH(dataframe)
        dataframe['slowd'] = stoch['slowd']
        dataframe['slowk'] = stoch['slowk']

        # Stochastic Fast
        stoch_fast = ta.STOCHF(dataframe)
        dataframe['fastd'] = stoch_fast['fastd']
        dataframe['fastk'] = stoch_fast['fastk']

        # Stochastic RSI
        stoch_rsi = ta.STOCHRSI(dataframe)
        dataframe['fastd_rsi'] = stoch_rsi['fastd']
        dataframe['fastk_rsi'] = stoch_rsi['fastk']

        # MACD
        macd = ta.MACD(dataframe)
        dataframe['macd'] = macd['macd']
        dataframe['macdsignal'] = macd['macdsignal']
        dataframe['macdhist'] = macd['macdhist']

        # MFI
        dataframe['mfi'] = ta.MFI(dataframe)

        # ROC
        dataframe['roc'] = ta.ROC(dataframe)

        # The original strategy template also carries large commented-out blocks here
        # (Keltner Channel, Bollinger Bands, weighted Bollinger Bands, EMA/SMA,
        # Parabolic SAR, TEMA, Hilbert Transform SineWave, bullish/bearish candlestick
        # patterns, Heikin Ashi, and an orderbook best bid/ask lookup); they are
        # omitted here for brevity.

        return dataframe

    def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
        """
        Based on TA indicators, populates the buy signal for the given dataframe.
        :param dataframe: DataFrame populated with indicators
        :param metadata: Additional information, like the currently traded pair
        :return: DataFrame with buy column
        """
        # (The template's example RSI/TEMA/volume buy rule is commented out in the
        # original; the signal comes from the RL model instead.)
        action, nan_list = self.rl_model_redict(dataframe)
        dataframe.loc[action == 1, 'buy'] = 1
        dataframe.loc[nan_list == True, 'buy'] = 0
        return dataframe

    def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
        """
        Based on TA indicators, populates the sell signal for the given dataframe.
        :param dataframe: DataFrame populated with indicators
        :param metadata: Additional information, like the currently traded pair
        :return: DataFrame with sell column
        """
        # (The template's example RSI/TEMA/volume sell rule is commented out in the
        # original; the signal comes from the RL model instead.)
        action, nan_list = self.rl_model_redict(dataframe)
        dataframe.loc[action == 2, 'sell'] = 1
        dataframe.loc[nan_list == True, 'sell'] = 0
        return dataframe

    def rl_model_redict(self, dataframe):
        # Stack the 24 indicator columns used as the observation for the RL model.
        # (The original also lists further candidate features as comments: Keltner
        # and Bollinger percents/widths, Hilbert sine, candlestick patterns, and a
        # trade-status flag.)
        data = np.array([
            dataframe['adx'],
            dataframe['plus_dm'],
            dataframe['plus_di'],
            dataframe['minus_dm'],
            dataframe['minus_di'],
            dataframe['aroonup'],
            dataframe['aroondown'],
            dataframe['aroonosc'],
            dataframe['ao'],
            dataframe['uo'],
            dataframe['cci'],
            dataframe['rsi'],
            dataframe['fisher_rsi'],
            dataframe['slowd'],
            dataframe['slowk'],
            dataframe['fastd'],
            dataframe['fastk'],
            dataframe['fastd_rsi'],
            dataframe['fastk_rsi'],
            dataframe['macd'],
            dataframe['macdsignal'],
            dataframe['macdhist'],
            dataframe['mfi'],
            dataframe['roc'],
        ], dtype=float)

        # Transpose so each row holds the 24 features of one candle.
        # (The original used data.reshape(-1, 24), which interleaves features and time.)
        data = data.T
        nan_list = np.isnan(data).any(axis=1)
        data = np.nan_to_num(data)

        action, _ = self.model.predict(data, deterministic=True)
        return action, nan_list
class ACERAgent(Agent):
    def __init__(
        self,
        model_name="model_name",
        save_dir="./models",
        log_interval=1e4,
        num_cpus=8,
        eval_episodes=1000,
        n_steps=1e6,
        layer_normalization=False,
        model_kwargs={"tensorboard_log": "./tensorboards/"},
        env_kwargs={
            "board_size": 4,
            "binary": True,
            "extractor": "cnn"
        },
        callback_checkpoint_kwargs={
            "save_freq": 0,
            "save_path": "./models/",
            "name_prefix": "model_name"
        },
        callback_hist_kwargs={"hist_freq": 0},
    ):
        super().__init__(
            model_name,
            save_dir,
            num_cpus,
            model_kwargs,
            env_kwargs,
            layer_normalization,
            callback_checkpoint_kwargs,
            callback_hist_kwargs,
            n_steps,
            log_interval,
            eval_episodes,
        )

        self._init_model()

    def _init_model(self):
        if not self._model_kwargs["agent"].lower() == "acer":
            raise ValueError(
                "The model_kwargs dict has to be created using args from the ACER agent "
                "as reference. Make sure the correct agent's parameters are used.")
        del self._model_kwargs["agent"]

        # Checkpoints are counted per environment step, so divide by the number of workers.
        self._callback_checkpoint_kwargs["save_freq"] = int(
            self._callback_checkpoint_kwargs["save_freq"] / self._num_cpus)

        if self._env_kwargs["extractor"] == "mlp":
            self._model = ACER(CustomMlpPolicy, self._env, **self._model_kwargs)
        else:
            self._model = ACER(CustomCnnPolicy, self._env, **self._model_kwargs)

    def train(self):
        "Optimize the model."
        callbacks = []

        # Checkpoint callback
        if self._callback_checkpoint_kwargs["save_freq"] > 0:
            # Append model name into checkpoint save_path
            self._callback_checkpoint_kwargs["save_path"] = (
                self._callback_checkpoint_kwargs["save_path"] + "/" +
                str(self._model_name))
            checkpoint_callback = CheckpointCallback(
                **self._callback_checkpoint_kwargs)
            callbacks.append(checkpoint_callback)

        if self._callback_hist_kwargs["hist_freq"] > 0:
            # hist_callback = CustomCallbackPPO2(**self._callback_hist_kwargs)
            # callbacks.append(hist_callback)
            pass

        try:
            self._model.learn(self._n_steps,
                              log_interval=self._log_interval,
                              callback=callbacks,
                              tb_log_name=self._model_name)
        except KeyboardInterrupt:
            pass

        folder_path = os.path.join(self._save_dir, self._model_name)
        self._model.save(os.path.join(folder_path, self._model_name))

    def test(self):
        "Evaluate the model."
        mean_reward = super()._test(self._model)
        return mean_reward
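# Hypothetical instantiation of ACERAgent: the Agent base class and its environment
# construction are not shown in the source, so the exact keyword values below are
# assumptions; the "agent" entry is required because _init_model() checks it and then
# removes it before building the ACER model.
agent = ACERAgent(
    model_name="acer_board_cnn",
    num_cpus=4,
    n_steps=int(1e6),
    model_kwargs={"agent": "acer", "tensorboard_log": "./tensorboards/"},
    env_kwargs={"board_size": 4, "binary": True, "extractor": "cnn"},
)
agent.train()
print("mean reward:", agent.test())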
# cutoff_s = str(cutoffpenaltyscalar).split('.')[0]
# rg_s = max(str(float(rg_prob)).split('.'))
turnspc_s = str(turnspc).split('.')[1]

scenario = str(f'{trialv}_{inputfile_s}_t{test}_lr{LR_s}_g{gamma_s}')
savepath = './output/%s' % scenario

for n in range(500):
    turns = round(random.random() * x * y * z * turnspc)
    env = environment(x, y, z, gamma, turnspc, policyname, rg_prob='loadenv')

    # Instantiate the agent
    model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps,
                 learning_rate=LR, buffer_size=10000, verbose=1)

    # Load the trained agent
    model = ACER.load("%s/best_model" % savepath)

    # Evaluate the agent
    # mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=10)

    # Enjoy trained agent
    obs = env.reset()
    for i in range(turns):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        # print(action, rewards, dones)
        # env.renderif('on')
        if dones == True:
env = gym.make(env_name)
env = Monitor(env, agent_dir)

if model_name == 'dqn':
    from stable_baselines.deepq.policies import MlpPolicy
    from stable_baselines import DQN
    model = DQN(MlpPolicy, env, verbose=1,
                exploration_fraction=0.9, exploration_final_eps=0.5)
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save(agent_dir + 'agent')
elif model_name == 'ppo1':
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import PPO1
    model = PPO1(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save(agent_dir + 'agent')
elif model_name == 'acer':
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import ACER
    model = ACER(MlpPolicy, env, verbose=1)
    model.learn(total_timesteps=1000000, log_interval=1)
    model.save(agent_dir + 'agent')
else:
    print('Usage: python training.py <env> <model> <agent_name>')
        '8': 'data/fcms/gauss/8x0_5000.pkl'}
ep_lengths = {'6': 30, '8': 40}
runs_per_env = 1

collected_data = {'vars': [], 'env': [], 'run': [], 'algo': [], 'time': []}

run_experiment = True
analyze_experiment = True

if run_experiment:
    for var in vars:
        model = ACER.load(model_paths[var])
        envs = FCMGenerator.load_dataset(data[var])[:500]
        env_counter = 0
        print(var + ' var environments')
        bar = tqdm(total=len(envs) * len(algos) * runs_per_env)
        for env in envs:
            # create fcm environment for our algo
            fcm_env = FCMEnvironment(agent=DiscreteAgent(int(var), env_type='Gauss'),
                                     fcm=env,
                                     eval_func=NoEval())
            # collect obs data for notears and GES
            obs_data = DataFrame(columns=['X' + str(i) for i in range(int(var))])
            for i in range(1000):
                inst = env.get_next_instantiation()[0]
                obs_data = obs_data.append({'X' + str(i): float(inst[i])
                                            for i in range(len(inst))},
                                           ignore_index=True)
def test_acer(name):
    model_path = os.path.join('models', name)
    model = ACER.load(model_path)
    return model
import pytest

from stable_baselines import A2C, ACER, ACKTR, DeepQ, DDPG, PPO1, PPO2, TRPO
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                          desired_action_stddev=float(0.2))

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'deepq': lambda e: DeepQ(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e,
                           param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}
import pytest
import numpy as np

from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, SAC, PPO1, PPO2, TD3, TRPO
from stable_baselines.ddpg import NormalActionNoise
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines.common.evaluation import evaluate_policy

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", learning_rate=1e-3, n_steps=1,
                         gamma=0.7, env=e, seed=0).learn(total_timesteps=10000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e, seed=0,
                           n_steps=1, replay_ratio=1).learn(total_timesteps=15000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e, seed=0,
                             learning_rate=5e-4, n_steps=1).learn(total_timesteps=20000),
    'dqn': lambda e: DQN(policy="MlpPolicy", batch_size=16, gamma=0.1,
                         exploration_fraction=0.001, env=e, seed=0).learn(total_timesteps=40000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e, seed=0, lam=0.5,
                           optim_batchsize=16, optim_stepsize=1e-3).learn(total_timesteps=15000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e, seed=0,
                           learning_rate=1.5e-3, lam=0.8).learn(total_timesteps=20000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e, seed=0,
                           max_kl=0.05, lam=0.7).learn(total_timesteps=10000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
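# The parametrize decorator above is cut off before its test body. Below is a plausible
# sketch of how LEARN_FUNC_DICT is typically consumed, modeled on the stable-baselines
# identity tests; the function name and the pass threshold are assumptions.
def test_learn_identity(model_name):
    # Learn the identity task on a small discrete env, then score the policy.
    env = DummyVecEnv([lambda: IdentityEnv(10)])
    model = LEARN_FUNC_DICT[model_name](env)
    mean_reward, _ = evaluate_policy(model, env, n_eval_episodes=20)
    assert mean_reward > 90  # assumed pass threshold, not from the source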
import gym

from stable_baselines import ACER
from stable_baselines.common.policies import CnnPolicy
from stable_baselines.common.vec_env import DummyVecEnv

# trying to get an idea of how quickly my computer can train this
pong_env = gym.make('Pong-v0')
pong_env = DummyVecEnv([lambda: pong_env])

pong_model_acer = ACER(
    CnnPolicy,
    pong_env,
    verbose=0,
    tensorboard_log="./../../data/baselines-stuff/pong/acer_pong_tensorboard/")

pong_model_acer.learn(total_timesteps=50_000_000,
                      tb_log_name="run-1-50_000_000")

# since I know I'll be stopping it early
pong_model_acer.save('./../../data/baselines-stuff/pong/terrible_pong_model_acer')
                 eval_freq=50000,
                 deterministic=True,
                 best_model_save_path=evpath),
    EvalCallback(env1,
                 log_path=savepath,
                 n_eval_episodes=20,
                 eval_freq=10000,
                 deterministic=False,
                 best_model_save_path=savepath)
])

if os.path.exists("%s/final_model.zip" % savepath):
    # Instantiate the agent
    model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps,
                 learning_rate=LR, buffer_size=5000, verbose=1,
                 n_cpu_tf_sess=num_cpu)

    # Load the trained agent
    model = ACER.load("%s/final_model" % savepath, env=env)
    print('loaded agent')

    save_evals()

    # Total timesteps set to a very large number so the program will terminate
    # based on the runtime parameter.
    model.learn(total_timesteps=episodetimesteps**50, callback=callbacklist)

else:
    # Create model with the Stable Baselines package.
    model = ACER(policy, env,
turnspc_s = str(turnspc).split('.')[1]

scenario = str(f'{trialv}_{inputfile_s}_t{test}_lr{LR_s}_g{gamma_s}')  # _cpu{ncpu}
savepath = './output/%s/%s' % (scenario, 'eval')

turns = round(x * y * z * turnspc)

env = environment(x, y, z, cutoff, turnspc, policyname, rg_prob='loadenv')

if test == 'CNNACER' or test == 'MLPACER':
    # Instantiate the agent
    model = ACER(policy, env, gamma=gamma, learning_rate=LR,
                 n_steps=episodetimesteps, verbose=1)
    # model = DQN('MlpPolicy', env, learning_rate=LR, prioritized_replay=True, verbose=1)

    # Load the trained agent
    model = ACER.load("%s/best_model" % savepath)
    # model = DQN.load("%s/best_model" % savepath)
    print('loaded agent %s' % savepath)

else:
    # Instantiate the agent
    model = A2C(policy, env, gamma=gamma, learning_rate=LR,