def NewPotential(current_window, algorithm='PPO'):
    """Pick a voltage in [0, 1] by querying a pretrained agent.

    The agent named by ``algorithm`` is loaded from its ``pretrained_*``
    file, asked (stochastically) for an action given ``current_window``,
    and the action index is mapped onto an evenly spaced grid of
    ``model.action_space.n`` voltages between 0 and 1.

    :param current_window: observation window; its length must equal the
        first dimension of the model's observation space
    :param algorithm: one of 'A2C', 'PPO', 'ACKTR', 'ACER'
    :return: the voltage associated with the predicted action
    :raises ValueError: on an unknown algorithm, a window of the wrong
        length, or an action outside the action space
    """
    # Reject unknown names before touching any model file.
    if algorithm not in ('A2C', 'PPO', 'ACKTR', 'ACER'):
        raise ValueError("%s is not a valid algorithm." % algorithm)

    # Determine the pretrained agent
    if algorithm == 'A2C':
        agent = A2C.load("pretrained_A2C")
    elif algorithm == 'PPO':
        agent = PPO2.load("pretrained_PPO")
    elif algorithm == 'ACKTR':
        agent = ACKTR.load("pretrained_ACKTR")
    else:
        agent = ACER.load("pretrained_ACER")

    window_size = len(current_window)
    if window_size != agent.observation_space.shape[0]:
        raise ValueError(
            "%s is does not match the model's window size." % window_size)

    action, _ = agent.predict(current_window, deterministic=False)

    n_actions = agent.action_space.n
    voltages = np.linspace(0, 1, num=n_actions)
    if not (action >= 0 and action <= n_actions - 1):
        raise ValueError(
            "Received invalid action={} which is not part of the action space".
            format(action))
    return voltages[action]
def evaluate_policy(model, eval_data, runs_per_env: int, n_vars: int,
                    episode_length: int, display: bool, printing: bool,
                    wrapped_env: bool = False) -> np.array:
    """Measure how far the policy's predicted graphs are from the targets.

    For every FCM in ``eval_data`` the target graph is built once, the
    policy is applied ``runs_per_env`` times, and the ``directed_shd``
    between each predicted graph and the target is recorded.

    :param model: a loaded model, or a path (any ``str``) from which an
        ACER model is loaded
    :param eval_data: iterable of FCMs to evaluate on
    :param runs_per_env: number of policy roll-outs per FCM
    :param n_vars: forwarded to ``apply_policy``
    :param episode_length: forwarded to ``apply_policy``
    :param display: forwarded to ``apply_policy``
    :param printing: forwarded to ``apply_policy``
    :param wrapped_env: forwarded to ``apply_policy``
    :return: 1-D array with one distance per (FCM, run) pair; empty when
        ``eval_data`` is empty
    """
    # isinstance (rather than an exact type comparison) so that str
    # subclasses such as numpy.str_ are also treated as paths.
    if isinstance(model, str):
        model = ACER.load(model)

    differences = []
    for fcm in eval_data:
        target_graph = CausalGraphGenerator.create_graph_from_fcm(fcm)
        for _ in range(runs_per_env):
            predicted_graph = apply_policy(model=model,
                                           test_env=fcm,
                                           n_vars=n_vars,
                                           episode_length=episode_length,
                                           display=display,
                                           env_type='Gauss',
                                           printing=printing,
                                           wrapped_env=wrapped_env)
            differences.append(directed_shd(predicted_graph, target_graph))
            # Progress marker, one per completed run.
            print('.')
    return np.array(differences)
def get_existing_model(model_path):
    """Announce and load a previously saved ACER model.

    :param model_path: path handed to ``ACER.load``
    :return: the loaded model
    """
    print('--- Training from existing model', model_path, '---')
    # Load model
    return ACER.load(model_path)
def loader(algo, env_name):
    """Load the trained agent stored at ``trained_agents/<algo>/<env_name>.pkl``.

    :param algo: one of 'dqn', 'ppo2', 'a2c', 'acer', 'trpo'
    :param env_name: environment name used as the file stem
    :return: the loaded model, or ``None`` when ``algo`` is not recognised
    """
    if algo not in ('dqn', 'ppo2', 'a2c', 'acer', 'trpo'):
        return None

    # Every supported algorithm uses the same directory layout.
    model_file = "trained_agents/" + algo + "/" + env_name + ".pkl"

    if algo == 'dqn':
        return DQN.load(model_file)
    if algo == 'ppo2':
        return PPO2.load(model_file)
    if algo == 'a2c':
        return A2C.load(model_file)
    if algo == 'acer':
        return ACER.load(model_file)
    if algo == 'trpo':
        return TRPO.load(model_file)
    return None
def load_model(path: str, algorithm: str):
    """Load a stable_baselines model of the requested algorithm.

    :param path: location handed to the algorithm's ``load``
    :param algorithm: one of 'PPO2', 'DQN', 'A2C', 'ACER', 'GAIL', 'TRPO'
    :return: the loaded model, or ``None`` for any other algorithm name
    """
    # Imported lazily so the dependency is only needed when a model is loaded.
    from stable_baselines import PPO2, DQN, A2C, ACER, GAIL, TRPO

    known_algorithms = (
        ('PPO2', PPO2),
        ('DQN', DQN),
        ('A2C', A2C),
        ('ACER', ACER),
        ('GAIL', GAIL),
        ('TRPO', TRPO),
    )
    for label, model_cls in known_algorithms:
        if algorithm == label:
            return model_cls.load(path)
    return None
def setup_game():
    """Interactively choose the number of games and the AI opponent.

    Both questions are repeated until a recognised answer is given.
    Spaces in the answers are ignored; the agent answer is additionally
    case-insensitive.

    :return: tuple ``(games, AIagent, ai_name)`` - the number of games as
        an int, the loaded model, and the model's display name
    """
    game_counts = {"1": 1, "5": 5, "10": 10, "20": 20}
    games = None
    while games is None:
        answer = input("Do you want to play 5, 10 or 20 games? ")
        games = game_counts.get(answer.replace(" ", ""))
        if games is None:
            print("Unrecognized please try again!")

    while True:
        answer = input(
            "Do you want to play against PPO2(p)(1), A2C(a)(2) or ACER(c)(3) agent?"
        )
        compact = answer.replace(" ", "")
        token = compact.upper()
        if token in ("P", "PPO2") or compact == "1":
            AIagent = PPO2.load("models/PPO2-qiscoin-v1-10k")
            ai_name = "PPO2"
            break
        if token in ("A", "A2C") or compact == "2":
            AIagent = A2C.load("models/A2C-qiscoin-v1-10k")
            ai_name = "A2C"
            break
        if token in ("C", "ACER") or compact == "3":
            AIagent = ACER.load("models/ACER-qiscoin-v1-10k")
            ai_name = "ACER"
            break
        print("Unrecognized please try again!")

    return games, AIagent, ai_name
def record_video():
    """Record a video of a trained ACER agent.

    Loads ``models/pacman_acer.pkl``, wraps a fresh environment with the
    video recorder (output under ``videos/``) and steps the agent for
    ``video_length + 1`` steps.  The environment is closed even if the
    roll-out raises, so that the recorder gets a chance to release its
    resources.
    """
    model = ACER.load("models/pacman_acer.pkl", verbose=1)
    env = create_env()
    try:
        model.set_env(env)

        video_length = 3000
        env = wrap_video_env(env,
                             name="pacman_acer",
                             video_length=video_length,
                             path='videos/')

        state = env.reset()
        # One step more than video_length, as in the original script.
        for _ in range(video_length + 1):
            action, _states = model.predict(state)
            state, _, _, _ = env.step(action)

        print("Video recorded")
    finally:
        # Closes the wrapped env if wrapping succeeded, the raw env otherwise.
        env.close()
def train_acer(seed):
    """
    test ACER on the uav_env(cartesian,discrete)

    Trains a fresh ACER model, reloads ``best_model.pkl`` from ``log_dir``,
    evaluates it, and moves the monitor CSV into the experiment's log
    directory.

    :param seed: random seed
    :return: evaluation
    """
    # Reference signature of the constructor used below:
    #   ACER(policy, env, gamma=0.99, n_steps=20, num_procs=1, q_coef=0.5,
    #        ent_coef=0.01, max_grad_norm=10, learning_rate=0.0007,
    #        lr_schedule='linear', rprop_alpha=0.99, rprop_epsilon=1e-05,
    #        buffer_size=5000, replay_ratio=4, replay_start=1000,
    #        correction_term=10.0, trust_region=True, alpha=0.99, delta=1,
    #        verbose=0, tensorboard_log=None, _init_setup_model=True)
    global best_mean_reward, n_steps

    algo = 'ACER'
    num_timesteps = 3000000

    env = set_up_env(seed)

    # Reset the module-level bookkeeping that the training callback uses.
    best_mean_reward, n_steps = -np.inf, 0

    acer_kwargs = dict(
        policy=MlpPolicy,
        env=env,
        gamma=0.99,
        n_steps=20,
        num_procs=1,
        q_coef=0.5,
        ent_coef=0.01,
        max_grad_norm=10,
        learning_rate=0.0007,
        lr_schedule='linear',
        rprop_alpha=0.99,
        rprop_epsilon=1e-05,
        buffer_size=5000,
        replay_ratio=4,
        replay_start=1000,
        correction_term=10.0,
        trust_region=True,
        alpha=0.99,
        delta=1,
        verbose=0,
        tensorboard_log="./logs/{}/tensorboard/{}/".format(EXPERIMENT_NATURE, algo),
    )
    model = ACER(**acer_kwargs)

    model.learn(total_timesteps=num_timesteps,
                callback=callback,
                seed=seed,
                log_interval=500,
                tb_log_name="seed_{}".format(seed))

    # Evaluate the best checkpoint rather than the final weights.
    model = ACER.load(log_dir + 'best_model.pkl')
    evaluation = evaluate_model(env, model, 100)

    csv_dir = './logs/{}/csv/{}/'.format(EXPERIMENT_NATURE, algo)
    os.makedirs(csv_dir, exist_ok=True)
    os.rename('/tmp/gym/monitor.csv',
              "./logs/{}/csv/{}/seed_{}.csv".format(EXPERIMENT_NATURE, algo, seed))

    env.close()
    del model, env
    gc.collect()
    return evaluation
verbose=1, tensorboard_log=out_dir) elif args.model == 'sac': model = SAC("CnnPolicy", env) train(model, env, out_dir) else: #results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "rl") path = '{}/best_model.zip'.format(args.eval) env = CarEnv(args.eval, cam_idx_list=(0, 3, 4)) env.next_weather() #env = Monitor(env, args.eval) #print(env.num_envs) if args.model == 'trpo': model = TRPO.load(path) elif args.model == 'acer': model = ACER.load(path) elif args.model == 'ppo': model = PPO2.load(path) elif args.model == 'acktr': model = ACKTR.load(path) elif args.model == 'ddpg': model = DDPG.load(path) elif args.model == 'a2c': model = A2C.load(path) elif args.model == 'sac': model = SAC.load(path) #mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5,return_episode_rewards=True) #eps_rewards, eps_len = evaluate_policy(model, env, n_eval_episodes=5,return_episode_rewards=True) # print(eps_rewards) # print(eps_len) # print(np.mean(eps_rewards))
class LoadRLModel(IStrategy):
    """Trading strategy whose buy/sell signals come from a saved ACER model.

    ``populate_indicators`` adds technical indicators to the candle
    dataframe; ``rl_model_redict`` feeds 24 of them, one row per candle,
    to the model; the predicted action decides the signal
    (1 -> buy, 2 -> sell).  Candles with a NaN in any feature never
    produce a signal.
    """

    stoploss = -0.50

    trailing_stop = False

    ticker_interval = '5m'

    # Run "populate_indicators()" only for new candle.
    process_only_new_candles = False

    startup_candle_count: int = 20

    # Loaded once, when the class body is executed.
    model = ACER.load('model')

    # Indicator columns fed to the model, in the order of the feature vector.
    _FEATURE_COLUMNS = (
        'adx',
        'plus_dm',
        'plus_di',
        'minus_dm',
        'minus_di',
        'aroonup',
        'aroondown',
        'aroonosc',
        'ao',
        'uo',
        'cci',
        'rsi',
        'fisher_rsi',
        'slowd',
        'slowk',
        'fastd',
        'fastk',
        'fastd_rsi',
        'fastk_rsi',
        'macd',
        'macdsignal',
        'macdhist',
        'mfi',
        'roc',
    )

    def informative_pairs(self):
        """Return the additional pairs to fetch; this strategy needs none."""
        return []

    def populate_indicators(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
        """
        Add the technical indicators used by the model to the dataframe
        :param dataframe: candle DataFrame
        :param metadata: Additional information, like the currently traded pair
        :return: the same DataFrame with the indicator columns added
        """
        # Momentum Indicators
        # ------------------------------------

        # ADX
        dataframe['adx'] = ta.ADX(dataframe)

        # Plus Directional Indicator / Movement
        dataframe['plus_dm'] = ta.PLUS_DM(dataframe)
        dataframe['plus_di'] = ta.PLUS_DI(dataframe)

        # Minus Directional Indicator / Movement
        dataframe['minus_dm'] = ta.MINUS_DM(dataframe)
        dataframe['minus_di'] = ta.MINUS_DI(dataframe)

        # Aroon, Aroon Oscillator
        aroon = ta.AROON(dataframe)
        dataframe['aroonup'] = aroon['aroonup']
        dataframe['aroondown'] = aroon['aroondown']
        dataframe['aroonosc'] = ta.AROONOSC(dataframe)

        # Awesome Oscillator
        dataframe['ao'] = qtpylib.awesome_oscillator(dataframe)

        # Ultimate Oscillator
        dataframe['uo'] = ta.ULTOSC(dataframe)

        # Commodity Channel Index: values [Oversold:-100, Overbought:100]
        dataframe['cci'] = ta.CCI(dataframe)

        # RSI
        dataframe['rsi'] = ta.RSI(dataframe)

        # Inverse Fisher transform on RSI: values [-1.0, 1.0] (https://goo.gl/2JGGoy)
        rsi = 0.1 * (dataframe['rsi'] - 50)
        dataframe['fisher_rsi'] = (np.exp(2 * rsi) - 1) / (np.exp(2 * rsi) + 1)

        # Inverse Fisher transform on RSI normalized: values [0.0, 100.0] (https://goo.gl/2JGGoy)
        dataframe['fisher_rsi_norma'] = 50 * (dataframe['fisher_rsi'] + 1)

        # Stochastic Slow
        stoch = ta.STOCH(dataframe)
        dataframe['slowd'] = stoch['slowd']
        dataframe['slowk'] = stoch['slowk']

        # Stochastic Fast
        stoch_fast = ta.STOCHF(dataframe)
        dataframe['fastd'] = stoch_fast['fastd']
        dataframe['fastk'] = stoch_fast['fastk']

        # Stochastic RSI
        stoch_rsi = ta.STOCHRSI(dataframe)
        dataframe['fastd_rsi'] = stoch_rsi['fastd']
        dataframe['fastk_rsi'] = stoch_rsi['fastk']

        # MACD
        macd = ta.MACD(dataframe)
        dataframe['macd'] = macd['macd']
        dataframe['macdsignal'] = macd['macdsignal']
        dataframe['macdhist'] = macd['macdhist']

        # MFI
        dataframe['mfi'] = ta.MFI(dataframe)

        # ROC
        dataframe['roc'] = ta.ROC(dataframe)

        return dataframe

    def populate_buy_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
        """
        Based on the model's predicted action, populates the buy signal for the given dataframe
        :param dataframe: DataFrame populated with indicators
        :param metadata: Additional information, like the currently traded pair
        :return: DataFrame with buy column
        """
        action, nan_list = self.rl_model_redict(dataframe)

        dataframe.loc[action == 1, 'buy'] = 1
        # Rows with missing indicators were predicted on zero-filled
        # features; never act on them.
        dataframe.loc[nan_list, 'buy'] = 0

        return dataframe

    def populate_sell_trend(self, dataframe: DataFrame, metadata: dict) -> DataFrame:
        """
        Based on the model's predicted action, populates the sell signal for the given dataframe
        :param dataframe: DataFrame populated with indicators
        :param metadata: Additional information, like the currently traded pair
        :return: DataFrame with sell column
        """
        action, nan_list = self.rl_model_redict(dataframe)

        dataframe.loc[action == 2, 'sell'] = 1
        # Rows with missing indicators were predicted on zero-filled
        # features; never act on them.
        dataframe.loc[nan_list, 'sell'] = 0

        return dataframe

    def rl_model_redict(self, dataframe):
        """
        Run the model on every candle of the dataframe
        :param dataframe: DataFrame populated with indicators
        :return: tuple (action, nan_list): the model's deterministic
            prediction for the feature matrix, and a boolean array that
            is True for candles with a NaN in any feature
        """
        # Stacking the columns gives (n_features, n_candles); transpose so
        # each row holds the 24 indicators of ONE candle.  A plain
        # reshape(-1, 24) would keep the buffer order and mix values of
        # different candles into one row.
        # NOTE(review): predictions differ from the previous reshape-based
        # layout - confirm the model was trained on per-candle rows.
        data = np.array(
            [dataframe[column] for column in self._FEATURE_COLUMNS],
            dtype=float,  # np.float alias no longer exists in NumPy >= 1.24
        ).T

        nan_list = np.isnan(data).any(axis=1)
        data = np.nan_to_num(data)
        action, _ = self.model.predict(data, deterministic=True)

        return action, nan_list
eval_freq=10000, deterministic=False, best_model_save_path=savepath) ]) if (os.path.exists("%s/best_model.zip" % savepath)): # Instantiate the agent model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR, buffer_size=10000, verbose=1, n_cpu_tf_sess=num_cpu) # Load the trained agent model = ACER.load("%s/best_model" % savepath, env=env) print('loaded agent') model.learn( total_timesteps=episodetimesteps**50, callback=callbacklist ) #total timesteps set to very large number so program will terminate based on runtime parameter) else: #create model with Stable Baselines package. model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR, buffer_size=10000, verbose=1, n_cpu_tf_sess=num_cpu) #, tensorboard_log=scenario)
# Roll out the saved best model on 500 freshly loaded environments and
# save each environment after its episode.
scenario = str(f'{trialv}_{inputfile_s}_t{test}_lr{LR_s}_g{gamma_s}')
savepath = './output/%s' % scenario

# Load the trained agent once.  The checkpoint path does not change
# between iterations, and the original code additionally built a fresh
# ACER(...) model per iteration only to overwrite it immediately with
# the loaded one.
model = ACER.load("%s/best_model" % savepath)

for n in range(500):
    # Random episode length, up to x * y * z * turnspc steps.
    turns = round(random.random() * x * y * z * turnspc)
    env = environment(x, y, z, gamma, turnspc, policyname, rg_prob='loadenv')

    # Enjoy trained agent
    obs = env.reset()
    for i in range(turns):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        if dones:
            break

    env.save()
'8': 'data/fcms/gauss/8x0_5000.pkl'} ep_lengths = {'6': 30, '8': 40} runs_per_env = 1 collected_data = {'vars': [], 'env': [], 'run': [], 'algo': [], 'time': []} run_experiment = True analyze_experiment = True if run_experiment: for var in vars: model = ACER.load(model_paths[var]) envs = FCMGenerator.load_dataset(data[var])[:500] env_counter = 0 print(var+' var environments') bar = tqdm(total=len(envs)*len(algos)*runs_per_env) for env in envs: # create fcm environment for our algo fcm_env = FCMEnvironment(agent=DiscreteAgent(int(var), env_type='Gauss'), fcm=env, eval_func=NoEval()) # collect obs data for notears and GES obs_data = DataFrame(columns=['X'+str(i) for i in range(int(var))]) for i in range(1000): inst = env.get_next_instantiation()[0] obs_data = obs_data.append({'X' + str(i): float(inst[i]) for i in range(len(inst))}, ignore_index=True)
print('Usage: python play.py <env> <model> <agent_name>') sys.exit() env_name = sys.argv[1] model_type = sys.argv[2] model_name = sys.argv[3] + '/agent' env = gym.make(env_name) if model_type == 'ppo1': from stable_baselines.common.policies import MlpPolicy from stable_baselines import PPO1 model = PPO1.load(model_name) elif model_type == 'dqn': from stable_baselines.deepq.policies import MlpPolicy from stable_baselines import DQN model = DQN.load(model_name) elif model_type == 'acer': from stable_baselines.common.policies import MlpPolicy from stable_baselines import ACER model = ACER.load(model_name) obs = env.reset() done = False while not done: action, _states = model.predict(obs) obs, rewards, done, info = env.step(action) env.render()
import matplotlib.pyplot as plt

# There already exists an environment generator that will make and wrap
# atari environments correctly.  Training is multiprocessed here
# (num_env=4 => 4 processes), and the frames are stacked 4 deep.
env = VecFrameStack(
    make_atari_env('PongNoFrameskip-v4', num_env=4, seed=0),
    n_stack=4,
)

# The saved model was produced with:
#   model = ACER('CnnPolicy', env, verbose=1)
#   model.learn(total_timesteps=25000)
#   model.save("cnn_pong")

# load
model = ACER.load("cnn_pong")
print(model)

# Step the loaded agent for 1000 steps, printing the rewards of each step.
obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print(rewards)

# Show the last rendered frame.
img = env.render(mode='rgb_array')
plt.imshow(img)
print(type(img))
plt.show()
def test_acer(name):
    """Load the ACER model stored under ``models/<name>``.

    :param name: file name of the saved model inside the ``models`` directory
    :return: the loaded model
    """
    return ACER.load(os.path.join('models', name))
eval_freq=10000, deterministic=False, best_model_save_path=savepath) ]) if (os.path.exists("%s/final_model.zip" % savepath)): # Instantiate the agent model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR, buffer_size=5000, verbose=1, n_cpu_tf_sess=num_cpu) # Load the trained agent model = ACER.load("%s/final_model" % savepath, env=env) print('loaded agent') save_evals() model.learn( total_timesteps=episodetimesteps**50, callback=callbacklist ) #total timesteps set to very large number so program will terminate based on runtime parameter) else: #create model with Stable Baselines package. model = ACER(policy, env, gamma=gamma, n_steps=episodetimesteps, learning_rate=LR, buffer_size=5000, verbose=1,
end_time = time.time() print('Training time for algorithm {}: {:.2f}s = {:.2f}min = {:.4f}hrs'.format(algo_list[alg],\ end_time-start_time,(end_time-start_time)/60,(end_time-start_time)/3600)) print('Trained using RL') else: #test print('Testing {} learnt policy from model file {} for {} games!'.format(algo_list[alg],\ args.model,int(args.num_test))) start_time = time.time() if alg == 0: model = TRPO.load(args.model) elif alg == 1: model = DQN.load(args.model) elif alg == 2: model = ACKTR.load(args.model) elif alg == 3: model = ACER.load(args.model) elif alg == 4: model = A2C.load(args.model) elif alg == 5: model = PPO1.load(args.model) env = gym.make('gym_pursuitevasion_small:pursuitevasion_small-v0') g = 1 obs = env.reset(ep=g) e_win_games = int(0) while True: action, _states = model.predict(obs) obs, rewards, done, e_win = env.step(action) if done: g += 1 obs = env.reset(ep=g) if g % 100 == 0: