config = { "num_team_a": 1, "num_team_b": 1, "width": 5, "height": 3, "density": 0.0, "max_turns": 50 } a_config = {"hp": 30, "skill": 50, "fray": 40, "dmg": "2d10+6"} b_config = {"hp": 30, "skill": 50, "fray": 40, "dmg": "2d10+6"} ray.init() env = epenv.EP_Environment(config, a_config, b_config) check_env(env) register_env("ep_environment", lambda _: epenv.EP_Environment(config, a_config, b_config)) trainer = PPO2(env=env, config={ "multiagent": { "policies": { "one": (None, env.observation_space, env.action_space, {}), "two": (None, env.observation_space, env.action_space, {}), } } })
print('Saving the config file in path: {}'.format(specified_path))
with open(join(specified_path, 'config.yml'), 'w') as f:
    yaml.dump(config, f, indent=4, sort_keys=False, line_break=' ')

# train model
try:
    try:
        model_path = join(specified_path, 'pretrained-model.zip')
        model = PPO2.load(model_path, env=env_8, tensorboard_log=specified_path)
        print("Existing model loaded...")
    except Exception:
        # fall back to a fresh model if no pretrained one exists
        model = PPO2(policy, env=env_8, tensorboard_log=specified_path, **model_config)
        print('New model created...')

    # Launch the tensorboard
    if args.tensorboard:
        launch_tensorboard(specified_path)

    start = datetime.now()
    print('Start time training: {}'.format(start))
    model.learn(total_timesteps=n_steps,
                tb_log_name='{}_{}'.format(max_in_dir, args.name),
                callback=eval_callback)
    model_path = join(specified_path, '{}_final_model.zip'.format(max_in_dir))
    model.save(model_path)
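# The eval_callback passed to model.learn() above is defined elsewhere in this
# script; a minimal sketch of how it could be built with stable-baselines'
# EvalCallback (eval_env is a hypothetical held-out copy of the training
# environment, and eval_freq is an assumption):
from stable_baselines.common.callbacks import EvalCallback

eval_callback = EvalCallback(eval_env,
                             best_model_save_path=specified_path,
                             log_path=specified_path,
                             eval_freq=10000,
                             deterministic=True)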
else:
    if args.multi:
        # set up random offset of network ports
        os.environ['DONKEY_SIM_MULTI'] = '1'

        # Number of processes to use
        num_cpu = 4

        # Create the vectorized environment
        env = SubprocVecEnv([make_env(env_id, i) for i in range(num_cpu)])

        # create recurrent policy
        model = PPO2(CnnLstmPolicy, env, verbose=1)
    else:
        # make gym env
        env = gym.make(env_id)

        # Create the vectorized environment
        env = DummyVecEnv([lambda: env])

        # create cnn policy
        model = PPO2(CnnPolicy, env, verbose=1)

    # set up model in learning mode with goal number of timesteps to complete
    model.learn(total_timesteps=10000)
        agent = PPO2.load(
            agent_params["pretrained_agent"].value,
            env=env,
            reset_num_timesteps=False,
            n_steps=agent_params["update_nepisodes"].value * (eng_params["nsteps"].value - 1),
            learning_rate=agent_params["learning_rate"].value,
            gamma=agent_params["gamma"].value,
            tensorboard_log=logdir,
        )
    else:
        agent = PPO2(
            MlpPolicy,
            env,
            verbose=1,
            n_steps=agent_params["update_nepisodes"].value * (eng_params["nsteps"].value - 1),
            learning_rate=agent_params["learning_rate"].value,
            gamma=agent_params["gamma"].value,
            tensorboard_log=logdir,
        )
    agent.learn(
        total_timesteps=agent_params["number_episodes"].value
        * (eng_params["nsteps"].value - 1)
        * agent_params["nranks"].value,
        callback=callback,
    )
elif agent_params["agent"].value == "manual":
    env = DummyVecEnv([lambda: eng])
    agent = agents.ManualAgent(env)
    agent.learn(agent_params["injection_cas"].value,
                agent_params["qdot_cas"].value)
import time
import json
import random
import sys

# imports implied by the code below (SwapTradingEnv is the project-local environment)
import numpy as np
import wandb
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv

np.set_printoptions(threshold=sys.maxsize)

env = DummyVecEnv(
    [lambda: SwapTradingEnv(data_file='./data/DATA.parquet', training=True)])
test_env = DummyVecEnv(
    [lambda: SwapTradingEnv(data_file='./data/TEST.parquet', training=True)])
# env = SubprocVecEnv([lambda: SwapTradingEnv(
#     data_file='/home/thorad/Core/Projects/SwapTrader/data/BTC-USD-SWAP-FRACDIFF.parquet'
# ) for i in range(2)])

model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard")

for x in range(5):
    model.learn(50000)
    model.save('./agents/agent_' + str(x) + '.pkl')

    obs = test_env.reset()
    done = False
    while not done:
        action, _ = model.predict(obs)
        obs, reward, done, info = test_env.step(action)
        print(reward)
        print(info)
        wandb.log(info[0])
    'eta': 1,
    'X_kunit': 0.49,
    'theta': 0.1
}

# if a parameter is set to None it will be sampled from a uniform distribution at every reset
args = {
    'feedback': feedback,
    'q': qs,
    'params': params
}
# the default parameters are: rewfunc=Tools.purity_like_rew, q=1e-4, dt=1e-3, plot=False, pow=0.5

# instantiate environment
env = make_vec_env(FisherEnv, n_envs=N, env_kwargs=args)

# instantiate model
model = PPO2(MlpPolicy, env,
             n_steps=128,
             learning_rate=LR,
             lam=0.95,
             ent_coef=e_c,
             verbose=1,
             nminibatches=4,
             noptepochs=4,
             tensorboard_log='./Fisher_mix_TRAIN_LOG/{}/{}_q{}'.format(dirname, title, qs),
             seed=2)

# train the model
model.learn(total_timesteps=TIMESTEPS, callback=callback,
            tb_log_name='{}_q{}'.format(title, qs))

# save the trained model at a given path
model.save('./MODELS/{}/{}_q{}'.format(dirname, title, qs))
def objective(trial):
    # Define what to optimize in environment
    envParams = {
        'reward_func': reward_strategy,
        'forecast_len': int(trial.suggest_loguniform('forecast_len', 1, 200)),
        'confidence_interval': trial.suggest_uniform('confidence_interval', 0.7, 0.99),
    }

    train_df, test_df = getDatasets(
        params.get('input_data_file'),
        percentageToUse=params.get('dataset_percentage'))

    trainEnv = DummyVecEnv([lambda: BitcoinTradingEnv(train_df, **envParams)])
    testEnv = DummyVecEnv([lambda: BitcoinTradingEnv(test_df, **envParams)])

    # Define what to optimize in agent
    agentParams = {
        'n_steps': int(trial.suggest_loguniform('n_steps', 16, 2048)),
        'gamma': trial.suggest_loguniform('gamma', 0.9, 0.9999),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1.),
        'ent_coef': trial.suggest_loguniform('ent_coef', 1e-8, 1e-1),
        'cliprange': trial.suggest_uniform('cliprange', 0.1, 0.4),
        'noptepochs': int(trial.suggest_loguniform('noptepochs', 1, 48)),
        'lam': trial.suggest_uniform('lam', 0.8, 1.)
    }

    model = PPO2(MlpLnLstmPolicy, trainEnv, verbose=0, nminibatches=1, **agentParams)

    # Run optimizer
    last_reward = -np.finfo(np.float16).max
    evaluation_interval = int(len(train_df) / params.get('n_test_episodes'))

    for eval_idx in range(params.get('n_evaluations')):
        try:
            model.learn(evaluation_interval)
        except AssertionError:
            raise

        rewards = []
        n_episodes, reward_sum = 0, 0.0

        obs = testEnv.reset()
        while n_episodes < params.get('n_test_episodes'):
            action, _ = model.predict(obs)
            obs, reward, done, _ = testEnv.step(action)
            reward_sum += reward

            if done:
                rewards.append(reward_sum)
                reward_sum = 0.0
                n_episodes += 1
                obs = testEnv.reset()

        last_reward = np.mean(rewards)
        trial.report(-1 * last_reward, eval_idx)

        if trial.should_prune(eval_idx):
            raise optuna.structs.TrialPruned()

    return -1 * last_reward
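# A minimal sketch of wiring the objective above into an Optuna study (the
# study name, pruner choice, and trial budget are assumptions, not taken from
# the original script):
import optuna

study = optuna.create_study(study_name='ppo2_trading',
                            pruner=optuna.pruners.MedianPruner())
study.optimize(objective, n_trials=100)
print(study.best_params)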
    f.close()
else:
    save_path = '../logs/'

env = Monitor(env, '../logs/')  # logging monitor
model_dir = save_path + '{}_final_model'.format(args.alg)  # model save/load directory

if args.alg == 'ddpg':
    action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                sigma=args.action_noise * np.ones(n_actions))
    param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(args.param_noise_stddev),
                                         desired_action_stddev=float(args.param_noise_stddev))
    model = DDPG(DDPGPolicy, env, verbose=1, param_noise=param_noise,
                 action_noise=action_noise, render=args.play)
elif args.alg == 'ppo2':
    model = PPO2(CommonMlpPolicy, env, verbose=1)
elif args.alg == 'trpo':
    model = TRPO(CommonMlpPolicy, env, verbose=1, model_dir=save_path)
elif args.alg == 'a2c':
    model = A2C(CommonMlpPolicy, env, verbose=1)
else:
    print(args.alg)
    raise Exception('Algorithm name is not defined!')

print('Model is Created')

try:
    print('Training Started')
    if args.alg == 'ddpg':
        model.learn(total_timesteps=args.num_timesteps,
                    log_interval=args.log_interval, save_path=save_path)
    else:
        model.learn(total_timesteps=args.num_timesteps,
                    log_interval=args.log_interval)
import pytest

from stable_baselines import A2C, ACER, ACKTR, DQN, DDPG, PPO1, PPO2, SAC, TRPO
from stable_baselines.ddpg import AdaptiveParamNoiseSpec
from stable_baselines.common.identity_env import IdentityEnv, IdentityEnvBox
from stable_baselines.common.vec_env import DummyVecEnv

PARAM_NOISE_DDPG = AdaptiveParamNoiseSpec(initial_stddev=float(0.2),
                                          desired_action_stddev=float(0.2))

# Hyperparameters for learning identity for each RL model
LEARN_FUNC_DICT = {
    'a2c': lambda e: A2C(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acer': lambda e: ACER(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'acktr': lambda e: ACKTR(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'dqn': lambda e: DQN(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ddpg': lambda e: DDPG(policy="MlpPolicy", env=e,
                           param_noise=PARAM_NOISE_DDPG).learn(total_timesteps=1000),
    'ppo1': lambda e: PPO1(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'ppo2': lambda e: PPO2(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'sac': lambda e: SAC(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
    'trpo': lambda e: TRPO(policy="MlpPolicy", env=e).learn(total_timesteps=1000),
}


@pytest.mark.slow
@pytest.mark.parametrize("model_name", ['a2c', 'acer', 'acktr', 'dqn', 'ppo1', 'ppo2', 'trpo'])
def test_identity(model_name):
    """
    Test if the algorithm (with a given policy) can learn an identity
    transformation (i.e. return observation as an action)

    :param model_name: (str) Name of the RL model
    """
    env = DummyVecEnv([lambda: IdentityEnv(10)])
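    # The test body is truncated here; a plausible continuation (a sketch,
    # assuming the test trains via LEARN_FUNC_DICT and then checks greedy
    # rollouts; the 0.9 reward threshold and trial count are assumptions):
    model = LEARN_FUNC_DICT[model_name](env)

    n_trials = 1000
    reward_sum = 0
    obs = env.reset()
    for _ in range(n_trials):
        action, _ = model.predict(obs)
        obs, reward, _, _ = env.step(action)
        reward_sum += reward
    assert reward_sum > 0.9 * n_trials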
def train(
        task,
        alg,
        logdir,
        domain_name,
        *,
        random_seed=None,
        num_steps=int(2e3),
        log_every=int(10e3),
        num_parallel=8,
        load_policy=False,
        load_policy_dir="",
        **kwargs
):
    """Train and evaluate an agent

    Args:
        task (str): Jitterbug task to train on
        alg (str): Algorithm to train, one of;
            - 'ddpg': DDPG Algorithm
            - 'ppo2': PPO2 Algorithm
            - 'sac': SAC Algorithm
            - 'td3': TD3 Algorithm
        logdir (str): Logging directory
        domain_name (str): Name of the DMC domain
        random_seed (int): Random seed to use, or None
        num_steps (int): Number of training steps to train for
        log_every (int): Save and log progress every this many timesteps
        num_parallel (int): Number of parallel environments to run.
            Only used for A2C and PPO2.
        load_policy (bool): Whether to load an existing policy or not.
            If yes, the policy is loaded from load_policy_dir.
    """
    assert alg in ('ddpg', 'sac', 'ppo2', 'td3'), "Invalid alg: {}".format(alg)
    assert domain_name in ('jitterbug', 'augmented_jitterbug'), \
        "Invalid domain_name: {}".format(domain_name)

    # Cast args to types
    if random_seed is not None:
        random_seed = int(random_seed)
    else:
        random_seed = int(time.time())

    # Fix random seed
    random.seed(random_seed)
    np.random.seed(random_seed)

    # Prepare the logging directory
    os.makedirs(logdir, exist_ok=True)

    print("Training {} on {} with seed {} for {} steps "
          "(log every {}), saving to {}".format(
              alg, task, random_seed, num_steps, log_every, logdir
          ))

    if domain_name == "augmented_jitterbug":
        augmented_jitterbug.augment_Jitterbug(modify_legs=True,
                                              modify_mass=True,
                                              modify_coreBody1=False,
                                              modify_coreBody2=False,
                                              modify_global_density=False,
                                              modify_gear=False,
                                              )

    # Construct DMC env
    env_dmc = suite.load(
        domain_name=domain_name,
        task_name=task,
        task_kwargs=dict(random=random_seed, norm_obs=True),
        environment_kwargs=dict(flat_observation=True)
    )

    # Wrap gym env in a dummy parallel vector
    if alg == 'ppo2':
        if num_parallel > multiprocessing.cpu_count():
            warnings.warn("Number of parallel workers "
                          "({}) > CPU count ({}), setting to # CPUs - 1".format(
                              num_parallel, multiprocessing.cpu_count()
                          ))
            num_parallel = max(1, multiprocessing.cpu_count() - 1)

        print("Using {} parallel environments".format(num_parallel))

        # XXX ajs 13/Sep/19 Hack to create multiple monitors that don't write to the same file
        env_vec = SubprocVecEnv([
            lambda: Monitor(
                gym.wrappers.FlattenDictWrapper(
                    jitterbug_dmc.JitterbugGymEnv(env_dmc),
                    dict_keys=["observations"]
                ),
                os.path.join(logdir, str(random.randint(0, 99999999))),
                allow_early_resets=True
            )
            for n in range(num_parallel)
        ])
    else:
        num_parallel = 1
        env_vec = DummyVecEnv([
            lambda: Monitor(
                gym.wrappers.FlattenDictWrapper(
                    jitterbug_dmc.JitterbugGymEnv(env_dmc),
                    dict_keys=["observations"]
                ),
                logdir,
                allow_early_resets=True
            )
        ])

    # Record start time
    start_time = datetime.datetime.now()

    def _cb(_locals, _globals):
        """Callback for during training"""

        if 'last_num_eps' not in _cb.__dict__:
            _cb.last_num_eps = 0

        # Extract episode reward history based on model type
        if isinstance(_locals['self'], DDPG):
            ep_r_hist = list(_locals['episode_rewards_history'])
        elif isinstance(_locals['self'], PPO2):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        elif isinstance(_locals['self'], SAC):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        elif isinstance(_locals['self'], TD3):
            ep_r_hist = [d['r'] for d in _locals['ep_info_buf']]
        else:
            raise ValueError("Invalid algorithm: {}".format(_locals['self']))

        # Compute # elapsed steps based on # elapsed episodes
        ep_size = int(
            jitterbug_dmc.jitterbug.DEFAULT_TIME_LIMIT /
            jitterbug_dmc.jitterbug.DEFAULT_CONTROL_TIMESTEP
        )
        num_eps = len(ep_r_hist)
        elapsed_steps = ep_size * num_eps

        # Compute elapsed time in seconds
        elapsed_time = (datetime.datetime.now() - start_time).total_seconds()

        # Log some info
        if num_eps != _cb.last_num_eps:
            _cb.last_num_eps = num_eps
            print("{:.2f}s | {}ep | {}#: episode reward = "
                  "{:.2f}, last 5 episode reward = {:.2f}".format(
                      elapsed_time,
                      num_eps,
                      elapsed_steps,
                      ep_r_hist[-1],
                      np.mean(ep_r_hist[-5:])
                  ))

            # Save model checkpoint
            model_path = os.path.join(logdir, "model.pkl")
            _locals['self'].save(model_path)
            print("Saved checkpoint to {}".format(model_path))

        return True

    if alg == 'ddpg':
        # Default parameters for DDPG
        # kwargs.setdefault("normalize_returns", True)
        # kwargs.setdefault("return_range", (0., 1.))
        # kwargs.setdefault("normalize_observations", True)
        # kwargs.setdefault("observation_range", (-1., 1.))
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("actor_lr", 1e-4)
        kwargs.setdefault("critic_lr", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("action_noise", OrnsteinUhlenbeckActionNoise(
            mean=np.array([0.3]),
            sigma=0.3,
            theta=0.15
        ))

        print("Constructing DDPG agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        if load_policy:
            print("Load DDPG agent from ", load_policy_dir)
            agent = DDPG.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                              policy=CustomPolicyDDPG,
                              env=env_vec,
                              verbose=1,
                              tensorboard_log=logdir,
                              **kwargs
                              )
        else:
            agent = DDPG(
                policy=CustomPolicyDDPG,
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    elif alg == 'ppo2':
        # Default parameters for PPO2
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("n_steps", 256 // num_parallel)
        kwargs.setdefault("ent_coef", 0.01)
        kwargs.setdefault("cliprange", 0.1)

        print("Constructing PPO2 agent with settings:")
        pprint.pprint(kwargs)

        if load_policy:
            print("Load PPO2 agent from ", load_policy_dir)
            agent = PPO2.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                              policy=CustomPolicyGeneral,
                              env=env_vec,
                              verbose=1,
                              tensorboard_log=logdir,
                              **kwargs
                              )
        else:
            agent = PPO2(
                policy=CustomPolicyGeneral,
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb,
            log_interval=10
        )

    elif alg == 'sac':
        # Default parameters for SAC
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("ent_coef", 'auto')
        # kwargs.setdefault("ent_coef", 'auto_0.1')
        kwargs.setdefault("action_noise", NormalActionNoise(
            mean=0,
            sigma=0.2,
        ))

        print("Constructing SAC agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        # XXX ajs 14/Sep/19 SAC in stable_baselines uses outdated policy
        # classes so we just use MlpPolicy and pass policy_kwargs
        if load_policy:
            print("Load SAC agent from ", load_policy_dir)
            kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu))
            agent = SAC.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                             env=env_vec,
                             verbose=1,
                             tensorboard_log=logdir,
                             **kwargs
                             )
        else:
            agent = SAC(
                policy='MlpPolicy',
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu),
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    elif alg == 'td3':
        # Default parameters for TD3
        kwargs.setdefault("learning_rate", 1e-4)
        kwargs.setdefault("buffer_size", 1000000)
        kwargs.setdefault("batch_size", 256)
        kwargs.setdefault("gradient_steps", 1000)
        kwargs.setdefault("learning_starts", 10000)
        kwargs.setdefault("train_freq", 1000)
        # kwargs.setdefault("ent_coef", 'auto_0.1')
        kwargs.setdefault("action_noise", NormalActionNoise(
            mean=0,
            sigma=0.2,
        ))

        print("Constructing TD3 agent with settings:")
        pprint.pprint(kwargs)

        # Construct the agent
        # XXX ajs 14/Sep/19 TD3 in stable_baselines uses outdated policy
        # classes so we just use MlpPolicy and pass policy_kwargs
        if load_policy:
            print("Load TD3 agent from ", load_policy_dir)
            kwargs.setdefault("policy_kwargs", dict(layers=[350, 250], act_fun=tf.nn.relu))
            agent = TD3.load(load_path=os.path.join(load_policy_dir, "model.final.pkl"),
                             env=env_vec,
                             verbose=1,
                             tensorboard_log=logdir,
                             **kwargs
                             )
        else:
            agent = TD3(
                policy='MlpPolicy',
                env=env_vec,
                verbose=1,
                tensorboard_log=logdir,
                policy_kwargs=dict(layers=[350, 250], act_fun=tf.nn.relu),
                **kwargs
            )

        # Train for a while (logging and saving checkpoints as we go)
        agent.learn(
            total_timesteps=num_steps,
            callback=_cb
        )

    else:
        raise ValueError("Invalid alg: {}".format(alg))

    # Save final model
    agent.save(os.path.join(logdir, 'model.final.pkl'))

    print("Done")
from util import log_dir, callback, AirstrikerDiscretizer, CustomRewardAndDoneEnv

# Create the environment (1)
env = retro.make(game='Airstriker-Genesis', state='Level1')
env = AirstrikerDiscretizer(env)  # convert the action space to a discrete space
env = CustomRewardAndDoneEnv(env)  # modify the reward and episode-done conditions
env = StochasticFrameSkip(env, n=4, stickprob=0.25)  # sticky frame skip
env = Downsample(env, 2)  # downsampling
env = Rgb2gray(env)  # grayscale
env = FrameStack(env, 4)  # frame stacking
env = ScaledFloatFrame(env)  # observation normalization
env = Monitor(env, log_dir, allow_early_resets=True)
print('Action space: ', env.action_space)
print('Observation space: ', env.observation_space)

# Set the random seeds
env.seed(0)
set_global_seeds(0)

# Create the vectorized environment
env = DummyVecEnv([lambda: env])

# Create the model
model = PPO2('CnnPolicy', env, verbose=1)

# Train the model
model.learn(total_timesteps=300000, callback=callback)

# Save the model
model.save('PPO2')
total_timesteps_ = 3000000  # 3000000 for sac, 500000 for ppo2, 1500000 for ddpg
exp_num = "2"
tensorboard_log_name = algorithm + "_" + exp_num + "_" + env_name
tensorboard_log_dir = "./logs/"
# tensorboard --logdir=PPO2_1_Ex3_EKF_gyro-v0_1 --port=6006 --host=127.0.0.1
# tensorboard --logdir=sac_ekf_3_3 --port=6007 --host=127.0.0.2
model_save_name = tensorboard_log_name + "_model_" + exp_num

if algorithm == "PPO2":
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines.common import make_vec_env
    from stable_baselines import PPO2

    env = make_vec_env(env_name, n_envs=3)
    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log=tensorboard_log_dir)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_, tb_log_name=tensorboard_log_name)
            model.save(model_save_name)
elif algorithm == "PPO1":
    from stable_baselines.common.policies import MlpPolicy
    from stable_baselines import PPO1

    env = gym.make(env_name)
    model = PPO1(MlpPolicy, env, verbose=1)
    if train:
        for i in range(model_num):
            model.learn(total_timesteps=total_timesteps_)
            model.save(model_save_name)
# print(data_df.shape, 'after filling na')

# Note that I always use a copy of the original data so I can track each step.
data_clean = data_df.copy()
# data_clean.head()
# data_clean.tail()

train = data_clean[1:2000]
# the index needs to start from 0
train = train.reset_index(drop=True)
# train.head()

# tensorboard --logdir ./single_stock_tensorboard/
env_train = DummyVecEnv([lambda: SingleStockEnv(train)])

model_ppo = PPO2('MlpPolicy', env_train,
                 tensorboard_log="./single_stock_trading_2_tensorboard/")
model_ppo.learn(total_timesteps=100000, tb_log_name="run_aapl_ppo")
# model.save('AAPL_ppo_100k')

test = data_clean[2000:]
# the index needs to start from 0
test = test.reset_index(drop=True)

model = model_ppo
env_test = DummyVecEnv([lambda: SingleStockEnv(test)])
obs_test = env_test.reset()
print("==============Model Prediction===========")
# for i in range(len(test.index.unique())):
#     print("testing", i, "th")
        else:
            bench.config[k] = config[k]
    return bench.get_environment()


# Experiment configuration
# Play 5D scenario
action_values = (3, 3, 3, 3, 3)
env_config = {
    "seed": 0,
    "action_values": action_values,
    "instance_set_path": "../instance_sets/sigmoid/sigmoid_5D3M_train.csv",
}

# Make environment
# To track rewards we use our wrapper (this is only for simplicity)
env = make_sigmoid(env_config)
env = PerformanceTrackingWrapper(env)

# Make simple PPO policy
model = PPO2("MlpPolicy", env)

# Run for 200 timesteps
model.learn(total_timesteps=200)

performance = env.get_performance()[0]
for i in range(len(performance)):
    print(
        f"Episode {i+1}/{len(performance)}...........................................Reward: {performance[i]}"
    )
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from FrankaGymEnvironment import CustomEnv

my_signal_rate = 100
my_signal_repetitions = 15
my_step_limit = 24
env = CustomEnv(signal_rate=my_signal_rate,
                signal_repetitions=my_signal_repetitions,
                step_limit=my_step_limit)

# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
# env = DummyVecEnv([lambda: env])

my_learning_rate = 0.003
timesteps = 160000

# Configure tensorflow using GPU
# Use tensorboard to show reward over time etc
model = PPO2(MlpPolicy, env, learning_rate=my_learning_rate, verbose=1,
             tensorboard_log="/home/ryuga/Documents/TensorBoardLogs")  # defaults: learning_rate=2.5e-4
model.learn(total_timesteps=timesteps)

name = "BALLS_franka_continuous_ppo2" + str(my_learning_rate) + "_timesteps_" + str(timesteps)
model.save(name)

f = open("envparameters_" + name, "x")
f.write(str([my_signal_rate, my_signal_repetitions, my_step_limit]))
f.close()
def run_model(is_train=True, model_name='rl_model'):
    df = pd.read_csv('./data/db.csv')
    df = df.sort_values('date')
    df = df.drop(columns='date')
    df = df.dropna().reset_index()
    print(df.isnull().sum().sum())

    # train, test = train_test_split(df, test_size=0.1)
    train = df[:int(0.9 * len(df))].reset_index()
    test = df[int(0.9 * len(df)):].reset_index()

    # The algorithms require a vectorized environment to run
    if is_train:
        env = DummyVecEnv([lambda: StockTradingEnv(train, train, 29, True)])
    else:
        env = DummyVecEnv([lambda: StockTradingEnv(test, train, 29, False)])

    if is_train and model_name == 'rl_rand_model':
        model = PPO2(RandomPolicy, env, verbose=11,
                     tensorboard_log="./log/rand_stock_tensorboard/")
    elif not is_train and model_name == 'rl_rand_model':
        # model = PPO2.load("./ckpt/rl_rand_model")
        model = PPO2(RandomPolicy, env, verbose=11,
                     tensorboard_log="./log/rand_stock_tensorboard/")
    elif is_train and model_name == 'rl_model':
        # model = PPO2(CustomPolicy, env, verbose=11, tensorboard_log="./log/ppo2_stock_tensorboard/")
        model = PPO2.load("./ckpt/rl_model", env=env)
    elif not is_train and model_name == 'rl_model':
        model = PPO2.load("./ckpt/rl_model", env=env)
    elif not is_train and model_name == 'hr_model':
        model = Heristic(env)
    elif is_train and model_name == 'hr_model':
        model = Heristic(env)
    elif not is_train and model_name == 'rnn_model':
        model = Baseline(env)
    elif is_train and model_name == 'rnn_model':
        model = Baseline(env)
    else:
        assert False

    for epoch in range(1):
        if model_name == 'rl_model' and is_train:
            obs = env.reset()
            model.learn(total_timesteps=100000)
            model.save("./ckpt/rl_model")

    obs = env.reset()
    success = []
    for i in range(len(test.loc[:, 'TROW_PRC'].values) - 30):
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        success.append(info[0]['success_rate'])
        env.render()
    return success
actor_options = {
    'learning_rate': lr,
    'gamma': 1.,
    'verbose': 0,
    'n_steps': 100,
    'ent_coef': 0.,
    'max_grad_norm': 1e2,
}
description = ','.join(
    ['{}={}'.format(k, v) for k, v in actor_options.items()])
description += ',num_env={},norm_obs={},norm_reward={}'.format(
    num_env, norm_obs, norm_reward)
learning_options = {'total_timesteps': int(1e6)}

# Wrap in a try statement to close the environment properly in case of keyboard interrupt.
try:
    envs = [make_mujoco_env(env_name, 2) for _ in range(num_env)]
    # env = DummyVecEnv([lambda: env for env in envs])
    # Bind each env as a default argument: a bare `lambda: env` is late-bound,
    # so every worker would otherwise get the last env in the list.
    env = SubprocVecEnv([lambda env=env: env for env in envs])
    env = VecNormalize(env, norm_obs=norm_obs, norm_reward=norm_reward)

    # Create the actor and learn
    actor_options['tensorboard_log'] = os.path.join(tensorboard_logdir, env_name)
    model = PPO2(MlpPolicy, env, **actor_options)
    # model = PPO2(MlpLstmPolicy, env, **actor_options)
    model.learn(**learning_options, tb_log_name=description)
finally:
    env.close()
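# Aside: the late-binding pitfall that the `env=env` default argument above
# avoids, shown in isolation:
fns = [lambda: i for i in range(3)]
print([f() for f in fns])        # [2, 2, 2], every closure sees the final i
fns = [lambda i=i: i for i in range(3)]
print([f() for f in fns])        # [0, 1, 2], the value is captured per iteration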
if args.load:
    print('Model Loaded')
    load_file = args.load
    env = DummyVecEnv([lambda: env])
    model = PPO2.load(load_file, env=env)
else:
    print('Training')
    gamma = 0.9  # discount rate
    # self.epsilon_decay = 0.99
    learning_rate = 1e-4
    target_network_update_freq = 1000
    model = PPO2(MlpPolicy, env, verbose=0, gamma=gamma, noptepochs=8,
                 nminibatches=8, learning_rate=learning_rate, ent_coef=0.001,
                 tensorboard_log=tensorboard_log_dir)

if args.evaluate:
    if not args.load:
        print('Load a model to evaluate')
    evaluate_policy(model, env, deterministic=False, n_eval_episodes=10)
else:
    model.learn(total_timesteps=int(1e7), callback=callback)

print('done simulation')
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2
from stable_baselines.common.evaluation import evaluate_policy
from stable_baselines.common.env_checker import check_env
import numpy as np
from gym import spaces

# Since opponent_model wasn't set, this environment uses a random opponent.
random_env = gym.make('custom_gyms:tictac4-v0')
check_env(random_env)

# Set n_cpu_tf_sess so that this runs on a single thread. We need this for determinism.
model = PPO2("MlpPolicy", random_env, verbose=False, learning_rate=0.0025,
             nminibatches=4, n_cpu_tf_sess=1, seed=1)
mean_reward, std_reward = evaluate_policy(model, random_env, n_eval_episodes=2,
                                          deterministic=True, render=True)
print(f'random opponent: mean reward: {mean_reward}, std reward {std_reward}')
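# The evaluation above scores an untrained model; a sketch of training against
# the random opponent first and then re-evaluating (the timestep budget is an
# assumption):
model.learn(total_timesteps=100000)
mean_reward, std_reward = evaluate_policy(model, random_env, n_eval_episodes=2,
                                          deterministic=True, render=True)
print(f'after training: mean reward: {mean_reward}, std reward {std_reward}')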
description_str = "trajectories1-124_e7_l7_5sec_gamma0.3_t0_r1.0_lin0.001_rot0.04" log_dir = "../runs/tensorboard/ppo2_2019_07_03_trajectory002/" continue_learning = False # bagfiles = ["../resources/torque_trajectory_{0:03}.bag".format(i) for i in range(1, 125)] bagfiles = ["../resources/torque_trajectory_001.bag"] gazebo_env = GazeboEnv(0.1, 5.0, bagfiles, example_embodiments.panda_embodiment, example_embodiments.panda_embodiment) env = DummyVecEnv([lambda: Monitor(gazebo_env, "../runs/monitor/", allow_early_resets=True)]) if continue_learning: model = PPO2.load("../runs/models/trajectory003_e7_l4_5sec_gamma0.3_t0_r1.0_lin0.001_rot0.01_best.pkl", env=env, tensorboard_log=log_dir) else: model = PPO2(MlpPolicy, env, verbose=2, tensorboard_log=log_dir, gamma=0.3, # n_steps=30, # nminibatches=1 ) best_mean_reward, n_steps = -np.inf, 0 def callback(_locals, _globals): """ Callback called at each step (for DQN an others) or after n steps (see ACER or PPO2) :param _locals: (dict) :param _globals: (dict) """ global n_steps, best_mean_reward # Print stats every 256 calls if (n_steps + 1) % 100 == 0:
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv()])
model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=25000, tb_log_name='PPO2' + model_tag)
model.save(model_folder + "PPO2" + model_tag)
del model

model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()
while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
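    # The loop body is truncated above; a plausible continuation, assuming the
    # environment reports the next step's valid actions in info['action_mask']
    # (as in the action-masking forks of stable-baselines):
    obs, reward, done, info = env.step(action)
    action_masks.clear()
    for i in info:
        action_masks.append(i.get('action_mask'))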
                          ball_friction=0.3,
                          ball_elasticity=1.5,
                          max_cycles=125)
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v0(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
env = ss.pettingzoo_env_to_vec_env_v0(env)
env = ss.concat_vec_envs_v0(env, 8, num_cpus=4, base_class='stable_baselines')

model = PPO2(CnnPolicy, env, verbose=3, gamma=0.99, n_steps=125, ent_coef=0.01,
             learning_rate=0.00025, vf_coef=0.5, max_grad_norm=0.5, lam=0.95,
             nminibatches=4, noptepochs=4, cliprange=0.2, cliprange_vf=1)
model.learn(total_timesteps=2000000)
model.save("policy")

# Rendering
env = pistonball_v3.env()
env = ss.color_reduction_v0(env, mode='B')
env = ss.resize_v0(env, x_size=84, y_size=84)
env = ss.frame_stack_v1(env, 3)
# p_quarks = dict(net_arch=[256, 256, dict(
#     vf=[256, 128], pi=[64])])
name = "Rev_i7_DISCRETE_DefNN_RndmBall_Phys006_ppo2_franka_LR_" + print_LR + "_timesteps_" + \
    str(timesteps) + "_srate_sreps_slimit_" + str(my_signal_rate) + \
    str(my_signal_repetitions) + str(my_step_limit) + "_joints_" + \
    str(my_number_of_joints) + "_rdmBall_" + str(my_randomBall) + "_ballPos_" + str(my_ballPos)

# model = PPO2(MlpPolicy, env, policy_kwargs=p_quarks, learning_rate=my_learning_rate, verbose=1,
#              tensorboard_log="/media/ryuga/Shared Storage/TensorBoardLogs/NEW_DEEP_FRANKA5_RYZEN")
# defaults: learning_rate=2.5e-4

policy = MlpPolicy  # if MlpLstmPolicy then nminibatches=1
model = PPO2(policy, env,
             learning_rate=my_learning_rate,
             verbose=1,
             tensorboard_log="/media/ryuga/Shared Storage/TensorBoardLogs/Rev_NEW_DEEP_FRANKA"
             )  # defaults: learning_rate=2.5e-4

try:
    f = open("../Envparameters/envparameters_" + name, "x")
    f.write(
        str([
            my_signal_rate, my_signal_repetitions, my_step_limit, lr_start,
            lr_end, timesteps, my_number_of_joints, my_randomBall, my_ballPos
        ]))
    f.close()
except Exception:
    print("envparameters couldn't be saved. They are:" + str([
        my_signal_rate, my_signal_repetitions, my_step_limit, lr_start, lr_end,
env = DummyVecEnv([lambda: ProcessorEnv()])
# env = DummyVecEnv([lambda: ProcessorEnv(taskFile='data/dataset/CSV/0.csv')])
# np.random.seed(123)
# env.seed(123)
# print(env.get_attr('reward_range'))
# env.reward_range = env.get_attr('reward_range')
# env = Monitor(env, log_dir, allow_early_resets=True)

# Because we use parameter noise, we should use a MlpPolicy with layer normalization
if resume:
    model = PPO2.load(models_dir + "ppo2_resetnew_noroundoff_1_expt8")
    model.set_env(env)
    print("RESUMED")
else:
    model = PPO2(MlpPolicy, env, verbose=0, learning_rate=learning_rate)

print(float(1e-5) == 0.00001)

# Create the callback: check every 1000 steps
callback = SaveOnBestTrainingRewardCallback(env=env, check_freq=1000, log_dir=log_dir)

# Train the agent
try:
    model.learn(total_timesteps=int(time_steps), callback=callback)
    model.save(models_dir + model_name)
except KeyboardInterrupt:
    model.save(models_dir + model_name + "_abort")
finally:
    mean_episode_reward = env.get_attr('mean_episode_reward')
    print(mean_episode_reward)
class CustomLSTMPolicy(MlpLstmPolicy):
    def __init__(self, sess, ob_space, ac_space, n_env, n_steps, n_batch,
                 n_lstm=64, reuse=False, **_kwargs):
        super().__init__(sess, ob_space, ac_space, n_env, n_steps, n_batch,
                         n_lstm, reuse,
                         net_arch=[8, 'lstm', dict(vf=[5, 10], pi=[10])],
                         layer_norm=True, feature_extraction="mlp", **_kwargs)


# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: QtradeEnv()])

model = PPO2(CustomLSTMPolicy, env, verbose=1, nminibatches=1)
model.learn(total_timesteps=50000)
model.save('ppo2_mlplnlstm')
del model
model = PPO2.load('ppo2_mlplnlstm', env=env)

obs = env.reset()
for i in range(20000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
def main(args):
    envconfig_string = args.envconfig
    custom_envconfig = _preprocess_custom_envconfig(
        args.envconfig) if args.envconfig is not None else {}
    env_id = 'gym_auv:' + args.env
    env_name = env_id.split(':')[-1] if ':' in env_id else env_id
    envconfig = gym_auv.SCENARIOS[env_name][
        'config'] if env_name in gym_auv.SCENARIOS else {}
    envconfig.update(custom_envconfig)

    NUM_CPU = 8
    EXPERIMENT_ID = str(int(time())) + args.algo.lower()
    model = {
        'ppo': PPO2,
        'ddpg': DDPG,
        'td3': TD3,
        'a2c': A2C,
        'acer': ACER,
        'acktr': ACKTR
    }[args.algo.lower()]

    if args.mode == 'play':
        agent = model.load(args.agent) if args.agent is not None else None
        envconfig_play = envconfig.copy()
        envconfig_play['show_indicators'] = True
        # envconfig_play['autocamera3d'] = False
        env = create_env(env_id,
                         envconfig_play,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot,
                         verbose=True)
        print('Created environment instance')

        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            args.video_dir,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env if args.video_name == 'auto' else args.video_name))
        print(args.video_dir, args.video_name)
        play_scenario(env, recorded_env, args, agent=agent)
        recorded_env.env.close()

    elif (args.mode == 'enjoy'):
        agent = model.load(args.agent)
        # params = agent.get_parameters()
        # policy_weights = [
        #     params['model/pi_fc0/w:0'],
        #     params['model/pi_fc1/w:0'],
        #     params['model/pi/w:0']
        # ]
        # policy_biases = [
        #     params['model/pi_fc0/b:0'],
        #     params['model/pi_fc1/b:0'],
        #     params['model/pi/b:0']
        # ]
        # for param in params:
        #     print(param, params[param].shape)

        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID)
        os.makedirs(video_folder, exist_ok=True)

        env = create_env(env_id,
                         envconfig,
                         test_mode=True,
                         render_mode=args.render,
                         pilot=args.pilot)
        if args.scenario:
            env.load(args.scenario)
        vec_env = DummyVecEnv([lambda: env])
        recorded_env = VecVideoRecorder(
            vec_env,
            video_folder,
            record_video_trigger=lambda x: x == 0,
            video_length=args.recording_length,
            name_prefix=(args.env if args.video_name == 'auto' else args.video_name))
        obs = recorded_env.reset()
        state = None
        done = [False for _ in range(vec_env.num_envs)]
        for _ in range(args.recording_length):
            if args.recurrent:
                action, _states = agent.predict(
                    observation=obs,
                    state=state,
                    mask=done,
                    deterministic=not args.stochastic)
                state = _states
            else:
                action, _states = agent.predict(obs, deterministic=not args.stochastic)
            obs, reward, done, info = recorded_env.step(action)
            recorded_env.render()
        recorded_env.close()

    elif (args.mode == 'train'):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'figures', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        os.makedirs(scenario_folder, exist_ok=True)
        video_folder = os.path.join(DIR_PATH, 'logs', 'videos', args.env, EXPERIMENT_ID)
        recording_length = 8000
        os.makedirs(video_folder, exist_ok=True)
        agent_folder = os.path.join(DIR_PATH, 'logs', 'agents', args.env, EXPERIMENT_ID)
        os.makedirs(agent_folder, exist_ok=True)
        tensorboard_log = os.path.join(DIR_PATH, 'logs', 'tensorboard', args.env, EXPERIMENT_ID)
        tensorboard_port = 6006

        if (args.nomp or model == DDPG or model == TD3):
            num_cpu = 1
            vec_env = DummyVecEnv(
                [lambda: create_env(env_id, envconfig, pilot=args.pilot)])
        else:
            num_cpu = NUM_CPU
            vec_env = SubprocVecEnv([
                make_mp_env(env_id, i, envconfig, pilot=args.pilot)
                for i in range(num_cpu)
            ])

        if args.agent is not None:
            agent = model.load(args.agent)
            agent.set_env(vec_env)
        else:
            if (model == PPO2):
                if args.recurrent:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 1,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-3,
                    }

                    class CustomLSTMPolicy(MlpLstmPolicy):
                        def __init__(self, sess, ob_space, ac_space, n_env,
                                     n_steps, n_batch, n_lstm=256, reuse=False,
                                     **_kwargs):
                            super().__init__(sess, ob_space, ac_space, n_env,
                                             n_steps, n_batch, n_lstm, reuse,
                                             net_arch=[256, 256, 'lstm',
                                                       dict(vf=[64], pi=[64])],
                                             **_kwargs)

                    agent = PPO2(CustomLSTMPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams)
                else:
                    hyperparams = {
                        # 'n_steps': 1024,
                        # 'nminibatches': 32,
                        # 'lam': 0.95,
                        # 'gamma': 0.99,
                        # 'noptepochs': 10,
                        # 'ent_coef': 0.0,
                        # 'learning_rate': 0.0003,
                        # 'cliprange': 0.2,
                        'n_steps': 1024,
                        'nminibatches': 32,
                        'lam': 0.98,
                        'gamma': 0.999,
                        'noptepochs': 4,
                        'ent_coef': 0.01,
                        'learning_rate': 2e-4,
                    }
                    # policy_kwargs = dict(act_fun=tf.nn.tanh, net_arch=[64, 64, 64])
                    # policy_kwargs = dict(net_arch=[64, 64, 64])
                    layers = [256, 128, 64]
                    # layers = [64, 64]
                    policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                    agent = PPO2(MlpPolicy,
                                 vec_env,
                                 verbose=True,
                                 tensorboard_log=tensorboard_log,
                                 **hyperparams,
                                 policy_kwargs=policy_kwargs)
            elif (model == DDPG):
                hyperparams = {
                    'memory_limit': 1000000,
                    'normalize_observations': True,
                    'normalize_returns': False,
                    'gamma': 0.98,
                    'actor_lr': 0.00156,
                    'critic_lr': 0.00156,
                    'batch_size': 256,
                    'param_noise': AdaptiveParamNoiseSpec(
                        initial_stddev=0.287,
                        desired_action_stddev=0.287)
                }
                agent = DDPG(LnMlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log,
                             **hyperparams)
            elif (model == TD3):
                action_noise = NormalActionNoise(mean=np.zeros(2),
                                                 sigma=0.1 * np.ones(2))
                agent = TD3(stable_baselines.td3.MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            action_noise=action_noise)
            elif model == A2C:
                hyperparams = {
                    'n_steps': 5,
                    'gamma': 0.995,
                    'ent_coef': 0.00001,
                    'learning_rate': 2e-4,
                }
                layers = [64, 64]
                policy_kwargs = dict(net_arch=[dict(vf=layers, pi=layers)])
                agent = A2C(MlpPolicy,
                            vec_env,
                            verbose=True,
                            tensorboard_log=tensorboard_log,
                            **hyperparams,
                            policy_kwargs=policy_kwargs)
            elif model == ACER:
                agent = ACER(MlpPolicy,
                             vec_env,
                             verbose=True,
                             tensorboard_log=tensorboard_log)
            elif model == ACKTR:
                agent = ACKTR(MlpPolicy,
                              vec_env,
                              verbose=True,
                              tensorboard_log=tensorboard_log)

        print('Training {} agent on "{}"'.format(args.algo.upper(), env_id))

        n_updates = 0
        n_episodes = 0

        def callback(_locals, _globals):
            nonlocal n_updates
            nonlocal n_episodes

            sys.stdout.write('Training update: {}\r'.format(n_updates))
            sys.stdout.flush()

            _self = _locals['self']
            vec_env = _self.get_env()

            class Struct(object):
                pass

            report_env = Struct()
            report_env.history = []
            report_env.config = envconfig
            report_env.nsensors = report_env.config[
                "n_sensors_per_sector"] * report_env.config["n_sectors"]
            report_env.sensor_angle = 2 * np.pi / (report_env.nsensors + 1)
            report_env.last_episode = vec_env.get_attr('last_episode')[0]
            report_env.config = vec_env.get_attr('config')[0]
            report_env.obstacles = vec_env.get_attr('obstacles')[0]

            env_histories = vec_env.get_attr('history')
            for episode in range(max(map(len, env_histories))):
                for env_idx in range(len(env_histories)):
                    if (episode < len(env_histories[env_idx])):
                        report_env.history.append(
                            env_histories[env_idx][episode])
            report_env.episode = len(report_env.history) + 1
            total_t_steps = _self.get_env().get_attr('total_t_steps')[0] * num_cpu
            agent_filepath = os.path.join(agent_folder, str(total_t_steps) + '.pkl')

            if model == PPO2:
                recording_criteria = n_updates % 10 == 0
                report_criteria = True
                _self.save(agent_filepath)
            elif model == A2C or model == ACER or model == ACKTR:
                save_criteria = n_updates % 100 == 0
                recording_criteria = n_updates % 1000 == 0
                report_criteria = True
                if save_criteria:
                    _self.save(agent_filepath)
            elif model == DDPG or model == TD3:
                save_criteria = n_updates % 10000 == 0
                recording_criteria = n_updates % 50000 == 0
                report_criteria = report_env.episode > n_episodes
                if save_criteria:
                    _self.save(agent_filepath)

            if report_env.last_episode is not None and len(report_env.history) > 0 and report_criteria:
                try:
                    # gym_auv.reporting.plot_trajectory(report_env, fig_dir=scenario_folder, fig_prefix=args.env + '_ep_{}'.format(report_env.episode))
                    gym_auv.reporting.report(report_env, report_dir=figure_folder)
                    # vec_env.env_method('save', os.path.join(scenario_folder, '_ep_{}'.format(report_env.episode)))
                except OSError as e:
                    print("Ignoring reporting OSError:")
                    print(repr(e))

            if recording_criteria:
                if args.pilot:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --pilot {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, args.pilot, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                else:
                    cmd = 'python run.py enjoy {} --agent "{}" --video-dir "{}" --video-name "{}" --recording-length {} --algo {} --envconfig {}{}'.format(
                        args.env, agent_filepath, video_folder,
                        args.env + '-' + str(total_t_steps), recording_length,
                        args.algo, envconfig_string,
                        ' --recurrent' if args.recurrent else '')
                subprocess.Popen(cmd)

            n_episodes = report_env.episode
            n_updates += 1

        agent.learn(total_timesteps=1500000, tb_log_name='log', callback=callback)

    elif (args.mode in ['policyplot', 'vectorfieldplot', 'streamlinesplot']):
        figure_folder = os.path.join(DIR_PATH, 'logs', 'plots', args.env, EXPERIMENT_ID)
        os.makedirs(figure_folder, exist_ok=True)
        agent = PPO2.load(args.agent)

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))
            for valuedict in valuegrid:
                customconfig = envconfig.copy()
                customconfig.update(valuedict)
                env = create_env(env_id,
                                 envconfig,
                                 test_mode=True,
                                 pilot=args.pilot)
                valuedict_str = '_'.join(
                    (key + '-' + str(val) for key, val in valuedict.items()))

                print('Running {} test for {}...'.format(args.mode, valuedict_str))

                if args.mode == 'policyplot':
                    gym_auv.reporting.plot_actions(env,
                                                   agent,
                                                   fig_dir=figure_folder,
                                                   fig_prefix=valuedict_str)
                elif args.mode == 'vectorfieldplot':
                    gym_auv.reporting.plot_vector_field(env,
                                                        agent,
                                                        fig_dir=figure_folder,
                                                        fig_prefix=valuedict_str)
                elif args.mode == 'streamlinesplot':
                    gym_auv.reporting.plot_streamlines(env,
                                                       agent,
                                                       fig_dir=figure_folder,
                                                       fig_prefix=valuedict_str)
        else:
            env = create_env(env_id, envconfig, test_mode=True, pilot=args.pilot)
            with open(os.path.join(figure_folder, 'config.json'), 'w') as f:
                json.dump(env.config, f)

            if args.mode == 'policyplot':
                gym_auv.reporting.plot_actions(env, agent, fig_dir=figure_folder)
            elif args.mode == 'vectorfieldplot':
                gym_auv.reporting.plot_vector_field(env, agent, fig_dir=figure_folder)
            elif args.mode == 'streamlinesplot':
                gym_auv.reporting.plot_streamlines(env, agent, fig_dir=figure_folder)

        print('Output folder: ', figure_folder)
    elif args.mode == 'test':
        figure_folder = os.path.join(DIR_PATH, 'logs', 'tests', args.env, EXPERIMENT_ID)
        scenario_folder = os.path.join(figure_folder, 'scenarios')
        video_folder = os.path.join(figure_folder, 'videos')
        os.makedirs(figure_folder, exist_ok=True)
        os.makedirs(scenario_folder, exist_ok=True)
        os.makedirs(video_folder, exist_ok=True)

        if not args.onlyplot:
            agent = model.load(args.agent)

        def create_test_env(video_name_prefix, envconfig=envconfig):
            print('Creating test environment: ' + env_id)
            env = create_env(env_id,
                             envconfig,
                             test_mode=True,
                             render_mode=args.render if args.video else None,
                             pilot=args.pilot)
            vec_env = DummyVecEnv([lambda: env])
            if args.video:
                video_length = min(500, args.recording_length)
                recorded_env = VecVideoRecorder(
                    vec_env,
                    video_folder,
                    record_video_trigger=lambda x: (x % video_length) == 0,
                    video_length=video_length,
                    name_prefix=video_name_prefix)
            active_env = recorded_env if args.video else vec_env
            return env, active_env

        failed_tests = []

        def run_test(id, reset=True, report_dir=figure_folder, scenario=None,
                     max_t_steps=None, env=None, active_env=None):
            nonlocal failed_tests

            if env is None or active_env is None:
                env, active_env = create_test_env(video_name_prefix=args.env + '_' + id)

            if scenario is not None:
                obs = active_env.reset()
                env.load(args.scenario)
                print('Loaded', args.scenario)
            else:
                if reset:
                    obs = active_env.reset()
                else:
                    obs = env.observe()

            gym_auv.reporting.plot_scenario(env,
                                            fig_dir=scenario_folder,
                                            fig_postfix=id,
                                            show=args.onlyplot)
            if args.onlyplot:
                return

            cumulative_reward = 0
            t_steps = 0
            if max_t_steps is None:
                done = False
            else:
                done = t_steps > max_t_steps

            while not done:
                action, _states = agent.predict(obs, deterministic=not args.stochastic)
                obs, reward, done, info = active_env.step(action)
                if args.video:
                    active_env.render()
                t_steps += 1
                cumulative_reward += reward[0]
                report_msg = '{:<20}{:<20}{:<20.2f}{:<20.2%}\r'.format(
                    id, t_steps, cumulative_reward, info[0]['progress'])
                sys.stdout.write(report_msg)
                sys.stdout.flush()

                if args.save_snapshots and t_steps % 30 == 0 and not done:
                    env.save_latest_episode(save_history=False)
                    for size in (20, 50, 100, 200, 300, 400, 500):
                        gym_auv.reporting.plot_trajectory(
                            env,
                            fig_dir=scenario_folder,
                            fig_prefix=(args.env + '_t_step_' + str(t_steps) +
                                        '_' + str(size) + '_' + id),
                            local=True,
                            size=size)
                elif done:
                    gym_auv.reporting.plot_trajectory(
                        env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))

            env.close()

            gym_auv.reporting.report(env, report_dir=report_dir, lastn=-1)
            # gym_auv.reporting.plot_trajectory(env, fig_dir=scenario_folder, fig_prefix=(args.env + '_' + id))
            # env.save(os.path.join(scenario_folder, id))
            if env.collision:
                failed_tests.append(id)
                with open(os.path.join(figure_folder, 'failures.txt'), 'w') as f:
                    f.write(', '.join(map(str, failed_tests)))

            return copy.deepcopy(env.last_episode)

        print('Testing scenario "{}" for {} episodes.\n '.format(args.env, args.episodes))
        report_msg_header = '{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}{:<20}'.format(
            'Episode', 'Timesteps', 'Cum. Reward', 'Progress', 'Collisions',
            'CT-Error [m]', 'H-Error [deg]')
        print(report_msg_header)
        print('-' * len(report_msg_header))

        if args.testvals:
            testvals = json.load(open(args.testvals, 'r'))
            valuegrid = list(ParameterGrid(testvals))

        if args.scenario:
            if args.testvals:
                episode_dict = {}
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val) for key, val in valuedict.items()))
                    colorval = -np.log10(valuedict['reward_lambda'])  # should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' + str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict[valuedict_str] = [last_episode, colorval]

                print('Plotting all')
                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=scenario_folder,
                                                  fig_prefix=(args.env + '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                run_test("ep0", reset=True, scenario=args.scenario)
        else:
            if args.testvals:
                episode_dict = {}
                agent_index = 1
                for valuedict in valuegrid:
                    customconfig = envconfig.copy()
                    customconfig.update(valuedict)
                    env, active_env = create_test_env(envconfig=customconfig)
                    valuedict_str = '_'.join(
                        (key + '-' + str(val) for key, val in valuedict.items()))
                    colorval = np.log10(valuedict['reward_lambda'])  # should be general

                    rep_subfolder = os.path.join(figure_folder, valuedict_str)
                    os.makedirs(rep_subfolder, exist_ok=True)
                    for episode in range(args.episodes):
                        last_episode = run_test(valuedict_str + '_ep' + str(episode),
                                                report_dir=rep_subfolder)
                    episode_dict['Agent ' + str(agent_index)] = [last_episode, colorval]
                    agent_index += 1

                gym_auv.reporting.plot_trajectory(env,
                                                  fig_dir=figure_folder,
                                                  fig_prefix=(args.env + '_all_agents'),
                                                  episode_dict=episode_dict)
            else:
                env, active_env = create_test_env(video_name_prefix=args.env)
                for episode in range(args.episodes):
                    run_test('ep' + str(episode), env=env, active_env=active_env)

        if args.video and active_env:
            active_env.close()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Example on how to use the 'Pendulum' OpenAI Gym environment in PRL using
the `stable_baselines` library.
"""

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from pyrobolearn.envs import gym  # this is a thin wrapper around the gym library

# create env, state, and action from gym
env = gym.make('Pendulum-v0')
state, action = env.state, env.action
print("State and action space: {} and {}".format(state.space, action.space))

# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: env])

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()
def main():
    # create the log directory
    log_dir = "tmp/"
    os.makedirs(log_dir, exist_ok=True)

    # create the environments
    envRoll = gym.make('gym_foo:DroneRoll-v0')
    envRoll = Monitor(envRoll, log_dir)
    modelRoll = PPO2(MlpPolicy, envRoll, gamma=0.99, n_steps=2048, ent_coef=0.0,
                     learning_rate=3e-4, lam=0.95, nminibatches=32,
                     noptepochs=10, cliprange=0.2, verbose=1)

    envPitch = gym.make('gym_foo:DronePitch-v0')
    envPitch = Monitor(envPitch, log_dir)
    modelPitch = PPO2(MlpPolicy, envPitch, gamma=0.99, n_steps=2048, ent_coef=0.0,
                      learning_rate=3e-4, lam=0.95, nminibatches=32,
                      noptepochs=10, cliprange=0.2, verbose=1)

    envYaw = gym.make('gym_foo:DroneYaw-v0')
    envYaw = Monitor(envYaw, log_dir)
    modelYaw = PPO2(MlpPolicy, envYaw, gamma=0.99, n_steps=2048, ent_coef=0.0,
                    learning_rate=3e-4, lam=0.95, nminibatches=32,
                    noptepochs=10, cliprange=0.2, verbose=1)

    callback = SaveOnBestTrainingRewardCallback(check_freq=1000, log_dir=log_dir)

    # training
    time_steps = 2e6
    modelRoll.learn(total_timesteps=int(2e6), callback=callback)
    results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO Roll")
    plt.show()

    modelPitch.learn(total_timesteps=int(2e6), callback=callback)
    results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO Pitch")
    plt.show()

    modelYaw.learn(total_timesteps=int(2e6), callback=callback)
    results_plotter.plot_results([log_dir], time_steps, results_plotter.X_TIMESTEPS, "PPO Yaw")
    plt.show()

    # save the models
    modelRoll.save("Drone_Roll_PPO_001")
    modelPitch.save("Drone_Pitch_PPO_001")
    modelYaw.save("Drone_Yaw_PPO_001")

    # load a model
    # model = PPO2.load("Drone_Roll_PPO_0.01")

    # test: generate the time response
    T = [0]

    # test loop
    t = 0
    # obs = env.reset()
    obsRoll = envRoll.reset()
    obsPitch = envPitch.reset()
    obsYaw = envYaw.reset()
    Roll = [envRoll.state[0]]
    Pitch = [envPitch.state[0]]
    Yaw = [envYaw.state[0]]

    # simulation loop
    while t < 10:  # up to 10 seconds
        actionRoll, _states = modelRoll.predict(obsRoll)
        # Retrieve new state, reward, and whether the state is terminal
        obsRoll, reward, done, info = envRoll.step(actionRoll)
        Roll.append((180 / np.pi) * envRoll.state[0])

        actionPitch, _states = modelPitch.predict(obsPitch)
        # Retrieve new state, reward, and whether the state is terminal
        obsPitch, reward, done, info = envPitch.step(actionPitch)
        Pitch.append((180 / np.pi) * envPitch.state[0])

        actionYaw, _states = modelYaw.predict(obsYaw)
        # Retrieve new state, reward, and whether the state is terminal
        obsYaw, reward, done, info = envYaw.step(actionYaw)
        Yaw.append((180 / np.pi) * envYaw.state[0])

        t += 0.01
        T.append(t)

    # Plots
    plt.figure(1)
    plt.plot(T, Roll)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Roll')
    plt.xlabel('Time (seconds)')
    plt.title('Roll Response')
    plt.grid()
    plt.show()

    plt.figure(2)
    plt.plot(T, Pitch)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Pitch')
    plt.xlabel('Time (seconds)')
    plt.title('Pitch Response')
    plt.grid()
    plt.show()

    plt.figure(3)
    plt.plot(T, Yaw)
    plt.yticks(np.arange(0, 190, 10))
    plt.ylabel('Yaw')
    plt.xlabel('Time (seconds)')
    plt.title('Yaw Response')
    plt.grid()
    plt.show()
INITIAL_ACCOUNT_BALANCE = 50000
model_name = 'dqn'

# dataset loading
df_train = pd.read_csv('./data/SPY_training.csv')
df_train = df_train.sort_values('Date')
df_test = pd.read_csv('./data/SPY_test.csv')
df_test = df_test.sort_values('Date')

# training
env = DummyVecEnv([lambda: StockTradingEnv(df_train)])
model = PPO2(MlpPolicy, env, verbose=1, seed=42, n_cpu_tf_sess=1,
             tensorboard_log="./tensorboard/")
# kwargs = {'double_q': False, 'prioritized_replay': False, 'policy_kwargs': dict(dueling=False)}
# model = DQN(MlpPolicy, env, verbose=1, seed=42, n_cpu_tf_sess=1, tensorboard_log="./tensorboard/", **kwargs)
model.learn(total_timesteps=40000, log_interval=10)
# model.save(save_dir + model_name)
# del model
# model = DQN.load(save_dir + model_name)

# # testing (previous 5 days)
# env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
# obs = env.reset()
# daily_profit = []
# buy_hold_profit = []
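# The test phase above is commented out; a minimal sketch of how it could
# continue, assuming StockTradingEnv reports per-step profit in its info dict
# (the 'profit' key is an assumption):
env = DummyVecEnv([lambda: StockTradingEnv(df_test)])
obs = env.reset()
daily_profit = []
done = False
while not done:
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    daily_profit.append(info[0].get('profit'))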
import os

import matplotlib.pyplot as plt
from gym_minigrid.wrappers import *
from gym_minigrid.wrappers import FlatObsWrapper
from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv

tensorboard_folder = '/root/code/stable_baselines/tensorboard/MiniGrid-Empty-16x16/'
model_folder = './models/MiniGrid-Empty-16x16/'
if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

env = gym.make('MiniGrid-Empty-16x16-v0')
env = FlatObsWrapper(env)

model = PPO2('MlpPolicy', env, verbose=0, nminibatches=1, n_steps=128,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=1000000, tb_log_name='PPO2')
model.save(model_folder + "PPO2")
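# A minimal sketch of reloading the saved policy and watching it act in the
# same environment (the rollout length is an assumption):
model = PPO2.load(model_folder + "PPO2")
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    env.render()
    if done:
        obs = env.reset()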