if not os.path.isdir(tensorboard_folder):
    os.makedirs(tensorboard_folder)
if not os.path.isdir(model_folder):
    os.makedirs(model_folder)

policy = ''
model_tag = ''
if len(sys.argv) > 1:
    policy = sys.argv[1]
    model_tag = '_' + sys.argv[1]

env = DummyVecEnv([lambda: ActionMaskEnv()])
env = VecFrameStack(env, 3)

model = PPO2(get_policy(policy), env, verbose=0, nminibatches=1,
             tensorboard_log=tensorboard_folder)
model.learn(total_timesteps=2500000, tb_log_name='PPO2' + model_tag)
model.save(model_folder + "PPO2" + model_tag)

del model

model = PPO2.load(model_folder + "PPO2" + model_tag)

done = False
states = None
action_masks = []
obs = env.reset()
while not done:
    action, states = model.predict(obs, states, action_mask=action_masks)
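    # --- hedged completion sketch (not in the original fragment) ---
    # The snippet breaks off here; presumably the loop steps the vectorized
    # env and refreshes the action masks. The 'action_mask' info key is an
    # assumption about what ActionMaskEnv reports and may differ.
    obs, reward, done, info = env.step(action)
    action_masks = [inf.get('action_mask') for inf in info]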
def __init__(self, sim_env_name='Hopper-v2', real_env_name='HopperModified-v2',
             frames=NUM_FRAMES_INPUT, num_cores=NUM_CORES,
             num_rl_threads=NUM_RL_THREADS, load_policy=None, algo=None):
    self.env_name = sim_env_name
    self.real_env_name = real_env_name
    self.frames = frames
    self.num_cores = num_cores
    self.fwd_norms_x = (0., 1.)
    self.fwd_norms_y = (0., 1.)
    self.inv_norms_x = (0., 1.)
    self.inv_norms_y = (0., 1.)
    self.num_rl_threads = num_rl_threads

    self.real_env = SubprocVecEnv(
        [lambda: gym.make(self.real_env_name) for i in range(self.num_cores)])
    print('MODIFIED ENV BODY_MASS : ', gym.make(self.real_env_name).model.body_mass)

    self.sim_env = SubprocVecEnv(
        [lambda: gym.make(self.env_name) for i in range(self.num_cores)])
    print('SIMULATED ENV BODY_MASS : ', gym.make(self.env_name).model.body_mass)

    # lists to reuse experience from previous grounding steps
    self.fwd_model_x_list = []
    self.fwd_model_y_list = []
    self.inv_model_x_list = []
    self.inv_model_y_list = []

    # initialize target policy
    if load_policy is None:
        print('LOADING -RANDOM- INITIAL POLICY')
        self.target_policy = PPO2(
            MlpPolicy,
            env=self.sim_env,
            verbose=1,
            tensorboard_log='data/TBlogs/' + self.env_name)
    else:
        print('LOADING -PRETRAINED- INITIAL POLICY')
        # self.target_policy = SAC.load(
        #     load_policy,
        #     env=SubprocVecEnv([lambda: gym.make(self.env_name)]),
        #     tensorboard_log='data/TBlogs/'+self.env_name,
        #     verbose=1,
        #     batch_size=256,
        #     buffer_size=1000000,
        # )
        # TODO: write easy way to switch algorithms
        # self.target_policy = PPO2.load(
        #     load_policy,
        #     env=SubprocVecEnv([lambda: gym.make(self.env_name)]),
        #     tensorboard_log='TBlogs/'+self.env_name,
        #     verbose=1,
        #     n_steps=256,
        #     # buffer_size=1000000,
        # )
        n_actions = self.sim_env.action_space.shape[-1]
        action_noise = NormalActionNoise(mean=np.zeros(n_actions),
                                         sigma=0.1 * np.ones(n_actions))
        self.target_policy = TD3.load(
            load_policy,
            env=DummyVecEnv([lambda: gym.make(self.env_name)]),
            tensorboard_log='data/TBlogs/' + self.env_name,
            verbose=1,
            batch_size=128,
            gamma=0.99,
            learning_rate=0.001,
            action_noise=action_noise,
            buffer_size=1000000,
        )

    # define the Grounded Action Transformer models here
    self._init_gat_models()
    self.grounded_sim_env = None
print(test_df.describe())

test_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(test_df,
                              reward_func=reward_strategy,
                              forecast_len=int(params['forecast_len']),
                              confidence_interval=params['confidence_interval'])
])

model_params = {
    'n_steps': int(params['n_steps']),
    'gamma': params['gamma'],
    'learning_rate': params['learning_rate'],
    'ent_coef': params['ent_coef'],
    'cliprange': params['cliprange'],
    'noptepochs': int(params['noptepochs']),
    'lam': params['lam'],
}

model = PPO2.load('./agents/ppo2_' + reward_strategy + '_' + str(curr_idx) + '.pkl',
                  env=test_env)

obs, done = test_env.reset(), False
while not done:
    action, _states = model.predict(obs)
    obs, reward, done, info = test_env.step(action)
    test_env.render(mode="system")
def generate_checkpoint_from_model(model, checkpoint_name):
    with model.graph.as_default():
        # if os.path.exists(checkpoint_name):
        #     shutil.rmtree(checkpoint_name)
        tf.saved_model.simple_save(
            model.sess,
            checkpoint_name,
            inputs={"obs": model.act_model.obs_ph},
            outputs={"action": model.act_model._deterministic_action})


if __name__ == '__main__':
    if os.path.isdir(file):
        shutil.rmtree(file)

    model = PPO2.load(file)
    generate_checkpoint_from_model(model, file)

    converter = tf.lite.TFLiteConverter.from_saved_model(file)
    tflite_model = converter.convert()
    open(file + "/converted_model.tflite", "wb").write(tflite_model)

    # multiprocess environment
    n_cpu = 1
    # 'Balboa-balance-ctrl-render-v1'
    env = gym.make('AntPyBulletEnv-v0')
    env.render(mode="human")
    obs = env.reset()

    # When using VecEnv, done is a vector
    while True:
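        # --- hedged completion sketch (not in the original fragment) ---
        # Assumes the flattened snippet continued with a standard rollout of
        # the loaded PPO2 policy in the single (non-vectorized) PyBullet env.
        action, _states = model.predict(obs)
        obs, reward, done, info = env.step(action)
        env.render(mode="human")
        if done:
            obs = env.reset()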
import roboschool
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

env = gym.make('RoboschoolHopper-v1')
env = DummyVecEnv([lambda: env])  # The algorithms require a vectorized environment to run

model = PPO2(MlpPolicy, env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
for i in range(1000000):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    env.render()

env.close()
#parser.add_argument('--no-train', dest='train', action='store_false')
#parser.set_defaults(train=True)
args = parser.parse_args()

import logging

from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2

from aegis_core.aegis_env import AegisEnv

log = logging.getLogger('werkzeug')
log.setLevel(logging.ERROR)

# Create environment
env = AegisEnv(args.input_shape, args.output_shape, args.urls,
               port=args.port, discrete=args.discrete, niceness=args.niceness,
               n_steps=args.steps, reward_propagation=args.reward_prop)
env = DummyVecEnv([lambda: env])

# load model
model = PPO2.load(args.path, env, verbose=args.verbose,
                  tensorboard_log=args.logdir)

# train
ep_counter = 0
while True:
    env.reset()  # TODO: is this necessary?
    model.learn(total_timesteps=args.steps, reset_num_timesteps=False,
                tb_log_name=args.name)
    ep_counter += 1
    # TODO: actual step counter might be off because .learn might have different intervals
    print("Steps: {}".format(ep_counter * args.steps))
    model.save(args.path)
sps = my_signal_rate / my_signal_repetitions

env = CustomEnv(signal_rate=my_signal_rate,
                signal_repetitions=my_signal_repetitions,
                step_limit=my_step_limit,
                number_of_gears=my_number_of_gears,
                gear_interval=my_gear_interval)

# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
# env = DummyVecEnv([lambda: env])

my_learning_rate = 0.01  # 0.01 is probably a good value for training <1h
timesteps = 5000

model = PPO2(MlpPolicy, env, learning_rate=my_learning_rate, verbose=1,
             tensorboard_log="/home/fritz/Documents/TensorBoardLogs")  # defaults: learning_rate=2.5e-4,
model.learn(total_timesteps=timesteps)

name = "ppo2_franka_SHIFTING_GEARS_learning_rate_" + str(
    my_learning_rate) + "_sps_" + str(sps) + "_timesteps_" + str(timesteps)
model.save(name)  # + str(my_learning_rate))

f = open("envparameters_" + name, "x")
f.write(
    str([
        my_signal_rate, my_signal_repetitions, my_step_limit,
        my_number_of_gears, my_gear_interval
    ]))
f.close()
    return envs


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Train script for reinforcement learning")
    parser.add_argument('--algo', type=str, default='ppo2')
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--log-dir', type=str, default='/logs/')
    parser.add_argument('--env', type=str, default='KukaButtonGymEnv-v0')
    parser.add_argument('--num-timesteps', type=int, default=int(1e5))
    parser.add_argument('--obs-type', type=str, default='ground_truth')
    parser.add_argument('--num-cpu', type=int, default=1)
    args, unknown = parser.parse_known_args()

    env_class = InmoovGymEnv
    # env default kwargs
    default_env_kwargs = {
        k: v.default
        for k, v in inspect.signature(env_class.__init__).parameters.items()
        if v is not None
    }

    env = createEnvs(args)
    tt()
    model = PPO2(policy=MlpPolicy, env=env,
                 learning_rate=lambda f: f * 2.5e-4, verbose=1)
    model.learn(total_timesteps=args.num_timesteps, seed=args.seed)
# callback for evaluation
eval_callback = EvalCallback(env, best_model_save_path=specified_path,
                             log_path=specified_path, eval_freq=100000,
                             n_eval_episodes=5, verbose=1,
                             deterministic=False, render=False)

# train model
try:
    try:
        model_path = join(specified_path, 'best_model.zip')
        model = PPO2.load(model_path, env=env_8, tensorboard_log=specified_path)
        # model = PPO2('MlpPolicy', env=env_8, tensorboard_log=specified_path, **model_config).load(args.modelpath, env=env_8)
        print("Existing model loaded from directory")
    except:
        model = PPO2(policy, env=env_8, tensorboard_log=specified_path, **model_config)
        print('New model created.')

    # Pretrain the model
    print('Starting pre-training of the model..')
    model.pretrain(dataset, n_epochs=100)
    model.save(join(specified_path, 'pretrained-model.zip'))
import warnings

from stable_baselines import TRPO, PPO2, SAC, ACKTR, DDPG, TD3, ACER, DQN
from stable_baselines.common.vec_env import DummyVecEnv, SubprocVecEnv
from navigation_env import NavigationEnvDefault
from default_config import config
from stable_baselines.gail import generate_expert_traj
from stable_baselines.gail import ExpertDataset

if __name__ == "__main__":
    env = SubprocVecEnv(
        [lambda: NavigationEnvDefault(**config) for _ in range(32)])
    model = PPO2(env=env, policy="MlpLstmPolicy", n_steps=32, nminibatches=4,
                 tensorboard_log='./', verbose=1)
    model.learn(1000000000)
    model.save("recurrent_nodelay")
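    # --- hedged usage sketch (not in the original file) ---
    # Running the saved recurrent policy afterwards: stable-baselines LSTM
    # policies carry hidden state across predict() calls and generally expect
    # the same number of parallel envs at prediction time as at training time
    # (32 here), so the rollout keeps `state` and the per-env done mask.
    model = PPO2.load("recurrent_nodelay", env=env)
    obs = env.reset()
    state = None
    dones = [False] * env.num_envs
    for _ in range(1000):
        action, state = model.predict(obs, state=state, mask=dones)
        obs, rewards, dones, infos = env.step(action)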
print("act dim : ", env.action_space) loadFileIndex = 2 loadFileString = "ppo2_pybulletAnt_6_end_{}".format(loadFileIndex) saveFileString = "ppo2_pybulletAnt_6_end_{}".format(loadFileIndex + 1) print("loadFile : ", loadFileString) print("saveFile : ", saveFileString) isTrain = True isContinue = True if isTrain: print("start training =========================================") if not isContinue: model = PPO2(MlpPolicy, env, verbose=1) else: print("load model =========================================") model = PPO2.load(loadFileString, env) model.learn(total_timesteps=1000000) print("end training =========================================") model.save(saveFileString) print("saved model =========================================") else: print("load model =========================================") model = PPO2.load(loadFileString, env) print("start test =========================================")
plt.tick_params(bottom=False, labelbottom=False, left=False, labelleft=False,
                right=False, labelright=False, top=False, labeltop=False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)

for mf in model_files:
    iteration = get_model_iteration(mf)
    model = PPO2.load(mf)

    ims = []
    observation = env.reset()
    done_count = 0
    while True:
        img = env.render()
        im = plt.imshow(img)
        ims.append([im])
        action, _ = model.predict(observation)
        observation, reward, done, info = env.step(action)
        if done:
            done_count += 1
import pybullet_envs.bullet.minitaur_gym_env as e
from stable_baselines import PPO2

total_timesteps = 1000000

model = PPO2.load("./model/model{}".format(total_timesteps))

env = e.MinitaurBulletEnv(render=True)
obs = env.reset()
for i in range(50000):
    action, _states = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render(mode="human")

env.close()
import os

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO2, DQN

from envs import Trainer

# Environment
env = DummyVecEnv([lambda: Trainer()])

# Train
model = PPO2(MlpPolicy, env, tensorboard_log='log')
model.learn(total_timesteps=100000)

# Save model
model.save('model')

# Play
episodes = 1
for e in range(episodes):
    obs = env.reset()
    done = False
    i = 0
    while not done:
        # Select action
        action, _states = model.predict(obs)
        # Perform action
        obs, reward, done, _ = env.step(action)
        print('Iteration: {i} - Action: {a} - Reward: {r}'.format(i=i, a=action, r=reward))
        # Render environment
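        # --- hedged completion sketch (not in the original fragment) ---
        # The snippet breaks off after the comment above; the obvious
        # continuation is rendering the env and advancing the iteration
        # counter, which the original loop otherwise never increments.
        env.render()
        i += 1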
def main():
    parser = argparse.ArgumentParser(
        description='Plotting mechanisms for GARAT and related modifications')
    parser.add_argument('--sim_env', default="InvertedPendulum-v2", type=str,
                        help="Name of the simulator/source environment")
    parser.add_argument('--real_env', default="InvertedPendulumModified-v2", type=str,
                        help="Name of the real/target environment")
    parser.add_argument(
        '--load_policy_path',
        default="data/models/TRPO_initial_policy_steps_InvertedPendulum-v2_2000000_.pkl",
        help="relative path of policy to be used for generating plots")
    parser.add_argument(
        '--load_atp_path',
        default="data/models/garat/Single_GAIL_sim2real_TRPO_2000000_1000_50_0/",
        type=str,
        help="relative path for stored Action transformation policies")
    parser.add_argument('--seed', default=0, type=int, help="Random seed")

    args = parser.parse_args()

    # Set seed
    np.random.seed(args.seed)

    sim_env = gym.make(args.sim_env)
    real_env = gym.make(args.real_env)
    policy = TRPO.load(args.load_policy_path)

    action_tf_policy_list_single = []
    action_tf_policy_list_double = []
    action_tf_policy_list_shared_double = []
    action_tf_policy_list_airl = []

    num_grounding = 50
    atp_path_single = args.load_atp_path
    atp_path_double = args.load_atp_path.replace('_0', '_2')
    atp_path_shared_double = args.load_atp_path.replace('_0', '_1')
    atp_path_airl = args.load_atp_path.replace(
        'Single_GAIL_sim2real_TRPO_2000000_1000_50_0',
        'Single_AIRL_sim2real_TRPO_2000000_1000_50_1')

    print('################## Begin File loading ##################')
    for index in range(num_grounding):
        file_path_single = os.path.join(
            atp_path_single, "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_single)
        action_tf_policy_list_single.append(PPO2.load(file_path_single))

        file_path_double = os.path.join(
            atp_path_double, "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_double)
        action_tf_policy_list_double.append(PPO2.load(file_path_double))

        file_path_shared_double = os.path.join(
            atp_path_shared_double, "action_transformer_policy1_" + str(index) + ".pkl")
        print(file_path_shared_double)
        action_tf_policy_list_shared_double.append(
            PPO2.load(file_path_shared_double))

        # file_path_airl = os.path.join(atp_path_airl, "action_transformer_policy1_" + str(index) + ".pkl")
        # print(file_path_airl)
        # action_tf_policy_list_airl.append(PPO2.load(file_path_airl))

    results_dict = {}
    print('################## File loading Completed ##################')

    results_single = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_single)
    print('############## Begin Double Discriminator Calculations')
    results_shared_double = calculate_transition_errors(
        sim_env, real_env, policy, action_tf_policy_list_shared_double)
    results_double = calculate_transition_errors(sim_env, real_env, policy,
                                                 action_tf_policy_list_double)
    print('############## Begin AIRL Calculations')
    # results_airl = calculate_transition_errors(sim_env, real_env, policy, action_tf_policy_list_airl)

    results_dict['GARAT'] = results_single
    results_dict['GARAT Double Discriminator'] = results_double
    results_dict['GARAT Double Discriminator (Generator LR modifications)'] = results_shared_double
    # results_dict['GARAT AIRL'] = results_airl

    plot_results(results_dict)
# Create the callback list
callback = CallbackList([checkpoint_callback, eval_callback])

lr_sch = LinearSchedule(int(10e6), 1.0e-5, 2.5e-4)

model = PPO2(
    policy=MlpPolicy,
    env=env,
    verbose=1,
    tensorboard_log="./ppo2_docking_tensorboard/",
    policy_kwargs=dict(net_arch=[128, dict(pi=[128], vf=[128])],
                       act_fun=tf.nn.relu),
    lam=0.95,
    gamma=0.99,  # lower 0.9 ~ 0.99
    # n_steps=math.floor(cfg['env']['max_time'] / cfg['env']['ctl_dt']),
    n_steps=600,
    ent_coef=0.00,
    learning_rate=3e-4,
    # learning_rate=lr_sch.value,
    # learning_rate=linear_schedule(3e-4),
    vf_coef=0.5,
    max_grad_norm=0.5,
    nminibatches=10,
    noptepochs=10,
    cliprange=0.2)

# load trained model
# model = PPO2.load("ppo2_docking_621_shaping_10M", env=env, tensorboard_log="./ppo2_docking_tensorboard/")

model.learn(total_timesteps=int(10e6), callback=callback)
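# --- hedged sketch (assumption, not from the original file) ---
# The commented-out learning_rate=linear_schedule(3e-4) line above implies a
# helper along these lines: stable-baselines 2 calls a callable learning rate
# with the fraction of training remaining (1.0 -> 0.0), so a linear decay is
# just the initial value scaled by that fraction.
def linear_schedule(initial_value):
    def schedule(progress_remaining):
        # progress_remaining goes from 1.0 (start) to 0.0 (end of training)
        return progress_remaining * initial_value
    return schedule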
from stable_baselines import PPO2


def env_create():
    env = ClientDapr("ActorOpenAI")
    env.create("CartPole-v1")
    print(f"[Client] Created Actor {env.actor_id}", flush=True)
    return env


print("===============================================")
print("INFERRING")
print("===============================================")

model = PPO2.load("baselines_ppo_cartpole")
env_local = env_create()

# Start monitoring
print("[Client] Starting to monitor", flush=True)
env_local.monitor_start(1)

# Run Experiment
obs = env_local.reset()
is_done = False

while not is_done:
    action, _states = model.predict(obs)
    obs, rewards, is_done, info = env_local.step(action)

# Stop Monitoring
def train(env_id, num_timesteps, seed, policy, n_envs=8, nminibatches=4,
          n_steps=128, peer=0., scheduler=None, individual=False, repeat=1):
    """
    Train PPO2 model for atari environment, for testing purposes

    :param env_id: (str) the environment id string
    :param num_timesteps: (int) the number of timesteps to run
    :param seed: (int) Used to seed the random generator.
    :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...)
    :param n_envs: (int) Number of parallel environments
    :param nminibatches: (int) Number of training minibatches per update.
        For recurrent policies, the number of environments run in parallel
        should be a multiple of nminibatches.
    :param n_steps: (int) The number of steps to run for each environment per update
        (i.e. batch size is n_steps * n_env where n_env is number of environment
        copies running in parallel)
    """
    policy = {
        'cnn': CnnPolicy,
        'lstm': CnnLstmPolicy,
        'lnlstm': CnnLnLstmPolicy,
        'mlp': MlpPolicy
    }[policy]

    is_atari = 'NoFrameskip' in env_id
    make_env = lambda: VecFrameStack(make_atari_env(env_id, n_envs, seed), 4) if is_atari \
        else make_vec_env(env_id, n_envs, seed)
    print(make_env)

    models = {
        "A": PPO2(
            policy=policy, policy_kwargs={'view': 'even'}, n_steps=n_steps,
            env=make_env(), nminibatches=nminibatches, lam=0.95, gamma=0.99,
            noptepochs=4, ent_coef=.01, learning_rate=2.5e-4,
            cliprange=lambda f: f * 0.1, verbose=1),
        "B": PPO2(
            policy=policy, policy_kwargs={'view': 'odd'}, n_steps=n_steps,
            env=make_env(), nminibatches=nminibatches, lam=0.95, gamma=0.99,
            noptepochs=4, ent_coef=.01, learning_rate=2.5e-4,
            cliprange=lambda f: f * 0.1, verbose=1)}

    views = {view: View(models[view], peer=peer) for view in ("A", "B")}

    n_batch = n_envs * n_steps
    n_updates = num_timesteps // n_batch

    for t in range(n_updates):
        logger.info("current episode:", t)
        for view in "A", "B":
            models[view].learn(n_batch)
        if not individual:
            for view, other_view in zip(("A", "B"), ("B", "A")):
                obses, _, _, actions, _, _, _, _, _ = models[other_view].rollout
                views[view].peer = peer * scheduler(t)
                logger.info("current alpha:", views[view].peer)
                for _ in range(repeat):
                    views[view].learn(
                        obses, actions, views[view].learning_rate / repeat)

    for view in "A", "B":
        models[view].env.close()
        del models[view]  # free memory
        'D': [450, 450, 5000]
    },
    'Linear_To_Angular_Scaler': [1, 1, 0],
    'Yaw_Rate_Scaler': 0.18,
    'Angular_PID': {
        'P': [24000, 24000, 1500],
        'I': [0, 0, 1.2],
        'D': [12000, 12000, 0]
    },
    'Angular_PID2': {
        'P': [4000, 4000, 1500],
        'I': [0, 0, 1.2],
        'D': [1500, 1500, 0]
    },
}

env = Quad_Env()
env = make_vec_env(lambda: env, n_envs=1)

# If the environment doesn't follow the interface, an error will be thrown
obs = env.reset()

model = PPO2(MlpLnLstmPolicy, env, nminibatches=1,
             tensorboard_log="./stationary_env_ppo/")
model.learn(total_timesteps=100000, log_interval=4000)
model.save("ppo_30rotor_fault_blending")
print("Training complete - agent saved")
# ============ Number of days trained =============
REPEAT_NO = 10
tstep_list = [200000]
# tstep_list = [50000, 100000]
# tstep_list = [100000, 500000]

for tstep in tstep_list:
    final_result = []
    summary_fileName = summary_fileName_model[:-5] + str(tstep) + ".out"

    for modelNo in range(REPEAT_NO):
        profit_list = []
        act_profit_list = []
        detail_list = []

        model = PPO2(MlpPolicy, trainEnv, verbose=1,
                     tensorboard_log="./" + SAVE_DIR[-3:] + '_' + str(tstep) + "_tensorboard/")
        model.learn(total_timesteps=tstep, log_interval=128)
        # model.learn(total_timesteps=tstep)

        model_name = common_fileName_prefix + str(tstep) + '-' + str(
            modelNo) + "-model.model"
        model.save(path.join(SAVE_DIR, model_name), cloudpickle=True)

        obs = testEnv.reset()
        # Test for consecutive 2000 days
        for testNo in range(365 * 5):
            action, _states = model.predict(obs)
            if np.isnan(action).any():
                print(testNo)
stiffness_value = "stiffness_test16"
save_name_extension = RL_method

log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value)

# defining the environments
env = gym.make('NmiLeg-v1')
#env = DummyVecEnv([lambda: env])

# loading the trained model
if RL_method == "PPO1":
    model = PPO1.load(log_dir + "/model.pkl")
elif RL_method == "PPO2":
    model = PPO2.load(log_dir + "/model.pkl")
    env = DummyVecEnv([lambda: env])
elif RL_method == "DDPG":
    model = DDPG.load(log_dir + "/model.pkl")
    env = DummyVecEnv([lambda: env])
else:
    raise ValueError("Invalid RL mode")

# setting the environment
model.set_env(env)

env_run = gym.make('NmiLeg-v1')
#env_run = Monitor(env_run,'./video/'+log_dir,force=True)
#model = DDPG.load("PPO2-HalfCheetah_nssu-v3_test2")

obs = env_run.reset()
#while True:
def train(self):
    if self.train_df is None:
        self.logger.info("Running built-in data preparation")
        self.prepare_data()
    else:
        self.logger.info("Using provided data (Length: %d)" % len(self.train_df))

    study_name = 'ppo2_' + self.reward_strategy
    study = optuna.load_study(study_name=study_name, storage=self.params_db_file)
    params = study.best_trial.params

    train_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(self.train_df,
                                  reward_func=self.reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params['confidence_interval'])
    ])
    test_env = DummyVecEnv([
        lambda: BitcoinTradingEnv(self.test_df,
                                  reward_func=self.reward_strategy,
                                  forecast_len=int(params['forecast_len']),
                                  confidence_interval=params['confidence_interval'])
    ])

    model_params = self.model_params(params)

    model = PPO2(MlpLnLstmPolicy, train_env, verbose=0, nminibatches=1,
                 tensorboard_log=os.path.join('.', 'tensorboard'), **model_params)

    models_to_train = 1
    self.logger.info("Training {} model instances".format(models_to_train))

    for idx in range(0, models_to_train):  # Not sure why we are doing this, tbh
        self.logger.info('[%d] Training for: %d time steps' % (idx, len(self.train_df)))

        model.learn(total_timesteps=len(self.train_df))

        obs = test_env.reset()
        done, reward_sum = False, 0

        while not done:
            action, _states = model.predict(obs)
            obs, reward, done, info = test_env.step(action)
            reward_sum += reward

        self.logger.info('[%d] Total reward: %s (%s)' %
                         (idx, reward_sum, self.reward_strategy))

        model.save(os.path.join(
            '.', 'agents',
            'ppo2_' + self.reward_strategy + '_' + str(idx) + '.pkl'))

    self.logger.info("Trained {} model instances".format(models_to_train))
print('Saving the trained model!')
model.save(save_path)

# dump the flow params
with open(os.path.join(path, args.result_name) + '.json', 'w') as outfile:
    json.dump(flow_params, outfile, cls=FlowParamsEncoder, sort_keys=True, indent=4)

del model
del flow_params

# Replay the result by loading the model
print('Loading the trained model and testing it out!')
model = PPO2.load(save_path)
flow_params = get_flow_params(os.path.join(path, args.result_name) + '.json')
flow_params['sim'].render = True

env_constructor = env_constructor(params=flow_params, version=0)()
# The algorithms require a vectorized environment to run
env = DummyVecEnv([lambda: env_constructor])

obs = env.reset()
reward = 0
for i in range(flow_params['env'].horizon):
    action, _states = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    reward += rewards
print('the final reward is {}'.format(reward))
# storage='sqlite:///params.db')
# calmar_env = DummyVecEnv([lambda: BitcoinTradingEnv(
#     test_df, reward_func="profit", forecast_len=int(calmar_study.best_trial.params['forecast_len']),
#     confidence_interval=calmar_study.best_trial.params['confidence_interval'])])

omega_study = optuna.load_study(study_name='ppo2_omega',
                                storage='sqlite:///params.db')
omega_env = DummyVecEnv([
    lambda: BitcoinTradingEnv(
        test_df,
        reward_func="profit",
        forecast_len=int(omega_study.best_trial.params['forecast_len']),
        confidence_interval=omega_study.best_trial.params['confidence_interval'])
])

profit_model = PPO2.load('./agents/ppo2_profit_4.pkl', env=profit_env)
sortino_model = PPO2.load('./agents/ppo2_sortino_4.pkl', env=sortino_env)
# calmar_model = PPO2.load('./agents/ppo2_calmar_4.pkl', env=calmar_env)
omega_model = PPO2.load('./agents/ppo2_omega_4.pkl', env=omega_env)

profit_obs = profit_env.reset()
sortino_obs = sortino_env.reset()
# calmar_obs = calmar_env.reset()
omega_obs = omega_env.reset()

profit_net_worths = [10000]
sortino_net_worths = [10000]
# calmar_net_worths = [10000]
omega_net_worths = [10000]

done = False
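# --- hedged completion sketch (not in the original fragment) ---
# Presumably the comparison loop steps each agent in its own env until one
# episode ends and records net worth per step. The 'net_worth' info key is an
# assumption about what BitcoinTradingEnv reports and may differ.
while not done:
    profit_action, _ = profit_model.predict(profit_obs)
    profit_obs, _, done, info = profit_env.step(profit_action)
    profit_net_worths.append(info[0].get('net_worth', profit_net_worths[-1]))

    sortino_action, _ = sortino_model.predict(sortino_obs)
    sortino_obs, _, _, info = sortino_env.step(sortino_action)
    sortino_net_worths.append(info[0].get('net_worth', sortino_net_worths[-1]))

    omega_action, _ = omega_model.predict(omega_obs)
    omega_obs, _, _, info = omega_env.step(omega_action)
    omega_net_worths.append(info[0].get('net_worth', omega_net_worths[-1]))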
    'NovelGridworld-v3_200000_8beams0filled40range3items_in_360degrees_lfd_best_model',
    'NovelGridworld-v4_200000_8beams0filled40range3items_in_360degrees_lfd_best_model',
    'NovelGridworld-v3_200000_8beams0filled40range3items_in_360degrees_lfd_best_model'
]

assert len(env_key_list) == len(env_models), "Provide both: env_id and their models"

render = True
render_title = ''

env_dict = {env_id: {} for env_id in env_key_list}

# Load the trained agents
for i in range(len(env_key_list)):
    print("env_key_list[i]: ", env_key_list[i])
    env_dict[env_key_list[i]]['model'] = PPO2.load(env_models[i])
    render_title += env_key_list[i] + '_'

render_title = render_title[:-1]
render_title = 'NovelGridworld-v5'

# make 1st env
env_dict[env_key_list[0]]['env'] = gym.make(env_id_list[0])

for i_episode in range(10):
    # make 2nd env, 3rd env, ... nth env that can restore previous env
    for i in range(1, len(env_key_list)):
        env_dict[env_key_list[i]]['env'] = gym.make(
            env_id_list[i], env=env_dict[env_key_list[i - 1]]['env'])

    # Play trained env.
    for env_idx in range(len(env_key_list)):
            print(x[-1], 'timesteps')
            print("Best mean reward: {:.2f} - Last mean reward per episode: {:.2f}".format(
                best_mean_reward, mean_reward))

            # New best model, you could save the agent here
            if mean_reward > best_mean_reward:
                best_mean_reward = mean_reward
                # Example for saving best model
                print("Saving new best model")
                _locals['self'].save(log_dir + 'best_model.pkl')
    return True


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', type=str, default=config_file)
    args = parser.parse_args()

    # Create log dir
    log_dir = "/tmp/gym/"
    os.makedirs(log_dir, exist_ok=True)

    env = HamstirGibsonEnv(config=args.config)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO2(CnnPolicy, env, verbose=1, gamma=0.95, n_steps=2000)
    # print(env.config)
    model.learn(total_timesteps=100000, callback=callback)
def env_create():
    env = ClientDapr("ActorOpenAI")
    env.create("LunarLander-v2")
    print(f"[Client] Created Sim {env.actor_id}", flush=True)
    return env


if __name__ == '__main__':
    print("===============================================", flush=True)
    print("TRAINING", flush=True)
    print("===============================================", flush=True)

    cpu = 50
    env = SubprocVecEnv([lambda: env_create() for _ in range(cpu)])

    model = PPO2(MlpPolicy, env, verbose=1, tensorboard_log="./output/tensorboard")
    model.learn(total_timesteps=100000)

    print("[Client][Train] Saving Model", flush=True)
    model.save("baselines_ppo_cartpole")
    print("[Client][Train] DONE", flush=True)

    # Evaluate
    # mean_reward, std_reward = evaluate_policy(model, model.get_env()[0], n_eval_episodes=10)
    # print(f"Mean Reward: {mean_reward}; Std Reward: {std_reward}")

    # evaluate_actor_id = model.get_env()[0].actor_id
    # print(f"Env ID: {evaluate_actor_id}")
my_step_size = float(f_list[1])
my_maxspeed = float(f_list[2])
my_acceleration = 2.5 / 4
my_randomBall = True
my_binaryReward = True

# Initialize environment with signal parameters:
env = CustomEnv(step_limit=my_step_limit, step_size=my_step_size,
                maxspeed=my_maxspeed, acceleration=my_acceleration,
                randomBall=my_randomBall, binaryReward=my_binaryReward)  # 0.01745*5

# Load trained model and execute it forever:
model = PPO2.load("../Models/" + filename)

while True:
    #obs = env.reset()
    obs = env.reset()
    #obs = obs.reshape((1,4))
    #print(env.observation_space.shape)
    #obs, rewards, dones, info = env.step([0,0])
    for i in range(my_step_limit):  #my_step_limit
        action, _states = model.predict(obs)
        print(action)
        obs, rewards, dones, info = env.step(action)
        #obs = np.array(obs).reshape((1,4))
        env.renderSlow(50)
        if (dones):
            env.renderSlow(1)
def set_seed(rand_seed):
    set_global_seeds(rand_seed)
    env.env_method('seed', rand_seed)
    np.random.seed(rand_seed)
    os.environ['PYTHONHASHSEED'] = str(rand_seed)
    model.set_random_seed(rand_seed)


x = 0.5
env = gym.make('offload-autoscale-v0', p_coeff=x)
# Optional: PPO2 requires a vectorized environment to run
# the env is now wrapped automatically when passing it to the constructor
env = DummyVecEnv([lambda: env])

rand_seed = 1234
model = PPO2(MlpPolicy, env, verbose=1, seed=rand_seed)
model.learn(total_timesteps=1000)

rewards_list_ppo = []
avg_rewards_ppo = []
rewards_time_list_ppo = []
avg_rewards_time_list_ppo = []
rewards_bak_list_ppo = []
avg_rewards_bak_list_ppo = []
rewards_bat_list_ppo = []
avg_rewards_bat_list_ppo = []
avg_rewards_energy_list_ppo = []
ppo_data = []

rewards_list_random = []
avg_rewards_random = []
import os
import sys

import gym
import gym_real
import numpy as np
import matplotlib.pyplot as plt
import datetime

from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv, DummyVecEnv
from stable_baselines import PPO2
from stable_baselines.bench import Monitor
from stable_baselines.results_plotter import load_results, ts2xy

if __name__ == "__main__":
    env_name = str(sys.argv[1])
    file_name = str(sys.argv[2])

    if file_name[:3] == "mod":
        model_name = file_name
    else:
        dirpath = os.path.join(os.path.dirname(os.path.realpath(__file__)), "models")
        log_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "tmp")
        model_name = os.path.join(dirpath, file_name)

    env = gym.make(env_name)
    model = PPO2.load(model_name)

    obs = env.reset()
    for i in range(10000):
        action, _states = model.predict(obs)
        obs, rewards, dones, info = env.step(action)
        env.render()