def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq, ckpt_freq, load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_, n_eval_episodes=eval_ep, eval_freq=eval_freq, log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq, save_path='./run/' + model_name + '/ckpt', name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #         Logging          #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])

    ############################
    #           run            #
    ############################
    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
def train():
    """
    Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1(MlpPolicy, env,
                 timesteps_per_actorbatch=4096,
                 clip_param=0.2, entcoeff=0.0,
                 optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                 gamma=0.99, lam=0.95,
                 schedule='linear', verbose=1)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env

    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.
def getPpo1(env, arch):
    return PPO1(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )
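# A minimal usage sketch for getPpo1 above. The environment id "CartPole-v1", the
# [64, 64] architecture, and the step budget are illustrative assumptions, not values
# taken from the original code.
def _example_getPpo1_usage():
    env = gym.make("CartPole-v1")
    model = getPpo1(env, arch=[64, 64])
    model.learn(total_timesteps=10000)
    env.close()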
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])

    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01,
                 optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95,
                 schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir, args.env + '_' + args.expert + '_' + args.policy_type)

    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError

    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model, os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000, n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir, 'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type, env, dataset, verbose=1, tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
def run_model(self, K, model=None, configs=None):
    if model is None:
        model = self.model
    if configs is None:
        configs = self.configs

    with open(configs, 'rb') as f:
        env_dict = pickle.load(f)

    env = gym.make(env_dict['env_name'])
    env.settings.set("matchframes", self.turns * 10)
    env.init(**env_dict)

    with open(model, 'rb') as f:
        h = PPO1.load(f, env=env)

    obs = env.reset()
    generated_trajectories = []
    for episode in range(K):
        tau = []
        for turn in range(self.turns):
            action, _ = h.predict(obs)
            obs, _, done, _ = env.step(action)
            tau.append(obs)
            if done:
                env.reset()
        generated_trajectories.append(tau)

    env.close()
    # use the builtin float dtype; np.float is deprecated in recent NumPy releases
    return np.array(generated_trajectories).astype(float)
def init(self, **kwargs):
    self.action_idx = [10, 11, 18, 19, 20, 21]
    super().init(**kwargs)
    models = kwargs['inner_models']

    # set up major model and environment parameters
    self.major = PPO1.load(models['major_model'])
    with open(models['major_model_configs'], 'rb') as f:
        self.major_configs = pickle.load(f)
    self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]

    # set up minor model and environment parameters
    self.minor = PPO1.load(models['minor_model'])
    with open(models['minor_model_configs'], 'rb') as f:
        self.minor_configs = pickle.load(f)
    self.minor_action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
def train():
    """
    Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)
    else:
        logger.configure(format_strs=[])

    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env

    if rank == 0:
        model.save(os.path.join(LOGDIR, "final_model"))  # probably never get to this point.
def eval_file(args, evalfile, modelfile, model_index):
    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)
    prove.guidance_time = 0

    fileparts = evalfile.split("/")
    filename = fileparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, filename, args.evaltype, args.evalcount)

    print("\n\nTrying to find proof for {}".format(evalfile))
    success, prooflen, attempts = find_one_proof_nobacktrack(args, model, env, evalfile)

    print("\n\nEVALUATION")
    print(" evaltime: {}".format(args.evaltime))
    print(" evalfile: {}".format(filename))
    print(" model_index: {}".format(model_index))
    print(" evaltype: {}".format(args.evaltype))
    print(" evalcount: {}".format(args.evalcount))
    print(" Success: {}".format(success))
    print(" Proof length: {}".format(prooflen))
    print(" Attempts: {}".format(attempts))
def load_model(self, agent_to_load_directory, is_test=False):
    if self.game_type != "atari":
        if agent_to_load_directory == "":
            self.model = PPO1.load("./models/agentPPO.pkl", env=self.env)
        else:
            self.model = PPO1.load(agent_to_load_directory, env=self.env)
    else:
        if is_test:
            if agent_to_load_directory == "":
                self.model = PPO2.load("./models/agentPPO.pkl")
            else:
                self.model = PPO2.load(agent_to_load_directory)
        else:
            if agent_to_load_directory == "":
                self.model = PPO2.load("./models/agentPPO.pkl", env=self.env)
            else:
                self.model = PPO2.load(agent_to_load_directory, env=self.env)
def build_model(self):
    if self.is_stack:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma, lam=self.c1,
                              entcoeff=self.c2, clip_param=self.clip_epslion, adam_epsilon=self.lr)
        if self.game_type == "atari":
            self.model = PPO2(CnnPolicy, self.env, verbose=1, gamma=self.gamma, vf_coef=self.c1,
                              ent_coef=self.c2, cliprange=self.clip_epslion, learning_rate=self.lr)
    else:
        if self.game_type == "box":
            self.env = DummyVecEnv([lambda: self.env])
            self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma, lam=self.c1,
                              entcoeff=self.c2, clip_param=self.clip_epslion, adam_epsilon=self.lr)
        if self.game_type == "atari":
            self.model = PPO2(CnnLstmPolicy, self.env, verbose=1, gamma=self.gamma, vf_coef=self.c1,
                              ent_coef=self.c2, cliprange=self.clip_epslion, learning_rate=self.lr)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm")
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha", type=float)  # parse the learning rate as a float so DQN receives a number
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")

    del model  # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)

    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"
    if grid_search:
        # append one line per run so successive grid-search results stay separated
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str + "\n")
    else:
        print(str(mean_reward) + hparams_str)
def init(self, **kwargs):
    self.action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
    super().init(**kwargs)

    # the major actions
    self.major = PPO1.load(kwargs['inner_models']['major_model'])
    with open(kwargs['inner_models']['major_model_configs'], 'rb') as f:
        self.major_configs = pickle.load(f)
    self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]
def mainHybrid(arg):
    test = arg == TEST
    env = fep.FurutaEnvPosPpo(cm.RUN, render=not test)
    #env.setRender(True)
    modelBal = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    modelUp = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")
    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" %
                  (sum(buf_rew) / float(len(buf_rew)), test_cutoff_count - overspeed, complete_count))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))
def train(params):
    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model, expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(expert_path='{0}.npz'.format(expert_name),
                            traj_limitation=-1,
                            randomize=True,  # if the dataset should be shuffled
                            verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1, tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)
    env.close()
    del env
def __init__(self, model, action_idx):
    """
    A simple class wrapper for lower level models. Has some useful functions
    for learning with.

    Args:
        model (string): path to the model
        action_idx (list): the list of action indexes controlled by the model
    """
    self.model = PPO1.load(model)
    self.action_indexes = action_idx
    self.action_list = list(product([1, 2, 3, 4], repeat=len(self.action_indexes)))
def ppo1(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO1(MlpPolicy, env, verbose=0)

    # Train the agent
    print("Beginning training episodes with PPO1.")
    model.learn(total_timesteps=timesteps)

    env.close()
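# A hedged example of invoking ppo1() above. The environment id, log directory, and
# timestep budget are illustrative assumptions; the function itself handles wrapping
# and training.
def _example_ppo1_usage():
    ppo1(env_id="CartPole-v1", log_dir="./logs/ppo1_cartpole/", timesteps=100000)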
def load_env(model_name='flexible_load_first', seed=9):
    # flexible_load_first, overnight, larger_margin_cost, discount_06, flex50
    model_path = os.path.join(MODEL_PATH, model_name)
    params_name = model_name + '_params.p'
    param_path = os.path.join(MODEL_PATH, params_name)
    try:
        model = DDPG.load(model_path)
    except Exception:
        # fall back to PPO1 if the saved model is not a DDPG checkpoint
        model = PPO1.load(model_path)

    env = ActiveEnv(seed=seed)
    with open(param_path, 'rb') as f:
        params = pickle.load(f)

    env.set_parameters(params)
    model.set_env(env)
    return model, env
def test_action_mask_run_ppo1(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = PPO1(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
def ppo1_train():
    # best params fxcm_11_H4_full_2015_2018_train_6300
    v_policy = MlpPolicy  # policies = [MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy]
    v_gamma = 0.99  # default 0.99
    v_learning_rate = 0.0003  # default 0.0003
    v_ent_coef = 'auto'  # default 'auto'

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'], settings['strategy_name'],
                         settings['total_steps'], settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'], settings['commission_percent'],
                         settings['commission_fixed'], settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'], settings['compute_reward'],
                         settings['compute_position'], settings['debug'])

    # Create the vectorized environment
    # v_env = DummyVecEnv([lambda: v_env])

    # Normalize environment
    # v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'],
    #                      clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'],
    #                      gamma=p_gamma, epsilon=EPS)

    # n_actions = v_env.action_space.shape[-1]
    # v_action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    v_action_noise = None

    # for v_policy, v_gamma, v_lam in it.product(p_policy, p_gamma, p_lam):
    #     print(str(v_policy) + '_' + str(v_gamma) + '_' + str(v_lam))

    model_name = settings['model_name'] + '_' + str(settings['total_timestamp']) + '_' + \
        str(settings['window_length']) + '_' + str(settings['compute_indicators']) + '_' + \
        str(v_gamma) + '_' + (uuid.uuid4().hex)[:16]

    model = PPO1(env=v_env, policy=v_policy, gamma=v_gamma, verbose=0, tensorboard_log='log_' + model_name)
    model.learn(total_timesteps=(settings['total_timestamp']))
    model.save(MODELS_DIR + model_name)
    # v_env.save_running_average(MODELS_DIR)
    del model
def train(env_dict, save_folder, log_dir):
    """
    Run training on a Toribash Environment. Saves a model and the environment
    configurations used. Because the actions may need to be remembered, this
    method builds the action space here and saves it to the environment dictionary.

    Args:
        env_dict (dictionary): The dictionary from the yaml file.
        save_folder (filepath): path to save models
        log_dir (filepath): path to save logs. If file is run, then found inside of save_folder
    """
    # setting up reward and action space
    if env_dict['agent'] == 'single':
        env_dict = load_single_model(env_dict)
    elif env_dict['agent'] == 'multi':
        env_dict = load_multi_model(env_dict)
    elif env_dict['agent'] == 'limb':
        env_dict['env_name'] = 'Toribash-{}-v0'.format(env_dict['limb'])
    elif env_dict['agent'] == 'hierarchy':
        env_dict = load_hierarchy_model(env_dict)
    else:
        raise ValueError("Incorrect agent type given. Make sure agent: [single, multi, limb, hierarchy]" +
                         "\n And, make sure other necessary components are loaded correctly.")

    with open(os.path.join(save_folder, 'configs_dict.pkl'), 'wb') as f:
        pickle.dump(env_dict, f)

    # setting up the model and environment
    env = make_env(env_dict, env_dict['env_name'])
    model = PPO1(MlpPolicy, env, verbose=1,
                 tensorboard_log="./tensorboard/{}/".format(env_dict['savename']),
                 optim_stepsize=0.01)

    try:
        model.learn(total_timesteps=env_dict['timesteps'], callback=callback)
    except KeyboardInterrupt as identifier:
        print("Incomplete Model Save")
        model.save(os.path.join(save_folder, 'incomplete'))
    finally:
        model.save(os.path.join(save_folder, 'final_model.pkl'))
def eval_on_dir(args, env, modelFile, evalcount, evaldir, evaltype, evaltime):
    model = PPO1.load(modelFile)
    successes = []
    failures = []
    timeouts = []
    lengths = []
    for filename in os.listdir(evaldir):  ### PARALLELIZE THIS!!!
        if filename.endswith(".p"):
            # join against evaldir (the original referenced the builtin `dir` here)
            evalfile = os.path.join(evaldir, filename)
            success_ratio, failure_ratio, timeout_ratio, avglen = eval_on_file(
                args, model, env, evalcount, evalfile, evaltype, evaltime)
            successes.append(success_ratio)
            failures.append(failure_ratio)
            timeouts.append(timeout_ratio)
            lengths.append(avglen)
    return successes, failures, timeouts, lengths
def eval(args, evaldir, modelfile, model_index):
    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    proofs_found = 0
    proofs_tried = 0
    len_sum = 0.0
    attempts_sum = 0.0
    prove.guidance_time = 0

    dirparts = evaldir.split("/")
    if dirparts[-1] == "":
        dirname = dirparts[-2]
    else:
        dirname = dirparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, dirname, args.evaltype, args.evalcount)

    for filename in os.listdir(evaldir):
        if filename.endswith(".p"):
            name = os.path.join(evaldir, filename)
            print("\n\nTrying to find proof for {}".format(name))
            proofs_tried += 1
            success, prooflen, attempts = find_one_proof_nobacktrack(args, model, env, name)
            if success == 1:
                proofs_found += 1
                len_sum += prooflen
                attempts_sum += attempts
            print("Found: {}/{} proofs".format(proofs_found, proofs_tried))

    print("\n\nEVALUATION")
    print(" evaltime: {}".format(args.evaltime))
    print(" evaldir: {}".format(dirname))
    print(" model_index: {}".format(model_index))
    print(" evaltype: {}".format(args.evaltype))
    print(" evalcount: {}".format(args.evalcount))
    print(" FOUND: {}/{}".format(proofs_found, proofs_tried))
    print(" Avg proof length: {}".format(safediv(len_sum, proofs_found)))
    print(" Avg attempts: {}".format(safediv(attempts_sum, proofs_found)))
def ppo1_nmileg_pool(sensory_value):
    RL_method = "PPO1"
    # total_MC_runs = 50
    experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
    save_name_extension = RL_method
    total_timesteps = 500000
    sensory_info = "sensory_{}".format(sensory_value)
    current_mc_run_num = 22  # starts from 0

    for mc_cntr in range(current_mc_run_num, current_mc_run_num + 1):
        log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)

        # defining the environments
        env = gym.make('HandManipulate-v1{}'.format(sensory_value))
        #env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)

        ## setting the Monitor
        env = gym.wrappers.Monitor(env, log_dir + "Monitor/", video_callable=False, force=True, uid="Monitor_info")

        # defining the initial model
        if RL_method == "PPO1":
            model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "PPO2":
            env = DummyVecEnv([lambda: env])
            model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        elif RL_method == "DDPG":
            env = DummyVecEnv([lambda: env])
            n_actions = env.action_space.shape[-1]
            param_noise = None
            action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                        sigma=float(0.5) * 5 * np.ones(n_actions))
            model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                         action_noise=action_noise, tensorboard_log=log_dir)
        else:
            raise ValueError("Invalid RL mode")

        # setting the environment on the model
        #model.set_env(env)

        # setting the random seed for some of the random instances
        random_seed = mc_cntr
        random.seed(random_seed)
        env.seed(random_seed)
        env.action_space.seed(random_seed)
        np.random.seed(random_seed)
        tf.random.set_random_seed(random_seed)

        # training the model
        model.learn(total_timesteps=total_timesteps)
        # saving the trained model
        model.save(log_dir + "/model")
    return None
def mainBal(arg):
    test = arg == TEST
    env = fep.FurutaEnvPosPpoBal(cm.RUN, render=not test)
    #env.setRender(not test)
    #model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.pkl")
    buf_rew = []
    test_cutoff_count = 0
    complete_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_BAL:
            print("\n***Average reward: %.3f\tLong runs: %d\tAverage count: %.3f\tCompleted: %d\tOverspeed: %d***\n" %
                  (sum(buf_rew) / float(len(buf_rew)), test_cutoff_count,
                   total_count / float(test_count), complete_count, overspeed))
            break
        obs, done = env.reset(), False
        #obs[4] = ARM_TARGET_RAD
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            #obs[4] = ARM_TARGET_RAD
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MIN:
            test_cutoff_count += 1
        print("Episode reward: %.3f\tCount: %d" % (episode_rew, count))
def create_ppo1(self):
    return PPO1(MlpPolicy, self.env,
                gamma=0.99,
                timesteps_per_actorbatch=1500,
                clip_param=0.2,
                entcoeff=0.01,
                optim_epochs=4,
                optim_stepsize=0.001,
                optim_batchsize=256,
                lam=0.95,
                adam_epsilon=1e-05,
                schedule='linear',
                verbose=0,
                tensorboard_log=None,
                _init_setup_model=True,
                policy_kwargs=None,
                full_tensorboard_log=False,
                seed=None,
                n_cpu_tf_sess=1)
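# A minimal sketch of how create_ppo1 above might be used. It assumes the enclosing
# object exposes self.env; the trainer instance and the 100000-step budget are
# illustrative assumptions, not part of the original code.
def _example_create_ppo1_usage(trainer):
    model = trainer.create_ppo1()
    model.learn(total_timesteps=100000)
    return model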
def __init__(self, env=None, load_dir=None, load_type=None, **kwargs):
    self.algorithm = load_type
    if self.algorithm == 'ars':
        params = np.load(load_dir + 'params1.npy')
        policy_params = {
            'ob_dim': 23,
            'ac_dim': 12,
            'ob_filter': 'NoFilter',
            'hsize': 2,
            'numlayers': 32
        }
        if kwargs["mode"] == 'mlp':
            self.agent = MLPPolicy(policy_params)
            self.agent.load(params)
        elif kwargs["mode"] == 'linearbias':
            self.agent = LinearBiasPolicy(policy_params)
            self.agent.load(params)
        else:
            raise NotImplementedError
    elif self.algorithm == 'openloop':
        if kwargs["mode"] == '2finger':
            config_data = util.read_config_file('gym_roam_hand_2fin_grasping_baseline.cfg', '')
        elif kwargs["mode"] == '3finger':
            config_data = util.read_config_file('roam_grasping_3fin.cfg', '')
        else:
            raise NotImplementedError
        self.agent = OpenLoopPolicy(config_data, env)
    elif self.algorithm == 'ppo1':
        self.agent = PPO1.load("ppo1_roam")
    elif self.algorithm == 'ppo2':
        self.agent = PPO2.load("{}/trained_model".format(load_dir))
    else:
        raise NotImplementedError
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value_str)

    # defining the environments
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))
    #env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)

    # defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * 5 * np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise,
                     action_noise=action_noise, tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL mode")

    # setting the environment on the model
    #model.set_env(env)

    # training the model
    model.learn(total_timesteps=total_timesteps)
    # saving the trained model
    model.save(log_dir + "/model")
    return None
def mainUp(arg):
    test = arg == TEST
    env = fep.FurutaEnvPosPpoUp(cm.RUN, render=not test)
    #env.setRender(True)
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")
    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print("\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d***" %
                  (sum(buf_rew) / float(len(buf_rew)), total_count / float(test_count),
                   test_cutoff_count - overspeed))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" % (episode_rew / count, count))
def advlearn(env, model_name=None, dir_dict=None):
    _, _ = setup_logger(SAVE_DIR, EXP_NAME)
    if model_name == 'ppo1_oppomodel':
        ## inline hyperparameters
        ## param timesteps_per_actorbatch: timesteps per actor per update
        ## other inline hyperparameters are the default choices in file 'PPO1_model_value'
        model = PPO1_model_value(MlpPolicy_hua, env,
                                 timesteps_per_actorbatch=1000,
                                 verbose=1,
                                 tensorboard_log=dir_dict['tb'],
                                 hyper_weights=dir_dict['_hyper_weights'],
                                 benigned_model_file=None,
                                 full_tensorboard_log=False,
                                 black_box_att=dir_dict['_black_box'],
                                 attention_weights=dir_dict['_attention'],
                                 model_saved_loc=dir_dict['model'],
                                 clipped_attention=dir_dict['_clipped_attention'],
                                 exp_method=dir_dict['_x_method'],
                                 mimic_model_path=dir_dict['_mimic_model_path'],
                                 save_victim_traj=dir_dict['_save_victim_traj'])
    else:
        model = PPO1(MlpPolicy, env,
                     timesteps_per_actorbatch=1000,
                     verbose=1,
                     tensorboard_log=dir_dict['tb'])
    try:
        model.learn(TRAINING_ITER, callback=callback, seed=SEED)
    except ValueError as e:
        traceback.print_exc()
        print("Learn exit!")

    model_file_name = "{0}agent.pkl".format(dir_dict['model'])
    model.save(model_file_name)