def run_model(self, K, model=None, configs=None):
    if model is None:
        model = self.model
    if configs is None:
        configs = self.configs
    with open(configs, 'rb') as f:
        env_dict = pickle.load(f)
    env = gym.make(env_dict['env_name'])
    env.settings.set("matchframes", self.turns * 10)
    env.init(**env_dict)
    with open(model, 'rb') as f:
        h = PPO1.load(f, env=env)
    obs = env.reset()
    generated_trajectories = []
    for episode in range(K):
        tau = []
        for turn in range(self.turns):
            action, _ = h.predict(obs)
            obs, _, done, _ = env.step(action)
            tau.append(obs)
            if done:
                # capture the fresh observation so the next predict()
                # does not act on a terminal state
                obs = env.reset()
        generated_trajectories.append(tau)
    env.close()
    return np.array(generated_trajectories).astype(float)
def eval_file(args, evalfile, modelfile, model_index):
    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)
    prove.guidance_time = 0

    fileparts = evalfile.split("/")
    filename = fileparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, filename,
                                           args.evaltype, args.evalcount)
    print("\n\nTrying to find proof for {}".format(evalfile))
    success, prooflen, attempts = find_one_proof_nobacktrack(
        args, model, env, evalfile)

    print("\n\nEVALUATION")
    print(" evaltime: {}".format(args.evaltime))
    print(" evalfile: {}".format(filename))
    print(" model_index: {}".format(model_index))
    print(" evaltype: {}".format(args.evaltype))
    print(" evalcount: {}".format(args.evalcount))
    print(" Success: {}".format(success))
    print(" Proof length: {}".format(prooflen))
    print(" Attempts: {}".format(attempts))
def train(): """ Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs. """ rank = MPI.COMM_WORLD.Get_rank() if rank == 0: logger.configure(folder=LOGDIR) else: logger.configure(format_strs=[]) workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank() set_global_seeds(workerseed) env = make_env(workerseed) env = bench.Monitor( env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) env.seed(workerseed) model = PPO1.load(BEST_MODEL_PATH, env=env) eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES) model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback) env.close() del env if rank == 0: model.save(os.path.join( LOGDIR, "final_model")) # probably never get to this point.
def train(env_name, num_time_steps, policy_kwargs, eval_ep, eval_freq,
          ckpt_freq, load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()

    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + current_time

    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_,
                                       n_eval_episodes=eval_ep,
                                       eval_freq=eval_freq,
                                       log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name + '/ckpt',
                                       name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #         Logging          #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt', 'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])

    ############################
    #           run            #
    ############################
    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
def init(self, **kwargs):
    self.action_idx = [10, 11, 18, 19, 20, 21]
    super().init(**kwargs)
    models = kwargs['inner_models']

    # set up major model and environment parameters
    self.major = PPO1.load(models['major_model'])
    with open(models['major_model_configs'], 'rb') as f:
        self.major_configs = pickle.load(f)
    self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]

    # set up minor model and environment parameters
    self.minor = PPO1.load(models['minor_model'])
    with open(models['minor_model_configs'], 'rb') as f:
        self.minor_configs = pickle.load(f)
    self.minor_action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
def load_model(self, agent_to_load_directory, is_test=False):
    if self.game_type != "atari":
        if agent_to_load_directory == "":
            self.model = PPO1.load("./models/agentPPO.pkl", env=self.env)
        else:
            self.model = PPO1.load(agent_to_load_directory, env=self.env)
    else:
        if is_test:
            if agent_to_load_directory == "":
                self.model = PPO2.load("./models/agentPPO.pkl")
            else:
                self.model = PPO2.load(agent_to_load_directory)
        else:
            if agent_to_load_directory == "":
                self.model = PPO2.load("./models/agentPPO.pkl", env=self.env)
            else:
                self.model = PPO2.load(agent_to_load_directory, env=self.env)
def mainHybrid(arg):
    test = arg == TEST
    env = fep.FurutaEnvPosPpo(cm.RUN, render=not test)
    #env.setRender(True)
    modelBal = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    modelUp = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" %
                  (sum(buf_rew) / float(len(buf_rew)),
                   test_cutoff_count - overspeed, complete_count))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))
def init(self, **kwargs):
    self.action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
    super().init(**kwargs)

    # the major actions
    self.major = PPO1.load(kwargs['inner_models']['major_model'])
    with open(kwargs['inner_models']['major_model_configs'], 'rb') as f:
        self.major_configs = pickle.load(f)
    self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]
def __init__(self, model, action_idx):
    """
    A simple class wrapper for lower-level models. Has some useful
    functions for learning with.

    Args:
        model (string): path to the model
        action_idx (list): the list of action indexes controlled by the model
    """
    self.model = PPO1.load(model)
    self.action_indexes = action_idx
    self.action_list = list(
        product([1, 2, 3, 4], repeat=len(self.action_indexes)))
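The `action_list` built above acts as a lookup table: `model.predict` returns a discrete index, and the tuple at that index supplies one command per controlled joint. A standalone sketch of that mapping (the index values below are illustrative, not from the original source):

from itertools import product

action_indexes = [0, 2, 6]  # hypothetical subset of controlled joints
action_list = list(product([1, 2, 3, 4], repeat=len(action_indexes)))

print(len(action_list))  # 4 ** 3 = 64 discrete actions
print(action_list[5])    # (1, 2, 2): one command per joint in action_indexes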
def load_env(model_name='flexible_load_first', seed=9):
    # flexible_load_first, overnight, larger_margin_cost, discount_06, flex50
    model_path = os.path.join(MODEL_PATH, model_name)
    params_name = model_name + '_params.p'
    param_path = os.path.join(MODEL_PATH, params_name)
    try:
        model = DDPG.load(model_path)
    except Exception:
        model = PPO1.load(model_path)

    env = ActiveEnv(seed=seed)
    with open(param_path, 'rb') as f:
        params = pickle.load(f)
    env.set_parameters(params)
    model.set_env(env)
    return model, env
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm")
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha", type=float)  # parse as float so DQN gets a numeric learning rate
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy
        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")

    del model  # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)

    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"
    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str + "\n")
    else:
        print(str(mean_reward) + hparams_str)
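This script relies on an `evaluate` helper that is not shown. A minimal sketch consistent with the `evaluate(model, env, num_steps)` call above, modeled on the stable-baselines documentation example (the exact body is an assumption):

import numpy as np

def evaluate(model, env, num_steps=10000):
    """Run the model for num_steps and return the mean per-episode reward."""
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    return np.mean(episode_rewards)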
def eval_on_dir(args, env, modelFile, evalcount, evaldir, evaltype, evaltime):
    model = PPO1.load(modelFile)
    successes = []
    failures = []
    timeouts = []
    lengths = []
    for filename in os.listdir(evaldir):  ### PARALLELIZE THIS!!!
        if filename.endswith(".p"):
            evalfile = os.path.join(evaldir, filename)
            success_ratio, failure_ratio, timeout_ratio, avglen = eval_on_file(
                args, model, env, evalcount, evalfile, evaltype, evaltime)
            successes.append(success_ratio)
            failures.append(failure_ratio)
            timeouts.append(timeout_ratio)
            lengths.append(avglen)
    return successes, failures, timeouts, lengths
def eval(args, evaldir, modelfile, model_index):
    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    proofs_found = 0
    proofs_tried = 0
    len_sum = 0.0
    attempts_sum = 0.0
    prove.guidance_time = 0

    dirparts = evaldir.split("/")
    if dirparts[-1] == "":
        dirname = dirparts[-2]
    else:
        dirname = dirparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, dirname,
                                           args.evaltype, args.evalcount)

    for filename in os.listdir(evaldir):
        if filename.endswith(".p"):
            name = os.path.join(evaldir, filename)
            print("\n\nTrying to find proof for {}".format(name))
            proofs_tried += 1
            success, prooflen, attempts = find_one_proof_nobacktrack(
                args, model, env, name)
            if success == 1:
                proofs_found += 1
                len_sum += prooflen
                attempts_sum += attempts
            print("Found: {}/{} proofs".format(proofs_found, proofs_tried))

    print("\n\nEVALUATION")
    print(" evaltime: {}".format(args.evaltime))
    print(" evaldir: {}".format(dirname))
    print(" model_index: {}".format(model_index))
    print(" evaltype: {}".format(args.evaltype))
    print(" evalcount: {}".format(args.evalcount))
    print(" FOUND: {}/{}".format(proofs_found, proofs_tried))
    print(" Avg proof length: {}".format(safediv(len_sum, proofs_found)))
    print(" Avg attempts: {}".format(safediv(attempts_sum, proofs_found)))
def mainBal(arg):
    test = arg == TEST
    env = fep.FurutaEnvPosPpoBal(cm.RUN, render=not test)
    #env.setRender(not test)
    #model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.pkl")

    buf_rew = []
    test_cutoff_count = 0
    complete_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_BAL:
            print(
                "\n***Average reward: %.3f\tLong runs: %d\tAverage count: %.3f\tCompleted: %d\tOverspeed: %d***\n"
                % (sum(buf_rew) / float(len(buf_rew)), test_cutoff_count,
                   total_count / float(test_count), complete_count, overspeed))
            break
        obs, done = env.reset(), False
        #obs[4] = ARM_TARGET_RAD
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            #obs[4] = ARM_TARGET_RAD
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MIN:
            test_cutoff_count += 1
        print("Episode reward: %.3f\tCount: %d" % (episode_rew, count))
def __init__(self, env=None, load_dir=None, load_type=None, **kwargs):
    self.algorithm = load_type
    if self.algorithm == 'ars':
        params = np.load(load_dir + 'params1.npy')
        policy_params = {
            'ob_dim': 23,
            'ac_dim': 12,
            'ob_filter': 'NoFilter',
            'hsize': 2,
            'numlayers': 32
        }
        if kwargs["mode"] == 'mlp':
            self.agent = MLPPolicy(policy_params)
            self.agent.load(params)
        elif kwargs["mode"] == 'linearbias':
            self.agent = LinearBiasPolicy(policy_params)
            self.agent.load(params)
        else:
            raise NotImplementedError
    elif self.algorithm == 'openloop':
        if kwargs["mode"] == '2finger':
            config_data = util.read_config_file(
                'gym_roam_hand_2fin_grasping_baseline.cfg', '')
        elif kwargs["mode"] == '3finger':
            config_data = util.read_config_file('roam_grasping_3fin.cfg', '')
        else:
            raise NotImplementedError
        self.agent = OpenLoopPolicy(config_data, env)
    elif self.algorithm == 'ppo1':
        self.agent = PPO1.load("ppo1_roam")
    elif self.algorithm == 'ppo2':
        self.agent = PPO2.load("{}/trained_model".format(load_dir))
    else:
        raise NotImplementedError
def mainUp(arg):
    test = arg == TEST
    env = fep.FurutaEnvPosPpoUp(cm.RUN, render=not test)
    #env.setRender(True)
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print(
                "\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d***"
                % (sum(buf_rew) / float(len(buf_rew)),
                   total_count / float(test_count),
                   test_cutoff_count - overspeed))
            break
        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" % (episode_rew / count, count))
import gym
import pybulletgym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO1
import time

if __name__ == '__main__':
    env = gym.make('Walker2DPyBulletEnv-v0')
    model = PPO1.load("sac_sliding")
    env.render()
    ob = env.reset()
    reward = 0
    while True:
        action, _states = model.predict(ob)
        ob, r, done, info = env.step(action)
        reward += r
        time.sleep(0.01)
        if done:
            ob = env.reset()
            print('r is {}'.format(r))
            print('Episode reward is {}'.format(reward))
            reward = 0
import gym
import numpy as np
from matplotlib import pyplot as plt
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy, CnnPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import TRPO, ACKTR, A2C, SAC, PPO1

# multiprocess environment
env = gym.make('gym_squeeze:squeeze-v0')

model = PPO1.load("ppo1_squeeze")
#model.save("quello_quasi_buono_squeeze")

obs = env.reset()
x = []
rc = []
sc = []
rew = []
done = []
dones = False
i = 0
rc2 = []
sc2 = []
rew2 = []
done2 = []
dones = False
i = 0
obs2 = obs
while dones == False:
print("original action space: ", env.action_space) print("original observation space: ", env.observation_space) env_player1 = RoboSumoWrapper(env, player_id=1) policy1 = PPO1(MlpPolicy, env_player1, verbose=1) env_player0 = RoboSumoWrapper(env) policy0 = PPO1(MlpPolicy, env_player0, verbose=1) env_player0.opponent_policy = policy1 print("action space of policy0 is: ", policy0.action_space) print("observation space of policy0 is: ", policy0.observation_space) policy0.learn(total_timesteps=5) policy0.save("policy0") del policy0 # remove to demonstrate saving and loading model = PPO1.load("policy0") obs = env_player0.reset() while True: print("shape of obs is: ", obs.shape) action, _states = model.predict(obs) print("action is: ", action) obs, rewards, dones, info = env_player0.step(action) # obs = env.state env.render(mode="human")
def main():
    parser = custom_arg_parser()
    args = parser.parse_args()
    load_defaults(args)
    print("Arguments:{}".format(args))

    # Create the model name with all the parameters
    model_dir_name = serialize_args(args)
    print("Model name: {}".format(model_dir_name))

    if args.model is not None:
        model_save_path = os.path.dirname(args.model) + "/"
        tb_save_path = model_save_path.replace("learned_models", "tb_logs")
    else:
        model_save_path = "../../learned_models/" + model_dir_name + "/"
        tb_save_path = "../../tb_logs/" + model_dir_name + "/"
    print("Model save path:{}".format(model_save_path))
    print("TB logs save path:{}".format(tb_save_path))

    final_model_path = model_save_path + "final_" + model_dir_name
    model_load_path = args.model
    show_render = args.visualize

    # Save args to json for training from checkpoints
    if not os.path.exists(model_save_path):
        os.makedirs(model_save_path)
    with open(model_save_path + "args.json", 'w+') as f:
        json.dump(vars(args), f, indent=2, sort_keys=True)

    env = GymWrapper(
        suite.make(
            "JR2Door",
            has_renderer=show_render,
            use_camera_obs=False,
            ignore_done=False,
            control_freq=args.control_freq,
            horizon=args.horizon,
            door_type=args.door_type,
            bot_motion=args.bot_motion,
            robot_pos=args.robot_pos,
            robot_theta=args.robot_theta,
            dist_to_handle_coef=args.rcoef_dist_to_handle,
            door_angle_coef=args.rcoef_door_angle,
            handle_con_coef=args.rcoef_handle_con,
            body_door_con_coef=args.rcoef_body_door_con,
            self_con_coef=args.rcoef_self_con,
            arm_handle_con_coef=args.rcoef_arm_handle_con,
            arm_door_con_coef=args.rcoef_arm_door_con,
            force_coef=args.rcoef_force,
            gripper_touch_coef=args.rcoef_gripper_touch,
            dist_to_door_coef=args.rcoef_dist_to_door,
            wall_con_coef=args.rcoef_wall_con,
            reset_on_large_force=args.reset_on_large_force,
            debug_print=args.print_info,
            eef_type=args.eef_type,
            door_init_qpos=args.door_init_qpos,
            goal_offset=args.goal_offset,
        )
    )

    if args.slurm:
        env = SubprocVecEnv([lambda: env for i in range(args.n_cpu)])
    else:
        env = DummyVecEnv([lambda: env])

    # Load the specified model, if there is one
    if args.model is not None:
        # Training from checkpoint, so don't reset timesteps for tb
        reset_num_timesteps = False
        if args.rl_alg == "ppo2":
            model = PPO2.load(model_load_path, env=env)
            print("Successfully loaded PPO2 model")
        elif args.rl_alg == "ppo1":
            model = PPO1.load(model_load_path, env=env)
            print("Successfully loaded PPO1 model")
    else:
        # New model, so need to reset timesteps for tb
        reset_num_timesteps = True
        if args.rl_alg == "ppo2":
            model = PPO2(
                args.policy,
                env,
                verbose=args.verbose,
                n_steps=args.n_steps,
                nminibatches=args.minibatches,
                noptepochs=args.opt_epochs,
                cliprange=args.clip_range,
                ent_coef=args.ent_coef,
                tensorboard_log=tb_save_path,
                #full_tensorboard_log=True
            )
        elif args.rl_alg == "ppo1":
            model = PPO1(
                args.policy,
                env,
                verbose=args.verbose,
                timesteps_per_actorbatch=args.n_steps,
                optim_epochs=args.opt_epochs,
                tensorboard_log=tb_save_path,
            )

    if args.replay:
        # Replay a policy
        obs = env.reset()
        count = 0
        with open('episode-reward.csv', mode='w') as fid:
            writer = csv.writer(fid, delimiter=',')
            writer.writerow(["reward"])
        while count < 1000:
            env.render()
            count += 1
            print(count)
        while True:
            if args.model is None:
                print("Error: No model has been specified")
            action, _states = model.predict(obs, deterministic=True)
            #print("action {}".format(action))
            obs, reward, done, info = env.step(action)
            env.render()
            #print(obs)
            #print(env.sim.data.qpos[env._ref_joint_vel_indexes])
            #time.sleep(0.1)
            with open('episode-reward.csv', mode='a') as fid:
                writer = csv.writer(fid, delimiter=',')
                writer.writerow([reward])
            #if done:
            #    quit()
    else:
        # Train
        model.learn(
            total_timesteps=args.total_timesteps,
            save_dir=model_save_path,
            render=show_render,
            reset_num_timesteps=reset_num_timesteps,
        )
        model.save(final_model_path)
        print("Done training")
        obs = env.reset()
save_name = "model/gail_exp1" epochs = 10 timestep_per_epoch = int(1e5) expert_n_episodes = 100 ############################################ if __name__ == "__main__": if not os.path.exists(save_name): os.makedirs(save_name) # Generate expert trajectories (train expert) print("\n...Generate expert trajectories\n") env = PrticleEnv(alpha=1, beta=10, win_thre=1, max_timestep=256) model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip") model.set_env(env) generate_expert_traj(model, 'expert_part_circle_exp2_epoch05_sib', n_episodes=expert_n_episodes) print("...finish\n") # Load the expert dataset print("\n...Load the expert dataset\n") dataset = ExpertDataset( expert_path='expert_part_circle_exp2_epoch05_sib.npz', traj_limitation=-1, verbose=1) print("...finish\n")
parser = argparse.ArgumentParser()
parser.add_argument("--algorithm")
args = parser.parse_args()
algorithm = args.algorithm

env = gym.make('CartPole-v0')

if algorithm == "ppo1":
    from stable_baselines import PPO1
    from stable_baselines.common.policies import MlpPolicy
    model = PPO1(MlpPolicy, env, verbose=1)
else:
    from stable_baselines import DQN
    from stable_baselines.deepq.policies import MlpPolicy
    model = DQN(MlpPolicy, env, verbose=1)

model.learn(total_timesteps=int(2e4), log_interval=10)
model.save(f"{algorithm}_cartpole")

del model  # remove to demonstrate saving and loading

if algorithm == "ppo1":
    model = PPO1.load(f"{algorithm}_cartpole")
else:
    model = DQN.load(f"{algorithm}_cartpole")

mean_reward = evaluate(model, num_steps=10000)
parser = argparse.ArgumentParser(
    description='Evaluate pre-trained PPO agent.')
parser.add_argument('--model-path', help='path to stable-baselines model.',
                    type=str, default="zoo/ppo/best_model.zip")
parser.add_argument('--render', action='store_true',
                    help='render to screen?', default=False)

args = parser.parse_args()
render_mode = args.render

env = gym.make("SlimeVolley-v0")

# the yellow agent:
print("Loading", args.model_path)
policy = PPO1.load(args.model_path, env=env)  # 96-core PPO1 policy

history = []
for i in range(1000):
    env.seed(seed=i)
    cumulative_score = rollout(env, policy, render_mode)
    print("cumulative score #", i, ":", cumulative_score)
    history.append(cumulative_score)
print("history dump:", history)
# this is what I got: [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 1, 1, 4, 0, 0, 0, 2, 2, 0, 1, 2, 2, 2, 1, 2, 1, 0, 2, 0, 1, 1, 1, 0, 0, 1, 0, 1, 2, 0, 0, 1, 1, 4, 1, 0, 2, 2, 3, 2, 4, 4, 1, 1, 2, 0, 0, 0, 4, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 3, 2, 0, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 1, 1, 1, 2, 5, 3, 3, 0, 0, 1, 0, 0, 2, 2, 1, 2, 1, 1, 0, 1, 0, 1, 1, 2, 2, 1, 3, 4, 0, 0, 0, 3, 0, 1, 5, 2, 4, 0, 1, 1, 1, 3, 0, 1, 2, 1, 1, 2, 1, 1, 2, 0, 1, 1, 0, 1, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 2, 2, 1, 0, 0, 0, 3, 0, 1, 3, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 2, 0, 2, 1, 0, 1, 2, 2, 0, 2, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 2, 1, 0, 1, 0, 1, 0, 1, 3, 2, 2, 1, 2, 0, 2, 2, 0, 1, 0, 1, 0, 0, 2, 1, 2, 1, 0, 2, 1, 0, 1, 0, 2, 1, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 2, 1, 0, 2, 3, 3, 4, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 1, 0, 2, 1, 0, 3, 0, 0, 1, 1, 1, 2, 2, 0, 0, 2, 0, 0, 1, 2, 4, 0, 2, 0, 1, 1, 1, 0, 1, 2, 1, 0, 0, 4, 1, 0, 0, 0, 0, 2, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 0, 0, 1, 4, 2, 3, 0, 3, 1, 0, 0, 1, 2, 2, 1, 0, 0, 1, 2, 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 2, 0, 1, 1, 2, 1, 0, 1, 0, 1, 1, 2, 0, 2, 0, 0, 1, 1, 0, 0, 2, 0, 2, 0, 1, 2, 2, 3, 1, 1, 0, 0, 1, 1, 4, 2, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 2, 3, 0, 0, 2, 2, 0, 3, 1, 0, 2, 0, 1, 0, 0, 2, 1, 2, 3, 1, 0, 1, 0, 1, 2, 1, 0, 2, 0, 0, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 2, 0, 1, 0, 1, 0, 5, 2, 2, 0, 1, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 1, 0, 1, 1, 2, 0, 0, 2, 0, 0, 3, 2, 2, -1, 3, 1, 1, 2, 0, 0, 2, 1, 1, 0, 1, 1, 3, 0, 2, 1, 1, 0, 3, 2, 1, 0, 2, 1, 2, 0, 1, 0, 2, 0, 2, 0, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 2, 0, 3, 0, 2, 0, 1, 2, 1, 0, 0, 1, 2, 1, 0, 0, 4, 3, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 2, 0, 0, 0, 2, 2, 2, 0, 0, 4, 3, 0, 0, 1, 0, 1, 1, 3, 3, 1, 0, 1, 1, 0, 0, 3, 3, 0, 2, 3, 1, 2, 1, 3, 2, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 2, 1, 3, 1, 2, 0, -1, 0, 1, 0, 1, 4, 4, 0, 0, 0, 1, 0, 1, 0, 1, 3, 1, 0, 1, 1, 1, 0, 1, 1, 0, 2, 0, 2, 0, 0, 2, 1, 1, 1, 0, 1, 3, 1, 0, 0, 0, 1, 1, 0, 1, 2, 0, 2, 2, 0, 1, 0, 2, 3, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 0, 2, 0, 1, 3, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 2, 0, 2, 1, 1, 3, 1, 2, 2, 0, 1, 0, 2, 0, 1, 2, 0, 1, 2, 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 1, 2, 0, 0, 0, 2, 1, 1, 3, 1, 2, 2, 2, 2, 0, 1, 1, 1, 2, 0, 1, 4, 0, 0, 0, 1, 4, 0, 1, 4, 1, 2, 1, 1, 3, 3, 3, 4, 1, 0, 1, 0, 0, 3, 1, 4, 1, 3, 1, 1, 1, 0, 2, 4, 1, 0, 3, 2, 1, 0, 0, 3, 1, 2, 0, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 1, 1, 0, 2, 3, 0, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 0, 0, 1, 2, 1, 3, 2, 0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 1, 2, 0, 1, 1, 1, 1, 2, 1, 0, 1, 0, 2, 3, 3, 0, -1, 2, 0, 1, 1, 3, 0, 1, 0, 0, 3, 0, 2, 0, 0, 1, 0, 2, 2, -1, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 3, 1, 0, 2, 2, 1, 1, 1, 1, 1, 3, 1, 1, 2, 0, 2, 2, 1, 0, 0, 2, 0, 1, 2, 3, 2, 3, 0, 3, 2, 3, 2]
print("average score", np.mean(history), "standard_deviation", np.std(history))
RL_method = "PPO1" experiment_ID = "experiment_4" stiffness_value = "stiffness_test16" save_name_extension = RL_method log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value) # defining the environments env = gym.make('NmiLeg-v1') #env = DummyVecEnv([lambda: env]) # loading the trained model if RL_method == "PPO1": model = PPO1.load(log_dir+"/model.pkl") elif RL_method == "PPO2": model = PPO2.load(log_dir+"/model.pkl") env = DummyVecEnv([lambda: env]) elif RL_method == "DDPG": model = DDPG.load(log_dir+"/model.pkl") env = DummyVecEnv([lambda: env]) else: raise ValueError("Invalid RL mode") # setting the environment model.set_env(env) env_run = gym.make('NmiLeg-v1') #env_run = Monitor(env_run,'./video/'+log_dir,force=True) #model = DDPG.load("PPO2-HalfCheetah_nssu-v3_test2")
def __init__(self, path):
    print(path)
    self.model = PPO1.load(path)
else:
    raise ValueError('No such environment!')

env.render()

if args.alg == 'td3' or args.alg == 'TD3':
    from stable_baselines import TD3
    model = TD3.load(args.file)
elif args.alg == 'ddpg' or args.alg == 'DDPG':
    from stable_baselines import DDPG
    model = DDPG.load(args.file)
elif args.alg == 'SAC' or args.alg == 'sac':
    from stable_baselines import SAC
    model = SAC.load(args.file)
elif args.alg == 'ppo1' or args.alg == 'PPO1':
    from stable_baselines import PPO1
    model = PPO1.load(args.file)
elif args.alg == 'ppo2' or args.alg == 'PPO2':
    from stable_baselines import PPO2
    model = PPO2.load(args.file)
else:
    raise ValueError('No such algorithm')

ob = env.reset()
reward = 0
while True:
    action, _states = model.predict(ob)
    ob, r, done, info = env.step(action)
    reward += r
    time.sleep(0.01)
    if done:
def eval_mpi(args, evaldir, modelfile, model_index):
    from mpi4py import MPI as mpi
    rank = mpi.COMM_WORLD.Get_rank()
    world_size = mpi.COMM_WORLD.Get_size()

    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    dirparts = evaldir.split("/")
    if dirparts[-1] == "":
        dirname = dirparts[-2]
    else:
        dirname = dirparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, dirname,
                                           args.evaltype, args.evalcount)

    proofs_found = 0
    proofs_tried = 0
    len_sum = 0.0
    attempts_sum = 0.0
    prove.guidance_time = 0

    filenames_original = sorted([
        filename for filename in os.listdir(evaldir)
        if filename.endswith(".p")
    ])

    def data_gen(filenames, i):
        return filenames[i % len(filenames)]

    chunks = int(len(filenames_original) / world_size) + 1
    filenames_extended = [
        data_gen(filenames_original, i) for i in range(chunks * world_size)
    ]  # [rank:][::world_size]
    assert len(filenames_extended) > 0

    for index in range(chunks):
        chunk = filenames_extended[index * world_size:(index + 1) * world_size]
        assert len(chunk) == world_size
        name = os.path.join(evaldir, chunk[rank])
        print("\n\nTrying to find proof for {}".format(name))
        success, prooflen, attempts = find_one_proof_nobacktrack(
            args, model, env, name)
        results = mpi.COMM_WORLD.gather((1, success, prooflen, attempts),
                                        root=0)
        if rank == 0:
            # print(results)
            for i in range(len(results)):
                proofs_tried += results[i][0]
                succ = results[i][1]
                if succ == 1:
                    proofs_found += 1
                    len_sum += results[i][2]
                    attempts_sum += results[i][3]
            logger.record_tabular("update_no", proofs_tried)
            logger.record_tabular("{}_proofs_found".format(evalprefix),
                                  proofs_found)
            logger.record_tabular("{}_found".format(evalprefix),
                                  safediv(proofs_found, proofs_tried))
            logger.record_tabular("{}_avg_prooflen".format(evalprefix),
                                  safediv(len_sum, proofs_found))
            logger.record_tabular("{}_avg_attempts".format(evalprefix),
                                  safediv(attempts_sum, proofs_found))
            logger.dumpkvs()
            print("Found: {}/{} proofs".format(proofs_found, proofs_tried))

    print("\n\nEVALUATION {}".format(rank))
    print(" evaltime: {}".format(args.evaltime))
    print(" evaldir: {}".format(dirname))
    print(" model_index: {}".format(model_index))
    print(" evaltype: {}".format(args.evaltype))
    print(" evalcount: {}".format(args.evalcount))
    print(" FOUND: {}/{}".format(proofs_found, proofs_tried))
    print(" Avg proof length: {}".format(safediv(len_sum, proofs_found)))
    print(" Avg attempts: {}".format(safediv(attempts_sum, proofs_found)))
def ppo1_test():
    v_env = PortfolioEnv(settings['data_file'],
                         settings['output_file'],
                         settings['strategy_name'],
                         settings['total_steps'],
                         settings['window_length'],
                         settings['capital_base'],
                         settings['lot_size'],
                         settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'],
                         settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'],
                         settings['debug'])

    # Create the vectorized environment
    # v_env = DummyVecEnv([lambda: v_env])

    # Normalize environment
    # v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'],
    #                      clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'],
    #                      gamma=p_gamma, epsilon=EPS)

    model = PPO1.load(MODELS_DIR + settings['model_name'])

    # Strategy
    obs = v_env.reset()
    dones = False
    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = v_env.step(action)
        # v_env.render(mode='ansi')
    v_env.strategy_name = 'fxcm_XXXUSD_H4 returns PPO1'
    v_env.render(mode='human')
    # pv, pp, pw = v_env.get_summary()
    # pr = pv.sum(axis=1).pct_change().fillna(0)
    pr = v_env.returns

    # Random agent
    obs = v_env.reset()
    dones = False
    while not dones:
        # action, _states = model.predict(obs, deterministic=True)
        action = v_env.action_sample
        obs, rewards, dones, info = v_env.step(action)
        # v_env.render(mode='ansi')
    v_env.strategy_name = 'Random agent'
    v_env.render(mode='human')

    # Buy and hold
    obs = v_env.reset()
    dones = False
    weights = np.concatenate(
        (np.ones(len(v_env.instruments)) / len(v_env.instruments), [0]))
    # print(weights)
    while not dones:
        obs, rewards, dones, info = v_env.step(action=weights)
        weights = v_env.current_weights
        v_env.render(mode='ansi')
    v_env.strategy_name = 'Buy and hold'
    v_env.render(mode='human')
    bpv, bpp, bpw = v_env.get_summary()
    bpr = bpv.sum(axis=1).pct_change().fillna(0)
    bpr = v_env.returns

    '''
    # Extended
    pv, pp, pw = v_env.get_summary()
    pv.sum(axis=1).plot()
    plt.title('strategy')
    plt.show()

    bpv, bpp, bpw = v_env.get_summary()
    bpv.sum(axis=1).plot()
    plt.title('buy and hold')
    plt.show()

    pr = pv.sum(axis=1).pct_change().fillna(0)
    bpr = bpv.sum(axis=1).pct_change().fillna(0)
    '''

    # pf.create_simple_tear_sheet(returns=pr, benchmark_rets=bpr)
    pf.create_full_tear_sheet(returns=pr, benchmark_rets=bpr)
RL_method = "PPO1" experiment_ID = "handtest_rot_pool_with_MC_C_task4/" mc_cntr = 10 sensory_value = 0 sesnory_value_str = "sensory_{}".format(sensory_value) save_name_extension = RL_method log_dir_read = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sesnory_value_str) log_dir_write = "./logs/{}/videos/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sesnory_value_str) # defining the environments env = gym.make('HandManipulate-v1{}'.format(sensory_value)) #env = DummyVecEnv([lambda: env]) # loading the trained model if RL_method == "PPO1": model = PPO1.load(log_dir_read +"/model") elif RL_method == "PPO2": model = PPO2.load(log_dir_read+"/model.pkl") env = DummyVecEnv([lambda: env]) elif RL_method == "DDPG": model = DDPG.load(log_dir_read+"/model") env = DummyVecEnv([lambda: env]) else: raise ValueError("Invalid RL mode") # setting the environment model.set_env(env) env_run = gym.make('HandManipulate-v1{}'.format(sensory_value)) #env_run = Monitor(env_run,log_dir_write,force=True) #model = DDPG.load("PPO2-HalfCheetah_nssu-v3_test2")
import gym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO1
from env.RSEnv import RSEnv
from env.TestRSEnv import TestRSEnv

env = RSEnv()
#model = PPO1(MlpPolicy, env, verbose=1)
model = PPO1.load("sbppov3")
model.set_env(env)
model.learn(total_timesteps=3000000, log_interval=10,
            reset_num_timesteps=False)
model.save("sbppov4")

env = TestRSEnv()
obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
env.close()