Example no. 1
    def run_model(self, K, model=None, configs=None):
        if model is None:
            model = self.model
        if configs is None:
            configs = self.configs

        with open(configs, 'rb') as f:
            env_dict = pickle.load(f)
        
        env = gym.make(env_dict['env_name'])
        env.settings.set("matchframes", self.turns*10)

        env.init(**env_dict)
        
        with open(model, 'rb') as f:
            h = PPO1.load(f, env=env)
        obs = env.reset()

        generated_trajectories = []
        for episode in range(K):
            tau = []
            for turn in range(self.turns):
                action, _ = h.predict(obs)
                obs, _, done, _ = env.step(action)
                tau.append(obs)
                if done:
                    obs = env.reset()
            generated_trajectories.append(tau)

        env.close()
        return np.array(generated_trajectories).astype(np.float64)
Example no. 2
def eval_file(args, evalfile, modelfile, model_index):

    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    prove.guidance_time = 0

    fileparts = evalfile.split("/")
    filename = fileparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, filename,
                                           args.evaltype, args.evalcount)

    print("\n\nTrying to find proof for {}".format(evalfile))
    success, prooflen, attempts = find_one_proof_nobacktrack(
        args, model, env, evalfile)

    print("\n\nEVALUATION")
    print("   evaltime: {}".format(args.evaltime))
    print("   evalfile: {}".format(filename))
    print("   model_index: {}".format(model_index))
    print("   evaltype: {}".format(args.evaltype))
    print("   evalcount: {}".format(args.evalcount))
    print("   Success: {}".format(success))
    print("   Proof length: {}".format(prooflen))
    print("   Attempts: {}".format(attempts))
Example no. 3
def train():
    """
  Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
  """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)

    else:
        logger.configure(format_strs=[])
    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(
            LOGDIR, "final_model"))  # probably never get to this point.
Example no. 4
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()
    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + '_' + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_,
                                       n_eval_episodes=eval_ep,
                                       eval_freq=eval_freq,
                                       log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #          Logging         #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt',
                  'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])
    ############################
    #            run           #
    ############################

    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
Example no. 5
    def init(self, **kwargs):
        self.action_idx = [10, 11, 18, 19, 20, 21]
        super().init(**kwargs)

        models = kwargs['inner_models']
        # set up major model and environment parameters
        self.major = PPO1.load(models['major_model'])
        with open(models['major_model_configs'], 'rb') as f:
            self.major_configs = pickle.load(f)
        self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]

        # set up minor model and environment parameters
        self.minor = PPO1.load(models['minor_model'])
        with open(models['minor_model_configs'], 'rb') as f:
            self.minor_configs = pickle.load(f)
        self.minor_action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
Example no. 6
    def load_model(self, agent_to_load_directory, is_test=False):
        if self.game_type != "atari":
            if agent_to_load_directory == "":
                self.model = PPO1.load("./models/agentPPO.pkl", env=self.env)
            else:
                self.model = PPO1.load(agent_to_load_directory, env=self.env)
        else:
            if is_test:
                if agent_to_load_directory == "":
                    self.model = PPO2.load("./models/agentPPO.pkl")
                else:
                    self.model = PPO2.load(agent_to_load_directory)
            else:
                if agent_to_load_directory == "":
                    self.model = PPO2.load("./models/agentPPO.pkl", env=self.env)
                else:
                    self.model = PPO2.load(agent_to_load_directory, env=self.env)
Example no. 7
def mainHybrid(arg):
    test = arg == TEST

    env = fep.FurutaEnvPosPpo(cm.RUN, render=not test)
    #env.setRender(True)
    modelBal = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    modelUp = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" %
                  (sum(buf_rew) / float(len(buf_rew)),
                   test_cutoff_count - overspeed, complete_count))
            break

        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)

            obs, rew, done, _ = env.step(action)

            if speedCheck(obs):
                overspeed += 1

            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))
Example no. 8
    def init(self, **kwargs):
        self.action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
        super().init(**kwargs)

        # the major actions
        self.major = PPO1.load(kwargs['inner_models']['major_model'])
        with open(kwargs['inner_models']['major_model_configs'], 'rb') as f:
            self.major_configs = pickle.load(f)

        self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]
Example no. 9
    def __init__(self, model, action_idx):
        """
         A simple class wrapper for lower level models. 
         Has some useful functions for learning with.

         Args:
             model (string): path to the model
             action_idx (list): the list of action indexes controlled by the model
        """
        self.model = PPO1.load(model)
        self.action_indexes = action_idx
        self.action_list = list(
            product([1, 2, 3, 4], repeat=len(self.action_indexes)))
Example no. 10
def load_env(model_name='flexible_load_first', seed=9):
    # flexible_load_first, overnight, larger_margin_cost, discount_06, flex50
    model_path = os.path.join(MODEL_PATH, model_name)
    params_name = model_name + '_params.p'
    param_path = os.path.join(MODEL_PATH, params_name)
    try:
        model = DDPG.load(model_path)
    except Exception:
        model = PPO1.load(model_path)
    env = ActiveEnv(seed=seed)
    with open(param_path, 'rb') as f:
        params = pickle.load(f)

    env.set_parameters(params)
    model.set_env(env)
    return model, env
Example no. 11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm") 
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha")
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm 
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy

        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")

    del model # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)
    
    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"

    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str + "\n")
    else:
        print(str(mean_reward) + hparams_str)
Example no. 12
def eval_on_dir(args, env, modelFile, evalcount, evaldir, evaltype, evaltime):
    model = PPO1.load(modelFile)

    successes = []
    failures = []
    timeouts = []
    lengths = []
    for filename in os.listdir(evaldir):  ### PARALLELIZE THIS!!!
        if filename.endswith(".p"):
            evalfile = os.path.join(evaldir, filename)
            success_ratio, failure_ratio, timeout_ratio, avglen = eval_on_file(
                args, model, env, evalcount, evalfile, evaltype, evaltime)
            successes.append(success_ratio)
            failures.append(failure_ratio)
            timeouts.append(timeout_ratio)
            lengths.append(avglen)

    return successes, failures, timeouts, lengths
Example no. 13
def eval(args, evaldir, modelfile, model_index):

    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    proofs_found = 0
    proofs_tried = 0
    len_sum = 0.0
    attempts_sum = 0.0
    prove.guidance_time = 0

    dirparts = evaldir.split("/")
    if dirparts[-1] == "":
        dirname = dirparts[-2]
    else:
        dirname = dirparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, dirname, args.evaltype,
                                           args.evalcount)

    for filename in os.listdir(evaldir):
        if filename.endswith(".p"):
            name = os.path.join(evaldir, filename)
            print("\n\nTrying to find proof for {}".format(name))
            proofs_tried += 1
            success, prooflen, attempts = find_one_proof_nobacktrack(
                args, model, env, name)
            if success == 1:
                proofs_found += 1
                len_sum += prooflen
                attempts_sum += attempts

        print("Found: {}/{} proofs".format(proofs_found, proofs_tried))

    print("\n\nEVALUATION")
    print("   evaltime: {}".format(args.evaltime))
    print("   evaldir: {}".format(dirname))
    print("   model_index: {}".format(model_index))
    print("   evaltype: {}".format(args.evaltype))
    print("   evalcount: {}".format(args.evalcount))
    print("   FOUND: {}/{}".format(proofs_found, proofs_tried))
    print("   Avg proof length: {}".format(safediv(len_sum, proofs_found)))
    print("   Avg attempts: {}".format(safediv(attempts_sum, proofs_found)))
Example no. 14
def mainBal(arg):
    test = arg == TEST

    env = fep.FurutaEnvPosPpoBal(cm.RUN, render=not test)
    #env.setRender(not test)
    #model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.pkl")

    buf_rew = []
    test_cutoff_count = 0
    complete_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_BAL:
            print(
                "\n***Average reward: %.3f\tLong runs: %d\tAverage count: %.3f\tCompleted: %d\tOverspeed: %d***\n"
                % (sum(buf_rew) / float(len(buf_rew)), test_cutoff_count,
                   total_count / float(test_count), complete_count, overspeed))
            break

        obs, done = env.reset(), False
        #obs[4] = ARM_TARGET_RAD
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            #obs[4] = ARM_TARGET_RAD
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MIN:
            test_cutoff_count += 1
        print("Episode reward: %.3f\tCount: %d" % (episode_rew, count))
Example no. 15
    def __init__(self, env=None, load_dir=None, load_type=None, **kwargs):
        self.algorithm = load_type

        if self.algorithm == 'ars':
            params = np.load(load_dir + 'params1.npy')
            policy_params = {
                'ob_dim': 23,
                'ac_dim': 12,
                'ob_filter': 'NoFilter',
                'hsize': 2,
                'numlayers': 32
            }

            if kwargs["mode"] == 'mlp':
                self.agent = MLPPolicy(policy_params)
                self.agent.load(params)
            elif kwargs["mode"] == 'linearbias':
                self.agent = LinearBiasPolicy(policy_params)
                self.agent.load(params)
            else:
                raise NotImplementedError

        elif self.algorithm == 'openloop':
            if kwargs["mode"] == '2finger':
                config_data = util.read_config_file(
                    'gym_roam_hand_2fin_grasping_baseline.cfg', '')
            elif kwargs["mode"] == '3finger':
                config_data = util.read_config_file('roam_grasping_3fin.cfg',
                                                    '')
            else:
                raise NotImplementedError
            self.agent = OpenLoopPolicy(config_data, env)

        elif self.algorithm == 'ppo1':
            self.agent = PPO1.load("ppo1_roam")

        elif self.algorithm == 'ppo2':
            self.agent = PPO2.load("{}/trained_model".format(load_dir))

        else:
            raise NotImplementedError
Example no. 16
def mainUp(arg):
    test = arg == TEST

    env = fep.FurutaEnvPosPpoUp(cm.RUN, render=not test)
    #env.setRender(True)
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print(
                "\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d***"
                % (sum(buf_rew) / float(len(buf_rew)), total_count /
                   float(test_count), test_cutoff_count - overspeed))
            break

        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" %
              (episode_rew / count, count))
Example no. 17
import gym
import pybulletgym
from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO1
import time

if __name__ == '__main__':
    env = gym.make('Walker2DPyBulletEnv-v0')
    model = PPO1.load("sac_sliding")
    env.render()
    ob = env.reset()
    reward = 0

    while True:
        action, _states = model.predict(ob)
        ob, r, done, info = env.step(action)
        reward += r
        time.sleep(0.01)
        if done:
            ob = env.reset()
            print('r is {}'.format(r))
            print('Episode reward is {}'.format(reward))
            reward = 0
Example no. 18
import gym
import numpy as np
from matplotlib import pyplot as plt
from stable_baselines.common.policies import MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy,CnnPolicy
from stable_baselines.common import make_vec_env
from stable_baselines import TRPO, ACKTR, A2C, SAC, PPO1

# multiprocess environment
env = gym.make('gym_squeeze:squeeze-v0')

model = PPO1.load("ppo1_squeeze")
#model.save("quello_quasi_buono_squeeze")

obs = env.reset()
x = []
rc = []
sc = []
rew = []
done = []
dones = False
i = 0

rc2 = []
sc2 = []
rew2 = []
done2 = []

dones = False
i = 0
obs2 = obs
while not dones:
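    # NOTE: the loop body is truncated in the original example. A minimal
    # rollout sketch (assuming the buffers above collect per-step data;
    # not taken from the source) might look like:
    action, _states = model.predict(obs)
    obs, reward, dones, info = env.step(action)
    x.append(i)
    rew.append(reward)
    done.append(dones)
    i += 1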
Example no. 19
print("original action space: ", env.action_space)
print("original observation space: ", env.observation_space)

env_player1 = RoboSumoWrapper(env, player_id=1)
policy1 = PPO1(MlpPolicy, env_player1, verbose=1)

env_player0 = RoboSumoWrapper(env)
policy0 = PPO1(MlpPolicy, env_player0, verbose=1)

env_player0.opponent_policy = policy1

print("action space of policy0 is: ", policy0.action_space)
print("observation  space of policy0 is: ", policy0.observation_space)

policy0.learn(total_timesteps=5)
policy0.save("policy0")

del policy0  # remove to demonstrate saving and loading

model = PPO1.load("policy0")

obs = env_player0.reset()
while True:
    print("shape of obs is: ", obs.shape)
    action, _states = model.predict(obs)

    print("action is: ", action)
    obs, rewards, dones, info = env_player0.step(action)
    # obs = env.state
    env.render(mode="human")
Example no. 20
def main():

  parser = custom_arg_parser()
  args = parser.parse_args()
  load_defaults(args)
  print("Arguments:{}".format(args))
  # Create the model name with all the parameters
  
  model_dir_name = serialize_args(args)
  print("Model name: {}".format(model_dir_name))
  if args.model is not None:
    model_save_path = os.path.dirname(args.model) + "/"
    tb_save_path = model_save_path.replace("learned_models","tb_logs")
  else:
    model_save_path = "../../learned_models/" + model_dir_name + "/"
    tb_save_path = "../../tb_logs/" +  model_dir_name + "/"
  print("Model save path:{}".format(model_save_path))
  print("TB logs save path:{}".format(tb_save_path))
  final_model_path = model_save_path + "final_" + model_dir_name
  model_load_path = args.model
  show_render = args.visualize

  # Save args to json for training from checkpoints
  if not os.path.exists(model_save_path):
    os.makedirs(model_save_path)
    with open(model_save_path + "args.json", 'w+') as f:
      json.dump(vars(args),f,indent=2,sort_keys=True)

  env = GymWrapper(
      suite.make(
      "JR2Door",
      has_renderer        = show_render,
      use_camera_obs      = False,
      ignore_done         = False,
      control_freq        = args.control_freq,
      horizon             = args.horizon,
      door_type           = args.door_type,
      bot_motion          = args.bot_motion,
      robot_pos           = args.robot_pos,
      robot_theta         = args.robot_theta,
      dist_to_handle_coef = args.rcoef_dist_to_handle,
      door_angle_coef     = args.rcoef_door_angle,
      handle_con_coef     = args.rcoef_handle_con,
      body_door_con_coef  = args.rcoef_body_door_con,
      self_con_coef       = args.rcoef_self_con,
      arm_handle_con_coef = args.rcoef_arm_handle_con,
      arm_door_con_coef   = args.rcoef_arm_door_con,
      force_coef          = args.rcoef_force,
      gripper_touch_coef  = args.rcoef_gripper_touch,
      dist_to_door_coef   = args.rcoef_dist_to_door,
      wall_con_coef       = args.rcoef_wall_con,
      reset_on_large_force= args.reset_on_large_force,
      debug_print         = args.print_info,
      eef_type            = args.eef_type,
      door_init_qpos      = args.door_init_qpos,
      goal_offset         = args.goal_offset,
    )
  )
  
  if args.slurm:
    env = SubprocVecEnv([lambda: env for i in range(args.n_cpu)])
  else:
    env = DummyVecEnv([lambda: env])

  # Load the specified model, if there is one
  if args.model is not None:
    # Training from checkpoint, so need to reset timesteps for tb
    reset_num_timesteps = False
    if args.rl_alg == "ppo2":
      model = PPO2.load(model_load_path,env=env)
      print("Succesfully loaded PPO2 model")
    if args.rl_alg == "ppo1":
      model = PPO1.load(model_load_path,env=env)
      print("Succesfully loaded PPO1 model")
  else: 
    # New model, so need to reset timesteps for tb
    reset_num_timesteps = True
    if args.rl_alg == "ppo2":
      model = PPO2(
                  args.policy,
                  env,
                  verbose=args.verbose,
                  n_steps=args.n_steps,
                  nminibatches=args.minibatches,
                  noptepochs=args.opt_epochs,
                  cliprange=args.clip_range,
                  ent_coef=args.ent_coef,
                  tensorboard_log=tb_save_path,
                  #full_tensorboard_log=True
                  )

    elif args.rl_alg == "ppo1":
      model = PPO1(
                  args.policy,
                  env,
                  verbose=args.verbose,
                  timesteps_per_actorbatch=args.n_steps,
                  optim_epochs=args.opt_epochs,
                  tensorboard_log=tb_save_path,
                  )
  if args.replay:
    # Replay a policy
    obs = env.reset()
    count = 0
    with open('episode-reward.csv', mode='w') as fid:
      writer = csv.writer(fid, delimiter=',')
      writer.writerow("reward")
    while(count < 1000):
      env.render()
      count += 1
      print(count)
    while True:
      if args.model is None:
        print("Error: No model has been specified")
      action, _states = model.predict(obs,deterministic=True)
      #print("action {}".format(action))
      obs, reward, done, info = env.step(action)
      env.render()
      #print(obs)
      #print(env.sim.data.qpos[env._ref_joint_vel_indexes])
      #time.sleep(0.1)

      with open('episode-reward.csv', mode='a') as fid:
        writer = csv.writer(fid, delimiter=',')
        writer.writerow(reward)

      #if done:
      #  quit()
  else:
    # Train
    model.learn(
                total_timesteps = args.total_timesteps,
                save_dir = model_save_path,
                render=show_render,
                reset_num_timesteps=reset_num_timesteps,
                )

    model.save(final_model_path)
  
    print("Done training")
    obs = env.reset()
Example no. 21
save_name = "model/gail_exp1"
epochs = 10
timestep_per_epoch = int(1e5)
expert_n_episodes = 100
############################################

if __name__ == "__main__":

    if not os.path.exists(save_name):
        os.makedirs(save_name)

    # Generate expert trajectories (train expert)
    print("\n...Generate expert trajectories\n")
    env = PrticleEnv(alpha=1, beta=10, win_thre=1, max_timestep=256)
    model = PPO1.load("model/part_circle_exp2_epoch05_sib.zip")
    model.set_env(env)
    generate_expert_traj(model,
                         'expert_part_circle_exp2_epoch05_sib',
                         n_episodes=expert_n_episodes)
    print("...finish\n")

    # Load the expert dataset
    print("\n...Load the expert dataset\n")

    dataset = ExpertDataset(
        expert_path='expert_part_circle_exp2_epoch05_sib.npz',
        traj_limitation=-1,
        verbose=1)
    print("...finish\n")
Example no. 22
parser = argparse.ArgumentParser()
parser.add_argument("--algorithm") 
args = parser.parse_args()

algorithm = args.algorithm 

env = gym.make('CartPole-v0')

if algorithm == "ppo1":
    from stable_baselines import PPO1
    from stable_baselines.common.policies import MlpPolicy
    
    model = PPO1(MlpPolicy, env, verbose=1)
else:
    from stable_baselines import DQN
    from stable_baselines.deepq.policies import MlpPolicy

    model = DQN(MlpPolicy, env, verbose=1)

model.learn(total_timesteps=int(2e4), log_interval=10)
model.save(f"{algorithm}_cartpole")

del model # remove to demonstrate saving and loading

if algorithm == "ppo1":
    model = PPO1.load(f"{algorithm}_cartpole")
else:
    model = DQN.load(f"{algorithm}_cartpole")

mean_reward = evaluate(model, num_steps=10000)
Example no. 23
    parser = argparse.ArgumentParser(
        description='Evaluate pre-trained PPO agent.')
    parser.add_argument('--model-path',
                        help='path to stable-baselines model.',
                        type=str,
                        default="zoo/ppo/best_model.zip")
    parser.add_argument('--render',
                        action='store_true',
                        help='render to screen?',
                        default=False)

    args = parser.parse_args()
    render_mode = args.render

    env = gym.make("SlimeVolley-v0")

    # the yellow agent:
    print("Loading", args.model_path)
    policy = PPO1.load(args.model_path, env=env)  # 96-core PPO1 policy

    history = []
    for i in range(1000):
        env.seed(seed=i)
        cumulative_score = rollout(env, policy, render_mode)
        print("cumulative score #", i, ":", cumulative_score)
        history.append(cumulative_score)

    print("history dump:", history)
    # this is what I got: [1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 2, 1, 1, 1, 1, 4, 0, 0, 0, 2, 2, 0, 1, 2, 2, 2, 1, 2, 1, 0, 2, 0, 1, 1, 1, 0, 0, 1, 0, 1, 2, 0, 0, 1, 1, 4, 1, 0, 2, 2, 3, 2, 4, 4, 1, 1, 2, 0, 0, 0, 4, 1, 1, 2, 0, 1, 1, 1, 2, 1, 1, 3, 2, 0, 1, 1, 1, 2, 2, 1, 1, 0, 0, 0, 1, 1, 1, 2, 5, 3, 3, 0, 0, 1, 0, 0, 2, 2, 1, 2, 1, 1, 0, 1, 0, 1, 1, 2, 2, 1, 3, 4, 0, 0, 0, 3, 0, 1, 5, 2, 4, 0, 1, 1, 1, 3, 0, 1, 2, 1, 1, 2, 1, 1, 2, 0, 1, 1, 0, 1, 0, 1, 2, 0, 2, 0, 2, 1, 1, 1, 0, 0, 0, 2, 2, 1, 0, 0, 0, 3, 0, 1, 3, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 2, 0, 1, 2, 1, 0, 0, 1, 2, 0, 2, 1, 0, 1, 2, 2, 0, 2, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 2, 0, 1, 1, 0, 2, 1, 0, 1, 0, 1, 0, 1, 3, 2, 2, 1, 2, 0, 2, 2, 0, 1, 0, 1, 0, 0, 2, 1, 2, 1, 0, 2, 1, 0, 1, 0, 2, 1, 1, 1, 2, 2, 2, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 2, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 2, 0, 2, 0, 0, 1, 1, 0, 1, 2, 1, 0, 2, 3, 3, 4, 0, 0, 1, 0, 1, 1, 2, 0, 1, 0, 1, 0, 2, 1, 0, 3, 0, 0, 1, 1, 1, 2, 2, 0, 0, 2, 0, 0, 1, 2, 4, 0, 2, 0, 1, 1, 1, 0, 1, 2, 1, 0, 0, 4, 1, 0, 0, 0, 0, 2, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 2, 1, 0, 1, 1, 2, 0, 0, 0, 1, 4, 2, 3, 0, 3, 1, 0, 0, 1, 2, 2, 1, 0, 0, 1, 2, 0, 2, 1, 0, 1, 0, 0, 0, 1, 0, 2, 1, 2, 0, 1, 1, 2, 1, 0, 1, 0, 1, 1, 2, 0, 2, 0, 0, 1, 1, 0, 0, 2, 0, 2, 0, 1, 2, 2, 3, 1, 1, 0, 0, 1, 1, 4, 2, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 2, 3, 0, 0, 2, 2, 0, 3, 1, 0, 2, 0, 1, 0, 0, 2, 1, 2, 3, 1, 0, 1, 0, 1, 2, 1, 0, 2, 0, 0, 1, 0, 0, 1, 1, 1, 0, 2, 1, 0, 2, 2, 0, 1, 0, 1, 0, 5, 2, 2, 0, 1, 2, 0, 2, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 1, 0, 1, 1, 2, 0, 0, 2, 0, 0, 3, 2, 2, -1, 3, 1, 1, 2, 0, 0, 2, 1, 1, 0, 1, 1, 3, 0, 2, 1, 1, 0, 3, 2, 1, 0, 2, 1, 2, 0, 1, 0, 2, 0, 2, 0, 3, 0, 0, 1, 0, 0, 1, 0, 0, 0, 2, 1, 2, 0, 3, 0, 2, 0, 1, 2, 1, 0, 0, 1, 2, 1, 0, 0, 4, 3, 0, 2, 1, 0, 0, 0, 2, 2, 1, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 2, 0, 0, 0, 2, 2, 2, 0, 0, 4, 3, 0, 0, 1, 0, 1, 1, 3, 3, 1, 0, 1, 1, 0, 0, 3, 3, 0, 2, 3, 1, 2, 1, 3, 2, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 2, 0, 1, 1, 2, 1, 3, 1, 2, 0, -1, 0, 1, 0, 1, 4, 4, 0, 0, 0, 1, 0, 1, 0, 1, 3, 1, 0, 1, 1, 1, 0, 1, 1, 0, 2, 0, 2, 0, 0, 2, 1, 1, 1, 0, 1, 3, 1, 0, 0, 0, 1, 1, 0, 1, 2, 0, 2, 2, 0, 1, 0, 2, 3, 1, 1, 1, 1, 0, 2, 2, 1, 2, 0, 0, 2, 0, 1, 3, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 2, 0, 2, 1, 1, 3, 1, 2, 2, 0, 1, 0, 2, 0, 1, 2, 0, 1, 2, 1, 0, 0, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0, 0, 1, 2, 1, 2, 0, 0, 0, 2, 1, 1, 3, 1, 2, 2, 2, 2, 0, 1, 1, 1, 2, 0, 1, 4, 0, 0, 0, 1, 4, 0, 1, 4, 1, 2, 1, 1, 3, 3, 3, 4, 1, 0, 1, 0, 0, 3, 1, 4, 1, 3, 1, 1, 1, 0, 2, 4, 1, 0, 3, 2, 1, 0, 0, 3, 1, 2, 0, 0, 0, 4, 0, 1, 0, 1, 1, 0, 0, 0, 0, 2, 1, 1, 0, 2, 3, 0, 1, 0, 1, 1, 2, 0, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 0, 0, 1, 0, 0, 1, 2, 1, 3, 2, 0, 2, 0, 0, 0, 0, 1, 0, 2, 0, 1, 0, 1, 0, 1, 0, 0, 2, 1, 1, 2, 0, 1, 1, 1, 1, 2, 1, 0, 1, 0, 2, 3, 3, 0, -1, 2, 0, 1, 1, 3, 0, 1, 0, 0, 3, 0, 2, 0, 0, 1, 0, 2, 2, -1, 1, 0, 0, 1, 0, 1, 1, 0, 2, 1, 3, 1, 0, 2, 2, 1, 1, 1, 1, 1, 3, 1, 1, 2, 0, 2, 2, 1, 0, 0, 2, 0, 1, 2, 3, 2, 3, 0, 3, 2, 3, 2]
    print("average score", np.mean(history), "standard_deviation",
          np.std(history))
Example no. 24
RL_method = "PPO1"
experiment_ID = "experiment_4"
stiffness_value = "stiffness_test16"
save_name_extension = RL_method

log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method, stiffness_value)



# defining the environments
env = gym.make('NmiLeg-v1')
#env = DummyVecEnv([lambda: env])

# loading the trained model
if RL_method == "PPO1":
	model = PPO1.load(log_dir+"/model.pkl")
elif RL_method == "PPO2":
	model = PPO2.load(log_dir+"/model.pkl")
	env = DummyVecEnv([lambda: env])
elif RL_method == "DDPG":
	model = DDPG.load(log_dir+"/model.pkl")
	env = DummyVecEnv([lambda: env])
else:
	raise ValueError("Invalid RL mode")
# setting the environment

model.set_env(env)

env_run = gym.make('NmiLeg-v1')
#env_run = Monitor(env_run,'./video/'+log_dir,force=True)
#model = DDPG.load("PPO2-HalfCheetah_nssu-v3_test2")
Example no. 25
    def __init__(self, path):
        print(path)
        self.model = PPO1.load(path)
Example no. 26
    else:
        raise ValueError('No such environment!')
    env.render()

    if args.alg == 'td3' or args.alg == 'TD3':
        from stable_baselines import TD3
        model = TD3.load(args.file)
    elif args.alg == 'ddpg' or args.alg == 'DDPG':
        from stable_baselines import DDPG
        model = DDPG.load(args.file)
    elif args.alg == 'SAC' or args.alg == 'sac':
        from stable_baselines import SAC
        model = SAC.load(args.file)
    elif args.alg == 'ppo1' or args.alg == 'PPO1':
        from stable_baselines import PPO1
        model = PPO1.load(args.file)
    elif args.alg == 'ppo2' or args.alg == 'PPO2':
        from stable_baselines import PPO2
        model = PPO2.load(args.file)
    else:
        raise ValueError('No such algorithm')

    ob = env.reset()
    reward = 0

    while True:
        action, _states = model.predict(ob)
        ob, r, done, info = env.step(action)
        reward += r
        time.sleep(0.01)
        if done:
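            # NOTE: the original snippet is cut off here. Mirroring the identical
            # loop in the Walker2D example above, the episode would typically be
            # reset and its cumulative reward reported (a sketch, not from the source):
            ob = env.reset()
            print('r is {}'.format(r))
            print('Episode reward is {}'.format(reward))
            reward = 0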
Example no. 27
def eval_mpi(args, evaldir, modelfile, model_index):

    from mpi4py import MPI as mpi
    rank = mpi.COMM_WORLD.Get_rank()
    world_size = mpi.COMM_WORLD.Get_size()  # number of MPI workers

    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    dirparts = evaldir.split("/")
    if dirparts[-1] == "":
        dirname = dirparts[-2]
    else:
        dirname = dirparts[-1]

    evalprefix = "eval_{}_{}_{}_{}".format(model_index, dirname, args.evaltype,
                                           args.evalcount)

    proofs_found = 0
    proofs_tried = 0
    len_sum = 0.0
    attempts_sum = 0.0
    prove.guidance_time = 0

    filenames_original = sorted([
        filename for filename in os.listdir(evaldir) if filename.endswith(".p")
    ])

    def data_gen(filenames, i):
        return filenames[i % len(filenames)]

    chunks = int(len(filenames_original) / world_size) + 1
    filenames_extended = [
        data_gen(filenames_original, i) for i in range(chunks * world_size)
    ]  # [rank:][::world_size]
    assert len(filenames_extended) > 0
    for index in range(chunks):
        chunk = filenames_extended[index * world_size:(index + 1) * world_size]
        assert len(chunk) == world_size
        name = os.path.join(evaldir, chunk[rank])
        print("\n\nTrying to find proof for {}".format(name))
        success, prooflen, attempts = find_one_proof_nobacktrack(
            args, model, env, name)
        results = mpi.COMM_WORLD.gather((1, success, prooflen, attempts),
                                        root=0)
        if rank == 0:
            # print(results)
            for i in range(len(results)):
                proofs_tried += results[i][0]
                succ = results[i][1]
                if succ == 1:
                    proofs_found += 1
                    len_sum += results[i][2]
                    attempts_sum += results[i][3]
            logger.record_tabular("update_no", proofs_tried)
            logger.record_tabular("{}_proofs_found".format(evalprefix),
                                  proofs_found)
            logger.record_tabular("{}_found".format(evalprefix),
                                  safediv(proofs_found, proofs_tried))
            logger.record_tabular("{}_avg_prooflen".format(evalprefix),
                                  safediv(len_sum, proofs_found))
            logger.record_tabular("{}_avg_attempts".format(evalprefix),
                                  safediv(attempts_sum, proofs_found))
            logger.dumpkvs()
            print("Found: {}/{} proofs".format(proofs_found, proofs_tried))

    print("\n\nEVALUATION {}".format(rank))
    print("   evaltime: {}".format(args.evaltime))
    print("   evaldir: {}".format(dirname))
    print("   model_index: {}".format(model_index))
    print("   evaltype: {}".format(args.evaltype))
    print("   evalcount: {}".format(args.evalcount))
    print("   FOUND: {}/{}".format(proofs_found, proofs_tried))
    print("   Avg proof length: {}".format(safediv(len_sum, proofs_found)))
    print("   Avg attempts: {}".format(safediv(attempts_sum, proofs_found)))
Example no. 28
def ppo1_test():

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'],
                         settings['strategy_name'], settings['total_steps'],
                         settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'], settings['debug'])
    #   Create the vectorized environment
    #   v_env = DummyVecEnv([lambda: v_env])
    #   Normalize environment
    #   v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'], clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'], gamma=p_gamma, epsilon=EPS)

    model = PPO1.load(MODELS_DIR + settings['model_name'])

    # Strategy

    obs = v_env.reset()
    dones = False

    while not dones:
        action, _states = model.predict(obs, deterministic=True)
        obs, rewards, dones, info = v_env.step(action)
        #   v_env.render(mode='ansi')

    v_env.strategy_name = 'fxcm_XXXUSD_H4 returns PPO1'
    v_env.render(mode='human')

    #   pv,pp,pw=v_env.get_summary()
    #   pr=pv.sum(axis=1).pct_change().fillna(0)

    pr = v_env.returns

    # Random agent

    obs = v_env.reset()
    dones = False

    while not dones:
        # action, _states = model.predict(obs, deterministic=True)
        action = v_env.action_sample
        obs, rewards, dones, info = v_env.step(action)
        #   v_env.render(mode='ansi')

    v_env.strategy_name = 'Random agent'
    v_env.render(mode='human')

    # Buy and hold

    obs = v_env.reset()
    dones = False

    weights = np.concatenate(
        (np.ones(len(v_env.instruments)) / len(v_env.instruments), [0]))

    #   print(weights)

    while not dones:
        obs, rewards, dones, info = v_env.step(action=weights)
        weights = v_env.current_weights
        v_env.render(mode='ansi')

    v_env.strategy_name = 'Buy and hold'
    v_env.render(mode='human')

    bpv, bpp, bpw = v_env.get_summary()
    bpr = bpv.sum(axis=1).pct_change().fillna(0)

    bpr = v_env.returns
    '''
    #   Extended
    pv,pp,pw=v_env.get_summary()
    pv.sum(axis=1).plot()
    plt.title('strategy')
    plt.show()

    
    bpv,bpp,bpw=v_env.get_summary()
    bpv.sum(axis=1).plot()
    plt.title('buy and hold')
    plt.show()

    pr=pv.sum(axis=1).pct_change().fillna(0)
    bpr=bpv.sum(axis=1).pct_change().fillna(0)
    '''
    #   pf.create_simple_tear_sheet(returns=pr,benchmark_rets=bpr)
    pf.create_full_tear_sheet(returns=pr, benchmark_rets=bpr)
Example no. 29
RL_method = "PPO1"
experiment_ID = "handtest_rot_pool_with_MC_C_task4/"
mc_cntr = 10
sensory_value = 0
sesnory_value_str = "sensory_{}".format(sensory_value)
save_name_extension = RL_method
log_dir_read = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sesnory_value_str)
log_dir_write = "./logs/{}/videos/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sesnory_value_str)

# defining the environments
env = gym.make('HandManipulate-v1{}'.format(sensory_value))
#env = DummyVecEnv([lambda: env])

# loading the trained model
if RL_method == "PPO1":
    model = PPO1.load(log_dir_read +"/model")
elif RL_method == "PPO2":
    model = PPO2.load(log_dir_read+"/model.pkl")
    env = DummyVecEnv([lambda: env])
elif RL_method == "DDPG":
    model = DDPG.load(log_dir_read+"/model")
    env = DummyVecEnv([lambda: env])
else:
    raise ValueError("Invalid RL mode")
# setting the environment

model.set_env(env)

env_run = gym.make('HandManipulate-v1{}'.format(sensory_value))
#env_run = Monitor(env_run,log_dir_write,force=True)
#model = DDPG.load("PPO2-HalfCheetah_nssu-v3_test2")
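# NOTE: the original script is truncated here. A typical continuation (a sketch,
# not taken from the source) would roll out the loaded policy on env_run:
obs = env_run.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env_run.step(action)
    env_run.render()
    if done:
        obs = env_run.reset()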
Example no. 30
import gym

from stable_baselines.common.policies import MlpPolicy
from stable_baselines import PPO1
from env.RSEnv import RSEnv
from env.TestRSEnv import TestRSEnv

env = RSEnv()

#model = PPO1(MlpPolicy, env, verbose=1)
model = PPO1.load("sbppov3")
model.set_env(env)
model.learn(total_timesteps=3000000,
            log_interval=10,
            reset_num_timesteps=False)
model.save("sbppov4")

env = TestRSEnv()
obs = env.reset()
done = False
while not done:
    action, _ = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()
env.close()