Example #1
def train(env_name,
          num_time_steps,
          policy_kwargs,
          eval_ep,
          eval_freq,
          ckpt_freq,
          load_model=None):
    env = gym.make(env_name)
    env_ = gym.make(env_name)
    rank = MPI.COMM_WORLD.Get_rank()
    today = date.today()
    today = str(today).replace('-', '_')
    now = datetime.now()
    current_time = now.strftime("%H_%M_%S")
    model_name = env_name + '_PPO1_' + today + '_' + current_time
    Path('./run/' + model_name).mkdir(parents=True, exist_ok=True)
    path = os.path.join(os.path.dirname(__file__), './run/' + model_name)

    ############################
    #         callback         #
    ############################
    callbacklist = []
    eval_callback = EvalCallback_wandb(env_,
                                       n_eval_episodes=eval_ep,
                                       eval_freq=eval_freq,
                                       log_path=path)
    ckpt_callback = CheckpointCallback(save_freq=ckpt_freq,
                                       save_path='./run/' + model_name +
                                       '/ckpt',
                                       name_prefix='')
    callbacklist.append(eval_callback)
    callbacklist.append(ckpt_callback)
    callback = CallbackList(callbacklist)

    if load_model:
        model = PPO1.load(env=env, load_path=load_model)
    else:
        model = PPO1(MlpPolicy, env, verbose=1, policy_kwargs=policy_kwargs)

    ############################
    #          Logging         #
    ############################
    if rank == 0:
        logger.configure(path)
        config = {}
        config['load'] = [{'load_model': load_model}]
        config['eval'] = [{'eval_freq': eval_freq, 'eval_ep': eval_ep}]
        config['ckpt'] = [{'ckpt_freq': ckpt_freq}]
        config['policy'] = [{'policy_network': policy_kwargs}]
        with open('./run/' + model_name + '/' + model_name + '.txt',
                  'w+') as outfile:
            json.dump(config, outfile, indent=4)
    else:
        logger.configure(path, format_strs=[])
    ############################
    #            run           #
    ############################

    model.learn(total_timesteps=int(num_time_steps), callback=callback)
    model.save(path + '/finish')
Example #2
def train():
  """
  Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
  """
  rank = MPI.COMM_WORLD.Get_rank()

  if rank == 0:
    logger.configure(folder=LOGDIR)

  else:
    logger.configure(format_strs=[])
  workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
  set_global_seeds(workerseed)
  env = make_env(workerseed)

  env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
  env.seed(workerseed)

  model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=4096, clip_param=0.2, entcoeff=0.0, optim_epochs=10,
               optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear',
               verbose=1)

  eval_callback = EvalCallback(env, best_model_save_path=LOGDIR, log_path=LOGDIR, eval_freq=EVAL_FREQ, n_eval_episodes=EVAL_EPISODES)

  model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

  env.close()
  del env
  if rank == 0:
    model.save(os.path.join(LOGDIR, "final_model")) # probably never get to this point.
Example #3
def getPpo1(env, arch):
    return PPO1(
        env=env,
        policy=MlpPolicy,
        policy_kwargs=dict(net_arch=arch),
        n_cpu_tf_sess=None
    )
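A minimal usage sketch for the factory above (the CartPole environment, network architecture, and timestep count are illustrative placeholders, not part of the original example):

import gym

env = gym.make('CartPole-v1')          # placeholder environment
model = getPpo1(env, arch=[64, 64])    # two hidden layers of 64 units each
model.learn(total_timesteps=10000)     # short run, for illustration only
model.save('ppo1_cartpole_64x64')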
Example #4
def train(env_id, num_timesteps, seed):
    """
    Train PPO1 model for Atari environments, for testing purposes

    :param env_id: (str) Environment ID
    :param num_timesteps: (int) The total number of samples
    :param seed: (int) The initial seed for training
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    env = bench.Monitor(env, logger.get_dir() and
                        os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4,
                 optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2)
    model.learn(total_timesteps=num_timesteps)
    env.close()
    del env
Example #5
def main(args):
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    env = gym.make(args.env)
    train_log_dir = os.path.join(
        args.train_log_dir,
        args.env + '_' + args.expert + '_' + args.policy_type)
    if args.expert == 'PPO':
        expert_model = PPO1(args.policy_type,
                            env,
                            verbose=1,
                            tensorboard_log=train_log_dir)
    else:
        raise NotImplementedError
    expert_model.learn(total_timesteps=args.expert_training_step)
    generate_expert_traj(expert_model,
                         os.path.join(train_log_dir, 'expert_traj'),
                         n_timesteps=1000,
                         n_episodes=args.expert_episodes)

    dataset = ExpertDataset(expert_path=os.path.join(train_log_dir,
                                                     'expert_traj.npz'),
                            traj_limitation=-1)
    gail_model = GAIL(args.policy_type,
                      env,
                      dataset,
                      verbose=1,
                      tensorboard_log=train_log_dir)
    gail_model.learn(args.student_training_step)
    evaluate(gail_model, env, num_steps=10000)
    gail_model.save(train_log_dir)
    env.close()
Example #6
    def run_model(self, K, model=None, configs=None):
        if(model is None):
            model = self.model
        if(configs is None):
            configs = self.configs

        with open(configs, 'rb') as f:
            env_dict = pickle.load(f)
        
        env = gym.make(env_dict['env_name'])
        env.settings.set("matchframes", self.turns*10)

        env.init(**env_dict)
        
        with open(model, 'rb') as f:
            h = PPO1.load(f, env=env)
        obs = env.reset()

        generated_trajectories = []
        for episode in range(K):
            tau = []
            for turn in range(self.turns):
                action, _ = h.predict(obs)
                obs, _, done, _ = env.step(action)
                tau.append(obs)
                if done:
                    obs = env.reset()
            generated_trajectories.append(tau)

        env.close()
        return np.array(generated_trajectories).astype(float)  # np.float is deprecated/removed in newer NumPy
Example #7
    def init(self, **kwargs):
        self.action_idx = [10, 11, 18, 19, 20, 21]
        super().init(**kwargs)

        models = kwargs['inner_models']
        # set up major model and environment parameters
        self.major = PPO1.load(models['major_model'])
        with open(models['major_model_configs'], 'rb') as f:
            self.major_configs = pickle.load(f)
        self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]

        # set up minor model and environment parameters
        self.minor = PPO1.load(models['minor_model'])
        with open(models['minor_model_configs'], 'rb') as f:
            self.minor_configs = pickle.load(f)
        self.minor_action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
Example #8
def train():
    """
    Train PPO1 model for slime volleyball, in MPI multiprocessing. Tested for 96 CPUs.
    """
    rank = MPI.COMM_WORLD.Get_rank()

    if rank == 0:
        logger.configure(folder=LOGDIR)

    else:
        logger.configure(format_strs=[])
    workerseed = SEED + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = make_env(workerseed)

    env = bench.Monitor(
        env,
        logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    model = PPO1.load(BEST_MODEL_PATH, env=env)

    eval_callback = EvalCallback(env,
                                 best_model_save_path=LOGDIR,
                                 log_path=LOGDIR,
                                 eval_freq=EVAL_FREQ,
                                 n_eval_episodes=EVAL_EPISODES)

    model.learn(total_timesteps=NUM_TIMESTEPS, callback=eval_callback)

    env.close()
    del env
    if rank == 0:
        model.save(os.path.join(
            LOGDIR, "final_model"))  # probably never get to this point.
Example #9
def eval_file(args, evalfile, modelfile, model_index):

    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    prove.guidance_time = 0

    fileparts = evalfile.split("/")
    filename = fileparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, filename,
                                           args.evaltype, args.evalcount)

    print("\n\nTrying to find proof for {}".format(evalfile))
    success, prooflen, attempts = find_one_proof_nobacktrack(
        args, model, env, evalfile)

    print("\n\nEVALUATION")
    print("   evaltime: {}".format(args.evaltime))
    print("   evalfile: {}".format(filename))
    print("   model_index: {}".format(model_index))
    print("   evaltype: {}".format(args.evaltype))
    print("   evalcount: {}".format(args.evalcount))
    print("   Success: {}".format(success))
    print("   Proof length: {}".format(prooflen))
    print("   Attempts: {}".format(attempts))
Example #10
    def load_model(self, agent_to_load_directory, is_test=False):
        if self.game_type != "atari":
            if agent_to_load_directory == "":
                self.model = PPO1.load("./models/agentPPO.pkl", env=self.env)
            else:
                self.model = PPO1.load(agent_to_load_directory, env=self.env)
        else:
            if is_test:
                if agent_to_load_directory == "":
                    self.model = PPO2.load("./models/agentPPO.pkl")
                else:
                    self.model = PPO2.load(agent_to_load_directory)
            else:
                if agent_to_load_directory == "":
                    self.model = PPO2.load("./models/agentPPO.pkl", env=self.env)
                else:
                    self.model = PPO2.load(agent_to_load_directory, env=self.env)
Example #11
    def build_model(self):
        if self.is_stack:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma, lam=self.c1, entcoeff=self.c2,
                                  clip_param=self.clip_epslion, adam_epsilon=self.lr)
            if self.game_type == "atari":
                self.model = PPO2(CnnPolicy, self.env, verbose=1, gamma=self.gamma, vf_coef=self.c1,
                                  ent_coef=self.c2, cliprange=self.clip_epslion, learning_rate=self.lr)

        else:
            if self.game_type == "box":
                self.env = DummyVecEnv([lambda: self.env])
                self.model = PPO1(MlpPolicy, self.env, verbose=0, gamma=self.gamma, lam=self.c1, entcoeff=self.c2,
                                  clip_param=self.clip_epslion, adam_epsilon=self.lr)
            if self.game_type == "atari":
                self.model = PPO2(CnnLstmPolicy, self.env, verbose=1, gamma=self.gamma, vf_coef=self.c1,
                                  ent_coef=self.c2, cliprange=self.clip_epslion, learning_rate=self.lr)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--algorithm") 
    parser.add_argument("--env")
    parser.add_argument("--steps")
    parser.add_argument("--alpha", type=float)
    parser.add_argument("--grid_search")
    args = parser.parse_args()

    algorithm = args.algorithm 
    env = gym.make(args.env)
    grid_search = args.grid_search
    alpha = args.alpha

    if algorithm == "ppo1":
        from stable_baselines import PPO1
        from stable_baselines.common.policies import MlpPolicy
        
        model = PPO1(MlpPolicy, env, verbose=1)
    else:
        from stable_baselines import DQN
        from stable_baselines.deepq.policies import MlpPolicy

        model = DQN(MlpPolicy, env, learning_rate=alpha, verbose=1)

    model.learn(total_timesteps=int(args.steps), log_interval=10)
    model.save(f"{algorithm}_cartpole")

    del model # remove to demonstrate saving and loading

    if algorithm == "ppo1":
        model = PPO1.load(f"{algorithm}_cartpole")
    else:
        model = DQN.load(f"{algorithm}_cartpole")

    mean_reward = evaluate(model, env, num_steps=10000)
    
    hparams_str = f" algorithm={algorithm} env={args.env} steps={args.steps} alpha={alpha}"

    if grid_search:
        with open("grid_search_results.txt", "a") as myfile:
            myfile.write(str(mean_reward) + hparams_str)
    else:
        print(str(mean_reward) + hparams_str)
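Several of these examples call an evaluate(model, env, num_steps) helper that is not shown above. A minimal sketch of what such a helper might look like for a plain (non-vectorized) gym environment, returning the mean episode reward; the body below is an assumption, not the original authors' implementation:

def evaluate(model, env, num_steps=10000):
    # Roll the policy out for num_steps and average the per-episode rewards.
    episode_rewards = [0.0]
    obs = env.reset()
    for _ in range(num_steps):
        action, _states = model.predict(obs)
        obs, reward, done, _info = env.step(action)
        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            episode_rewards.append(0.0)
    return sum(episode_rewards) / len(episode_rewards)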
Example #13
    def init(self, **kwargs):
        self.action_idx = [0, 2, 6, 9, 12, 13, 16, 17]
        super().init(**kwargs)

        # the major actions
        self.major = PPO1.load(kwargs['inner_models']['major_model'])
        with open(kwargs['inner_models']['major_model_configs'], 'rb') as f:
            self.major_configs = pickle.load(f)

        self.major_action_idx = [1, 3, 4, 5, 7, 8, 14, 15]
Example #14
def mainHybrid(arg):
    test = arg == TEST

    env = fep.FurutaEnvPosPpo(cm.RUN, render=not test)
    #env.setRender(True)
    modelBal = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    modelUp = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    complete_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_HYBRID:
            print("\n***Average reward: %.3f\tLong runs: %d\tComplete: %d" %
                  (sum(buf_rew) / float(len(buf_rew)),
                   test_cutoff_count - overspeed, complete_count))
            break

        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            if abs(obs[2]) > cm.deg2Rad(cm.ANGLE_TERMINAL_MIN_D):
                action, _ = modelUp.predict(obs)
            else:
                action, _ = modelBal.predict(obs)

            obs, rew, done, _ = env.step(action)

            if speedCheck(obs):
                overspeed += 1

            episode_rew += rew
            count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode reward: %.3f" % (episode_rew))
Example #15
def train(params):

    # create model
    env = FlattenObservation(gym.make(params.get("environment")))
    exp_name = params.get("model_name") + "_train_" + params.get("environment")
    log_dir = './logs/' + exp_name
    expert_name = 'expert_{0}'.format(exp_name)

    if params.get("model_name") == 'TRPO':
        print("Loading TRPO Model")
        model = TRPO(MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("model_name") == 'PPO':
        print("Loading PPO Model")
        model = PPO1(MlpPolicy,
                     env,
                     verbose=1,
                     tensorboard_log=log_dir,
                     entcoeff=params.get("ent_coef"),
                     gamma=params.get("gamma"),
                     optim_batchsize=params.get("batch_size"),
                     clip_param=params.get("clip_range"),
                     lam=params.get("gae_lambda"))
        model.learn(total_timesteps=params.get("train_steps"))
        model.save(exp_name)

    if params.get("expert_exists") is False:
        print("Training expert trajectories")
        # Train expert controller (if needed) and record expert trajectories.
        generate_expert_traj(model,
                             expert_name,
                             n_timesteps=params.get("expert_timesteps"),
                             n_episodes=params.get("n_episodes"))

    dataset = ExpertDataset(
        expert_path='{0}.npz'.format(expert_name),
        traj_limitation=-1,
        randomize=True,  # if the dataset should be shuffled
        verbose=1)

    model = GAIL('MlpPolicy', env, dataset, verbose=1,
                 tensorboard_log=log_dir)  # Check out for defaults

    if params.get("pre_train") is True:
        print("Pretraining Dataset with Behavioural Cloning")
        model.pretrain(dataset, n_epochs=10000)

    print("Executing GAIL Learning")
    model.learn(total_timesteps=params.get("train_steps"))
    model.save("BC" + exp_name)

    env.close()
    del env
Example #16
    def __init__(self, model, action_idx):
        """
         A simple class wrapper for lower level models. 
         Has some useful functions for learning with.

         Args:
             model (string): path to the model
             action_idx (list): the list of action indexes controlled by the model
        """
        self.model = PPO1.load(model)
        self.action_indexes = action_idx
        self.action_list = list(
            product([1, 2, 3, 4], repeat=len(self.action_indexes)))
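The product([1, 2, 3, 4], ...) call enumerates every combination of the four joint states over the indexes this model controls, so a flat discrete action can be decoded by indexing into action_list. A hedged sketch of such a decode helper as an extra method on this wrapper (the method name, the default of 22 total joints, and the fill value of 1 are assumptions, not part of the original class):

    def decode_action(self, flat_action, total_joints=22):
        # Hypothetical helper: expand a flat discrete action into a full joint-state
        # vector, filling the joints this model does not control with state 1.
        full_action = [1] * total_joints
        joint_states = self.action_list[int(flat_action)]
        for idx, state in zip(self.action_indexes, joint_states):
            full_action[idx] = state
        return full_action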
Example #17
def ppo1(env_id, log_dir, timesteps):
    # Create log dir
    os.makedirs(log_dir, exist_ok=True)

    # Create and wrap the environment
    env = gym.make(env_id)
    env = Monitor(env, log_dir, allow_early_resets=True)
    env = DummyVecEnv([lambda: env])

    model = PPO1(MlpPolicy, env, verbose=0)
    # Train the agent
    print("Beginning training episodes with PPO1.")
    model.learn(total_timesteps=timesteps)

    env.close()
Example #18
def load_env(model_name='flexible_load_first', seed=9):
    # available models: flexible_load_first, overnight, larger_margin_cost, discount_06, flex50
    model_path = os.path.join(MODEL_PATH, model_name)
    params_name = model_name + '_params.p'
    param_path = os.path.join(MODEL_PATH, params_name)
    try:
        model = DDPG.load(model_path)
    except Exception:
        model = PPO1.load(model_path)
    env = ActiveEnv(seed=seed)
    with open(param_path,'rb') as f:
        params = pickle.load(f)

    env.set_parameters(params)
    model.set_env(env)
    return model, env
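A short usage sketch for load_env (assuming the saved models and the ActiveEnv environment referenced above are available; the rollout loop is illustrative, not from the original source):

model, env = load_env(model_name='flexible_load_first', seed=9)
obs = env.reset()
for _ in range(1000):
    action, _states = model.predict(obs)
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()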
Example #19
def test_action_mask_run_ppo1(vec_env, policy, env_class):
    env = vec_env([env_class])

    model = PPO1(policy, env, verbose=0)

    obs, done, action_masks = env.reset(), [False], []
    while not done[0]:
        action, _states = model.predict(obs, action_mask=action_masks)
        obs, _, done, infos = env.step(action)

        action_masks.clear()
        for info in infos:
            env_action_mask = info.get('action_mask')
            action_masks.append(env_action_mask)

    env.close()
Example #20
def ppo1_train():

    # best params: fxcm_11_H4_full_2015_2018_train_6300

    v_policy = MlpPolicy  #   policies = [MlpPolicy, MlpLstmPolicy, MlpLnLstmPolicy]
    v_gamma = 0.99  #  default 0.99
    v_learning_rate = 0.0003  #  default 0.0003
    v_ent_coef = 'auto'  #  default 'auto'

    v_env = PortfolioEnv(settings['data_file'], settings['output_file'],
                         settings['strategy_name'], settings['total_steps'],
                         settings['window_length'], settings['capital_base'],
                         settings['lot_size'], settings['leverage'],
                         settings['commission_percent'],
                         settings['commission_fixed'],
                         settings['max_slippage_percent'],
                         settings['start_idx'], settings['compute_indicators'],
                         settings['compute_reward'],
                         settings['compute_position'], settings['debug'])
    #   Create the vectorized environment
    #   v_env = DummyVecEnv([lambda: v_env])
    #   Normalize environment
    #   v_env = VecNormalize(v_env, norm_obs=settings['norm_obs'], norm_reward=settings['norm_reward'], clip_obs=settings['clip_obs'], clip_reward=settings['clip_reward'], gamma=p_gamma, epsilon=EPS)

    #   n_actions = v_env.action_space.shape[-1]
    #   v_action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5) * np.ones(n_actions))
    v_action_noise = None

    #   for v_policy, v_gamma, v_lam in it.product(p_policy, p_gamma, p_lam):
    #   print(str(v_policy) + '_' + str(v_gamma) + '_' + str(v_lam))

    model_name = settings['model_name'] + '_' + str(
        settings['total_timestamp']) + '_' + str(
            settings['window_length']) + '_' + str(
                settings['compute_indicators']) + '_' + str(v_gamma) + '_' + (
                    uuid.uuid4().hex)[:16]

    model = PPO1(env=v_env,
                 policy=v_policy,
                 gamma=v_gamma,
                 verbose=0,
                 tensorboard_log='log_' + model_name)
    model.learn(total_timesteps=(settings['total_timestamp']))
    model.save(MODELS_DIR + model_name)
    #   v_env.save_running_average(MODELS_DIR)

    del model
Example #21
def train(env_dict, save_folder, log_dir):
    """
     Run training on a Toribash environment. Saves a model and the environment
     configurations used. Because the actions may need to be reproduced later, this
     method builds the action space here and saves it to the environment dictionary.

     Args:
        env_dict (dictionary): The dictionary from the yaml file.
        save_folder (filepath): path to save models
        log_dir (filepath): path to save logs; when this file is run directly, it is created inside save_folder
    """


    # setting up reward and action space

    if(env_dict['agent'] == 'single'):
        env_dict = load_single_model(env_dict)
    elif(env_dict['agent'] == 'multi'):
        env_dict = load_multi_model(env_dict)
    elif(env_dict['agent'] == 'limb'):
        env_dict['env_name'] = 'Toribash-{}-v0'.format(env_dict['limb'])
    elif(env_dict['agent'] == 'hierarchy'):
        env_dict = load_hierarchy_model(env_dict)
    else:
        raise ValueError(
            "Incorrect agent type given. Make sure agent is one of "
            "[single, multi, limb, hierarchy] and that other necessary "
            "components are loaded correctly.")

    with open(os.path.join(save_folder, 'configs_dict.pkl'), 'wb') as f:
        pickle.dump(env_dict, f)



    # setting up the model and environment
    env = make_env(env_dict, env_dict['env_name'])

    model = PPO1(MlpPolicy, env, verbose=1, tensorboard_log="./tensorboard/{}/".format(env_dict['savename']), optim_stepsize=0.01)

    try:
        model.learn(total_timesteps=env_dict['timesteps'], callback=callback)
    except KeyboardInterrupt as identifier:
        print("Incomplete Model Save")
        model.save(os.path.join(save_folder, 'incomplete'))
    finally:
        model.save(os.path.join(save_folder, 'final_model.pkl'))
Example #22
def eval_on_dir(args, env, modelFile, evalcount, evaldir, evaltype, evaltime):
    model = PPO1.load(modelFile)

    successes = []
    failures = []
    timeouts = []
    lengths = []
    for filename in os.listdir(evaldir):  ### PARALLELIZE THIS!!!
        if filename.endswith(".p"):
            evalfile = os.path.join(evaldir, filename)
            success_ratio, failure_ratio, timeout_ratio, avglen = eval_on_file(
                args, model, env, evalcount, evalfile, evaltype, evaltime)
            successes.append(success_ratio)
            failures.append(failure_ratio)
            timeouts.append(timeout_ratio)
            lengths.append(avglen)

    return successes, failures, timeouts, lengths
Example #23
def eval(args, evaldir, modelfile, model_index):

    model = PPO1.load(modelfile)
    stage_scheduler = StageScheduler(args)
    container = Container(args, stage_scheduler=stage_scheduler)
    env = ProofEnv(args, container, stage_scheduler)

    proofs_found = 0
    proofs_tried = 0
    len_sum = 0.0
    attempts_sum = 0.0
    prove.guidance_time = 0

    dirparts = evaldir.split("/")
    if dirparts[-1] == "":
        dirname = dirparts[-2]
    else:
        dirname = dirparts[-1]
    evalprefix = "eval_{}_{}_{}_{}".format(model_index, dirname, args.evaltype,
                                           args.evalcount)

    for filename in os.listdir(evaldir):
        if filename.endswith(".p"):
            name = os.path.join(evaldir, filename)
            print("\n\nTrying to find proof for {}".format(name))
            proofs_tried += 1
            success, prooflen, attempts = find_one_proof_nobacktrack(
                args, model, env, name)
            if success == 1:
                proofs_found += 1
                len_sum += prooflen
                attempts_sum += attempts

        print("Found: {}/{} proofs".format(proofs_found, proofs_tried))

    print("\n\nEVALUATION")
    print("   evaltime: {}".format(args.evaltime))
    print("   evaldir: {}".format(dirname))
    print("   model_index: {}".format(model_index))
    print("   evaltype: {}".format(args.evaltype))
    print("   evalcount: {}".format(args.evalcount))
    print("   FOUND: {}/{}".format(proofs_found, proofs_tried))
    print("   Avg proof length: {}".format(safediv(len_sum, proofs_found)))
    print("   Avg attempts: {}".format(safediv(attempts_sum, proofs_found)))
Example #24
def ppo1_nmileg_pool(sensory_value):
	RL_method = "PPO1" 
	# total_MC_runs = 50
	experiment_ID = "handtest_rot_pool_with_MC_C_task0/"
	save_name_extension = RL_method
	total_timesteps =  500000
	sensory_info = "sensory_{}".format(sensory_value) 
	current_mc_run_num =22 #starts from 0
	for mc_cntr in range(current_mc_run_num, current_mc_run_num+1):
		log_dir = "./logs/{}/MC_{}/{}/{}/".format(experiment_ID, mc_cntr, RL_method, sensory_info)
		# defining the environments
		env = gym.make('HandManipulate-v1{}'.format(sensory_value))
		#env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
		## setting the Monitor
		env = gym.wrappers.Monitor(env, log_dir+"Monitor/", video_callable=False, force=True, uid="Monitor_info")
		# defining the initial model
		if RL_method == "PPO1":
			model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
		elif RL_method == "PPO2":
			env = DummyVecEnv([lambda: env])
			model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
		elif RL_method == "DDPG":
			env = DummyVecEnv([lambda: env])
			n_actions = env.action_space.shape[-1]
			param_noise = None
			action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions), sigma=float(0.5)* 5 * np.ones(n_actions))
			model = DDPG(DDPG_MlpPolicy, env, verbose=1, param_noise=param_noise, action_noise=action_noise, tensorboard_log=log_dir)
		else:
			raise ValueError("Invalid RL mode")
		# setting the environment on the model
		#model.set_env(env)
		# setting the random seed for some of the random instances
		random_seed = mc_cntr
		random.seed(random_seed)
		env.seed(random_seed)
		env.action_space.seed(random_seed)
		np.random.seed(random_seed)
		tf.random.set_random_seed(random_seed)
		# training the model
		model.learn(total_timesteps=total_timesteps)
		# saving the trained model
		model.save(log_dir+"/model")
	return None
Example #25
def mainBal(arg):
    test = arg == TEST

    env = fep.FurutaEnvPosPpoBal(cm.RUN, render=not test)
    #env.setRender(not test)
    #model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.zip")
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_bal.pkl")

    buf_rew = []
    test_cutoff_count = 0
    complete_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_BAL:
            print(
                "\n***Average reward: %.3f\tLong runs: %d\tAverage count: %.3f\tCompleted: %d\tOverspeed: %d***\n"
                % (sum(buf_rew) / float(len(buf_rew)), test_cutoff_count,
                   total_count / float(test_count), complete_count, overspeed))
            break

        obs, done = env.reset(), False
        #obs[4] = ARM_TARGET_RAD
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            #obs[4] = ARM_TARGET_RAD
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        if count > 999:
            complete_count += 1
        buf_rew.append(episode_rew)
        if test and count >= TEST_CUTOFF_MIN:
            test_cutoff_count += 1
        print("Episode reward: %.3f\tCount: %d" % (episode_rew, count))
Example #26
    def create_ppo1(self):
        return PPO1(MlpPolicy,
                    self.env,
                    gamma=0.99,
                    timesteps_per_actorbatch=1500,
                    clip_param=0.2,
                    entcoeff=0.01,
                    optim_epochs=4,
                    optim_stepsize=0.001,
                    optim_batchsize=256,
                    lam=0.95,
                    adam_epsilon=1e-05,
                    schedule='linear',
                    verbose=0,
                    tensorboard_log=None,
                    _init_setup_model=True,
                    policy_kwargs=None,
                    full_tensorboard_log=False,
                    seed=None,
                    n_cpu_tf_sess=1)
Example #27
    def __init__(self, env=None, load_dir=None, load_type=None, **kwargs):
        self.algorithm = load_type

        if self.algorithm == 'ars':
            params = np.load(load_dir + 'params1.npy')
            policy_params = {
                'ob_dim': 23,
                'ac_dim': 12,
                'ob_filter': 'NoFilter',
                'hsize': 2,
                'numlayers': 32
            }

            if kwargs["mode"] == 'mlp':
                self.agent = MLPPolicy(policy_params)
                self.agent.load(params)
            elif kwargs["mode"] == 'linearbias':
                self.agent = LinearBiasPolicy(policy_params)
                self.agent.load(params)
            else:
                raise NotImplementedError

        elif self.algorithm == 'openloop':
            if kwargs["mode"] == '2finger':
                config_data = util.read_config_file(
                    'gym_roam_hand_2fin_grasping_baseline.cfg', '')
            elif kwargs["mode"] == '3finger':
                config_data = util.read_config_file('roam_grasping_3fin.cfg',
                                                    '')
            else:
                raise NotImplementedError
            self.agent = OpenLoopPolicy(config_data, env)

        elif self.algorithm == 'ppo1':
            self.agent = PPO1.load("ppo1_roam")

        elif self.algorithm == 'ppo2':
            self.agent = PPO2.load("{}/trained_model".format(load_dir))

        else:
            raise NotImplementedError
Example #28
def ppo1_nmileg_pool(stiffness_value):
    RL_method = "PPO1"
    experiment_ID = "experiment_4_pool_A/mc_1/"
    save_name_extension = RL_method
    total_timesteps = 500000
    stiffness_value_str = "stiffness_{}".format(stiffness_value)
    log_dir = "./logs/{}/{}/{}/".format(experiment_ID, RL_method,
                                        stiffness_value_str)
    # defining the environments
    env = gym.make('TSNMILeg{}-v1'.format(stiffness_value))
    #env = gym.wrappers.Monitor(env, "./tmp/gym-results", video_callable=False, force=True)
    # defining the initial model
    if RL_method == "PPO1":
        model = PPO1(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "PPO2":
        env = DummyVecEnv([lambda: env])
        model = PPO2(common_MlpPolicy, env, verbose=1, tensorboard_log=log_dir)
    elif RL_method == "DDPG":
        env = DummyVecEnv([lambda: env])
        n_actions = env.action_space.shape[-1]
        param_noise = None
        action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(n_actions),
                                                    sigma=float(0.5) * 5 *
                                                    np.ones(n_actions))
        model = DDPG(DDPG_MlpPolicy,
                     env,
                     verbose=1,
                     param_noise=param_noise,
                     action_noise=action_noise,
                     tensorboard_log=log_dir)
    else:
        raise ValueError("Invalid RL mode")
    # setting the environment on the model
    #model.set_env(env)
    # training the model
    model.learn(total_timesteps=total_timesteps)
    # saving the trained model
    model.save(log_dir + "/model")
    return None
Example #29
def mainUp(arg):
    test = arg == TEST

    env = fep.FurutaEnvPosPpoUp(cm.RUN, render=not test)
    #env.setRender(True)
    model = PPO1.load(POLICY_PATH + "ppo1_pos_policy_up.zip")

    buf_rew = []
    test_cutoff_count = 0
    test_count = 0
    overspeed = 0
    total_count = 0
    while True:
        test_count += 1
        if test and test_count >= TEST_COUNT_UP:
            print(
                "\n***Average reward: %.3f\tAverage count: %.3f\tShort runs: %d***"
                % (sum(buf_rew) / float(len(buf_rew)), total_count /
                   float(test_count), test_cutoff_count - overspeed))
            break

        obs, done = env.reset(), False
        episode_rew = 0
        count = 0
        while not done:
            action, _ = model.predict(obs)
            obs, rew, done, _ = env.step(action)
            if speedCheck(obs):
                overspeed += 1
            episode_rew += rew
            count += 1
            total_count += 1
        buf_rew.append(episode_rew)
        if test and count <= TEST_CUTOFF_MAX:
            test_cutoff_count += 1
        print("Episode average reward: %.3f\tCount: %d" %
              (episode_rew / count, count))
Example #30
def advlearn(env, model_name=None, dir_dict=None):

    _, _ = setup_logger(SAVE_DIR, EXP_NAME)

    if model_name == 'ppo1_oppomodel':
        ## inline hyperparameters
        ## param timesteps_per_actorbatch: timesteps per actor per update
        ## other inline hyperparameters use the default choices in file 'PPO1_model_value'
        model = PPO1_model_value(
            MlpPolicy_hua,
            env,
            timesteps_per_actorbatch=1000,
            verbose=1,
            tensorboard_log=dir_dict['tb'],
            hyper_weights=dir_dict['_hyper_weights'],
            benigned_model_file=None,
            full_tensorboard_log=False,
            black_box_att=dir_dict['_black_box'],
            attention_weights=dir_dict['_attention'],
            model_saved_loc=dir_dict['model'],
            clipped_attention=dir_dict['_clipped_attention'],
            exp_method=dir_dict['_x_method'],
            mimic_model_path=dir_dict['_mimic_model_path'],
            save_victim_traj=dir_dict['_save_victim_traj'])
    else:
        model = PPO1(MlpPolicy,
                     env,
                     timesteps_per_actorbatch=1000,
                     verbose=1,
                     tensorboard_log=dir_dict['tb'])
    try:
        model.learn(TRAINING_ITER, callback=callback, seed=SEED)
    except ValueError as e:
        traceback.print_exc()
        print("Learn exit!")
    model_file_name = "{0}agent.pkl".format(dir_dict['model'])
    model.save(model_file_name)