Example #1
def train(env_id, num_timesteps, seed):
    from baselines.ppo_pnp import mlp_policy, pposgd_simple, interactive_ppo, ppo_gail
    U.make_session(num_cpu=1).__enter__()

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    hid_size=64,
                                    num_hid_layers=3)

    env = JacoEnv(64, 64, 1, 1.0)  #make_mujoco_env(env_id, seed)
    dataset = Mujoco_Dset(expert_path='data/pnp_demo.npz', traj_limitation=-1)
    reward_giver = TransitionClassifier(env, 100, entcoeff=1e-3)
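    # PPO + GAIL: the transition classifier (discriminator) supplies the imitation reward for the PPO updates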
    ppo_gail.learn(
        env,
        policy_fn,
        reward_giver,
        dataset,
        max_timesteps=num_timesteps,
        timesteps_per_actorbatch=2048,
        clip_param=0.2,
        entcoeff=0.0,
        optim_epochs=10,
        optim_stepsize=3e-4,
        optim_batchsize=64,
        gamma=0.99,
        lam=0.95,
        schedule='linear',
    )
    env.close()
Example #2
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
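    # behavior cloning: fit the policy to the expert data, then roll out the saved policy to report average length and return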
    savedir_fname = learn(env,
                          policy_fn,
                          dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)
    avg_len, avg_ret = runner(env,
                              policy_fn,
                              savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)
Example #3
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2,
                                    gaussian_fixed_var=False,
                                    obs_normalize=True)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)

    logger.configure(
        os.path.join("log", "BC", args.env_id,
                     "subsample_{}".format(args.subsample_freq),
                     "traj_{}".format(args.traj_limitation)))
    args.expert_path = 'dataset/{}.npz'.format(args.env_id).lower().replace(
        "-v1", "")  # set expert path
    dataset = Mujoco_Dset(expert_path=args.expert_path,
                          traj_limitation=args.traj_limitation,
                          data_subsample_freq=args.subsample_freq)
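    # behavior cloning only, on the (optionally subsampled) expert trajectories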
    learn(env, policy_fn, dataset, max_iters=args.BC_max_iter, verbose=True)
Example #4
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
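    # 'train' runs adversarial imitation with the TransitionClassifier as the learned reward; 'evaluate' rolls out a saved policy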

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(
            env,
            args.seed,
            policy_fn,
            reward_giver,
            dataset,
            args.algo,
            args.g_step,
            args.d_step,
            args.policy_entcoeff,
            args.num_timesteps,
            args.save_per_iter,
            args.checkpoint_dir,
            args.log_dir,
            args.pretrained,  #false
            args.BC_max_iter,
            task_name)
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #5
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(args.env_id,
            ignore_done=True,
            use_camera_obs=False,
            has_renderer=True,
            control_freq=100,
            gripper_visualization=True,
            reward_shaping=True,
            #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
            #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
            #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
            #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
            #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
            #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
            #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
            #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
            #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
            #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
            box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
            box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
            ) # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env) # wrap in the gym environment

    # Environment joints should be clipped at 1 and -1 for sawyer

    
    # Task
    #task = 'train'
    task = 'evaluate'
    # parser.add_argument('--task', type=str, choices=['train', 'evaluate', 'sample'], default='train')

    # Expert Path
    #expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/ac100/combined/combined_0.npz' # path for 100 trajectories
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/120_shift7/combined/combined_0.npz' # path for 100 trajectories

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')
    
    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"), allow_early_resets=True)
    
    #env.seed(args.seed) # Sawyer does not have seed 

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    #if not os.path.isdir(args.log_dir):
    #    os.makedirs(args.log_dir)

    logger.log("log_directories: ",args.log_dir)
    
    logger.log("environment action space range: ", env.action_space) #logging the action space

    if task == 'train':
        dataset = Mujoco_Dset(expert_path=expert_path, traj_limitation=args.traj_limitation)

        # Check dimensions of the dataset
        #print("dimension of inputs", dataset.dset.inputs.shape) # dims seem correct
        #print("dimension of inputs", dataset.dset.labels.shape) # dims seem correct

        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name
              )
    elif task == 'evaluate':
        # Create the playback environment
        play_env = robosuite.make(args.env_id,
                ignore_done=True,
                use_camera_obs=False,
                has_renderer=True,
                control_freq=100,
                gripper_visualization=True,
                #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
                #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
                #box_pos = [0.23522776, 0.2287869, 0.82162434], #shift3
                #box_quat=[0.3775825618903728, 0, 0, 0.679425538604203], #shift3
                #box_pos = [0.53522776, 0.3287869, 0.82162434], #shift4
                #box_quat=[0.5775825618903728, 0, 0, 0.679425538604203], #shift4
                #box_pos = [0.53522776, 0.1287869, 0.82162434], #shift5 
                #box_quat=[0.4775825618903728, 0, 0, 0.679425538604203], #shift5
                #box_pos = [0.48522776, -0.187869, 0.82162434], #shift6
                #box_quat=[0.8775825618903728, 0, 0, 0.679425538604203], #shift6
                box_pos = [0.43522776, -0.367869, 0.82162434], #shift7
                box_quat=[0.2775825618903728, 0, 0, 0.679425538604203], #shift7
                )

        #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

        runner(env,
                play_env,
                policy_fn,
                args.load_model_path,
                timesteps_per_batch=4000, # Change time step per batch to be more reasonable
                number_trajs=20, # number of evaluation rollouts
                stochastic_policy=args.stochastic_policy,
                save=args.save_sample
                )
    else:
        raise NotImplementedError
    env.close()
Example #6
def learn(
        *,
        network,
        env,
        eval_env,
        timesteps_per_batch=1000,  # what to train on
        max_kl=0.001,
        cg_iters=10,
        gamma=0.99,
        lam=1.0,  # advantage estimation
        seed=None,
        ent_coef=0.0,
        cg_damping=1e-2,
        vf_stepsize=3e-4,
        vf_iters=3,
        num_epochs=1000,
        callback=None,
        load_path=None,
        log_dir=None,
        env_id=None,
        evaluation_freq=10,
        pretrain=False,
        expert_path=None,
        BC_max_iter=1e4,
        **network_kwargs):
    '''
    Learn a policy with the TRPO algorithm.

    Parameters:
    ----------

    network                 neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types)
                            or function that takes input placeholder and returns tuple (output, None) for feedforward nets
                            or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets

    env                     environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class)

    timesteps_per_batch     timesteps per gradient estimation batch

    max_kl                  max KL divergence between old policy and new policy ( KL(pi_old || pi) )

    ent_coef                coefficient of policy entropy term in the optimization objective

    cg_iters                number of iterations of conjugate gradient algorithm

    cg_damping              conjugate gradient damping

    vf_stepsize             learning rate for the Adam optimizer used to optimize the value function loss

    vf_iters                number of value function optimization iterations per policy optimization step

    num_epochs              number of policy optimization epochs

    evaluation_freq         evaluate the policy on eval_env every this many epochs

    callback                function to be called with (locals(), globals()) each policy optimization step

    load_path               str, path to load the model from (default: None, i.e. no model is loaded)

    **network_kwargs        keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network

    Returns:
    -------

    learnt model

    '''

    # Configure log.
    log_dir = os.path.join("log", "trpo", env_id,
                           "pretrained_" + str(pretrain), str(seed))
    logger.configure(dir=log_dir)

    if MPI is not None:
        nworkers = MPI.COMM_WORLD.Get_size()
        rank = MPI.COMM_WORLD.Get_rank()
    else:
        nworkers = 1
        rank = 0

    cpus_per_worker = 1
    U.get_session(config=tf.ConfigProto(
        allow_soft_placement=True,
        inter_op_parallelism_threads=cpus_per_worker,
        intra_op_parallelism_threads=cpus_per_worker,
    ))

    policy = build_policy(env, network, value_network='copy', **network_kwargs)
    set_global_seeds(seed)

    # Pretrain.
    mujo_dataset = Mujoco_Dset(expert_path=expert_path)
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space

    ob = observation_placeholder(ob_space)

    pretrained_weight = None
    if pretrain and (BC_max_iter > 0):
        # Pretrain with behavior cloning
        from baselines.trpo_mpi import behavior_clone
        pretrained_weight, pi = behavior_clone.learn(ob,
                                                     policy,
                                                     mujo_dataset,
                                                     max_iters=BC_max_iter)
        evaluate_policy(pi, eval_env, -2, timesteps_per_batch, 0)
    else:
        with tf.variable_scope("pi"):
            pi = policy(observ_placeholder=ob)

    with tf.variable_scope("oldpi"):
        oldpi = policy(observ_placeholder=ob)

    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = ent_coef * meanent

    vferr = tf.reduce_mean(tf.square(pi.vf - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = get_trainable_variables("pi")
    # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")]
    # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")]
    var_list = get_pi_trainable_variables("pi")
    vf_var_list = get_vf_trainable_variables("pi")

    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  #pylint: disable=E1111
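    # fvp: KL-Hessian (Fisher) vector product, used by the conjugate gradient solver below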
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(get_variables("oldpi"), get_variables("pi"))
        ])

    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        if MPI is not None:
            out = np.empty_like(x)
            MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
            out /= nworkers
        else:
            out = np.copy(x)

        return out

    U.initialize()
    if load_path is not None:
        pi.load(load_path)

    th_init = get_flat()
    if MPI is not None:
        MPI.COMM_WORLD.Bcast(th_init, root=0)

    set_from_flat(th_init)
    vfadam.sync()
    print("Init param sum", th_init.sum(), flush=True)

    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_variables(pretrained_weight,
                         variables=tf.get_collection(
                             tf.GraphKeys.TRAINABLE_VARIABLES, "pi"))

    evaluate_policy(pi, eval_env, -1, timesteps_per_batch, 0)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards

    for epoch in range(num_epochs):
        if callback: callback(locals(), globals())
        logger.log("********** Epoch  %i ************" % epoch)

        with timed("sampling"):
            seg = seg_gen.__next__()
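        # GAE(lambda): fills seg["adv"] and seg["tdlamret"] for the sampled segment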
        add_vtarg_and_adv(seg, gamma, lam)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
            "tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()
                 ) / atarg.std()  # standardized advantage function estimate

        # if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret)
        if hasattr(pi, "ob_rms"):
            pi.ob_rms.update(ob)  # update running mean/std for policy

        args = seg["ob"], seg["ac"], atarg
        fvpargs = [arr[::5] for arr in args]

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        assign_old_eq_new()  # set old parameter values to new parameter values
        with timed("computegrad"):
            *lossbefore, g = compute_lossandgrad(*args)
        lossbefore = allmean(np.array(lossbefore))
        g = allmean(g)
        if np.allclose(g, 0):
            logger.log("Got zero gradient. not updating")
        else:
            with timed("cg"):
                stepdir = cg(fisher_vector_product,
                             g,
                             cg_iters=cg_iters,
                             verbose=rank == 0)
            assert np.isfinite(stepdir).all()
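            # rescale the CG direction so the quadratic KL estimate of the full step equals max_kl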
            shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
            lm = np.sqrt(shs / max_kl)
            # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
            fullstep = stepdir / lm
            expectedimprove = g.dot(fullstep)
            surrbefore = lossbefore[0]
            stepsize = 1.0
            thbefore = get_flat()
            for _ in range(10):
                thnew = thbefore + fullstep * stepsize
                set_from_flat(thnew)
                meanlosses = surr, kl, *_ = allmean(
                    np.array(compute_losses(*args)))
                improve = surr - surrbefore
                logger.log("Expected: %.3f Actual: %.3f" %
                           (expectedimprove, improve))
                if not np.isfinite(meanlosses).all():
                    logger.log("Got non-finite value of losses -- bad!")
                elif kl > max_kl * 1.5:
                    logger.log("violated KL constraint. shrinking step.")
                elif improve < 0:
                    logger.log("surrogate didn't improve. shrinking step.")
                else:
                    logger.log("Stepsize OK!")
                    break
                stepsize *= .5
            else:
                logger.log("couldn't compute a good step")
                set_from_flat(thbefore)
            if nworkers > 1 and iters_so_far % 20 == 0:
                paramsums = MPI.COMM_WORLD.allgather(
                    (thnew.sum(), vfadam.getflat().sum()))  # list of tuples
                assert all(
                    np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

        with timed("vf"):
            for _ in range(vf_iters):
                for (mbob, mbret) in dataset.iterbatches(
                    (seg["ob"], seg["tdlamret"]),
                        include_final_partial_batch=False,
                        batch_size=64):
                    g = allmean(compute_vflossandgrad(mbob, mbret))
                    vfadam.update(g, vf_stepsize)

        if epoch % evaluation_freq == 0:
            evaluate_policy(pi, eval_env, epoch, timesteps_per_batch, tstart)

    return pi
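
# A minimal usage sketch for learn(), assuming a MuJoCo Gym id, baselines' make_vec_env helper,
# and an expert .npz path (these names are placeholders, not from the snippet above):
#
#   from baselines.common.cmd_util import make_vec_env
#   env = make_vec_env('Hopper-v2', 'mujoco', 1, seed=0)
#   eval_env = make_vec_env('Hopper-v2', 'mujoco', 1, seed=0)
#   pi = learn(network='mlp', env=env, eval_env=eval_env, env_id='Hopper-v2', seed=0,
#              pretrain=False, expert_path='data/hopper_expert.npz')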
Example #7
def load_dataset(expert_path):
    dataset = Mujoco_Dset(expert_path=expert_path)
    return dataset
Example #8
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    import MujocoManip as MM
    if args.task == 'train':
        env_name, user_name = osp.basename(
            args.expert_path).split('.')[0].split('_')
    else:
        env_name, user_name = osp.basename(args.load_model_path).split('.')[:2]
    wrapper = '%sWrapper' % env_name
    render = True if args.task == 'evaluate' else False

    if env_name == 'SawyerLiftEnv':
        env = MM.make(wrapper,
                      ignore_done=False,
                      use_eef_ctrl=False,
                      gripper_visualization=True,
                      use_camera_obs=False,
                      has_renderer=render,
                      reward_shaping=True,
                      has_offscreen_renderer=render)
    elif env_name == 'SawyerBinsEnv':
        env = MM.make(
            wrapper,
            ignore_done=False,
            use_eef_ctrl=False,
            gripper_visualization=True,
            use_camera_obs=False,
            has_renderer=render,
            reward_shaping=True,
            single_object_mode=False if 'hard' in user_name.lower() else True,
            has_offscreen_renderer=render)
    elif env_name == 'SawyerPegsEnv':
        env = MM.make(
            wrapper,
            ignore_done=False,
            use_eef_ctrl=False,
            gripper_visualization=True,
            use_camera_obs=False,
            has_renderer=render,
            reward_shaping=True,
            single_object_mode=False if 'hard' in user_name.lower() else True,
            has_offscreen_renderer=render)
    else:
        raise NotImplementedError

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(env_name, user_name) + '_%s_%s' % (
        args.algo, 1 if not args.mix_reward else args.rew_lambda)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    os.makedirs(args.log_dir, exist_ok=True)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff,
              args.num_timesteps, args.save_per_iter, args.checkpoint_dir,
              args.log_dir, args.pretrained, args.BC_max_iter, args.rew_lambda,
              args.mix_reward, task_name, args.frame_stack)
    elif args.task == 'evaluate':
        visualizer(env,
                   policy_fn,
                   args.load_model_path,
                   timesteps_per_batch=env.env.horizon,
                   number_trajs=10,
                   stochastic_policy=args.stochastic_policy,
                   save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #9
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        reward_shaping=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )  # Switch from gym to robosuite, also add reward shaping to see reach goal

    env = GymWrapper(env)  # wrap in the gym environment

    #task = 'train'
    task = 'evaluate'

    # Expert Path
    expert_path = '/home/mastercljohnson/Robotics/GAIL_Part/mod_surreal/robosuite/models/assets/demonstrations/150_grasp_shift2/combined/combined_0.npz'  # path for 100 trajectories

    #parser.add_argument('--expert_path', type=str, default='data/deterministic.trpo.Hopper.0.00.npz')

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy_sawyer.MlpPolicy(name=name,
                                           ob_space=ob_space,
                                           ac_space=ac_space,
                                           reuse=reuse,
                                           hid_size=args.policy_hidden_size,
                                           num_hid_layers=2)

    env = bench.Monitor(env,
                        logger.get_dir()
                        and osp.join(logger.get_dir(), "monitor.json"),
                        allow_early_resets=True)

    # Note: taking away the bench monitor wrapping allows rendering

    #env.seed(args.seed) # Sawyer does not have seed

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    logger.log("log_directories: ", args.log_dir)
    logger.log("environment action space range: ",
               env.action_space)  #logging the action space

    #------- Run policy for reaching ---------#
    play_env = robosuite.make(
        args.env_id,
        ignore_done=True,
        use_camera_obs=False,
        has_renderer=True,
        control_freq=100,
        gripper_visualization=True,
        #box_pos = [0.63522776, -0.3287869, 0.82162434], # shift2
        #box_quat=[0.6775825618903728, 0, 0, 0.679425538604203], # shift2
    )

    play_env = GymWrapper(play_env)

    #Weights are loaded from reach model grasp_strange

    #play_env.viewer.set_camera(camera_id=2) # Switch views for eval

    # Setup network
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi_reach = policy_fn("pi", ob_space, ac_space, reuse=False)

    # Hack for loading policies using tensorflow
    init_op = tf.compat.v1.global_variables_initializer()
    saver = tf.compat.v1.train.Saver(max_to_keep=5)
    with tf.compat.v1.Session() as sess:
        sess.run(init_op)
        # Load Checkpoint
        ckpt_path = './reach_and_grasp_weights/reach_one/trpo_gail.transition_limitation_2100.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
        ckpt = tf.compat.v1.train.get_checkpoint_state(ckpt_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

        # Create the playback environment

        _, _, last_ob, last_jpos = runner_1_traj(
            play_env,
            pi_reach,
            None,
            timesteps_per_batch=3500,
            number_trajs=1,
            stochastic_policy=args.stochastic_policy,
            save=False)
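    # the final observation / joint positions from this reach rollout seed the grasp phase below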

    if task == 'train':
        play_env.close()

        dataset = Mujoco_Dset(expert_path=expert_path,
                              traj_limitation=args.traj_limitation)

        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train_grasp(env, last_ob, last_jpos, args.seed, policy_fn,
                    reward_giver, dataset, args.algo, args.g_step, args.d_step,
                    args.policy_entcoeff, args.num_timesteps,
                    args.save_per_iter, args.checkpoint_dir, args.log_dir,
                    args.pretrained, args.BC_max_iter, task_name)

    elif task == 'evaluate':
        pi_grasp = policy_fn("pi_grasp", ob_space, ac_space, reuse=False)
        saver_2 = tf.compat.v1.train.Saver(max_to_keep=5)
        with tf.compat.v1.Session() as sess:
            sess.run(init_op)
            ckpt_path_2 = './reach_and_grasp_weights/grasp_shift1_after_reach/grasptrpo_gail.transition_limitation_2000.SawyerLift.g_step_1.d_step_1.policy_entcoeff_0.adversary_entcoeff_0.001.seed_0/'
            ckpt_2 = tf.compat.v1.train.get_checkpoint_state(ckpt_path_2)
            saver_2.restore(sess, ckpt_2.model_checkpoint_path)

            tt = 0
            ob = last_ob

            while True:
                ac, vpred = pi_grasp.act(False, ob)
                ob, rew, new, _ = play_env.step(ac)

                play_env.render()  # check the running in for the first part
                #logger.log("rendering for reach policy")

                if new or tt >= args.traj_limitation:
                    break
                tt += 1

        play_env.close()

    env.close()
Example #10
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)

    import MujocoManip as MM
    if args.task == 'train':
        env_name, user_name = osp.basename(
            args.expert_path).split('.')[0].split('_')
    else:
        uenv, user_name = osp.basename(args.load_model_path).split('.')[:2]
        env_name = uenv.split('_')[1]

    wrapper = '%sWrapper' % env_name
    render = True if args.task == 'evaluate' else False
    print('%s initialized.' % wrapper)

    bin_dict = dict(milk=0, bread=1, cereal=2, can=3)
    peg_dict = dict(square=0, round=1)

    if env_name == 'SawyerLiftEnv':
        env = MM.make(wrapper,
                      ignore_done=False,
                      use_eef_ctrl=False,
                      gripper_visualization=True,
                      use_camera_obs=False,
                      has_renderer=render,
                      reward_shaping=True,
                      has_offscreen_renderer=False)
    elif env_name == 'SawyerBinsEnv':
        env = MM.make(
            wrapper,
            ignore_done=False,
            use_eef_ctrl=False,
            gripper_visualization=True,
            use_camera_obs=False,
            has_renderer=render,
            reward_shaping=True,
            single_object_mode=False if 'hard' in user_name.lower() else True,
            has_offscreen_renderer=False,
            selected_bin=None
            if 'hard' in user_name.lower() else bin_dict[user_name.lower()])
    elif env_name == 'SawyerPegsEnv':
        env = MM.make(wrapper,
                      ignore_done=False,
                      use_eef_ctrl=False,
                      gripper_visualization=True,
                      use_camera_obs=False,
                      has_renderer=render,
                      reward_shaping=True,
                      single_object_mode=False if 'hard' in user_name.lower() else True,
                      has_offscreen_renderer=False,
                      selected_bin=None if 'hard' in user_name.lower() else
                      peg_dict[user_name.lower()])
    else:
        raise NotImplementedError

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=3)

    env = bench.Monitor(
        env,
        logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(env_name, user_name)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        savedir_fname = learn(env,
                              policy_fn,
                              dataset,
                              max_iters=args.BC_max_iter,
                              ckpt_dir=args.checkpoint_dir,
                              log_dir=args.log_dir,
                              task_name=task_name,
                              verbose=True)

    elif args.task == 'evaluate':
        visualizer(env,
                   policy_fn,
                   args.load_model_path,
                   env.env.horizon,
                   10,
                   args.stochastic_policy,
                   save=args.save_sample)
Example #11
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    logger.configure()

    env = bench.Monitor(env, logger.get_dir() and
                        osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from baselines.gail import mlp_policy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
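        # --states_only appears to select a state-only discriminator (WeakTransitionClassifier) instead of the usual state-action one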
        if args.states_only:
            reward_giver = WeakTransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        else:
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env,
              args.seed,
              policy_fn,
              reward_giver,
              dataset,
              args.algo,
              args.g_step,
              args.d_step,
              args.policy_entcoeff,
              args.num_timesteps,
              args.save_per_iter,
              args.checkpoint_dir,
              args.log_dir,
              args.pretrained,
              args.BC_max_iter,
              task_name,
              args.states_only
              )
    elif args.task == 'evaluate':
        from baselines.gail import mlp_policy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    elif args.task == 'expert_train':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)
        original_trpo.learn(env, policy_fn, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
            max_timesteps=args.num_timesteps, gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), args.save_model_path)
    elif args.task == 'expert_gen':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy
        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=args.policy_hidden_size, num_hid_layers=2)
        runner(env,
               policy_fn,
               args.save_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample
               )
    else:
        raise NotImplementedError
    env.close()
Example #12
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    # env = DelayRewardWrapper(env, args.delay_freq, args.max_path_length)
    eval_env = gym.make(args.env_id)

    logger.configure(
        os.path.join("log", "GAIL", args.env_id,
                     "subsample_{}".format(args.subsample_freq),
                     "traj_{}".format(args.traj_limitation),
                     "seed_{}".format(args.seed)))

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2,
                                    gaussian_fixed_var=args.gaussian_fixed_var,
                                    obs_normalize=args.obs_normalize)

    env.seed(args.seed)
    eval_env.seed(args.seed)

    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, "GAIL", task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation,
                              data_subsample_freq=args.subsample_freq)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff,
                                            obs_normalize=args.obs_normalize)
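        # GAIL training; the policy is presumably evaluated on the separate eval_env every args.evaluation_freq epochs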
        train(
            env,
            eval_env,
            args.seed,
            policy_fn,
            reward_giver,
            dataset,
            args.algo,
            args.g_step,
            args.d_step,
            args.policy_entcoeff,
            args.save_per_iter,
            args.checkpoint_dir,
            args.log_dir,
            args.pretrained,
            args.BC_max_iter,
            args.num_epochs,
            args.evaluation_freq,
            args.timesteps_per_batch,
            task_name,
        )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=args.timesteps_per_batch,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
Example #13
def setup_and_learn(env,
                    nb_epochs,
                    nb_epoch_cycles,
                    render_eval,
                    reward_scale,
                    render,
                    actor,
                    critic,
                    classifier,
                    normalize_returns,
                    normalize_observations,
                    critic_l2_reg,
                    classifier_l2_reg,
                    actor_lr,
                    critic_lr,
                    classifier_lr,
                    action_noise,
                    popart,
                    gamma,
                    clip_norm,
                    nb_train_steps,
                    nb_rollout_steps,
                    nb_eval_steps,
                    batch_size,
                    memory,
                    fifomemory,
                    tau=0.01,
                    eval_env=None,
                    callback=None,
                    entropy_coeff=1.,
                    reward_giver=None,
                    expert_dataset=None,
                    g_step=4,
                    d_step=1,
                    d_stepsize=3e-4,
                    max_timesteps=0,
                    max_iters=0,
                    timesteps_per_batch=1024,
                    adversary_hidden_size=100,
                    adversary_entcoeff=1e-3,
                    task='train',
                    expert_path=None):  # TODO: max_episodes
    """
    set up learning agent and execute training
    """
    logger.info('Initialize policy')
    logger.info('noisynet implementation of DDPG')

    assert task == 'train'

    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info(
        'scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG_paramnoise(actor,
                            critic,
                            classifier,
                            memory,
                            fifomemory,
                            env.observation_space.shape,
                            env.action_space.shape,
                            gamma=gamma,
                            tau=tau,
                            normalize_returns=normalize_returns,
                            normalize_observations=normalize_observations,
                            batch_size=batch_size,
                            action_noise=action_noise,
                            critic_l2_reg=critic_l2_reg,
                            classifier_l2_reg=classifier_l2_reg,
                            actor_lr=actor_lr,
                            critic_lr=critic_lr,
                            classifier_lr=classifier_lr,
                            enable_popart=popart,
                            clip_norm=clip_norm,
                            reward_scale=reward_scale,
                            entropy_coeff=entropy_coeff)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    logger.info('Initialize Discriminator')
    reward_giver = TransitionClassifier(env,
                                        adversary_hidden_size,
                                        entcoeff=adversary_entcoeff)
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
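    # MPI-synchronized Adam for the discriminator parameters, handed to learn() below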

    logger.info('Load Expert Data')
    dataset = Mujoco_Dset(expert_path=expert_path,
                          traj_limitation=-1)  # TODO: customize

    logger.info('Start training')
    with U.single_threaded_session() as sess:
        # init agent
        agent.initialize(sess)
        # tf saver
        saver = tf.train.Saver()
        # finalize graph
        sess.graph.finalize()

        learn(
            env,
            agent,
            reward_giver,
            dataset,
            g_step,
            d_step,
            d_stepsize=d_stepsize,
            timesteps_per_batch=timesteps_per_batch,
            nb_train_steps=nb_train_steps,
            max_timesteps=max_timesteps,
            max_iters=max_iters,  # TODO: max_episodes
            callback=callback,
            d_adam=d_adam,
            sess=sess,
            saver=saver)
Example #14
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    # delay training env
    # returns next_obs, delayed reward (0 or the accumulated reward), done, info
    env = DelayRewardWrapper(env, args.reward_freq, 1000)
    # evaluation env (the real, undelayed env)
    eval_env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name,
                                    ob_space=ob_space,
                                    ac_space=ac_space,
                                    reuse=reuse,
                                    hid_size=args.policy_hidden_size,
                                    num_hid_layers=2)

    # set random seeds
    env.seed(args.seed)
    eval_env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir,
                            "reward_coeff_" + str(args.reward_coeff),
                            args.env_id, "seed_" + str(args.seed))

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path,
                              traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env,
                                            args.adversary_hidden_size,
                                            entcoeff=args.adversary_entcoeff)
        train(
            env,
            eval_env,
            args.seed,
            policy_fn,
            reward_giver,
            dataset,
            args.algo,
            args.g_step,
            args.d_step,
            args.policy_entcoeff,
            args.reward_coeff,
            args.num_timesteps,
            args.save_per_iter,
            args.checkpoint_dir,
            args.log_dir,
            args.pretrained,
            args.BC_max_iter,
            args.num_epochs,
            args.eval_interval,
            args.timesteps_per_batch,
            task_name,
        )
    elif args.task == 'evaluate':
        runner(env,
               policy_fn,
               args.load_model_path,
               timesteps_per_batch=args.timesteps_per_batch,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()