Ejemplo n.º 1
0
def main():
    """Run PPO on ``mjcmdp.HopperMDP`` and snapshot the policy to HDF5.

    Parses the command-line options, seeds NumPy's global RNG with
    ``--seed``, builds the MDP, a ``MujocoPolicy`` and a
    ``MujocoNeuralValueFunction``, then consumes the iterator returned by
    ``ppo.run_ppo``.  For every iteration it:

    * adds one ``std_<i>`` entry per element of ``policy.get_stdev()`` to
      the iteration's stats,
    * prints the stats as a table and appends each value to ``diagnostics``,
    * animates a rollout when ``--plot`` is non-zero,
    * creates the HDF5 group ``snapshots/NNNN`` and calls
      ``policy.pc.to_h5`` on it.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)

    # NOTE(review): --just_sim is never read in this function; it may be
    # consumed by prepare_h5_file through `args` -- confirm.
    parser.add_argument("--just_sim", action="store_true")

    # Parameters
    parser.add_argument("--n_iter", type=int, default=150)
    parser.add_argument("--gamma", type=float, default=.99)
    parser.add_argument("--lam", type=float, default=1.0)
    parser.add_argument("--timesteps_per_batch", type=int, default=50000)
    parser.add_argument("--penalty_coeff", type=float, default=0.5)
    parser.add_argument("--max_pathlength", type=int, default=1000)
    args = parser.parse_args()

    np.random.seed(args.seed)

    # mdp = mjcmdp.CartpoleMDP()
    mdp = mjcmdp.HopperMDP()
    # Both specs are unpacked as (ignored, one-element tuple).
    (_, (ctrl_dim, )) = mdp.action_spec()
    (_, (obs_dim, )) = mdp.observation_spec()

    policy = MujocoPolicy(obs_dim, ctrl_dim)

    # Saving to HDF5
    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})
    vf = MujocoNeuralValueFunction(num_features=38, num_hidden=40)

    ppo_iterations = ppo.run_ppo(mdp,
                                 policy,
                                 vf=vf,
                                 gamma=args.gamma,
                                 lam=args.lam,
                                 max_pathlength=args.max_pathlength,
                                 timesteps_per_batch=args.timesteps_per_batch,
                                 n_iter=args.n_iter,
                                 parallel=False,
                                 penalty_coeff=args.penalty_coeff)

    for iteration, stats in enumerate(ppo_iterations):
        # Record the policy's current standard deviations next to the
        # statistics reported by PPO, one key per element.
        for dim, stdev in enumerate(policy.get_stdev()):
            stats["std_%i" % dim] = stdev

        # Single-argument parenthesised form: same output as the Python 2
        # print statement, and also valid on Python 3.
        print(tabulate(stats.items()))

        for statname, statval in stats.items():
            diagnostics[statname].append(statval)

        if args.plot:
            animate_rollout(mdp,
                            policy,
                            delay=.001,
                            horizon=args.max_pathlength)

        # One snapshot group per iteration, zero-padded to four digits.
        grp = hdf.create_group("snapshots/%.4i" % iteration)
        policy.pc.to_h5(grp)
Ejemplo n.º 2
0
def main():
    """Run PPO on ``mjcmdp.HopperMDP``, logging stats and saving snapshots.

    Steps performed:

    1. Parse the command-line options and seed NumPy's global RNG.
    2. Build the MDP, a ``MujocoPolicy`` sized from the MDP's observation
       and action specs, and a ``MujocoNeuralValueFunction``.
    3. Open the output via ``prepare_h5_file``.
    4. Iterate over ``ppo.run_ppo``; per iteration, add ``std_<i>`` entries
       from ``policy.get_stdev()`` to the stats, print them, append them to
       ``diagnostics``, optionally animate a rollout (``--plot``), and call
       ``policy.pc.to_h5`` on a fresh ``snapshots/NNNN`` group.
    """
    # Command line arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)

    # NOTE(review): --just_sim is parsed but not read anywhere in this
    # function; verify whether prepare_h5_file uses it via `args`.
    parser.add_argument("--just_sim", action="store_true")

    # Parameters
    parser.add_argument("--n_iter", type=int, default=150)
    parser.add_argument("--gamma", type=float, default=0.99)
    parser.add_argument("--lam", type=float, default=1.0)
    parser.add_argument("--timesteps_per_batch", type=int, default=50000)
    parser.add_argument("--penalty_coeff", type=float, default=0.5)
    parser.add_argument("--max_pathlength", type=int, default=1000)
    args = parser.parse_args()

    np.random.seed(args.seed)

    # mdp = mjcmdp.CartpoleMDP()
    mdp = mjcmdp.HopperMDP()
    # Each spec unpacks to (ignored, one-element tuple).
    (_, (ctrl_dim,)) = mdp.action_spec()
    (_, (obs_dim,)) = mdp.observation_spec()

    policy = MujocoPolicy(obs_dim, ctrl_dim)

    # Saving to HDF5
    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})
    vf = MujocoNeuralValueFunction(num_features=38, num_hidden=40)

    ppo_kwargs = dict(
        vf=vf,
        gamma=args.gamma,
        lam=args.lam,
        max_pathlength=args.max_pathlength,
        timesteps_per_batch=args.timesteps_per_batch,
        n_iter=args.n_iter,
        parallel=False,
        penalty_coeff=args.penalty_coeff,
    )

    for iteration, stats in enumerate(ppo.run_ppo(mdp, policy, **ppo_kwargs)):
        # Add the policy's current standard deviations to this iteration's
        # stats, one key per element.
        for idx, stdev in enumerate(policy.get_stdev()):
            stats["std_%i" % idx] = stdev

        # Parenthesised single-argument print works identically under the
        # Python 2 print statement and the Python 3 print function.
        print(tabulate(stats.items()))

        for statname, statval in stats.items():
            diagnostics[statname].append(statval)

        if args.plot:
            animate_rollout(mdp, policy, delay=0.001, horizon=args.max_pathlength)

        # One snapshot group per iteration, zero-padded to four digits.
        grp = hdf.create_group("snapshots/%.4i" % iteration)
        policy.pc.to_h5(grp)
Ejemplo n.º 3
0
def main():
    """Run PPO on an Atari game (RAM policy) and snapshot the policy to HDF5.

    Parses the command-line options, seeds NumPy's global RNG with
    ``--seed``, loads ``atari_roms/<game>.bin`` into an ``AtariMDP``, and
    builds an ``AtariRAMPolicy`` and an ``AtariRamLinearValueFunction``.
    It then consumes the iterator returned by ``ppo.run_ppo``; for every
    iteration the stats are printed as a table and appended to
    ``diagnostics``, a rollout is animated when ``--plot`` is non-zero, and
    ``policy.pc.to_h5`` is called on a new ``snapshots/NNNN`` group.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)
    parser.add_argument("--game",
                        type=str,
                        choices=[
                            "pong", "breakout", "enduro", "beam_rider",
                            "space_invaders", "seaquest", "qbert"
                        ],
                        default='pong')

    # Parameters
    parser.add_argument("--n_iter", type=int, default=1000)
    parser.add_argument("--gamma", type=float, default=.98)
    parser.add_argument("--lam", type=float, default=1.00)
    parser.add_argument("--timesteps_per_batch", type=int, default=30000)
    parser.add_argument("--penalty_coeff", type=float, default=0.5)
    parser.add_argument("--max_pathlength", type=int, default=10000)
    parser.add_argument("--max_kl", type=float, default=.04)

    args = parser.parse_args()

    np.random.seed(args.seed)

    mdp = AtariMDP('atari_roms/%s.bin' % args.game)
    policy = AtariRAMPolicy(mdp.n_actions)
    vf = AtariRamLinearValueFunction()

    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})

    ppo_iterations = ppo.run_ppo(mdp,
                                 policy,
                                 vf=vf,
                                 gamma=args.gamma,
                                 lam=args.lam,
                                 max_pathlength=args.max_pathlength,
                                 timesteps_per_batch=args.timesteps_per_batch,
                                 n_iter=args.n_iter,
                                 parallel=True,
                                 # Honour --max_kl; this used to be a literal
                                 # 0.04, so the option had no effect.  The
                                 # default is the same value.
                                 max_kl=args.max_kl,
                                 penalty_coeff=args.penalty_coeff)

    for iteration, stats in enumerate(ppo_iterations):

        # Single-argument parenthesised form: same output as the Python 2
        # print statement, and also valid on Python 3.
        print(tabulate(stats.items()))

        for statname, statval in stats.items():
            diagnostics[statname].append(statval)

        if args.plot:
            # The animation horizon is fixed at 100 steps here rather than
            # following --max_pathlength.
            animate_rollout(mdp, policy, delay=.001, horizon=100)

        # One snapshot group per iteration, zero-padded to four digits.
        grp = hdf.create_group("snapshots/%.4i" % iteration)
        policy.pc.to_h5(grp)
Ejemplo n.º 4
0
def main():
    """Run PPO on an Atari game using a RAM-based policy, saving snapshots.

    Steps performed:

    1. Parse the command-line options and seed NumPy's global RNG.
    2. Load ``atari_roms/<game>.bin`` into an ``AtariMDP`` and build an
       ``AtariRAMPolicy`` plus an ``AtariRamLinearValueFunction``.
    3. Open the output via ``prepare_h5_file``.
    4. Iterate over ``ppo.run_ppo``; per iteration, print the stats, append
       them to ``diagnostics``, optionally animate a rollout (``--plot``),
       and call ``policy.pc.to_h5`` on a fresh ``snapshots/NNNN`` group.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)
    parser.add_argument(
        "--game",
        type=str,
        choices=["pong", "breakout", "enduro", "beam_rider",
                 "space_invaders", "seaquest", "qbert"],
        default='pong')

    # Parameters
    parser.add_argument("--n_iter", type=int, default=1000)
    parser.add_argument("--gamma", type=float, default=.98)
    parser.add_argument("--lam", type=float, default=1.00)
    parser.add_argument("--timesteps_per_batch", type=int, default=30000)
    parser.add_argument("--penalty_coeff", type=float, default=0.5)
    parser.add_argument("--max_pathlength", type=int, default=10000)
    parser.add_argument("--max_kl", type=float, default=.04)

    args = parser.parse_args()

    np.random.seed(args.seed)

    mdp = AtariMDP('atari_roms/%s.bin' % args.game)
    policy = AtariRAMPolicy(mdp.n_actions)
    vf = AtariRamLinearValueFunction()

    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})

    ppo_kwargs = dict(
        vf=vf,
        gamma=args.gamma,
        lam=args.lam,
        max_pathlength=args.max_pathlength,
        timesteps_per_batch=args.timesteps_per_batch,
        n_iter=args.n_iter,
        parallel=True,
        # Forward the parsed --max_kl instead of the former literal 0.04,
        # which made the option a no-op.  Default behaviour is unchanged.
        max_kl=args.max_kl,
        penalty_coeff=args.penalty_coeff,
    )

    for iteration, stats in enumerate(ppo.run_ppo(mdp, policy, **ppo_kwargs)):

        # Parenthesised single-argument print works identically under the
        # Python 2 print statement and the Python 3 print function.
        print(tabulate(stats.items()))

        for statname, statval in stats.items():
            diagnostics[statname].append(statval)

        if args.plot:
            # The animation horizon is fixed at 100 steps here rather than
            # following --max_pathlength.
            animate_rollout(mdp, policy, delay=.001, horizon=100)

        # One snapshot group per iteration, zero-padded to four digits.
        grp = hdf.create_group("snapshots/%.4i" % iteration)
        policy.pc.to_h5(grp)