def main():
    """Run ``ppo.run_ppo`` on ``mjcmdp.HopperMDP`` and record the results.

    Command-line options are parsed with argparse and NumPy's RNG is seeded
    from ``--seed``.  For every item yielded by ``ppo.run_ppo``:

    * the values from ``policy.get_stdev()`` are stored in the stats under
      ``std_<i>`` keys,
    * the stats are printed as a table and appended to ``diagnostics``,
    * a rollout is animated when ``--plot`` is non-zero,
    * ``policy.pc`` is written to a new ``snapshots/NNNN`` group of the
      HDF5 handle returned by ``prepare_h5_file``.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # Bookkeeping options.
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)
    # NOTE(review): --just_sim is not read directly in this function; it is
    # only reachable through `args` handed to prepare_h5_file -- confirm use.
    parser.add_argument("--just_sim", action="store_true")

    # Algorithm hyperparameters as (flag, type, default) rows.
    hyperparameters = (
        ("--n_iter", int, 150),
        ("--gamma", float, .99),
        ("--lam", float, 1.0),
        ("--timesteps_per_batch", int, 50000),
        ("--penalty_coeff", float, 0.5),
        ("--max_pathlength", int, 1000),
    )
    for flag, kind, default in hyperparameters:
        parser.add_argument(flag, type=kind, default=default)

    args = parser.parse_args()

    # Seed before the environment and policy are constructed.
    np.random.seed(args.seed)

    # Alternative environment: mdp = mjcmdp.CartpoleMDP()
    mdp = mjcmdp.HopperMDP()
    # Both specs are unpacked as (ignored, 1-tuple holding the dimension).
    _, (n_ctrl,) = mdp.action_spec()
    _, (n_obs,) = mdp.observation_spec()
    policy = MujocoPolicy(n_obs, n_ctrl)

    # Saving to HDF5
    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})

    # NOTE(review): num_features=38 is hard-coded; presumably it has to match
    # the feature size the value function sees for Hopper -- confirm.
    vf = MujocoNeuralValueFunction(num_features=38, num_hidden=40)

    training_run = ppo.run_ppo(
        mdp,
        policy,
        vf=vf,
        gamma=args.gamma,
        lam=args.lam,
        max_pathlength=args.max_pathlength,
        timesteps_per_batch=args.timesteps_per_batch,
        n_iter=args.n_iter,
        parallel=False,
        penalty_coeff=args.penalty_coeff,
    )

    for iteration, stats in enumerate(training_run):
        # Expose each entry of the policy's stdev vector as its own stat.
        for idx, stdev in enumerate(policy.get_stdev()):
            stats["std_%i" % idx] = stdev

        print(tabulate(stats.items()))

        for stat_name, stat_value in stats.items():
            diagnostics[stat_name].append(stat_value)

        if args.plot:
            animate_rollout(mdp, policy, delay=.001,
                            horizon=args.max_pathlength)

        # One snapshot group per iteration, zero-padded to four digits.
        snapshot = hdf.create_group("snapshots/%.4i" % (iteration))
        policy.pc.to_h5(snapshot)
def main():
    """Command-line entry point: PPO on ``mjcmdp.HopperMDP`` with HDF5 output.

    Steps, in order: parse arguments, seed ``np.random``, build the MDP, the
    ``MujocoPolicy`` and the ``MujocoNeuralValueFunction``, open the output
    via ``prepare_h5_file``, then iterate over ``ppo.run_ppo``.  Each
    iteration prints and records its stats (augmented with ``std_<i>``
    entries from ``policy.get_stdev()``), optionally animates a rollout, and
    stores ``policy.pc`` under ``snapshots/NNNN``.
    """
    # Command line arguments
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    cli.add_argument("--seed", type=int, default=0)
    cli.add_argument("--outfile")
    cli.add_argument("--metadata")
    cli.add_argument("--plot", type=int, default=0)
    # NOTE(review): --just_sim is parsed but never read directly below.
    cli.add_argument("--just_sim", action="store_true")

    # Parameters
    cli.add_argument("--n_iter", type=int, default=150)
    cli.add_argument("--gamma", type=float, default=0.99)
    cli.add_argument("--lam", type=float, default=1.0)
    cli.add_argument("--timesteps_per_batch", type=int, default=50000)
    cli.add_argument("--penalty_coeff", type=float, default=0.5)
    cli.add_argument("--max_pathlength", type=int, default=1000)
    args = cli.parse_args()

    np.random.seed(args.seed)

    # Alternative environment: mdp = mjcmdp.CartpoleMDP()
    mdp = mjcmdp.HopperMDP()
    # Each spec unpacks to (ignored, 1-tuple with the dimension).
    (_, (action_dim,)) = mdp.action_spec()
    (_, (observation_dim,)) = mdp.observation_spec()
    policy = MujocoPolicy(observation_dim, action_dim)

    # Saving to HDF5
    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})

    # NOTE(review): 38 is a magic number here; verify it against the
    # feature size MujocoNeuralValueFunction expects for this MDP.
    vf = MujocoNeuralValueFunction(num_features=38, num_hidden=40)

    # Keyword arguments for run_ppo, gathered in one place.
    ppo_options = dict(
        vf=vf,
        gamma=args.gamma,
        lam=args.lam,
        max_pathlength=args.max_pathlength,
        timesteps_per_batch=args.timesteps_per_batch,
        n_iter=args.n_iter,
        parallel=False,
        penalty_coeff=args.penalty_coeff,
    )

    iteration = 0
    for stats in ppo.run_ppo(mdp, policy, **ppo_options):
        stdevs = policy.get_stdev()
        for component, value in enumerate(stdevs):
            stats["std_%i" % component] = value

        print(tabulate(stats.items()))

        for key, value in stats.items():
            diagnostics[key].append(value)

        if args.plot:
            animate_rollout(
                mdp, policy, delay=0.001, horizon=args.max_pathlength
            )

        # Snapshot policy.pc under a zero-padded, per-iteration group name.
        group = hdf.create_group("snapshots/%.4i" % (iteration))
        policy.pc.to_h5(group)
        iteration += 1
def main():
    """Run ``ppo.run_ppo`` on an Atari game and record the results.

    The game is selected with ``--game``; its ROM is loaded from
    ``atari_roms/<game>.bin``.  NumPy's RNG is seeded from ``--seed`` before
    the MDP, ``AtariRAMPolicy`` and ``AtariRamLinearValueFunction`` are
    built.  For every item yielded by ``ppo.run_ppo`` the stats are printed
    as a table and appended to ``diagnostics``, a short rollout is animated
    when ``--plot`` is non-zero, and ``policy.pc`` is written to a new
    ``snapshots/NNNN`` group of the HDF5 handle from ``prepare_h5_file``.

    ``--max_kl`` is forwarded to ``ppo.run_ppo``.  Earlier the call passed a
    literal ``0.04``, so the flag had no effect; the default is still 0.04.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)
    parser.add_argument(
        "--game",
        type=str,
        choices=[
            "pong", "breakout", "enduro", "beam_rider", "space_invaders",
            "seaquest", "qbert",
        ],
        default='pong')

    # Parameters
    parser.add_argument("--n_iter", type=int, default=1000)
    parser.add_argument("--gamma", type=float, default=.98)
    parser.add_argument("--lam", type=float, default=1.00)
    parser.add_argument("--timesteps_per_batch", type=int, default=30000)
    parser.add_argument("--penalty_coeff", type=float, default=0.5)
    parser.add_argument("--max_pathlength", type=int, default=10000)
    parser.add_argument("--max_kl", type=float, default=.04)
    args = parser.parse_args()

    # Seed before the environment and policy are constructed.
    np.random.seed(args.seed)

    mdp = AtariMDP('atari_roms/%s.bin' % args.game)
    policy = AtariRAMPolicy(mdp.n_actions)
    vf = AtariRamLinearValueFunction()

    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})

    training_run = ppo.run_ppo(
        mdp,
        policy,
        vf=vf,
        gamma=args.gamma,
        lam=args.lam,
        max_pathlength=args.max_pathlength,
        timesteps_per_batch=args.timesteps_per_batch,
        n_iter=args.n_iter,
        parallel=True,
        # Honour the command-line value instead of a hard-coded 0.04.
        max_kl=args.max_kl,
        penalty_coeff=args.penalty_coeff,
    )

    for (iteration, stats) in enumerate(training_run):
        print(tabulate(stats.items()))

        for (statname, statval) in stats.items():
            diagnostics[statname].append(statval)

        if args.plot:
            # The animation horizon is fixed at 100 steps, independent of
            # --max_pathlength.
            animate_rollout(mdp, policy, delay=.001, horizon=100)

        # One snapshot group per iteration, zero-padded to four digits.
        grp = hdf.create_group("snapshots/%.4i" % (iteration))
        policy.pc.to_h5(grp)
def main():
    """Command-line entry point: PPO on an Atari game with HDF5 output.

    Steps, in order: parse arguments, seed ``np.random``, load
    ``atari_roms/<game>.bin`` into an ``AtariMDP``, build the
    ``AtariRAMPolicy`` and ``AtariRamLinearValueFunction``, open the output
    via ``prepare_h5_file``, then iterate over ``ppo.run_ppo``.  Each
    iteration prints and records its stats, optionally animates a rollout,
    and stores ``policy.pc`` under ``snapshots/NNNN``.

    The ``--max_kl`` option is passed through to ``ppo.run_ppo``; the call
    used to pin it to ``0.04`` regardless of the command line.  Because the
    option's default is 0.04, runs that do not set it are unaffected.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--outfile")
    parser.add_argument("--metadata")
    parser.add_argument("--plot", type=int, default=0)
    parser.add_argument(
        "--game",
        type=str,
        choices=["pong", "breakout", "enduro", "beam_rider",
                 "space_invaders", "seaquest", "qbert"],
        default='pong',
    )

    # Parameters
    parser.add_argument("--n_iter", type=int, default=1000)
    parser.add_argument("--gamma", type=float, default=.98)
    parser.add_argument("--lam", type=float, default=1.00)
    parser.add_argument("--timesteps_per_batch", type=int, default=30000)
    parser.add_argument("--penalty_coeff", type=float, default=0.5)
    parser.add_argument("--max_pathlength", type=int, default=10000)
    parser.add_argument("--max_kl", type=float, default=.04)
    args = parser.parse_args()

    np.random.seed(args.seed)

    mdp = AtariMDP('atari_roms/%s.bin' % args.game)
    policy = AtariRAMPolicy(mdp.n_actions)
    vf = AtariRamLinearValueFunction()

    hdf, diagnostics = prepare_h5_file(args, {"policy": policy, "mdp": mdp})

    for (iteration, stats) in enumerate(ppo.run_ppo(
            mdp, policy,
            vf=vf,
            gamma=args.gamma,
            lam=args.lam,
            max_pathlength=args.max_pathlength,
            timesteps_per_batch=args.timesteps_per_batch,
            n_iter=args.n_iter,
            parallel=True,
            max_kl=args.max_kl,  # was a literal 0.04, which ignored --max_kl
            penalty_coeff=args.penalty_coeff)):
        print(tabulate(stats.items()))

        for (statname, statval) in stats.items():
            diagnostics[statname].append(statval)

        if args.plot:
            # Animation uses a fixed 100-step horizon rather than
            # --max_pathlength.
            animate_rollout(mdp, policy, delay=.001, horizon=100)

        # Write policy.pc to a zero-padded, per-iteration snapshot group.
        grp = hdf.create_group("snapshots/%.4i" % (iteration))
        policy.pc.to_h5(grp)