Example #1
def main():
  env = bullet_cartpole.BulletCartpole(gui=opts.gui, action_force=opts.action_force,
                                       max_episode_len=opts.max_episode_len,
                                       initial_force=opts.initial_force, delay=opts.delay,
                                       discrete_actions=False, event_log_file=opts.event_log)

  with tf.Session() as sess:  #config=tf.ConfigProto(log_device_placement=True)) as sess:
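    # build the DDPG agent against this environment; its target networks are
    # wired up below once variables have been initialised or restored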
    agent = DeepDeterministicPolicyGradientAgent(env=env, agent_opts=opts)

    # setup saver util and either load latest ckpt, or init if none...
    saver_util = None
    ckpt_dir = None
    if opts.run_id is not None:
      ckpt_dir = "ckpts/%s" % opts.run_id
    elif opts.ckpt_dir is not None:
      ckpt_dir = opts.ckpt_dir
    if ckpt_dir is not None:
      saver_util = util.SaverUtil(sess, ckpt_dir, opts.ckpt_freq)
    else:
      sess.run(tf.initialize_all_variables())

    # now that we've either init'd from scratch, or loaded up a checkpoint,
    # we can hook together target networks
    agent.hook_up_target_networks(opts.target_update_rate)

    # run either eval or training
    if opts.num_eval > 0:
      agent.run_eval(opts.num_eval)
    else:
      agent.run_training(opts.max_num_actions, opts.batch_size, saver_util, opts.run_id)
      if saver_util is not None:
        saver_util.force_save()
Example #2
def main():
    env = bullet_cartpole.BulletCartpole(gui=opts.gui,
                                         action_force=opts.action_force,
                                         max_episode_len=opts.max_episode_len,
                                         initial_force=opts.initial_force,
                                         delay=opts.delay,
                                         discrete_actions=True)

    with tf.Session() as sess:
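        # likelihood-ratio (REINFORCE-style) policy gradient agent with a single hidden layer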
        agent = LikelihoodRatioPolicyGradientAgent(
            env=env,
            gui=opts.gui,
            hidden_dim=opts.num_hidden,
            optimiser=tf.train.AdamOptimizer())

        # setup saver util; will load latest ckpt, or init if none...
        saver_util = None
        ckpt_dir = None
        if opts.run_id is not None:
            ckpt_dir = "ckpts/%s" % opts.run_id
        elif opts.ckpt_dir is not None:
            ckpt_dir = opts.ckpt_dir
        if ckpt_dir is not None:
            saver_util = util.SaverUtil(sess, ckpt_dir, opts.ckpt_freq)
        else:
            sess.run(tf.initialize_all_variables())

        # run either eval or training
        if opts.num_eval > 0:
            agent.run_eval(opts.num_eval)
        else:
            agent.run_training(opts.num_train_batches, opts.rollouts_per_batch,
                               saver_util)
            if saver_util is not None:
                saver_util.force_save()
Example #3
                    type=str,
                    default='discrete',
                    help="either 'discrete' or 'continuous'")
bullet_cartpole.add_opts(parser)
opts = parser.parse_args()

actions = map(int, opts.actions.split(","))

if opts.action_type == 'discrete':
    discrete_actions = True
elif opts.action_type == 'continuous':
    discrete_actions = False
else:
    raise Exception("Unknown action type [%s]" % opts.action_type)

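# build the cartpole environment directly from the parsed command-line options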
env = bullet_cartpole.BulletCartpole(opts=opts,
                                     discrete_actions=discrete_actions)

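# run opts.num_eval episodes under a random policy, tracking reward and step count per episode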
for _ in xrange(opts.num_eval):
    env.reset()
    done = False
    total_reward = 0
    steps = 0
    while not done:
        if discrete_actions:
            action = random.choice(actions)
        else:
            action = env.action_space.sample()
        _state, reward, done, info = env.step(action)
        steps += 1
        total_reward += reward
        if opts.max_episode_len is not None and steps > opts.max_episode_len:
            break  # episode hit the length cap; end the rollout
Example #4
                    type=str,
                    default='discrete',
                    help="either 'discrete' or 'continuous'")
opts = parser.parse_args()

actions = map(int, opts.actions.split(","))

if opts.action_type == 'discrete':
    discrete_actions = True
elif opts.action_type == 'continuous':
    discrete_actions = False
else:
    raise Exception("Unknown action type [%s]" % opts.action_type)

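# build the environment, passing opts.event_log through as the event log file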
env = bullet_cartpole.BulletCartpole(gui=opts.gui,
                                     initial_force=opts.initial_force,
                                     discrete_actions=discrete_actions,
                                     event_log_file=opts.event_log)

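# run opts.num_eval random-action episodes; rewards are ignored, only steps are counted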
for _ in xrange(opts.num_eval):
    env.reset()
    done = False
    steps = 0
    while not done:
        if discrete_actions:
            action = random.choice(actions)
        else:
            action = env.action_space.sample()
        _state, _reward, done, info = env.step(action)
        steps += 1
        if opts.delay > 0:
            time.sleep(opts.delay)
Example #5
                    type=float,
                    default=50.0,
                    help="magnitude of action push")
parser.add_argument('--num-train', type=int, default=100)
parser.add_argument('--num-eval', type=int, default=0)
parser.add_argument('--load-file', type=str, default=None)
parser.add_argument('--save-file', type=str, default=None)
parser.add_argument('--delay', type=float, default=0.0)
opts = parser.parse_args()
print "OPTS", opts

ENV_NAME = 'BulletCartpole'

# Get the environment and extract the number of actions.
env = bullet_cartpole.BulletCartpole(gui=opts.gui,
                                     action_force=opts.action_force,
                                     initial_force=opts.initial_force,
                                     delay=opts.delay)
nb_actions = env.action_space.n

# Next, we build a very simple model.
model = Sequential()
model.add(Flatten(input_shape=(1, ) + env.observation_space.shape))
model.add(Dense(32))
model.add(Activation('tanh'))
#model.add(Dense(16))
#model.add(Activation('relu'))
#model.add(Dense(16))
#model.add(Activation('relu'))
model.add(Dense(nb_actions))
model.add(Activation('linear'))
print(model.summary())