Example #1
def main(argv):
    del argv  # Unused.
    config = utility.load_config(LOG_DIR)
    policy_layers = config.policy_layers
    value_layers = config.value_layers
    env = config.env(render=True)
    network = config.network

    with tf.Session() as sess:
        agent = simple_ppo_agent.SimplePPOPolicy(sess,
                                                 env,
                                                 network,
                                                 policy_layers=policy_layers,
                                                 value_layers=value_layers,
                                                 checkpoint=os.path.join(
                                                     LOG_DIR, CHECKPOINT))

        sum_reward = 0
        observation = env.reset()
        while True:
            action = agent.get_action([observation])
            observation, reward, done, _ = env.step(action[0])
            time.sleep(0.002)
            sum_reward += reward
            if done:
                break
        tf.logging.info("reward: %s", sum_reward)
Example #2
def visualize(logdir, outdir, num_agents, num_episodes, checkpoint=None, env_processes=True):
  """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
  config = utility.load_config(logdir)
  with config.unlocked:
    config.network = functools.partial(utility.define_network, config.network, config)
    config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
    config.value_optimizer = getattr(tf.train, config.value_optimizer)
  with tf.device('/cpu:0'):
    batch_env = utility.define_batch_env(lambda: _create_environment(config, outdir), num_agents,
                                         env_processes)
    graph = utility.define_simulation_graph(batch_env, config.algorithm, config)
    total_steps = num_episodes * config.max_length
    loop = _define_loop(graph, total_steps)
  saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
  sess_config = tf.ConfigProto(allow_soft_placement=True)
  sess_config.gpu_options.allow_growth = True
  with tf.Session(config=sess_config) as sess:
    utility.initialize_variables(sess, saver, config.logdir, checkpoint, resume=True)
    for unused_score in loop.run(sess, saver, total_steps):
      pass
  batch_env.close()
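The snippet defines visualize() but never calls it. A hypothetical command-line driver, in the same style as the training scripts below, might look like this; the flag names and defaults are assumptions.

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('logdir', None, 'Directory of the trained run.')
tf.app.flags.DEFINE_string('outdir', None, 'Directory to write rendered videos to.')
tf.app.flags.DEFINE_integer('num_agents', 1, 'Number of parallel environments.')
tf.app.flags.DEFINE_integer('num_episodes', 5, 'Total episodes to render.')
tf.app.flags.DEFINE_string('checkpoint', None, 'Checkpoint name; newest if None.')
tf.app.flags.DEFINE_boolean('env_processes', True, 'Step environments in separate processes.')


def main(_):
  utility.set_up_logging()
  visualize(FLAGS.logdir, FLAGS.outdir, FLAGS.num_agents, FLAGS.num_episodes,
            FLAGS.checkpoint, FLAGS.env_processes)


if __name__ == '__main__':
  tf.app.run()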
Example #3
def main(argv):
  del argv  # Unused.
  config = utility.load_config(LOG_DIR)
  policy_layers = config.policy_layers
  value_layers = config.value_layers
  env = config.env(render=True)
  network = config.network

  with tf.Session() as sess:
    agent = simple_ppo_agent.SimplePPOPolicy(sess,
                                             env,
                                             network,
                                             policy_layers=policy_layers,
                                             value_layers=value_layers,
                                             checkpoint=os.path.join(LOG_DIR, CHECKPOINT))

    sum_reward = 0
    observation = env.reset()
    while True:
      action = agent.get_action([observation])
      observation, reward, done, _ = env.step(action[0])
      time.sleep(0.002)
      sum_reward += reward
      if done:
        break
    tf.logging.info("reward: %s", sum_reward)
Example #4
def create_env(envid, seed, render=False):
    """Create minitaur or other standard gym environment."""
    if 'minitaur' in envid:
        from pybullet_envs.minitaur.agents.scripts import utility
        from pybullet_envs.minitaur.agents import tools
        config = utility.load_config(os.path.expanduser('minitaur_config'))
        if 'bad' in envid:
            with config.unlocked:
                config.env.keywords['accurate_motor_model_enabled'] = False
                config.env.keywords['control_latency'] = .0
                config.env.keywords['pd_latency'] = .0
                config.env.keywords['urdf_version'] = None
        env = config.env(render=render)
        if config.max_length:
            env = tools.wrappers.LimitDuration(env, config.max_length)
        env = tools.wrappers.RangeNormalize(env)
        env = tools.wrappers.ClipAction(env)
        env = tools.wrappers.ConvertTo32Bit(env)

        class MySpec(object):
            def __init__(self, max_episode_steps):
                self.max_episode_steps = max_episode_steps

        env.spec = MySpec(1000)
    else:
        env = gym.make(envid)

    # Set up seed.
    env.seed(seed)
    return env
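A short usage sketch for create_env; the environment id and the presence of a local 'minitaur_config' directory are assumptions, and the wrappers are assumed to expose a gym-style action_space.

env = create_env('minitaur_reactive_env', seed=0, render=False)
observation = env.reset()
for _ in range(100):
    # Random actions, just to exercise the wrapped environment.
    observation, reward, done, _ = env.step(env.action_space.sample())
    if done:
        observation = env.reset()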
Example #5
def main(_):
  """Create or load configuration and launch the trainer."""
  utility.set_up_logging()
  if not FLAGS.config:
    raise KeyError('You must specify a configuration.')
  logdir = FLAGS.logdir and os.path.expanduser(
      os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp, FLAGS.config)))
  try:
    config = utility.load_config(logdir)
  except IOError:
    config = tools.AttrDict(getattr(configs, FLAGS.config)())
    config = utility.save_config(config, logdir)
  for score in train(config, FLAGS.env_processes):
    tf.logging.info('Score {}.'.format(score))
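main() reads several flags that are defined elsewhere in the script. A hypothetical set of definitions matching the usage above; the flag defaults are assumptions.

import datetime

import tensorflow as tf

FLAGS = tf.app.flags.FLAGS
tf.app.flags.DEFINE_string('config', None, 'Name of the configuration function in configs.py.')
tf.app.flags.DEFINE_string('logdir', None, 'Base directory for training runs.')
tf.app.flags.DEFINE_string(
    'timestamp', datetime.datetime.now().strftime('%Y%m%dT%H%M%S'),
    'Prefix used to name the run directory.')
tf.app.flags.DEFINE_boolean('env_processes', True,
                            'Step environments in separate processes to avoid interference.')

if __name__ == '__main__':
  tf.app.run()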
Example #6
File: train.py  Project: Gs-001/quad
def main(_):
    """Create or load configuration and launch the trainer."""
    utility.set_up_logging()
    if not FLAGS.config:
        raise KeyError('You must specify a configuration.')
    logdir = FLAGS.logdir and os.path.expanduser(
        os.path.join(FLAGS.logdir, '{}-{}'.format(FLAGS.timestamp,
                                                  FLAGS.config)))
    try:
        config = utility.load_config(logdir)
    except IOError:
        config = tools.AttrDict(getattr(configs, FLAGS.config)())
        config = utility.save_config(config, logdir)
    for score in train(config, FLAGS.env_processes):
        tf.logging.info('Score {}.'.format(score))
Example #7
def visualize(logdir,
              outdir,
              num_agents,
              num_episodes,
              checkpoint=None,
              env_processes=True):
    """Recover checkpoint and render videos from it.

  Args:
    logdir: Logging directory of the trained algorithm.
    outdir: Directory to store rendered videos in.
    num_agents: Number of environments to simulate in parallel.
    num_episodes: Total number of episodes to simulate.
    checkpoint: Checkpoint name to load; defaults to most recent.
    env_processes: Whether to step environments in separate processes.
  """
    config = utility.load_config(logdir)
    with config.unlocked:
        config.network = functools.partial(utility.define_network,
                                           config.network, config)
        config.policy_optimizer = getattr(tf.train, config.policy_optimizer)
        config.value_optimizer = getattr(tf.train, config.value_optimizer)
    with tf.device('/cpu:0'):
        batch_env = utility.define_batch_env(
            lambda: _create_environment(config, outdir), num_agents,
            env_processes)
        graph = utility.define_simulation_graph(batch_env, config.algorithm,
                                                config)
        total_steps = num_episodes * config.max_length
        loop = _define_loop(graph, total_steps)
    saver = utility.define_saver(exclude=(r'.*_temporary/.*', r'global_step'))
    sess_config = tf.ConfigProto(allow_soft_placement=True)
    sess_config.gpu_options.allow_growth = True
    with tf.Session(config=sess_config) as sess:
        utility.initialize_variables(sess,
                                     saver,
                                     config.logdir,
                                     checkpoint,
                                     resume=True)
        for unused_score in loop.run(sess, saver, total_steps):
            pass
    batch_env.close()
Example #8
def main(argv):
    del argv  # Unused.
    config = utility.load_config(LOG_DIR)
    print(LOG_DIR)
    policy_layers = config.policy_layers
    value_layers = config.value_layers
    env = config.env(render=True)
    network = config.network

    with tf.Session() as sess:
        agent = simple_ppo_agent.SimplePPOPolicy(sess,
                                                 env,
                                                 network,
                                                 policy_layers=policy_layers,
                                                 value_layers=value_layers,
                                                 checkpoint=os.path.join(
                                                     LOG_DIR, CHECKPOINT))
        sum_reward = 0
        observation = env.reset()
        while True:
            # Drive command for the Arduino.
            command = 'd'
            command = bytes(command, encoding='utf8')
            arduino.write(command)
            # Get an action from the PPO policy.
            action = agent.get_action([observation])

            # Transform the PPO action into per-motor commands.
            o_action = copy.deepcopy(action)
            o_action = env.transform_action_to_motor_command(o_action[0])
            print("----- each motor radio -----")
            print(o_action)
            print("----- each motor angle -----")
            pi = 3.14159265359
            deg = []
            i = 0
            for each_rad in o_action[:2]:
                now = 180 * each_rad / pi
                # output action
                command = str(i)
                command = bytes(command, encoding='utf8')
                arduino.write(command)
                # Optional: manually enter the rotation direction instead.
                # command = input('Enter rotation direction (+-)')
                # command = bytes(command, encoding='utf8')
                # arduino.write(command)
                command = str(now)
                command = bytes(command, encoding='utf8')
                print(command)
                arduino.write(command)

                # For higher speeds, the Arduino code must be changed.
                command = '10000'
                command = bytes(command, encoding='utf8')
                print(command)
                arduino.write(command)
                i = i + 1
                # Optional: read back the Arduino response.
                # msg = arduino.read(14)
                # msg = binascii.b2a_hex(msg).decode('utf-8')
                # print(msg)
            observation, reward, done, _ = env.step(action[0])
            # Note: replace observation with a real observation.
            time.sleep(0.002)
            sum_reward += reward
            if done:
                break
        tf.logging.info("reward: %s", sum_reward)