Example #1
def main(_):
  model_dir = util.get_model_dir(conf, 
      ['data_dir', 'sample_dir', 'max_epoch', 'test_step', 'save_step',
       'is_train', 'random_seed', 'log_level', 'display', 'runtime_base_dir', 
       'occlude_start_row', 'num_generated_images'])
  util.preprocess_conf(conf)
  validate_parameters(conf)

  data = 'mnist' if conf.data == 'color-mnist' else conf.data 
  DATA_DIR = os.path.join(conf.runtime_base_dir, conf.data_dir, data)
  SAMPLE_DIR = os.path.join(conf.runtime_base_dir, conf.sample_dir, conf.data, model_dir)

  util.check_and_create_dir(DATA_DIR)
  util.check_and_create_dir(SAMPLE_DIR)
  
  dataset = get_dataset(DATA_DIR, conf.q_levels)

  with tf.Session() as sess:
    network = Network(sess, conf, dataset.height, dataset.width, dataset.channels)

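    # track training statistics and restore the most recent checkpoint, if any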
    stat = Statistic(sess, conf.data, conf.runtime_base_dir, model_dir, tf.trainable_variables())
    stat.load_model()

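    # either train on the dataset or generate samples, depending on is_train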
    if conf.is_train:
      train(dataset, network, stat, SAMPLE_DIR)
    else:
      generate(network, dataset.height, dataset.width, SAMPLE_DIR)
Example #2
def main(_):
  model_dir = get_model_dir(conf,
      ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])

  preprocess_conf(conf)

  with tf.Session() as sess:
    # environment
    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)

    assert isinstance(env.observation_space, gym.spaces.Box), \
      "observation space must be continuous"
    assert isinstance(env.action_space, gym.spaces.Box), \
      "action space must be continuous"

    # exploration strategy
    if conf.noise == 'ou':
      strategy = OUExploration(env, sigma=conf.noise_scale)
    elif conf.noise == 'brownian':
      strategy = BrownianExploration(env, conf.noise_scale)
    elif conf.noise == 'linear_decay':
      strategy = LinearDecayExploration(env)
    else:
      raise ValueError('Unknown exploration strategy: %s' % conf.noise)

    # networks
    shared_args = {
      'sess': sess,
      'input_shape': env.observation_space.shape,
      'action_size': env.action_space.shape[0],
      'hidden_dims': conf.hidden_dims,
      'use_batch_norm': conf.use_batch_norm,
      'use_seperate_networks': conf.use_seperate_networks,
      'hidden_w': conf.hidden_w, 'action_w': conf.action_w,
      'hidden_fn': conf.hidden_fn, 'action_fn': conf.action_fn,
      'w_reg': conf.w_reg,
    }

    logger.info("Creating prediction network...")
    pred_network = Network(
      scope='pred_network', **shared_args
    )

    logger.info("Creating target network...")
    target_network = Network(
      scope='target_network', **shared_args
    )
    target_network.make_soft_update_from(pred_network, conf.tau)

    # statistic
    stat = Statistic(sess, conf.env_name, model_dir, pred_network.variables, conf.update_repeat)

    agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                conf.discount, conf.batch_size, conf.learning_rate,
                conf.max_steps, conf.update_repeat, conf.max_episodes)

    agent.run(conf.monitor, conf.display, conf.is_train)
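
# A minimal sketch of what make_soft_update_from above is assumed to build
# (TF1-style, matching the tf.Session usage in these examples); the actual
# implementation in the repository may differ. Each assign op nudges a target
# variable toward its prediction-network counterpart at rate tau.
import tensorflow as tf

def make_soft_update_ops(pred_vars, target_vars, tau):
    # one assign op per (prediction, target) variable pair; run after each training step
    return [t.assign(tau * p + (1.0 - tau) * t)
            for p, t in zip(pred_vars, target_vars)]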
Example #3
def main(_):
  model_dir = get_model_dir(conf,
      ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
  preprocess_conf(conf)

  with tf.Session() as sess:
    # environment
    env = gym.make(conf.env_name)
    env._seed(conf.random_seed)

    assert isinstance(env.observation_space, gym.spaces.Box), \
      "observation space must be continuous"
    assert isinstance(env.action_space, gym.spaces.Box), \
      "action space must be continuous"

    # exploration strategy
    if conf.noise == 'ou':
      strategy = OUExploration(env, sigma=conf.noise_scale)
    elif conf.noise == 'brownian':
      strategy = BrownianExploration(env, conf.noise_scale)
    elif conf.noise == 'linear_decay':
      strategy = LinearDecayExploration(env)
    else:
      raise ValueError('Unknown exploration strategy: %s' % conf.noise)

    # networks
    shared_args = {
      'sess': sess,
      'input_shape': env.observation_space.shape,
      'action_size': env.action_space.shape[0],
      'hidden_dims': conf.hidden_dims,
      'use_batch_norm': conf.use_batch_norm,
      'use_seperate_networks': conf.use_seperate_networks,
      'hidden_w': conf.hidden_w, 'action_w': conf.action_w,
      'hidden_fn': conf.hidden_fn, 'action_fn': conf.action_fn,
      'w_reg': conf.w_reg,
    }

    logger.info("Creating prediction network...")
    pred_network = Network(
      scope='pred_network', **shared_args
    )

    logger.info("Creating target network...")
    target_network = Network(
      scope='target_network', **shared_args
    )
    target_network.make_soft_update_from(pred_network, conf.tau)

    # statistic
    stat = Statistic(sess, conf.env_name, model_dir, pred_network.variables, conf.update_repeat)

    agent = NAF(sess, env, strategy, pred_network, target_network, stat,
                conf.discount, conf.batch_size, conf.learning_rate,
                conf.max_steps, conf.update_repeat, conf.max_episodes)

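    # note: training mode is hard-coded to True here instead of honoring conf.is_train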
    #agent.run(conf.monitor, conf.display, conf.is_train)
    agent.run(conf.monitor, conf.display, True)
Example #4
def main(_):
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    # exp_start_time = datetime.datetime.now().strftime("%A_%b%d-%H%M%S")
    # data_dir = "logs/" + conf.exp_name + "_" + exp_start_time
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allow_growth = True

    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        def var_print():
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print(
            "printing vars::------------------------------------------------")

        start_steps = 1000  # environment steps to collect before training begins
        time_begin = time.time()  # assumed here: referenced below when reporting convergence time
        all_epi_rewards = []  # assumed here: referenced below but not initialized in the original snippet
        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        # pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
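            # updates begin only after the start_steps warm-up and once the buffer holds a full batch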
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(
                        transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' %
                      ave_epi_rewards)

            if done:
                # save step
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                # pbar.update(local_step)

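                # minimum episodic return over the last (up to) 20 episodes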
                lenn = len(all_epi_rewards)
                fromm = max(lenn - 20, 0)
                to = lenn
                min_5_ep_ret = min(all_epi_rewards[fromm:to])

                # pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f avg_5_epi_rew %.1f' %
                #    (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss), sum(all_epi_rewards[fromm:to])/(to-fromm) ) )
                print(
                    'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f \tmin_5_epi_rew %.1f'
                    % (episode + 1, epi_rewards, np.mean(pi_loss),
                       np.mean(Q_loss), min_5_ep_ret))
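                # early stop: once the worst recent return clears the threshold, save the model and exit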
                threshold = -500.0
                if ((to - fromm) > 3 and min_5_ep_ret > threshold):
                    time_end = time.time()
                    print("SHI hyperParams have made algo converge (",
                          threshold, ") in ", (time_end - time_begin) / 1.0,
                          " s")
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
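
# A minimal sketch of the action_converter helper used in Examples #4 and #5;
# its definition is not shown here, so this is an assumption: squash an
# unbounded action through tanh and rescale it to the Box bounds, or pick the
# argmax index for a Discrete action space.
import gym
import numpy as np

def action_converter(env, action):
    if isinstance(env.action_space, gym.spaces.Discrete):
        return int(np.argmax(action))          # discrete: choose the highest-scoring action
    low, high = env.action_space.low, env.action_space.high
    squashed = np.tanh(np.asarray(action))     # map the raw action to [-1, 1]
    return low + (squashed + 1.0) * 0.5 * (high - low)  # rescale to [low, high]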
Example #5
def main(_):
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions([state], is_deterministic=False)[0] # [-inf, inf]
            next_state, reward, done, info = env.step(action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
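            # here updates start as soon as the buffer holds one batch (no warm-up, unlike Example #4)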
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                stat.save_step(global_step, epi_rewards, np.mean(total_Q), np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f' %
                       (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
        pbar.close()
flags.DEFINE_integer('max_steps', 200, 'maximum # of steps for each episode')
flags.DEFINE_integer('update_repeat', 5,
                     'maximum # of q-learning updates for each step')
flags.DEFINE_integer('max_episodes', 1000, 'maximum # of episodes to train')

# Debug
flags.DEFINE_boolean('is_train', True, 'training or testing')
flags.DEFINE_integer('random_seed', 123, 'random seed')
flags.DEFINE_boolean('monitor', False, 'monitor the training or not')
flags.DEFINE_boolean('display', False, 'display the game screen or not')
flags.DEFINE_string('log_level', 'INFO',
                    'log level [DEBUG, INFO, WARNING, ERROR, CRITICAL]')

conf = flags.FLAGS
#    ['is_train', 'random_seed', 'monitor', 'display', 'log_level'])
preprocess_conf(conf)
env = 'GazeboModularScara4DOF-v3'

# set random seed
tf.set_random_seed(123)
np.random.seed(123)

with tf.Session() as sess:
    # environment
    env = gym.make(env)
    env._seed(123)
    # learn (env,
    #         sess,
    #         conf.noise,
    #         conf.noise_scale,
    #         conf.hidden_dims,