Example no. 1
import sys
import time

import gym
import numpy as np
import tensorflow as tf

# conf (the experiment config) and the helpers get_dirs, preprocess_conf,
# ReplayBuffer2, SoftPolicyGradient, Statistic, eval_step and action_converter
# are defined elsewhere in the repo.


def main(_):
    model_dir, data_dir = get_dirs(conf, ['exp_name'])
    # exp_start_time = datetime.datetime.now().strftime("%A_%b%d-%H%M%S")
    # data_dir = "logs/" + conf.exp_name + "_" + exp_start_time
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    # config = tf.ConfigProto(allow_soft_placement=True)
    # config.gpu_options.allow_growth = True

    config = tf.ConfigProto(intra_op_parallelism_threads=8,
                            inter_op_parallelism_threads=8)
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        def var_print():
            for var in tf.global_variables():
                print(var)

        print("printing vars:------------------------------------------------")
        var_print()
        print(
            "end printing vars:---------------------------------------------")

        start_steps = 1000  # environment steps to collect before updates begin
        time_begin = time.time()  # used later to report wall-clock time to convergence
        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        all_epi_rewards = []  # per-episode returns, used by the convergence check below
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        # pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions(
                [state], is_deterministic=False)[0]  # [-inf, inf]
            next_state, reward, done, info = env.step(
                action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward  # accumulate the unscaled return for logging
            reward *= conf.reward_scale  # only the scaled reward is stored in the buffer
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size and global_step >= start_steps:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(
                        transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' %
                      ave_epi_rewards)

            if done:
                # save step
                all_epi_rewards.append(epi_rewards)
                stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                               np.mean(Q_loss), np.mean(pi_loss))
                # pbar.update(local_step)

                # convergence check over the most recent episode returns (window of up to 20)
                recent_returns = all_epi_rewards[-20:]
                min_recent_ep_ret = min(recent_returns)

                # pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f avg_recent_epi_rew %.1f' %
                #    (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss), np.mean(recent_returns)))
                print(
                    'Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f \tmin_recent_epi_rew %.1f'
                    % (episode + 1, epi_rewards, np.mean(pi_loss),
                       np.mean(Q_loss), min_recent_ep_ret))
                threshold = -500.0
                # stop once at least four recent episodes all exceed the threshold
                if len(recent_returns) > 3 and min_recent_ep_ret > threshold:
                    time_end = time.time()
                    print("SHI hyperparams have made the algorithm converge "
                          "(threshold", threshold, ") in",
                          time_end - time_begin, "s")
                    stat.save_step(global_step, epi_rewards, np.mean(total_Q),
                                   np.mean(Q_loss), np.mean(pi_loss))
                    stat.save_model(global_step)
                    sys.exit()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
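
Both examples rely on a ReplayBuffer2 class that is not shown. The calls above only need add_transition, size, and get_transitions, so a minimal uniform-sampling buffer along the following lines would satisfy that interface; this is a sketch, not the repo's actual implementation, and the field stacking in get_transitions is an assumption about what SoftPolicyGradient.trainer expects.

import random
from collections import deque

import numpy as np


class ReplayBuffer2:
    """Minimal uniform-sampling replay buffer (sketch only)."""

    def __init__(self, buffer_size):
        # drop the oldest transitions once the buffer is full
        self.buffer = deque(maxlen=buffer_size)

    def add_transition(self, state, action, reward, next_state, done):
        self.buffer.append((state, action, reward, next_state, done))

    def size(self):
        return len(self.buffer)

    def get_transitions(self, batch_size):
        # uniform random mini-batch, with each field stacked into an array
        batch = random.sample(self.buffer, batch_size)
        states, actions, rewards, next_states, dones = map(np.array, zip(*batch))
        return states, actions, rewards, next_states, dones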
Example no. 2
import gym
import numpy as np
import tensorflow as tf
from tqdm import tqdm

# conf (the experiment config) and the helpers get_dirs, preprocess_conf,
# ReplayBuffer2, SoftPolicyGradient, Statistic, eval_step and action_converter
# are defined elsewhere in the repo.


def main(_):
    model_dir, data_dir = get_dirs(conf, ['env_name'])
    preprocess_conf(conf, model_dir)

    env = gym.make(conf.env_name)
    # env.seed(conf.random_seed)
    state_shape = env.observation_space.shape
    if isinstance(env.action_space, gym.spaces.Discrete):
        action_shape = env.action_space.n
    else:
        action_shape = env.action_space.shape[0]

    # replay buffer
    buffer = ReplayBuffer2(conf.buffer_size)

    # building agent
    config = tf.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        # agent
        agent = SoftPolicyGradient(sess, conf, state_shape, action_shape)
        # statistic
        stat = Statistic(sess, conf, model_dir, data_dir)
        if conf.load_model:
            stat.load_model()

        episode, global_step, local_step = 0, 0, 0
        epi_rewards = 0
        total_Q, Q_loss, pi_loss = [], [], []
        state = env.reset()
        pbar = tqdm(total=conf.max_steps, dynamic_ncols=True)
        while global_step < conf.max_steps:
            # interaction with environment
            action = agent.sampling_actions([state], is_deterministic=False)[0] # [-inf, inf]
            next_state, reward, done, info = env.step(action_converter(env, action))
            global_step += 1
            local_step += 1
            epi_rewards += reward
            reward *= conf.reward_scale
            buffer.add_transition(state, action, reward, next_state, done)
            state = next_state

            # train step
            if buffer.size() >= conf.batch_size:
                for i in range(conf.num_train_steps):
                    transitions = buffer.get_transitions(conf.batch_size)
                    Q, single_Q_loss, single_pi_loss = agent.trainer(transitions)
                    total_Q.append(np.mean(Q))
                    Q_loss.append(single_Q_loss)
                    pi_loss.append(single_pi_loss)

            # evaluate step
            if global_step % conf.eval_interval == 0:
                ave_epi_rewards = np.mean(eval_step(env, agent))
                stat.save_step(global_step, ave_epi_rewards)
                print('\n[Evaluation] averaged_epi_rewards: %.3f' % ave_epi_rewards)

            if done:
                # save step
                stat.save_step(global_step, epi_rewards, np.mean(total_Q), np.mean(Q_loss), np.mean(pi_loss))
                pbar.update(local_step)
                pbar.set_description('Episode: %s, epi_rewards: %.3f, pi_loss: %.3f, Q_loss: %.3f' %
                       (episode+1, epi_rewards, np.mean(pi_loss), np.mean(Q_loss)))
                print()
                episode += 1
                local_step = 0
                epi_rewards = 0
                total_Q, Q_loss, pi_loss = [], [], []
                state = env.reset()
        pbar.close()
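
Both loops also call two helpers that are not shown: action_converter, which maps the agent's unbounded output (the '[-inf, inf]' comment above) into the environment's action space, and eval_step, whose returned per-episode rewards the evaluation block averages with np.mean. The sketch below is one plausible implementation; the tanh squashing, the argmax handling of discrete spaces, and the five-episode evaluation count are assumptions, not the repo's actual choices.

import numpy as np


def action_converter(env, action):
    # sketch: map an unbounded action into env.action_space
    if hasattr(env.action_space, 'n'):
        # discrete space: interpret a vector output as per-action scores
        return int(np.argmax(action)) if np.ndim(action) else int(action)
    low, high = env.action_space.low, env.action_space.high
    squashed = np.tanh(action)  # squash into (-1, 1)
    return low + (squashed + 1.0) * 0.5 * (high - low)  # rescale to [low, high]


def eval_step(env, agent, num_episodes=5):
    # sketch: run a few deterministic episodes and return their total rewards
    returns = []
    for _ in range(num_episodes):
        state, done, total = env.reset(), False, 0.0
        while not done:
            action = agent.sampling_actions([state], is_deterministic=True)[0]
            state, reward, done, _ = env.step(action_converter(env, action))
            total += reward
        returns.append(total)
    return returns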