Example #1
def get_cache_loaded_a2c(sess, nenvs, nsteps, ob_space, ac_space):
    global g_actor_critic
    if g_actor_critic is None:
        with tf.variable_scope('actor'):
            g_actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space,
                    ac_space, CnnPolicy, should_summary=False)
        g_actor_critic.load(A2C_MODEL_PATH)

        print('Actor restored!')
    return g_actor_critic
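The module-level cache above lets repeated callers share one loaded A2C model instead of rebuilding and reloading it each time. A minimal usage sketch, assuming a single-environment setup; the env object, nenvs=1, and nsteps=1 are assumptions for illustration, not part of the original:

import numpy as np
import tensorflow as tf

sess = tf.Session()
# Hypothetical single-env inference setup.
actor_critic = get_cache_loaded_a2c(sess, nenvs=1, nsteps=1,
                                    ob_space=env.observation_space.shape,
                                    ac_space=env.action_space)
obs = env.reset()
actions, values, _ = actor_critic.act(np.expand_dims(obs, 0))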
Example #2
 def __init__(self):
     fn_policy = 'weights/a2c_400000.ckpt'
     tf.reset_default_graph()
     self.sess = tf.Session()
     nenvs = 1
     nsteps = 1
     ob_space = self.observation_space
     ac_space = self.action_space
     with tf.variable_scope('actor'):
         self.actor_critic = get_actor_critic(self.sess, nenvs, nsteps, ob_space,
                 ac_space, CnnPolicy, should_summary=False)
     self.actor_critic.load(fn_policy)
     super().__init__()
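A wrapper class initialized this way typically also exposes a small helper for querying the restored policy on a single observation. A hypothetical method sketch (the method name and the batch-of-one reshape are assumptions, mirroring how act() is used in the later examples; numpy is assumed to be imported as np):

 def get_action(self, obs):
     # The A2C policy expects a batch dimension, so wrap the single observation.
     actions, values, _ = self.actor_critic.act(np.expand_dims(obs, 0))
     return int(actions[0])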
Example #3
def train(policy, save_name, load_count = 0, summarize=True, load_path=None, log_path = './logs'):
    
    #Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"
    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
                ac_space, policy, summarize)
        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        # Initialize variables first, then restore: running the initializer
        # after load() would wipe out the restored weights.
        sess.run(tf.global_variables_initializer())

        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        batch_ob_shape = (N_ENVS*N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards   = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)

                #print(obs[0:3, :,:,0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            #batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            #discount/bootstrap off value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update,
                        summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                        mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i => Policy Loss : %.4f, Value Loss : %.4f, Policy Entropy : %.4f' % (update, policy_loss, value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
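The discount_with_dones helper called in the bootstrap step is not part of this excerpt. A minimal sketch consistent with how it is used here (and with the OpenAI baselines A2C utility of the same name): it walks the rollout backwards, accumulating a gamma-discounted return and resetting the accumulator at episode boundaries.

def discount_with_dones(rewards, dones, gamma):
    # Traverse the rollout in reverse, zeroing the running return whenever done=1.
    discounted = []
    r = 0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        r = reward + gamma * r * (1. - done)
        discounted.append(r)
    return discounted[::-1]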
Example #4
def train(policy, save_name, load_count=0, summarize=True, load_path=None, log_path='./logs'):
    envs = [make_env() for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nc, nw, nh = ob_space
    ac_space = envs.action_space

    obs = envs.reset()
    ob_np = np.copy(obs)
    ob_np = np.squeeze(ob_np, axis=1)
    ob_np = np.expand_dims(ob_np, axis=3)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space,
            ac_space, policy, summarize)
    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(log_path, graph=sess.graph)

    # Initialize variables first, then restore, so load() is not overwritten.
    sess.run(tf.global_variables_initializer())

    if load_path is not None:
        actor_critic.load(load_path)
        print('Loaded a2c')

    batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

    dones = [False for _ in range(N_ENVS)]
    nbatch = N_ENVS * N_STEPS

    episode_rewards = np.zeros((N_ENVS, ))
    final_rewards   = np.zeros((N_ENVS, ))

    for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
        # mb stands for mini batch
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]
        for n in range(N_STEPS):
            
            # Convert (N_ENVS, 1, nw, nh) observations to NHWC (N_ENVS, nw, nh, 1)
            # for the CNN policy (squeeze the unit channel, then add it back last).
            ob_np = np.copy(obs)
            ob_np = np.squeeze(ob_np, axis=1)
            ob_np = np.expand_dims(ob_np, axis=3)

            actions, values, _ = actor_critic.act(ob_np)

            mb_obs.append(ob_np)
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(dones)

            obs, rewards, dones, _ = envs.step(actions)

            episode_rewards += rewards
            masks = 1 - np.array(dones)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            mb_rewards.append(rewards)

        mb_dones.append(dones)

        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).reshape(batch_ob_shape) #.swapaxes(1, 0).reshape(batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        last_values = actor_critic.critique(ob_np).tolist()

        #discount/bootstrap off value fn
        for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            d = d.tolist()
            if d[-1] == 0:
                rewards = discount_with_dones(rewards+[value], d+[0], GAMMA)[:-1]
            else:
                rewards = discount_with_dones(rewards, d, GAMMA)
            mb_rewards[n] = rewards

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        if summarize:
            loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(mb_obs,
                    mb_rewards, mb_masks, mb_actions, mb_values, update,
                    summary_op)
            writer.add_summary(summary, update)
        else:
            loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(mb_obs,
                    mb_rewards, mb_masks, mb_actions, mb_values, update)

        if update % LOG_INTERVAL == 0 or update == 1:
            print('%i => Policy Loss : %.4f, Value Loss : %.4f, Policy Entropy : %.4f, Final Reward : %.4f' % (update, policy_loss, value_loss, policy_entropy, final_rewards.mean()))

        if update % SAVE_INTERVAL == 0:
            print('Saving model')
            actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

    # Final save once training has finished.
    actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
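The squeeze/expand_dims pair in the step loop above turns (N_ENVS, 1, nw, nh) observations into the NHWC layout the CNN policy expects. An equivalent single transpose, shown as a small self-contained sketch (the concrete shape is made up for illustration):

import numpy as np

obs = np.zeros((16, 1, 15, 19), dtype=np.float32)  # assumed (N_ENVS, nc=1, nw, nh)
ob_np = np.transpose(obs, (0, 2, 3, 1))             # -> (N_ENVS, nw, nh, 1), i.e. NHWC
assert ob_np.shape == (16, 15, 19, 1)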
Example #5
File: env_model.py  Project: zhl001/i2a-tf
        self.image_loss       = image_loss
        self.target_states    = target_states
        self.target_rewards   = target_rewards
        self.opt              = opt


if __name__ == '__main__':
    envs = [make_env() for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    ac_space = envs.action_space
    num_actions = envs.action_space.n

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, CnnPolicy, should_summary=False)
        actor_critic.load(A2C_WEIGHTS)

        with tf.variable_scope('env_model'):
            env_model = create_env_model(ob_space, num_actions, num_pixels,
                    len(mode_rewards[REWARD_MODE]))

        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())

        losses = []
        all_rewards = []

        width = ob_space[0]
        height = ob_space[1]
        depth = ob_space[2]
Example #6
def train(policy,
          save_name,
          s_alpha,
          load_count=0,
          summarize=True,
          load_path=None,
          log_path='./logs',
          safety=True):
    envs = make_env()()  #for i in range(N_ENVS)]
    #envs = SubprocVecEnv(envs)
    with open("./unsafe_state_count_{}.txt".format(safety), "w+") as f:
        pass

    ob_space = envs.observation_space.shape
    nc, nw, nh = ob_space
    ac_space = envs.action_space

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space,
                                    policy, summarize)
    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(log_path, graph=sess.graph)
    # Initialize variables first, then restore, so load() is not overwritten.
    sess.run(tf.global_variables_initializer())

    if load_path is not None:
        actor_critic.load(load_path)
        print('Loaded a2c')

    batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)
    dones = False  #for _ in range(N_ENVS)]
    nbatch = N_STEPS  # * N_Envs
    episode_rewards = np.zeros((1, ))
    final_rewards = np.zeros((1, ))
    last_rews = [0] * REW_HIST

    # Safety part
    obs = envs.reset()
    ob_np = obs.reshape(nc, nw, nh)

    base_state = copy.deepcopy(ob_np).reshape(nc, nw, nh)
    base_state[np.where(base_state == 2.0)] = 1.0
    print(base_state)
    base_tree = generate_tree(sess, ob_np)

    for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
        # mb stands for mini batch
        unsafe_state_count = 0
        tree = copy.deepcopy(base_tree)
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[]

        for n in range(N_STEPS):
            ob_np = obs.reshape(nc, nw, nh)

            unsafe_state = ob_np.reshape(nw, nh)
            x, y = np.where(unsafe_state == 4.0)
            if (x == 3 and y == 2):
                unsafe_state_count += 1

            if (update % LOG_INTERVAL == 0 and DEBUG == True):
                print("-- State ---")
                print(ob_np)
                print("-- Imagined State --")
                print(tree.imagined_state.reshape(nc, nw, nh))

            ac_ob = ob_np.reshape(1, nw, nh, nc)
            actions, values, _ = actor_critic.act(ac_ob)
            if (safety):
                actions = a2c_safe_action(tree, actions, base_state,
                                          actor_critic)

            mb_obs.append(ob_np)
            mb_actions.append(actions[0])
            mb_values.append(values)
            mb_dones.append(dones)

            if (update % LOG_INTERVAL == 0 and DEBUG == True):
                # 'safe' is not defined in this excerpt and 'dones' is a plain
                # bool for the single env, so only report the action and done flag.
                print("Action : ", CONTROLS[actions[0]], " - Done : ", dones)
                _ = input("")

            obs, rewards, dones, _ = envs.step(actions[0])
            # Advance to the tree node that matches the new observation.
            ob_np = obs.reshape(nc, nw, nh)

            tree = get_node(base_tree, ob_np)

            #rewards = [rewards[i] - s_alpha * (1 - safe[i]) for i in range(len(rewards))]
            episode_rewards += rewards
            masks = 1 - int(dones)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            mb_rewards.append(rewards)

        with open("./unsafe_state_count_{}.txt".format(safety), "a+") as f:
            f.write("{}\n".format(unsafe_state_count))
            unsafe_state_count = 0

        mb_dones.append(dones)
        obs = envs.reset()
        tree = copy.deepcopy(base_tree)

        #batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).reshape(
            batch_ob_shape)  #.swapaxes(1, 0).reshape(batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)  #.swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32)  #.swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32)  #.swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=bool)  #.swapaxes(1, 0)
        mb_masks = mb_dones[:-1]
        mb_dones = mb_dones[1:]

        ac_ob = ob_np.reshape(1, nw, nh, nc)
        last_values = actor_critic.critique(ac_ob).tolist()

        #discount/bootstrap off value fn
        #for n, (rewards, value) in enumerate(zip(mb_rewards, last_values)):
        rewards = mb_rewards.tolist()
        d = mb_dones.tolist()
        value = last_values
        if d[-1] == 0:
            rewards = discount_with_dones(rewards + value, d + [0], GAMMA)[:-1]
        else:
            rewards = discount_with_dones(rewards, d, GAMMA)
        mb_rewards = np.array(rewards)

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        if summarize:
            loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(
                mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update,
                summary_op)
            writer.add_summary(summary, update)
        else:
            loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(
                mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update)

        if update % LOG_INTERVAL == 0 or update == 1:
            print(
                '%i - %.1f => Policy Loss : %.4f, Value Loss : %.4f, Policy Entropy : %.4f, Final Reward : %.4f'
                % (update, s_alpha, policy_loss, value_loss, policy_entropy,
                   final_rewards.mean()))
            if (EARLY_STOPPING and update != 1
                    and abs(final_rewards.mean() - statistics.mean(last_rews))
                    < EARLY_STOP_THRESH):
                print('Training done - Saving model')
                actor_critic.save(SAVE_PATH,
                                  save_name + '_' + str(update) + '.ckpt')
                with open("./logs_alpha.txt", "a+") as f:
                    f.write("{:.1f} - {:.4f}\n".format(s_alpha,
                                                       max(last_rews)))
                break
            _ = last_rews.pop(0)
            last_rews.append(final_rewards.mean())

        if update % SAVE_INTERVAL == 0:
            print('Saving model')
            actor_critic.save(SAVE_PATH,
                              save_name + '_' + str(update) + '.ckpt')

    # Final save once training has finished (or early stopping broke the loop).
    actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
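The unsafe_state_count_{safety}.txt file written during training ends up with one integer per update, which makes it easy to inspect after a run. A small read-back sketch (the True suffix assumes the trainer was run with safety=True):

with open("./unsafe_state_count_True.txt") as f:
    counts = [int(line) for line in f if line.strip()]
print("unsafe visits per update:", counts[:10])
print("total unsafe visits:", sum(counts))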
Example #7
env = GridworldEnv("side_effects_sokoban")

done = False
states = env.reset()
num_actions = ac_space.n
nc, nw, nh = ob_space
print('Observation space ', ob_space)
print('Number of actions ', num_actions)
steps = 0

with tf.Session() as sess:
    with tf.variable_scope('actor'):
        actor_critic = get_actor_critic(sess,
                                        nenvs,
                                        nsteps,
                                        ob_space,
                                        ac_space,
                                        CnnPolicy,
                                        should_summary=False)
    actor_critic.load('weights/a2c_3600.ckpt')

    with tf.variable_scope('env_model'):
        env_model = create_env_model(ob_space,
                                     num_actions,
                                     _NUM_PIXELS,
                                     len(sokoban_rewards),
                                     should_summary=False)

    save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                  scope='env_model')
    loader = tf.train.Saver(var_list=save_vars)
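Because save_vars is restricted to the env_model scope, the Saver built above restores only the environment model and leaves the already-loaded actor weights untouched. A hypothetical restore call (the checkpoint path is an assumption):

    loader.restore(sess, 'weights/env_model.ckpt')  # hypothetical checkpoint path
    print('Restored', len(save_vars), 'env_model variables')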
Example #8
#os.environ["CUDA_VISIBLE_DEVICES"]="1"

nenvs = 1
nsteps = 5

done = False
env = MiniPacman('regular', 1000)
ob_space = env.observation_space.shape
nw, nh, nc = ob_space
ac_space = env.action_space

states = env.reset()

with tf.Session() as sess:
    actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space,
                                    CnnPolicy, False)
    actor_critic.load('./weights/a2c_100000.ckpt')

    total_reward = 0

    while not done:
        states = np.expand_dims(states, 0)
        actions, values, _ = actor_critic.act(states)

        states, reward, done, _ = env.step(actions[0])

        total_reward += reward

    print('total reward', total_reward)
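A single episode gives a noisy estimate, so the same loop is easy to wrap and average over several rollouts. A minimal sketch using only the calls already shown in this example (actor_critic.act, env.reset, env.step; numpy is imported as np above):

def evaluate(actor_critic, env, episodes=10):
    # Average undiscounted return over several sampled rollouts.
    returns = []
    for _ in range(episodes):
        states, done, total = env.reset(), False, 0.0
        while not done:
            actions, _, _ = actor_critic.act(np.expand_dims(states, 0))
            states, reward, done, _ = env.step(actions[0])
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)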