def get_cache_loaded_a2c(sess, nenvs, nsteps, ob_space, ac_space):
    global g_actor_critic
    if g_actor_critic is None:
        with tf.variable_scope('actor'):
            g_actor_critic = get_actor_critic(sess, nenvs, nsteps,
                    ob_space, ac_space, CnnPolicy, should_summary=False)
        g_actor_critic.load(A2C_MODEL_PATH)

        print('Actor restored!')
    return g_actor_critic
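# A minimal sketch of the call pattern the cached loader above supports: the
# module-level cache starts out empty, and repeated calls reuse the same
# restored model instead of rebuilding the graph (A2C_MODEL_PATH is assumed
# to be defined elsewhere in the project).
g_actor_critic = None  # module-level cache consulted by get_cache_loaded_a2c()

# first call builds and restores the model; later calls return the cached one:
# actor_critic = get_cache_loaded_a2c(sess, nenvs, nsteps, ob_space, ac_space)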
def __init__(self):
    fn_policy = 'weights/a2c_400000.ckpt'

    tf.reset_default_graph()
    self.sess = tf.Session()

    nenvs = 1
    nsteps = 1
    ob_space = self.observation_space
    ac_space = self.action_space

    with tf.variable_scope('actor'):
        self.actor_critic = get_actor_critic(self.sess, nenvs, nsteps,
                ob_space, ac_space, CnnPolicy, should_summary=False)
    self.actor_critic.load(fn_policy)

    super().__init__()
def train(policy, save_name, load_count=0, summarize=True, load_path=None, log_path='./logs'):
    # Minigrid maze env
    env_name = "MiniGrid-BlockMaze-v0"

    def make_env(env_name):
        return lambda: gym_minigrid.wrappers.PadImgObsWrapper(gym.make(env_name))

    envs = [make_env(env_name) for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nw, nh, nc = ob_space
    ac_space = envs.action_space

    obs = envs.reset()

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, policy, summarize)
        if load_path is not None:
            actor_critic.load(load_path)
            print('Loaded a2c')

        summary_op = tf.summary.merge_all()
        writer = tf.summary.FileWriter(log_path, graph=sess.graph)

        sess.run(tf.global_variables_initializer())

        batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

        dones = [False for _ in range(N_ENVS)]
        nbatch = N_ENVS * N_STEPS

        episode_rewards = np.zeros((N_ENVS, ))
        final_rewards = np.zeros((N_ENVS, ))

        for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
            # mb stands for mini batch
            mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
            for n in range(N_STEPS):
                actions, values, _ = actor_critic.act(obs)

                mb_obs.append(np.copy(obs))
                mb_actions.append(actions)
                mb_values.append(values)
                mb_dones.append(dones)

                obs, rewards, dones, _ = envs.step(actions)
                # print(obs[0:3, :,:,0])

                episode_rewards += rewards
                masks = 1 - np.array(dones)
                final_rewards *= masks
                final_rewards += (1 - masks) * episode_rewards
                episode_rewards *= masks

                mb_rewards.append(rewards)

            mb_dones.append(dones)

            # batch of steps to batch of rollouts
            mb_obs = np.asarray(mb_obs, dtype=np.float32).swapaxes(1, 0).reshape(batch_ob_shape)
            mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
            mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
            mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
            mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
            mb_masks = mb_dones[:, :-1]
            mb_dones = mb_dones[:, 1:]

            last_values = actor_critic.critique(obs).tolist()

            # discount/bootstrap off value fn
            for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
                rewards = rewards.tolist()
                d = d.tolist()
                if d[-1] == 0:
                    rewards = discount_with_dones(rewards + [value], d + [0], GAMMA)[:-1]
                else:
                    rewards = discount_with_dones(rewards, d, GAMMA)
                mb_rewards[n] = rewards

            mb_rewards = mb_rewards.flatten()
            mb_actions = mb_actions.flatten()
            mb_values = mb_values.flatten()
            mb_masks = mb_masks.flatten()

            if summarize:
                loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(
                        mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update, summary_op)
                writer.add_summary(summary, update)
            else:
                loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(
                        mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update)

            if update % LOG_INTERVAL == 0 or update == 1:
                print('%i): %.4f, %.4f, %.4f' % (update, policy_loss, value_loss, policy_entropy))
                print(final_rewards.mean())

            if update % SAVE_INTERVAL == 0:
                print('Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

        actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
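# The bootstrapping step above relies on discount_with_dones() to turn raw
# per-step rewards into discounted returns that are cut off at episode
# boundaries. A minimal sketch, assuming the helper matches the usual
# baselines-style implementation (verify against the project's own utils):
def discount_with_dones_sketch(rewards, dones, gamma):
    discounted = []
    r = 0.0
    for reward, done in zip(rewards[::-1], dones[::-1]):
        # reset the running return whenever an episode ended at this step
        r = reward + gamma * r * (1.0 - done)
        discounted.append(r)
    return discounted[::-1]

# e.g. discount_with_dones_sketch([1.0, 0.0, 1.0], [0, 0, 1], 0.99)
# -> [1.9801, 0.99, 1.0]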
def train(policy, save_name, load_count=0, summarize=True, load_path=None, log_path='./logs'):
    envs = [make_env() for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    nc, nw, nh = ob_space
    ac_space = envs.action_space

    obs = envs.reset()
    ob_np = np.copy(obs)
    ob_np = np.squeeze(ob_np, axis=1)
    ob_np = np.expand_dims(ob_np, axis=3)

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, policy, summarize)
    if load_path is not None:
        actor_critic.load(load_path)
        print('Loaded a2c')

    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(log_path, graph=sess.graph)

    sess.run(tf.global_variables_initializer())

    batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

    dones = [False for _ in range(N_ENVS)]
    nbatch = N_ENVS * N_STEPS

    episode_rewards = np.zeros((N_ENVS, ))
    final_rewards = np.zeros((N_ENVS, ))

    for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
        # mb stands for mini batch
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        for n in range(N_STEPS):
            ob_np = np.copy(obs)
            ob_np = np.squeeze(ob_np, axis=1)
            ob_np = np.expand_dims(ob_np, axis=3)

            actions, values, _ = actor_critic.act(ob_np)

            mb_obs.append(ob_np)
            mb_actions.append(actions)
            mb_values.append(values)
            mb_dones.append(dones)

            obs, rewards, dones, _ = envs.step(actions)

            episode_rewards += rewards
            masks = 1 - np.array(dones)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            mb_rewards.append(rewards)

        mb_dones.append(dones)

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).reshape(batch_ob_shape)  # .swapaxes(1, 0).reshape(batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
        mb_masks = mb_dones[:, :-1]
        mb_dones = mb_dones[:, 1:]

        last_values = actor_critic.critique(ob_np).tolist()

        # discount/bootstrap off value fn
        for n, (rewards, d, value) in enumerate(zip(mb_rewards, mb_dones, last_values)):
            rewards = rewards.tolist()
            d = d.tolist()
            if d[-1] == 0:
                rewards = discount_with_dones(rewards + [value], d + [0], GAMMA)[:-1]
            else:
                rewards = discount_with_dones(rewards, d, GAMMA)
            mb_rewards[n] = rewards

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        if summarize:
            loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update, summary_op)
            writer.add_summary(summary, update)
        else:
            loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update)

        if update % LOG_INTERVAL == 0 or update == 1:
            print('%i => Policy Loss : %.4f, Value Loss : %.4f, Policy Entropy : %.4f, Final Reward : %.4f' %
                    (update, policy_loss, value_loss, policy_entropy, final_rewards.mean()))

        if update % SAVE_INTERVAL == 0:
            print('Saving model')
            actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

    actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
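# The squeeze/expand_dims pair above converts the (nc, nw, nh) channels-first
# observations returned by the envs into the (nw, nh, 1) channels-last layout
# the CNN policy expects. A small self-contained numpy check of that reshuffle;
# the concrete sizes below are illustrative only, assuming nc == 1:
import numpy as np

obs_cf = np.zeros((16, 1, 19, 19), dtype=np.float32)   # (N_ENVS, nc, nw, nh)
obs_cl = np.expand_dims(np.squeeze(obs_cf, axis=1), axis=3)
assert obs_cl.shape == (16, 19, 19, 1)                  # (N_ENVS, nw, nh, nc)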
        self.image_loss = image_loss
        self.target_states = target_states
        self.target_rewards = target_rewards
        self.opt = opt


if __name__ == '__main__':
    envs = [make_env() for i in range(N_ENVS)]
    envs = SubprocVecEnv(envs)

    ob_space = envs.observation_space.shape
    ac_space = envs.action_space
    num_actions = envs.action_space.n

    with tf.Session() as sess:
        actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, CnnPolicy, should_summary=False)
        actor_critic.load(A2C_WEIGHTS)

        with tf.variable_scope('env_model'):
            env_model = create_env_model(ob_space, num_actions, num_pixels, len(mode_rewards[REWARD_MODE]))

        summary_op = tf.summary.merge_all()
        sess.run(tf.global_variables_initializer())

        losses = []
        all_rewards = []

        width = ob_space[0]
        height = ob_space[1]
        depth = ob_space[2]
def train(policy, save_name, s_alpha, load_count=0, summarize=True, load_path=None, log_path='./logs', safety=True):
    envs = make_env()()  # for i in range(N_ENVS)]
    # envs = SubprocVecEnv(envs)

    with open("./unsafe_state_count_{}.txt".format(safety), "w+") as f:
        pass

    ob_space = envs.observation_space.shape
    nc, nw, nh = ob_space
    ac_space = envs.action_space

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor_critic = get_actor_critic(sess, N_ENVS, N_STEPS, ob_space, ac_space, policy, summarize)
    if load_path is not None:
        actor_critic.load(load_path)
        print('Loaded a2c')

    summary_op = tf.summary.merge_all()
    writer = tf.summary.FileWriter(log_path, graph=sess.graph)

    sess.run(tf.global_variables_initializer())

    batch_ob_shape = (N_ENVS * N_STEPS, nw, nh, nc)

    dones = False  # for _ in range(N_ENVS)]
    nbatch = N_STEPS  # * N_ENVS

    episode_rewards = np.zeros((1, ))
    final_rewards = np.zeros((1, ))
    last_rews = [0] * REW_HIST

    # Safety part
    obs = envs.reset()
    ob_np = obs.reshape(nc, nw, nh)

    base_state = copy.deepcopy(ob_np).reshape(nc, nw, nh)
    base_state[np.where(base_state == 2.0)] = 1.0
    print(base_state)

    base_tree = generate_tree(sess, ob_np)

    for update in tqdm(range(load_count + 1, TOTAL_TIMESTEPS + 1)):
        # mb stands for mini batch
        unsafe_state_count = 0
        tree = copy.deepcopy(base_tree)
        mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [], [], [], [], []
        for n in range(N_STEPS):
            ob_np = obs.reshape(nc, nw, nh)

            unsafe_state = ob_np.reshape(nw, nh)
            x, y = np.where(unsafe_state == 4.0)
            if (x == 3 and y == 2):
                unsafe_state_count += 1

            if (update % LOG_INTERVAL == 0 and DEBUG == True):
                print("-- State ---")
                print(ob_np)
                print("-- Imagined State --")
                print(tree.imagined_state.reshape(nc, nw, nh))

            ac_ob = ob_np.reshape(1, nw, nh, nc)
            actions, values, _ = actor_critic.act(ac_ob)
            if (safety):
                actions = a2c_safe_action(tree, actions, base_state, actor_critic)

            mb_obs.append(ob_np)
            mb_actions.append(actions[0])
            mb_values.append(values)
            mb_dones.append(dones)

            if (update % LOG_INTERVAL == 0 and DEBUG == True):
                print("Action : ", CONTROLS[actions[0]], " - Safety : ", safety, " - Done : ", dones)
                _ = input("")

            obs, rewards, dones, _ = envs.step(actions[0])
            ob_np = ob_np.reshape(nc, nw, nh)
            tree = get_node(base_tree, ob_np)

            # rewards = [rewards[i] - s_alpha * (1 - safe[i]) for i in range(len(rewards))]
            episode_rewards += rewards
            masks = 1 - int(dones)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            mb_rewards.append(rewards)

        with open("./unsafe_state_count_{}.txt".format(safety), "a+") as f:
            f.write("{}\n".format(unsafe_state_count))
        unsafe_state_count = 0

        mb_dones.append(dones)

        obs = envs.reset()
        tree = copy.deepcopy(base_tree)

        # batch of steps to batch of rollouts
        mb_obs = np.asarray(mb_obs, dtype=np.float32).reshape(batch_ob_shape)  # .swapaxes(1, 0).reshape(batch_ob_shape)
        mb_rewards = np.asarray(mb_rewards, dtype=np.float32)  # .swapaxes(1, 0)
        mb_actions = np.asarray(mb_actions, dtype=np.int32)  # .swapaxes(1, 0)
        mb_values = np.asarray(mb_values, dtype=np.float32)  # .swapaxes(1, 0)
        mb_dones = np.asarray(mb_dones, dtype=np.bool)  # .swapaxes(1, 0)
        mb_masks = mb_dones[:-1]
        mb_dones = mb_dones[1:]

        ac_ob = ob_np.reshape(1, nw, nh, nc)
        last_values = actor_critic.critique(ac_ob).tolist()

        # discount/bootstrap off value fn
        # for n, (rewards, value) in enumerate(zip(mb_rewards, last_values)):
        rewards = mb_rewards.tolist()
        d = mb_dones.tolist()
        value = last_values
        if d[-1] == 0:
            rewards = discount_with_dones(rewards + value, d + [0], GAMMA)[:-1]
        else:
            rewards = discount_with_dones(rewards, d, GAMMA)
        mb_rewards = np.array(rewards)

        mb_rewards = mb_rewards.flatten()
        mb_actions = mb_actions.flatten()
        mb_values = mb_values.flatten()
        mb_masks = mb_masks.flatten()

        if summarize:
            loss, policy_loss, value_loss, policy_entropy, _, summary = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update, summary_op)
            writer.add_summary(summary, update)
        else:
            loss, policy_loss, value_loss, policy_entropy, _ = actor_critic.train(
                    mb_obs, mb_rewards, mb_masks, mb_actions, mb_values, update)

        if update % LOG_INTERVAL == 0 or update == 1:
            print('%i - %.1f => Policy Loss : %.4f, Value Loss : %.4f, Policy Entropy : %.4f, Final Reward : %.4f' %
                    (update, s_alpha, policy_loss, value_loss, policy_entropy, final_rewards.mean()))
            if (EARLY_STOPPING and update != 1 and
                    abs(final_rewards.mean() - statistics.mean(last_rews)) < EARLY_STOP_THRESH):
                print('Training done - Saving model')
                actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')
                with open("./logs_alpha.txt", "a+") as f:
                    f.write("{:.1f} - {:.4f}\n".format(s_alpha, max(last_rews)))
                break
            _ = last_rews.pop(0)
            last_rews.append(final_rewards.mean())

        if update % SAVE_INTERVAL == 0:
            print('Saving model')
            actor_critic.save(SAVE_PATH, save_name + '_' + str(update) + '.ckpt')

    actor_critic.save(SAVE_PATH, save_name + '_done.ckpt')
env = GridworldEnv("side_effects_sokoban")
done = False
states = env.reset()

num_actions = ac_space.n
nc, nw, nh = ob_space

print('Observation space ', ob_space)
print('Number of actions ', num_actions)

steps = 0

with tf.Session() as sess:
    with tf.variable_scope('actor'):
        actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space, CnnPolicy, should_summary=False)
    actor_critic.load('weights/a2c_3600.ckpt')

    with tf.variable_scope('env_model'):
        env_model = create_env_model(ob_space, num_actions, _NUM_PIXELS, len(sokoban_rewards), should_summary=False)

    save_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='env_model')
    loader = tf.train.Saver(var_list=save_vars)
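    # A minimal usage sketch for the scoped Saver built above: restore the
    # env-model weights from a checkpoint before rolling out imagined states.
    # The checkpoint filename is an assumption for illustration.
    loader.restore(sess, 'weights/env_model.ckpt')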
# os.environ["CUDA_VISIBLE_DEVICES"] = "1"
nenvs = 1
nsteps = 5

done = False

env = MiniPacman('regular', 1000)
ob_space = env.observation_space.shape
nw, nh, nc = ob_space
ac_space = env.action_space

states = env.reset()

with tf.Session() as sess:
    actor_critic = get_actor_critic(sess, nenvs, nsteps, ob_space, ac_space, CnnPolicy, False)
    actor_critic.load('./weights/a2c_100000.ckpt')

    total_reward = 0
    while not done:
        states = np.expand_dims(states, 0)
        actions, values, _ = actor_critic.act(states)
        states, reward, done, _ = env.step(actions[0])
        total_reward += reward

    print('total reward', total_reward)
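# The loop above scores a single episode. A small extension, using only the
# same calls already shown (env.reset, actor_critic.act, env.step), that
# averages the policy's return over several episodes; the episode count is
# arbitrary and the helper name is chosen here for illustration.
def evaluate(env, actor_critic, episodes=10):
    returns = []
    for _ in range(episodes):
        states, done, total = env.reset(), False, 0.0
        while not done:
            actions, values, _ = actor_critic.act(np.expand_dims(states, 0))
            states, reward, done, _ = env.step(actions[0])
            total += reward
        returns.append(total)
    return sum(returns) / len(returns)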