def test_set_writer(self):
    """
    Check that when using an EventFileWriter from a FileWriter,
    the resulting events file contains events from both the FileWriter
    and easy_tf_log.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)

        writer = tf.summary.FileWriter('logs')
        var = tf.Variable(0.0)
        summary_op = tf.summary.scalar('tf_var', var)
        sess = tf.Session()
        sess.run(var.initializer)
        summary = sess.run(summary_op)
        writer.add_summary(summary)

        easy_tf_log.set_writer(writer.event_writer)
        easy_tf_log.tflog('easy-tf-log_var', 0)

        self.assertEqual(os.listdir(), ['logs'])
        event_filename = osp.join('logs', os.listdir('logs')[0])
        self.assertIn('events.out.tfevents', event_filename)
        tags = set()
        for event in tf.train.summary_iterator(event_filename):
            for value in event.summary.value:
                tags.add(value.tag)
        self.assertIn('tf_var', tags)
        self.assertIn('easy-tf-log_var', tags)
def update(self, *args, **kwargs):
    episode_length = len(self.state_set)
    discounted_rewards = self.discount_rewards(self.reward_set)
    discounted_rewards -= np.mean(discounted_rewards)
    discounted_rewards /= np.std(discounted_rewards) + 0.0000001

    update_inputs = np.zeros((episode_length, self.config.config_dict['STATE_SPACE'][0]))
    advantages = np.zeros((episode_length, self.action_size))

    for i in range(episode_length):
        update_inputs[i] = self.state_set[i]
        advantages[i][self.action_set[i]] = discounted_rewards[i]

    average_loss = 0.0
    for i in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
        re, _ = self.sess.run(fetches=[self.loss, self.optimize_op],
                              feed_dict={
                                  self.state_input: update_inputs,
                                  self.advantages: advantages
                              })
        average_loss += np.sum(re)
    average_loss /= self.config.config_dict['ITERATION_EVER_EPOCH']

    self.log_queue.put({self.name + '_LOSS': average_loss})
    easy_tf_log.tflog(key=self.name + 'TRAIN_LOSS', value=average_loss)

    self.state_set, self.action_set, self.reward_set = [], [], []
    self.print_log_queue(status=self.status_key['TRAIN'])
def update(self):
    average_loss = 0.0
    for i in range(self.config.config_dict['ITERATION_EVER_EPOCH']):
        # print("memory length=", self.memory.)
        if self.memory.observations0.length < self.config.config_dict['BATCH_SIZE']:
            return
        batch_data = self.memory.sample(batch_size=self.config.config_dict['BATCH_SIZE'])
        target_q_value_list = []
        for state in batch_data['obs1']:
            _, target_q_value = self.predict_target(sess=self.sess, new_obs=state)
            target_q_value_list.append(target_q_value)
        re = self.sess.run(fetches=[self.loss, self.optimize],
                           feed_dict={
                               self.reward_input: batch_data['rewards'],
                               self.action_input: batch_data['actions'],
                               self.state_input: batch_data['obs0'],
                               self.done_input: batch_data['terminals1'],
                               self.target_q_input: target_q_value_list
                           })
        average_loss += re[0]
    average_loss /= self.config.config_dict['ITERATION_EVER_EPOCH']

    self.log_queue.put({self.name + '_LOSS': average_loss})
    easy_tf_log.tflog(key=self.name + 'TRAIN_LOSS', value=average_loss)

    # TODO POLICY FOR UPDATE DQN TARGET
    self.sess.run(self.update_target_q_op)
def test_full(self):
    """
    Log a few values and check that the event file contains the expected
    values.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        for i in range(10):
            easy_tf_log.tflog('foo', i)
        for i in range(10):
            easy_tf_log.tflog('bar', i)

        event_filename = osp.join('logs', os.listdir('logs')[0])
        event_n = 0
        for event in tf.train.summary_iterator(event_filename):
            if event_n == 0:  # metadata
                event_n += 1
                continue
            if event_n <= 10:
                self.assertEqual(event.step, event_n - 1)
                self.assertEqual(event.summary.value[0].tag, "foo")
                self.assertEqual(event.summary.value[0].simple_value,
                                 float(event_n - 1))
            if event_n > 10 and event_n <= 20:
                self.assertEqual(event.step, event_n - 10 - 1)
                self.assertEqual(event.summary.value[0].tag, "bar")
                self.assertEqual(event.summary.value[0].simple_value,
                                 float(event_n - 10 - 1))
            event_n += 1
def compute_grad(self):
    batch = self.ddpg_model.memory.sample(batch_size=self.ddpg_model.batch_size)
    if self.ddpg_model.normalize_returns and self.ddpg_model.enable_popart:
        old_mean, old_std, target_Q = self.ddpg_model.sess.run(
            [self.ddpg_model.ret_rms.mean,
             self.ddpg_model.ret_rms.std,
             self.ddpg_model.target_Q],
            feed_dict={
                self.ddpg_model.obs1: batch['obs1'],
                self.ddpg_model.rewards: batch['rewards'],
                self.ddpg_model.terminals1: batch['terminals1'].astype('float32'),
            })
        self.ddpg_model.ret_rms.update(target_Q.flatten())
        self.ddpg_model.sess.run(self.ddpg_model.renormalize_Q_outputs_op,
                                 feed_dict={
                                     self.ddpg_model.old_std: np.array([old_std]),
                                     self.ddpg_model.old_mean: np.array([old_mean]),
                                 })
    else:
        target_Q = self.ddpg_model.sess.run(
            self.ddpg_model.target_Q,
            feed_dict={
                self.ddpg_model.obs1: batch['obs1'],
                self.ddpg_model.rewards: batch['rewards'],
                self.ddpg_model.terminals1: batch['terminals1'].astype('float32'),
            })

    # Get all gradients and perform a synced update.
    ops = [self.ddpg_model.actor_grads, self.ddpg_model.critic_grads]
    actor_grads, critic_grads = self.ddpg_model.sess.run(
        ops,
        feed_dict={
            self.ddpg_model.obs0: batch['obs0'],
            self.ddpg_model.actions: batch['actions'],
            self.ddpg_model.critic_target: target_Q,
        })

    # L2 norms of the actor and critic gradients
    actor_grads_norm = np.sqrt(np.sum(actor_grads ** 2))
    critic_grads_norm = np.sqrt(np.sum(critic_grads ** 2))
    easy_tf_log.tflog(key=self.name + '_' + self.current_env_status + '_ACTOR_GRADS_2_NORM',
                      value=actor_grads_norm)
    easy_tf_log.tflog(key=self.name + '_' + self.current_env_status + '_CRITIC_GRADS_2_NORM',
                      value=critic_grads_norm)
def test_no_setup(self):
    """
    Test that if tflog() is used without any extra setup, a directory
    'logs' is created in the current directory containing the event file.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        easy_tf_log.tflog('var', 0)
        self.assertEqual(os.listdir(), ['logs'])
        self.assertIn('events.out.tfevents', os.listdir('logs')[0])
def test_set_dir(self):
    """
    Confirm that set_dir works.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        easy_tf_log.set_dir('logs2')
        easy_tf_log.tflog('var', 0)
        self.assertEqual(os.listdir(), ['logs2'])
        self.assertIn('events.out.tfevents', os.listdir('logs2')[0])
def run_manager(worker_threads, sess, lr, step_counter, update_counter,
                log_dir, saver, wake_interval_seconds, ckpt_interval_seconds):
    checkpoint_file = osp.join(log_dir, 'checkpoints', 'network.ckpt')

    ckpt_timer = utils.Timer(duration_seconds=ckpt_interval_seconds)
    ckpt_timer.reset()

    step_rate = utils.RateMeasure()
    step_rate.reset(int(step_counter))

    while True:
        time.sleep(wake_interval_seconds)

        steps_per_second = step_rate.measure(int(step_counter))
        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', int(step_counter))
        easy_tf_log.tflog('misc/updates', int(update_counter))
        easy_tf_log.tflog('misc/lr', sess.run(lr))

        alive = [t.is_alive() for t in worker_threads]

        if ckpt_timer.done() or not any(alive):
            saver.save(sess, checkpoint_file, int(step_counter))
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

        if not any(alive):
            break
def step(self, action):
    if self.episode_done:
        raise Exception("Attempted to call step() after episode done")

    obs, reward, done, info = self.env.step(action)

    self.episode_rewards.append(reward)
    self.episode_length_steps += 1
    if done:
        reward_sum = sum(self.episode_rewards)
        print("{}Episode {} finished; reward sum {}".format(
            self.log_prefix, self.episode_n, reward_sum))
        if self.log_dir is not None:
            tflog('rl/episode_reward_sum', reward_sum)
            tflog('rl/episode_length_steps', self.episode_length_steps)
        self.episode_done = True

    return obs, reward, done, info
def train(self, prefs_train, prefs_val, val_interval):
    """
    Train all ensemble members for one epoch.
    """
    start_steps = self.n_steps
    start_time = time.time()

    for ind, batch in enumerate(batch_iter(prefs_train.prefs,
                                           batch_size=32,
                                           shuffle=True)):
        self.train_step(batch, prefs_train)
        self.n_steps += 1

        if self.n_steps and self.n_steps % val_interval == 0:
            self.val_step(prefs_val)

    end_time = time.time()
    end_steps = self.n_steps
    rate = (end_steps - start_steps) / (end_time - start_time)
    easy_tf_log.tflog('reward_predictor_training_steps_per_second', rate)
def recv_prefs(self, pref_pipe):
    n_recvd = 0
    while not self.stop_recv:
        try:
            s1, s2, pref = pref_pipe.get(block=True, timeout=1)
            logging.debug("Pref DB got segment pair plus preferences from pref pipe")
        except queue.Empty:
            logging.debug("Pref DB got no segments")
            continue
        n_recvd += 1

        val_fraction = self.val_db.maxlen / (self.val_db.maxlen +
                                             self.train_db.maxlen)

        self.lock.acquire(blocking=True)
        if np.random.rand() < val_fraction:
            self.val_db.append(s1, s2, pref)
            easy_tf_log.tflog('val_db_len', len(self.val_db))
        else:
            self.train_db.append(s1, s2, pref)
            easy_tf_log.tflog('train_db_len', len(self.train_db))
        self.lock.release()

        easy_tf_log.tflog('n_prefs_recvd', n_recvd)
def update(self):
    self.update_count += 1
    if self.update_count % 50 == 0:
        self.ddpg_model.adapt_param_noise()
    # TODO CHECK THIS API
    critic_loss, actor_loss = self.ddpg_model.train()
    self.ddpg_model.update_target_net()

    self.log_queue.put({
        self.name + '_ACTOR': actor_loss,
        self.name + '_CRITIC': critic_loss
    })
    easy_tf_log.tflog(key=self.name + '_' + self.current_env_status + '_ACTOR_TRAIN_LOSS',
                      value=actor_loss)
    easy_tf_log.tflog(key=self.name + '_' + self.current_env_status + '_CRITIC_TRAIN_LOSS',
                      value=critic_loss)
    self.compute_grad()
    return {
        'VALUE_FUNCTION_LOSS': critic_loss,
        'CONTROLLER_LOSS': actor_loss
    }
def predict(self, state, *args, **kwargs):
    state = np.reshape(state, [-1])
    count = self._real_env_sample_count
    eps = 1.0 - (self.config.config_dict['EPS'] -
                 self.config.config_dict['EPS_GREEDY_FINAL_VALUE']) * \
          (count / self.config.config_dict['EPS_ZERO_FLAG'])
    if eps < 0:
        eps = 0.0
    rand_eps = np.random.rand(1)
    if self.config.config_dict['EPS_GREEDY_FLAG'] == 1 and rand_eps < eps \
            and self.status == self.status_key['TRAIN']:
        res = self.env.action_space.sample()
    else:
        res = np.array(self.model.predict(state))
    if self.config.config_dict['NOISE_FLAG'] > 0 \
            and self.status == self.status_key['TRAIN']:
        res, noise = noise_adder(action=res, agent=self)
        for i in range(len(noise)):
            easy_tf_log.tflog(key=self.name + '_ACTION_NOISE_DIM_' + str(i),
                              value=noise[i])
    return np.reshape(res, [-1])
def test_explicit_step(self):
    """
    Log a few values explicitly setting the step number.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        os.chdir(temp_dir)
        for i in range(5):
            easy_tf_log.tflog('foo', i, step=(10 * i))
        # These ones should continue from where the previous ones left off
        for i in range(5):
            easy_tf_log.tflog('foo', i)

        event_filename = osp.join('logs', os.listdir('logs')[0])
        event_n = 0
        for event in tf.train.summary_iterator(event_filename):
            if event_n == 0:  # metadata
                event_n += 1
                continue
            if event_n <= 5:
                self.assertEqual(event.step, 10 * (event_n - 1))
            if event_n > 5 and event_n <= 10:
                self.assertEqual(event.step, 40 + (event_n - 5))
            event_n += 1
def recv_segments(self, seg_pipe):
    """
    Receive segments from `seg_pipe` into circular buffer `segments`.
    """
    max_wait_seconds = 0.5
    start_time = time.time()
    n_recvd = 0
    while time.time() - start_time < max_wait_seconds:
        try:
            segment = seg_pipe.get(block=True, timeout=max_wait_seconds)
        except queue.Empty:
            return
        if len(self.segments) < self.max_segs:
            self.segments.append(segment)
        else:
            self.segments[self.seg_idx] = segment
            self.seg_idx = (self.seg_idx + 1) % self.max_segs
        n_recvd += 1
        easy_tf_log.tflog('segment_idx', self.seg_idx)
        easy_tf_log.tflog('n_segments_rcvd', n_recvd)
        easy_tf_log.tflog('n_segments', len(self.segments))
def recv_prefs(self, pref_pipe):
    n_recvd = 0
    while not self.stop_recv:
        try:
            s1, s2, pref = pref_pipe.get(block=True, timeout=1)
        except queue.Empty:
            continue
        n_recvd += 1

        val_fraction = self.val_db.maxlen / (self.val_db.maxlen +
                                             self.train_db.maxlen)

        self.lock.acquire(blocking=True)
        if np.random.rand() < val_fraction:
            self.val_db.append(s1, s2, pref)
            easy_tf_log.tflog('val_db_len', len(self.val_db))
        else:
            self.train_db.append(s1, s2, pref)
            easy_tf_log.tflog('train_db_len', len(self.train_db))
        self.lock.release()

        easy_tf_log.tflog('n_prefs_recvd', n_recvd)
#%% Tensorflow / Keras

# For specifying the device to use
with tf.device('/gpu:0'):
    pass

# Adding a new axis to an array
x_train = train[..., tf.newaxis]

# TensorBoard setup
logdir = "logs/" + datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = keras.callbacks.TensorBoard(log_dir=logdir)
callbacks = [tensorboard_callback]  # in model.fit()

# easy_tf_log: log scalars to TensorBoard
etl.set_dir('logs2')
for k in range(20, 30):
    etl.tflog('baz', k)

# To start TensorBoard, run this in the terminal:
#   tensorboard --logdir path/to/log/dir

# Plot graphs
def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_' + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, 'val_' + string])
    plt.show()

# Class for displaying progress at the end of an epoch
class DisplayCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        pass
# end of an episode
if done:
    print('At the end of episode', episode_nb, 'the total reward was :', reward_sum)

    # increment episode number
    episode_nb += 1

    # training
    model.fit(x=np.vstack(x_train), y=np.vstack(y_train), verbose=1,
              callbacks=[tbCallBack],
              sample_weight=discount_rewards(rewards, gamma))

    # Saving the weights used by our model
    if episode_nb % epochs_before_saving == 0:
        model.save_weights('my_model_weights' +
                           datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')

    # Log the reward
    running_reward = reward_sum if running_reward is None \
        else running_reward * 0.99 + reward_sum * 0.01
    tflog('running_reward', running_reward, custom_dir=log_dir)

    # Reinitialization
    x_train, y_train, rewards = [], [], []
    observation = env.reset()
    reward_sum = 0
    prev_input = None
def sample(self):
    self.step += 1
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    action_n = []
    for agent, current_observation in zip(self.agents, self._current_observation_n):
        action, _ = agent.policy.get_action(self._current_observation_n)
        # print(action)
        if agent.joint_policy:
            action_n.append(np.array(action)[0:agent._action_dim])
        else:
            action_n.append(np.array(action))

    try:
        action_n = np.asarray(action_n)
        # action_n = .5 * np.ones_like(action_n)
        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        self.step_act_dict[self.step] = action_n
        self.step_rew_dict[self.step] = reward_n
        print(reward_n)
    except:
        import pdb; pdb.set_trace()

    if self.global_reward:
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._path_return += np.array(reward_n[0], dtype=np.float32)
    self._total_samples += 1

    for i, agent in enumerate(self.agents):
        action = deepcopy(action_n[i])
        if agent.pool.joint:
            # opponent_action = deepcopy(action_n)
            # opponent_action = np.delete(opponent_action, i, 0)
            # opponent_action = np.array(opponent_action).flatten()
            agent.pool.add_sample(observation=self._current_observation_n[i],
                                  action=action,
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n[i])
        else:
            agent.pool.add_sample(observation=self._current_observation_n[i],
                                  action=action,
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n[i])

    self._current_observation_n = next_observation_n

    for i, rew in enumerate(reward_n):
        self.episode_rewards[-1] += rew
        self.agent_rewards[-1] += rew

    if self.step % (25 * 1000) == 0:
        print("steps: {}, episodes: {}, mean episode reward: {}".format(
            self.step, len(self.episode_rewards), np.mean(self.episode_rewards[-1000:])))

    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return

        self.episode_rewards.append(0)
        # import pdb; pdb.set_trace()
        self.agent_rewards.append(0)
        # a.append(0)

        self._path_length = 0
        self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
        self._n_episodes += 1
        # self.log_diagnostics()
        # logger.dump_tabular(with_prefix=False)
        tflog('mean-return', self._mean_path_return[0])
    else:
        self._current_observation_n = next_observation_n
# Sample a minibatch from memory
if t_steps % train_every == 0:
    samples = random.sample(replay_memory, batch_size * train_every)
    states_batch, action_batch, reward_batch, next_states_batch, done_batch = \
        map(np.array, zip(*samples))

    # Compute target
    # q_values_next = target_estimator.predict(next_states_batch)
    q_values_next = target_estimator.predict(next_states_batch)
    targets = reward_batch + (1 - done_batch) * discount_factor * np.amax(q_values_next, axis=1)

    # Update estimator weights
    target_f = q_estimator.predict(states_batch)
    for i, action in enumerate(action_batch):
        target_f[i, action] = targets[i]
    loss = q_estimator.train_on_batch(states_batch, target_f)
    eps_loss += loss

if done:
    break

obs = new_obs
t_steps += 1

tflog('running_reward', eps_reward, custom_dir=log_dir)
tflog('eps_length', t, custom_dir=log_dir)
tflog('epsilon', epsilon, custom_dir=log_dir)
tflog('loss', eps_loss, custom_dir=log_dir)
def f(queue):
    easy_tf_log.tflog('foo', 0)
    queue.put(True)
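# The helper above appears to be written to run in a child process so a test
# can check that logging still works after a fork. A minimal sketch of how it
# might be driven, assuming Python's standard multiprocessing module (the
# surrounding test harness is not shown in the snippet):

import multiprocessing

import easy_tf_log


def f(queue):
    easy_tf_log.tflog('foo', 0)
    queue.put(True)


if __name__ == '__main__':
    # Run the logging call in a separate process and wait for it to report back.
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=f, args=(q,), daemon=True)
    p.start()
    # get() raises queue.Empty if the child never logged and signalled completion.
    assert q.get(timeout=10)
    p.join()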
def sample(self):
    self.step += 1
    if self._current_observation_n is None:
        self._current_observation_n = self.env.reset()

    action_n = self.agents[0].policy.get_actions(self._current_observation_n)
    if self._do_nego > 0:
        action_n = self.agents[0].nego_policy.get_actions(self._current_observation_n, action_n)

    try:
        action_n = np.asarray(action_n).reshape(-1)
        # action_n = .5 * np.ones_like(action_n)
        next_observation_n, reward_n, done_n, info = self.env.step(action_n)
        print(reward_n)
    except:
        import pdb; pdb.set_trace()

    if self.global_reward:
        reward_n = np.array([np.sum(reward_n)] * self.agent_num)

    self._path_length += 1
    self._path_return += np.array(reward_n, dtype=np.float32)
    self._total_samples += 1

    for i, agent in enumerate(self.agents):
        action = deepcopy(action_n[i])
        if agent.pool.joint:
            agent.pool.add_sample(observation=self._current_observation_n.reshape(-1),
                                  action=action.reshape(-1),
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n.reshape(-1))
        else:
            agent.pool.add_sample(observation=self._current_observation_n.reshape(-1),
                                  action=action.reshape(-1),
                                  reward=reward_n[i],
                                  terminal=done_n[i],
                                  next_observation=next_observation_n.reshape(-1))

    self._current_observation_n = next_observation_n

    for i, rew in enumerate(reward_n):
        self.episode_rewards[-1] += rew
        self.agent_rewards[i][-1] += rew

    if self.step % (25 * 1000) == 0:
        print("steps: {}, episodes: {}, mean episode reward: {}".format(
            self.step, len(self.episode_rewards), np.mean(self.episode_rewards[-1000:])))

    if np.all(done_n) or self._path_length >= self._max_path_length:
        self._current_observation_n = self.env.reset()
        self._max_path_return = np.maximum(self._max_path_return, self._path_return)
        self._mean_path_return = self._path_return / self._path_length
        self._last_path_return = self._path_return

        self.episode_rewards.append(0)
        for a in self.agent_rewards:
            a.append(0)

        self._path_length = 0
        self._path_return = np.array([0.] * self.agent_num, dtype=np.float32)
        self._n_episodes += 1
        tflog('mean-return', self._mean_path_return[0])
        # self.log_diagnostics()
        # logger.dump_tabular(with_prefix=False)
    else:
        self._current_observation_n = next_observation_n
def objective(arglist):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
    sess = tf.Session(config=config)
    set_session(sess)

    game_name = arglist.game_name
    # 'abs', 'one'
    reward_type = arglist.reward_type
    p = arglist.p
    agent_num = arglist.n
    u_range = 1.
    k = 0
    print(arglist.aux, 'arglist.aux')
    model_names_setting = arglist.model_names_setting.split('_')
    model_names = model_names_setting
    model_name = '_'.join(model_names)
    path_prefix = game_name
    if game_name == 'pbeauty':
        env = PBeautyGame(agent_num=agent_num, reward_type=reward_type, p=p)
        path_prefix = game_name + '-' + reward_type + '-' + str(p)
    elif 'matrix' in game_name:
        matrix_game_name = game_name.split('-')[-1]
        repeated = arglist.repeat
        max_step = arglist.max_path_length
        memory = arglist.memory
        env = MatrixGame(game=matrix_game_name, agent_num=agent_num,
                         action_num=2, repeated=repeated,
                         max_step=max_step, memory=memory,
                         discrete_action=False, tuple_obs=False)
        path_prefix = '{}-{}-{}-{}'.format(game_name, repeated, max_step, memory)
    elif 'diff' in game_name:
        diff_game_name = game_name.split('-')[-1]
        agent_num = 3
        s2 = arglist.s2
        x2 = arglist.x2
        y2 = arglist.y2
        con = arglist.con
        env = DifferentialGame(diff_game_name, agent_num, x2, y2, s2, con)
    elif 'particle' in game_name:
        particle_game_name = game_name.split('-')[-1]
        env, agent_num, model_name, model_names = get_particle_game(particle_game_name, arglist)

    now = datetime.datetime.now()
    timestamp = now.strftime('%Y-%m-%d %H:%M:%S.%f %Z')
    if 'CG' in model_name:
        model_name = model_name + '-{}'.format(arglist.mu)
    if not arglist.aux:
        model_name = model_name + '-{}'.format(arglist.aux)

    suffix = '{}/{}/{}/{}'.format(path_prefix, agent_num, model_name, timestamp)
    print(suffix)
    # logger.add_tabular_output('./log/{}.csv'.format(suffix))
    # snapshot_dir = './snapshot/{}'.format(suffix)
    # policy_dir = './policy/{}'.format(suffix)
    # os.makedirs(snapshot_dir, exist_ok=True)
    # os.makedirs(policy_dir, exist_ok=True)
    # logger.set_snapshot_dir(snapshot_dir)

    agents = []
    M = arglist.hidden_size
    batch_size = arglist.batch_size
    sampler = MASampler(agent_num=agent_num, joint=True,
                        global_reward=arglist.global_reward,
                        max_path_length=25, min_pool_size=100,
                        batch_size=batch_size)

    base_kwargs = {
        'sampler': sampler,
        'epoch_length': 1,
        'n_epochs': arglist.max_steps,
        'n_train_repeat': 1,
        'eval_render': True,
        'eval_n_episodes': 10
    }

    _alpha = arglist.alpha
    lr = arglist.lr
    n_pars = arglist.n_pars
    result = 0.

    with U.single_threaded_session():
        for i, model_name in enumerate(model_names):
            if 'PR2AC' in model_name:
                k = int(model_name[-1])
                g = False
                mu = arglist.mu
                if 'G' in model_name:
                    g = True
                agent = pr2ac_agent(model_name, i, env, M, u_range, base_kwargs,
                                    lr=lr, n_pars=n_pars, k=k, g=g, mu=mu,
                                    game_name=game_name, aux=arglist.aux)
            elif model_name == 'MASQL':
                agent = masql_agent(model_name, i, env, M, u_range, base_kwargs,
                                    lr=lr, n_pars=n_pars, game_name=game_name)
            elif model_name == 'ROMMEO':
                agent = rom_agent(model_name, i, env, M, u_range, base_kwargs,
                                  game_name=game_name)
            else:
                if model_name == 'DDPG':
                    joint = False
                    opponent_modelling = False
                elif model_name == 'MADDPG':
                    joint = True
                    opponent_modelling = False
                elif model_name == 'DDPG-OM':
                    joint = True
                    opponent_modelling = True
                agent = ddpg_agent(joint, opponent_modelling, model_names, i, env,
                                   M, u_range, base_kwargs, lr=lr, game_name=game_name)

            agents.append(agent)

        sampler.initialize(env, agents)

        for agent in agents:
            agent._init_training()
        gt.rename_root('MARLAlgorithm')
        gt.reset()
        gt.set_def_unique(False)
        initial_exploration_done = False
        # noise = .1
        noise = .5

        for agent in agents:
            try:
                agent.policy.set_noise_level(noise)
            except:
                pass

        # alpha = .5
        for steps in gt.timed_for(range(base_kwargs['n_epochs'] + 1)):
            # import pdb; pdb.set_trace()
            # alpha = _alpha + np.exp(-0.1 * max(steps-10, 0)) * 500.
            if steps < base_kwargs['n_epochs'] // 3:
                # alpha = _alpha
                alpha = _alpha + np.exp(-0.1 * max(steps - 10, 0)) * 500.
            elif steps < base_kwargs['n_epochs'] // 2:
                alpha = _alpha / 10
            else:
                alpha = .3
            tflog('alpha', alpha)
            print('alpha', alpha)
            # if steps > 100 and steps < 150:
            #     alpha = .1 - 0.099 * steps / 150
            # elif steps >= 150:
            #     alpha = 1e-3
            print('alpha', alpha)
            # logger.push_prefix('Epoch #%d | ' % steps)
            if steps % (25 * 1000) == 0:
                print(suffix)

            for t in range(base_kwargs['epoch_length']):
                # TODO.code consolidation: Add control interval to sampler
                if not initial_exploration_done:
                    # if steps >= 1000:
                    if steps >= 10:
                        initial_exploration_done = True
                sampler.sample()
                if not initial_exploration_done:
                    continue
                gt.stamp('sample')
                print('Sample Done')

                if steps == 10000:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = 10.
                # if steps == 2000:
                if steps > base_kwargs['n_epochs'] / 10:
                    noise = 0.1
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                    # alpha = .1
                if steps > base_kwargs['n_epochs'] / 5:
                    noise = 0.05
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass
                if steps > base_kwargs['n_epochs'] / 6:
                    noise = 0.01
                    for agent in agents:
                        try:
                            agent.policy.set_noise_level(noise)
                        except:
                            pass

                if steps % arglist.training_interval != 0:
                    continue

                for j in range(base_kwargs['n_train_repeat']):
                    batch_n = []
                    recent_batch_n = []
                    indices = None
                    receent_indices = None
                    for i, agent in enumerate(agents):
                        if i == 0:
                            batch = agent.pool.random_batch(batch_size)
                            indices = agent.pool.indices
                            receent_indices = list(range(agent.pool._top - batch_size, agent.pool._top))
                        batch_n.append(agent.pool.random_batch_by_indices(indices))
                        recent_batch_n.append(agent.pool.random_batch_by_indices(receent_indices))
                    # print(len(batch_n))

                    target_next_actions_n = []
                    # try:
                    all_obs = np.array(np.concatenate([batch['observations'] for batch in batch_n], axis=-1))
                    all_next_obs = np.array(np.concatenate([batch['next_observations'] for batch in batch_n], axis=-1))
                    # print(all_obs[0])
                    for batch in batch_n:
                        # print('making all obs')
                        batch['all_observations'] = deepcopy(all_obs)
                        batch['all_next_observations'] = deepcopy(all_next_obs)

                    opponent_current_actions_n = []
                    for agent, batch in zip(agents, batch_n):
                        target_next_actions_n.append(agent.target_policy.get_actions(batch['next_observations']))
                        opponent_current_actions_n.append(agent.policy.get_actions(batch['observations']))

                    for i, agent in enumerate(agents):
                        batch_n[i]['opponent_current_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_current_actions_n), i, 0),
                            (-1, agent._opponent_action_dim))
                    # except:
                    #     pass

                    opponent_actions_n = np.array([batch['actions'] for batch in batch_n])
                    recent_opponent_actions_n = np.array([batch['actions'] for batch in recent_batch_n])

                    ####### figure out
                    recent_opponent_observations_n = []
                    for batch in recent_batch_n:
                        recent_opponent_observations_n.append(batch['observations'])

                    current_actions = [agents[i].policy.get_actions(batch_n[i]['next_observations'])[0][0]
                                       for i in range(agent_num)]
                    all_actions_k = []
                    for i, agent in enumerate(agents):
                        if isinstance(agent, MAVBAC):
                            if agent._k > 0:
                                batch_actions_k = agent.policy.get_all_actions(batch_n[i]['next_observations'])
                                actions_k = [a[0][0] for a in batch_actions_k]
                                all_actions_k.append(';'.join(list(map(str, actions_k))))
                    # if len(all_actions_k) > 0:
                    #     with open('{}/all_actions.csv'.format(policy_dir), 'a') as f:
                    #         f.write(','.join(list(map(str, all_actions_k))) + '\n')
                    # with open('{}/policy.csv'.format(policy_dir), 'a') as f:
                    #     f.write(','.join(list(map(str, current_actions))) + '\n')
                    # print('============')

                    for i, agent in enumerate(agents):
                        try:
                            batch_n[i]['next_actions'] = deepcopy(target_next_actions_n[i])
                        except:
                            pass
                        batch_n[i]['opponent_actions'] = np.reshape(
                            np.delete(deepcopy(opponent_actions_n), i, 0),
                            (-1, agent._opponent_action_dim))
                        if agent.joint:
                            if agent.opponent_modelling:
                                batch_n[i]['recent_opponent_observations'] = recent_opponent_observations_n[i]
                                batch_n[i]['recent_opponent_actions'] = np.reshape(
                                    np.delete(deepcopy(recent_opponent_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))
                                batch_n[i]['opponent_next_actions'] = agent.opponent_policy.get_actions(batch_n[i]['next_observations'])
                            else:
                                batch_n[i]['opponent_next_actions'] = np.reshape(
                                    np.delete(deepcopy(target_next_actions_n), i, 0),
                                    (-1, agent._opponent_action_dim))

                        if isinstance(agent, MAVBAC) or isinstance(agent, MASQL) or isinstance(agent, ROMMEO):
                            agent._do_training(iteration=t + steps * agent._epoch_length,
                                               batch=batch_n[i],
                                               annealing=alpha)
                        else:
                            agent._do_training(iteration=t + steps * agent._epoch_length,
                                               batch=batch_n[i])

                gt.stamp('train')

        result = sampler.terminate()
        clear_session()
        return result
def main():
    args, lr_args, log_dir, preprocess_wrapper, ckpt_timer = parse_args()
    easy_tf_log.set_dir(log_dir)

    utils.set_random_seeds(args.seed)
    sess = tf.Session()

    envs = make_envs(args.env_id, preprocess_wrapper, args.max_n_noops,
                     args.n_workers, args.seed, args.debug, log_dir)

    step_counter = utils.GraphCounter(sess)
    update_counter = utils.GraphCounter(sess)
    lr = make_lr(lr_args, step_counter.value)
    optimizer = make_optimizer(lr)

    networks = make_networks(n_workers=args.n_workers,
                             n_actions=envs[0].action_space.n,
                             weight_inits=args.weight_inits,
                             value_loss_coef=args.value_loss_coef,
                             entropy_bonus=args.entropy_bonus,
                             max_grad_norm=args.max_grad_norm,
                             optimizer=optimizer,
                             debug=args.debug)

    # Why save_relative_paths=True?
    # So that the plain-text 'checkpoint' file written uses relative paths,
    # which seems to be needed in order to avoid confusing saver.restore()
    # when restoring from FloydHub runs.
    global_vars = tf.trainable_variables('global')
    saver = tf.train.Saver(global_vars, max_to_keep=1, save_relative_paths=True)
    checkpoint_dir = osp.join(log_dir, 'checkpoints')
    os.makedirs(checkpoint_dir)
    checkpoint_file = osp.join(checkpoint_dir, 'network.ckpt')

    if args.load_ckpt:
        print("Restoring from checkpoint '%s'..." % args.load_ckpt,
              end='', flush=True)
        saver.restore(sess, args.load_ckpt)
        print("done!")
    else:
        sess.run(tf.global_variables_initializer())

    workers = make_workers(sess=sess,
                           envs=envs,
                           networks=networks,
                           n_workers=args.n_workers,
                           log_dir=log_dir)

    worker_threads = start_workers(n_steps=args.n_steps,
                                   steps_per_update=args.steps_per_update,
                                   step_counter=step_counter,
                                   update_counter=update_counter,
                                   workers=workers)
    ckpt_timer.reset()
    step_rate = utils.RateMeasure()
    step_rate.reset(int(step_counter))
    while True:
        time.sleep(args.wake_interval_seconds)

        steps_per_second = step_rate.measure(int(step_counter))
        easy_tf_log.tflog('misc/steps_per_second', steps_per_second)
        easy_tf_log.tflog('misc/steps', int(step_counter))
        easy_tf_log.tflog('misc/updates', int(update_counter))
        easy_tf_log.tflog('misc/lr', sess.run(lr))

        alive = [t.is_alive() for t in worker_threads]

        if ckpt_timer.done() or not any(alive):
            saver.save(sess, checkpoint_file, int(step_counter))
            print("Checkpoint saved to '{}'".format(checkpoint_file))
            ckpt_timer.reset()

        if not any(alive):
            break

    for env in envs:
        env.close()
#!/usr/bin/env python

import time

import easy_tf_log

# Logging using the global logger

# Will log to automatically-created 'logs' directory
for i in range(10):
    easy_tf_log.tflog('foo', i)
for j in range(10, 20):
    easy_tf_log.tflog('bar', j)

easy_tf_log.set_dir('logs2')
for k in range(20, 30):
    easy_tf_log.tflog('baz', k)

for l in range(5):
    easy_tf_log.tflog('qux', l, step=(10 * l))

# Logging using a Logger object

logger = easy_tf_log.Logger(log_dir='logs3')

for i in range(10):
    logger.log_key_value('quux', i)

logger.log_list_stats('quuz', [1, 2, 3, 4, 5])

logger.measure_rate('corge', 10)
time.sleep(1)
def run(self):
    nenvs = len(self.env.remotes)
    mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = \
        [], [], [], [], []
    mb_states = self.states

    # Run for nsteps steps in the environment
    for _ in range(self.nsteps):
        actions, values, states = self.model.step(self.obs, self.states,
                                                  self.dones)
        mb_obs.append(np.copy(self.obs))
        mb_actions.append(actions)
        mb_values.append(values)
        mb_dones.append(self.dones)
        # len({obs, rewards, dones}) == nenvs
        obs, rewards, dones, _ = self.env.step(actions)
        self.states = states
        self.dones = dones
        for n, done in enumerate(dones):
            if done:
                # SubprocVecEnv automatically resets when done
                self.obs[n] = self.obs[n] * 0
        self.update_obs(obs)
        mb_rewards.append(rewards)
    mb_dones.append(self.dones)

    # batch of steps to batch of rollouts
    # i.e. from nsteps, nenvs to nenvs, nsteps
    mb_obs = np.asarray(mb_obs, dtype=np.uint8).swapaxes(1, 0)
    mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0)
    mb_actions = np.asarray(mb_actions, dtype=np.int32).swapaxes(1, 0)
    mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0)
    mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0)
    mb_masks = mb_dones[:, :-1]
    # The first entry was just the init state of 'dones' (all False),
    # before we'd actually run any steps, so drop it.
    mb_dones = mb_dones[:, 1:]

    # Log original rewards
    for env_n, (rs, dones) in enumerate(zip(mb_rewards, mb_dones)):
        assert_equal(rs.shape, (self.nsteps, ))
        assert_equal(dones.shape, (self.nsteps, ))
        for step_n in range(self.nsteps):
            self.orig_reward[env_n] += rs[step_n]
            if dones[step_n]:
                easy_tf_log.tflog("orig_reward_{}".format(env_n),
                                  self.orig_reward[env_n])
                self.orig_reward[env_n] = 0

    if self.env.env_id == 'MovingDotNoFrameskip-v0':
        # For MovingDot, reward depends on both current observation and
        # current action, so encode action in the observations.
        # (We only need to set this in the most recent frame,
        # because that's all that the reward predictor for MovingDot
        # uses.)
        mb_obs[:, :, 0, 0, -1] = mb_actions[:, :]

    # Generate segments
    # (For MovingDot, this has to happen _after_ we've encoded the action
    # in the observations.)
    if self.gen_segments:
        self.update_segment_buffer(mb_obs, mb_rewards, mb_dones)

    # Replace rewards with those from reward predictor
    # (Note that this also needs to be done _after_ we've encoded the
    # action.)
    logging.debug("Original rewards:\n%s", mb_rewards)
    if self.reward_predictor:
        assert_equal(mb_obs.shape, (nenvs, self.nsteps, 84, 84, 4))
        mb_obs_allenvs = mb_obs.reshape(nenvs * self.nsteps, 84, 84, 4)

        rewards_allenvs = self.reward_predictor.reward(mb_obs_allenvs)
        assert_equal(rewards_allenvs.shape, (nenvs * self.nsteps, ))
        mb_rewards = rewards_allenvs.reshape(nenvs, self.nsteps)
        assert_equal(mb_rewards.shape, (nenvs, self.nsteps))

        logging.debug("Predicted rewards:\n%s", mb_rewards)

    # Save frames for episode rendering
    if self.episode_vid_queue is not None:
        self.update_episode_frame_buffer(mb_obs, mb_dones)

    # Discount rewards
    mb_obs = mb_obs.reshape(self.batch_ob_shape)
    last_values = self.model.value(self.obs, self.states,
                                   self.dones).tolist()
    # discount/bootstrap off value fn
    for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones,
                                                    last_values)):
        rewards = rewards.tolist()
        dones = dones.tolist()
        if dones[-1] == 0:
            # Make sure that the first iteration of the loop inside
            # discount_with_dones picks up 'value' as the initial
            # value of r
            rewards = discount_with_dones(rewards + [value],
                                          dones + [0],
                                          self.gamma)[:-1]
        else:
            rewards = discount_with_dones(rewards, dones, self.gamma)
        mb_rewards[n] = rewards

    mb_rewards = mb_rewards.flatten()
    mb_actions = mb_actions.flatten()
    mb_values = mb_values.flatten()
    mb_masks = mb_masks.flatten()
    return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values
def run(self):
    """Performs training.

    Trains a model using episodic training.
    Every so often, runs some evaluations on validation data.
    """
    train_data, valid_data = self.train_data, self.valid_data
    input_dim, output_dim = self.input_dim, self.output_dim
    rep_dim, episode_length = self.rep_dim, self.episode_length
    episode_width, memory_size = self.episode_width, self.memory_size
    batch_size = self.batch_size

    # create data generator
    birds_data = FewshotBirdsDataGenerator(self.batch_size,
                                           self.episode_length,
                                           self.episode_width,
                                           image_dim=input_dim)

    train_size = len(train_data)
    valid_size = len(valid_data)
    logging.info('train_size (number of labels) %d', train_size)
    logging.info('valid_size (number of labels) %d', valid_size)
    logging.info('input_dim %d', input_dim)
    logging.info('output_dim %d', output_dim)
    logging.info('rep_dim %d', rep_dim)
    logging.info('episode_length %d', episode_length)
    logging.info('episode_width %d', episode_width)
    logging.info('memory_size %d', memory_size)
    logging.info('batch_size %d', batch_size)

    assert all(len(v) >= float(episode_length) / episode_width
               for v in train_data.values())
    assert all(len(v) >= float(episode_length) / episode_width
               for v in valid_data.values())

    output_dim = episode_width
    self.model = self.get_model()
    self.model.setup()

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    saver = tf.train.Saver(max_to_keep=10)

    # for inception
    # ckpt = tf.train.get_checkpoint_state(INCEPTION_CKPT)
    print('use resnet:', FLAGS.use_resnet)
    ckpt = RESNET_CKPT if FLAGS.use_resnet else INCEPTION_CKPT
    scope = 'core/resnet_v2_50' if FLAGS.use_resnet else 'core/InceptionV3'
    incpt_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    incpt_vars = [v for v in incpt_vars if 'Adam' not in v.name]
    incpt_vars = [v for v in incpt_vars if 'BatchNorm' not in v.name]
    incpt_vars = {v.name.split('core/')[1][0:-2]: v for v in incpt_vars}
    assign_fn = tf.contrib.framework.assign_from_checkpoint_fn(
        ckpt, incpt_vars, ignore_missing_vars=True, reshape_variables=False)
    assign_fn(sess)

    ckpt = None
    if FLAGS.save_dir:
        ckpt = tf.train.get_checkpoint_state(FLAGS.save_dir)
    if ckpt and ckpt.model_checkpoint_path:
        logging.info('restoring from %s', ckpt.model_checkpoint_path)
        saver.restore(sess, ckpt.model_checkpoint_path)

    logging.info('starting now')
    losses = []
    random.seed(FLAGS.seed)
    np.random.seed(FLAGS.seed)

    # used for sampling data
    use_parts = FLAGS.num_parts > 1
    for i in xrange(FLAGS.num_episodes):
        # TODO: add parts
        x, p1, p2, y = birds_data.sample_episode_batch(
            birds_data.train_data, use_parts=use_parts)
        # outputs = self.model.episode_step_with_parts(sess, x, p1, p2, y, True, clear_memory=True)
        # TODO: this doesn't make sense
        if FLAGS.num_parts > 1:
            parts = [np.concatenate([pp1, pp2], axis=0)
                     for pp1, pp2 in zip(p1, p2)]
        else:
            parts = x
        # outputs = self.model.episode_step_with_parts(sess, parts, y, True, clear_memory=True)
        outputs = self.model.episode_step_n_parts(sess, parts, y, True,
                                                  clear_memory=True)
        # x, y = self.sample_episode_batch(
        #     train_data, episode_length, episode_width, batch_size)
        # outputs = self.model.episode_step(sess, x, y, clear_memory=True)
        # plot a histogram of the different labels
        loss = outputs
        losses.append(loss)

        if i % FLAGS.validation_frequency == 0:
            logging.info('episode batch %d, avg train loss %f',
                         i, np.mean(losses))
            tflog('loss', np.mean(losses))
            losses = []

            # validation
            correct = []
            num_shots = episode_length // episode_width
            correct_by_shot = dict((k, []) for k in xrange(num_shots))
            for _ in xrange(FLAGS.validation_length):
                # TODO: add parts
                # x, y = self.sample_episode_batch(
                #     valid_data, episode_length, episode_width, 1)
                x, p1, p2, y = birds_data.sample_episode_batch(
                    birds_data.val_data, use_parts=use_parts)
                if FLAGS.num_parts > 1:
                    parts = [np.concatenate([pp1, pp2], axis=0)
                             for pp1, pp2 in zip(p1, p2)]
                else:
                    parts = x
                # outputs = self.model.episode_predict_with_parts(
                #     sess, x, p1, p2, y, False, clear_memory=True)
                outputs = self.model.episode_predict_n_parts(
                    sess, parts, y, False, clear_memory=True)
                y_preds = outputs
                correct.append(self.compute_correct(np.array(y), y_preds))

                # compute per-shot accuracies
                seen_counts = [0] * episode_width
                # loop over episode steps
                for yy, yy_preds in zip(y, y_preds):
                    # loop over batch examples
                    yyy, yyy_preds = int(yy[0]), int(yy_preds[0])
                    count = seen_counts[yyy % episode_width]
                    if count in correct_by_shot:
                        correct_by_shot[count].append(
                            self.individual_compute_correct(yyy, yyy_preds))
                    seen_counts[yyy % episode_width] = count + 1

            tflog('val_accuracy', np.mean(correct))
            for k_shot, correct in correct_by_shot.items():
                tflog(str(k_shot) + '_shot_accuracy', np.mean(correct))

            logging.info('validation overall accuracy %f', np.mean(correct))
            logging.info('%d-shot: %.3f, ' * num_shots,
                         *sum([[k, np.mean(correct_by_shot[k])]
                               for k in xrange(num_shots)], []))

            if saver and FLAGS.save_dir:
                saved_file = saver.save(sess,
                                        os.path.join(FLAGS.save_dir, 'model.ckpt'),
                                        global_step=self.model.global_step)
                logging.info('saved model to %s', saved_file)
def train(self):
    self.build_model()
    self.__model.summary()
    self.__model.compile(loss='binary_crossentropy',
                         optimizer='adam',
                         metrics=['accuracy'])

    UP_ACTION = 2
    DOWN_ACTION = 3

    # hyperparameters
    gamma = .99

    # initializing variables
    x_train, y_train, rewards = [], [], []
    reward_sum = 0
    episode_nb = 0

    # initialize variables
    resume = True
    running_reward = None
    epochs_before_saving = 10
    log_dir = './log' + datetime.now().strftime("%Y%m%d-%H%M%S") + "/"

    # load pre-trained model if it exists
    if resume and os.path.isfile(LOG_DIR + 'my_model_weights.h5'):
        print("loading previous weights")
        self.__model.load_weights(LOG_DIR + 'my_model_weights.h5')

    # add a callback tensorboard object to visualize learning
    tbCallBack = callbacks.TensorBoard(log_dir=log_dir, histogram_freq=0,
                                       write_graph=True, write_images=True)

    # initializing environment
    env = gym.make('Pong-v0')
    observation = env.reset()
    prev_input = None

    # main loop
    while True:
        # preprocess the observation, set input as difference between images
        cur_input = prepro(observation)
        x = cur_input - prev_input if prev_input is not None else np.zeros(80 * 80)
        prev_input = cur_input

        # forward the policy network and sample action according to the proba distribution
        proba = self.__model.predict(np.expand_dims(x, axis=1).T)
        action = UP_ACTION if np.random.uniform() < proba else DOWN_ACTION
        y = 1 if action == 2 else 0  # 0 and 1 are our labels

        # log the input and label to train later
        x_train.append(x)
        y_train.append(y)

        # do one step in our environment
        observation, reward, done, info = env.step(action)
        rewards.append(reward)
        reward_sum += reward

        # end of an episode
        if done:
            print('At the end of episode', episode_nb,
                  'the total reward was :', reward_sum)

            # increment episode number
            episode_nb += 1

            # training
            self.__model.fit(x=np.vstack(x_train), y=np.vstack(y_train),
                             verbose=1, callbacks=[tbCallBack],
                             sample_weight=discount_rewards(rewards, gamma))

            # Saving the weights used by our model
            if episode_nb % epochs_before_saving == 0:
                self.__model.save_weights(
                    'my_model_weights' +
                    datetime.now().strftime("%Y%m%d-%H%M%S") + '.h5')

            # Log the reward
            running_reward = reward_sum if running_reward is None \
                else running_reward * 0.99 + reward_sum * 0.01
            tflog('running_reward', running_reward, custom_dir=log_dir)

            # Reinitialization
            x_train, y_train, rewards = [], [], []
            observation = env.reset()
            reward_sum = 0
            prev_input = None
import easy_tf_log

for i in range(10):
    easy_tf_log.tflog('foo', i)
for j in range(10, 20):
    easy_tf_log.tflog('bar', j)

easy_tf_log.set_dir('logs2')
for k in range(20, 30):
    easy_tf_log.tflog('baz', k)
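# The scalars written by the snippet above can be read back with TensorFlow's
# event-file reader, the same API the tests earlier in this section use. A
# minimal sketch, assuming TF 1.x and the default 'logs' directory created by
# easy_tf_log:

import os
import os.path as osp

import tensorflow as tf

# Pick the first event file in the 'logs' directory and print every scalar.
event_file = osp.join('logs', sorted(os.listdir('logs'))[0])
for event in tf.train.summary_iterator(event_file):
    for value in event.summary.value:
        print(event.step, value.tag, value.simple_value)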