def callback(self, lcl, glb):
    self.iter += 1
    if self.iter == 1:
        self.sess = lcl['sess']
        self.callback_setup_saver()
        self.logging.writer_val.add_graph(lcl['sess'].graph)
        self.callback_val_vis(lcl, glb,
                              num_rollouts=self.other_kwargs['num_valid'],
                              plot=True)
        return
    t = lcl['t']
    print_freq = lcl['print_freq']
    if t > self.trainer_kwargs['learning_starts']:
        if t % print_freq == 0:
            logger.error('Num Steps: %d' % t)
            self.callback_logging(lcl, glb)
            self.callback_val_vis(lcl, glb,
                                  num_rollouts=self.other_kwargs['num_valid'],
                                  plot=True)
            logger.error('')
        if t % (print_freq * 15) == 0:
            self.callback_snapshot(lcl, glb)

def crowdai_submit(seed, noise_type, layer_norm, evaluation, **kwargs):
    if 'restore_model_name' not in kwargs:
        logger.error('You must specify the --restore-model-name in order to submit')
        sys.exit()
    remote_base = "http://grader.crowdai.org:1729"
    crowdai_token = kwargs['crowdai_token']
    crowdai_client = Client(remote_base)
    kwargs['crowdai_client'] = crowdai_client
    evaluate(seed, noise_type, layer_norm, evaluation=True, **kwargs)

def get_tf_reward(env):
    # Unwrap the environment until a wrapper that exposes get_tf_reward() is found.
    while hasattr(env, "wrapped_env") or hasattr(env, "env") or hasattr(env, "get_tf_reward"):
        if hasattr(env, "get_tf_reward"):
            return env.get_tf_reward()
        elif hasattr(env, "wrapped_env"):
            env = env.wrapped_env
        else:
            env = env.env
    logger.error("env should have the attribute get_tf_reward()")

def build():
    lrank, _lsize = mpi_util.get_local_rank_size(MPI.COMM_WORLD)
    if lrank == 0:
        dirname = os.path.dirname(__file__)
        if len(dirname):
            make_cmd = "QT_SELECT=5 make -C %s" % dirname
        else:
            make_cmd = "QT_SELECT=5 make"
        r = os.system(make_cmd)
        if r != 0:
            logger.error('coinrun: make failed')
            sys.exit(1)
    MPI.COMM_WORLD.barrier()

def callback_val_vis(self, lcl, glb, num_rollouts, plot=False):
    act = lcl['act']
    global_step = lcl['t']
    with plt.style.context("fivethirtyeight"):
        plt.rcParams["axes.grid"] = True
    env = self.env_val
    env.reset_rng()
    obsss, actionss, rewardss = [], [], []
    ms = []
    for i in range(num_rollouts):
        obs, done = env.reset(), False
        obss, actions, rewards = [obs], [], []
        while not done:
            action = act(obs[None], False)[0]
            obs, rew, done, _ = env.step(action)
            obss.append(obs)
            actions.append(action)
            rewards.append(rew)
        obss.pop()  # last obs is unnecessary
        obsss.append(obss)
        actionss.append(actions)
        rewardss.append(rewards)
        m = env.get_metrics()
        ms.append(m)
    metric_names, metric_vals = env.collect_metrics(ms)
    metric_summary_init = tf.summary.Summary()
    metric_summary_end = tf.summary.Summary()
    for k, v in zip(metric_names, metric_vals):
        add_value_to_summary(metric_summary_init, 'metrics/{:s}'.format(k), v,
                             log=True, tag_str='metrics/{:s}: '.format(k))
        add_value_to_summary(metric_summary_end, 'metrics/{:s}'.format(k), v,
                             log=False, tag_str='metrics/{:s}: '.format(k))
    self.logging.writer_val.add_summary(metric_summary_init, global_step)
    logger.error('')

def set_seed(env, seed):
    random.seed(seed)
    np.random.seed(seed)
    try:
        import tensorflow as tf
        tf.set_random_seed(seed)
    except Exception as e:
        print(e)
    # Unwrap the environment until a wrapper that exposes seed() is found.
    while hasattr(env, "wrapped_env") or hasattr(env, "env") or hasattr(env, "seed"):
        if hasattr(env, "seed"):
            temp_seed = env.seed(seed)
            if temp_seed is not None and temp_seed != []:
                logger.info("Seed: %d. Set seed successfully" % temp_seed[0])
                return
        if hasattr(env, "wrapped_env"):
            env = env.wrapped_env
        else:
            env = env.env
    logger.error("env should have the attribute seed()")

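# Hedged usage sketch for set_seed (not part of the original module). The gym
# environment name is an arbitrary assumption used only for illustration: any
# (possibly wrapped) env exposing .seed() would do.
def _example_set_seed_usage():
    import gym
    env = gym.make("CartPole-v1")
    set_seed(env, 42)   # seeds random, np.random, TensorFlow and the env itself
    return env.reset()  # the first observation is now reproducible across runs
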
def evaluate_one_episode(env, agent, nb_eval_steps, render):
    if nb_eval_steps <= 0:
        logger.error('evaluate_one_episode nb_eval_steps must be > 0')
    reward = 0.
    qs = []
    obs = env.reset()
    for step in range(nb_eval_steps):
        action, q = agent.pi(obs, apply_noise=False, compute_Q=True)
        obs, r, done, info = env.step(action)
        if render:
            env.render()
        reward += r
        qs.append(q)
        if done:
            # obs = env.reset()
            # The original baselines code did not have this break statement,
            # so it would average over multiple evaluation episodes.
            break
        elif step >= nb_eval_steps:
            # NOTE: step is always < nb_eval_steps inside this loop, so this
            # branch never fires; kept from the original for reference.
            logger.warn('evaluate_one_episode step', step,
                        'exceeded nb_eval_steps', nb_eval_steps,
                        'but done is False')
            # obs = env.reset()
            break
    return reward, np.mean(qs), step + 1

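# Hedged sketch (not in the original file): averaging evaluate_one_episode over
# several episodes. `env`, `agent` and the step budget are placeholders for
# whatever the surrounding evaluation script already provides.
def _mean_eval_reward(env, agent, n_episodes=10, nb_eval_steps=1000):
    rewards = []
    for _ in range(n_episodes):
        ep_reward, _mean_q, _n_steps = evaluate_one_episode(
            env, agent, nb_eval_steps, render=False)
        rewards.append(ep_reward)
    return np.mean(rewards)
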
def find_character_in_frame(self, frame):
    mask = cv2.inRange(frame, self.lower_color, self.upper_color)
    output = cv2.bitwise_and(frame, frame, mask=mask)
    pix_x, pix_y, _ = np.where(output > 0)
    if pix_x.size != 0:
        prev_pix_x = pix_x
        pix_x = pix_x[np.where(pix_x > 19)]
        pix_y = pix_y[-pix_x.size:]
        # If the array length is even, the median is not an element of the
        # array, because it is the average of the two middle values.
        try:
            # Very rarely a nan is received here
            median_x = int(np.median(pix_x))
            while median_x not in pix_x:
                median_x += 1
            median_y = int(pix_y[np.where(pix_x == median_x)[0][0]])
        except Exception as e:
            logger.error("Exception: {}".format(e))
            logger.error("Pixel x: {}".format(pix_x))
            logger.error("Pixel y: {}".format(pix_y))
            logger.error("Previous pixel x: {}".format(prev_pix_x))
            roi = np.zeros([self.ego_h, self.ego_w, 3], dtype=np.uint8)
            return roi
    else:
        median_x = output.shape[0] // 2
        median_y = output.shape[1] // 2
    low_x = median_x - self.ego_h
    high_x = median_x + self.ego_h
    low_y = median_y - self.ego_w
    high_y = median_y + self.ego_w
    low_x = low_x if low_x > 0 else 0
    high_x = high_x if high_x < frame.shape[0] else frame.shape[0]
    low_y = low_y if low_y > 0 else 0
    high_y = high_y if high_y < frame.shape[1] else frame.shape[1]
    roi = frame[low_x:high_x, low_y:high_y]
    return roi

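# Hedged standalone sketch of the masking step used above (not part of the
# original class). The lower/upper bounds are arbitrary example values, not the
# ones self.lower_color / self.upper_color actually hold.
def _example_color_mask(frame, lower=(0, 0, 200), upper=(50, 50, 255)):
    lower = np.array(lower, dtype=np.uint8)
    upper = np.array(upper, dtype=np.uint8)
    mask = cv2.inRange(frame, lower, upper)          # 255 where the pixel is in range
    return cv2.bitwise_and(frame, frame, mask=mask)  # keep only in-range pixels
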
def callback_snapshot(self, lcl, glb):
    model_file_name = os.path.join(self.logdir, 'snapshots', 'model')
    self.logging.saver.save(lcl['sess'], model_file_name,
                            global_step=lcl['num_episodes'])
    logger.error('Saving model to: ', model_file_name)

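# Hedged sketch (not in the original class): restoring the latest snapshot
# written by callback_snapshot with a plain TF1 Saver. Assumes the same
# `snapshots` sub-directory and a live session.
def _restore_latest_snapshot(sess, logdir):
    ckpt = tf.train.latest_checkpoint(os.path.join(logdir, 'snapshots'))
    if ckpt is not None:
        tf.train.Saver().restore(sess, ckpt)
    return ckpt
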
def update(self):
    # Gather best return, visited rooms etc. across MPI workers.
    temp = sum(MPI.COMM_WORLD.allgather(self.local_rooms), [])
    temp = sorted(list(set(temp)))
    self.rooms = temp
    temp = sum(MPI.COMM_WORLD.allgather(self.scores), [])
    temp = sorted(list(set(temp)))
    self.scores = temp
    temp = sum(MPI.COMM_WORLD.allgather([self.local_best_ret]), [])
    self.best_ret = max(temp)
    eprews = MPI.COMM_WORLD.allgather(np.mean(list(self.I.statlists["eprew"])))
    local_best_rets = MPI.COMM_WORLD.allgather(self.local_best_ret)
    n_rooms = sum(MPI.COMM_WORLD.allgather([len(self.local_rooms)]), [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        logger.info(f"Rooms visited {self.rooms}")
        logger.info(f"Best return {self.best_ret}")
        logger.info(f"Best local return {sorted(local_best_rets)}")
        logger.info(f"eprews {sorted(eprews)}")
        logger.info(f"n_rooms {sorted(n_rooms)}")
        logger.info(f"Extrinsic coefficient {self.ext_coeff}")
        logger.info(f"Gamma {self.gamma}")
        logger.info(f"Gamma ext {self.gamma_ext}")
        logger.info(f"All scores {sorted(self.scores)}")

    # Normalize intrinsic rewards.
    rffs_int = np.array([self.I.rff_int.update(rew) for rew in self.I.buf_rews_int.T])
    self.I.rff_rms_int.update(rffs_int.ravel())
    rews_int = self.I.buf_rews_int / np.sqrt(self.I.rff_rms_int.var)
    self.mean_int_rew = np.mean(rews_int)
    self.max_int_rew = np.max(rews_int)

    # Don't normalize extrinsic rewards.
    rews_ext = self.I.buf_rews_ext

    rewmean, rewstd, rewmax = (self.I.buf_rews_int.mean(),
                               self.I.buf_rews_int.std(),
                               np.max(self.I.buf_rews_int))

    # Calculate intrinsic returns and advantages.
    lastgaelam = 0
    for t in range(self.nsteps - 1, -1, -1):  # t = nsteps-1, ..., 0
        if self.use_news:
            nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
        else:
            nextnew = 0.0  # No dones for intrinsic reward.
        nextvals = self.I.buf_vpreds_int[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_int_last
        nextnotnew = 1 - nextnew
        delta = rews_int[:, t] + self.gamma * nextvals * nextnotnew - self.I.buf_vpreds_int[:, t]
        self.I.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
    rets_int = self.I.buf_advs_int + self.I.buf_vpreds_int

    # Calculate extrinsic returns and advantages.
    lastgaelam = 0
    for t in range(self.nsteps - 1, -1, -1):  # t = nsteps-1, ..., 0
        nextnew = self.I.buf_news[:, t + 1] if t + 1 < self.nsteps else self.I.buf_new_last
        # Use dones for extrinsic reward.
        nextvals = self.I.buf_vpreds_ext[:, t + 1] if t + 1 < self.nsteps else self.I.buf_vpred_ext_last
        nextnotnew = 1 - nextnew
        delta = rews_ext[:, t] + self.gamma_ext * nextvals * nextnotnew - self.I.buf_vpreds_ext[:, t]
        self.I.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
    rets_ext = self.I.buf_advs_ext + self.I.buf_vpreds_ext

    # Combine the extrinsic and intrinsic advantages.
    self.I.buf_advs = self.int_coeff * self.I.buf_advs_int + self.ext_coeff * self.I.buf_advs_ext

    # Collect info for reporting.
    info = dict(
        advmean=self.I.buf_advs.mean(),
        advstd=self.I.buf_advs.std(),
        retintmean=rets_int.mean(),         # previously retmean
        retintstd=rets_int.std(),           # previously retstd
        retextmean=rets_ext.mean(),         # previously not there
        retextstd=rets_ext.std(),           # previously not there
        rewintmean_unnorm=rewmean,          # previously rewmean
        rewintmax_unnorm=rewmax,            # previously not there
        rewintmean_norm=self.mean_int_rew,  # previously rewintmean
        rewintmax_norm=self.max_int_rew,    # previously rewintmax
        rewintstd_unnorm=rewstd,            # previously rewstd
        vpredintmean=self.I.buf_vpreds_int.mean(),  # previously vpredmean
        vpredintstd=self.I.buf_vpreds_int.std(),    # previously vpredstd
        vpredextmean=self.I.buf_vpreds_ext.mean(),  # previously not there
        vpredextstd=self.I.buf_vpreds_ext.std(),    # previously not there
        ev_int=np.clip(
            explained_variance(self.I.buf_vpreds_int.ravel(), rets_int.ravel()),
            -1, None),
        ev_ext=np.clip(
            explained_variance(self.I.buf_vpreds_ext.ravel(), rets_ext.ravel()),
            -1, None),
        rooms=SemicolonList(self.rooms),
        n_rooms=len(self.rooms),
        best_ret=self.best_ret,
        reset_counter=self.I.reset_counter)
    info['mem_available'] = psutil.virtual_memory().available

    to_record = {
        'acs': self.I.buf_acs,
        'rews_int': self.I.buf_rews_int,
        'rews_int_norm': rews_int,
        'rews_ext': self.I.buf_rews_ext,
        'vpred_int': self.I.buf_vpreds_int,
        'vpred_ext': self.I.buf_vpreds_ext,
        'adv_int': self.I.buf_advs_int,
        'adv_ext': self.I.buf_advs_ext,
        'ent': self.I.buf_ent,
        'ret_int': rets_int,
        'ret_ext': rets_ext,
    }
    if self.I.venvs[0].record_obs:
        if None in self.I.buf_obs:
            to_record['obs'] = self.I.buf_obs[None]
        else:
            to_record['obs'] = self.I.buf_obs['normal']
    self.recorder.record(bufs=to_record, infos=self.I.buf_epinfos)

    # Create feed dict for optimization.
    envsperbatch = self.I.nenvs // self.nminibatches
    ph_buf = [
        (self.stochpol.ph_ac, self.I.buf_acs),
        (self.ph_ret_int, rets_int),
        (self.ph_ret_ext, rets_ext),
        (self.ph_oldnlp, self.I.buf_nlps),
        (self.ph_adv, self.I.buf_advs),
    ]
    if self.I.mem_state is not NO_STATES:
        ph_buf.extend([
            (self.stochpol.ph_istate, self.I.seg_init_mem_state),
            (self.stochpol.ph_new, self.I.buf_news),
        ])
    # verbose = True
    verbose = False
    if verbose and self.is_log_leader:
        samples = np.prod(self.I.buf_advs.shape)
        logger.info(
            "buffer shape %s, samples_per_mpi=%i, mini_per_mpi=%i, samples=%i, mini=%i "
            % (str(self.I.buf_advs.shape), samples,
               samples // self.nminibatches, samples * self.comm_train_size,
               samples * self.comm_train_size // self.nminibatches))
        logger.info(" " * 6 + fmt_row(13, self.loss_names))

    to_record_attention = None
    attention_output = None
    if os.environ['EXPERIMENT_LVL'] in ('attention', 'ego'):
        try:
            # attention_output = tf.get_default_graph().get_tensor_by_name("ppo/pol/augmented2/attention_output_combined:0")
            # attention_output = tf.get_default_graph().get_tensor_by_name("ppo/pol/augmented2/attention_output_combined/kernel:0")
            attention_output = tf.get_default_graph().get_tensor_by_name(
                "ppo/pol/augmented2/attention_output_combined/Conv2D:0")
        except Exception as e:
            logger.error("Exception in attention_output: {}".format(e))
            attention_output = None

    epoch = 0
    start = 0
    # Optimize on the current data for several epochs.
    while epoch < self.nepochs:
        end = start + envsperbatch
        mbenvinds = slice(start, end, None)
        fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
        fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
        if None in self.stochpol.ph_ob:
            fd[self.stochpol.ph_ob[None]] = np.concatenate([
                self.I.buf_obs[None][mbenvinds],
                self.I.buf_ob_last[None][mbenvinds, None]
            ], 1)
            assert list(fd[self.stochpol.ph_ob[None]].shape) == [self.I.nenvs // self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape), \
                [fd[self.stochpol.ph_ob[None]].shape,
                 [self.I.nenvs // self.nminibatches, self.nsteps + 1] + list(self.ob_space.shape)]
        else:
            fd[self.stochpol.ph_ob['normal']] = np.concatenate([
                self.I.buf_obs['normal'][mbenvinds],
                self.I.buf_ob_last['normal'][mbenvinds, None]
            ], 1)
            fd[self.stochpol.ph_ob['ego']] = np.concatenate([
                self.I.buf_obs['ego'][mbenvinds],
                self.I.buf_ob_last['ego'][mbenvinds, None]
            ], 1)
            assert list(fd[self.stochpol.ph_ob['normal']].shape) == [self.I.nenvs // self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['normal'].shape), \
                [fd[self.stochpol.ph_ob['normal']].shape,
                 [self.I.nenvs // self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['normal'].shape)]
            assert list(fd[self.stochpol.ph_ob['ego']].shape) == [self.I.nenvs // self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['ego'].shape), \
                [fd[self.stochpol.ph_ob['ego']].shape,
                 [self.I.nenvs // self.nminibatches, self.nsteps + 1] + list(self.ob_space.spaces['ego'].shape)]
        fd.update({
            self.stochpol.ph_mean: self.stochpol.ob_rms.mean,
            self.stochpol.ph_std: self.stochpol.ob_rms.var ** 0.5
        })

        if attention_output is not None:
            _train_losses = [attention_output, self._train]
        else:
            _train_losses = [self._train]
        ret = tf.get_default_session().run(self._losses + _train_losses,
                                           feed_dict=fd)[:-1]
        if attention_output is not None:
            attn_output = ret[-1]
            ret = ret[:-1]
            if None in self.I.buf_obs:
                outshape = list(self.I.buf_obs[None][mbenvinds].shape[:2]) + list(attn_output.shape[1:])
            else:
                # does not matter if it's normal or ego, the first 2 axes are the same
                outshape = list(self.I.buf_obs['normal'][mbenvinds].shape[:2]) + list(attn_output.shape[1:])
            attn_output = np.reshape(attn_output, outshape)
            attn_output = attn_output[:, :, :, :, :64]
        if not self.testing:
            lossdict = dict(zip([n for n in self.loss_names], ret), axis=0)
        else:
            lossdict = {}
        # Synchronize the lossdict across MPI processes, otherwise weights may be
        # rolled back on one process but not another.
        _maxkl = lossdict.pop('maxkl')
        lossdict = dict_gather(self.comm_train, lossdict, op='mean')
        maxmaxkl = dict_gather(self.comm_train, {"maxkl": _maxkl}, op='max')
        lossdict["maxkl"] = maxmaxkl["maxkl"]
        if verbose and self.is_log_leader:
            logger.info("%i:%03i %s" % (epoch, start,
                                        fmt_row(13, [lossdict[n] for n in self.loss_names])))
        start += envsperbatch
        if start == self.I.nenvs:
            epoch += 1
            start = 0

        if attention_output is not None:
            if to_record_attention is None:
                to_record_attention = attn_output
            else:
                to_record_attention = np.concatenate([to_record_attention, attn_output])

    # if to_record_attention is not None:
    #     if None in self.I.buf_obs:
    #         to_record['obs'] = self.I.buf_obs[None]
    #     else:
    #         to_record['obs'] = self.I.buf_obs['normal']
    #     to_record['attention'] = to_record_attention
    to_record_attention = None

    if self.is_train_leader:
        self.I.stats["n_updates"] += 1
        info.update([('opt_' + n, lossdict[n]) for n in self.loss_names])

    tnow = time.time()
    info['tps'] = self.nsteps * self.I.nenvs / (tnow - self.I.t_last_update)
    info['time_elapsed'] = time.time() - self.t0
    self.I.t_last_update = tnow
    self.stochpol.update_normalization(  # Necessary for continuous control tasks with odd obs ranges; only implemented in the MLP policy.
        ob=self.I.buf_obs  # NOTE: not shared via MPI
    )
    return info

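# Hedged sketch (not part of the original agent): the GAE recursion used twice
# in update() above, written as a standalone NumPy helper. Buffer shapes
# (nenvs x nsteps) mirror self.I.buf_*; the bootstrap arguments vpred_last and
# new_last stand in for self.I.buf_vpred_*_last / buf_new_last.
def _gae(rews, vpreds, news, vpred_last, new_last, gamma, lam):
    nenvs, nsteps = rews.shape
    advs = np.zeros_like(rews)
    lastgaelam = np.zeros(nenvs)
    for t in range(nsteps - 1, -1, -1):
        nextnew = news[:, t + 1] if t + 1 < nsteps else new_last
        nextvals = vpreds[:, t + 1] if t + 1 < nsteps else vpred_last
        nextnotnew = 1 - nextnew
        delta = rews[:, t] + gamma * nextvals * nextnotnew - vpreds[:, t]
        advs[:, t] = lastgaelam = delta + gamma * lam * nextnotnew * lastgaelam
    rets = advs + vpreds  # returns = advantages + value predictions
    return advs, rets
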
def learn(env_list, policy_fn, *,
          timesteps_per_actorbatch,  # timesteps per actor per update
          clip_param, entcoeff,  # clipping parameter epsilon, entropy coefficient
          optim_epochs, optim_stepsize, optim_batchsize,  # optimization hypers
          gamma, lam,  # advantage estimation
          max_timesteps=0, max_episodes=0, max_iters=0, max_seconds=0,  # time constraint
          callback=None,  # you can do anything in the callback, since it takes locals(), globals()
          adam_epsilon=1e-5,
          schedule='constant',  # annealing for stepsize parameters (epsilon and adam)
          end_timesteps, newround):
    env = env_list.popleft()
    # Open files to record the accumulated rewards, response times and packet counts.
    rewFile = open("reward/%d.txt" % env.seed, "ab")
    resptimeFile = open("respTime/%d.txt" % env.seed, "ab")
    pktnumFile = open("pktNum/%d.txt" % env.seed, "ab")

    # Setup losses and stuff
    # ----------------------------------------
    vf_ob_space = env.vf_observation_space
    # ac_ob_space = env.ac_observation_space
    ac_space = env.action_space
    pi = policy_fn("pi1", vf_ob_space, ac_space)  # Construct network for new policy
    oldpi = policy_fn("oldpi", vf_ob_space, ac_space)  # Network for old policy
    atarg = tf.placeholder(name="atarg", dtype=tf.float32, shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(name="ret", dtype=tf.float32, shape=[None])  # Empirical return

    lrmult = tf.placeholder(name='lrmult', dtype=tf.float32, shape=[])  # learning rate multiplier, updated with schedule
    clip_param = clip_param * lrmult  # Annealed clipping parameter epsilon

    vf_ob = U.get_placeholder_cached(name="vf_ob")
    nn_in = U.get_placeholder_cached(name="nn_in")  # placeholder for nn input
    ac = pi.pdtype.sample_placeholder([None])

    # kloldnew = oldpi.pd.kl(pi.pd)
    # ent = pi.pd.entropy()
    pb_old_holder = tf.placeholder(name="pd_old", dtype=tf.float32, shape=[None, ac_space.n])
    pb_new_holder = tf.placeholder(name="pd_new", dtype=tf.float32, shape=[None, ac_space.n])
    oldpd = CategoricalPd(pb_old_holder)
    pd = CategoricalPd(pb_new_holder)
    kloldnew = oldpd.kl(pd)
    ent = pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    pol_entpen = (-entcoeff) * meanent

    # ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac))  # pnew / pold
    ratio = tf.placeholder(dtype=tf.float32, shape=[None])
    surr1 = ratio * atarg  # surrogate from conservative policy iteration
    surr2 = tf.clip_by_value(ratio, 1.0 - clip_param, 1.0 + clip_param) * atarg
    pol_surr = -tf.reduce_mean(tf.minimum(surr1, surr2))  # PPO's pessimistic surrogate (L^CLIP)
    vf_loss = tf.reduce_mean(tf.square(pi.vpred - ret))
    total_loss = pol_surr + pol_entpen + vf_loss
    losses = [pol_surr, pol_entpen, vf_loss, meankl, meanent]
    loss_names = ["pol_surr", "pol_entpen", "vf_loss", "kl", "ent"]

    var_list = pi.get_trainable_variables()
    vf_var_list = [v for v in var_list if v.name.split("/")[1].startswith("vf")]
    pol_var_list = [v for v in var_list if v.name.split("/")[1].startswith("pol")]
    vf_grad = U.function([vf_ob, ret], U.flatgrad(vf_loss, vf_var_list))  # gradient of value function
    pol_nn_grad = U.function([nn_in], U.flatgrad(pi.nn_out, pol_var_list))
    vf_adam = MpiAdam(vf_var_list, epsilon=adam_epsilon)
    pol_adam = MpiAdam(pol_var_list, epsilon=adam_epsilon)
    clip_para = U.function([lrmult], [clip_param])

    assign_old_eq_new = U.function(
        [], [], updates=[tf.assign(oldv, newv)
                         for (oldv, newv) in zipsame(oldpi.get_variables(), pi.get_variables())])
    compute_losses = U.function(
        [vf_ob, atarg, ret, lrmult, ratio, pb_new_holder, pb_old_holder], losses)

    U.initialize()
    vf_adam.sync()
    pol_adam.sync()

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
    end_timestep = end_timesteps.popleft()
    new = newround.popleft()

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=10)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=10)  # rolling buffer for episode rewards
    env_so_far = 1

    assert sum([max_iters > 0, max_timesteps > 0, max_episodes > 0,
                max_seconds > 0]) == 1, "Only one time constraint permitted"

    while True:
        if callback:
            callback(locals(), globals())
        if max_timesteps and timesteps_so_far >= max_timesteps:
            rewFile.close()
            resptimeFile.close()
            pktnumFile.close()
            para = {}
            for vf in range(len(vf_var_list)):
                # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                para[vf] = vf_var_list[vf].eval()
            for pol in range(len(pol_var_list)):
                # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
            f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
            pickle.dump(para, f)
            f.close()
            print("============================= policy is stored =================================")
            break
        elif end_timestep and timesteps_so_far >= end_timestep:
            env = env_list.popleft()
            seg_gen = traj_segment_generator(pi, env, timesteps_per_actorbatch, stochastic=True)
            end_timestep = end_timesteps.popleft()
            new = newround.popleft()
            env_so_far += 1
            if True:
                para = {}
                for vf in range(len(vf_var_list)):
                    # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                    para[vf] = vf_var_list[vf].eval()
                for pol in range(len(pol_var_list)):
                    # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                    para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
                f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
                pickle.dump(para, f)
                f.close()
                print("======================== new environment (%s network settings left) ===========================" % len(env_list))
        elif max_episodes and episodes_so_far >= max_episodes:
            break
        elif max_iters and iters_so_far >= max_iters:
            break
        elif max_seconds and time.time() - tstart >= max_seconds:
            break
        elif timesteps_so_far == 0:
            para = {}
            for vf in range(len(vf_var_list)):
                # para[vf_var_list[vf].name] = vf_var_list[vf].eval()
                para[vf] = vf_var_list[vf].eval()
            for pol in range(len(pol_var_list)):
                # para[pol_var_list[pol].name] = pol_var_list[pol].eval()
                para[pol + len(vf_var_list)] = pol_var_list[pol].eval()
            f = open("network/%d-%d.txt" % (env.seed, timesteps_so_far), "wb")
            pickle.dump(para, f)
            f.close()

        if schedule == 'constant':
            cur_lrmult = 1.0
        elif schedule == 'linear':
            cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0)
        else:
            raise NotImplementedError

        logger.log("********** Iteration %i, Environment %i ************" % (iters_so_far, env_so_far))

        seg = seg_gen.__next__()
        add_vtarg_and_adv(seg, gamma, lam)

        # for vf in range(len(vf_var_list)):
        #     print(vf_var_list[vf].name, vf_var_list[vf].eval())
        # for pol in range(len(pol_var_list)):
        #     print(pol_var_list[pol].name, pol_var_list[pol].eval())
        record_reward(rewFile, sum(seg["rew"]))
        record_reward(resptimeFile, sum(seg["resptime"]))
        record_reward(pktnumFile, sum(seg["pktnum"]))
        print("total rewards for Iteration %s: %s" % (iters_so_far, sum(seg["rew"])))
        print("average response time: %s, num of pkts: %s" %
              (sum(seg["resptime"]) / sum(seg["pktnum"]), sum(seg["pktnum"])))
        # A Counter: elements are stored as dictionary keys and their counts as values.
        prob = collections.Counter(seg["ac"])
        for key in prob:
            prob[key] = prob[key] / len(seg["ac"])
        print("percentage of choosing each controller: %s" % prob)

        # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
        vf_ob, ac_ob, ac, atarg, tdlamret = seg["vf_ob"], seg['ac_ob'], seg["ac"], seg["adv"], seg["tdlamret"]
        vpredbefore = seg["vpred"]  # predicted value function before update
        atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate
        d = Dataset(dict(vf_ob=vf_ob, ac_ob=ac_ob, ac=ac, atarg=atarg, vtarg=tdlamret),
                    shuffle=not pi.recurrent)
        optim_batchsize = optim_batchsize or vf_ob.shape[0]

        # if hasattr(pi, "vf_ob_rms"): pi.vf_ob_rms.update(vf_ob)  # update running mean/std for policy
        # if hasattr(pi, "nn_in_rms"):
        #     temp = ac_ob.reshape(-1, ac_ob.shape[2])
        #     pi.nn_in_rms.update(temp)

        assign_old_eq_new()  # set old parameter values to new parameter values
        logger.log("Optimizing...")
        logger.log(fmt_row(13, loss_names))
        # Here we do a bunch of optimization epochs over the data
        for _ in range(optim_epochs):
            losses = []  # list of tuples, each of which gives the loss for a minibatch
            for batch in d.iterate_once(optim_batchsize):
                # calculate the value function gradient
                vf_g = vf_grad(batch["vf_ob"], batch["vtarg"])
                vf_adam.update(vf_g, optim_stepsize * cur_lrmult)
                # calculate the policy gradient
                pol_g = []
                ratios = []
                pbs_new_batch = []
                pbs_old_batch = []
                e = clip_para(cur_lrmult)[0]
                for sample_id in range(optim_batchsize):
                    sample_ac_ob = batch["ac_ob"][sample_id]
                    sample_ac = batch["ac"][sample_id]
                    probs_new = pi.calculate_ac_prob(sample_ac_ob)
                    prob_new = probs_new[sample_ac]
                    probs_old = oldpi.calculate_ac_prob(sample_ac_ob)
                    prob_old = probs_old[sample_ac]
                    if prob_old == 0:
                        logger.error("pi_old = 0 in %s th iteration %s th epoch %s th sample..."
                                     % (iters_so_far, _, sample_id))
                    r = prob_new / prob_old
                    ratios.append(r)
                    pbs_new_batch.append(probs_new)
                    pbs_old_batch.append(probs_old)
                    if (r > 1.0 + e and batch["atarg"][sample_id] > 0) or \
                            (r < 1.0 - e and batch["atarg"][sample_id] < 0) or r == 0:
                        dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                        pol_g.append(0. * dnn_dtheta)
                    else:
                        nn = pi.calculate_ac_value(sample_ac_ob)
                        denominator = np.power(sum(nn), 2)
                        sorted_ind = np.argsort(nn)  # sort the array in ascending order
                        if len(probs_new) == 2:
                            if sample_ac == 0:
                                numerator1 = nn[1] * pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                numerator2 = nn[0] * pol_nn_grad(sample_ac_ob[1].reshape(1, -1))
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                            else:
                                numerator1 = nn[1] * pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                numerator2 = nn[0] * pol_nn_grad(sample_ac_ob[1].reshape(1, -1))
                                dpi_dtheta = -(numerator2 - numerator1) / denominator
                            # numerator1 = nn[sorted_ind[0]] * pol_nn_grad(sample_ac_ob[sorted_ind[1]].reshape(1, -1))
                            # numerator2 = nn[sorted_ind[1]] * pol_nn_grad(sample_ac_ob[sorted_ind[0]].reshape(1, -1))
                            # dpi_dtheta = (numerator1 - numerator2) / denominator
                        elif len(probs_new) == 3:
                            if sample_ac == sorted_ind[0]:
                                # the controller with the lowest probability can still be
                                # chosen because its probability is not zero
                                dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                pol_g.append(0. * dnn_dtheta)
                            else:
                                numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1, -1))
                                                        + 0.5 * pol_nn_grad(sample_ac_ob[sorted_ind[0]].reshape(1, -1)))
                                numerator2 = (nn[sample_ac] + 0.5 * nn[sorted_ind[0]]) * pol_nn_grad(sample_ac_ob)
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                        else:
                            if sample_ac == sorted_ind[-1] or sample_ac == sorted_ind[-2]:
                                numerator1 = sum(nn) * (pol_nn_grad(sample_ac_ob[sample_ac].reshape(1, -1))
                                                        + 0.5 * pol_nn_grad(sample_ac_ob[sorted_ind[0:-2]]))
                                numerator2 = (nn[sample_ac] + 0.5 * sum(nn[sorted_ind[0:-2]])) * pol_nn_grad(sample_ac_ob)
                                dpi_dtheta = -(numerator1 - numerator2) / denominator
                            else:
                                dnn_dtheta = pol_nn_grad(sample_ac_ob[0].reshape(1, -1))
                                pol_g.append(0. * dnn_dtheta)
                        pol_g.append(batch["atarg"][sample_id] * dpi_dtheta / prob_old)
                pol_g_mean = np.mean(np.array(pol_g), axis=0)
                pol_adam.update(pol_g_mean, optim_stepsize * cur_lrmult)
                newlosses = compute_losses(batch["vf_ob"], batch["atarg"], batch["vtarg"],
                                           cur_lrmult, np.array(ratios),
                                           np.array(pbs_new_batch), np.array(pbs_old_batch))
                # adam.update(g, optim_stepsize * cur_lrmult)
                losses.append(newlosses)
            logger.log(fmt_row(13, np.mean(losses, axis=0)))

        logger.log("Evaluating losses...")
        # losses = []
        # for batch in d.iterate_once(optim_batchsize):
        #     newlosses = compute_losses(batch["vf_ob"], batch["ac_ob"], batch["ac"],
        #                                batch["atarg"], batch["vtarg"], cur_lrmult)
        #     losses.append(newlosses)
        meanlosses, _, _ = mpi_moments(losses, axis=0)
        logger.log(fmt_row(13, meanlosses))
        for (lossval, name) in zipsame(meanlosses, loss_names):
            logger.record_tabular("loss_" + name, lossval)
        logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))
        lrlocal = (seg["ep_lens"], seg["ep_rets"])  # local values
        listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)  # list of tuples
        lens, rews = map(flatten_lists, zip(*listoflrpairs))
        lenbuffer.extend(lens)
        rewbuffer.extend(rews)
        if len(lenbuffer) == 0:
            logger.record_tabular("EpLenMean", 0)
            logger.record_tabular("EpRewMean", 0)
        else:
            logger.record_tabular("EpLenMean", np.mean(lenbuffer))
            logger.record_tabular("EpRewMean", np.mean(rewbuffer))
        logger.record_tabular("EpThisIter", len(lens))
        episodes_so_far += len(lens)
        timesteps_so_far += sum(lens)
        iters_so_far += 1
        logger.record_tabular("EpisodesSoFar", episodes_so_far)
        logger.record_tabular("TimestepsSoFar", timesteps_so_far)
        logger.record_tabular("TimeElapsed", time.time() - tstart)
        if MPI.COMM_WORLD.Get_rank() == 0:
            logger.dump_tabular()

def callback(locals, globals):
    if that.method != "ddpg":
        if load_policy is not None and locals[iter_name] == 0:
            # noinspection PyBroadException
            try:
                utils.load_state(load_policy)
                if MPI.COMM_WORLD.Get_rank() == 0:
                    logger.info("Loaded policy network weights from %s." % load_policy)
                # save TensorFlow summary (contains at least the graph definition)
            except:
                logger.error("Failed to load policy network weights from %s." % load_policy)
        if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] == 0:
            _ = tf.summary.FileWriter(folder, tf.get_default_graph())
    if MPI.COMM_WORLD.Get_rank() == 0 and locals[iter_name] % save_every == 0:
        print('Saving video and checkpoint for policy at iteration %i...' % locals[iter_name])
        ob = env.reset()
        images = []
        rewards = []
        max_reward = 1.  # if any reward > 1, we have to rescale
        lower_part = video_height // 5
        for i in range(episode_length):
            if that.method == "ddpg":
                ac, _ = locals['agent'].pi(ob, apply_noise=False, compute_Q=False)
            elif that.method == "sql":
                ac, _ = locals['policy'].get_action(ob)
            elif isinstance(locals['pi'], GaussianMlpPolicy):
                ac, _, _ = locals['pi'].act(np.concatenate((ob, ob)))
            else:
                ac, _ = locals['pi'].act(False, ob)
            ob, rew, new, _ = env.step(ac)
            images.append(render_frames(env))
            if plot_rewards:
                rewards.append(rew)
                max_reward = max(rew, max_reward)
            if new:
                break

        orange = np.array([255, 163, 0])
        red = np.array([255, 0, 0])
        video = []
        width_factor = 1. / episode_length * video_width
        for i, imgs in enumerate(images):
            for img in imgs:
                img[-lower_part, :10] = orange
                img[-lower_part, -10:] = orange
                if episode_length < video_width:
                    p_rew_x = 0
                    for j, r in enumerate(rewards[:i]):
                        rew_x = int(j * width_factor)
                        if r < 0:
                            img[-1:, p_rew_x:rew_x] = red
                        else:
                            rew_y = int(r / max_reward * lower_part)
                            img[-rew_y - 1:, p_rew_x:rew_x] = orange
                        p_rew_x = rew_x
                else:
                    for j, r in enumerate(rewards[:i]):
                        rew_x = int(j * width_factor)
                        if r < 0:
                            img[-1:, rew_x] = red
                        else:
                            rew_y = int(r / max_reward * lower_part)
                            img[-rew_y - 1:, rew_x] = orange
            video.append(np.hstack(imgs))

        imageio.mimsave(
            os.path.join(folder, "videos",
                         "%s_%s_iteration_%i.mp4" % (that.environment, that.method, locals[iter_name])),
            video, fps=60)
        env.reset()
        if that.method != "ddpg":
            utils.save_state(os.path.join(that.folder, "checkpoints",
                                          "%s_%i" % (that.environment, locals[iter_name])))

def main(_):
    config = tf.ConfigProto()
    config.device_count['GPU'] = 1
    config.gpu_options.allow_growth = True
    config.intra_op_parallelism_threads = 1
    config.inter_op_parallelism_threads = 1

    config_name = FLAGS.config_name
    env_str, trainer_str, other_str = config_name.split('.')
    other_kwargs = get_other_args(other_str)
    env_kwargs = SocialNetworkGraphEnv.get_env_args(env_str)
    if env_kwargs['method_name'] == 'dqnV0':
        trainer_kwargs, trainer_name = get_dqn_v0_args(trainer_str), 'dqnV0'
    elif env_kwargs['method_name'] == 'randomV0':
        trainer_kwargs, trainer_name = get_random_v0_args(trainer_str), 'randomV0'
    elif env_kwargs['method_name'] == 'exhaustiveV0':
        trainer_kwargs, trainer_name = get_exhaustive_v0_args(trainer_str), 'exhaustiveV0'
    elif env_kwargs['method_name'] == 'greedyV0':
        trainer_kwargs, trainer_name = get_greedy_v0_args(trainer_str), 'greedyV0'
    else:
        assert False

    logdir = FLAGS.logdir_prefix + FLAGS.config_name + FLAGS.logdir_suffix
    logger.configure(logdir)
    logger.error('env_kwargs: ', env_kwargs)
    logger.error('other_kwargs: ', other_kwargs)
    logger.error('%s_kwargs: ' % trainer_name, trainer_kwargs)

    tf.set_random_seed(other_kwargs['seed'])
    random.seed(other_kwargs['seed'])
    np.random.seed(other_kwargs['seed'])

    if env_kwargs['method_name'] == 'dqnV0':
        dqnTrainer = DQNTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        dqnTrainer.train(config=config)
    elif env_kwargs['method_name'] == 'randomV0':
        randomTrainer = RandomTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        randomTrainer.train()
    elif env_kwargs['method_name'] == 'exhaustiveV0':
        exhaustiveTrainer = ExhaustiveTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        exhaustiveTrainer.train()
    elif env_kwargs['method_name'] == 'greedyV0':
        greedyTrainer = GreedyTrainer(
            env_name=other_kwargs['social_network_graph_env'],
            env_kwargs=env_kwargs,
            trainer_kwargs=trainer_kwargs,
            other_kwargs=other_kwargs,
            logdir=logdir)
        greedyTrainer.train()
    else:
        assert False