def update(self):
    """Run one PPO optimization phase over the latest rollout.

    Optionally normalizes rewards via an MPI-synchronized running std,
    computes advantages/returns (in `self.calculate_advantages`, not shown
    here — presumably GAE; confirm in that method), optimizes the policy
    for `self.nepochs` epochs over environment minibatches, and returns a
    dict of training diagnostics.
    """
    if self.normrew:
        # Per-env reward filter updates, then pooled moments across MPI
        # workers feed the running-variance estimator used for scaling.
        rffs = np.array(
            [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
    else:
        rews = np.copy(self.rollout.buf_rews)
    self.calculate_advantages(rews=rews,
                              use_news=self.use_news,
                              gamma=self.gamma,
                              lam=self.lam)
    # Diagnostics computed before advantage normalization below.
    info = dict(
        advmean=self.buf_advs.mean(),
        advstd=self.buf_advs.std(),
        retmean=self.buf_rets.mean(),
        retstd=self.buf_rets.std(),
        vpredmean=self.rollout.buf_vpreds.mean(),
        vpredstd=self.rollout.buf_vpreds.std(),
        ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                              self.buf_rets.ravel()),
        rew_mean=np.mean(self.rollout.buf_rews),
        recent_best_ext_ret=self.rollout.current_max
        if self.rollout.current_max is not None else 0,
    )
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
    # store images for debugging
    # from PIL import Image
    # if not os.path.exists('logs/images/'):
    #     os.makedirs('logs/images/')
    # for i in range(self.rollout.buf_obs_last.shape[0]):
    #     obs = self.rollout.buf_obs_last[i][0]
    #     Image.fromarray((obs*255.).astype(np.uint8)).save('logs/images/%04d.png'%i)
    # normalize advantages
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environments (not timesteps).
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    def resh(x):
        # Fold segment dimension into the env dimension when rollouts are
        # split into multiple segments per environment.
        if self.nsegs_per_env == 1:
            return x
        sh = x.shape
        return x.reshape((sh[0] * self.nsegs_per_env,
                          self.nsteps_per_seg) + sh[2:])

    # Placeholder/buffer pairs that become the session feed dict.
    ph_buf = [
        (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
        (self.ph_rews, resh(self.rollout.buf_rews)),
        (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
        (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
        (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
        (self.ph_ret, resh(self.buf_rets)),
        (self.ph_adv, resh(self.buf_advs)),
    ]
    # Final observation feeds the first dynamics model's bootstrap input.
    ph_buf.extend([(self.dynamics_list[0].last_ob,
                    self.rollout.buf_obs_last.reshape([
                        self.nenvs * self.nsegs_per_env, 1,
                        *self.ob_space.shape
                    ]))])
    mblossvals = []
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({
                self.ph_lr: self.lr,
                self.ph_cliprange: self.cliprange
            })
            # Runs the train op; slices off its (None) result, keeping losses.
            mblossvals.append(getsess().run(self._losses +
                                            (self._train, ), fd)[:-1])
    # NOTE(review): only the FIRST minibatch's losses are kept for reporting;
    # all subsequent minibatch losses are discarded. Confirm intentional.
    mblossvals = [mblossvals[0]]
    info.update(
        zip(['opt_' + ln for ln in self.loss_names],
            np.mean([mblossvals[0]], axis=0)))
    info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    # Fold rollout stat lists into scalars (0 when a list is empty).
    info.update({
        dn: (np.mean(dvs) if len(dvs) > 0 else 0)
        for (dn, dvs) in self.rollout.statlists.items()
    })
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    tnow = time.time()
    info["ups"] = 1. / (tnow - self.t_last_update)  # updates per second
    info["total_secs"] = tnow - self.t_start
    # Transitions per second across all MPI workers.
    info['tps'] = MPI.COMM_WORLD.Get_size(
    ) * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow
    return info
branches=branches, selection=selection_test) features_data = root_numpy.tree2array(tree, branches=branches, selection=selection_data) features = { "train": features_train, "test": features_test, "data": features_data, } preprocess_dict = {} if args.z_score: for feat in training_features: mean, std = utils.get_mean_and_std(features_train[feat]) preprocess_dict[feat] = {"mean": float(mean), "std_dev": float(std)} with open(z_score_json, "w") as f_out: json.dump(preprocess_dict, f_out, indent=4, sort_keys=True) f_out = h5py.File(output_file, "w") f_out.create_dataset("feature_names", data=training_features) for set in features.keys(): global_features, label = prep_utils.create_features_and_label( features[set], training_features, signal, bkg, preprocess_dict, args.z_score) f_out.create_dataset("global_%s" % set, data=global_features) f_out.create_dataset("label_%s" % set, data=label)
def update(self):
    """One PPO update supporting separate intrinsic/extrinsic value heads.

    When ``self.hps['num_vf'] == 2`` intrinsic and extrinsic GAE streams
    are computed independently (extrinsic uses ``gamma_ext`` and always
    respects episode boundaries) and combined via ``int_coeff`` /
    ``ext_coeff``; otherwise a single mixed reward stream from
    ``rollout.reward_fun`` is used. Returns a diagnostics dict.
    """
    # Rewards normalization
    # if self.normrew:
    #     rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
    #     rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
    #     self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
    #     rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)

    # Intrinsic Rewards Normalization
    if self.normrew:
        rffs_int = np.array([self.rff.update(rew)
                             for rew in self.rollout.buf_int_rews.T])
        self.rff_rms.update(rffs_int.ravel())
        int_rews = self.rollout.buf_int_rews / np.sqrt(self.rff_rms.var)
    else:
        int_rews = np.copy(self.rollout.buf_int_rews)
    mean_int_rew = np.mean(int_rews)
    max_int_rew = np.max(int_rews)

    # Do not normalize extrinsic rewards
    ext_rews = self.rollout.buf_ext_rews
    nsteps = self.rollout.nsteps

    # If separate value fcn are used
    if self.hps['num_vf']==2:
        # Calculate intrinsic returns and advantages (backward GAE recursion).
        lastgaelam = 0
        for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
            if self.use_news:
                nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
            else:
                nextnew = 0  # No dones for intrinsic rewards with self.use_news=False
            nextvals = self.rollout.buf_vpreds_int[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_int_last
            nextnotnew = 1 - nextnew
            delta = int_rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds_int[:, t]
            self.buf_advs_int[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
        self.buf_rets_int[:] = self.buf_advs_int + self.rollout.buf_vpreds_int

        # Calculate extrinsic returns and advantages (always episodic).
        lastgaelam = 0
        for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
            nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
            nextvals = self.rollout.buf_vpreds_ext[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_ext_last
            nextnotnew = 1 - nextnew
            delta = ext_rews[:, t] + self.gamma_ext * nextvals * nextnotnew - self.rollout.buf_vpreds_ext[:, t]
            self.buf_advs_ext[:, t] = lastgaelam = delta + self.gamma_ext * self.lam * nextnotnew * lastgaelam
        self.buf_rets_ext[:] = self.buf_advs_ext + self.rollout.buf_vpreds_ext

        # Combine the extrinsic and intrinsic advantages.
        self.buf_advs = self.int_coeff*self.buf_advs_int + self.ext_coeff*self.buf_advs_ext
    else:
        # Calculate mixed intrinsic and extrinsic returns and advantages.
        rews = self.rollout.buf_rews = self.rollout.reward_fun(int_rew=int_rews, ext_rew=ext_rews)
        lastgaelam = 0
        for t in range(nsteps - 1, -1, -1):  # nsteps-1 ... 0
            nextnew = self.rollout.buf_news[:, t + 1] if t + 1 < nsteps else self.rollout.buf_new_last
            nextvals = self.rollout.buf_vpreds[:, t + 1] if t + 1 < nsteps else self.rollout.buf_vpred_last
            nextnotnew = 1 - nextnew
            delta = rews[:, t] + self.gamma * nextvals * nextnotnew - self.rollout.buf_vpreds[:, t]
            self.buf_advs[:, t] = lastgaelam = delta + self.gamma * self.lam * nextnotnew * lastgaelam
        self.buf_rets[:] = self.buf_advs + self.rollout.buf_vpreds

    info = dict(
        # advmean=self.buf_advs.mean(),
        # advstd=self.buf_advs.std(),
        recent_best_ext_ret=self.rollout.current_max,
        recent_best_eplen = self.rollout.current_minlen,
        recent_worst_eplen = self.rollout.current_maxlen
    )
    # Per-head diagnostics only exist in the two-value-function mode.
    if self.hps['num_vf'] ==2:
        info['retmean_int']=self.buf_rets_int.mean()
        info['retmean_ext']=self.buf_rets_ext.mean()
        info['retstd_int']=self.buf_rets_int.std()
        info['retstd_ext']=self.buf_rets_ext.std()
        info['vpredmean_int']=self.rollout.buf_vpreds_int.mean()
        info['vpredmean_ext']=self.rollout.buf_vpreds_ext.mean()
        info['vpredstd_int']=self.rollout.buf_vpreds_int.std()
        info['vpredstd_ext']=self.rollout.buf_vpreds_ext.std()
        info['ev_int']=explained_variance(self.rollout.buf_vpreds_int.ravel(), self.buf_rets_int.ravel())
        info['ev_ext']=explained_variance(self.rollout.buf_vpreds_ext.ravel(), self.buf_rets_ext.ravel())
        info['rew_int_mean']=mean_int_rew
        info['recent_best_int_rew']=max_int_rew
    else:
        # info['retmean']=self.buf_rets.mean()
        # info['retstd']=self.buf_rets.std()
        # info['vpredmean']=self.rollout.buf_vpreds.mean()
        # info['vpredstd']=self.rollout.buf_vpreds.std()
        info['rew_mean']=np.mean(self.rollout.buf_rews)
        info['eplen_std']=np.std(self.rollout.statlists['eplen'])
        info['eprew_std']=np.std(self.rollout.statlists['eprew'])
        # info['ev']=explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel())
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
        info['best_eplen'] = self.rollout.best_eplen

    # normalize advantages
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environments, not timesteps.
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    def resh(x):
        # Fold the per-env segment dimension into the batch dimension.
        if self.nsegs_per_env == 1:
            return x
        sh = x.shape
        return x.reshape((sh[0] * self.nsegs_per_env,
                          self.nsteps_per_seg) + sh[2:])

    # Create feed_dict for optimization.
    ph_buf = [
        (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
        (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
        (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
        (self.ph_adv, resh(self.buf_advs)),
    ]
    if self.hps['num_vf']==2:
        # Separate return targets per value head.
        ph_buf.extend([
            (self.ph_ret_int, resh(self.buf_rets_int)),
            (self.ph_ret_ext, resh(self.buf_rets_ext)),
        ])
    else:
        ph_buf.extend([
            (self.ph_rews, resh(self.rollout.buf_rews)),
            (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
            (self.ph_ret, resh(self.buf_rets)),
        ])
    ph_buf.extend([
        (self.dynamics.last_ob,
         self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
    ])

    # Optimizes on current data for several epochs.
    mblossvals = []
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
            # Run train op; drop its (None) output, keep the loss values.
            mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])

    # NOTE(review): only the first minibatch's losses are retained; the
    # actual reporting lines below are commented out. Confirm intentional.
    mblossvals = [mblossvals[0]]
    # info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
    # info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0) for (dn, dvs) in self.rollout.statlists.items()})
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    tnow = time.time()
    # info["ups"] = 1. / (tnow - self.t_last_update)
    info["total_secs"] = tnow - self.t_start
    # info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow
    return info
def update(self):
    """One PPO update for the dropout/bootstrap-capable agent variant.

    When ``self.dynamics.dropout`` is set, a second normalized reward
    stream (``buf_rews_mean``) is added to the primary one; when
    ``self.bootstrapped`` is set, a per-head mask is fed to the dynamics
    model. Returns a diagnostics dict.
    """
    if self.normrew:
        # MPI-pooled running-variance normalization of rewards.
        rffs = np.array(
            [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
        if self.dynamics.dropout:
            # Second filter/normalizer for the mean-reward stream.
            rffs2 = np.array([
                self.rff2.update(rew) for rew in self.rollout.buf_rews_mean.T
            ])
            rffs2_mean, rffs2_std, rffs2_count = mpi_moments(rffs2.ravel())
            self.rff_rms2.update_from_moments(rffs2_mean, rffs2_std**2,
                                              rffs2_count)
            rews_m = self.rollout.buf_rews_mean / np.sqrt(
                self.rff_rms2.var)
            rews = rews_m + rews
    else:
        rews = np.copy(self.rollout.buf_rews)
    self.calculate_advantages(rews=rews,
                              use_news=self.use_news,
                              gamma=self.gamma,
                              lam=self.lam)
    # Diagnostics captured before advantage normalization.
    info = dict(advmean=self.buf_advs.mean(),
                advstd=self.buf_advs.std(),
                retmean=self.buf_rets.mean(),
                retstd=self.buf_rets.std(),
                vpredmean=self.rollout.buf_vpreds.mean(),
                vpredstd=self.rollout.buf_vpreds.std(),
                ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                      self.buf_rets.ravel()),
                rew_mean=np.mean(self.rollout.buf_rews),
                recent_best_ext_ret=self.rollout.current_max)
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
    # if self.flipout:
    #     info['dyn_mean'] = np.mean(self.rollout.buf_dyn_rew)
    # normalize advantages
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environments.
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    def resh(x):
        # Fold the segment dimension into the env/batch dimension.
        if self.nsegs_per_env == 1:
            return x
        sh = x.shape
        return x.reshape((sh[0] * self.nsegs_per_env,
                          self.nsteps_per_seg) + sh[2:])

    # Placeholder/buffer pairs that become the session feed dict.
    ph_buf = [
        (self.policy.placeholder_action, resh(self.rollout.buf_acs)),
        (self.placeholder_rews, resh(self.rollout.buf_rews)),
        (self.placeholder_oldvpred, resh(self.rollout.buf_vpreds)),
        (self.placeholder_oldnlp, resh(self.rollout.buf_nlps)),
        (self.policy.placeholder_observation, resh(self.rollout.buf_obs)),
        (self.placeholder_ret, resh(self.buf_rets)),
        (self.placeholder_advantage, resh(self.buf_advs)),
    ]
    ph_buf.extend([(self.dynamics.last_ob,
                    self.rollout.buf_obs_last.reshape([
                        self.nenvs * self.nsegs_per_env, 1,
                        *self.ob_space.shape
                    ]))])
    # if self.flipout:
    #     ph_buf.extend([(self.placeholder_dyn_mean, resh(self.buf_n_dyn_rew))])
    if self.bootstrapped:
        # Per-head bootstrap mask for the ensemble dynamics model.
        ph_buf.extend([
            (self.dynamics.mask_placeholder,
             self.rollout.buf_mask.reshape(-1, self.dynamics.n_heads, 1))
        ])
    mblossvals = []
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({
                self.placeholder_lr: self.lr,
                self.placeholder_cliprange: self.cliprange
            })
            if self.dynamics.dropout:
                # Enable dropout during the training pass.
                fd.update({self.dynamics.is_training: True})
            # Run train op; drop its (None) result, keep the losses.
            mblossvals.append(tf.get_default_session().run(
                self._losses + (self._train, ), fd)[:-1])
    # NOTE(review): only the first minibatch's losses are kept for
    # reporting; later minibatch losses are discarded. Confirm intentional.
    mblossvals = [mblossvals[0]]
    info.update(
        zip(['opt_' + ln for ln in self.loss_names],
            np.mean([mblossvals[0]], axis=0)))
    info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    info.update({
        dn: (np.mean(dvs) if len(dvs) > 0 else 0)
        for (dn, dvs) in self.rollout.statlists.items()
    })
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    tnow = time.time()
    info["ups"] = 1. / (tnow - self.t_last_update)  # updates per second
    info["total_secs"] = tnow - self.t_start
    # Transitions per second across all MPI workers.
    info['tps'] = MPI.COMM_WORLD.Get_size(
    ) * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow
    return info
def update(self):
    """One PPO update plus optional extra DVAE (dynamics) training epochs.

    After the standard PPO epochs, the dynamics/DVAE loss is optimized
    separately for ``self.nepochs_dvae`` additional epochs using the same
    minibatch scheme. Returns a diagnostics dict.
    """
    if self.normrew:
        # Normalize rewards using moments pooled across MPI workers.
        rffs = np.array([self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std ** 2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
    else:
        rews = np.copy(self.rollout.buf_rews)
    # Compute the advantage function from the reward sequence `rews`.
    self.calculate_advantages(rews=rews, use_news=self.use_news, gamma=self.gamma, lam=self.lam)
    # Collect statistics for reporting.
    info = dict(
        advmean=self.buf_advs.mean(),
        advstd=self.buf_advs.std(),
        retmean=self.buf_rets.mean(),
        retstd=self.buf_rets.std(),
        vpredmean=self.rollout.buf_vpreds.mean(),
        vpredstd=self.rollout.buf_vpreds.std(),
        ev=explained_variance(self.rollout.buf_vpreds.ravel(), self.buf_rets.ravel()),
        rew_mean=np.mean(self.rollout.buf_rews),
        rew_mean_norm=np.mean(rews),
        recent_best_ext_ret=self.rollout.current_max
    )
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
    # Normalize advantages by their mean and standard deviation.
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environments.
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    def resh(x):
        # Fold the segment dimension into the env/batch dimension.
        if self.nsegs_per_env == 1:
            return x
        sh = x.shape
        return x.reshape((sh[0] * self.nsegs_per_env, self.nsteps_per_seg) + sh[2:])

    # Pair this class's placeholders with the numpy buffers collected by
    # the rollout, to be used as the session feed dict.
    ph_buf = [
        (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
        (self.ph_rews, resh(self.rollout.buf_rews)),
        (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
        (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
        (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),  # recorded during env interaction
        (self.ph_ret, resh(self.buf_rets)),  # returns computed from the rollout
        (self.ph_adv, resh(self.buf_advs)),  # advantages computed from the rollout
    ]
    ph_buf.extend([
        (self.dynamics.last_ob,
         self.rollout.buf_obs_last.reshape([self.nenvs * self.nsegs_per_env, 1, *self.ob_space.shape]))
    ])
    mblossvals = []  # training losses
    # Train the agent.
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}  # build feed_dict
            fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
            # Compute losses and apply the update in one session run.
            mblossvals.append(getsess().run(self._losses + (self._train,), fd)[:-1])

    # add bai. Train the DVAE separately once more.
    for tmp in range(self.nepochs_dvae):
        print("额外训练dvae. ", tmp)
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}  # build feed_dict
            fd.update({self.ph_lr: self.lr, self.ph_cliprange: self.cliprange})
            # Compute the DVAE loss and apply its dedicated train op.
            d_loss, _ = getsess().run([self.dynamics_loss, self._train_dvae], fd)
            print(d_loss, end=", ")
        print("\n")

    # NOTE(review): only the first minibatch's losses are kept for
    # reporting; later minibatch losses are discarded. Confirm intentional.
    mblossvals = [mblossvals[0]]
    info.update(zip(['opt_' + ln for ln in self.loss_names], np.mean([mblossvals[0]], axis=0)))
    info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    info.update({dn: (np.mean(dvs) if len(dvs) > 0 else 0) for (dn, dvs) in self.rollout.statlists.items()})
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    tnow = time.time()
    info["ups"] = 1. / (tnow - self.t_last_update)  # updates per second
    info["total_secs"] = tnow - self.t_start
    # Transitions per second across all MPI workers.
    info['tps'] = MPI.COMM_WORLD.Get_size() * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow
    return info
def update(self):
    """One PPO update for the recurrent (RNN-state) policy variant.

    Feeds RNN initial states and episode-start masks in addition to the
    usual PPO tensors; extra inputs are enabled by substrings of
    ``self.policy_mode`` ('err', 'ac', 'pred'). Optionally writes
    TensorBoard summaries every ``self.tboard_period`` updates on rank 0.
    Returns a diagnostics dict.
    """
    if self.normrew:
        # MPI-pooled running-variance normalization of rewards.
        rffs = np.array(
            [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
    else:
        rews = np.copy(self.rollout.buf_rews)
    self.calculate_advantages(rews=rews,
                              use_news=self.use_news,
                              gamma=self.gamma,
                              lam=self.lam)
    # Diagnostics captured before advantage normalization.
    info = dict(advmean=self.buf_advs.mean(),
                advstd=self.buf_advs.std(),
                retmean=self.buf_rets.mean(),
                retstd=self.buf_rets.std(),
                vpredmean=self.rollout.buf_vpreds.mean(),
                vpredstd=self.rollout.buf_vpreds.std(),
                ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                      self.buf_rets.ravel()),
                rew_mean=np.mean(self.rollout.buf_rews),
                recent_best_ext_ret=self.rollout.current_max)
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
    # normalize advantages
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environments.
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    def resh(x):
        # Fold the segment dimension into the env/batch dimension.
        if self.nsegs_per_env == 1:
            return x
        sh = x.shape
        return x.reshape((sh[0] * self.nsegs_per_env,
                          self.nsteps_per_seg) + sh[2:])

    # Placeholder/buffer pairs that become the session feed dict.
    ph_buf = [
        (self.trainpol.ph_ac, resh(self.rollout.buf_acs)),
        (self.ph_rews, resh(self.rollout.buf_rews)),
        (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
        (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
        (self.trainpol.ph_ob, resh(self.rollout.buf_obs)),
        (self.ph_ret, resh(self.buf_rets)),
        (self.ph_adv, resh(self.buf_advs)),
    ]
    ph_buf.extend([(self.train_dynamics.last_ob,
                    self.rollout.buf_obs_last.reshape([
                        self.nenvs * self.nsegs_per_env, 1,
                        *self.ob_space.shape
                    ]))])
    # Recurrent-policy inputs: initial states and episode-start masks.
    ph_buf.extend([
        (self.trainpol.states_ph, resh(self.rollout.buf_states_first)),  # rnn inputs
        (self.trainpol.masks_ph, resh(self.rollout.buf_news))
    ])
    if 'err' in self.policy_mode:
        ph_buf.extend([(self.trainpol.pred_error,
                        resh(self.rollout.buf_errs))])  # New
    if 'ac' in self.policy_mode:
        # NOTE(review): ph_ac is already in ph_buf above; this re-adds it
        # alongside ph_ac_first — confirm the duplicate feed is intended.
        ph_buf.extend([(self.trainpol.ph_ac, resh(self.rollout.buf_acs)),
                       (self.trainpol.ph_ac_first,
                        resh(self.rollout.buf_acs_first))])
    if 'pred' in self.policy_mode:
        ph_buf.extend([(self.trainpol.obs_pred,
                        resh(self.rollout.buf_obpreds))])
    # with open(os.getcwd() + "/record_instruction.txt", 'r') as rec_inst:
    #     rec_n = []
    #     rec_all_n = []
    #     while True:
    #         line = rec_inst.readline()
    #         if not line: break
    #         args = line.split()
    #         rec_n.append(int(args[0]))
    #         if len(args) > 1:
    #             rec_all_n.append(int(args[0]))
    # if self.n_updates in rec_n and MPI.COMM_WORLD.Get_rank() == 0:
    #     print("Enter!")
    #     with open(self.logdir + '/full_log' + str(self.n_updates) + '.pk', 'wb') as full_log:
    #         import pickle
    #         debug_data = {"buf_obs" : self.rollout.buf_obs,
    #                       "buf_obs_last" : self.rollout.buf_obs_last,
    #                       "buf_acs" : self.rollout.buf_acs,
    #                       "buf_acs_first" : self.rollout.buf_acs_first,
    #                       "buf_news" : self.rollout.buf_news,
    #                       "buf_news_last" : self.rollout.buf_new_last,
    #                       "buf_rews" : self.rollout.buf_rews,
    #                       "buf_ext_rews" : self.rollout.buf_ext_rews}
    #         if self.n_updates in rec_all_n:
    #             debug_data.update({"buf_err": self.rollout.buf_errs,
    #                                "buf_err_last": self.rollout.buf_errs_last,
    #                                "buf_obpreds": self.rollout.buf_obpreds,
    #                                "buf_obpreds_last": self.rollout.buf_obpreds_last,
    #                                "buf_vpreds": self.rollout.buf_vpreds,
    #                                "buf_vpred_last": self.rollout.buf_vpred_last,
    #                                "buf_states": self.rollout.buf_states,
    #                                "buf_states_first": self.rollout.buf_states_first,
    #                                "buf_nlps": self.rollout.buf_nlps,})
    #         pickle.dump(debug_data, full_log)
    mblossvals = []
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({
                self.ph_lr: self.lr,
                self.ph_cliprange: self.cliprange
            })
            # Run train op; drop its (None) result, keep the losses.
            mblossvals.append(getsess().run(self._losses +
                                            (self._train, ), fd)[:-1])
    # NOTE(review): only the first minibatch's losses are kept for
    # reporting; later minibatch losses are discarded. Confirm intentional.
    mblossvals = [mblossvals[0]]
    info.update(
        zip(['opt_' + ln for ln in self.loss_names],
            np.mean([mblossvals[0]], axis=0)))
    info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    info.update({
        dn: (np.mean(dvs) if len(dvs) > 0 else 0)
        for (dn, dvs) in self.rollout.statlists.items()
    })
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    tnow = time.time()
    info["ups"] = 1. / (tnow - self.t_last_update)  # updates per second
    info["total_secs"] = tnow - self.t_start
    # Transitions per second across all MPI workers.
    info['tps'] = MPI.COMM_WORLD.Get_size(
    ) * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow
    # New
    if 'err' in self.policy_mode:
        # RMS of the prediction-error buffer.
        info["error"] = np.sqrt(np.power(self.rollout.buf_errs, 2).mean())
    # Periodic TensorBoard logging on rank 0 only.
    if self.n_updates % self.tboard_period == 0 and MPI.COMM_WORLD.Get_rank(
    ) == 0:
        if self.full_tensorboard_log:
            # `fd` here is the LAST minibatch's feed dict from the loop above.
            summary = getsess().run(self.merged_summary_op, fd)  # New
            self.summary_writer.add_summary(
                summary, self.rollout.stats["tcount"])  # New
        for k, v in info.items():
            summary = tf.Summary(value=[
                tf.Summary.Value(tag=k, simple_value=v),
            ])
            self.summary_writer.add_summary(summary,
                                            self.rollout.stats["tcount"])
    return info
def main():
    """Train/evaluate a ResNet-34 classifier with checkpointing and logging.

    Relies on module-level globals: ``args`` (CLI options), ``best_score``,
    ``use_cuda``, ``state`` (LR schedule dict), and helpers ``train``,
    ``test``, ``adjust_learning_rate``, ``adjust_ortho_decay_rate``,
    ``adjust_sen_decay``, ``save_checkpoint`` — none defined in this block.
    """
    global best_score
    start_epoch = args.start_epoch
    # Data Loader
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    # First pass over each split computes per-channel normalization stats.
    # NOTE(review): test-set stats are computed from the test data itself —
    # usually train stats are reused; confirm this is intended.
    data_train = datasets.ImageFolder(
        traindir, transforms.Compose([transforms.ToTensor()]))
    mean_tr, std_tr = get_mean_and_std(data_train)
    data_test = datasets.ImageFolder(
        valdir, transforms.Compose([transforms.ToTensor()]))
    mean_te, std_te = get_mean_and_std(data_test)
    # Note that for imgaug, we should convert the PIL images to NumPy arrays before applying the transforms.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean_tr, std=std_tr)
        ]))
    test_dataset = datasets.ImageFolder(
        valdir,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=mean_te, std=std_te)
        ]))
    # Class-imbalance-aware sampling on train; plain sequential on val.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        sampler=ImbalancedDatasetSampler(train_dataset),
        batch_size=args.train_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)
    val_loader = torch.utils.data.DataLoader(
        test_dataset  #, sampler=ImbalancedDatasetSampler(test_dataset)
        ,
        batch_size=args.test_batch,
        shuffle=False,
        num_workers=args.workers,
        pin_memory=True)
    # test_loader = torch.utils.data.DataLoader(test_dataset #, sampler=ImbalancedDatasetSampler(test_dataset)
    #     ,batch_size=320, shuffle=False, num_workers=args.workers, pin_memory=True)
    # for inputs, targets in train_loader:
    # Create Model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = resnet34()
    model = model.to(device)
    summary(model, (3, 224, 224))
    # for child in model.named_children():
    #     print(child)
    # model.fc.weight
    # (list(model.layer4.children()))[0].conv1.weights
    # Get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))
    model = torch.nn.DataParallel(model).cuda()
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(name)
    #cudnn.benchmark = True
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          nesterov=args.nesterov,
                          weight_decay=args.weight_decay)
    title = 'AF'
    if args.resume:
        # Load checkpoint.
        print('==> Resuming from checkpoint..')
        assert os.path.isfile(
            args.resume), 'Error: no checkpoint directory found!'
        args.checkpoint = os.path.dirname(args.resume)
        checkpoint = torch.load(args.resume)
        best_score = checkpoint['best_score']
        print(best_score)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title,
                        resume=True)
    else:
        logger = Logger(os.path.join(args.checkpoint, 'log.txt'),
                        title=title)
        logger.set_names([
            'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc 1.',
            'Valid Acc 1.'
        ])
    if args.evaluate:
        print('\nEvaluation only')
        test_loss, test_acc = test(val_loader, model, criterion, start_epoch,
                                   use_cuda)
        print(' Test Loss: %.8f, Test Acc: %.2f' % (test_loss, test_acc))
        return
    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)
        # print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, state['lr']))
        # Adjust Orhto decay rate
        odecay = adjust_ortho_decay_rate(epoch + 1)
        sendecay = adjust_sen_decay(epoch + 1)
        train_loss, train_acc = train(train_loader, model, criterion,
                                      optimizer, epoch, use_cuda, odecay,
                                      sendecay)
        test_loss, test_acc = test(val_loader, model, criterion, epoch,
                                   use_cuda)
        # append logger file
        logger.append(
            [state['lr'], train_loss, test_loss, train_acc, test_acc])
        # save model
        is_best = test_acc > best_score
        best_score = max(test_acc, best_score)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'acc': test_acc,
                'best_score': best_score,
                'optimizer': optimizer.state_dict(),
            },
            is_best,
            checkpoint=args.checkpoint)
    logger.close()
    logger.plot()
    savefig(os.path.join(args.checkpoint, 'log.eps'))
    print('Best Fscore:')
    print(best_score)
def update(self):
    """One PPO update for the early-stop / depth-prediction / LSTM variant.

    Supports optional gradient masking on early-stopped segments
    (``self.early_stop``), extra depth/auxiliary-input feeds, LSTM state
    feeds, and gradient-summary logging (``self.log_grads``). Returns a
    diagnostics dict.
    """
    if self.normrew:
        # MPI-pooled running-variance normalization of rewards.
        rffs = np.array(
            [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
    else:
        rews = np.copy(self.rollout.buf_rews)
    self.calculate_advantages(rews=rews,
                              use_news=self.use_news,
                              gamma=self.gamma,
                              lam=self.lam)
    # Diagnostics captured before advantage normalization.
    info = dict(advmean=self.buf_advs.mean(),
                advstd=self.buf_advs.std(),
                retmean=self.buf_rets.mean(),
                retstd=self.buf_rets.std(),
                vpredmean=self.rollout.buf_vpreds.mean(),
                vpredstd=self.rollout.buf_vpreds.std(),
                ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                      self.buf_rets.ravel()),
                rew_mean=np.mean(self.rollout.buf_rews),
                recent_best_ext_ret=self.rollout.current_max)
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
    # normalize advantages
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environments.
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)

    def mask(x, grad_mask):
        # Zero out entries of x according to grad_mask (numpy broadcast).
        # NOTE(review): this helper is never called in this method —
        # possibly dead code left from the tf-based masking experiments.
        if self.early_stop:
            #print("x shape: {}".format(np.shape(x)))
            #grad_mask = self.rollout.grad_mask
            #print("mask shape: {}".format(np.shape(pseudo_dones)))
            #no_grad_mask = 1 - grad_mask
            sh = np.shape(x)
            if sh[1] < np.shape(grad_mask)[1]:
                return x
            broadcast_shape = (sh[0], sh[1]) + sh[2:]
            #print("mask shape: {}".format(broadcast_shape))
            for i in range(len(broadcast_shape) - 2):
                # no_grad_mask = tf.expand_dims(no_grad_mask, -1)
                grad_mask = np.expand_dims(grad_mask, -1)
            #no_grad_mask =tf.cast(no_grad_mask, x.dtype)
            #grad_mask = tf.cast(grad_mask, x.dtype)
            #result = tf.placeholder(x.dtype, shape=broadcast_shape)
            #result = tf.stop_gradient(tf.multiply(no_grad_mask, x)) + tf.multiply(grad_mask, x)
            #print("Result size: {}".format(result.shape))
            result = np.multiply(grad_mask, x)
            return result
        else:
            return x

    def resh(x):
        # Fold the segment dimension into the env/batch dimension.
        if self.nsegs_per_env == 1:
            return x
        sh = x.shape
        return x.reshape((sh[0] * self.nsegs_per_env,
                          self.nsteps_per_seg) + sh[2:])

    # Debug prints of episode-boundary / gradient-mask buffers.
    new_count = np.count_nonzero(self.rollout.buf_news)
    print(self.rollout.buf_news)
    if self.early_stop:
        print(self.rollout.grad_mask)
    # NOTE(review): nesting of this print relative to the `if` above is
    # ambiguous in the original formatting — placed at method level here.
    print(new_count)
    # Placeholder/buffer pairs that become the session feed dict.
    ph_buf = [
        (self.stochpol.ph_ac, resh(self.rollout.buf_acs)),
        (self.ph_rews, resh(self.rollout.buf_rews)),
        (self.ph_oldvpred, resh(self.rollout.buf_vpreds)),
        (self.ph_oldnlp, resh(self.rollout.buf_nlps)),
        (self.stochpol.ph_ob, resh(self.rollout.buf_obs)),
        (self.ph_ret, resh(self.buf_rets)),
        (self.ph_adv, resh(self.buf_advs)),
    ]
    if self.depth_pred:
        # Depth targets for the auxiliary depth-prediction head.
        ph_buf.extend([
            (self.stochpol.ph_depths, resh(self.rollout.buf_depths)),
        ])
    if self.aux_input:
        # Extra policy inputs: velocity, previous reward, previous action.
        ph_buf.extend([
            (self.stochpol.ph_vel, resh(self.rollout.buf_vels)),
            (self.stochpol.ph_prev_rew,
             resh(self.rollout.buf_prev_ext_rews)),
            (self.stochpol.ph_prev_ac, resh(self.rollout.buf_prev_acs)),
        ])
    if self.dynamics.auxiliary_task.features_shared_with_policy:
        # Reuse policy features for the auxiliary task instead of recomputing.
        ph_buf.extend([
            (self.dynamics.auxiliary_task.ph_features,
             resh(self.rollout.buf_feats)),
            (self.dynamics.auxiliary_task.ph_last_features,
             resh(np.expand_dims(self.rollout.buf_feat_last, axis=1))),
        ])
    #print("Buff obs shape: {}".format(self.rollout.buf_obs.shape))
    #print("Buff rew shape: {}".format(self.rollout.buf_rews.shape))
    #print("Buff nlps shape: {}".format(self.rollout.buf_nlps.shape))
    #print("Buff vpreds shape: {}".format(self.rollout.buf_vpreds.shape))
    ph_buf.extend([(self.dynamics.last_ob,
                    self.rollout.buf_obs_last.reshape([
                        self.nenvs * self.nsegs_per_env, 1,
                        *self.ob_space.shape
                    ]))])
    mblossvals = []
    #if self.lstm:
    #    print("Train lstm 1 state: {}, {}".format(self.rollout.train_lstm1_c, self.rollout.train_lstm1_h))
    #if self.lstm2_size:
    #    print("Train lstm2 state: {}, {}".format(self.rollout.train_lstm2_c, self.rollout.train_lstm2_h))
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env, envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            #mbenvinds = tf.convert_to_tensor(mbenvinds)
            #fd = {ph: buf[mbenvinds] if type(buf) is np.ndarray else buf.eval()[mbenvinds] for (ph, buf) in ph_buf}
            if self.early_stop:
                # Feed the per-minibatch gradient mask alongside the buffers.
                grad_mask = self.rollout.grad_mask[mbenvinds]
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
                fd.update({self.ph_gradmask: grad_mask})
            else:
                fd = {ph: buf[mbenvinds] for (ph, buf) in ph_buf}
            fd.update({
                self.ph_lr: self.lr,
                self.ph_cliprange: self.cliprange
            })
            if self.lstm:
                # Initial LSTM state for this minibatch's environments.
                fd.update({
                    self.stochpol.c_in_1:
                    self.rollout.train_lstm1_c[mbenvinds, :],
                    self.stochpol.h_in_1:
                    self.rollout.train_lstm1_h[mbenvinds, :]
                })
            if self.lstm and self.lstm2_size:
                fd.update({
                    self.stochpol.c_in_2:
                    self.rollout.train_lstm2_c[mbenvinds, :],
                    self.stochpol.h_in_2:
                    self.rollout.train_lstm2_h[mbenvinds, :]
                })
            if self.log_grads:
                # Also fetch the gradient summary and write it out.
                outs = getsess().run(
                    self._losses + (self._train, self._summary), fd)
                losses = outs[:-2]
                summary = outs[-1]
                mblossvals.append(losses)
                wandb.tensorflow.log(tf.summary.merge_all())
                self.grad_writer.add_summary(
                    summary, getsess().run(self.global_step))
            else:
                # Run train op; drop its (None) result, keep the losses.
                mblossvals.append(getsess().run(
                    self._losses + (self._train, ), fd)[:-1])
    # NOTE(review): only the first minibatch's losses are kept for
    # reporting; later minibatch losses are discarded. Confirm intentional.
    mblossvals = [mblossvals[0]]
    info.update(
        zip(['opt_' + ln for ln in self.loss_names],
            np.mean([mblossvals[0]], axis=0)))
    info["rank"] = MPI.COMM_WORLD.Get_rank()
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    info.update({
        dn: (np.mean(dvs) if len(dvs) > 0 else 0)
        for (dn, dvs) in self.rollout.statlists.items()
    })
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    tnow = time.time()
    info["ups"] = 1. / (tnow - self.t_last_update)  # updates per second
    info["total_secs"] = tnow - self.t_start
    # Transitions per second across all MPI workers.
    info['tps'] = MPI.COMM_WORLD.Get_size(
    ) * self.rollout.nsteps * self.nenvs / (tnow - self.t_last_update)
    self.t_last_update = tnow
    return info
transforms = utils.get_trans(size=cfg.img_size) train_dst = MyDataset(x_train, y_train, transform=transforms['train']) valid_dst = MyDataset(x_val, y_val, transform=transforms['val']) train_loader = torch.utils.data.DataLoader(train_dst, batch_size=cfg.bs, shuffle=True, pin_memory=True) valid_loader = torch.utils.data.DataLoader(valid_dst, batch_size=cfg.bs, shuffle=False, pin_memory=True) # 得到均值方差 print(utils.get_mean_and_std(train_dst)) # 使用多模型融合 models_list = get_model(cfg.model_names) for i, cur_cnn in enumerate(models_list): cnn = cur_cnn # 因为要保存model name = cfg.model_names[i] + '.pkl' cnn.to(device) # 训练数据 loss_fn = nn.CrossEntropyLoss() # loss_fn = utils.LabelSmoothingCrossEntropy() optimizer = optim.Adam(cnn.parameters(), lr=cfg.lr, weight_decay=1e-4) # optimizer = optim.SGD(cnn.parameters(), lr=cfg.lr, momentum=0.9, nesterov=True) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', patience=3,
alpha = (array - mean) * (1. / std) return (1 - numpy.exp(-alpha)) / (1 + numpy.exp(-alpha)) for name in feature_names: if "leptons_" == name or "jets_" == name or "objects_" in name: feature_names.remove(name) print("Here are the ordered global features:", feature_names) if args.z_score: preprocess_dict = {} for feature in feature_names: if ("objects_" not in feature and "leptons_" != feature and "jets_" != feature): mean, stddev = utils.get_mean_and_std(features[feature]) preprocess_dict[feature] = { "mean": float(mean), "std_dev": float(stddev) } global_features = utils.create_array(features, feature_names, preprocess_dict, args.z_score) global_features_validation = utils.create_array(features_validation, feature_names, preprocess_dict, args.z_score) global_features_data = utils.create_array(features_data, feature_names, preprocess_dict, args.z_score) global_features_final_fit = utils.create_array(features_final_fit, feature_names, preprocess_dict, args.z_score)
def update(self):
    """Run one PPO update over the data currently held in ``self.rollout``.

    Steps:
      1. Optionally normalize rewards by a running std estimate
         (``self.rff_rms``), with moments pooled across MPI ranks.
      2. Compute advantages/returns via ``self.calculate_advantages``.
      3. Optionally normalize advantages to zero mean / unit std.
      4. For ``self.nepochs`` epochs, iterate minibatches of environment
         indices and take one optimizer step per minibatch on the joint
         loss (policy + entropy + value + auxiliary + dynamics).

    Returns:
        dict of scalar diagnostics (mean losses, advantage/return stats,
        timing counters) for logging.
    """
    if self.normrew:
        # Feed each env's reward column through the reward filter, pool
        # the moments across MPI workers, then scale rewards by the
        # running std estimate.
        rffs = np.array(
            [self.rff.update(rew) for rew in self.rollout.buf_rews.T])
        rffs_mean, rffs_std, rffs_count = mpi_moments(rffs.ravel())
        self.rff_rms.update_from_moments(rffs_mean, rffs_std**2, rffs_count)
        rews = self.rollout.buf_rews / np.sqrt(self.rff_rms.var)
    else:
        # Copy so advantage computation cannot mutate the rollout buffer.
        rews = np.copy(self.rollout.buf_rews)
    self.calculate_advantages(rews=rews, use_news=self.use_news,
                              gamma=self.gamma, lam=self.lam)
    # Pre-optimization diagnostics of this batch.
    info = dict(advmean=self.buf_advs.mean(),
                advstd=self.buf_advs.std(),
                retmean=self.buf_rets.mean(),
                retstd=self.buf_rets.std(),
                vpredmean=self.rollout.buf_vpreds.mean(),
                vpredstd=self.rollout.buf_vpreds.std(),
                ev=explained_variance(self.rollout.buf_vpreds.ravel(),
                                      self.buf_rets.ravel()),
                rew_mean=np.mean(self.rollout.buf_rews),
                recent_best_ext_ret=self.rollout.current_max)
    if self.rollout.best_ext_ret is not None:
        info['best_ext_ret'] = self.rollout.best_ext_ret
    # Loss accumulators; every minibatch adds
    # loss / (nminibatches * nepochs), so the final values are means.
    to_report = {
        'total': 0.0,
        'pg': 0.0,
        'vf': 0.0,
        'ent': 0.0,
        'approxkl': 0.0,
        'clipfrac': 0.0,
        'aux': 0.0,
        'dyn_loss': 0.0,
        'feat_var': 0.0
    }
    # normalize advantages (zero mean / unit std; eps guards div-by-zero)
    if self.normadv:
        m, s = get_mean_and_std(self.buf_advs)
        self.buf_advs = (self.buf_advs - m) / (s + 1e-7)
    # Minibatches are formed over environment indices, not timesteps.
    envsperbatch = (self.nenvs * self.nsegs_per_env) // self.nminibatches
    envsperbatch = max(1, envsperbatch)
    envinds = np.arange(self.nenvs * self.nsegs_per_env)
    mblossvals = []  # NOTE(review): populated nowhere below — appears vestigial
    for _ in range(self.nepochs):
        np.random.shuffle(envinds)
        for start in range(0, self.nenvs * self.nsegs_per_env,
                           envsperbatch):
            end = start + envsperbatch
            mbenvinds = envinds[start:end]
            # Slice this minibatch's environments out of the buffers.
            acs = self.rollout.buf_acs[mbenvinds]
            rews = self.rollout.buf_rews[mbenvinds]
            vpreds = self.rollout.buf_vpreds[mbenvinds]
            nlps = self.rollout.buf_nlps[mbenvinds]
            obs = self.rollout.buf_obs[mbenvinds]
            rets = self.buf_rets[mbenvinds]
            advs = self.buf_advs[mbenvinds]
            last_obs = self.rollout.buf_obs_last[mbenvinds]
            lr = self.lr  # NOTE(review): assigned but unused in this body
            cliprange = self.cliprange
            # Recompute policy and auxiliary/dynamics features on this
            # minibatch before evaluating the losses.
            self.stochpol.update_features(obs, acs)
            self.dynamics.auxiliary_task.update_features(obs, last_obs)
            self.dynamics.update_features(obs, last_obs)
            feat_loss = torch.mean(self.dynamics.auxiliary_task.get_loss())
            dyn_loss = torch.mean(self.dynamics.get_loss())
            acs = torch.tensor(flatten_dims(acs, len(self.ac_space.shape)))
            neglogpac = self.stochpol.pd.neglogp(acs)
            entropy = torch.mean(self.stochpol.pd.entropy())
            vpred = self.stochpol.vpred
            vf_loss = 0.5 * torch.mean(
                (vpred.squeeze() - torch.tensor(rets))**2)
            nlps = torch.tensor(flatten_dims(nlps, 0))
            # nlps holds the rollout-time (old) -log pi; neglogpac is the
            # current policy's, so exp(old - new) = pi_new / pi_old.
            ratio = torch.exp(nlps - neglogpac.squeeze())
            advs = flatten_dims(advs, 0)
            negadv = torch.tensor(-advs)
            # PPO clipped surrogate: maximize min(r*A, clip(r)*A), i.e.
            # minimize max(-A*r, -A*clip(r)).
            pg_losses1 = negadv * ratio
            pg_losses2 = negadv * torch.clamp(
                ratio, min=1.0 - cliprange, max=1.0 + cliprange)
            pg_loss_surr = torch.max(pg_losses1, pg_losses2)
            pg_loss = torch.mean(pg_loss_surr)
            ent_loss = (-self.ent_coef) * entropy
            approxkl = 0.5 * torch.mean((neglogpac - nlps)**2)
            # Fraction of samples where the clipped term was active.
            clipfrac = torch.mean(
                (torch.abs(pg_losses2 - pg_loss_surr) > 1e-6).float())
            feat_var = torch.std(self.dynamics.auxiliary_task.features)
            total_loss = pg_loss + ent_loss + vf_loss + feat_loss + dyn_loss
            total_loss.backward()
            self.optimizer.step()
            # Grads are cleared after each step, so the next backward()
            # accumulates from zero.
            self.optimizer.zero_grad()
            to_report['total'] += total_loss.data.numpy() / (
                self.nminibatches * self.nepochs)
            to_report['pg'] += pg_loss.data.numpy() / (self.nminibatches *
                                                       self.nepochs)
            to_report['vf'] += vf_loss.data.numpy() / (self.nminibatches *
                                                       self.nepochs)
            to_report['ent'] += ent_loss.data.numpy() / (
                self.nminibatches * self.nepochs)
            to_report['approxkl'] += approxkl.data.numpy() / (
                self.nminibatches * self.nepochs)
            to_report['clipfrac'] += clipfrac.data.numpy() / (
                self.nminibatches * self.nepochs)
            to_report['feat_var'] += feat_var.data.numpy() / (
                self.nminibatches * self.nepochs)
            to_report['aux'] += feat_loss.data.numpy() / (
                self.nminibatches * self.nepochs)
            to_report['dyn_loss'] += dyn_loss.data.numpy() / (
                self.nminibatches * self.nepochs)
    info.update(to_report)
    self.n_updates += 1
    info["n_updates"] = self.n_updates
    # Fold in rollout statistics (empty stat lists report 0).
    info.update({
        dn: (np.mean(dvs) if len(dvs) > 0 else 0)
        for (dn, dvs) in self.rollout.statlists.items()
    })
    info.update(self.rollout.stats)
    if "states_visited" in info:
        info.pop("states_visited")
    # Timing: updates/sec and transitions/sec since the last update.
    tnow = time.time()
    info["ups"] = 1. / (tnow - self.t_last_update)
    info["total_secs"] = tnow - self.t_start
    info['tps'] = self.rollout.nsteps * self.nenvs / (
        tnow - self.t_last_update)  # MPI.COMM_WORLD.Get_size() *
    self.t_last_update = tnow
    return info
'''AlexNet for CIFAR10. FC layers are removed. Paddings are adjusted.

Without BN, the start learning rate should be 0.01

(c) YANG, Wei
'''
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data as data
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import models.cifar as models
from utils import get_mean_and_std

# Measure per-channel normalization statistics for the CBAS-34 training
# images. Only ToTensor() is applied, so the statistics reflect the raw
# (unnormalized) pixel values.
to_tensor_only = transforms.Compose([transforms.ToTensor()])
cbas_train = datasets.ImageFolder(root='../Data/coco/images/cbas34_train',
                                  transform=to_tensor_only)
cbas_mean, cbas_std = get_mean_and_std(cbas_train)
print('CBAS-34 mean: {}'.format(cbas_mean))
print('CBAS-34 std: {}'.format(cbas_std))