def load_data(path_data, action_space, force_reload=False):
    """Load (or build and cache) the processed train/eval split.

    Reads raw pickled batches from ``path_data``, concatenates their inputs
    and outputs, drops rows whose outputs contain NaN/inf, reweights rows so
    every source file contributes equally, shuffles, and splits off the last
    500 rows for evaluation.  The result is cached under
    ``path_data + ', processed'`` and reused on later calls.

    Args:
        path_data: directory holding the raw pickled batches.
        action_space: per-action dimensionality; inputs are
            ``2 * action_space`` wide, outputs ``4 * action_space`` wide.
        force_reload: when True, rebuild even if a cached file exists.

    Returns:
        ``(train_x, train_y, train_weight, eval_x, eval_y, eval_weight)``.
    """
    path_data_processed = path_data + ', processed'
    file_data_processed = path_data_processed + '/data'
    if not force_reload and os.path.exists(file_data_processed):
        print(f'load data from {file_data_processed}')
        vs = load_vars(file_data_processed)
        return vs
    print(f'load data from {path_data}')
    tools.mkdir(path_data_processed)
    files = tools.get_files(path_rel=path_data, sort=True)
    # Collect per-file arrays and concatenate once at the end: repeated
    # np.concatenate inside the loop is quadratic in the total row count.
    inputs_list, outputs_list = [], []
    # np.int was removed in NumPy 1.24; plain int is the documented spelling.
    counts = np.zeros((len(files)), dtype=int)
    for ind, f in enumerate(files):
        mu0s_ats_batch, logsigma0s_batch, ress = load_vars(f)
        inputs = np.concatenate((mu0s_ats_batch, logsigma0s_batch), axis=-1)
        max_values = np.array([res['max'].x for res in ress])
        min_values = np.array([res['min'].x for res in ress])
        outputs = np.concatenate((max_values, min_values), axis=-1)
        inputs_list.append(inputs)
        outputs_list.append(outputs)
        counts[ind] = mu0s_ats_batch.shape[0]
    # The zero-row seed arrays keep the concatenation well-defined (and the
    # column counts fixed) even when `files` is empty.
    inputs_final = np.concatenate(
        [np.zeros((0, 2 * action_space))] + inputs_list)   # shape: (None, 2*action_space)
    outputs_final = np.concatenate(
        [np.zeros((0, 4 * action_space))] + outputs_list)  # shape: (None, 4*action_space)
    # Weight each row inversely to its file's size so that, on average,
    # every source file contributes equally to the training signal.
    cnt_normalize = counts.mean()
    weights = np.concatenate(
        [cnt_normalize * 1. / cnt * np.ones(cnt) for cnt in counts], axis=0)
    # --- delete rows whose outputs contain NaN or inf
    inds_reserve = np.logical_and(~np.isnan(outputs_final).any(axis=1),
                                  ~np.isinf(outputs_final).any(axis=1))
    inputs_final = inputs_final[inds_reserve]
    outputs_final = outputs_final[inds_reserve]
    weights = weights[inds_reserve]
    # --- shuffle all three arrays with one shared permutation
    N = inputs_final.shape[0]
    inds_shuffle = np.random.permutation(N)
    inputs_final = inputs_final[inds_shuffle]
    outputs_final = outputs_final[inds_shuffle]
    weights = weights[inds_shuffle]
    ind_split = -500  # the last 500 rows become the evaluation set
    train_x, train_y, train_weight = \
        inputs_final[:ind_split], outputs_final[:ind_split], weights[:ind_split]
    eval_x, eval_y, eval_weight = \
        inputs_final[ind_split:], outputs_final[ind_split:], weights[ind_split:]
    save_vars(file_data_processed, train_x, train_y, train_weight, eval_x,
              eval_y, eval_weight)
    return train_x, train_y, train_weight, eval_x, eval_y, eval_weight
def atari_arg_parser():
    """Build the argparse.ArgumentParser used by run_atari.py."""
    p = arg_parser()
    p.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    p.add_argument('--seed', help='RNG seed', type=int, default=0)
    p.add_argument('--num-timesteps', type=int, default=int(10e6))
    p.add_argument('--clipped_type', default='kl2clip', type=str)
    # literal_eval lets these flags accept Python literals (None, True, 0.1, ...)
    p.add_argument('--use_tabular', default=False, type=ast.literal_eval)
    p.add_argument('--cliprange', default=0.1, type=ast.literal_eval)
    p.add_argument('--delta_kl', default=0.001, type=float)
    # Make sure the default root directory exists before advertising it.
    root_dir_default = '/tmp/baselines'
    if not os.path.exists(root_dir_default):
        tools.mkdir(root_dir_default)
    p.add_argument('--root_dir', default=root_dir_default, type=str)
    p.add_argument('--sub_dir', default=None, type=str)
    p.add_argument('--force_write', default=1, type=int)
    return p
def load_data_normal(path_data, USE_MULTIPROCESSING=True):
    """Load (or build and cache) fsolve-verified training data.

    Loads actions/deltas and the TF-optimised (mu, logsigma) extrema from the
    pickles under ``path_data``, re-solves for mu with fsolve (optionally in a
    4-worker pool), compares the two solutions, then shuffles, drops NaN rows
    and splits off the last 3000 rows for evaluation.  The result is cached
    under ``{path_data}/train_preprocessed_reduce_v3/data``.

    Args:
        path_data: directory holding the raw ``.pkl`` batches.
        USE_MULTIPROCESSING: when True, run the fsolve step in a process pool.

    Returns:
        ``(train_x, train_y, train_weight, eval_x, eval_y, eval_weight)``.
    """
    path_save = f'{path_data}/train_preprocessed_reduce_v3'
    if os.path.exists(f'{path_save}/data'):
        print(f'load data from {path_save}/data')
        vs = load_vars(f'{path_save}/data')
        return vs
    tools.mkdir(f'{path_data}/train_preprocessed')
    # NOTE(review): the data is saved under `path_save` below, yet only
    # 'train_preprocessed' was created above — create `path_save` too so the
    # final save_vars cannot fail on a missing directory.
    tools.mkdir(path_save)
    files = tools.get_files(path_rel=path_data, only_sub=False, sort=False, suffix='.pkl')
    actions, deltas, max_mu_logsigma, min_mu_logsigma = [], [], [], []
    # NOTE(review): only the first file is consumed (files[:1]) — looks like a
    # leftover debugging restriction; confirm before widening to all files.
    for ind, f in enumerate(files[:1]):
        a_s_batch, _, _, ress_tf = load_vars(f)
        actions.append(a_s_batch)
        deltas.append(np.ones_like(a_s_batch) * ress_tf.delta)
        min_mu_logsigma.append(ress_tf.x.min)
        max_mu_logsigma.append(ress_tf.x.max)
    actions = np.concatenate(actions, axis=0)
    deltas = np.concatenate(deltas, axis=0)
    min_mu_logsigma = np.concatenate(min_mu_logsigma, axis=0)
    max_mu_logsigma = np.concatenate(max_mu_logsigma, axis=0)
    # Keep only the mu half of each (mu, logsigma) pair.
    min_mu_tfopt, _ = np.split(min_mu_logsigma, indices_or_sections=2, axis=-1)
    max_mu_tfopt, _ = np.split(max_mu_logsigma, indices_or_sections=2, axis=-1)
    time0 = time.time()
    calculate_mu = get_calculate_mu_func(True)
    # TODO: the following recomputes mu via fsolve (mu_logsigma_fsolve).
    if USE_MULTIPROCESSING:
        # The context manager terminates the workers on exit; the original
        # bare Pool(4) leaked the worker processes.
        with multiprocessing.Pool(4) as p:
            min_mu_fsolve = p.map(calculate_mu, zip(min_mu_tfopt, actions, deltas))
            max_mu_fsolve = p.map(calculate_mu, zip(max_mu_tfopt, actions, deltas))
    else:
        min_mu_fsolve = list(map(calculate_mu, zip(min_mu_tfopt, actions, deltas)))
        max_mu_fsolve = list(map(calculate_mu, zip(max_mu_tfopt, actions, deltas)))
    min_mu_fsolve = [_[0] for _ in min_mu_fsolve]
    max_mu_fsolve = [_[0] for _ in max_mu_fsolve]
    time1 = time.time()
    print(time1 - time0)
    # Compare the TF-optimised mu against the fsolve solution (printed diff
    # should be ~0 if both solvers agree).
    mu_tf_opt = np.concatenate((min_mu_tfopt, max_mu_tfopt), axis=1)
    mu_fsolve = np.stack(
        (np.concatenate(min_mu_fsolve, axis=0).squeeze(),
         np.concatenate(max_mu_fsolve, axis=0).squeeze()), axis=1)
    print(mu_tf_opt - mu_fsolve)
    inds_shuffle = np.random.permutation(actions.shape[0])
    all_ = np.concatenate((actions, deltas, mu_fsolve), axis=1)[inds_shuffle]
    # Drop rows containing NaN.
    all_ = all_[~np.isnan(all_).any(axis=1)]
    # inputs: (actions, deltas); outputs: (lambda_min_true, lambda_max_true)
    inputs_all, outputs_all = np.split(all_, indices_or_sections=2, axis=1)
    weights = np.ones(shape=(inputs_all.shape[0],))
    print(outputs_all.shape)
    ind_split = -3000  # the last 3000 rows become the evaluation set
    train_x, train_y, train_weight = \
        inputs_all[:ind_split], outputs_all[:ind_split], weights[:ind_split]
    eval_x, eval_y, eval_weight = \
        inputs_all[ind_split:], outputs_all[ind_split:], weights[ind_split:]
    save_vars(f'{path_save}/data', train_x, train_y, train_weight, eval_x,
              eval_y, eval_weight)
    return train_x, train_y, train_weight, eval_x, eval_y, eval_weight,
def main():
    """Entry point (MuJoCo): parse args, derive a run sub-directory name,
    move aside any pre-existing log/model dirs, dump the config to JSON,
    train, and optionally replay the trained model forever."""
    args = mujoco_arg_parser().parse_args()
    if args.clipped_type == 'kl2clip':
        name_tmp = ''
        # Exactly one of cliprange / delta_kl may be provided for KL2Clip.
        assert (args.cliprange is None) is not (
            args.delta_kl is None
        ), "TRPPO can receive only one of cliprange and delta_kl arguments"
        if args.cliprange:
            args.kl2clip_clipcontroltype = 'base-clip'
        else:
            args.kl2clip_clipcontroltype = 'none-clip'
    else:
        name_tmp = ''
        assert args.cliprange, "PPO has to receive a cliprange parameter, the default one is 0.2"
    # --- Generate sub_dir of log dir and model dir
    split = ','
    if args.sub_dir is None:
        # Build the sub-directory name from every arg except these keys.
        keys_except = [
            'env', 'play', 'root_dir', 'sub_dir', 'force_write', 'lr',
            'kl2clip_clipcontroltype'
        ]  # TODO: tmp for kl2clip_sharelogsigma
        # Keys rendered with an explicit format spec instead of str().
        keys_fmt = {'num_timesteps': '.0e'}
        args_dict = vars(args)
        sub_dir = args.env
        if not args.clipped_type in ['kl2clip']:
            keys_except += ['delta_kl']
        if not args.clipped_type in ['origin', 'kl2clip', 'a2c']:
            keys_except += ['cliprange']
        # --- add keys common
        for key in args_dict.keys():
            if key not in keys_except and key not in keys_fmt.keys():
                sub_dir += f'{split} {key}={args_dict[key]}'
        # --- add keys which has specific format
        for key in keys_fmt.keys():
            sub_dir += f'{split} {key}={args_dict[key]:{keys_fmt[key]}}'
        sub_dir += ('' if name_tmp == '' else f'{split} {name_tmp}')
        args.sub_dir = sub_dir
    tools.mkdir(f'{args.root_dir}/log')
    tools.mkdir(f'{args.root_dir}/model')
    args.log_dir = f'{args.root_dir}/log/{args.sub_dir}'
    args.model_dir = f'{args.root_dir}/model/{args.sub_dir}'
    force_write = args.force_write
    # Move Dirs: if the run dirs already exist, move them into *_discard.
    # force_write > 0 auto-confirms, < 0 aborts immediately, == 0 prompts.
    if osp.exists(args.log_dir) or osp.exists(
            args.model_dir):  # modify name if exist
        print(
            f"Exsits directory! \n log_dir:'{args.log_dir}' \n model_dir:'{args.model_dir}'\nMove to discard(y or n)?",
            end='')
        if force_write > 0:
            cmd = 'y'
        elif force_write < 0:
            exit()
        else:
            cmd = input()
        if cmd == 'y':
            log_dir_new = args.log_dir.replace('/log/', '/log_discard/')
            model_dir_new = args.model_dir.replace('/model/', '/model_discard/')
            import itertools
            # If same-named discard dirs exist already, append an increasing
            # numeric suffix until both target paths are free.
            if osp.exists(log_dir_new) or osp.exists(model_dir_new):
                for i in itertools.count():
                    suffix = f' {split} {i}'
                    log_dir_new = f'{args.root_dir}/log_discard/{args.sub_dir}{suffix}'
                    model_dir_new = f'{args.root_dir}/model_discard/{args.sub_dir}{suffix}'
                    if not osp.exists(log_dir_new) and not osp.exists(
                            model_dir_new):
                        break
            print(
                f"Move log_dir '{args.log_dir}' \n to '{log_dir_new}'. \n"
                f"Move model_dir '{args.model_dir}' \n to '{model_dir_new}'"
                f"\nConfirm move(y or n)?",
                end='')
            if force_write > 0:
                cmd = 'y'
            elif force_write < 0:
                exit()
            else:
                cmd = input()
            if cmd == 'y':
                import shutil
                if osp.exists(args.log_dir):
                    shutil.move(args.log_dir, log_dir_new)
                if osp.exists(args.model_dir):
                    shutil.move(args.model_dir, model_dir_new)
            else:
                print("Please Rename 'name_tmp'")
                exit()
        else:
            print("Please Rename 'name_tmp'")
            exit()
    os.mkdir(args.log_dir)
    os.mkdir(args.model_dir)
    # exit()
    # Sub-directories for per-run diagnostic dumps.
    os.mkdir(osp.join(args.model_dir, 'cliprange_max'))
    os.mkdir(osp.join(args.model_dir, 'cliprange_min'))
    os.mkdir(osp.join(args.model_dir, 'actions'))
    os.mkdir(osp.join(args.model_dir, 'mu0_logsigma0'))
    os.mkdir(osp.join(args.model_dir, 'kls, ratios'))
    os.mkdir(osp.join(args.model_dir, 'advs'))
    # Persist the full configuration next to the logs for reproducibility.
    args_str = vars(args)
    with open(f'{args.log_dir}/args.json', 'w') as f:
        json.dump(args_str, f, indent=4, separators=(',', ':'))
    logger.configure(args.log_dir)
    model, env = train(env_id=args.env,
                       clipped_type=args.clipped_type,
                       num_timesteps=args.num_timesteps,
                       seed=args.seed,
                       args=args)
    # model, env = train(args.env, num_timesteps=10, seed=args.seed)
    if args.play:
        # Roll the trained policy forward indefinitely, rendering each step.
        logger.log("Running trained model")
        obs = np.zeros((env.num_envs, ) + env.observation_space.shape)
        obs[:] = env.reset()
        while True:
            actions = model.step(obs)[0]
            obs[:] = env.step(actions)[0]
            env.render()
ratio_mins.append(ratio) ratio_pre = ratio else: ratio_pre = ratio = self.opt_entity2( pa, delta, 'min', ratio_pre if initialwithpresol else None) ratio_mins.append(ratio) ratio_maxs = np.array(ratio_maxs) ratio_mins = np.array(ratio_mins) return DotMap(max=ratio_maxs, min=ratio_mins) import tools_process path_root_tabular = f'{path_root}/tabular' tools.mkdir(path_root_tabular) path_root_tabluar_locker = f'{path_root_tabular}/locker' tools.mkdir(path_root_tabluar_locker) class KL2Clip_tabular(object): def __init__(self): self.deltas_dict = {} self._upperbound = 0.99 self._lowerbound = 0.01 def get_tabular(self, delta): save_path = f'{path_root_tabular}/{delta:.16f}_atari' if delta in self.deltas_dict: pass # TODO: file lock
def main():
    """Entry point (Atari): parse args (plus a --policy choice), derive a run
    sub-directory name, move aside any pre-existing log/model dirs, dump the
    config to JSON, and train."""
    parser = atari_arg_parser()
    parser.add_argument('--policy',
                        help='Policy architecture',
                        choices=['cnn', 'lstm', 'lnlstm', 'mlp'],
                        default='cnn')
    args = parser.parse_args()
    if args.clipped_type == 'kl2clip':
        name_tmp = ''
        if args.cliprange and 'NoFrameskip-v4' not in args.env:
            args.kl2clip_clipcontroltype = 'base-clip'
        else:
            args.kl2clip_clipcontroltype = 'none-clip'
    else:
        name_tmp = ''
        assert args.cliprange, "PPO has to receive a cliprange parameter, the default one is 0.2"
    # --- Generate sub_dir of log dir and model dir
    split = ','
    if args.sub_dir is None:
        # Build the sub-directory name from every arg except these keys.
        keys_except = [
            'env', 'play', 'root_dir', 'sub_dir', 'force_write', 'lr',
            'kl2clip_clipcontroltype'
        ]  # TODO: tmp for kl2clip_sharelogsigma
        # Keys rendered with an explicit format spec instead of str().
        keys_fmt = {'num_timesteps': '.0e'}
        args_dict = vars(args)
        sub_dir = args.env
        if not args.clipped_type in ['kl2clip']:
            keys_except += ['delta_kl']
        if not args.clipped_type in ['origin', 'kl2clip', 'a2c']:
            keys_except += ['cliprange']
        # --- add keys common
        for key in args_dict.keys():
            if key not in keys_except and key not in keys_fmt.keys():
                sub_dir += f'{split} {key}={args_dict[key]}'
        # --- add keys which has specific format
        for key in keys_fmt.keys():
            sub_dir += f'{split} {key}={args_dict[key]:{keys_fmt[key]}}'
        sub_dir += ('' if name_tmp == '' else f'{split} {name_tmp}')
        args.sub_dir = sub_dir
    tools.mkdir(f'{args.root_dir}/log')
    tools.mkdir(f'{args.root_dir}/model')
    args.log_dir = f'{args.root_dir}/log/{args.sub_dir}'
    args.model_dir = f'{args.root_dir}/model/{args.sub_dir}'
    force_write = args.force_write
    # Move Dirs: if the run dirs already exist, move them into *_discard.
    # force_write > 0 auto-confirms, < 0 aborts immediately, == 0 prompts.
    if osp.exists(args.log_dir) or osp.exists(
            args.model_dir):  # modify name if exist
        print(
            f"Exsits directory! \n log_dir:'{args.log_dir}' \n model_dir:'{args.model_dir}'\nMove to discard(y or n)?",
            end='')
        if force_write > 0:
            cmd = 'y'
        elif force_write < 0:
            exit()
        else:
            cmd = input()
        if cmd == 'y':
            log_dir_new = args.log_dir.replace('/log/', '/log_discard/')
            model_dir_new = args.model_dir.replace('/model/', '/model_discard/')
            import itertools
            # If same-named discard dirs exist already, append an increasing
            # numeric suffix until both target paths are free.
            if osp.exists(log_dir_new) or osp.exists(model_dir_new):
                for i in itertools.count():
                    suffix = f' {split} {i}'
                    log_dir_new = f'{args.root_dir}/log_discard/{args.sub_dir}{suffix}'
                    model_dir_new = f'{args.root_dir}/model_discard/{args.sub_dir}{suffix}'
                    if not osp.exists(log_dir_new) and not osp.exists(
                            model_dir_new):
                        break
            print(
                f"Move log_dir '{args.log_dir}' \n to '{log_dir_new}'. \n"
                f"Move model_dir '{args.model_dir}' \n to '{model_dir_new}'"
                f"\nConfirm move(y or n)?",
                end='')
            if force_write > 0:
                cmd = 'y'
            elif force_write < 0:
                exit()
            else:
                cmd = input()
            if cmd == 'y':
                import shutil
                if osp.exists(args.log_dir):
                    shutil.move(args.log_dir, log_dir_new)
                if osp.exists(args.model_dir):
                    shutil.move(args.model_dir, model_dir_new)
            else:
                print("Please Rename 'name_tmp'")
                exit()
        else:
            print("Please Rename 'name_tmp'")
            exit()
    os.mkdir(args.log_dir)
    os.mkdir(args.model_dir)
    # exit()
    # Sub-directories for per-run diagnostic dumps.
    os.mkdir(osp.join(args.model_dir, 'cliprange_max'))
    os.mkdir(osp.join(args.model_dir, 'cliprange_min'))
    os.mkdir(osp.join(args.model_dir, 'actions'))
    # os.mkdir(osp.join(args.model_dir, 'mu0_logsigma0'))
    os.mkdir(osp.join(args.model_dir, 'kls, ratios'))
    os.mkdir(osp.join(args.model_dir, 'advs'))
    # Persist the full configuration next to the logs for reproducibility.
    args_str = vars(args)
    with open(f'{args.log_dir}/args.json', 'w') as f:
        json.dump(args_str, f, indent=4, separators=(',', ':'))
    logger.configure(args.log_dir)
    train(clipped_type=args.clipped_type,
          num_timesteps=args.num_timesteps,
          seed=args.seed,
          args=args,
          policy=args.policy)
def prepare_data(dim, delta, sharelogsigma, clipcontroltype, cliprange, clip_clipratio, search_delta=False):
    """Run (or load cached) KL2Clip optimisation over a grid of actions and
    plot the resulting clipping ratios.

    For each logsigma0 the optimiser is run on ``batch_size`` evenly spaced
    actions with mu0 = 0; results are cached as pickles under
    ``{path_root}/KL2Clip/data/train_lambda`` and scatter-plotted (good/bad
    points split by constraint satisfaction).

    Args:
        dim: action dimensionality; only dim == 1 is implemented.
        delta: KL constraint bound for the optimiser.
        sharelogsigma, clipcontroltype, cliprange, clip_clipratio: forwarded
            to the ``KL2Clip`` optimiser.
        search_delta: when True, only collect optimiser results (no plots)
            and stop after the first logsigma0.
    """
    path_data = path_root + '/KL2Clip/data/train_lambda'
    Name = f'dim={dim}, delta={delta}, train'
    path_data_processed = path_data + f'/{Name}'
    tools.mkdir(path_data_processed)
    if dim == 1:
        logsigma0s = np.array([0])
    else:
        raise NotImplementedError
    logsigma0s = logsigma0s.reshape((-1, dim))
    batch_size = 2048
    mu = np.zeros((dim, ))
    opt = KL2Clip(dim=dim,
                  batch_size=batch_size,
                  sharelogsigma=sharelogsigma,
                  clipcontroltype=clipcontroltype,
                  cliprange=cliprange)

    def get_fn_sample():
        # Build two TF functions over the diagonal Gaussian N(mu0, exp(logsigma0)):
        # one drawing samples, one evaluating the density at given actions.
        mu0 = tf.placeholder(shape=[dim], dtype=tf.float32)
        a = tf.placeholder(shape=[batch_size, dim], dtype=tf.float32)
        logsigma0 = tf.placeholder(shape=[dim], dtype=tf.float32)
        sample_size = tf.placeholder(shape=(), dtype=tf.int32)
        dist = DiagGaussianPd(tf.concat((mu0, logsigma0), axis=0))
        samples = dist.sample(sample_size)
        fn_sample = U.function([mu0, logsigma0, sample_size], samples)
        fn_p = U.function([mu0, logsigma0, a], dist.p(a))
        return fn_sample, fn_p

    # Keep a reference so the default session is not garbage-collected.
    sess = U.make_session(make_default=True)
    results = []  # NOTE(review): collected but never returned — confirm intent
    fn_sample, fn_p = get_fn_sample()
    for logsigma0 in logsigma0s:
        prefix_save = f'{path_data_processed}/logsigma0={logsigma0}'
        Name_f = f"{Name},logsigma0={logsigma0}"
        file_fig = f'{prefix_save}.png'
        # Evaluate on an evenly spaced action grid (not random samples).
        a_s_batch = np.linspace(-5, 5, batch_size).reshape((-1, 1))
        logsigma0s_batch = np.tile(logsigma0, (batch_size, 1))
        print(a_s_batch.max(), a_s_batch.min())
        if not os.path.exists(f'{prefix_save}.pkl'):
            # Run the optimiser with mu0 = 0 and cache the result.
            ress_tf = opt(mu0_logsigma0_tuple=(np.zeros_like(logsigma0s_batch), logsigma0s_batch),
                          a=a_s_batch,
                          delta=delta,
                          clip_clipratio=clip_clipratio)
            print(a_s_batch[0], ress_tf.x.max[0], ress_tf.x.min[0])
            save_vars(f'{prefix_save}.pkl', a_s_batch, logsigma0, logsigma0s_batch, ress_tf)
            print(prefix_save)
        a_s_batch, logsigma0, logsigma0s_batch, ress_tf = load_vars(
            f'{prefix_save}.pkl')
        if search_delta:
            results.append(ress_tf)
            break
        if cliprange == clipranges[0]:  # TODO tmp
            fig = plt.figure(figsize=(20, 10))
        markers = ['^', '.']
        colors = [['blue', 'red'], ['green', 'hotpink']]
        for ind, opt_name in enumerate(['max', 'min']):
            # --- plot tensorflow result
            ratios, cons = ress_tf.ratio[opt_name], ress_tf.con[opt_name]
            print(
                f'clip-{opt_name}_mean:{ratios.mean()}, clip-{opt_name}_min:{ratios.min()}, clip-{opt_name}_max:{ratios.max()}'
            )
            if search_delta:
                continue
            if DEBUG:
                pass
            # Split points by whether the KL constraint is satisfied.
            inds_good = cons <= get_ConstraintThreshold(ress_tf.delta)
            inds_bad = np.logical_not(inds_good)
            if dim == 1:
                if ind == 0 and 1:
                    # Experimental alternative ratio curve derived from the
                    # density; printed/inspected only, not plotted.
                    ps = fn_p(mu, logsigma0, a_s_batch)
                    ratio_new = -np.log(ps)
                    ratio_new = ratio_new - ratio_new.min() + ratios.min()
                    alpha = np.exp(-ps * 2)
                    print(alpha)
                    ratio_new = ratio_new.min() + alpha * (ratio_new - ratio_new.min())

                def plot_new(alpha):
                    # Plot the alpha-controlled clipping bounds for comparison.
                    clip_max_new, clip_min_new = get_clip_new(
                        alpha, ress_tf.ratio['max'], ress_tf.ratio['min'],
                        clipcontroltype=clipcontroltype)
                    plt.scatter(a_s_batch, clip_max_new, s=5, label=f'clip_max_{alpha}')
                    plt.scatter(a_s_batch, clip_min_new, s=5, label=f'clip_min_{alpha}')

                if ind == 0:
                    pass
                plt.scatter(a_s_batch[inds_good],
                            ratios[inds_good],
                            label='ratio_predict-good_' + opt_name,
                            s=5,
                            color=colors[ind][0],
                            marker=markers[ind])
                plt.scatter(a_s_batch[inds_bad],
                            ratios[inds_bad],
                            label='ratio_predict-bad_' + opt_name,
                            s=5,
                            color=colors[ind][1],
                            marker=markers[ind])
            elif dim == 2:
                # Figure.gca(projection='3d') was deprecated in Matplotlib 3.4
                # and removed in 3.6; add_subplot is the supported call.
                ax = fig.add_subplot(projection='3d')
                ax.view_init(90, 90)
                ax.scatter(a_s_batch[inds_good, 0],
                           a_s_batch[inds_good, 1],
                           ratios[inds_good],
                           label='ratio_predict-good_' + opt_name,
                           s=5,
                           color=colors[ind][0],
                           marker=markers[ind])
                ax.scatter(a_s_batch[inds_bad, 0],
                           a_s_batch[inds_bad, 1],
                           ratios[inds_bad],
                           label='ratio_predict-bad_' + opt_name,
                           s=5,
                           color=colors[ind][1],
                           marker=markers[ind])
        if dim <= 2 and not search_delta:
            plt.title(
                Name_f +
                f'\nstep:{ress_tf.step},rate_satisfycon:{ress_tf.rate_satisfycon_}, rate_statisfydifference_:{ress_tf.rate_statisfydifference_}, difference_max_:{ress_tf.difference_max_}'
            )
            plt.legend(loc='best')
            if not DEBUG:
                plt.savefig(file_fig)
    opt.close()
    if dim <= 2 and not search_delta:
        if DEBUG:
            if cliprange == clipranges[-1]:
                plt_tools.set_postion()
                plt.show()
        plt.close()