def callback(it):
    # `replay`, `args`, and CKPTDIR are globals/closure variables of the enclosing script.
    if it % 3 == 0 and it > 1 and not replay:
        # checkpoint every third iteration
        fname = osp.join(CKPTDIR, '%.5i' % it)
        U.save_state(fname)
    if args.continue_iter is not None and int(args.continue_iter) + 1 == it:
        # resume from the requested checkpoint on the iteration right after it
        fname = osp.join(CKPTDIR, str(args.continue_iter))
        U.load_state(fname)
def callback(it):
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it % 5 == 0 and it > 3 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            U.save_state(fname)
        if it == 0 and args.continue_iter is not None:
            fname = osp.join("savedir/" + args.savename + "/checkpoints/", str(args.continue_iter))
            U.load_state(fname)
    pass
def callback(it):
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it % 5 == 0 and it > 3 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            # logger.log('Saving model to %s' % fname)
            U.save_state(fname)
        if it == 0 and args.continue_iter is not None:
            fname = osp.join("" + args.savename + "/checkpoints/", str(args.continue_iter))
            print(fname)
            U.load_state(fname)
            return int(args.continue_iter)
    return it
def callback(it):
    global sess
    print("it: %s\n\n" % it)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if (it > 0) and (not replay):
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            print("save_state %s" % fname)
            U.save_state(fname)
            # tf.train.write_graph(sess.graph_def, osp.join("savedir/", 'checkpoints'),
            #                      'train%.5i.pb' % it, as_text=False)
        if it == 0 and args.continue_iter is not None:
            fname = osp.join("savedir/" + args.savename + "/checkpoints/", str(args.continue_iter))
            U.load_state(fname)
    pass
def callback(it):
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it % 5 == 0 and it > 3 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            # logger.log('Saving model to %s' % fname)
            U.save_state(fname)
        if it == 0 and args.continue_iter is not None:
            fname = osp.join("" + args.savename + "/checkpoints/", str(args.continue_iter))
            U.load_state(fname)
            # fname = osp.join("" + args.savename + "/checkpoints/", args.continue_iter)
            # subvars = []
            # for i in range(args.num_subs - 1):
            #     subvars += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="sub_policy_%i" % (i + 1))
            # print([v.name for v in subvars])
            # U.load_state(fname, subvars)
    pass
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    sub_hidden_sizes = args.sub_hidden_sizes
    sub_policy_costs = args.sub_policy_costs

    save_folder = os.path.join("savedir/", args.savename)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])
    # features = Features(name="features", ob=ob)
    # policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    # old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    # sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    # old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]

    # a single sub-policy plus its "old" copy used by the clipped policy-gradient update
    sub_policy = SubPolicy(name="sub_policy_%i" % 0, ob=ob, ac_space=ac_space,
                           hid_size=sub_hidden_sizes[0], num_hid_layers=2)
    old_sub_policy = SubPolicy(name="old_sub_policy_%i" % 0, ob=ob, ac_space=ac_space,
                               hid_size=sub_hidden_sizes[0], num_hid_layers=2)

    learner = Learner(env, sub_policy, old_sub_policy, comm, clip_param=0.2, entcoeff=0,
                      optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=64, args=args)
    rollout = rollouts.traj_segment_generator(sub_policy, env, macro_duration, num_rollouts,
                                              stochastic=True, args=args, sub_policy_costs=sub_policy_costs)
    rollout_eval = rollouts.traj_segment_generator(sub_policy, env, macro_duration, num_rollouts,
                                                   stochastic=False, args=args, sub_policy_costs=sub_policy_costs)

    for x in range(1):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")

        # Run the inner meta-episode.
        # policy.reset()
        # learner.syncMasterPolicies()
        try:
            # broadcast the (possibly randomized) goal from rank 0 to all workers
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
        except:
            # not every task exposes randomizeCorrect / realgoal
            pass
        # print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))

        # mini_ep = 0 if x > 0 else -1 * (rank % 10) * int(warmup_time + train_time / 10)
        mini_ep = 0
        totalmeans = []
        while mini_ep < warmup_time + train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()

            # save images, rewards, macro actions
            if 'rgb_arrays' in rolls:
                current_save_folder = os.path.join(save_folder, 'episode' + str(mini_ep))
                os.makedirs(current_save_folder, exist_ok=True)
                statistic_file = os.path.join(current_save_folder, 'statistic_file.txt')
                rgb_arrays_file = os.path.join(current_save_folder, 'rgb_arrays.pickle')
                with open(statistic_file, 'w') as f:
                    ep_ret = sum(rolls['rews_without_cost'])
                    f.write('%d: %f' % (mini_ep, ep_ret) + '\n')
                    needed_keys = ['macro_ac', 'rews_without_cost']
                    for key in needed_keys:
                        f.write(key + '\n')
                        for v in rolls[key]:
                            f.write(str(v) + ' ')
                        f.write('\n\n')
                rgb_arrays = np.array(rolls['rgb_arrays'])
                rgb_arrays.dump(rgb_arrays_file)

            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98, num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # print("Episode %d return: %s" % (mini_ep, rolls['ep_rets_without_cost'][0]))

            if args.s:
                # NOTE: gmean is not computed in this single-sub-policy variant;
                # the master-policy variants below obtain it from learner.updateMasterPolicy().
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)

            if mini_ep % 50 == 0:
                if args.num_subs != 1:
                    print("macro acts:", rolls['macro_ac'])

            # eval score
            if mini_ep % 50 == 0:
                returns = []
                for i in range(50):
                    rolls = rollout_eval.__next__()
                    returns.append(rolls['ep_rets_without_cost'][0])
                print("Episode %d return: %s" % (mini_ep, statistics.mean(returns)))

            # save session
            if mini_ep % 500 == 0:
                fname = os.path.join("savedir/", args.savename, 'checkpoints', '%.5i' % mini_ep)
                U.save_state(fname)
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)
    env.seed(workerseed)
    env.set_experiment_id(args.id_number)
    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time
    num_batches = int(num_rollouts / 64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob", dtype=tf.float32, shape=[None, master_ob.shape[0]])

    master_policy = Policy(name="master", ob=adv_ob, ac_space=0, hid_size=16, num_hid_layers=2, num_subpolicies=2)
    old_master_policy = Policy(name="old_master", ob=adv_ob, ac_space=0, hid_size=16, num_hid_layers=2, num_subpolicies=2)
    sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2)
                    for x in range(num_subs)]
    old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2)
                        for x in range(num_subs)]

    learner = Learner(env, master_policy, old_master_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)

    adv_generator = adv_gen(1.0, ob_space, perturb_func=grid_reflect_x,
                            delay=num_rollouts * args.warmup_time, augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=grid_reflect_x)

    override = None
    rollout = rollouts.traj_segment_generator(adv_generator, master_policy, sub_policies, env,
                                              num_rollouts, stochastic=True, args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval, master_policy, sub_policies, env_eval,
                                                   1, stochastic=False, args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    file = open(fname, 'w')
    writer = csv.writer(file)

    if args.load is not None:
        fname = osp.join("./savedir/", args.load, args.load)
        U.load_state(fname)
    # saver = tf.train.Saver()
    # callback(0)

    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0
    totalmeans = []

    while mini_ep < args.warmup_time + train_time:
        mini_ep += 1
        if (mini_ep == args.warmup_time or args.warmup_time == 0):
            print("===================")
            print("START TRAINING WITH")
            print("===================")
            args.pretrain = -1
            sub_train = [False, True]
        # if (mini_ep == 200):
        #     adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls, 0.99, 0.98, num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,

        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval)
        if len(ret_buffer_eval) == 0:
            ret_eval_mean = -100
        fields = [mini_ep, ret_mean, ret_eval_mean,
                  rolls['latent_counts'][0], rolls['latent_counts'][1],
                  rolls['real_counts'][0], rolls['real_counts'][1]]
        writer.writerow(fields)
        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(mini_ep, ret_mean, ret_eval_mean))
        print("--------------------------------------------------")

        if args.save is not None:
            fname = osp.join("savedir/", args.save, args.save)
            U.save_state(fname)
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)
    env.seed(workerseed)
    # np.random.seed(workerseed)
    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time
    num_batches = int(num_rollouts / 64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob", dtype=tf.float32, shape=[None, master_ob.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    master_policy = Policy(name="master", ob=adv_ob, ac_space=0, hid_size=8, num_hid_layers=2, num_subpolicies=2)
    old_master_policy = Policy(name="old_master", ob=adv_ob, ac_space=0, hid_size=8, num_hid_layers=2, num_subpolicies=2)
    # features = Features(name="features", ob=ob)
    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2)
        for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2)
        for x in range(num_subs)
    ]
    # attack_grad = U.function([ob], tf.nn.l2_normalize(tf.gradients(sub_policies[0].vpred, ob)[0]))

    learner = Learner(env, master_policy, old_master_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)

    # adv_generator = adv_gen(ob_space, attack_grad, delay=args.warmup_time * num_rollouts)
    # adv_generator_eval = adv_gen(ob_space, attack_grad, delay=args.warmup_time * num_rollouts, dummy=True)
    adv_generator = adv_gen(1.0, ob_space, perturb_func=stoch_bias_grid,
                            delay=num_rollouts * args.warmup_time, augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=stoch_perturb)

    override = None
    rollout = rollouts.traj_segment_generator(adv_generator, master_policy, sub_policies, env,
                                              num_rollouts, stochastic=True, args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval, master_policy, sub_policies, env_eval,
                                                   1024, stochastic=False, args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    if not os.path.exists(os.path.dirname(fname)):
        try:
            os.makedirs(os.path.dirname(fname))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    file = open(fname, 'w')
    writer = csv.writer(file)

    if args.load is not None:
        fname = os.path.join("./savedir/", args.load, args.load)
        U.load_state(fname)
    # saver = tf.train.Saver()
    # callback(0)

    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0
    totalmeans = []

    while mini_ep < args.warmup_time + train_time:
        mini_ep += 1
        if (mini_ep == args.warmup_time or args.warmup_time == 0):
            print("start training with")
            args.pretrain = -1
            sub_train = [False, True]
        # if (mini_ep == 200):
        #     adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls, 0.99, 0.98, num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,

        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval)
        if len(ret_buffer_eval) == 0:
            ret_eval_mean = -100
        fields = [mini_ep, ret_mean, ret_eval_mean,
                  rolls['latent_counts'][0], rolls['latent_counts'][1],
                  rolls['real_counts'][0], rolls['real_counts'][1]]
        writer.writerow(fields)
        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(mini_ep, ret_mean, ret_eval_mean))

        if args.save is not None:
            fname = os.path.join("savedir/", args.save, args.save)
            U.save_state(fname)
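

# --- Hypothetical usage sketch (not part of the original source) ---
# The start() variants above expect an MPI communicator, a per-worker seed, and an
# argparse-style `args` namespace whose fields match the attributes accessed above
# (task, num_subs, num_rollouts, warmup_time, train_time, filename, augment, load,
# save, pretrain, id_number, ...). The entry point below is only an assumption of
# how they might be driven; the flag defaults and the `--seed`-plus-rank seeding
# convention are illustrative, not taken from the original code.
if __name__ == "__main__":
    import argparse
    from mpi4py import MPI

    parser = argparse.ArgumentParser()
    parser.add_argument("--task", type=str, required=True)       # gym environment id
    parser.add_argument("--num_subs", type=int, default=2)
    parser.add_argument("--num_rollouts", type=int, default=2048)
    parser.add_argument("--warmup_time", type=int, default=20)
    parser.add_argument("--train_time", type=int, default=200)
    parser.add_argument("--seed", type=int, default=0)           # assumed base seed flag
    # ... the remaining flags used above (filename, augment, load, save, pretrain,
    # id_number, savename, continue_iter, ...) would be declared the same way.
    args = parser.parse_args()

    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    workerseed = args.seed + 10000 * rank    # assumed per-rank seeding convention
    start(args, workerseed, rank, comm)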