Example #1
def callback(it):
    # Save a checkpoint every 3 iterations (skipped during replay).
    if it % 3 == 0 and it > 1 and not replay:
        fname = osp.join(CKPTDIR, '%.5i' % it)
        U.save_state(fname)
    # When resuming, restore the requested checkpoint one iteration later.
    if args.continue_iter is not None and int(args.continue_iter) + 1 == it:
        fname = osp.join(CKPTDIR, str(args.continue_iter))
        U.load_state(fname)
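These callbacks are not standalone: they rely on module-level names from the training script (`osp` for `os.path`, the project's TensorFlow utility module as `U`, a parsed `args` namespace, and flags such as `replay` and `CKPTDIR`), and the training loop calls them with the current iteration index, as Example #6 below does with `callback(x)`. A minimal, hypothetical driver under those assumptions (the `run_training` and `do_update` names are placeholders, not part of the original code):

def run_training(num_iters, callback, do_update):
    # Hypothetical driver loop: the checkpoint callback receives the iteration
    # index each step, exactly as Example #6 calls `callback(x)`.
    for it in range(num_iters):
        callback(it)   # save or restore checkpoints as in Examples #1-#5
        do_update(it)  # placeholder for the actual policy update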
Example #2
def callback(it):
    # Only the MPI rank-0 worker writes checkpoints.
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it % 5 == 0 and it > 3 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            U.save_state(fname)
    # On the first iteration, optionally resume from a saved checkpoint.
    if it == 0 and args.continue_iter is not None:
        fname = osp.join("savedir/" + args.savename + "/checkpoints/", str(args.continue_iter))
        U.load_state(fname)
Example #3
def callback(it):
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it % 5 == 0 and it > 3 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            # logger.log('Saving model to %s'%fname)
            U.save_state(fname)
    if it == 0 and args.continue_iter is not None:
        fname = osp.join("" + args.savename + "/checkpoints/",
                         str(args.continue_iter))
        print(fname)
        U.load_state(fname)

        return int(args.continue_iter)
    return it
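Unlike the other callbacks, this one returns an iteration index: the restored `continue_iter` on iteration 0 when resuming, and the unchanged `it` otherwise. The caller can use that return value to fast-forward its loop counter after a restore; a minimal sketch of that pattern (the loop structure here is an assumption, not taken from the original training script):

def run(callback, total_iters):
    # Hypothetical loop that honors the index returned by Example #3's callback.
    it = 0
    while it < total_iters:
        it = callback(it)  # equals int(args.continue_iter) right after a restore
        # ... one training update would go here ...
        it += 1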
Example #4
def callback(it):
    global sess  # only needed by the commented-out write_graph call below
    print("it: %s\n\n" % it)
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it > 0 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            print("save_state %s" % fname)
            U.save_state(fname)
            #tf.train.write_graph(sess.graph_def, osp.join("savedir/", 'checkpoints'), 'train%.5i.pb' % it, as_text=False)

    if it == 0 and args.continue_iter is not None:
        # Resume from a previously saved checkpoint.
        fname = osp.join("savedir/" + args.savename + "/checkpoints/",
                         str(args.continue_iter))
        U.load_state(fname)
Example #5
def callback(it):
    if MPI.COMM_WORLD.Get_rank() == 0:
        if it % 5 == 0 and it > 3 and not replay:
            fname = osp.join("savedir/", 'checkpoints', '%.5i' % it)
            # logger.log('Saving model to %s'%fname)
            U.save_state(fname)
    if it == 0 and args.continue_iter is not None:
        fname = osp.join("" + args.savename + "/checkpoints/",
                         str(args.continue_iter))
        U.load_state(fname)

        # fname = osp.join(""+args.savename+"/checkpoints/", args.continue_iter)
        # subvars = []
        # for i in range(args.num_subs-1):
        #     subvars += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="sub_policy_%i" % (i+1))
        # print([v.name for v in subvars])
        # U.load_state(fname, subvars)
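The commented-out block above hints at a partial restore: collect only the sub-policy variables and pass them to `U.load_state(fname, subvars)`. Whether this project's `U.load_state` accepts a variable list is not shown here, so the sketch below spells the same idea out with a plain `tf.train.Saver`; the `num_subs` argument and the default-session assumption are mine:

import tensorflow as tf

def load_subpolicy_weights(fname, num_subs):
    # Restore only the sub_policy_1 ... sub_policy_{num_subs-1} variables
    # from an existing checkpoint, leaving everything else untouched.
    subvars = []
    for i in range(num_subs - 1):
        subvars += tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                     scope="sub_policy_%i" % (i + 1))
    saver = tf.train.Saver(var_list=subvars)
    saver.restore(tf.get_default_session(), fname)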
Example #6
def start(callback, args, workerseed, rank, comm):
    env = gym.make(args.task)
    env.seed(workerseed)
    np.random.seed(workerseed)
    ob_space = env.observation_space
    ac_space = env.action_space

    num_subs = args.num_subs
    macro_duration = args.macro_duration
    num_rollouts = args.num_rollouts
    warmup_time = args.warmup_time
    train_time = args.train_time
    sub_hidden_sizes = args.sub_hidden_sizes
    sub_policy_costs = args.sub_policy_costs


    save_folder = os.path.join("savedir/", args.savename)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    num_batches = 15

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])

    # features = Features(name="features", ob=ob)
    # policy = Policy(name="policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)
    # old_policy = Policy(name="old_policy", ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2, num_subpolicies=num_subs)

    # sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    # old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[x], num_hid_layers=2) for x in range(num_subs)]
    sub_policy = SubPolicy(name="sub_policy_%i" % 0, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[0], num_hid_layers=2)
    old_sub_policy = SubPolicy(name="old_sub_policy_%i" % 0, ob=ob, ac_space=ac_space, hid_size=sub_hidden_sizes[0], num_hid_layers=2)

    learner = Learner(env, sub_policy, old_sub_policy, comm, clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-5, optim_batchsize=64, args=args)
    rollout = rollouts.traj_segment_generator(sub_policy, env, macro_duration, num_rollouts,
                                              stochastic=True, args=args, sub_policy_costs=sub_policy_costs)
    rollout_eval = rollouts.traj_segment_generator(sub_policy, env, macro_duration, num_rollouts,
                                              stochastic=False, args=args, sub_policy_costs=sub_policy_costs)



    for x in range(1):
        callback(x)
        if x == 0:
            learner.syncSubpolicies()
            print("synced subpols")
        # Run the inner meta-episode.

        # policy.reset()
        # learner.syncMasterPolicies()

        try:
            # Randomize the goal, then broadcast rank 0's choice so every worker agrees.
            env.env.randomizeCorrect()
            shared_goal = comm.bcast(env.env.realgoal, root=0)
            env.env.realgoal = shared_goal
        except:
            # Environments without randomizeCorrect/realgoal skip goal syncing.
            pass

        # print("It is iteration %d so i'm changing the goal to %s" % (x, env.env.realgoal))
        # mini_ep = 0 if x > 0 else -1 * (rank % 10)*int(warmup_time+train_time / 10)
        mini_ep = 0

        totalmeans = []
        while mini_ep < warmup_time+train_time:
            mini_ep += 1
            # rollout
            rolls = rollout.__next__()
            # save images, rewards, macro actions
            if 'rgb_arrays' in rolls:
                current_save_folder = os.path.join(save_folder, 'episode' + str(mini_ep))
                os.makedirs(current_save_folder, exist_ok=True)
                statistic_file = os.path.join(current_save_folder, 'statistic_file.txt')
                rgb_arrays_file = os.path.join(current_save_folder, 'rgb_arrays.pickle')
                with open(statistic_file, 'w') as f:
                    ep_ret = sum(rolls['rews_without_cost'])
                    f.write('%d: %f' % (mini_ep, ep_ret) + '\n')
                    needed_keys = ['macro_ac', 'rews_without_cost']
                    for key in needed_keys:
                        f.write(key + '\n')
                        for v in rolls[key]:
                            f.write(str(v) + ' ')
                        f.write('\n\n')
                rgb_arrays = np.array(rolls['rgb_arrays'])
                rgb_arrays.dump(rgb_arrays_file)

            allrolls = []
            allrolls.append(rolls)
            # train theta
            rollouts.add_advantage_macro(rolls, macro_duration, 0.99, 0.98)
            # train phi
            test_seg = rollouts.prepare_allrolls(allrolls, macro_duration, 0.99, 0.98, num_subpolicies=num_subs)
            learner.updateSubPolicies(test_seg, num_batches, (mini_ep >= warmup_time))
            # print(("Episode %d return: %s" % (mini_ep, rolls['ep_rets_without_cost'][0])))
            if args.s:
                # NOTE: `gmean` is not defined in this snippet; in Examples #7/#8
                # it is returned by learner.updateMasterPolicy(rolls).
                totalmeans.append(gmean)
                with open('outfile' + str(x) + '.pickle', 'wb') as fp:
                    pickle.dump(totalmeans, fp)


            if mini_ep % 50 == 0:
                if args.num_subs != 1:
                    print("macro acts:", rolls['macro_ac'])
            # eval score
            if mini_ep % 50 == 0:
                returns = []
                for i in range(50):
                    rolls = rollout_eval.__next__()
                    returns.append(rolls['ep_rets_without_cost'][0])
                print("Episode %d return: %s" % (mini_ep, statistics.mean(returns)))
            # save session
            if mini_ep % 500 == 0:
                fname = os.path.join("savedir/", args.savename, 'checkpoints', '%.5i'%mini_ep)
                U.save_state(fname)
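The `start` function above expects a per-worker seed, the MPI rank, and the communicator, so it is meant to be launched once per MPI process (e.g. under `mpirun`). A hypothetical entry point under those assumptions; the `args.seed` field and the `10000 * rank` offset are common conventions, not something shown in the examples:

from mpi4py import MPI

def main(args, callback):
    # Hypothetical per-process entry point: derive a distinct seed per worker
    # and hand rank/communicator through to `start` as in Example #6.
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    workerseed = args.seed + 10000 * rank  # assumed seeding convention
    start(callback, args, workerseed, rank, comm)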
Example #7
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)

    env.seed(workerseed)
    env.set_experiment_id(args.id_number)

    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time

    num_batches = int(num_rollouts/64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob", dtype=tf.float32, shape=[None, master_ob.shape[0]])

    master_policy = Policy(name="master", ob=adv_ob, ac_space=0, hid_size=16, num_hid_layers=2, num_subpolicies=2)
    old_master_policy = Policy(name="old_master", ob=adv_ob, ac_space=0, hid_size=16, num_hid_layers=2, num_subpolicies=2)

    sub_policies = [SubPolicy(name="sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2) for x in range(num_subs)]
    old_sub_policies = [SubPolicy(name="old_sub_policy_%i" % x, ob=ob, ac_space=ac_space, hid_size=32, num_hid_layers=2) for x in range(num_subs)]

    learner = Learner(env, master_policy, old_master_policy, sub_policies, old_sub_policies, comm,
                      clip_param=0.2, entcoeff=0, optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64)

    adv_generator = adv_gen(1.0, ob_space, perturb_func=grid_reflect_x,
                            delay=num_rollouts * args.warmup_time, augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=grid_reflect_x)

    override = None

    rollout = rollouts.traj_segment_generator(adv_generator, master_policy, sub_policies, env, num_rollouts, stochastic=True, args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval, master_policy, sub_policies, env_eval, 1, stochastic=False, args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    file = open(fname, 'w')  # assumes ./data/ already exists (Example #8 creates it if missing)
    writer = csv.writer(file)
    if args.load is not None:
        fname = osp.join("./savedir/", args.load, args.load)
        U.load_state(fname)
    #saver = tf.train.Saver()

    #callback(0)


    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0
    totalmeans = []
    while mini_ep < args.warmup_time + train_time:

        mini_ep += 1
        if mini_ep == args.warmup_time or args.warmup_time == 0:
            print("===================")
            print("START TRAINING WITH")
            print("===================")
            args.pretrain = -1
            sub_train = [False, True]
        #if(mini_ep == 200):
         #   adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls, 0.99, 0.98, num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,
        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval) if len(ret_buffer_eval) > 0 else -100
        fields = [mini_ep, ret_mean, ret_eval_mean,
                  rolls['latent_counts'][0], rolls['latent_counts'][1],
                  rolls['real_counts'][0], rolls['real_counts'][1]]
        writer.writerow(fields)

        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(mini_ep,ret_mean, ret_eval_mean))
        print("--------------------------------------------------")
    if args.save is not None:
        fname = osp.join("savedir/", args.save, args.save)
        U.save_state(fname)
Example #8
def start(args, workerseed, rank, comm):
    env = gym.make(args.task)
    env_eval = gym.make(args.task)

    env.seed(workerseed)
    #np.random.seed(workerseed)
    ob_space = env.observation_space
    master_ob = gym.spaces.Box(np.array([-100, -100], dtype=np.float32),
                               np.array([100, 100], dtype=np.float32))
    ac_space = env.action_space

    num_subs = args.num_subs
    num_rollouts = args.num_rollouts
    train_time = args.train_time

    num_batches = int(num_rollouts / 64)
    print(num_batches)

    # observation in.
    ob = U.get_placeholder(name="ob",
                           dtype=tf.float32,
                           shape=[None, ob_space.shape[0]])
    adv_ob = U.get_placeholder(name="adv_ob",
                               dtype=tf.float32,
                               shape=[None, master_ob.shape[0]])
    # ob = U.get_placeholder(name="ob", dtype=tf.float32, shape=[None, 104])
    master_policy = Policy(name="master",
                           ob=adv_ob,
                           ac_space=0,
                           hid_size=8,
                           num_hid_layers=2,
                           num_subpolicies=2)
    old_master_policy = Policy(name="old_master",
                               ob=adv_ob,
                               ac_space=0,
                               hid_size=8,
                               num_hid_layers=2,
                               num_subpolicies=2)
    # features = Features(name="features", ob=ob)
    sub_policies = [
        SubPolicy(name="sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]
    old_sub_policies = [
        SubPolicy(name="old_sub_policy_%i" % x,
                  ob=ob,
                  ac_space=ac_space,
                  hid_size=32,
                  num_hid_layers=2) for x in range(num_subs)
    ]
    #attack_grad = U.function([ob],tf.nn.l2_normalize(tf.gradients(sub_policies[0].vpred, ob)[0]))
    learner = Learner(env,
                      master_policy,
                      old_master_policy,
                      sub_policies,
                      old_sub_policies,
                      comm,
                      clip_param=0.2,
                      entcoeff=0,
                      optim_epochs=10,
                      optim_stepsize=3e-4,
                      optim_batchsize=64)
    #adv_generator = adv_gen(ob_space,attack_grad,delay=args.warmup_time*num_rollouts)
    #adv_generator_eval = adv_gen(ob_space,attack_grad,delay=args.warmup_time*num_rollouts,dummy=True)
    adv_generator = adv_gen(1.0,
                            ob_space,
                            perturb_func=stoch_bias_grid,
                            delay=num_rollouts * args.warmup_time,
                            augmented=args.augment)
    adv_generator_eval = adv_gen(-1.0, ob_space, perturb_func=stoch_perturb)
    override = None
    rollout = rollouts.traj_segment_generator(adv_generator,
                                              master_policy,
                                              sub_policies,
                                              env,
                                              num_rollouts,
                                              stochastic=True,
                                              args=args)
    rollout_eval = rollouts.traj_segment_generator(adv_generator_eval,
                                                   master_policy,
                                                   sub_policies,
                                                   env_eval,
                                                   1024,
                                                   stochastic=False,
                                                   args=args)

    ret_buffer = deque(maxlen=20)
    ret_buffer_eval = deque(maxlen=20)

    fname = './data/' + args.filename + '.csv'
    if not os.path.exists(os.path.dirname(fname)):
        try:
            os.makedirs(os.path.dirname(fname))
        except OSError as exc:  # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise
    file = open(fname, 'w')
    writer = csv.writer(file)
    if args.load is not None:
        fname = os.path.join("./savedir/", args.load, args.load)
        U.load_state(fname)
    #saver = tf.train.Saver()

    #callback(0)

    learner.syncSubpolicies()
    print("synced subpols")

    master_train = True
    sub_train = [True, True]
    goal_t = 0
    mini_ep = 0
    totalmeans = []
    while mini_ep < args.warmup_time + train_time:

        mini_ep += 1
        if mini_ep == args.warmup_time or args.warmup_time == 0:
            print("start training with")
            args.pretrain = -1
            sub_train = [False, True]
        #if(mini_ep == 200):
        #   adv_generator.perturb_func = stoch_bias

        rolls = rollout.__next__()
        allrolls = []
        allrolls.append(rolls)
        # train theta
        rollouts.add_advantage_macro(rolls, 0.99, 0.98)
        if args.pretrain < 0 and master_train:
            gmean, lmean = learner.updateMasterPolicy(rolls)
        # train phi
        test_seg = rollouts.prepare_allrolls(allrolls,
                                             0.99,
                                             0.98,
                                             num_subpolicies=num_subs)
        learner.updateSubPolicies(test_seg, num_batches, sub_train)
        rolls_eval = rollout_eval.__next__()
        # learner.updateSubPolicies(test_seg,
        # log
        ret_buffer.extend(rolls['ep_rets'])
        ret_buffer_eval.extend(rolls_eval['ep_rets'])
        ret_mean = np.mean(ret_buffer)
        ret_eval_mean = np.mean(ret_buffer_eval) if len(ret_buffer_eval) > 0 else -100
        fields = [
            mini_ep, ret_mean, ret_eval_mean, rolls['latent_counts'][0],
            rolls['latent_counts'][1], rolls['real_counts'][0],
            rolls['real_counts'][1]
        ]
        writer.writerow(fields)

        print("rollout: {}, avg ep r: {}, avg eval ep r: {}".format(
            mini_ep, ret_mean, ret_eval_mean))
    if args.save is not None:
        fname = os.path.join("savedir/", args.save, args.save)
        U.save_state(fname)
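Examples #7 and #8 write one headerless CSV row per rollout: the episode index, the smoothed training and evaluation returns, and the latent/real sub-policy counts. A small helper for reading such a log back; the column names are descriptive assumptions matching the `fields` list above:

import csv

COLUMNS = ["mini_ep", "ret_mean", "ret_eval_mean",
           "latent_0", "latent_1", "real_0", "real_1"]  # names assumed from the `fields` list

def read_log(path):
    # Parse the headerless CSV produced by the training loop into dicts.
    with open(path, newline="") as f:
        return [dict(zip(COLUMNS, row)) for row in csv.reader(f)]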