def main(open_plot=True):
    # TODO: Refactor and combine visualize_visitation, visualize_option, visualize_option_trajectory?

    # Plot the visitation statistics

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp,
                                                args,
                                                noptions=args.noptions)

    # TODO: Print a list of states

    samples = low_bfr.buffer

    size = low_bfr.size()

    traj = [samples[i][0] for i in range(size)]

    if args.reverse:
        plot_visitation(traj,
                        mdp,
                        args,
                        filename=args.basedir + '/vis/' + args.task +
                        'option' + str(args.noptions) + 'rev_' +
                        str(args.ffuncnunit) + '_' + str(args.rseed) + '/' +
                        'visitations' + '.pdf')
    else:
        plot_visitation(traj,
                        mdp,
                        args,
                        filename=args.basedir + '/vis/' + args.task +
                        'option' + str(args.noptions) + '_' +
                        str(args.ffuncnunit) + '_' + str(args.rseed) + '/' +
                        'visitations' + '.pdf')
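# The '/vis/<task>option<k>[rev_]<n_units>_<seed>' directory string above is
# rebuilt by hand in every example below. A small helper along these lines
# (the name option_dir is hypothetical, not part of the original code) would
# keep the naming convention in one place:
def option_dir(args, noptions=None, reverse=None, prefix=None):
    """Build the <prefix>/vis/<task>option<k>[rev_]<ffuncnunit>_<rseed> path."""
    k = args.noptions if noptions is None else noptions
    rev = args.reverse if reverse is None else reverse
    base = args.basedir if prefix is None else prefix
    sep = 'rev_' if rev else '_'
    return (base + '/vis/' + args.task + 'option' + str(k) + sep +
            str(args.ffuncnunit) + '_' + str(args.rseed))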
Example 2
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp,
                                                args,
                                                noptions=args.noptions)

        print('sampled')
    # TODO: Print a list of states

    size = low_bfr.size()

    op = OptionWrapper(sess=None,
                       experience_buffer=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       init_all=args.initall,
                       restore=True,
                       name='option' + str(args.noptions) + '_' +
                       str(args.ffuncnunit) + '_' + str(args.rseed))
    op.restore(args.basedir + '/vis/' + args.task + 'option' +
               str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
               str(args.rseed))

    filename = args.basedir + '/vis/' + args.task + 'option' + str(
        args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(
            args.rseed) + '/' + 'fvalues.pdf'

    plot_fvalue(low_bfr, op, filename=filename)
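# The restore-or-sample branch at the top of main() is repeated verbatim in
# several of these examples. A hedged refactoring sketch (the helper name
# load_low_buffer is hypothetical; it assumes the ExperienceBuffer and
# sample_option_trajectories interfaces shown above):
def load_low_buffer(mdp, args, path):
    """Restore the low-level buffer from `path`, or sample fresh trajectories."""
    if args.restoretraj:
        low_bfr = ExperienceBuffer()
        low_bfr.restore(path + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
    return low_bfr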
Example 3
def main():
    args = arguments()

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    bfr, low_bfr = sample_option_trajectories(mdp,
                                              args,
                                              noptions=args.noptions)

    # TODO: Trajectories are generated using noptions-1 options.

    if args.reverse:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                 str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                 str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                     str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                     str(args.rseed) + '/' + 'low_traj')
    else:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                 str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                 str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' +
                     str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                     str(args.rseed) + '/' + 'low_traj')

    print('bfr  size=', bfr.size())
    print('lbfr size=', low_bfr.size())

    if args.task == 'PointMaze-v0':
        s, _, _, _, _ = low_bfr.sample(20)
        for state in s:
            # print('s=', state) # TODO: how do we get the X, Y coordinates of the agent?
            print('x,y=', state.data[0], state.data[1])

    if args.task == 'MontezumaRevenge-ram-v0':
        s, _, _, _, _ = low_bfr.sample(20)

        def getByte(ram, row, col):
            row = int(row, 16) - 8
            col = int(col, 16)
            return ram[row * 16 + col]

        for state in s:
            x = int(getByte(state.data, 'a', 'a'))
            y = int(getByte(state.data, 'a', 'b'))

            x_img = int(210.0 * (float(x) - 1) / float((9 * 16 + 8) - 1))
            y_img = int(160.0 * (float(y) - (8 * 16 + 6)) /
                        float((15 * 16 + 15) - (8 * 16 + 6)))

            print('(ram) x, y =', x, y)
            print('(img) x, y =', x_img, y_img)
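# Worked example of the RAM indexing above: getByte(ram, 'a', 'a') computes
# row = int('a', 16) - 8 = 2 and col = int('a', 16) = 10, so it reads
# ram[2 * 16 + 10] = ram[42], i.e. RAM address 0xAA relative to the 0x80 base
# implied by the -8 row offset (0xAA/0xAB are the bytes this snippet treats as
# the agent's x/y position).
assert (int('a', 16) - 8) * 16 + int('a', 16) == 0xAA - 0x80 == 42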
Example 4
def main(open_plot=True):

    args = arguments()
    
    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)



    if args.online:
        # TODO: Think about how to handle restoring the parameters when batch normalization is used.
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim, obs_bound=mdp.bounds(), num_actions=num_actions, action_dim=action_dim, action_bound=action_bound, low_method=args.lowmethod, f_func=args.ffunction, n_units=args.ffuncnunit, init_all=args.initall, restore=True, name='online-option' +  str(args.noptions) + '_' + str(args.ffuncnunit))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1, filename=args.basedir + '/vis/' + args.task + 'online-option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '/' + 'eigenfunc.pdf')
    else:
        op = OptionWrapper(sess=None, experience_buffer=None, obs_dim=state_dim, obs_bound=mdp.bounds(), num_actions=num_actions, action_dim=action_dim, action_bound=action_bound, low_method=args.lowmethod, f_func=args.ffunction, n_units=args.ffuncnunit, init_all=args.initall, restore=True, name='option' +  str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1, filename=args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'eigenfunc.pdf')
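# plot_eigenfunction is defined elsewhere; as a rough, hypothetical
# illustration of what plotting a learned scalar f over two state dimensions
# typically involves (evaluate on a grid, draw a filled contour), assuming a
# callable f(states) -> values and (low, high) bounds per dimension -- this is
# a sketch, not the original implementation:
import numpy as np
import matplotlib.pyplot as plt

def plot_f_slice(f, xbound, ybound, filename, resolution=50):
    xs = np.linspace(xbound[0], xbound[1], resolution)
    ys = np.linspace(ybound[0], ybound[1], resolution)
    xx, yy = np.meshgrid(xs, ys)
    grid = np.stack([xx.ravel(), yy.ravel()], axis=1)
    zz = np.asarray(f(grid)).reshape(xx.shape)
    plt.contourf(xx, yy, zz)
    plt.colorbar()
    plt.savefig(filename)
    plt.close()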
Example 5
def main(open_plot=True):
    # TODO: Accept a set of options instead of just one
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    oagent = OptionAgent(sess=None,
                         obs_dim=state_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='op')
    oagent.reset()

    for nop in range(1, args.noptions + 1):
        op = OptionWrapper(sess=None,
                           experience_buffer=None,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           init_all=args.initall,
                           restore=True,
                           name='option' + str(nop) + '_' +
                           str(args.ffuncnunit) + '_' + str(args.rseed))

        if args.trajdir == '__default':
            if args.reverse:
                opdir = './vis/' + args.task + 'option' + str(
                    nop) + 'rev_' + str(args.ffuncnunit) + '_' + str(
                        args.rseed)
            else:
                opdir = './vis/' + args.task + 'option' + str(nop) + '_' + str(
                    args.ffuncnunit) + '_' + str(args.rseed)
        else:
            # Only one option can be restored from nonstandard locations
            assert (args.noptions == 1)
            opdir = args.trajdir
        op.restore(opdir)
        print('restored option', opdir)
        # print('upper_th=', op.upper_th)
        oagent.add_option(op)

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None,
                           obs_dim=state_dim,
                           obs_bound=state_bound,
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           num_options=1,
                           high_method=args.highmethod,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           low_update_freq=args.lowupdatefreq,
                           option_batch_size=1,
                           option_buffer_size=2,
                           high_update_freq=10000000,
                           init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    # TODO: We need to count the number of times the agent reaches the goal state,
    #       because the cumulative reward alone makes it hard to tell whether the
    #       agent is performing as intended. Possible solutions (see prior work first):
    #         1. Plot the number of times the agent reached the goal.
    #         2. Give a positive reward when the goal is reached.
    #       (A counting sketch based on the experience buffer follows this example.)
    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)

    options = oagent.options
    for nop in range(1, len(options)):
        if args.trajdir == '__default':
            opdir = './vis/' + args.task + 'option' + str(nop) + '_' + str(
                args.ffuncnunit) + '_' + str(args.rseed)
        else:
            assert (args.noptions == 1)
            opdir = args.trajdir
        # print('upper=', options[nop].upper_th)
        options[nop].save(opdir + '_trained')

    if args.trajdir == '__default':
        bufdir = './vis/' + args.task + 'option' + str(
            args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
    else:
        bufdir = args.trajdir
    oagent.option_buffer.save(bufdir + '_trained' + '/' + 'traj')
    oagent.experience_buffer.save(bufdir + '_trained' + '/' + 'low_traj')
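# A minimal sketch for the goal-count TODO above, assuming the low-level buffer
# stores (s, a, r, s2, t, o) samples as in the trajectory examples. It counts
# transitions that end an episode (or, matching option 2 of the TODO, that
# yield at least some hypothetical goal_reward); it is not part of the
# original code.
def count_goal_reaches(experience_buffer, goal_reward=None):
    reaches = 0
    samples = experience_buffer.buffer
    for i in range(experience_buffer.size()):
        r, t = samples[i][2], samples[i][4]
        if t or (goal_reward is not None and r >= goal_reward):
            reaches += 1
    return reaches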
Example 6
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp,
                                                args,
                                                noptions=args.noptions)

        print('sampled')
    # TODO: Print a list of states

    samples = low_bfr.buffer

    size = low_bfr.size()

    print('size=', size)

    trajectories = []

    cur_o = None
    for i in range(size):
        # TODO: something wrong is happening in the trajectory. Why?
        s, a, r, s2, t, o = samples[i][0], samples[i][1], samples[i][
            2], samples[i][3], samples[i][4], samples[i][5]

        # assert(t is False)

        # print('o=', o, ', t=', t)

        if cur_o == args.noptions:
            if o == args.noptions and not t and i != size - 1:
                traj.append(s)
            else:
                # traj.append(s2)
                if args.tasktype == 'pinball':
                    # TODO: hack to remove the init state.
                    states = [s for s in traj if s.x != 0.2 or s.y != 0.2]
                else:
                    states = traj
                # for j, s in enumerate(states):
                #     if 0.01466 <= s.data[0] <= 0.01467:
                #         states.remove(s)
                #         # break
                #     print(s.data[0])
                trajectories.append((i, states))

                cur_o = 0
                traj = []

                # TODO: what is the best way to print these figures out?
                # break
        else:
            if o == args.noptions:
                traj = [s]
                cur_o = args.noptions

    for traj in trajectories:
        i = traj[0]
        t = traj[1]
        print(i, ' traj length=', len(t))
        if args.reverse:
            plot_trajectory(t,
                            mdp,
                            args,
                            filename=args.basedir + '/vis/' + args.task +
                            'option' + str(args.noptions) + 'rev_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'traj' + str(i) + '.pdf')
        else:
            plot_trajectory(t,
                            mdp,
                            args,
                            filename=args.basedir + '/vis/' + args.task +
                            'option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed) +
                            '/' + 'traj' + str(i) + '.pdf')
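# The option-trajectory segmentation above (and again in the vistraj/visterm
# branch of the last example) can be factored into one helper. A hedged sketch
# that mirrors the loop's logic, assuming the (s, a, r, s2, t, o) sample
# layout; the name split_option_trajectories is hypothetical:
def split_option_trajectories(samples, size, option_id):
    """Yield lists of states visited while `option_id` was executing."""
    traj, running = [], False
    for i in range(size):
        s, t, o = samples[i][0], samples[i][4], samples[i][5]
        if running:
            if o == option_id and not t and i != size - 1:
                traj.append(s)
            else:
                yield traj
                traj, running = [], False
        elif o == option_id:
            traj, running = [s], True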
Example 7
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='diayn' + str(args.noptions))

    for i in range(args.noptions):
        op = DiaynOption(rst, i, args.termprob)
        oagent.add_option(op)

    run_agents_on_mdp([oagent],
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)


if __name__ == "__main__":
    args = arguments()
    if args.exp == 'sample':
        save(args)
    elif args.exp == 'evaloff':
        restore(args)
    else:
        print('set --exp sample or evaloff')
        assert (False)
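# A sketch of a gentler failure mode for the --exp dispatcher above (this is a
# suggestion, not the original code): exit with the usage message instead of
# tripping a bare assertion.
import sys

def dispatch(args):
    if args.exp == 'sample':
        save(args)
    elif args.exp == 'evaloff':
        restore(args)
    else:
        sys.exit('set --exp sample or evaloff')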
Example 8
def main(open_plot=True):
    # TODO: Accept set of options and generate a new option based on them.

    args = arguments()

    np.random.seed(1234)
    # tf.set_random_seed(args.rseed)
    # tf.set_random_seed(5678)
    # tf.set_random_seed(5408)
    tf.set_random_seed(2345)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    # We generate the k-th option based on the previous k-1 options.

    if args.restoretraj:
        bfr = ExperienceBuffer()
        if args.reverse:
            bfr_path = (args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + 'rev_' +
                        str(args.ffuncnunit) + '_' + str(args.rseed) +
                        '/' + 'traj')
        else:
            bfr_path = (args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + '_' + str(args.ffuncnunit) +
                        '_' + str(args.rseed) + '/' + 'traj')
        print('restoring buffer from ' + bfr_path)
        bfr.restore(bfr_path)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        bfr, _ = sample_option_trajectories(mdp,
                                            args,
                                            noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    # TODO: In graph theory, inserting a single edge can change the topology significantly.
    #       However, adding just one transition sample to the NN does not seem to change it much.
    #       Can we tackle this problem without sampling the trajectories again?

    op = OptionWrapper(sess=None,
                       experience_buffer=bfr,
                       option_b_size=min(32, bfr_size),
                       sp_training_steps=args.sptrainingstep,
                       obs_dim=state_dim,
                       obs_bound=state_bound,
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       init_all=args.initall,
                       restore=None,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' +
                       str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.train:
    #     op.train(bfr, batch_size=args.snepisodes * args.snsteps)

    if args.reverse:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(
            args.noptions) + 'rev_' + str(args.ffuncnunit) + "_" + str(
                args.rseed)
    else:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(
            args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)

    op.save(filename)
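Example 9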
            upper_limit = 2**sequence_length
            nonce = 0
            while (not golden_nonce and nonce < upper_limit):
                binary_sequence = self.binary_format(nonce).rjust(
                    sequence_length, '0')
                evaluator = NonceEvaluator(data, binary_sequence, difficulty)
                if (evaluator.valid_nonce()):
                    golden_nonce = binary_sequence
                nonce += 1
            sequence_length += 1
        return (golden_nonce, evaluator.hexdigest)

    def binary_format(self, nonce):
        return bin(nonce)[2:]


if __name__ == "__main__":
    _, difficulty, data = util.arguments(sys.argv)
    print("Data:", data, "| Difficulty:", difficulty)

    start_time = time.time()
    binary_sequence, hexdigest = SimpleFinder().find_by_zero_prepend(
        data, difficulty)
    # binary_sequence, hexdigest = SimpleFinder().find_by_increment(data, difficulty)
    processing_time = time.time() - start_time
    nonce = int(binary_sequence, 2)
    print("Golden Nonce:", nonce, "|",
          binary_sequence + "(" + str(len(binary_sequence)) + ")")
    print("Processing time: {0:.3f} s.".format(processing_time))
    print("Hexdigest", hexdigest)
Example 10
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    oagent = OptionAgent(sess=None,
                         obs_dim=state_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         option_freq=args.ofreq,
                         option_min_steps=args.ominsteps,
                         name=str(args.noptions) + 'op-initall')

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None,
                           obs_dim=state_dim,
                           obs_bound=state_bound,
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           num_options=1,
                           high_method=args.highmethod,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           option_batch_size=1,
                           option_buffer_size=2,
                           init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)

    # TODO: Save the options learned by the agent
    options = oagent.generated_options[1]
    print('options=', options)
    for i, op in enumerate(options):
        if i == 0:
            continue
        op.save('./vis/' + args.task + 'online-option' + str(i) + '_' +
                str(args.rseed))
Example 11
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    # TODO: Train an option using the trajectories sampled by itself.

    op = OptionWrapper(sess=None,
                       experience_buffer=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       restore=True,
                       init_all=args.initall,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' +
                       str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.reverse:
    #     op.restore('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed))
    # else:
    op.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
               str(args.ffuncnunit) + '_' + str(args.rseed))

    op.reversed_dir = args.reverse

    # TODO: Shouldn't we train the policy based on its own sample frequency?
    if args.restoretraj:
        if args.trajdir == '__default':
            args.trajdir = './vis/' + args.task + 'option' + str(
                args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' + str(
                    args.rseed) + '/' + 'low_traj'

        print('restoring buffer from ' + args.trajdir)
        bfr = ExperienceBuffer()
        # if args.reverse:
        #     bfr.restore('./vis/' + args.task + 'option' + str(args.noptions - 1) + 'rev_' + str(args.rseed) + '/' + 'low_traj')
        # else:
        bfr.restore(args.trajdir)

        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        _, bfr = sample_option_trajectories(mdp,
                                            args,
                                            noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    _, _, r, _, _ = bfr.sample(32)
    print('rewards=', r)

    for _ in range(args.sptrainingstep):
        op.train(bfr, batch_size=min(128, bfr_size))

    if args.reverse:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
    else:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
Example 12
def main(open_plot=True):

    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(
        args)

    #################################
    # 1. Retrieve trajectories
    #################################
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    if args.exp == "generate" or args.exp == "train":
        pathnop = str(args.noptions - 1)
    else:
        pathnop = str(args.noptions)

    # if args.reverse:
    #     dirop = 'rev_'
    # else:
    #     dirop = '_'
    dirop = '_'

    # pathdir: directory for the trajectories
    # opdir  : directory for the option
    pathdir = prefix + '/vis/' + args.task + 'option' + pathnop + dirop + str(
        args.ffuncnunit) + '_' + str(args.rseed)

    opdir = prefix + '/vis/' + args.task + 'option' + str(
        args.noptions) + dirop + str(args.ffuncnunit) + '_' + str(args.rseed)

    if args.saveimage:
        lowbfr_path = pathdir + '/low_traj_img'
        bfr_path = pathdir + '/traj_img'
    elif args.savecmp:
        lowbfr_path = pathdir + '/low_traj_sa'
        bfr_path = pathdir + '/traj_sa'
    else:
        lowbfr_path = pathdir + '/low_traj'
        bfr_path = pathdir + '/traj'

    bfrexp = ["vistraj", "visterm", "visvis", "visfval"]
    bfrexp_ = bfrexp + ["train"]
    if args.exp == "generate":
        print('restoring', bfr_path)
        bfr = ExperienceBuffer()
        if args.savecmp:
            bfr.restore_sa(bfr_path)
        else:
            bfr.restore(bfr_path)
    elif args.exp in bfrexp_:
        if args.exp in bfrexp and args.reverse:
            lowbfr_path = lowbfr_path + 'rev'
        print('restoring', lowbfr_path)
        low_bfr = ExperienceBuffer()
        if args.savecmp:
            low_bfr.restore_sao(lowbfr_path)
        else:
            low_bfr.restore(lowbfr_path)

        mix_traj = False
        if mix_traj:
            low_bfr2 = ExperienceBuffer()
            opdir2 = prefix + '/vis/' + args.task + 'option0' + dirop + str(
                args.ffuncnunit) + '_' + str(args.rseed)
        #     # TODO: savecmp not supported
        #     low_bfr2.restore(opdir2 + '/low_traj')
    else:
        print('No buffer retrieved')

    #################################
    # 2. Retrieve options
    #################################
    # Experiments which require 1 option to retrieve
    oneopexp = ["visop", "visfval", "train"]
    # Multiple options to retrieve (these are restored inside util.py, so they are not handled here)
    # multiopexp = ["sample"]

    if args.exp in oneopexp:
        op = CoveringOption(sess=None,
                            experience_buffer=None,
                            obs_dim=state_dim,
                            obs_bound=mdp.bounds(),
                            num_actions=num_actions,
                            action_dim=action_dim,
                            action_bound=action_bound,
                            low_method=args.lowmethod,
                            f_func=args.ffunction,
                            n_units=args.ffuncnunit,
                            init_all=args.initall,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist,
                            term_dist=args.term_dist,
                            restore=True,
                            name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))

        op.restore(opdir)
    else:
        print('No option retrieved')

    #################################
    # 3. Run experiments
    #################################

    if args.exp == 'sample':
        print('sample')
        bfr, low_bfr = sample_option_trajectories(mdp,
                                                  args,
                                                  noptions=args.noptions)
    elif args.exp == 'generate':
        print('generate_option')
        print('buffersize = ', bfr.size())
        # TODO: option_b_size is the batch size for training f-function.
        op = CoveringOption(sess=None,
                            experience_buffer=bfr,
                            option_b_size=32,
                            sp_training_steps=args.sptrainingstep,
                            obs_dim=state_dim,
                            obs_bound=state_bound,
                            num_actions=num_actions,
                            action_dim=action_dim,
                            action_bound=action_bound,
                            low_method=args.lowmethod,
                            f_func=args.ffunction,
                            n_units=args.ffuncnunit,
                            init_all=args.initall,
                            reversed_dir=args.reverse,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist,
                            term_dist=args.term_dist,
                            restore=None,
                            name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))

    elif args.exp == 'train':
        print('train_option')
        op.reversed_dir = args.reverse
        _, _, r, _, _ = low_bfr.sample(32)
        print('background rewards=', r)
        for _ in range(args.sptrainingstep):
            op.train(low_bfr, batch_size=min(args.batchsize, low_bfr.size()))
    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print(args.exp)
        agent_name = str(args.noptions) + 'options'
        if args.exp == 'evalon':
            agent_name = agent_name + '-online'

        if args.random_agent:
            oagent = GenerateRandomAgent(num_actions, action_dim, action_bound)
        else:
            oagent = OptionAgent(sess=None,
                                 obs_dim=state_dim,
                                 obs_bound=state_bound,
                                 num_actions=num_actions,
                                 action_dim=action_dim,
                                 action_bound=action_bound,
                                 num_options=1 + args.noptions,
                                 high_method=args.highmethod,
                                 low_method=args.lowmethod,
                                 f_func=args.ffunction,
                                 batch_size=args.batchsize,
                                 buffer_size=args.buffersize,
                                 low_update_freq=args.lowupdatefreq,
                                 option_batch_size=args.obatchsize,
                                 option_buffer_size=args.obuffersize,
                                 high_update_freq=args.highupdatefreq,
                                 init_all=args.initall,
                                 init_around_goal=args.init_around_goal,
                                 init_dist=args.init_dist,
                                 term_dist=args.term_dist,
                                 name=agent_name)
            oagent.reset()

        if args.exp == 'evaloff':
            for nop in range(1, args.noptions + 1):
                op = CoveringOption(sess=None,
                                    experience_buffer=None,
                                    obs_dim=state_dim,
                                    obs_bound=mdp.bounds(),
                                    num_actions=num_actions,
                                    action_dim=action_dim,
                                    action_bound=action_bound,
                                    low_method=args.lowmethod,
                                    f_func=args.ffunction,
                                    init_all=args.initall,
                                    init_around_goal=args.init_around_goal,
                                    init_dist=args.init_dist,
                                    term_dist=args.term_dist,
                                    restore=True,
                                    name='option' + str(nop) + '_' +
                                    str(args.ffuncnunit) + '_' +
                                    str(args.rseed))

                if args.reverse:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(
                        nop) + 'rev_' + str(args.ffuncnunit) + '_' + str(
                            args.rseed)
                else:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(
                        nop) + '_' + str(args.ffuncnunit) + '_' + str(
                            args.rseed)

                op.restore(opdir)
                print('restored option', opdir)
                oagent.add_option(op)
        else:
            print('evalon')
        mdp.reset()
        run_agents_on_mdp([oagent],
                          mdp,
                          episodes=args.nepisodes,
                          steps=args.nsteps,
                          instances=args.ninstances,
                          cumulative_plot=True,
                          verbose=args.verbose)
    else:
        print('No experiments run')

    #################################
    # 4. Plot figures
    #################################
    if args.exp == 'visop':
        plot_op(op, args, mdp, state_bound, opdir + '/eigenfunc.pdf')
    elif args.exp == 'vistraj' or args.exp == 'visterm':
        print(args.exp)
        samples = low_bfr.buffer
        size = low_bfr.size()
        trajectories = []
        cur_o = None
        for i in range(size):
            s, t, o = samples[i][0], samples[i][4], samples[i][5]
            if cur_o == args.noptions:
                if o == args.noptions and not t and i != size - 1:
                    traj.append(s)
                else:
                    # traj.append(s2)
                    # if args.tasktype == 'pinball':
                    #     t = [s for s in traj if s.x != 0.2 or s.y != 0.2] # TODO: hack to remove the init state.
                    # else:
                    #     t = traj
                    if len(traj) > 10:
                        trajectories.append((i, traj))

                    cur_o = 0
                    traj = []
            else:
                if o == args.noptions:
                    traj = [s]
                    cur_o = args.noptions

        if len(trajectories) == 0:
            print('no trajectories sampled')

        if args.exp == 'visterm':
            terms = [states[-1] for _, states in trajectories]
            terms = terms[0:min(len(terms), 100)]
            # print('terms=', type(terms))
            print('#terms=', len(terms))
            if args.reverse:
                plot_terms(terms,
                           mdp,
                           args,
                           filename=pathdir + '/' + 'terms' + 'rev')
            else:
                plot_terms(terms, mdp, args, filename=pathdir + '/' + 'terms')
        else:
            t = trajectories[1][1]
            plot_traj(t, mdp, args, filename=pathdir + '/' + 'traj' + str(1))

    elif args.exp == 'visvis':
        print('visvis')
        samples = low_bfr.buffer
        traj = [samples[i][0] for i in range(low_bfr.size())]
        if mix_traj:

            samples2 = low_bfr2.buffer
            traj2 = [
                samples2[i][0]
                for i in range(int(min(low_bfr2.size() / 2,
                                       len(traj) / 2)))
            ]

            traj = traj[:int(len(traj) / 2)] + traj2
        plot_vis(traj, args, mdp, pathdir + '/visitation')
    elif args.exp == 'visfval':
        print('visfval')
    else:
        print('No plots')

    #################################
    # 5. Save the results
    #################################
    if args.exp == 'sample':
        print('save sample')
        if args.reverse:
            dirop = "rev"
        else:
            dirop = ""

        if args.saveimage:
            bfr.save(pathdir + '/traj_img' + dirop)
            low_bfr.save(pathdir + '/low_traj_img' + dirop)
        elif args.savecmp:
            bfr.save_sa(pathdir + '/traj_sa' + dirop)
            low_bfr.save_sao(pathdir + '/low_traj_sa' + dirop)
        else:
            bfr.save(pathdir + '/traj' + dirop)
            low_bfr.save(pathdir + '/low_traj' + dirop)

    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('save', args.exp)
        options = oagent.options
        for nop in range(1, len(options)):
            opdir = prefix + '/vis/' + args.task + 'option' + str(
                nop) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
            if args.exp == 'evalon':
                opdir = opdir + '_online'

            options[nop].save(opdir + '_trained')
        oagent.option_buffer.save(pathdir + '_trained' + '/' + 'traj')
        oagent.experience_buffer.save(pathdir + '_trained' + '/' + 'low_traj')
    elif args.exp == 'generate':
        print('save generate')
        op.save(opdir)
    elif args.exp == 'train':
        print('save train')
        if args.reverse:
            op.save(opdir, rev=True)
        else:
            op.save(opdir)
    else:
        print('No save')
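# For reference, a compact map of the routing above: which restore / run /
# plot / save stages each --exp value exercises. EXP_STAGES is a hypothetical
# summary constant, not part of the original code.
EXP_STAGES = {
    'sample':   ('sample trajectories', 'save traj / low_traj'),
    'generate': ('restore traj buffer', 'fit a new CoveringOption', 'save the option'),
    'train':    ('restore low_traj buffer and option', 'train the option', 'save the option'),
    'evaloff':  ('restore saved options', 'run_agents_on_mdp', 'save trained options and buffers'),
    'evalon':   ('run_agents_on_mdp with online option discovery', 'save trained options and buffers'),
    'visop':    ('restore the option', 'plot eigenfunc.pdf'),
    'vistraj':  ('restore low_traj buffer', 'plot a single option trajectory'),
    'visterm':  ('restore low_traj buffer', 'plot option termination states'),
    'visvis':   ('restore low_traj buffer', 'plot state visitation'),
    'visfval':  ('restore low_traj buffer and option', 'no plot implemented above'),
}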