def main(open_plot=True):
    # TODO: Refactor and combine visualize_visitation, visualize_option,
    # visualize_option_trajectory? (A path-helper sketch follows this function.)
    # Plot the visitation statistics
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)

    # TODO: Print a list of states
    samples = low_bfr.buffer
    size = low_bfr.size()

    traj = [samples[i][0] for i in range(size)]

    if args.reverse:
        plot_visitation(traj, mdp, args,
                        filename=args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'visitations' + '.pdf')
    else:
        plot_visitation(traj, mdp, args,
                        filename=args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'visitations' + '.pdf')
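
# The visualization entry points in this file repeat the same output-directory string
# construction. A hedged refactoring sketch for the TODO above: one helper that builds the
# per-option directory. The helper name `option_dir` is hypothetical; the path layout just
# mirrors the concatenations used throughout these scripts.
def option_dir(args, noptions=None, trained=False):
    """Build '<basedir>/vis/<task>option<k>[rev]_<ffuncnunit>_<rseed>[_trained]'."""
    k = args.noptions if noptions is None else noptions
    sep = 'rev_' if args.reverse else '_'
    d = (args.basedir + '/vis/' + args.task + 'option' + str(k) + sep +
         str(args.ffuncnunit) + '_' + str(args.rseed))
    return d + '_trained' if trained else d

# Example (hypothetical) usage in the function above:
#   plot_visitation(traj, mdp, args, filename=option_dir(args) + '/visitations.pdf')
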
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
        print('sampled')

    # TODO: Print a list of states
    size = low_bfr.size()

    op = OptionWrapper(sess=None,
                       experience_buffer=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       init_all=args.initall,
                       restore=True,
                       name='option' + str(args.noptions) + '_' +
                            str(args.ffuncnunit) + '_' + str(args.rseed))
    op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
               '_' + str(args.ffuncnunit) + '_' + str(args.rseed))

    filename = args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + \
               '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'fvalues.pdf'

    plot_fvalue(low_bfr, op, filename=filename)
def main():
    args = arguments()

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    bfr, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
    # TODO: Trajectories are generated using noptions-1 options.

    if args.reverse:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                     'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                 '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr.save(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                     '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'low_traj')

    print('bfr size=', bfr.size())
    print('lbfr size=', low_bfr.size())

    if args.task == 'PointMaze-v0':
        # s2 holds the next states; keep it separate from the sampled states s.
        s, a, r, s2, t = low_bfr.sample(20)
        for state in s:
            # print('s=', state)
            # TODO: how do we get the X, Y coordinates of the agent?
            print('x,y=', state.data[0], state.data[1])

    if args.task == 'MontezumaRevenge-ram-v0':
        s, a, r, s2, t = low_bfr.sample(20)

        def getByte(ram, row, col):
            row = int(row, 16) - 8
            col = int(col, 16)
            return ram[row * 16 + col]

        for state in s:
            x = int(getByte(state.data, 'a', 'a'))
            y = int(getByte(state.data, 'a', 'b'))
            x_img = int(210.0 * (float(x) - 1) / float((9 * 16 + 8) - 1))
            y_img = int(160.0 * (float(y) - (8 * 16 + 6)) /
                        float((15 * 16 + 15) - (8 * 16 + 6)))
            print('(ram) x, y =', x, y)
            print('(img) x, y =', x_img, y_img)
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.online:
        # TODO: Think how to solve the restoration for batch normalization.
        op = OptionWrapper(sess=None,
                           experience_buffer=None,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           n_units=args.ffuncnunit,
                           init_all=args.initall,
                           restore=True,
                           name='online-option' + str(args.noptions) + '_' + str(args.ffuncnunit))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                   '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1,
                           filename=args.basedir + '/vis/' + args.task + 'online-option' +
                           str(args.noptions) + '_' + str(args.ffuncnunit) + '/' + 'eigenfunc.pdf')
    else:
        op = OptionWrapper(sess=None,
                           experience_buffer=None,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           n_units=args.ffuncnunit,
                           init_all=args.initall,
                           restore=True,
                           name='option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        op.restore(args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) +
                   '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        plot_eigenfunction(op, args, xind=0, yind=1,
                           filename=args.basedir + '/vis/' + args.task + 'option' +
                           str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                           str(args.rseed) + '/' + 'eigenfunc.pdf')
def main(open_plot=True):
    # TODO: Accept a set of options instead of just one
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    oagent = OptionAgent(sess=None,
                         obs_dim=state_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=1 + args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='op')
    oagent.reset()

    for nop in range(1, args.noptions + 1):
        op = OptionWrapper(sess=None,
                           experience_buffer=None,
                           obs_dim=state_dim,
                           obs_bound=mdp.bounds(),
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           init_all=args.initall,
                           restore=True,
                           name='option' + str(nop) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        if args.trajdir == '__default':
            if args.reverse:
                opdir = './vis/' + args.task + 'option' + str(nop) + 'rev_' + \
                        str(args.ffuncnunit) + '_' + str(args.rseed)
            else:
                opdir = './vis/' + args.task + 'option' + str(nop) + '_' + \
                        str(args.ffuncnunit) + '_' + str(args.rseed)
        else:
            # Only one option can be restored from nonstandard locations.
            assert args.noptions == 1
            opdir = args.trajdir
        op.restore(opdir)
        print('restored option', opdir)
        # print('upper_th=', op.upper_th)
        oagent.add_option(op)

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None,
                           obs_dim=state_dim,
                           obs_bound=state_bound,
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           num_options=1,
                           high_method=args.highmethod,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           low_update_freq=args.lowupdatefreq,
                           option_batch_size=1,
                           option_buffer_size=2,
                           high_update_freq=10000000,
                           init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    # TODO: We need to count the number of times the agent reached the goal state,
    # because it is hard to tell from the cumulative rewards whether the agent is
    # performing as intended. Possible solutions (see the previous works first):
    #   1. Plot the number of times the agent reached the goal.
    #   2. Give a positive reward when it reaches the goal.
    # (A hedged sketch of option 1 is given after this function.)
    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)

    options = oagent.options
    for nop in range(1, len(options)):
        if args.trajdir == '__default':
            opdir = './vis/' + args.task + 'option' + str(nop) + '_' + \
                    str(args.ffuncnunit) + '_' + str(args.rseed)
        else:
            assert args.noptions == 1
            opdir = args.trajdir
        # print('upper=', options[nop].upper_th)
        options[nop].save(opdir + '_trained')

    if args.trajdir == '__default':
        bufdir = './vis/' + args.task + 'option' + str(args.noptions) + '_' + \
                 str(args.ffuncnunit) + '_' + str(args.rseed)
    else:
        bufdir = args.trajdir

    oagent.option_buffer.save(bufdir + '_trained' + '/' + 'traj')
    oagent.experience_buffer.save(bufdir + '_trained' + '/' + 'low_traj')
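
# Hedged sketch for solution 1 in the TODO above: count goal arrivals by scanning a saved
# low-level experience buffer rather than inferring them from cumulative reward. The helper
# name is hypothetical, and it assumes the stored terminal flag is set on goal arrival; if
# timeouts also set the flag, pass a task-specific `is_goal` predicate instead.
def count_goal_visits(experience_buffer, is_goal=None):
    """Return how many transitions end in a goal (terminal) state."""
    count = 0
    for sample in experience_buffer.buffer[:experience_buffer.size()]:
        s2, terminal = sample[3], sample[4]
        reached = is_goal(s2) if is_goal is not None else terminal
        if reached:
            count += 1
    return count

# Example (hypothetical) usage after run_agents_on_mdp above:
#   print('goal visits=', count_goal_visits(oagent.experience_buffer))
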
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    if args.restoretraj:
        # bfr = ExperienceBuffer()
        # bfr.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' + str(args.rseed) + '/' + 'traj')
        low_bfr = ExperienceBuffer()
        if args.reverse:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
        else:
            low_bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) +
                            '_' + str(args.rseed) + '/' + 'low_traj')
    else:
        _, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
        print('sampled')

    # TODO: Print a list of states
    samples = low_bfr.buffer
    size = low_bfr.size()
    print('size=', size)

    # Split the buffer into the trajectory segments executed by option args.noptions.
    # (A reusable sketch of this segmentation is given after this function.)
    trajectories = []
    traj = []
    cur_o = None
    for i in range(size):
        # TODO: something wrong is happening in the trajectory. Why?
        s, a, r, s2, t, o = (samples[i][0], samples[i][1], samples[i][2],
                             samples[i][3], samples[i][4], samples[i][5])
        # assert(t is False)
        # print('o=', o, ', t=', t)
        if cur_o == args.noptions:
            if o == args.noptions and not t and i != size - 1:
                traj.append(s)
            else:
                # traj.append(s2)
                if args.tasktype == 'pinball':
                    # TODO: hack to remove the init state.
                    states = [s for s in traj if s.x != 0.2 or s.y != 0.2]
                else:
                    states = traj
                # for j, s in enumerate(states):
                #     if 0.01466 <= s.data[0] and s.data[0] <= 0.01467:
                #         states.remove(s)
                #         # break
                #     print(s.data[0])
                trajectories.append((i, states))
                cur_o = 0
                traj = []
                # TODO: what is the best way to print these figures out?
                # break
        else:
            if o == args.noptions:
                traj = [s]
                cur_o = args.noptions

    for i, states in trajectories:
        print(i, ' traj length=', len(states))
        if args.reverse:
            plot_trajectory(states, mdp, args,
                            filename=args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' +
                            str(args.rseed) + '/' + 'traj' + str(i) + '.pdf')
        else:
            plot_trajectory(states, mdp, args,
                            filename=args.basedir + '/vis/' + args.task + 'option' +
                            str(args.noptions) + '_' + str(args.ffuncnunit) + '_' +
                            str(args.rseed) + '/' + 'traj' + str(i) + '.pdf')
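
# Hedged sketch of the segmentation loop above (the same pattern reappears in the vistraj /
# visterm branch of the combined script later in this file): split a flat buffer of
# (s, a, r, s2, t, o) samples into the contiguous segments executed by one option id.
# The helper name is hypothetical and it assumes 6-tuple samples that include the option id.
def split_option_trajectories(experience_buffer, option_id, min_len=1):
    """Return a list of (end_index, [states]) segments where `option_id` was executing."""
    segments = []
    current = []
    for i, sample in enumerate(experience_buffer.buffer[:experience_buffer.size()]):
        s, terminal, o = sample[0], sample[4], sample[5]
        if o == option_id and not terminal:
            current.append(s)
        elif current:
            # The option stopped (terminal step or another option took over): close the segment.
            if len(current) >= min_len:
                segments.append((i, current))
            current = []
    if len(current) >= min_len:
        segments.append((experience_buffer.size(), current))
    return segments
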
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         low_update_freq=args.lowupdatefreq,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         high_update_freq=args.highupdatefreq,
                         name='diayn' + str(args.noptions))

    for i in range(args.noptions):
        op = DiaynOption(rst, i, args.termprob)
        oagent.add_option(op)

    run_agents_on_mdp([oagent],
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)


if __name__ == "__main__":
    args = arguments()
    if args.exp == 'sample':
        save(args)
    elif args.exp == 'evaloff':
        restore(args)
    else:
        print('set --exp sample or evaloff')
        assert False
def main(open_plot=True):
    # TODO: Accept a set of options and generate a new option based on them.
    args = arguments()

    np.random.seed(1234)
    # tf.set_random_seed(args.rseed)
    # tf.set_random_seed(5678)
    # tf.set_random_seed(5408)
    tf.set_random_seed(2345)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    # We generate the k-th option based on the previous k-1 options.
    if args.restoretraj:
        bfr = ExperienceBuffer()
        if args.reverse:
            print('restoring buffer from ' + args.basedir + '/vis/' + args.task +
                  'option' + str(args.noptions - 1) + 'rev_' + str(args.ffuncnunit) +
                  '_' + str(args.rseed) + '/' + 'traj')
            bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + 'rev_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'traj')
        else:
            print('restoring buffer from ' + args.basedir + '/vis/' + args.task +
                  'option' + str(args.noptions - 1) + '_' + str(args.ffuncnunit) +
                  '_' + str(args.rseed) + '/' + 'traj')
            bfr.restore(args.basedir + '/vis/' + args.task + 'option' +
                        str(args.noptions - 1) + '_' + str(args.ffuncnunit) + '_' +
                        str(args.rseed) + '/' + 'traj')
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        bfr, _ = sample_option_trajectories(mdp, args, noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    # TODO: In graph theory, inserting an edge can change the topology significantly.
    # However, adding just one transition sample to the NN does not seem to change it much.
    # Can we tackle this problem other than by sampling the trajectories again?
    # (A hedged sketch of incremental fine-tuning is given after this function.)
    op = OptionWrapper(sess=None,
                       experience_buffer=bfr,
                       option_b_size=min(32, bfr_size),
                       sp_training_steps=args.sptrainingstep,
                       obs_dim=state_dim,
                       obs_bound=state_bound,
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       init_all=args.initall,
                       restore=None,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.train:
    #     op.train(bfr, batch_size=args.snepisodes * args.snsteps)

    if args.reverse:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + \
                   'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed)
    else:
        filename = args.basedir + '/vis/' + args.task + 'option' + str(args.noptions) + \
                   '_' + str(args.ffuncnunit) + '_' + str(args.rseed)

    op.save(filename)
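
# Hedged sketch for the TODO above: rather than re-sampling trajectories from scratch every
# time new transitions arrive, restore the previously trained option and fine-tune its
# f-function on the augmented buffer. Only methods already used in this repository
# (OptionWrapper.train / .save / .restore) are assumed; the default step count is a guess.
def finetune_option(op, bfr, steps=100, batch_size=128):
    """Continue training a restored option on a buffer that includes the new samples."""
    for _ in range(steps):
        op.train(bfr, batch_size=min(batch_size, bfr.size()))
    return op

# Example (hypothetical) usage: restore option k-1, append the new transitions to `bfr`,
# then call finetune_option(op, bfr, steps=args.sptrainingstep) and op.save(...).
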
        upper_limit = 2 ** sequence_length
        nonce = 0
        while not golden_nonce and nonce < upper_limit:
            binary_sequence = self.binary_format(nonce).rjust(sequence_length, '0')
            evaluator = NonceEvaluator(data, binary_sequence, difficulty)
            if evaluator.valid_nonce():
                golden_nonce = binary_sequence
            nonce += 1
        sequence_length += 1
        return (golden_nonce, evaluator.hexdigest)

    def binary_format(self, nonce):
        return bin(nonce)[2:]


if __name__ == "__main__":
    _, difficulty, data = util.arguments(sys.argv)
    print("Data:", data, "| Difficulty:", difficulty)

    start_time = time.time()
    binary_sequence, hexdigest = SimpleFinder().find_by_zero_prepend(data, difficulty)
    # binary_sequence, hexdigest = SimpleFinder().find_by_increment(data, difficulty)
    processing_time = time.time() - start_time

    nonce = int(binary_sequence, 2)
    print("Golden Nonce:", nonce, "|", binary_sequence + "(" + str(len(binary_sequence)) + ")")
    print("Processing time: {0:.3f} s.".format(processing_time))
    print("Hexdigest", hexdigest)
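
# The NonceEvaluator used above is not shown in this file. The sketch below assumes the
# common "golden nonce" criterion: the SHA-256 hash of data + nonce must start with
# `difficulty` zero bits. The class and attribute names mirror how the script uses the
# evaluator, but the acceptance rule itself is an assumption.
import hashlib


class NonceEvaluator:
    def __init__(self, data, binary_sequence, difficulty):
        self.difficulty = int(difficulty)
        digest = hashlib.sha256((data + binary_sequence).encode()).digest()
        self.hexdigest = digest.hex()
        # Full 256-bit binary string of the digest, zero-padded on the left.
        self.bindigest = bin(int.from_bytes(digest, 'big'))[2:].zfill(256)

    def valid_nonce(self):
        # A nonce is "golden" when the hash has at least `difficulty` leading zero bits.
        return self.bindigest.startswith('0' * self.difficulty)
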
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('tasktype=', args.tasktype)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    oagent = OptionAgent(sess=None,
                         obs_dim=state_dim,
                         obs_bound=state_bound,
                         num_actions=num_actions,
                         action_dim=action_dim,
                         action_bound=action_bound,
                         num_options=args.noptions,
                         init_all=args.initall,
                         high_method=args.highmethod,
                         low_method=args.lowmethod,
                         f_func=args.ffunction,
                         batch_size=args.batchsize,
                         buffer_size=args.buffersize,
                         option_batch_size=args.obatchsize,
                         option_buffer_size=args.obuffersize,
                         option_freq=args.ofreq,
                         option_min_steps=args.ominsteps,
                         name=str(args.noptions) + 'op-initall')

    agents = []
    agents.append(oagent)

    if args.base:
        base = OptionAgent(sess=None,
                           obs_dim=state_dim,
                           obs_bound=state_bound,
                           num_actions=num_actions,
                           action_dim=action_dim,
                           action_bound=action_bound,
                           num_options=1,
                           high_method=args.highmethod,
                           low_method=args.lowmethod,
                           f_func=args.ffunction,
                           batch_size=args.batchsize,
                           buffer_size=args.buffersize,
                           option_batch_size=1,
                           option_buffer_size=2,
                           init_all=args.initall,
                           name='base')
        agents.append(base)

    mdp.reset()

    run_agents_on_mdp(agents,
                      mdp,
                      episodes=args.nepisodes,
                      steps=args.nsteps,
                      instances=args.ninstances,
                      cumulative_plot=True)

    # TODO: Save the options learned by the agent
    options = oagent.generated_options[1]
    print('options=', options)
    for i, op in enumerate(options):
        if i == 0:
            continue
        op.save('./vis/' + args.task + 'online-option' + str(i) + '_' + str(args.rseed))
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(1234)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    # TODO: Train an option using the trajectories sampled by itself.
    op = OptionWrapper(sess=None,
                       experience_buffer=None,
                       obs_dim=state_dim,
                       obs_bound=mdp.bounds(),
                       num_actions=num_actions,
                       action_dim=action_dim,
                       action_bound=action_bound,
                       low_method=args.lowmethod,
                       f_func=args.ffunction,
                       n_units=args.ffuncnunit,
                       restore=True,
                       init_all=args.initall,
                       reversed_dir=args.reverse,
                       name='option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))

    # if args.reverse:
    #     op.restore('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed))
    # else:
    op.restore('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
               str(args.ffuncnunit) + '_' + str(args.rseed))
    op.reversed_dir = args.reverse

    # TODO: Shouldn't we train the policy based on its own sample frequency?
    if args.restoretraj:
        if args.trajdir == '__default':
            args.trajdir = './vis/' + args.task + 'option' + str(args.noptions - 1) + \
                           '_' + str(args.ffuncnunit) + '_' + str(args.rseed) + '/' + 'low_traj'
        print('restoring buffer from ' + args.trajdir)
        bfr = ExperienceBuffer()
        # if args.reverse:
        #     bfr.restore('./vis/' + args.task + 'option' + str(args.noptions - 1) + 'rev_' + str(args.rseed) + '/' + 'low_traj')
        # else:
        bfr.restore(args.trajdir)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)  # TODO: parameter?
    else:
        _, bfr = sample_option_trajectories(mdp, args, noptions=args.noptions - 1)
        bfr_size = bfr.size()
        print('bfr_size=', bfr_size)

    _, _, r, _, _ = bfr.sample(32)
    print('rewards=', r)

    for _ in range(args.sptrainingstep):
        op.train(bfr, batch_size=min(128, bfr_size))

    if args.reverse:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + 'rev_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
    else:
        op.save('./vis/' + args.task + 'option' + str(args.noptions) + '_' +
                str(args.ffuncnunit) + '_' + str(args.rseed))
def main(open_plot=True):
    args = arguments()

    # Random seeds
    np.random.seed(args.rseed)
    tf.set_random_seed(args.rseed)

    print('task=', args.task)

    mdp, state_dim, state_bound, num_actions, action_dim, action_bound = get_mdp_params(args)

    #################################
    # 1. Retrieve trajectories
    #################################
    if args.trajdir == '__default':
        prefix = '.'
    else:
        prefix = args.trajdir

    if args.exp == "generate" or args.exp == "train":
        pathnop = str(args.noptions - 1)
    else:
        pathnop = str(args.noptions)

    # if args.reverse:
    #     dirop = 'rev_'
    # else:
    #     dirop = '_'
    dirop = '_'

    # pathdir: directory for the trajectories
    # opdir  : directory for the option
    pathdir = prefix + '/vis/' + args.task + 'option' + pathnop + dirop + \
              str(args.ffuncnunit) + '_' + str(args.rseed)
    opdir = prefix + '/vis/' + args.task + 'option' + str(args.noptions) + dirop + \
            str(args.ffuncnunit) + '_' + str(args.rseed)

    if args.saveimage:
        lowbfr_path = pathdir + '/low_traj_img'
        bfr_path = pathdir + '/traj_img'
    elif args.savecmp:
        lowbfr_path = pathdir + '/low_traj_sa'
        bfr_path = pathdir + '/traj_sa'
    else:
        lowbfr_path = pathdir + '/low_traj'
        bfr_path = pathdir + '/traj'

    bfrexp = ["vistraj", "visterm", "visvis", "visfval"]
    bfrexp_ = bfrexp + ["train"]

    if args.exp == "generate":
        print('restoring', bfr_path)
        bfr = ExperienceBuffer()
        if args.savecmp:
            bfr.restore_sa(bfr_path)
        else:
            bfr.restore(bfr_path)
    elif args.exp in bfrexp_:
        if args.exp in bfrexp and args.reverse:
            lowbfr_path = lowbfr_path + 'rev'
        print('restoring', lowbfr_path)
        low_bfr = ExperienceBuffer()
        if args.savecmp:
            low_bfr.restore_sao(lowbfr_path)
        else:
            low_bfr.restore(lowbfr_path)

        mix_traj = False
        if mix_traj:
            low_bfr2 = ExperienceBuffer()
            opdir2 = prefix + '/vis/' + args.task + 'option0' + dirop + \
                     str(args.ffuncnunit) + '_' + str(args.rseed)
            # # TODO: savecmp not supported
            # low_bfr2.restore(opdir2 + '/low_traj')
    else:
        print('No buffer retrieved')

    #################################
    # 2. Retrieve options
    #################################
    # Experiments which require one option to be restored.
    oneopexp = ["visop", "visfval", "train"]
    # Multiple options are restored inside util.py, so they are not handled here.
    # multiopexp = ["sample"]
    if args.exp in oneopexp:
        op = CoveringOption(sess=None,
                            experience_buffer=None,
                            obs_dim=state_dim,
                            obs_bound=mdp.bounds(),
                            num_actions=num_actions,
                            action_dim=action_dim,
                            action_bound=action_bound,
                            low_method=args.lowmethod,
                            f_func=args.ffunction,
                            n_units=args.ffuncnunit,
                            init_all=args.initall,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist,
                            term_dist=args.term_dist,
                            restore=True,
                            name='option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
        op.restore(opdir)
    else:
        print('No option retrieved')

    #################################
    # 3. Run experiments
    #################################
    if args.exp == 'sample':
        print('sample')
        bfr, low_bfr = sample_option_trajectories(mdp, args, noptions=args.noptions)
    elif args.exp == 'generate':
        print('generate_option')
        print('buffersize = ', bfr.size())
        # option_b_size below is the batch size for training the f-function.
        op = CoveringOption(sess=None,
                            experience_buffer=bfr,
                            option_b_size=32,
                            sp_training_steps=args.sptrainingstep,
                            obs_dim=state_dim,
                            obs_bound=state_bound,
                            num_actions=num_actions,
                            action_dim=action_dim,
                            action_bound=action_bound,
                            low_method=args.lowmethod,
                            f_func=args.ffunction,
                            n_units=args.ffuncnunit,
                            init_all=args.initall,
                            reversed_dir=args.reverse,
                            init_around_goal=args.init_around_goal,
                            init_dist=args.init_dist,
                            term_dist=args.term_dist,
                            restore=None,
                            name='option' + str(args.noptions) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
    elif args.exp == 'train':
        print('train_option')
        op.reversed_dir = args.reverse
        _, _, r, _, _ = low_bfr.sample(32)
        print('background rewards=', r)
        for _ in range(args.sptrainingstep):
            op.train(low_bfr, batch_size=min(args.batchsize, low_bfr.size()))
    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('evaloff')
        agent_name = str(args.noptions) + 'options'
        if args.exp == 'evalon':
            agent_name = agent_name + '-online'
        if args.random_agent:
            oagent = GenerateRandomAgent(num_actions, action_dim, action_bound)
        else:
            oagent = OptionAgent(sess=None,
                                 obs_dim=state_dim,
                                 obs_bound=state_bound,
                                 num_actions=num_actions,
                                 action_dim=action_dim,
                                 action_bound=action_bound,
                                 num_options=1 + args.noptions,
                                 high_method=args.highmethod,
                                 low_method=args.lowmethod,
                                 f_func=args.ffunction,
                                 batch_size=args.batchsize,
                                 buffer_size=args.buffersize,
                                 low_update_freq=args.lowupdatefreq,
                                 option_batch_size=args.obatchsize,
                                 option_buffer_size=args.obuffersize,
                                 high_update_freq=args.highupdatefreq,
                                 init_all=args.initall,
                                 init_around_goal=args.init_around_goal,
                                 init_dist=args.init_dist,
                                 term_dist=args.term_dist,
                                 name=agent_name)
        oagent.reset()
        if args.exp == 'evaloff':
            for nop in range(1, args.noptions + 1):
                op = CoveringOption(sess=None,
                                    experience_buffer=None,
                                    obs_dim=state_dim,
                                    obs_bound=mdp.bounds(),
                                    num_actions=num_actions,
                                    action_dim=action_dim,
                                    action_bound=action_bound,
                                    low_method=args.lowmethod,
                                    f_func=args.ffunction,
                                    init_all=args.initall,
                                    init_around_goal=args.init_around_goal,
                                    init_dist=args.init_dist,
                                    term_dist=args.term_dist,
                                    restore=True,
                                    name='option' + str(nop) + '_' + str(args.ffuncnunit) + '_' + str(args.rseed))
                if args.reverse:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(nop) + \
                            'rev_' + str(args.ffuncnunit) + '_' + str(args.rseed)
                else:
                    opdir = prefix + '/vis/' + args.task + 'option' + str(nop) + \
                            '_' + str(args.ffuncnunit) + '_' + str(args.rseed)
                op.restore(opdir)
                print('restored option', opdir)
                oagent.add_option(op)
        else:
            print('evalon')
        mdp.reset()
        run_agents_on_mdp([oagent],
                          mdp,
                          episodes=args.nepisodes,
                          steps=args.nsteps,
                          instances=args.ninstances,
                          cumulative_plot=True,
                          verbose=args.verbose)
    else:
        print('No experiments run')

    #################################
    # 4. Plot figures
    #################################
    if args.exp == 'visop':
        plot_op(op, args, mdp, state_bound, opdir + '/eigenfunc.pdf')
    elif args.exp == 'vistraj' or args.exp == 'visterm':
        print(args.exp)
        samples = low_bfr.buffer
        size = low_bfr.size()

        trajectories = []
        traj = []
        cur_o = None
        for i in range(size):
            s, _, _, _, t, o = (samples[i][0], samples[i][1], samples[i][2],
                                samples[i][3], samples[i][4], samples[i][5])
            if cur_o == args.noptions:
                if o == args.noptions and not t and i != size - 1:
                    traj.append(s)
                else:
                    # traj.append(s2)
                    # if args.tasktype == 'pinball':
                    #     t = [s for s in traj if s.x != 0.2 or s.y != 0.2]  # TODO: hack to remove the init state.
                    # else:
                    #     t = traj
                    if len(traj) > 10:
                        trajectories.append((i, traj))
                    cur_o = 0
                    traj = []
            else:
                if o == args.noptions:
                    traj = [s]
                    cur_o = args.noptions

        if len(trajectories) == 0:
            print('no trajectories sampled')

        if args.exp == 'visterm':
            # Terminal state of each segment (each entry of `trajectories` is (end_index, states)).
            terms = [states[-1] for _, states in trajectories]
            terms = terms[0:min(len(terms), 100)]
            # print('terms=', type(terms))
            print('#terms=', len(terms))
            if args.reverse:
                plot_terms(terms, mdp, args, filename=pathdir + '/' + 'terms' + 'rev')
            else:
                plot_terms(terms, mdp, args, filename=pathdir + '/' + 'terms')
        else:
            states = trajectories[1][1]
            plot_traj(states, mdp, args, filename=pathdir + '/' + 'traj' + str(1))
    elif args.exp == 'visvis':
        print('visvis')
        samples = low_bfr.buffer
        traj = [samples[i][0] for i in range(low_bfr.size())]
        if mix_traj:
            samples2 = low_bfr2.buffer
            traj2 = [samples2[i][0]
                     for i in range(int(min(low_bfr2.size() / 2, len(traj) / 2)))]
            traj = traj[:int(len(traj) / 2)] + traj2
        plot_vis(traj, args, mdp, pathdir + '/visitation')
    elif args.exp == 'visfval':
        print('visfval')
    else:
        print('No plots')

    #################################
    # 5. Save the results
    #################################
    if args.exp == 'sample':
        print('save sample')
        if args.reverse:
            dirop = "rev"
        else:
            dirop = ""
        if args.saveimage:
            bfr.save(pathdir + '/traj_img' + dirop)
            low_bfr.save(pathdir + '/low_traj_img' + dirop)
        elif args.savecmp:
            bfr.save_sa(pathdir + '/traj_sa' + dirop)
            low_bfr.save_sao(pathdir + '/low_traj_sa' + dirop)
        else:
            bfr.save(pathdir + '/traj' + dirop)
            low_bfr.save(pathdir + '/low_traj' + dirop)
    elif args.exp == 'evaloff' or args.exp == 'evalon':
        print('save', args.exp)
        options = oagent.options
        for nop in range(1, len(options)):
            opdir = prefix + '/vis/' + args.task + 'option' + str(nop) + '_' + \
                    str(args.ffuncnunit) + '_' + str(args.rseed)
            if args.exp == 'evalon':
                opdir = opdir + '_online'
            options[nop].save(opdir + '_trained')
        oagent.option_buffer.save(pathdir + '_trained' + '/' + 'traj')
        oagent.experience_buffer.save(pathdir + '_trained' + '/' + 'low_traj')
    elif args.exp == 'generate':
        print('save generate')
        op.save(opdir)
    elif args.exp == 'train':
        print('save train')
        if args.reverse:
            op.save(opdir, rev=True)
        else:
            op.save(opdir)
    else:
        print('No save')
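
# Hedged sketch of how the --exp stages above are typically chained to build k covering
# options before evaluation: sample with the options built so far, generate the next
# option's f-function from those trajectories, train its policy, then evaluate offline.
# The script name `main.py` and the exact flag spellings are assumptions inferred from the
# argument names used in this file (args.exp, args.noptions, args.task, args.rseed).
import subprocess


def build_and_evaluate_options(task, k, rseed, script='main.py'):
    common = ['python', script, '--task', task, '--rseed', str(rseed)]
    for n in range(1, k + 1):
        # Sample trajectories with the n-1 options that already exist.
        subprocess.run(common + ['--exp', 'sample', '--noptions', str(n - 1)], check=True)
        # Fit option n's f-function (eigenfunction) on those trajectories.
        subprocess.run(common + ['--exp', 'generate', '--noptions', str(n)], check=True)
        # Train option n's internal policy on the low-level trajectories.
        subprocess.run(common + ['--exp', 'train', '--noptions', str(n)], check=True)
    # Evaluate an agent equipped with all k pre-trained options.
    subprocess.run(common + ['--exp', 'evaloff', '--noptions', str(k)], check=True)
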