def main():
    args = parser.parse_args()

    # Use the rendering environment only when display is requested.
    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld

    env = GridWorld(display=args.render,
                    obstacles=[np.asarray([1, 2])],
                    goal_state=np.asarray([5, 5]),
                    step_wrapper=step_wrapper,
                    reset_wrapper=reset_wrapper,
                    seed=3)

    # Loss-based termination criterion for training.
    loss_t = LBT(list_size=100, stop_threshold=1.5, log_interval=100)
    model = ActorCritic(env,
                        gamma=0.99,
                        log_interval=200,
                        max_episodes=5000,
                        max_ep_length=20,
                        termination=loss_t)

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if args.reward_net is not None:
        reward_net = RewardNet(env.reset().shape[0])
        reward_net.to('cuda')
        reward_net.load('./saved-models-rewards/0.pt')
        reward_net.eval()
    else:
        reward_net = None

    if not args.play:
        model.train_mp(n_jobs=4, reward_net=reward_net, irl=args.irl)
        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'
        model.generate_trajectory(args.num_trajs, './trajs/ac_gridworld/')
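# The argument parser used by main() above is not included in this snippet. Below is a
# minimal sketch of one plausible definition, inferred only from the attributes main()
# reads (args.render, args.policy_path, args.reward_net, args.irl, args.play,
# args.dont_save, args.num_trajs). Flag types, defaults, and help strings are
# illustrative assumptions, not the original configuration.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--render', action='store_true',
                    help='display the gridworld while running')
parser.add_argument('--policy-path', type=str, default=None,
                    help='path to a saved policy network')
parser.add_argument('--reward-net', type=str, default=None,
                    help='if set, load a learned reward network instead of the true reward')
parser.add_argument('--irl', action='store_true',
                    help='train with the IRL reward signal')
parser.add_argument('--play', action='store_true',
                    help='generate trajectories from a saved policy instead of training')
parser.add_argument('--dont-save', action='store_true',
                    help='skip saving the trained policy')
parser.add_argument('--num-trajs', type=int, default=10,
                    help='number of trajectories to generate in play mode')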
parser.add_argument('--gamma', type=float, default=0.9)
parser.add_argument('--model', default='convdeconv1')
parser.add_argument('--target', type=int, default=1000)
parser.add_argument('--path', required=True)
boolean_flag(parser, 'dueling', default=True)
boolean_flag(parser, 'norm', default=True)
boolean_flag(parser, 'double', default=True)
boolean_flag(parser, 'render', default=False)
args = parser.parse_args()

n_steps = int(1e8)
train_level = 'level1'
test_levels = ['level1', 'level2', 'level3']

env = GridWorld(train_level)
coords_shape = env.unwrapped.coords_shape

set_global_seeds(args.seed)
env.seed(args.seed)

print('~~~~~~~~~~~~~~~~~~~~~~')
print(env.spec.id)
print('observations:', env.observation_space.shape)
print('coords:      ', coords_shape)
print('actions:     ', env.action_space.n)
print('walls:       ', env.unwrapped.walls.shape)
print('~~~~~~~~~~~~~~~~~~~~~~')

config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
def main():
    args = parser.parse_args()

    experiment_logger = Logger('temp_save.txt')
    experiment_logger.log_header('Arguments for the experiment :')
    experiment_logger.log_info(vars(args))

    mp.set_start_method('spawn')

    if args.render:
        from envs.gridworld import GridWorld
    else:
        from envs.gridworld_clockless import GridWorldClockless as GridWorld

    agent_width = 10
    step_size = 10
    obs_width = 10
    grid_size = 10

    # Select the feature extractor.
    if args.feat_extractor == 'Onehot':
        feat_ext = OneHot(grid_rows=10, grid_cols=10)
    elif args.feat_extractor == 'SocialNav':
        feat_ext = SocialNav(fieldList=['agent_state', 'goal_state'])
    elif args.feat_extractor == 'FrontBackSideSimple':
        feat_ext = FrontBackSideSimple(thresh1=1, thresh2=2, thresh3=3, thresh4=4,
                                       step_size=step_size,
                                       agent_width=agent_width,
                                       obs_width=obs_width)
    elif args.feat_extractor == 'LocalGlobal':
        feat_ext = LocalGlobal(window_size=3, grid_size=grid_size,
                               agent_width=agent_width,
                               obs_width=obs_width,
                               step_size=step_size)

    experiment_logger.log_header('Parameters of the feature extractor :')
    experiment_logger.log_info(feat_ext.__dict__)

    # Alternative static obstacle layout (unused):
    # np.asarray([2, 2]), np.asarray([7, 4]), np.asarray([3, 5]),
    # np.asarray([5, 2]), np.asarray([8, 3]), np.asarray([7, 5]),
    # np.asarray([3, 3]), np.asarray([3, 7]), np.asarray([5, 7])

    env = GridWorld(display=args.render,
                    is_onehot=False,
                    is_random=True,
                    rows=100,
                    cols=100,
                    agent_width=agent_width,
                    step_size=step_size,
                    obs_width=obs_width,
                    width=grid_size,
                    seed=7,
                    buffer_from_obs=0,
                    obstacles=3,
                    goal_state=np.asarray([5, 5]))

    experiment_logger.log_header('Environment details :')
    experiment_logger.log_info(env.__dict__)

    model = ActorCritic(env,
                        feat_extractor=feat_ext,
                        gamma=0.99,
                        log_interval=100,
                        max_ep_length=40,
                        hidden_dims=args.policy_net_hidden_dims,
                        max_episodes=4000)

    experiment_logger.log_header('Details of the RL method :')
    experiment_logger.log_info(model.__dict__)

    if args.policy_path is not None:
        model.policy.load(args.policy_path)

    if not args.play and not args.play_user:
        if args.reward_path is None:
            model.train_mp(n_jobs=4)
        else:
            from irlmethods.deep_maxent import RewardNet
            state_size = feat_ext.extract_features(env.reset()).shape[0]
            reward_net = RewardNet(state_size)
            reward_net.load(args.reward_path)
            print(next(reward_net.parameters()).is_cuda)
            model.train_mp(reward_net=reward_net, n_jobs=4)

        if not args.dont_save:
            model.policy.save('./saved-models/')

    if args.play:
        env.tickSpeed = 15
        assert args.policy_path is not None, 'pass a policy to play from!'
        model.generate_trajectory(args.num_trajs, './trajs/ac_fbs_simple4_static_map7/')

    if args.play_user:
        env.tickSpeed = 200
        model.generate_trajectory_user(args.num_trajs, './trajs/ac_gridworld_user/')
def experiment(args, agent_algorithm):
    np.random.seed()
    scores = list()
    # add timestamp to results
    ts = str(time.time())

    # Evaluation of the model provided by the user.
    if args.load_path and args.evaluation:
        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma, rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            gamma_eval = args.gamma
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]

        # Policy
        epsilon_test = Parameter(value=args.test_exploration_rate)
        pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)

        # Approximator
        input_shape = mdp.info.observation_space.shape + (1,)
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n,),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   name='test',
                                   load_path=args.load_path,
                                   net_type=args.net_type,
                                   optimizer={'name': args.optimizer,
                                              'lr': args.learning_rate,
                                              'lr_sigma': args.learning_rate,
                                              'decay': args.decay,
                                              'epsilon': args.epsilon})

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(batch_size=0,
                                initial_replay_size=0,
                                max_replay_size=0,
                                clip_reward=False,
                                target_update_frequency=1)
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
            pi = BootPolicy(args.n_approximators, epsilon=epsilon_test)
        elif args.alg == 'gaussian':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy(epsilon=epsilon_test)
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon_test)
        elif args.alg == 'particle':
            if args.ucb:
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators, epsilon=epsilon_test)
        else:
            raise ValueError("Algorithm unknown")

        if args.alg in ['gaussian', 'particle']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
            approximator_params['sigma_weight'] = args.sigma_weight
        if args.alg in ['particle', 'boot']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        # Algorithm
        core_test = Core(agent, mdp)

        # Evaluate model
        pi.set_eval(True)
        dataset = core_test.evaluate(n_steps=args.test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        get_stats(dataset)
    else:
        # DQN learning run
        print("Learning Run")

        # Settings
        if args.debug:
            initial_replay_size = 50
            max_replay_size = 500
            train_frequency = 5
            target_update_frequency = 10
            test_samples = 20
            evaluation_frequency = 50
            max_steps = 1000
        else:
            initial_replay_size = args.initial_replay_size
            max_replay_size = args.max_replay_size
            train_frequency = args.train_frequency
            target_update_frequency = args.target_update_frequency
            test_samples = args.test_samples
            evaluation_frequency = args.evaluation_frequency
            max_steps = args.max_steps

        # MDP
        if args.name not in ['Taxi', 'Gridworld']:
            mdp = Gym(args.name, args.horizon, args.gamma)
            n_states = None
            gamma_eval = 1.
        elif args.name == 'Taxi':
            mdp = generate_taxi('../../grid.txt')
            n_states = mdp.info.observation_space.size[0]
            gamma_eval = mdp.info.gamma
        else:
            rew_weights = [args.fast_zone, args.slow_zone, args.goal]
            grid_size = args.grid_size
            env = GridWorld(gamma=args.gamma, rew_weights=rew_weights,
                            shape=(grid_size, grid_size),
                            randomized_initial=args.rand_initial,
                            horizon=args.horizon)
            mdp = env.generate_mdp()
            n_states = mdp.info.observation_space.size[0]
            print(mdp.info.gamma)
            gamma_eval = args.gamma

        # Policy
        epsilon = LinearDecayParameter(value=args.initial_exploration_rate,
                                       min_value=args.final_exploration_rate,
                                       n=args.final_exploration_frame)
        epsilon_test = Parameter(value=args.test_exploration_rate)
        epsilon_random = Parameter(value=1.)

        policy_name = 'weighted'
        update_rule = args.update_type + "_update"
        if args.alg == 'boot':
            pi = BootPolicy(args.n_approximators, epsilon=epsilon)
            policy_name = 'boot'
            update_rule = 'boot'
        elif args.alg == 'dqn':
            pi = EpsGreedy(epsilon=epsilon)
            policy_name = 'eps_greedy'
            update_rule = 'td'
        elif args.alg == 'particle':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedPolicy(args.n_approximators)
        elif args.alg == 'gaussian':
            if args.ucb:
                policy_name = 'ucb'
                pi = UCBPolicy(delta=args.delta, q_max=1. / (1. - args.gamma))
            else:
                pi = WeightedGaussianPolicy()
        else:
            raise ValueError("Algorithm unknown")

        # Summary folder
        folder_name = ('./logs/' + args.alg + "/" + policy_name + '/'
                       + update_rule + '/' + args.name + "/" + args.loss + "/"
                       + str(args.n_approximators) + "_particles" + "/"
                       + args.init_type + "_init" + "/"
                       + str(args.learning_rate) + "/" + ts)

        # Approximator
        input_shape = mdp.info.observation_space.shape
        input_preprocessor = list()
        approximator_params = dict(input_shape=input_shape,
                                   output_shape=(mdp.info.action_space.n,),
                                   n_states=n_states,
                                   n_actions=mdp.info.action_space.n,
                                   n_features=args.n_features,
                                   n_approximators=args.n_approximators,
                                   input_preprocessor=input_preprocessor,
                                   folder_name=folder_name,
                                   net_type=args.net_type,
                                   sigma_weight=args.sigma_weight,
                                   optimizer={'name': args.optimizer,
                                              'lr': args.learning_rate,
                                              'lr_sigma': args.learning_rate,
                                              'decay': args.decay,
                                              'epsilon': args.epsilon})

        if args.load_path:
            # Resume a previous run: reuse its folder, scores and timestamp.
            ts = os.path.basename(os.path.normpath(args.load_path))
            approximator_params['load_path'] = args.load_path
            approximator_params['folder_name'] = args.load_path
            folder_name = args.load_path
            p = "scores_" + str(ts) + ".npy"
            scores = np.load(p).tolist()
            max_steps = max_steps - evaluation_frequency * len(scores)

        approximator = SimpleNet

        # Agent
        algorithm_params = dict(
            batch_size=args.batch_size,
            initial_replay_size=initial_replay_size,
            max_replay_size=max_replay_size,
            clip_reward=False,
            target_update_frequency=target_update_frequency // train_frequency,
        )
        if args.alg == 'boot':
            algorithm_params['p_mask'] = args.p_mask
        elif args.alg in ['particle', 'gaussian']:
            algorithm_params['update_type'] = args.update_type
            algorithm_params['delta'] = args.delta
            algorithm_params['store_prob'] = args.store_prob
            if args.clip_target:
                algorithm_params['max_spread'] = args.q_max - args.q_min
            approximator_params['q_min'] = args.q_min
            approximator_params['q_max'] = args.q_max
            approximator_params['loss'] = args.loss
            approximator_params['init_type'] = args.init_type
        if args.alg in ['boot', 'particle']:
            approximator_params['n_approximators'] = args.n_approximators
            algorithm_params['n_approximators'] = args.n_approximators

        agent = agent_algorithm(approximator, pi, mdp.info,
                                approximator_params=approximator_params,
                                **algorithm_params)

        if args.ucb:
            # Provide the UCB policy with a mean and an upper-quantile estimator
            # built on top of the agent's approximator.
            q = agent.approximator
            if args.alg == 'particle':
                def mu(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.array(q_list)
                    return qs.mean(axis=0)

                quantiles = [i * 1. / (args.n_approximators - 1)
                             for i in range(args.n_approximators)]
                for p in range(args.n_approximators):
                    if quantiles[p] >= 1 - args.delta:
                        delta_index = p
                        break

                def quantile_func(state):
                    q_list = q.predict(state).squeeze()
                    qs = np.sort(np.array(q_list), axis=0)
                    return qs[delta_index, :]

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

            if args.alg == 'gaussian':
                standard_bound = norm.ppf(1 - args.delta, loc=0, scale=1)

                def mu(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    return means

                def quantile_func(state):
                    q_and_sigma = q.predict(state).squeeze()
                    means = q_and_sigma[0]
                    sigmas = q_and_sigma[1]
                    return sigmas * standard_bound + means

                print("Setting up ucb policy")
                pi.set_mu(mu)
                pi.set_quantile_func(quantile_func)

        args.count = 100
        if args.plot_qs:
            import matplotlib.pyplot as plt
            colors = ['red', 'blue', 'green']
            labels = ['left', 'nop', 'right']

            def plot_probs(qs):
                args.count += 1
                if args.count < 1:
                    return
                ax.clear()
                for i in range(qs.shape[-1]):
                    mu = np.mean(qs[..., i], axis=0)
                    sigma = np.std(qs[..., i], axis=0)
                    x = np.linspace(mu - 3 * sigma, mu + 3 * sigma, 20)
                    ax.plot(x, stats.norm.pdf(x, mu, sigma),
                            label=labels[i], color=colors[i])
                ax.set_xlabel('Q-value')
                ax.set_ylabel('Probability')
                ax.set_title('Q-distributions')
                # ax.set_ylim(bottom=0, top=1)
                plt.draw()
                plt.pause(0.02)
                # print("Plotted")
                args.count = 0
                # return probs

            plt.ion()
            fig, ax = plt.subplots()
            plot_probs(np.array(agent.approximator.predict(np.array(mdp.reset()))))
            input()
            args.count = 100
            qs = np.array([np.linspace(-1000, 0, 10),
                           np.linspace(-2000, -1000, 10),
                           np.linspace(-750, -250, 10)])
            plot_probs(qs.T)

        # Algorithm
        core = Core(agent, mdp)
        core_test = Core(agent, mdp)

        # RUN

        # Fill replay memory with random dataset
        print_epoch(0)
        core.learn(n_steps=initial_replay_size,
                   n_steps_per_fit=initial_replay_size,
                   quiet=args.quiet)
        if args.save:
            agent.approximator.model.save()

        # Evaluate initial policy
        if hasattr(pi, 'set_eval'):
            pi.set_eval(True)
        pi.set_epsilon(epsilon_test)
        dataset = core_test.evaluate(n_steps=test_samples,
                                     render=args.render,
                                     quiet=args.quiet)
        scores.append(get_stats(dataset))
        if args.plot_qs:
            pi.set_plotter(plot_probs)
        np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

        for n_epoch in range(1, max_steps // evaluation_frequency + 1):
            print_epoch(n_epoch)
            print('- Learning:')
            # learning step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(False)
            pi.set_epsilon(epsilon)
            if args.plot_qs:
                pi.set_plotter(None)
            core.learn(n_steps=evaluation_frequency,
                       n_steps_per_fit=train_frequency,
                       quiet=args.quiet)
            if args.save:
                agent.approximator.model.save()

            print('- Evaluation:')
            # evaluation step
            if hasattr(pi, 'set_eval'):
                pi.set_eval(True)
            pi.set_epsilon(epsilon_test)
            if args.plot_qs:
                pi.set_plotter(plot_probs)
            dataset = core_test.evaluate(n_steps=test_samples,
                                         render=args.render,
                                         quiet=args.quiet)
            scores.append(get_stats(dataset))
            np.save(folder_name + '/scores_' + str(ts) + '.npy', scores)

    return scores
# NOTE: the two lines below are the tail of a truncated helper (the greedy one-step
# lookahead referred to as `update_rule` in policy_iteration below); its beginning is
# not part of this snippet.
            argmax_action = action
    return argmax_action


def policy_iteration(policy, env, discount=1.0):
    states = env.get_state_space()
    while True:
        policy_stable = True
        states_values = policy_evaluation(policy, env)
        states_values = states_values.flatten()
        for state in states:
            # Snapshot the current action distribution (a copy, not a reference),
            # so the stability check below can detect changes.
            tmp = dict(policy[state])
            argmax_action = update_rule(policy, env, state, states_values, discount)
            # Make the policy greedy: put all probability mass on the argmax action.
            for action in policy[state]:
                if action == argmax_action:
                    policy[state][action] = 1.0  # Max prob
                else:
                    policy[state][action] = 0
            if tmp != policy[state]:
                policy_stable = False
        if policy_stable:
            return policy


if __name__ == '__main__':
    env = GridWorld()
    policy = RandomPolicy(env)
    pprint(policy_iteration(policy, env).__dict__)
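# policy_iteration() above relies on a policy_evaluation() helper that is not part of
# this snippet. The sketch below is a generic iterative policy evaluation written under
# loudly stated assumptions: states returned by env.get_state_space() are integer
# indices, policy[s] maps actions to probabilities, and the environment exposes a
# hypothetical deterministic one-step model env.transition(s, a) -> (next_state, reward).
# It is illustrative only, not the original implementation.
import numpy as np

def policy_evaluation(policy, env, discount=1.0, theta=1e-6):
    states = env.get_state_space()
    values = np.zeros(len(states))
    while True:
        delta = 0.0
        for s in states:
            v_new = 0.0
            for a, prob in policy[s].items():
                next_s, reward = env.transition(s, a)  # hypothetical one-step model
                v_new += prob * (reward + discount * values[next_s])
            delta = max(delta, abs(v_new - values[s]))
            values[s] = v_new
        if delta < theta:
            return values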
def generate_agent_grid_visitation_map(policy_fname_list,
                                       feature_extractor=None,
                                       store=False):
    # Given the policy file name list and a feature extractor, creates a heatmap of the
    # agent on the gridworld based on the trajectories in the list.
    # If store=True, the figure is stored in the form of a pickle.

    # list containing the points of trajectories of all the policies
    trajectory_point_master_list = []
    traj_to_plot = 2

    env = GridWorld(display=False, is_onehot=False, is_random=False,
                    rows=10, cols=10, seed=3,
                    obstacles=[np.asarray([5, 5])],
                    goal_state=np.asarray([1, 5]))

    max_ep_length = 15
    run_iterations = 50
    rl_method = ActorCritic(env,
                            feat_extractor=feature_extractor,
                            gamma=0.99,
                            max_ep_length=max_ep_length,
                            log_interval=50)

    labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
    counter = 0
    for name in policy_fname_list:
        counter += 1
        if counter == traj_to_plot:
            policy_name_to_plot = name

        # ready the policy
        rl_method.policy.load(name)
        trajectory_point_policy = []

        env = GridWorld(display=False, is_onehot=False, is_random=False,
                        rows=10, cols=10, seed=7,
                        obstacles=[np.asarray([5, 5])],
                        goal_state=np.asarray([1, 5]))
        heat_map = np.zeros((env.rows, env.cols))

        for i in range(run_iterations):
            trajectory_point_run = []
            state = env.reset()
            heat_map[state['agent_state'][0], state['agent_state'][1]] += 1
            trajectory_point_run.append((state['agent_state'][0] * env.cellWidth,
                                         state['agent_state'][1] * env.cellWidth))
            state = feature_extractor.extract_features(state)

            for t in range(max_ep_length):
                action = rl_method.select_action(state)
                state, reward, done, _ = env.step(action)
                heat_map[state['agent_state'][0], state['agent_state'][1]] += 1
                trajectory_point_run.append((state['agent_state'][0] * env.cellWidth,
                                             state['agent_state'][1] * env.cellWidth))
                state = feature_extractor.extract_features(state)

            trajectory_point_policy.append(trajectory_point_run)

        trajectory_point_master_list.append(trajectory_point_policy)

        fig, ax = plt.subplots()
        im = ax.imshow(heat_map, vmin=0, vmax=40)
        ax.set_xticks(np.arange(10))
        ax.set_yticks(np.arange(10))
        ax.set_xticklabels(labels)
        ax.set_yticklabels(labels)
        ax.set_xlabel('Columns of the gridworld', fontsize='large')
        ax.set_ylabel('Rows of the gridworld', fontsize='large')

        # Annotate each cell with its visitation count.
        for i in range(len(labels)):
            for j in range(len(labels)):
                ax.text(j, i, heat_map[i, j],
                        ha="center", va="bottom", color="black")
                # arrow = ax.arrow(j, i, .1, .1, shape='full', head_width=.2)
                # arrow = ax.annotate("", xy=(j, i), arrowprops=arrow)

        ax.set_title("Grid location visitation frequency for an unbiased agent")
        # plt.colorbar()
        # plt.clim(0, 70)
        plt.draw()

        if store:
            pickle_filename = 'FigureObject' + str(counter) + '.fig.pickle'
            pickle.dump(fig, open(pickle_filename, 'wb'))
        plt.pause(.001)

    # annotate_trajectory(policy_name_to_plot, env, rl_method,
    #                     max_ep_length, ax, feature_extractor=feature_extractor)
    plt.show()
def plot_reward_across_policy_models(foldername,
                                     expert=None,
                                     feature_extractor=None,
                                     seed_list=[],
                                     iterations_per_model=50,
                                     compare_expert=True):
    # Given a folder of policy networks, the function goes through them one by one,
    # plots the reward obtained by each of the policy networks, and compares them
    # to that of an expert (if provided).
    color_list = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
    counter = 0
    reward_across_seeds = []
    xaxis = None

    for seed in seed_list:
        env = GridWorld(display=False, is_onehot=False, is_random=True,
                        rows=10, cols=10, seed=seed,
                        obstacles=[np.asarray([5, 1]), np.array([5, 9]),
                                   np.asarray([4, 1]), np.array([6, 9]),
                                   np.asarray([3, 1]), np.array([7, 9])],
                        goal_state=np.asarray([1, 5]))

        max_ep_length = 20
        rl_method = ActorCritic(env,
                                feat_extractor=feature_extractor,
                                gamma=0.99,
                                max_ep_length=max_ep_length,
                                log_interval=50)

        model_names = glob.glob(os.path.join(foldername, '*.pt'))
        xaxis = np.arange(len(model_names))

        # Baseline: reward obtained by the expert policy on the same environment.
        reward_exp = get_rewards_for_model(expert,
                                           env=env,
                                           feature_extractor=feature_extractor,
                                           rl_method=rl_method,
                                           max_ep_length=max_ep_length,
                                           iterations=iterations_per_model)

        reward_across_models = []
        reward_expert = []
        for policy_file in sorted(model_names, key=numericalSort):
            print('Evaluating policy:', policy_file)
            reward_per_model = get_rewards_for_model(policy_file,
                                                     env=env,
                                                     feature_extractor=feature_extractor,
                                                     rl_method=rl_method,
                                                     max_ep_length=max_ep_length,
                                                     iterations=iterations_per_model)
            print('Average reward for the model:', reward_per_model)
            reward_across_models.append(reward_per_model)
            reward_expert.append(reward_exp)

        reward_across_seeds.append(reward_across_models)

    np_reward_across_seeds = np.array(reward_across_seeds)
    print(np_reward_across_seeds.shape)
    means_rewards = np.mean(np_reward_across_seeds, axis=0)
    print('The mean rewards:', means_rewards)
    print('The mean across all runs and seeds:', np.mean(means_rewards))
    std_rewards = np.std(np_reward_across_seeds, axis=0)
    print('The std:', std_rewards)

    plt.xlabel('IRL iteration no.')
    plt.ylabel('Reward obtained')
    plt.plot(xaxis, means_rewards,
             color=color_list[counter], label='IRL trained agent')
    plt.fill_between(xaxis,
                     means_rewards - std_rewards,
                     means_rewards + std_rewards,
                     alpha=0.5, facecolor=color_list[counter])
    plt.plot(reward_expert, color='k', label='Expert agent')
    plt.legend()
    plt.draw()
    plt.pause(0.001)
    plt.show()
    return reward_across_models
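# A minimal usage sketch for the two plotting helpers above. The model paths, expert
# policy path, and seed list are hypothetical placeholders; the LocalGlobal feature
# extractor and its arguments mirror the training script earlier in this section and
# are only appropriate if the saved policies were trained with the same extractor.
if __name__ == '__main__':
    feat = LocalGlobal(window_size=3, grid_size=10,
                       agent_width=10, obs_width=10, step_size=10)
    generate_agent_grid_visitation_map(['./saved-models/0.pt', './saved-models/1.pt'],
                                       feature_extractor=feat,
                                       store=False)
    plot_reward_across_policy_models('./saved-models/',
                                     expert='./experts/expert_policy.pt',
                                     feature_extractor=feat,
                                     seed_list=[1, 2, 3],
                                     iterations_per_model=50)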