def load_policies(self, folder) -> None:
    """
    :param folder: name of the folder containing the policies
    :return: nothing (the policies of the folder are evaluated and stored
             into self.env_dict and self.score_dict)
    """
    for policy_file in os.listdir(folder):
        print(policy_file)
        pw = PolicyWrapper(GenericNet(), "", "", "", 0)
        policy = pw.load(folder + policy_file)
        env = make_env(pw.env_name, pw.policy_type, pw.max_steps)
        env.set_reward_flag(False)
        env.set_duration_flag(False)
        scores = evaluate_pol(env, policy, False)
        # score_dict maps env_name -> {mean score: [team name, score std]}
        if pw.env_name in self.env_dict:
            self.score_dict[pw.env_name][scores.mean()] = [pw.team_name, scores.std()]
        else:
            self.env_dict[pw.env_name] = env
            self.score_dict[pw.env_name] = {scores.mean(): [pw.team_name, scores.std()]}
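# A minimal usage sketch of the structures built above ("Evaluator" is a
# hypothetical name for the class holding load_policies). score_dict maps
# env_name -> {mean score: [team name, std]}, so the best entry per
# environment can be read off the maximal key:
#
# evaluator = Evaluator()
# evaluator.load_policies("data/policies/")
# for env_name, by_mean in evaluator.score_dict.items():
#     best_mean = max(by_mean)
#     team_name, std = by_mean[best_mean]
#     print("{}: best team {} with {:.2f} +/- {:.2f}".format(env_name, team_name, best_mean, std))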
def main(params) -> None:
    env = make_env(params.env_name, params.policy_type, params.max_episode_steps,
                   params.env_obs_space_name)
    env.set_file_name("{}_{}".format(params.gradients[0], params.env_name))
    simulation = Simulation(env, params.nb_trajs, params.update_threshold,
                            params.nb_updates, params.batch_size, params.print_interval)
    simulation.rescale_reward = lambda reward: reward / 10
    policy_loss_file, critic_loss_file = set_files(params.gradients[0], params.env_name)
    chrono = Chrono()
    for j in range(params.nb_repet):
        env.reinit()
        memory = ReplayBuffer()
        # Initialise the policy/actor
        policy = PolicyNet(params.lr_actor, params.init_alpha, params.lr_alpha,
                           params.target_entropy_alpha)
        pw = PolicyWrapper(policy, params.policy_type, params.env_name,
                           params.team_name, params.max_episode_steps)
        pw.duration_flag = False
        # Initialise the critics
        critic = DoubleQNet(params.lr_critic, params.gamma, params.tau)
        plot_policy(policy, env, True, params.env_name, params.study_name, '_ante_', j, plot=False)
        simulation.train(memory, pw, critic, policy_loss_file, critic_loss_file)
        plot_policy(policy, env, True, params.env_name, params.study_name, '_post_', j, plot=False)
        plot_critic(env, params.env_name, critic.q1, policy, params.study_name, '_q1_post_', j)
        plot_critic(env, params.env_name, critic.q2, policy, params.study_name, '_q2_post_', j)
        # Save each critic head under a distinct name so q2 does not overwrite q1
        critic.q1.save_model('data/critics/{}#{}#SAC_q1_{}.pt'.format(params.env_name, params.team_name, str(j)))
        critic.q2.save_model('data/critics/{}#{}#SAC_q2_{}.pt'.format(params.env_name, params.team_name, str(j)))
    simulation.env.close()
    chrono.stop()
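# Hedged reload sketch for the critics saved above: this assumes save_model()
# wraps torch.save on the whole module (if it only stores a state_dict, build
# the network first and use load_state_dict instead). The file name below is
# illustrative, mirroring the per-head naming used when saving:
#
# import torch
# q1 = torch.load('data/critics/CartPole-v0#team#SAC_q1_0.pt')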
def plot_critic_from_name(folder, file_name, policy) -> None:
    """
    Plot a critic from a file present in the given directory.
    A policy is given so that the Q(s, a) critic can be plotted, using this
    policy to provide the action a
    :param folder: the given directory
    :param file_name: the name of the file
    :param policy: the given policy
    :return: nothing
    """
    complete_name = folder + file_name
    pw = PolicyWrapper(GenericNet(), "", "")
    critic = pw.load(complete_name)
    env_name = pw.env_name
    env, discrete = make_env(env_name, ["x", "y"])
    obs_size = env.observation_space.shape[0]
    picture_name = file_name + '_portrait.pdf'
    if not discrete:
        if obs_size == 1:
            plot_qfunction_1D(critic, env, plot=False, save_figure=True,
                              figname=picture_name, foldername='/critics/')
        else:
            plot_qfunction_ND(critic, policy, env, plot=False, save_figure=True,
                              figname=picture_name, foldername='/critics/')
    else:
        if obs_size == 2:
            plot_vfunction_2D(critic, env, plot=False, save_figure=True,
                              figname=picture_name, foldername='/critics/')
        else:
            plot_vfunction_ND(critic, env, plot=False, save_figure=True,
                              figname=picture_name, foldername='/critics/')
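# Illustrative invocation (the file name is hypothetical; the policy argument
# must be compatible with plot_qfunction_ND):
#
# plot_critic_from_name('data/critics/', 'CartPole-v0#team#SAC_q1_0.pt', policy)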
def make_simu_from_params(params):
    """
    Creates the environment, adding the required wrappers
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :return: a simulation object
    """
    env_name = params.env_name
    env = make_env(env_name, params.policy_type, params.max_episode_steps,
                   params.reward_shift, params.env_obs_space_name)
    return Simu(env, env_name)
def make_simu_from_wrapper(pw, params):
    """
    Creates the environment, adding the required wrappers
    Used when loading an agent from an external file, through a policy wrapper
    :param pw: the policy wrapper specifying the environment
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :return: a simulation object
    """
    env_name = pw.env_name
    params.env_name = env_name
    env = make_env(env_name, params.policy_type, params.max_episode_steps,
                   params.reward_shift, params.env_obs_space_name)
    return Simu(env, env_name)
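# Illustrative entry points for the two constructors above (a sketch assuming
# get_args() from arguments.py, as mentioned in the docstrings):
#
# args = get_args()
# simu = make_simu_from_params(args)       # fresh run built from hyper-parameters
#
# pw = PolicyWrapper(GenericNet(), "", "", "", 0)
# pw.load('Models/saved_policy.pt')        # hypothetical file name
# simu = make_simu_from_wrapper(pw, args)  # run rebuilt around a saved policy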
    angles3D = [20, 45, 50, 65]  # angles at which to save the plot3D
    elevs = [0, 30, 60]
    newVignette.saveAll(filename, saveInFile=args.saveInFile,
                        save2D=args.save2D, save3D=args.save3D,
                        directoryFile=args.directoryFile,
                        directory2D=args.directory2D,
                        directory3D=args.directory3D,
                        computedImg=computedImg, angles3D=angles3D, elevs=elevs)
    env.close()


if __name__ == '__main__':
    args = get_args()
    create_data_folders()
    directory = os.getcwd() + '/Models/'
    policies, colors, policy_name, env_name, max_episode_steps = load_policies(directory)
    args.env_name = env_name
    args.policy_type = policy_name
    args.max_episode_steps = int(max_episode_steps)
    print(args)
    env = make_env(args.env_name, args.policy_type, args.max_episode_steps,
                   args.env_obs_space_name)
    compute_vignette(args, env, policies, colors)
""" policy = SquashedGaussianPolicy(env.observation_space.shape[0], 24, 36, 1, params.lr_actor) policy.set_weights(weights) state = env.reset() env.render(mode='rgb_array') for i in range(1000): action = policy.select_action(state, deterministic=True) print(action) next_state, reward, done, _ = env.step(action) env.render(mode='rgb_array') state = next_state print('finished rendering') # print("team: ", policy.team_name, "mean: ", scores.mean(), "std:", scores.std()) if __name__ == '__main__': args = get_args() print(args) pw = PolicyWrapper(GenericNet(), 0, "", "", "", 0) env = make_env(args.env_name, args.policy_type, args.max_episode_steps) env = gym.wrappers.Monitor(env, './videos/PG_fin') directory = os.getcwd() + '/Models/' weights_vecs = load_policies(directory) for weights_vec in weights_vecs: render_pol(args, env, weights_vec) env.close()
def main():
    # Output directory
    # output_dir = Path('/content/drive/My Drive/atari-pong-reinforcement-learning/output')
    output_dir = Path("../output")
    output_dir.mkdir(parents=True, exist_ok=True)

    # Set up logging
    logfile_path = Path(output_dir / "output.log")
    setup_logging(logfile=logfile_path)

    # Read the config file
    # config_file = Path('/content/drive/My Drive/atari-pong-reinforcement-learning/config.ini')
    config_file = Path("../config.ini")
    reading_config(config_file)

    # Environment
    env_name = Config.get("env_name")
    env = make_env(env_name)

    # Configs
    batch_size = Config.get("training_batch_size")
    episodes = Config.get("episodes")
    gamma = Config.get("gamma")
    learning_rate = Config.get("learning_rate")
    epsilon_start = Config.get("epsilon_start")
    epsilon_end = Config.get("epsilon_end")
    epsilon_decay = Config.get("epsilon_decay")
    feature_extraction = Config.get("feature_extraction")
    n_actions = env.action_space.n
    device = Config.get("device")
    target_update = Config.get("target_update")

    # Policy network
    # policy_network = Resnet18(n_actions, feature_extraction).to(device)
    policy_network = Alexnet(n_actions, feature_extraction).to(device)

    # Target network
    # target_network = Resnet18(n_actions, feature_extraction).to(device)
    target_network = Alexnet(n_actions, feature_extraction).to(device)

    # Initialize the target network's weights from the policy network
    target_network.load_state_dict(policy_network.state_dict())
    # Put the target network in inference mode; it is only updated by copying weights
    target_network.eval()

    # Optimizer
    optimizer = adam_optimizer(policy_network, learning_rate)

    # Loss function
    criterion = l1_loss

    # Experience replay
    # Experience = namedtuple('Experience', ('state', 'action', 'reward', 'next_state'))
    memory_size = Config.get("memory_size")
    memory = ReplayMemory(memory_size)

    # Load the checkpoint, if one exists
    checkpoint_file = Path(output_dir / Config.get("checkpoint_file"))
    checkpoint_pong = load_checkpoint(checkpoint_file)
    start_episode = 1
    if checkpoint_pong is not None:
        start_episode = checkpoint_pong['episode'] + 1
        policy_network.load_state_dict(checkpoint_pong['policy_net'])
        optimizer.load_state_dict(checkpoint_pong['optimizer'])
        del checkpoint_pong

    # Agent
    agent = Agent(policy_network, n_actions)

    # Model
    model = Pong(env, policy_network, target_network, agent, optimizer,
                 criterion, memory, output_dir)

    # Training (commented out; only evaluation runs below)
    # model.train(episodes, target_update, start_episode, batch_size,
    #             epsilon_start, epsilon_end, epsilon_decay, gamma)
    model.evaluate()
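# A common exponential schedule consistent with the epsilon_start / epsilon_end
# / epsilon_decay values read from the config above. This is an illustrative
# sketch, not necessarily the schedule implemented inside Pong.train():
import math

def epsilon_by_step(step, eps_start, eps_end, eps_decay):
    """Anneal epsilon from eps_start toward eps_end with time constant eps_decay."""
    return eps_end + (eps_start - eps_end) * math.exp(-step / eps_decay)

# e.g. epsilon_by_step(0, 1.0, 0.02, 30000) -> 1.0, decaying toward 0.02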
import time
from collections import deque

import numpy as np

import agent
import environment

name = 'PongDeterministic-v4'
# NB: this rebinding shadows the agent module with an Agent instance
agent = agent.Agent(actions=[0, 2, 3], starting_mem_len=50000, max_mem_len=750000,
                    starting_epsilon=1, learn_rate=.00025)
env = environment.make_env(name, agent)

scores = []
max_scores = []
steps = []
max_score = -2

env.reset()
for i in range(10):
    timesteps = agent.total_timesteps
    cur_time = time.time()
    score = environment.play_episode(name, env, agent, debug=False)
    scores.append(score)
    steps.append(agent.total_timesteps - timesteps)
    if score > max_score:
        max_score = score
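# Sketch of a rolling-average readout using the deque imported above
# (illustrative; the original reporting code is truncated at this point):
#
# recent_scores = deque(maxlen=100)
# recent_scores.append(score)
# print('rolling mean over the last 100 episodes:', np.mean(recent_scores))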