Example #1
def load_policies(self, folder) -> None:
    """
    Load all the policies stored in a folder, evaluate them and record their scores
    :param folder: name of the folder containing the policies
    :return: nothing (the environments and the scores are stored in self.env_dict and self.score_dict)
    """
    listdir = os.listdir(folder)
    for policy_file in listdir:
        print(policy_file)
        pw = PolicyWrapper(GenericNet(), "", "", "", 0)
        policy = pw.load(folder + policy_file)
        if pw.env_name in self.env_dict:
            env = make_env(pw.env_name, pw.policy_type, pw.max_steps)
            env.set_reward_flag(False)
            env.set_duration_flag(False)
            scores = evaluate_pol(env, policy, False)
            self.score_dict[pw.env_name][scores.mean()] = [
                pw.team_name, scores.std()
            ]
        else:
            # First policy seen for this environment: register the environment as well
            env = make_env(pw.env_name, pw.policy_type, pw.max_steps)
            env.set_reward_flag(False)
            env.set_duration_flag(False)
            self.env_dict[pw.env_name] = env
            scores = evaluate_pol(env, policy, False)
            tmp_score_dict = {scores.mean(): [pw.team_name, scores.std()]}
            self.score_dict[pw.env_name] = tmp_score_dict

def main(params) -> None:
    env = make_env(params.env_name, params.policy_type, params.max_episode_steps, params.env_obs_space_name)
    env.set_file_name("{}_{}".format(params.gradients[0], params.env_name))

    simulation = Simulation(env, params.nb_trajs, params.update_threshold, params.nb_updates, params.batch_size,
                            params.print_interval)
    simulation.rescale_reward = lambda reward: reward / 10

    policy_loss_file, critic_loss_file = set_files(params.gradients[0], params.env_name)

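    # Timer used to measure the duration of the whole run (stopped at the end)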
    chrono = Chrono()

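    # One independent run per repetition: the environment is reinitialized and a fresh replay buffer, policy and critic are created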
    for j in range(params.nb_repet):
        env.reinit()
        memory = ReplayBuffer()

        # Initialise the policy/actor
        policy = PolicyNet(params.lr_actor, params.init_alpha, params.lr_alpha, params.target_entropy_alpha)
        pw = PolicyWrapper(policy, params.policy_type, params.env_name, params.team_name, params.max_episode_steps)
        pw.duration_flag = False
        # Initialise the critics
        critic = DoubleQNet(params.lr_critic, params.gamma, params.tau)

        plot_policy(policy, env, True, params.env_name, params.study_name, '_ante_', j, plot=False)

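        # Run the training loop; actor and critic losses are reported via the loss files set up above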
        simulation.train(memory, pw, critic, policy_loss_file, critic_loss_file)

        plot_policy(policy, env, True, params.env_name, params.study_name, '_post_', j, plot=False)
        plot_critic(env, params.env_name, critic.q1, policy, params.study_name, '_q1_post_', j)
        plot_critic(env, params.env_name, critic.q2, policy, params.study_name, '_q2_post_', j)
        # Save each critic under a distinct name, otherwise q2 would overwrite q1
        critic.q1.save_model('data/critics/{}#{}#SAC_q1_{}.pt'.format(params.env_name, params.team_name, str(j)))
        critic.q2.save_model('data/critics/{}#{}#SAC_q2_{}.pt'.format(params.env_name, params.team_name, str(j)))

    simulation.env.close()
    chrono.stop()
Example #3
def plot_critic_from_name(folder, file_name, policy) -> None:
    """
    Plot a critic from a file present in the given directory
    A policy is given so that a Q(s,a) critic can be plotted, using this policy to provide the action a
    :param folder: the given directory
    :param file_name: the name of the file
    :param policy: the given policy
    :return: nothing
    """
    complete_name = folder + file_name
    pw = PolicyWrapper(GenericNet(), "", "")
    critic = pw.load(complete_name)
    env_name = pw.env_name
    env, discrete = make_env(env_name, ["x", "y"])
    obs_size = env.observation_space.shape[0]
    picture_name = file_name + '_portrait.pdf'
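    # Pick the plotting routine depending on whether the environment is discrete and on the observation size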
    if not discrete:
        if obs_size == 1:
            plot_qfunction_1D(critic, env, plot=False, save_figure=True, figname=picture_name, foldername='/critics/')
        else:
            plot_qfunction_ND(critic, policy, env, plot=False, save_figure=True, figname=picture_name, foldername='/critics/')
    else:
        if obs_size == 2:
            plot_vfunction_2D(critic, env, plot=False, save_figure=True, figname=picture_name, foldername='/critics/')
        else:
            plot_vfunction_ND(critic, env, plot=False, save_figure=True, figname=picture_name, foldername='/critics/')
def make_simu_from_params(params):
    """
    Creates the environment, adding the required wrappers
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :return: a simulation object
    """
    env_name = params.env_name
    env = make_env(env_name, params.policy_type, params.max_episode_steps, params.reward_shift, params.env_obs_space_name)
    return Simu(env, env_name)
def make_simu_from_wrapper(pw, params):
    """
    Creates the environment, adding the required wrappers
    Used when loading an agent from an external file, through a policy wrapper
    :param pw: the policy wrapper specifying the environment
    :param params: the hyper-parameters of the run, specified in arguments.py or in the command line
    :return: a simulation object
    """
    env_name = pw.env_name
    params.env_name = env_name
    env = make_env(env_name, params.policy_type, params.max_episode_steps, params.reward_shift, params.env_obs_space_name)
    return Simu(env, env_name)
Example #6
    angles3D = [20, 45, 50, 65]  # angles at which to save the 3D plot
    elevs = [0, 30, 60]  # elevation angles for the 3D views
    newVignette.saveAll(filename,
                        saveInFile=args.saveInFile,
                        save2D=args.save2D,
                        save3D=args.save3D,
                        directoryFile=args.directoryFile,
                        directory2D=args.directory2D,
                        directory3D=args.directory3D,
                        computedImg=computedImg,
                        angles3D=angles3D,
                        elevs=elevs)

    env.close()


if __name__ == '__main__':

    args = get_args()
    create_data_folders()
    directory = os.getcwd() + '/Models/'
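    # Load every policy saved in the Models folder, together with the colors and environment metadata returned alongside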
    policies, colors, policy_name, env_name, max_episode_steps = load_policies(
        directory)
    args.env_name = env_name
    args.policy_type = policy_name
    args.max_episode_steps = int(max_episode_steps)
    print(args)
    env = make_env(args.env_name, args.policy_type, args.max_episode_steps,
                   args.env_obs_space_name)
    compute_vignette(args, env, policies, colors)
    """
    policy = SquashedGaussianPolicy(env.observation_space.shape[0], 24, 36, 1,
                                    params.lr_actor)
    policy.set_weights(weights)
    state = env.reset()
    env.render(mode='rgb_array')
    for i in range(1000):
        action = policy.select_action(state, deterministic=True)
        print(action)
        next_state, reward, done, _ = env.step(action)
        env.render(mode='rgb_array')
        state = next_state
    print('finished rendering')
    # print("team: ", policy.team_name, "mean: ", scores.mean(), "std:", scores.std())


if __name__ == '__main__':
    args = get_args()
    print(args)

    pw = PolicyWrapper(GenericNet(), 0, "", "", "", 0)

    env = make_env(args.env_name, args.policy_type, args.max_episode_steps)
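    # wrap the environment to record videos of the rendered episodes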
    env = gym.wrappers.Monitor(env, './videos/PG_fin')

    directory = os.getcwd() + '/Models/'
    weights_vecs = load_policies(directory)
    for weights_vec in weights_vecs:
        render_pol(args, env, weights_vec)
    env.close()
def main():

    #output directory
    #output_dir = Path('/content/drive/My Drive/atari-pong-reinforcement-learning/output')
    output_dir = Path("../output")
    output_dir.mkdir(parents=True, exist_ok=True)

    #setup logging
    logfile_path = Path(output_dir / "output.log")
    setup_logging(logfile=logfile_path)

    #read config file
    #config_file = Path('/content/drive/My Drive/atari-pong-reinforcement-learning/config.ini')
    config_file = Path("../config.ini")
    reading_config(config_file)

    #environment
    env_name = Config.get("env_name")
    env = make_env(env_name)

    #configs
    batch_size = Config.get("training_batch_size")
    episodes = Config.get("episodes")
    gamma = Config.get("gamma")
    learning_rate = Config.get("learning_rate")
    epsilon_start = Config.get("epsilon_start")
    epsilon_end = Config.get("epsilon_end")
    epsilon_decay = Config.get("epsilon_decay")
    feature_extraction = Config.get("feature_extraction")
    n_actions = env.action_space.n
    device = Config.get("device")
    target_update = Config.get("target_update")

    #policy network
    #policy_network = Resnet18(n_actions, feature_extraction).to(device)
    policy_network = Alexnet(n_actions, feature_extraction).to(device)
    #target network
    # target_network = Resnet18(n_actions, feature_extraction).to(device)
    target_network = Alexnet(n_actions, feature_extraction).to(device)
    #initializing the weights of target network
    target_network.load_state_dict(policy_network.state_dict())
    #putting the target network in eval mode; its weights are only
    #updated by copying them from the policy network
    target_network.eval()

    #optimizer
    optimizer = adam_optimizer(policy_network, learning_rate)

    #loss function
    criterion = l1_loss

    #experience
    #Experience = namedtuple('Experience',('state', 'action', 'reward', 'next_state'))
    memory_size = Config.get("memory_size")
    memory = ReplayMemory(memory_size)

    #loading the checkpoint
    checkpoint_file = Path(output_dir / Config.get("checkpoint_file"))
    checkpoint_pong = load_checkpoint(checkpoint_file)
    start_episode = 1
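    #resume the episode counter, policy weights and optimizer state if a checkpoint was found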
    if checkpoint_pong is not None:
        start_episode = checkpoint_pong['episode'] + 1
        policy_network.load_state_dict(checkpoint_pong['policy_net'])
        optimizer.load_state_dict(checkpoint_pong['optimizer'])
    del checkpoint_pong

    #agent
    agent = Agent(policy_network, n_actions)

    #model
    model = Pong(env, policy_network, target_network, agent, optimizer,
                 criterion, memory, output_dir)

    #training (commented out; only evaluation is run below)
    #model.train(episodes, target_update, start_episode, batch_size, epsilon_start, epsilon_end, epsilon_decay, gamma)
    model.evalutate()
Example #9
import agent
import environment
import time
from collections import deque
import numpy as np

name = 'PongDeterministic-v4'
agent = agent.Agent(actions=[0, 2, 3],
                    starting_mem_len=50000,
                    max_mem_len=750000,
                    starting_epsilon=1,
                    learn_rate=.00025)
env = environment.make_env(name, agent)
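# bookkeeping for episode scores, per-episode step counts and the best score seen so far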
scores = []
max_scores = []
steps = []
max_score = -2
env.reset()

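# play a fixed number of episodes, recording the score and the number of steps of each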
for i in range(10):
    timesteps = agent.total_timesteps

    cur_time = time.time()
    score = environment.play_episode(name, env, agent, debug=False)

    scores.append(score)
    steps.append(agent.total_timesteps - timesteps)

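    # keep track of the best score seen so far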
    if score > max_score:
        max_score = score