Example #1
    def __init__(self, n_action, init_epsilon, final_epsilon, gamma,
                 buffer_size, batch_size, replace_iter, annealing,
                 learning_rate, ctx):
        self.n_action = n_action
        self.epsilon = init_epsilon
        self.init_epsilon = init_epsilon
        self.final_epsilon = final_epsilon
        # discount factor
        self.gamma = gamma
        # memory buffer size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        # sync the target network's parameters with the main
        # network every replace_iter time steps
        self.replace_iter = replace_iter
        # number of steps over which epsilon is linearly annealed
        # to its final value
        self.annealing = annealing
        self.learning_rate = learning_rate
        self.ctx = ctx

        self.total_steps = 0
        self.replay_buffer = MemoryBuffer(self.buffer_size, ctx)  # use deque

        # build the network
        self.target_network = DoubleQNetwork(n_action)
        self.main_network = DoubleQNetwork(n_action)
        self.target_network.collect_params().initialize(
            init.Xavier(), ctx=ctx)  # initialize the params
        self.main_network.collect_params().initialize(init.Xavier(), ctx=ctx)

        # optimize the main network
        self.optimizer = gluon.Trainer(self.main_network.collect_params(),
                                       'adam',
                                       {'learning_rate': self.learning_rate})
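
The snippet assumes "from mxnet import gluon, init" and the MemoryBuffer and DoubleQNetwork helpers defined elsewhere in the module. The constructor only stores annealing; a minimal sketch of the linear epsilon schedule the comment describes (the method name update_epsilon is an assumption, not shown in the source):

    def update_epsilon(self):
        # Linearly anneal epsilon from init_epsilon to final_epsilon
        # over `annealing` steps, then hold it at final_epsilon.
        fraction = min(self.total_steps / float(self.annealing), 1.0)
        self.epsilon = self.init_epsilon + fraction * (
            self.final_epsilon - self.init_epsilon)
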
Example #2
    def __init__(self, action_dim, action_bound, actor_learning_rate,
                 critic_learning_rate, batch_size, memory_size, gamma, tau,
                 explore_steps, policy_update, policy_noise, explore_noise,
                 noise_clip, ctx):
        self.action_dim = action_dim
        self.action_bound = nd.array(action_bound, ctx=ctx)

        self.actor_learning_rate = actor_learning_rate
        self.critic_learning_rate = critic_learning_rate
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.gamma = gamma
        self.tau = tau
        self.explore_steps = explore_steps
        self.policy_update = policy_update
        self.policy_noise = policy_noise
        self.explore_noise = explore_noise
        self.noise_clip = noise_clip
        self.ctx = ctx

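        # main/target actor plus twin critics; the twin critics and the
        # policy_update/policy_noise arguments match the TD3 setup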
        self.main_actor_network = Actor(action_dim, self.action_bound)
        self.target_actor_network = Actor(action_dim, self.action_bound)
        self.main_critic_network1 = Critic()
        self.target_critic_network1 = Critic()
        self.main_critic_network2 = Critic()
        self.target_critic_network2 = Critic()

        self.main_actor_network.collect_params().initialize(init=init.Xavier(),
                                                            ctx=ctx)
        self.target_actor_network.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_critic_network1.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network1.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.main_critic_network2.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)
        self.target_critic_network2.collect_params().initialize(
            init=init.Xavier(), ctx=ctx)

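        # only the main networks get optimizers; the target networks are
        # presumably tracked via soft updates using tau (not shown here)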
        self.actor_optimizer = gluon.Trainer(
            self.main_actor_network.collect_params(), 'adam',
            {'learning_rate': self.actor_learning_rate})
        self.critic1_optimizer = gluon.Trainer(
            self.main_critic_network1.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})
        self.critic2_optimizer = gluon.Trainer(
            self.main_critic_network2.collect_params(), 'adam',
            {'learning_rate': self.critic_learning_rate})

        self.total_steps = 0
        self.total_train_steps = 0

        self.memory_buffer = MemoryBuffer(buffer_size=self.memory_size,
                                          ctx=ctx)
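
This snippet assumes "from mxnet import gluon, init, nd" plus the Actor, Critic, and MemoryBuffer helpers. tau is stored but never used in the constructor; a minimal sketch of the Polyak soft update that agents of this kind apply to each target network (the method name and the pairwise parameter iteration are assumptions):

    def soft_update(self, target_network, main_network):
        # theta_target <- tau * theta_main + (1 - tau) * theta_target
        for main_param, target_param in zip(
                main_network.collect_params().values(),
                target_network.collect_params().values()):
            target_param.set_data(
                self.tau * main_param.data() +
                (1.0 - self.tau) * target_param.data())
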
Example #3
    def __init__(self, act_dim, env_dim, act_range, buffer_size=20000,
                 gamma=0.99, lr=0.00005, tau=0.001):
        """Initialization"""
        # Environment and actor-critic hyper-parameters
        self.act_dim = act_dim
        self.act_range = act_range
        self.env_dim = env_dim
        self.gamma = gamma
        self.lr = lr
        # Create the actor and critic networks
        # (the actor uses a smaller learning rate: 0.1 * lr)
        self.actor = Actor(self.env_dim, act_dim, act_range, 0.1 * lr, tau)
        self.critic = Critic(self.env_dim, act_dim, lr, tau)
        self.buffer = MemoryBuffer(buffer_size)
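
Here gamma is the discount factor in the critic's Bellman target. A minimal sketch of that target for a single transition (the method name and the scalar signature are assumptions; the source only shows the constructor):

    def bellman_target(self, reward, q_next, done):
        # y = r + gamma * Q'(s', mu'(s')) for a non-terminal transition;
        # y = r when the episode ended at this step.
        return reward + self.gamma * q_next * (1.0 - float(done))
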
Example #4
    def __init__(
            self,
            capacity_per_level=500000,
            warmup_steps=100000,
            n_frames=4,
            n_atoms=51,
            v_min=-1,
            v_max=0,
            gamma=.99,
            device='cuda',
            batch_size=48,
            lr=0.0000625 * 2,
            lr_decay=0.99,
            update_target_net_every=25000,
            train_every=6,
            frame_skip=4,
            disable_noisy_after=2000000,
            super_hexagon_path='C:\\Program Files (x86)\\Steam\\steamapps\\common\\Super Hexagon\\superhexagon.exe',
            run_afap=True):

        # training objects
        self.memory_buffer = MemoryBuffer(
            capacity_per_level,
            SuperHexagonInterface.n_levels,
            n_frames,
            SuperHexagonInterface.frame_size,
            SuperHexagonInterface.frame_size_cropped,
            gamma,
            device=device)
        self.net = Network(n_frames, SuperHexagonInterface.n_actions,
                           n_atoms).to(device)
        self.target_net = Network(n_frames, SuperHexagonInterface.n_actions,
                                  n_atoms).to(device)
        self.target_net.load_state_dict(self.net.state_dict())
        self.optimizer = torch.optim.Adam(self.net.parameters(),
                                          lr=lr,
                                          eps=1.5e-4)
        self.lr_scheduler = torch.optim.lr_scheduler.LambdaLR(
            self.optimizer, ExpLrDecay(lr_decay, min_factor=.1))

        # parameters
        self.batch_size = batch_size
        self.update_target_net_every = update_target_net_every
        self.train_every = train_every
        self.frame_skip = frame_skip
        self.disable_noisy_after = disable_noisy_after
        self.warmup_steps = warmup_steps
        self.gamma = gamma
        self.device = device

        # parameters for the distributional (categorical/C51) value head
        self.n_atoms = n_atoms
        self.v_min = v_min
        self.v_max = v_max
        self.delta_z = (v_max - v_min) / (n_atoms - 1)
        self.support = torch.linspace(v_min,
                                      v_max,
                                      n_atoms,
                                      dtype=torch.float,
                                      device=device)
        self.offset = torch.arange(0,
                                   batch_size * n_atoms,
                                   n_atoms,
                                   device=device).view(-1, 1)
        self.m = torch.empty((batch_size, n_atoms), device=device)

        # debug and logging stuff
        self.list_steps_alive = [
            [] for _ in range(SuperHexagonInterface.n_levels)]
        self.longest_run = [(0, 0)] * SuperHexagonInterface.n_levels
        self.total_simulated_steps = [0] * SuperHexagonInterface.n_levels
        self.losses = []
        self.kls = []
        self.times = []
        self.iteration = 0

        self.super_hexagon_path = super_hexagon_path
        self.run_afap = run_afap
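
support, offset, and m are the usual scaffolding for the categorical (C51-style) projection step: offset flattens (batch, atom) index pairs and m accumulates the projected probabilities. A sketch of that projection using these exact buffers (the method name is an assumption, as are the tensor shapes: rewards and dones of shape (batch_size,), next_dist of shape (batch_size, n_atoms)):

    def project_distribution(self, rewards, dones, next_dist):
        # Bellman-update every support atom: Tz = r + gamma * z
        # (zero bootstrap on terminal steps), clamped to [v_min, v_max].
        tz = rewards.unsqueeze(1) \
            + (1.0 - dones.float()).unsqueeze(1) * self.gamma * self.support
        tz = tz.clamp(self.v_min, self.v_max)
        # Fractional position of each updated atom on the fixed support.
        b = (tz - self.v_min) / self.delta_z
        l, u = b.floor().long(), b.ceil().long()
        # Keep mass from vanishing when b lands exactly on a support point.
        l[(u > 0) & (l == u)] -= 1
        u[(l < self.n_atoms - 1) & (l == u)] += 1
        # Split each atom's probability between its two neighbors;
        # offset turns per-row indices into flat indices for index_add_.
        self.m.zero_()
        self.m.view(-1).index_add_(
            0, (l + self.offset).view(-1),
            (next_dist * (u.float() - b)).view(-1))
        self.m.view(-1).index_add_(
            0, (u + self.offset).view(-1),
            (next_dist * (b - l.float())).view(-1))
        return self.m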