def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
             algo=None, env_name=None, chkpt_dir='tmp/dqn'):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.algo = algo
    self.env_name = env_name
    self.chkpt_dir = chkpt_dir
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)

    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=f'{self.env_name}_{self.algo}_q_eval',
                               chkpt_dir=self.chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=f'{self.env_name}_{self.algo}_q_next',
                               chkpt_dir=self.chkpt_dir)
def __init__(self, state_size, action_size, behavior_name, index_player,
             replay_memory_size=1e4, batch_size=128, gamma=0.99,
             learning_rate=1e-3, target_tau=1e-3, update_rate=4, seed=0):
    self.state_size = state_size
    self.current_state = []
    self.action_size = action_size
    self.buffer_size = int(replay_memory_size)
    self.batch_size = batch_size
    self.gamma = gamma
    self.learn_rate = learning_rate
    self.tau = target_tau
    self.update_rate = update_rate
    self.seed = random.seed(seed)
    self.behavior_name = behavior_name
    self.index_player = index_player
    self.close_ball_reward = 0
    self.touch_ball_reward = 0

    """
    Now we define two models:
    (a) one network that is updated every (step % update_rate == 0),
    (b) a target network, with weights updated toward network (a)
        at a slower (target_tau) rate.
    """
    self.network = QNetwork(state_size, action_size, seed).to(device)
    self.target_network = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
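# The comment above describes a soft target update but the snippet doesn't
# show one; below is a minimal sketch of that blend, assuming PyTorch modules.
# The helper name `soft_update` is illustrative, not taken from this codebase:
#     theta_target <- tau * theta_online + (1 - tau) * theta_target
def soft_update(online_net, target_net, tau):
    for target_param, online_param in zip(target_net.parameters(),
                                          online_net.parameters()):
        target_param.data.copy_(tau * online_param.data
                                + (1.0 - tau) * target_param.data)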
def __init__(self):
    self.pred_net_Q1, self.target_net_Q1 = ConvNet(), ConvNet()
    self.pred_net_Q2, self.target_net_Q2 = ConvNet(), ConvNet()
    # sync eval and target networks
    self.target_deque1 = deque(maxlen=n)
    self.target_deque2 = deque(maxlen=n)
    self.update_target(self.target_net_Q1, self.pred_net_Q1, 1.0)
    self.update_target(self.target_net_Q2, self.pred_net_Q2, 1.0)
    self.target_deque1.append(self.target_net_Q1)

    # use gpu
    if USE_GPU:
        self.pred_net_Q1.cuda()
        self.target_net_Q1.cuda()
        self.pred_net_Q2.cuda()
        self.target_net_Q2.cuda()

    # simulator step counter
    self.memory_counter = 0
    # target network step counter
    self.learn_step_counter = 0
    # loss function
    self.loss_function = nn.MSELoss()
    # create the replay buffer
    self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
    # define optimizers, one per prediction network
    self.optimizer = torch.optim.Adam(self.pred_net_Q1.parameters(), lr=LR)
    self.optimizer1 = torch.optim.Adam(self.pred_net_Q2.parameters(), lr=LR)
def __init__(self, state_size, action_size, seed, device=device, epsilon=0.3):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.device = device
    self.epsilon = epsilon
    self.t_step = 0  # counter for activating learning every few steps
    self.running_c_loss = 0
    self.running_a_loss = 0
    self.training_cnt = 0

    # Actor network (w/ target network)
    self.actor_local = DDPGActor(state_size, action_size, seed).to(device)
    self.actor_target = DDPGActor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network (w/ target network)
    self.critic_local = D4PGCritic(state_size, action_size, seed,
                                   N_ATOMS, Vmin, Vmax).to(device)
    self.critic_target = D4PGCritic(state_size, action_size, seed,
                                    N_ATOMS, Vmin, Vmax).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
def __init__(self, state_size, action_size, seed):
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.t_step = 0  # counter for activating learning every few steps
    self.running_c_loss = 0
    self.running_a_loss = 0
    self.training_cnt = 0

    # Actor network (w/ target network)
    self.actor_local = Actor(state_size, action_size, seed).to(device)
    self.actor_target = Actor(state_size, action_size, seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic network (w/ target network)
    self.critic_local = Critic(state_size, action_size, seed).to(device)
    self.critic_target = Critic(state_size, action_size, seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                       lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, seed)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
def __init__(self, input_dims, n_actions):
    self.epsilon = Config.epsilon
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayBuffer(input_dims, n_actions)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000,
             algo=None, env_name=None, chkpt_dir='tmp/dqn'):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.algo = algo
    self.env_name = env_name
    self.chkpt_dir = chkpt_dir
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0

    # agent's memory
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, chkpt_name, eps_min, eps_dec, replace, logging_dir):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.chkpt_dir = chkpt_name
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = FullyConnectedNet(self.lr, self.n_actions,
                                    input_dims=self.input_dims,
                                    chkpt_name=self.chkpt_dir, name='q_eval',
                                    logging_dir=logging_dir)
    self.q_next = FullyConnectedNet(self.lr, self.n_actions,
                                    input_dims=self.input_dims, name='q_next',
                                    chkpt_name=self.chkpt_dir,
                                    logging_dir=logging_dir)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=0.01, eps_dec=5e-7, replace=1000):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_target_cnt = replace
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims)
    self.q_next = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, action_joint_dim,
             mem_size, batch_size, eps_min, eps_dec, replace,
             prioritized=False, prob_alpha=0.6, beta=0.4, beta_increment=1e-4,
             temperature=0.1, tau=1e-5):
    """
    Double Deep Q-Learning Agent class.
    -----
    Args:
        gamma: Discount factor for reward. 0 indicates myopic behaviour;
            1 indicates far-sighted behaviour.
        epsilon: Exploration/exploitation rate. 0 indicates full exploitation.
        lr: Learning rate. The bigger 'lr', the bigger the step along the
            gradient of the loss.
        n_actions: Number of possible actions.
        input_dims: Dimensions of the state (typically an image). The channel
            goes first: (CHANN, HEIGHT, WIDTH).
        action_joint_dim: Number of joints for the multi-agent case; normally
            the number of agents.
        mem_size: Size of the replay buffer memory.
        batch_size: Number of past experiences used for training the Q-network.
        eps_min: Minimum value for exploration.
        eps_dec: Epsilon decay applied every epoch.
        replace: Number of epochs between replacing the target network with
            the behavioural network.
    ------
    """
    # Training hyperparameters #
    self.gamma = gamma
    self.epsilon = epsilon
    self.beta = beta
    self.beta_increment = beta_increment
    self.prob_alpha = prob_alpha
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.update_target_count = replace
    self.action_space = [i for i in range(n_actions)]
    self.action_joint_dim = action_joint_dim
    self.prioritized = prioritized
    self.temperature = temperature
    self.tau = tau
    self.mem_size = mem_size

    if not self.prioritized:
        self.memory = ReplayBuffer(mem_size, input_dims, action_joint_dim)
    else:
        self.memory = PrioritizedReplayBuffer(mem_size, input_dims,
                                              action_joint_dim, self.prob_alpha)

    # Model and target networks #
    self.q_eval = DeepQNetwork(self.lr, num_agents=action_joint_dim,
                               action_size=n_actions, input_size=input_dims)
    self.q_eval.cuda()
    self.q_next = DeepQNetwork(self.lr, num_agents=action_joint_dim,
                               action_size=n_actions, input_size=input_dims)
    self.q_next.cuda()
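# Hypothetical construction of the double-DQN agent documented above; the
# class name `DDQNAgent` and every argument value here are illustrative
# assumptions, chosen only to show the expected shapes and typical ranges.
agent = DDQNAgent(gamma=0.99, epsilon=1.0, lr=1e-4, n_actions=5,
                  input_dims=(3, 84, 84), action_joint_dim=2,
                  mem_size=50_000, batch_size=32, eps_min=0.01,
                  eps_dec=5e-7, replace=1000, prioritized=True)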
def __init__(self, state_size, action_size, num_agents):
    self.policy = PolicyNetwork(state_size, action_size).to(device)
    self.old_policy = PolicyNetwork(state_size, action_size).to(device)
    self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=LR)
    self.episodes = [Episode() for _ in range(num_agents)]
    self.memory = ReplayBuffer(BUFFER_SIZE)
    self.t_step = 0
def replay_buffer():
    """Build a small ReplayBuffer instance for testing."""
    action_size = 10
    buffer_size = 20
    batch_size = 5
    seed = 42
    device = "cpu"
    rebuf = ReplayBuffer(action_size, buffer_size, batch_size, seed, device)
    return rebuf
def __init__(self, state_size, action_size, dqn_type='DQN',
             replay_memory_size=1e5, batch_size=64, gamma=0.99,
             learning_rate=1e-3, target_tau=2e-3, update_rate=4, seed=0):
    """
    DQN Agent Parameters
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        dqn_type (string): either 'DQN' for vanilla DQN learning (default)
            or 'DDQN' for double DQN
        replay_memory_size (int): size of the replay memory buffer
            (typically 5e4 to 5e6)
        batch_size (int): size of the memory batch used for model updates
            (typically 32, 64 or 128)
        gamma (float): parameter for setting the discounted value of future
            rewards (typically .95 to .995)
        learning_rate (float): specifies the rate of model learning
            (typically 1e-4 to 1e-3)
        seed (int): random seed for initializing training point
    """
    self.dqn_type = dqn_type
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = int(replay_memory_size)
    self.batch_size = batch_size
    self.gamma = gamma
    self.learn_rate = learning_rate
    self.tau = target_tau
    self.update_rate = update_rate
    self.seed = random.seed(seed)

    """
    # DQN Agent Q-Network
    # For DQN training, two neural network models are employed:
    # (a) a network that is updated every (step % update_rate == 0), and
    # (b) a target network, with weights updated to equal the network at a
    #     slower (target_tau) rate.
    # The slower modulation of the target network weights operates to
    # stabilize learning.
    """
    self.network = QNetwork(state_size, action_size, seed).to(device)
    self.target_network = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate,
                                betas=BETAS)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
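# A sketch of how the dqn_type flag above typically changes the bootstrap
# target; the function below is illustrative, not a method of this agent.
# Vanilla DQN: y = r + gamma * max_a' Q_target(s', a')
# Double DQN:  y = r + gamma * Q_target(s', argmax_a' Q_online(s', a'))
import torch

def td_target(rewards, next_states, dones, network, target_network,
              gamma, dqn_type='DQN'):
    with torch.no_grad():
        if dqn_type == 'DDQN':
            # online network picks the action, target network evaluates it
            best_actions = network(next_states).argmax(dim=1, keepdim=True)
            q_next = target_network(next_states).gather(1, best_actions)
        else:
            # target network both picks and evaluates the action
            q_next = target_network(next_states).max(dim=1, keepdim=True)[0]
    return rewards + gamma * q_next * (1 - dones)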
def __init__(self, state_size, action_size, num_agents, double_dqn=True):
    self.action_size = action_size
    self.double_dqn = double_dqn

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size).to(device)
    self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
    self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)

    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE)
    self.num_agents = num_agents
    self.t_step = 0
def __init__(self, state_size, action_size, num_agents, double_dqn=False):
    self.action_size = action_size
    self.double_dqn = double_dqn

    # Q-Network
    self.qnetwork_local = QNetwork(state_size, action_size).to(device)
    self.qnetwork_target = copy.deepcopy(self.qnetwork_local)
    self.optimizer = torch.optim.Adam(self.qnetwork_local.parameters(), lr=LR)
    self.lr_scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                        step_size=4000,
                                                        gamma=0.98,
                                                        last_epoch=-1)

    # Replay memory
    self.memory = ReplayBuffer(BUFFER_SIZE)
    self.num_agents = num_agents
    self.t_step = 0
def __init__(self, lr: float, gamma: float, obs_dims, num_actions: int,
             mem_size, mini_batchsize, epsilon_dec, env_name, algo_name,
             epsilon=1.0, replace=1000, epsilon_min=0.1,
             checkpoint_dir='results\\doubledqn'):
    self.lr = lr
    self.gamma = gamma
    self.obs_dims = obs_dims
    self.num_actions = num_actions
    self.mini_batchsize = mini_batchsize
    self.epsilon_min = epsilon_min
    self.epsilon_dec = epsilon_dec
    self.epsilon = epsilon
    self.replace_target_cnt = replace
    self.mem_counter = 0
    self.copy_counter = 0
    self.checkpoint_dir = checkpoint_dir
    self.memories = ReplayBuffer(mem_size=mem_size, state_shape=self.obs_dims,
                                 num_actions=self.num_actions)
    self.action_space = [i for i in range(self.num_actions)]
    self.learning_network = DeepQNetwork(
        lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
        name=algo_name + '_' + env_name + '_' + 'learning',
        checkpoint_dir=self.checkpoint_dir)
    self.target_network = DeepQNetwork(
        lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
        name=env_name + '_' + algo_name + '_target',
        checkpoint_dir=self.checkpoint_dir)
    self.loss_value = 0
    self.writer = SummaryWriter(os.path.join(self.checkpoint_dir, 'logs'))
def __init__(self, n_actions, input_dims):
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.epsilon = Config.epsilon
    self.action_space = [i for i in range(n_actions)]
    self.learn_step_counter = 0
    self.memory = ReplayBuffer(input_dims, n_actions)

    name_root = Config.env_name + '_' + Config.algo
    self.q_eval = Network(self.n_actions, input_dims=self.input_dims,
                          name=name_root + '_q_eval')
    self.q_next = Network(self.n_actions, input_dims=self.input_dims,
                          name=name_root + '_q_next')
def __init__(self, states, actions, alpha, gamma, epsilon, epsilon_min,
             epsilon_decay, replay_buffer_sz, batch, path, path_pred):
    self.Q = Network(states.shape, actions, alpha, path)
    self.Q_pred = Network(states.shape, actions, alpha, path_pred)
    # self.memory = deque(maxlen=replay_buffer_sz)
    self.memory = ReplayBuffer(replay_buffer_sz, states.shape, actions)
    self.batch = batch
    self.learn_cnt = 0
    self.gamma = gamma
    self.epsilon = epsilon
    self.epsilon_min = epsilon_min
    self.epsilon_decay = epsilon_decay
    self.actions = actions
    self.Q.path = path
    self.Q_pred.path = path_pred
def __init__(self, alpha, beta, input_dims, tau, gamma=0.99, max_action=1.0,
             n_actions=2, max_size=1000000, layer1_size=400,
             layer2_size=300, batch_size=100, reward_scale=2,
             path_dir='model/sac'):
    self.gamma = gamma
    self.tau = tau
    self.memory = ReplayBuffer(max_size, input_dims, n_actions)
    self.batch_size = batch_size
    self.n_actions = n_actions

    self.actor = ActorNetwork(alpha, input_dims, layer1_size, layer2_size,
                              n_actions=n_actions, name='_actor',
                              max_action=max_action, chkpt_dir=path_dir)
    self.critic_1 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_1',
                                  chkpt_dir=path_dir)
    self.critic_2 = CriticNetwork(beta, input_dims, layer1_size, layer2_size,
                                  n_actions=n_actions, name='_critic_2',
                                  chkpt_dir=path_dir)
    self.value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                              name='_value', chkpt_dir=path_dir)
    self.target_value = ValueNetwork(beta, input_dims, layer1_size, layer2_size,
                                     name='_target_value', chkpt_dir=path_dir)

    self.scale = reward_scale
    self.update_network_parameters(tau=1)
def __init__(self, lr: float, gamma: float, obs_dims, num_actions: int,
             mem_size, mini_batchsize, epsilon_dec, env_name, algo_name,
             epsilon=1.0, replace=1000, epsilon_min=0.1,
             checkpoint_dir='temp/dqn/duelingdqn'):
    self.lr = lr
    self.gamma = gamma
    self.obs_dims = obs_dims
    self.num_actions = num_actions
    self.mini_batchsize = mini_batchsize
    self.epsilon_min = epsilon_min
    self.epsilon_dec = epsilon_dec
    self.epsilon = epsilon
    self.mem_counter = 0
    self.copy_counter = 0
    self.replace_target_cnt = replace
    self.checkpoint_dir = checkpoint_dir
    self.memories = ReplayBuffer(mem_size=mem_size, state_shape=self.obs_dims,
                                 num_actions=self.num_actions)
    self.action_space = [i for i in range(self.num_actions)]
    self.learning_network = DuelingQNetwork(
        lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
        name=env_name + '_' + algo_name + '_learning',
        checkpoint_dir=self.checkpoint_dir)
    self.target_network = DuelingQNetwork(
        lr=self.lr, num_actions=self.num_actions, input_dims=self.obs_dims,
        name=env_name + '_' + algo_name + '_target',
        checkpoint_dir=self.checkpoint_dir)
def __init__(self, input_dims, n_actions, lr, mem_size, batch_size, epsilon,
             gamma=0.99, eps_dec=5e-7, eps_min=0.01, replace=1000, algo=None,
             env_name=None, checkpoint_dir='tmp/dqn'):
    self.lr = lr
    self.batch_size = batch_size
    self.input_dims = input_dims
    self.n_actions = n_actions
    self.gamma = gamma
    self.epsilon = epsilon
    self.eps_dec = eps_dec
    self.eps_min = eps_min
    self.replace = replace
    self.algo = algo
    self.env_name = env_name
    self.checkpoint_dir = checkpoint_dir
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + " " + self.algo + "_q_eval",
                               checkpoint_dir=self.checkpoint_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + " " + self.algo + "_q_next",
                               checkpoint_dir=self.checkpoint_dir)
def agent_init(self, agent_config):
    self.replay_buffer = ReplayBuffer(agent_config['replay_buffer_size'],
                                      agent_config['minibatch_sz'],
                                      agent_config.get("seed"))
    self.network = ActionValueNetwork(agent_config['network_config'])
    self.optimizer = Adam(self.network.layer_sizes, agent_config["optimizer_config"])
    self.num_actions = agent_config['network_config']['num_actions']
    self.num_replay = agent_config['num_replay_updates_per_step']
    self.discount = agent_config['gamma']
    self.tau = agent_config['tau']
    self.rand_generator = np.random.RandomState(agent_config.get("seed"))
    self.last_state = None
    self.last_action = None
    self.sum_rewards = 0
    self.episode_steps = 0
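# An illustrative agent_config covering the keys agent_init reads above;
# every value and any inner key besides 'num_actions' is an assumed
# placeholder, not taken from the original project.
agent_config = {
    'replay_buffer_size': 50_000,
    'minibatch_sz': 8,
    'num_replay_updates_per_step': 4,
    'gamma': 0.99,
    'tau': 0.001,
    'seed': 0,
    'network_config': {'num_actions': 4},      # plus whatever ActionValueNetwork needs
    'optimizer_config': {'step_size': 1e-3},   # plus whatever this Adam needs
}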
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, chkpt_dir, eps_min=0.01, eps_dec=5e-7, replace=1000,
             algo=None, env_name=None):
    self.gamma = gamma                 # 0.99
    self.epsilon = epsilon             # 1.0
    self.lr = lr                       # 0.0001
    self.n_actions = n_actions         # 6
    self.input_dims = input_dims       # (4, 84, 84)
    self.batch_size = batch_size       # 32
    self.eps_min = eps_min             # 0.1
    self.eps_dec = eps_dec             # 1e-05
    self.replace_target_cnt = replace  # 1000
    self.algo = algo                   # 'DQNAgent'
    self.env_name = env_name           # 'PongNoFrameskip-v4'
    self.chkpt_dir = chkpt_dir         # .\\models\\
    self.action_space = [i for i in range(self.n_actions)]  # [0, 1, 2, 3, 4, 5]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.q_eval = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_eval',
                               chkpt_dir=self.chkpt_dir)
    self.q_next = DeepQNetwork(self.lr, self.n_actions,
                               input_dims=self.input_dims,
                               name=self.env_name + '_' + self.algo + '_q_next',
                               chkpt_dir=self.chkpt_dir)
def __init__(self):
    self.pred_net, self.target_net = ConvNet(), ConvNet()
    # sync eval and target networks
    self.update_target(self.target_net, self.pred_net, 1.0)

    # use gpu
    if USE_GPU:
        self.pred_net.cuda()
        self.target_net.cuda()

    # simulator step counter
    self.memory_counter = 0
    # target network step counter
    self.learn_step_counter = 0
    # create the replay buffer
    self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
    # define optimizer
    self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=.01, eps_dec=5e-7, replace_count=1000,
             algorithm=None, env_name=None, checkpoint_dir='/checkpoints'):
    self.gamma = gamma
    self.epsilon = epsilon
    self.lr = lr
    self.n_actions = n_actions
    self.input_dims = input_dims
    self.batch_size = batch_size
    self.eps_min = eps_min
    self.eps_dec = eps_dec
    self.replace_count = replace_count
    self.algorithm = algorithm
    self.env_name = env_name
    self.checkpoint_dir = checkpoint_dir
    self.action_space = [i for i in range(self.n_actions)]
    self.learn_step_counter = 0

    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    print(type(self).__name__)
    # placeholders; concrete network classes are assigned by subclasses
    self.q_eval = object
    self.q_policy = object
def __init__(self, state_size, action_size, dqn_type='DQN',
             replay_memory_size=1e5, batch_size=64, gamma=0.99,
             learning_rate=1e-3, target_tau=2e-3, update_rate=4, seed=0):
    self.dqn_type = dqn_type
    self.state_size = state_size
    self.action_size = action_size
    self.buffer_size = int(replay_memory_size)
    self.batch_size = batch_size
    self.gamma = gamma
    self.learn_rate = learning_rate
    self.tau = target_tau
    self.update_rate = update_rate
    self.seed = random.seed(seed)

    """
    # DQN Agent Q-Network
    # For DQN training, two neural network models are employed:
    # (a) a network that is updated every (step % update_rate == 0), and
    # (b) a target network, with weights updated to equal the network at a
    #     slower (target_tau) rate.
    # The slower modulation of the target network weights operates to
    # stabilize learning.
    """
    self.network = QNetwork(state_size, action_size, seed).to(device)
    self.target_network = QNetwork(state_size, action_size, seed).to(device)
    self.optimizer = optim.Adam(self.network.parameters(), lr=self.learn_rate)

    # Replay memory
    self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed)

    # Initialize time step (for updating every UPDATE_EVERY steps)
    self.t_step = 0
def __init__(self, agent, env, n_episode, max_step, replay_size=32,
             data_size=10**6, n_warmup=5 * 10**4, priority=False,
             multi_step=1, render=False, test_render=False, test_episode=5,
             test_interval=1000, test_frame=False, metrics=None,
             init_model_dir=None):
    self.agent = agent
    self.env = env
    self.n_episode = n_episode
    self.max_steps = max_step
    self.render = render
    self.data_size = data_size
    self.n_warmup = n_warmup
    self.replay_size = replay_size  # batch_size
    self.multi_step = multi_step
    self.test_episode = test_episode
    self.test_interval = test_interval if test_interval is not None else 10000
    self.util = Utils(prefix=self.agent.__class__.__name__)
    self.util.initial()
    self.replay_buf = PrioritizeReplayBuffer(
        self.data_size) if priority else ReplayBuffer(self.data_size)
    self.global_step = tf.train.get_or_create_global_step()
    self.test_render = test_render
    self.test_frame = test_frame
    self.init_model_dir = init_model_dir
    self.metrics = metrics
def __init__(self, gamma, epsilon, lr, n_actions, input_dims, mem_size,
             batch_size, eps_min=0.01, eps_dec=5e-7, replace=10_000):
    self.gamma = gamma          # used to discount future rewards
    self.epsilon = epsilon      # used for the epsilon-greedy action-choosing algo
    self.lr = lr                # learning rate: essentially, how big a step the optimizer takes
    self.n_actions = n_actions  # number of actions available to our agent in its environment
    self.action_space = [i for i in range(n_actions)]  # indices of the possible actions to choose from
    self.input_dims = input_dims  # the dimensions of our input as defined by the agent's environment
    self.mem_size = mem_size      # maximum number of memories to store
    self.batch_size = batch_size  # mini-batch size to sample from memory
    self.eps_min = eps_min        # smallest possible epsilon value for our agent
    self.eps_dec = eps_dec        # how much to decrease epsilon each iteration
    self.replace_after = replace  # how many iterations until we replace our target network with a copy of our local network
    self.steps = 0                # iteration counter for use with replace_after

    # create a ReplayBuffer to store our memories, also used to sample a mini-batch
    self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
    self.Q_local = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims)
    self.Q_target = DeepQNetwork(self.lr, self.n_actions, input_dims=self.input_dims)
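# Minimal sketch of the epsilon schedule the comments above imply; the
# method name `decrement_epsilon` is an assumption, not from this class.
def decrement_epsilon(self):
    # step epsilon down by eps_dec each call, clamped at eps_min
    self.epsilon = max(self.epsilon - self.eps_dec, self.eps_min)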
def __init__(self):
    if USE_CNN:
        if USE_GPU:
            self.eval_net, self.target_net = ConvNet().cuda(), ConvNet().cuda()
        else:
            self.eval_net, self.target_net = ConvNet(), ConvNet()
    else:
        if USE_GPU:
            self.eval_net, self.target_net = Net().cuda(), Net().cuda()
        else:
            self.eval_net, self.target_net = Net(), Net()

    self.learn_step_counter = 0  # for target updating
    self.memory_counter = 0

    # Create the replay buffer
    if MEMORY_MODE == 'PER':
        self.replay_buffer = PrioritizedReplayBuffer(MEMORY_CAPACITY, alpha=PER_ALPHA)
    else:
        self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)

    self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=LR)
def __init__(self):
    self.pred_net, self.target_net = ConvNet(), ConvNet()
    # sync eval and target networks
    self.update_target(self.target_net, self.pred_net, 1.0)

    # use gpu
    if USE_GPU:
        self.pred_net.cuda()
        self.target_net.cuda()

    # simulator step counter
    self.memory_counter = 0
    # target network step counter
    self.learn_step_counter = 0
    # create the replay buffer
    self.replay_buffer = ReplayBuffer(MEMORY_CAPACITY)
    # define optimizer
    self.optimizer = torch.optim.Adam(self.pred_net.parameters(), lr=LR)

    # discrete values
    self.value_range = torch.FloatTensor(V_RANGE)  # (N_ATOM,)
    if USE_GPU:
        self.value_range = self.value_range.cuda()
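# Hedged sketch of how a discrete support like V_RANGE above is typically
# built for a distributional (C51-style) head; the bounds and atom count
# here are illustrative assumptions, not values from this project.
import numpy as np

N_ATOM = 51
V_MIN, V_MAX = -10.0, 10.0
V_RANGE = np.linspace(V_MIN, V_MAX, N_ATOM)  # fed to torch.FloatTensor above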