def __init__(
    self,
    player_id: int = 1,
    name: str = "Ugo",
    batch_size: int = 128,
    gamma: float = 0.98,
    memory_size: int = 40000,
) -> None:
    """Initialization for the DQN agent.

    Args:
        player_id (int, optional): Side of the board on which to play. Defaults to 1.
        name (str, optional): Name of the player. Defaults to "Ugo".
        batch_size (int, optional): Batch size of the update. Defaults to 128.
        gamma (float, optional): Gamma value used as the discount factor. Defaults to 0.98.
        memory_size (int, optional): Experience memory capacity. Defaults to 40000.
    """
    # parameters of the agent
    self.player_id = player_id
    self.name = name
    self.batch_size = batch_size  # size of batch for update
    self.gamma = gamma  # discount factor
    self.memory_size = memory_size  # size of replay memory
    self.memory = ReplayMemory(self.memory_size, train_buffer_capacity=4, test_buffer_capacity=4)

    # networks
    self.policy_net = DQN(action_space_dim=3, hidden_dim=256).to(torch.device(device))
    self.target_net = DQN(action_space_dim=3, hidden_dim=256).to(torch.device(device))
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
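# Hedged sketch, not part of the snippets above: every example in this collection assumes a
# ReplayMemory class with push/sample semantics. A minimal cyclic-buffer version could look
# like this; the Transition fields are an assumption and differ between the projects shown here.
import random
from collections import deque, namedtuple

Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))

class MinimalReplayMemory:
    def __init__(self, capacity):
        # deque with maxlen evicts the oldest transition once capacity is reached
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)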
def main():
    num_digits = 4
    state_size = 128
    embedding_dim = 8
    model = StateModel(num_digits, state_size, embedding_dim)
    env = gym.make("GuessNumEnv-v0")
    episodes = 100
    max_episode_len = 100
    replay_memory = ReplayMemory(1000)
    for ep in range(episodes):
        state, reward, done = env.reset()
        state = torch.from_numpy(state)
        # Plus one because the action is composed of the numbers between 1 and 9
        action = torch.argmax(
            model((state[:, :-2].unsqueeze(0).long(), state[:, -2:].unsqueeze(0).float())),
            dim=-1,
        ) + 1
        next_state, reward, done = env.step(action.numpy().reshape(-1,))
        t = Transition(state=state, next_state=next_state, reward=reward, action=action)
        env.render()
        print(reward, done)
        break
def __init__(self, env_name, state_space, n_actions, replay_buffer_size=50000,
             batch_size=32, hidden_size=12, gamma=0.98):
    self.env_name = env_name
    self.n_actions = n_actions
    self.state_space_dim = state_space
    if "CartPole" in self.env_name:
        self.policy_net = CartpoleDQN(state_space, n_actions, hidden_size)
        self.target_net = CartpoleDQN(state_space, n_actions, hidden_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-3)
    elif "LunarLander" in self.env_name:
        self.policy_net = LunarLanderDQN(state_space, n_actions, hidden_size)
        self.target_net = LunarLanderDQN(state_space, n_actions, hidden_size)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
    else:
        raise ValueError("Wrong environment. An agent has not been specified for %s" % env_name)
    self.memory = ReplayMemory(replay_buffer_size)
    self.batch_size = batch_size
    self.gamma = gamma
def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
    self.args = args
    self.alow = alow
    self.ahigh = ahigh
    self.policy = Policy_net(ob_sp, act_sp)
    self.policy_targ = Policy_net(ob_sp, act_sp)
    self.qnet = Q_net(ob_sp, act_sp)
    self.qnet_targ = Q_net(ob_sp, act_sp)
    self.policy.to(device)
    self.qnet.to(device)
    self.policy_targ.to(device)
    self.qnet_targ.to(device)
    self.MSE_loss = nn.MSELoss()
    self.noise = OUNoise(1, 1)
    hard_update(self.policy_targ, self.policy)
    hard_update(self.qnet_targ, self.qnet)
    self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
    self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
    self.memory = ReplayMemory(int(1e6))
    self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS, FINAL_STD, INITIAL_STD,
                                            warmup_steps=WARMUP_STEPS)
    self.n_steps = 0
    self.n_updates = 0
    self.writer = writer
def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma):
    self.epsilon = epsilon
    self.mini_batch_size = mini_batch_size
    self.gamma = gamma
    self.update_counter = 0
    self.net = nn.Sequential(
        nn.Linear(2, 128),
        nn.ReLU(),
        nn.Linear(128, 128),
        nn.ReLU(),
        nn.Linear(128, 3)
    ).float()
    # target network starts as an exact copy of the online network
    self.net_target = copy.deepcopy(self.net)
    self.net = self.net.cuda()
    self.net_target = self.net_target.cuda()
    self.replay_memory = ReplayMemory(max_size=mem_size)
    self.criterion = nn.MSELoss()
    self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
def __init__(self, env_name, state_space, n_actions, replay_buffer_size=500000,
             batch_size=32, hidden_size=64, gamma=0.99):
    self.env_name = env_name
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.train_device = device
    self.n_actions = n_actions
    self.state_space_dim = state_space
    if "CartPole" in self.env_name:
        self.policy_net = CartpoleDQN(state_space, n_actions, 4)
        self.target_net = CartpoleDQN(state_space, n_actions, 4)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
    elif "WimblepongVisualSimpleAI-v0" in self.env_name:
        self.policy_net = Policy(state_space, n_actions, 4)
        self.target_net = Policy(state_space, n_actions, 4)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
    else:
        raise ValueError("Wrong environment. An agent has not been specified for %s" % env_name)
    self.memory = ReplayMemory(replay_buffer_size)
    self.batch_size = batch_size
    self.gamma = gamma
def __init__(self, sess, training_steps=5000000, learning_rate=0.0001, momentum=0.95,
             memory_size=100000, discount_rate=0.95, eps_min=0.05):
    self.activation = tf.nn.relu
    self.optimizer = tf.train.MomentumOptimizer
    self.learning_rate = learning_rate
    self.momentum = momentum
    self._build_graph()
    self.memory_size = memory_size
    self.memory = ReplayMemory(self.memory_size)
    # The discount rate controls how strongly future rewards are weighted when
    # evaluating an action: a value of 0 means the agent only considers the immediate
    # reward, while a value close to 1 means it also weighs rewards far in the future.
    self.discount_rate = discount_rate
    self.eps_min = eps_min
    self.eps_decay_steps = int(training_steps / 2)
    self.sess = sess
    self.init = tf.global_variables_initializer()
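# Hedged illustration of the discount rate described above (not from the original project):
# with discount_rate = 0.95, a reward received t steps in the future is weighted by 0.95 ** t,
# so the discounted return can be accumulated from the end of the reward sequence backwards.
def discounted_return(rewards, gamma=0.95):
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
    return g

# discounted_return([1.0, 1.0, 1.0]) == 1.0 + 0.95 * 1.0 + 0.95 ** 2 * 1.0 == 2.8525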
def __init__(self, game_name, gamma, batch_size, eps_start, eps_end, eps_decay, mem_size, device):
    if batch_size > mem_size:
        print("Error: training would fail because the batch size is larger than the memory size.")
        return
    self.gamma = gamma
    self.batch_size = batch_size
    self.eps_start = eps_start
    self.eps_end = eps_end
    self.eps_decay = eps_decay
    self.env = Environment(game_name)
    self.step_done = 0
    self.device = device
    self.memory = ReplayMemory(mem_size)
    # define the policy net and target net
    _, _, height, width = self.env.get_screen().shape
    self.policy_net = Net(height, width, self.env.num_action).to(self.device)
    self.target_net = Net(height, width, self.env.num_action).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.RMSprop(self.policy_net.parameters())
def __init__(self, env, n_episodes=3000, time_steps=500, gamma=0.99, batch_size=32,
             memory_capacity=100000, tau=1e-2, eps=0.1, lr=0.00001, render=False):
    self.env = env
    self.gamma = gamma
    self.time_steps = time_steps
    self.state_dim = env.observation_space.shape[0]
    self.action_dim = env.action_space.shape[0]
    self.batch_size = batch_size
    self.memory_capacity = memory_capacity
    self.tau = tau
    self.eps = eps
    self.lr = lr
    self.render = render
    # Same weights for target networks as for the original networks
    self.actor = Actor(state_dim=self.state_dim, action_dim=self.action_dim)
    self.actor_target = Actor(state_dim=self.state_dim, action_dim=self.action_dim)
    self.critic = Critic(state_dim=self.state_dim, action_dim=self.action_dim)
    self.critic_target = Critic(state_dim=self.state_dim, action_dim=self.action_dim)
    for target_param, param in zip(self.actor_target.parameters(), self.actor.parameters()):
        target_param.data.copy_(param.data)
    for target_param, param in zip(self.critic_target.parameters(), self.critic.parameters()):
        target_param.data.copy_(param.data)
    self.critic_loss_fct = torch.nn.MSELoss()
    self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=self.lr)
    self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=self.lr * 10)
    self.n_episodes = n_episodes
    self.replay_memory = ReplayMemory(capacity=self.memory_capacity, batch_size=batch_size)
    self.res = pd.DataFrame({
        'episodes': [],
        'states': [],
        'rewards': [],
        'steps': []
    })
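# Hedged sketch of how the tau stored above is typically used (the update step itself is not
# part of the snippet): a Polyak / soft update blends the target parameters toward the online ones.
def soft_update(target_net, source_net, tau):
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)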
def __init__(self, REPLAY_MEM_SIZE=10000, BATCH_SIZE=40, GAMMA=0.98, EPS_START=1,
             EPS_END=0.12, EPS_STEPS=300, LEARNING_RATE=0.001, INPUT_DIM=24,
             HIDDEN_DIM=120, ACTION_NUMBER=3, TARGET_UPDATE=10, MODEL='ddqn', DOUBLE=True):
    self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE
    self.BATCH_SIZE = BATCH_SIZE
    self.GAMMA = GAMMA
    self.EPS_START = EPS_START
    self.EPS_END = EPS_END
    self.EPS_STEPS = EPS_STEPS
    self.LEARNING_RATE = LEARNING_RATE
    self.INPUT_DIM = INPUT_DIM
    self.HIDDEN_DIM = HIDDEN_DIM
    self.ACTION_NUMBER = ACTION_NUMBER
    self.TARGET_UPDATE = TARGET_UPDATE
    self.MODEL = MODEL    # deep Q-network (dqn) or dueling deep Q-network (ddqn)
    self.DOUBLE = DOUBLE  # whether to use the 'Double' variant (regularization)
    self.TRAINING = True  # avoid picking random actions during testing
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print("Agent is using device:\t" + str(self.device))
    # Linear variants kept for reference:
    # elif self.MODEL == 'lin_ddqn':
    #     self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
    #     self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
    # elif self.MODEL == 'lin_dqn':
    #     self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
    #     self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
    if self.MODEL == 'ddqn':
        self.policy_net = ConvDuelingDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
        self.target_net = ConvDuelingDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
    elif self.MODEL == 'dqn':
        self.policy_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
        self.target_net = ConvDQN(self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.Adam(self.policy_net.parameters(), lr=self.LEARNING_RATE)
    self.memory = ReplayMemory(self.REPLAY_MEM_SIZE)
    self.steps_done = 0
    self.training_cumulative_reward = []
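# Hedged sketch of what the DOUBLE flag above usually selects (the update code is not part of
# the snippet): in double DQN the online network chooses the next action and the target network
# evaluates it, which reduces the overestimation bias of the plain max-based target.
import torch

def double_dqn_target(policy_net, target_net, next_states, rewards, dones, gamma):
    with torch.no_grad():
        next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
        next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
        # dones is a float tensor of 0/1 flags that zeroes the bootstrap term at episode ends
        return rewards + gamma * next_q * (1.0 - dones)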
def __init__(self, policy_cls, env, verbose=0, replay_memory_capacity=100000):
    super(OffPolicyRLModel, self).__init__(policy_cls, env, verbose=verbose)
    self.replay_memory = ReplayMemory(capacity=replay_memory_capacity)
def __init__(self, name, env):
    self.name = name
    self.env = env
    self.eps = 0.005
    self.max_timesteps = 10000
    self.explore_noise = 0.5
    self.batch_size = 32
    self.discount = 0.99
    self.tau = 0.005
    self.max_episode_steps = 200
    self.memory = ReplayMemory(10000)
def __init__(self, model, env, **kwargs):
    Agent.__init__(self, **kwargs)
    self.update_step = 0
    self.eps = self.EPS_START
    self.global_step = 0
    self.model = model
    self.target_model = copy.deepcopy(model)
    self.in_size = model.in_size
    self.out_size = model.out_size
    self.memory = ReplayMemory(self.REPLAY_CAPACITY)
    self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR)
    self.env = env
    self.container = Container(self.model.SAVE_MODEL_NAME)
def __init__(self, num_actions, gamma=0.98, memory_size=5000, batch_size=32):
    self.scaler = None
    self.featurizer = None
    self.q_functions = None
    self.gamma = gamma
    self.batch_size = batch_size
    self.num_actions = num_actions
    self.memory = ReplayMemory(memory_size)
    self.initialize_model()
def __init__(self, env, policy_network, value_network, alpha=.003, gamma=.99,
             memory_size=10000, batch_size=64, use_cuda=True):
    DeepAgent.__init__(self, env, alpha, gamma, use_cuda)
    # Network prep
    device = torch.device('cuda' if use_cuda else 'cpu')
    self.policy_network = policy_network.to(device)
    self.value_network = value_network.to(device)
    self.policy_opt = optim.Adam(self.policy_network.parameters(), lr=alpha)
    self.value_opt = optim.Adam(self.value_network.parameters(), lr=alpha)
    # Experience replay prep
    self.memory = ReplayMemory(max_size=memory_size)
    self.batch_size = batch_size
def __init__(self, device, state_size, action_size, folder, config):
    self.folder = folder
    self.config = config
    self.device = device
    self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"])
    self.state_size = state_size
    self.action_size = action_size
    self.critic = Critic(self.state_size, self.action_size, self.device, self.config)
    self.actor = Actor(self.state_size, self.action_size, self.device, self.config)
def __init__(self, model, env, demo_memory, **kwargs):
    DQNAgent.__init__(self, model, env, **kwargs)
    self.EXPERT_MARGIN = kwargs.pop("expert_margin", 0.8)
    self.DEMO_PER = kwargs.pop("demo_percent", 0.3)
    self.N_STEP = kwargs.pop("n_step", 5)
    self.LAMBDA_1 = kwargs.pop("lambda_1", 0.1)
    self.LAMBDA_2 = kwargs.pop("lambda_2", 0.5)
    self.LAMBDA_3 = kwargs.pop("lambda_3", 0)
    self.memory = ReplayMemory(self.REPLAY_CAPACITY, self.N_STEP, self.GAMMA)
    self.demo_memory = demo_memory
    self.demo_memory.n_step = self.N_STEP
    self.demo_memory.gamma = self.GAMMA
    self.is_pre_train = False
def __init__(self, q_models, target_model, hyperbolic, k, gamma, model_params,
             replay_buffer_size, batch_size, inp_dim, lr, no_models, act_space,
             hidden_size, loss_type, target_update=False):
    super(Agent, self).__init__()
    if hyperbolic:
        self.q_models = DQN(state_space_dim=inp_dim, action_space_dim=act_space,
                            hidden=hidden_size, no_models=no_models)
        self.target_models = DQN(state_space_dim=inp_dim, action_space_dim=act_space,
                                 hidden=hidden_size, no_models=no_models)
        self.target_models.load_state_dict(self.q_models.state_dict())
        self.target_models.eval()
    else:
        self.q_models = q_models
    self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
    self.hyperbolic = hyperbolic
    self.n_actions = model_params.act_space
    self.k = k
    # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:]
    self.gammas = np.sort(np.random.uniform(0, 1, self.q_models.no_models + 1))
    self.gammas = np.append(self.gammas, 0.98)
    self.gammas = torch.tensor(np.sort(self.gammas))
    self.memory = ReplayMemory(replay_buffer_size)
    self.batch_size = batch_size
    self.inp_dim = inp_dim
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.target_models.to(self.device)
    self.q_models.to(self.device)
    self.gammas = self.gammas.to(self.device)
    self.loss_type = loss_type
    self.criterion = nn.MSELoss()
    self.use_target_network = target_update
def __init__(self, env, action_list, actors, critics, old_actors, old_critics, args, device):
    self.device = device
    self.env = env
    self.n_players = len(actors)
    self.action_list = action_list
    self.action_space_size = len(action_list)
    self.actors = [actor.to(device) for actor in actors]
    self.critics = [critic.to(device) for critic in critics]
    self.old_actors = [old_actor.to(device) for old_actor in old_actors]
    self.old_critics = [old_critic.to(device) for old_critic in old_critics]
    self.old_actors = old_actors
    self.old_critics = old_critics
    # self.max_memory_size = args.max_memory_size
    self.replay_memory = ReplayMemory(max_memory_size=args.max_memory_size)
    self.episodes_before_training = args.episodes_before_training
    self.n_episodes = args.n_episodes
    self.episode_max_length = args.episode_max_length
    self.batch_size = args.batch_size
    self.save_interval = args.save_interval
    self.gamma = args.gamma
    self.epsilon = args.epsilon
    self.tau = args.tau
    self.critic_loss = nn.MSELoss()
    self.lr = args.lr
    self.actor_optimizers = [Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
                             for model in self.actors]
    self.critic_optimizers = [Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
                              for model in self.critics]
    # save checkpoints
    # self.model_dir = args.model_dir
    # if not os.path.exists(self.model_dir):
    #     os.makedirs(self.model_dir)
    # self.save_interval = args.save_interval
    # log
    self.k = 500  # moving average window size
    self.writer = SummaryWriter(args.log_dir)
    if not os.path.exists(args.log_dir):
        os.makedirs(args.log_dir)
def __init__(self, default_reward, name, color, env, agent_type, features_n,
             memory_capacity, init_value=0.0, batch_size=64, gamma=0.99,
             eps_start=0.9, eps_end=0.01, eps_decay=50, need_reload=False,
             reload_path=None, need_exploit=True):
    super(EGreedyAgent, self).__init__((0, 0),
                                       default_reward=default_reward,
                                       color=color,
                                       env=env,
                                       name=name,
                                       default_type=agent_type,
                                       default_value=init_value)
    self.actions_n = env.action_space.n
    # discount factor
    self.gamma = gamma
    self.batch_size = batch_size
    self.eps_start = eps_start
    self.eps_end = eps_end
    self.eps_decay = eps_decay
    self.features_n = features_n
    self.memory_capacity = memory_capacity
    self.memory = ReplayMemory(self.memory_capacity)
    self.steps_count = 0
    self.device = 'cpu'
    # policy net evaluates the Q-value
    self.policy_net = DQN(self.features_n, self.actions_n, 50, 50, 50)
    # target net evaluates Q_target
    self.target_net = DQN(self.features_n, self.actions_n, 50, 50, 50)
    if need_reload:
        self.restore(reload_path)
    # give the target net the same parameters as the policy net
    self.target_net.eval()
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.001)
    self.save_file_path = './model/'
    self.need_exploit = need_exploit
def __init__(self, args):
    # which environment to load from the OpenAI Gym Atari collection
    self.env_id = "PongNoFrameskip-v4"
    # create the environment
    self.env = Environment(self.env_id)
    # discount factor in the Q-value formula
    self.discount_factor = 0.99
    self.batch_size = 64
    # how often to update the network (backpropagation)
    self.update_frequency = 4
    # how often to synchronize with the target network
    self.target_network_update_freq = 1000
    # keeps track of the frames for training, and retrieves them in batches
    self.agent_history_length = 4
    self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)
    # two neural networks: one main and one target
    self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(),
                                    agent_history_length=self.agent_history_length)
    self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(),
                                      agent_history_length=self.agent_history_length)
    # Adam optimizer, a standard choice
    self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
    # start with a high exploration rate, then slowly decrease it
    self.init_explr = 1.0
    self.final_explr = 0.1
    self.final_explr_frame = 1000000
    self.replay_start_size = 10000
    # loss function for training
    self.loss = tf.keras.losses.Huber()
    # running mean of the training loss
    self.loss_metric = tf.keras.metrics.Mean(name="loss")
    # running mean of the Q-values from the loss computation below
    self.q_metric = tf.keras.metrics.Mean(name="Q_value")
    # maximum number of frames to train; training probably stops before reaching it
    self.training_frames = int(1e7)
    # path to save the checkpoints, logs and the weights
    self.checkpoint_path = "./checkpoints/" + args.run_name
    self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
    self.print_log_interval = 10
    self.save_weight_interval = 10
    self.env.reset()
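# Hedged sketch of how the exploration schedule above is commonly applied (the decay code is
# not part of the snippet): epsilon is interpolated linearly from init_explr to final_explr
# over final_explr_frame frames, after an initial replay_start_size of purely random actions.
def get_epsilon(frame, init_explr=1.0, final_explr=0.1, final_explr_frame=1000000,
                replay_start_size=10000):
    if frame < replay_start_size:
        return init_explr
    fraction = min(1.0, (frame - replay_start_size) / final_explr_frame)
    return init_explr + fraction * (final_explr - init_explr)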
def __init__(self, state_space, n_actions, replay_buffer_size=50000,
             batch_size=32, hidden_size=12, gamma=0.98):
    self.n_actions = n_actions
    self.state_space_dim = state_space
    self.policy_net = DQN(state_space, n_actions, hidden_size)
    self.target_net = DQN(state_space, n_actions, hidden_size)
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3)
    self.memory = ReplayMemory(replay_buffer_size)
    self.batch_size = batch_size
    self.gamma = gamma
def __init__(self, q_models, target_model, hyperbolic, k, gamma, model_params,
             replay_buffer_size, batch_size, inp_dim, lr):
    super(Agent, self).__init__()
    if hyperbolic:
        self.q_models = torch.nn.ModuleList(q_models)
        self.target_models = torch.nn.ModuleList(target_model)
    else:
        self.q_models = q_models
        self.target_models = target_model
    self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=1e-5)
    self.hyperbolic = hyperbolic
    self.n_actions = model_params.act_space
    self.k = k
    self.gamma = gamma
    self.memory = ReplayMemory(replay_buffer_size)
    self.batch_size = batch_size
    self.inp_dim = inp_dim
def generate_memory(size, game='Pendulum'):
    if game.startswith('Pendulum'):
        env = PendulumWrapper()
    elif game.startswith('LunarLander'):
        env = LunarWrapper()
    memory = ReplayMemory(100000)
    for i in range(size):
        s = env.reset()
        a = env.action_space.sample()
        s_, r, d, _ = env.step(a)
        memory.push(s, a, r, s_, 1 - int(d))
    return memory
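# Hedged usage example for generate_memory above (PendulumWrapper is the project's own
# environment wrapper, and the ReplayMemory is assumed to expose __len__):
# warmup_memory = generate_memory(5000, game='Pendulum')
# print(len(warmup_memory))  # expected to report 5000 random warm-up transitions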
def __init__(self, learning_rate, gamma, state_shape, actions, batch_size,
             epsilon_initial=0.9, epsilon_decay=1e-3, epsilon_final=0.01,
             replay_buffer_capacity=1000000, model_name='dqn_model.h5',
             model_dir='models/dqn_model', ckpt_dir='models/dqn_model/checkpoints',
             log_dir='logs'):
    """Initialize the DQN agent.

    Args:
        learning_rate (float): Optimizer learning rate
        gamma (float): Discount factor in the Bellman equation
        state_shape (np.shape): Shape of the environment's state space
        actions (int): Number of actions
        batch_size (int): Size of the batches the agent learns from
        epsilon_initial (float): Initial value of epsilon
        epsilon_decay (float): Decay rate of epsilon
        epsilon_final (float): Final value of epsilon after complete decay
        replay_buffer_capacity (int): Maximum size of the experience replay buffer
        model_name (str): Name of the model file to save/load
        model_dir (str): Directory in which the model file is stored
        ckpt_dir (str): Model checkpoint directory
        log_dir (str): Directory where TensorFlow logs are stored
    """
    self.learning_rate = learning_rate
    self.gamma = gamma
    self.actions = actions
    self.batch_size = batch_size
    self.epsilon = epsilon_initial
    self.epsilon_decay = epsilon_decay
    self.epsilon_final = epsilon_final
    self.buffer = ReplayMemory(replay_buffer_capacity, state_shape)
    self.q_network = self._get_model()
    self.model_file = f'{model_dir}/{model_name}'
    self.checkpoint_dir = ckpt_dir
def __init__(self, x, y, r, color, agent_type, features_n, actions_n,
             discounted_value, memory_capacity=4096, batch_size=512,
             learning_rate=0.0001, need_restore=False):
    super(EGreedyAgent, self).__init__(x, y, r, color, agent_type)
    self.gamma = discounted_value
    self.features_n = features_n
    self.actions_n = actions_n
    self.lr = learning_rate
    self.save_file_path = 'model/dqn.pkl'
    self.device = 'cpu'
    self.policy_net = DQNet(self.features_n, self.actions_n)
    self.target_net = DQNet(self.features_n, self.actions_n)
    # give the target net the same parameters as the policy net
    self.target_net.eval()
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=self.lr)
    self.eps_start = 0.9
    self.eps_end = 0.05
    self.eps_decay = 5000
    self.steps_count = 0
    self.batch_size = batch_size
    self.memory = ReplayMemory(memory_capacity)
    self.need_exploit = True
    if need_restore:
        self.restore()
def __init__(self, state_space, n_actions, replay_buffer_size=50000,
             batch_size=32, hidden_size=64, gamma=0.99):
    self.n_actions = n_actions
    self.state_space_dim = state_space
    self.policy_net = GenericNetwork(state_space, n_actions, hidden_size, name='dqn_network_')
    self.target_net = GenericNetwork(state_space, n_actions, hidden_size, name='target_dqn_network_')
    self.target_net.load_state_dict(self.policy_net.state_dict())
    self.target_net.eval()
    self.memory = ReplayMemory(replay_buffer_size)
    self.batch_size = batch_size
    self.gamma = gamma
    self.action = {}
    self.j = 0
def test_arb(arb_env, modules_list, n_epi=250, max_steps=500):
    s_dim, a_dim = 16, 4
    n_modules = len(modules_list)
    pi_tensors = get_pi(modules_list)
    arb = Arbitrator().to(device)
    returns = []
    all_rets = []
    memory = ReplayMemory(10000)
    for epi in range(n_epi):
        arb_env.reset()
        r_list = []
        steps = 0
        while steps < max_steps:
            state = get_state_vector(arb_env.cur_state)
            coeff = arb(state)
            pi_k = torch.zeros(s_dim, a_dim)
            for m in range(n_modules):
                pi_k += coeff[0][m] * pi_tensors[m]
            a = np.random.choice(4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
            s, a, s_, r, done = arb_env.step(a)
            r_list.append(r)
            reward = torch.FloatTensor([r], device=device)
            next_state = get_state_vector(s_)
            steps += 1
            memory.push(state, torch.FloatTensor([a], device=device), next_state, reward)
            if done:
                state = get_state_vector(arb_env.cur_state)
                coeff = arb(state)
                pi_k = torch.zeros(s_dim, a_dim)
                for m in range(n_modules):
                    pi_k += coeff[0][m] * pi_tensors[m]
                a = np.random.choice(4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
                # state = get_state_vector(arb_env.cur_state)
                next_state = state
                r = 100.
                steps += 1
                reward = torch.FloatTensor([r], device=device)
                r_list.append(r)
                memory.push(state, torch.FloatTensor([a], device=device), next_state, reward)
                break
        rets = []
        return_so_far = 0
        for t in range(len(r_list) - 1, -1, -1):
            return_so_far = r_list[t] + 0.9 * return_so_far
            rets.append(return_so_far)
        # The returns are stored backwards in time, so they need to be reversed
        rets = list(reversed(rets))
        all_rets.extend(rets)
        print("epi {} over".format(epi))
        if epi % 7 == 0:
            arb.optimize(memory, pi_tensors, torch.FloatTensor(all_rets))
            all_rets = []
            memory = ReplayMemory(10000)
        returns.append(sum(r_list))
    return returns
def run_dq_pole(num_episodes):
    logg = logging.getLogger(f"c.{__name__}.run_dq_pole")
    logg.debug("Start run_dq_pole")

    env = gym.make("CartPole-v0").unwrapped

    plt.ion()

    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logg.debug(f"Using {device} as device")

    # show_frame(env)

    # hyperparameters
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10

    env.reset()

    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90,
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0

    # Main training loop. At the beginning we reset the environment and
    # initialize the state tensor. Then we sample an action, execute it,
    # observe the next screen and the reward (always 1), and optimize the model
    # once. When the episode ends (the model fails), we restart the loop.
    # num_episodes = 50
    episode_durations = []
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(
                state, n_actions, steps_done, device, policy_net,
                EPS_START, EPS_END, EPS_DECAY,
            )
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(BATCH_SIZE, memory, device, policy_net, target_net, GAMMA, optimizer)
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break

        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("Complete")
    env.render()
    # remember to close the env, to avoid sys.meta_path undefined
    env.close()
    plt.ioff()
    plt.show()
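# Hedged sketch of the optimize_model step called in the loop above (the actual helper lives
# elsewhere in that project); it follows the standard DQN update with a mask for terminal
# states and assumes the memory stores Transition namedtuples matching the push() calls.
import torch
import torch.nn.functional as F

def optimize_model(batch_size, memory, device, policy_net, target_net, gamma, optimizer):
    if len(memory) < batch_size:
        return
    transitions = memory.sample(batch_size)
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Q(s_t, a) for the actions that were actually taken
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    # V(s_{t+1}) from the target network, zero for terminal states
    next_state_values = torch.zeros(batch_size, device=device)
    next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    expected_state_action_values = reward_batch + gamma * next_state_values

    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()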
# memory settings
max_memory_size = 100000
min_memory_size = 1000  # number of samples needed before model training starts

epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

# PLE takes our game and the state_preprocessor. It will process the state for our agent.
game = Catcher(width=128, height=128)
env = PLE(game, fps=60, state_preprocessor=nv_state_preprocessor)
agent = Agent(env, batch_size, num_frames, frame_skip, lr, discount, rng,
              optimizer="sgd_nesterov")
agent.build_model()

memory = ReplayMemory(max_memory_size, min_memory_size)

env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
    while steps < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()
        while env.game_over() == False and steps < num_steps_train:
            state = env.getGameState()