Example #1
    def __init__(self,
                 sess,
                 training_steps=5000000,
                 learning_rate=0.0001,
                 momentum=0.95,
                 memory_size=100000,
                 discount_rate=0.95,
                 eps_min=0.05):
        self.activation = tf.nn.relu
        self.optimizer = tf.train.MomentumOptimizer
        self.learning_rate = learning_rate
        self.momentum = momentum

        self._build_graph()

        self.memory_size = memory_size
        self.memory = ReplayMemory(self.memory_size)
        '''
        The discount rate controls how strongly future rewards are weighted when
        evaluating an action.
        A value of 0 means the agent only considers the immediate reward, while a value
        close to 1 makes rewards far in the future count almost as much as immediate ones.
        '''
        self.discount_rate = discount_rate

        self.eps_min = eps_min
        self.eps_decay_steps = int(training_steps / 2)

        self.sess = sess
        self.init = tf.global_variables_initializer()
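As a quick illustration of the discount rate described in the docstring above (a minimal, self-contained sketch, not code from this repository), the discounted return weights each successive reward by an additional factor of the discount rate:

def discounted_return(rewards, discount_rate):
    # G = r_0 + gamma * r_1 + gamma^2 * r_2 + ..., accumulated from the end backwards
    g = 0.0
    for r in reversed(rewards):
        g = r + discount_rate * g
    return g

# discount_rate=0.0 keeps only the immediate reward; values near 1.0 weight
# far-future rewards almost as heavily as the immediate one.
print(discounted_return([1.0, 1.0, 1.0], 0.0))   # 1.0
print(discounted_return([1.0, 1.0, 1.0], 0.95))  # 2.8525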
Example #2
 def __init__(self, game_name, gamma, batch_size, eps_start, eps_end,
              eps_decay, mem_size, device):
     if batch_size > mem_size:
          print(
              "Error: training will crash because the batch size is larger than the memory size."
          )
         return
     self.gamma = gamma
     self.batch_size = batch_size
     self.eps_start = eps_start
     self.eps_end = eps_end
     self.eps_decay = eps_decay
     self.env = Environment(game_name)
     self.step_done = 0
     self.device = device
     self.memory = ReplayMemory(mem_size)
     # define the policy net and target net
     _, _, height, width = self.env.get_screen().shape
     self.policy_net = Net(height, width,
                           self.env.num_action).to(self.device)
     self.target_net = Net(height, width,
                           self.env.num_action).to(self.device)
     self.target_net.load_state_dict(self.policy_net.state_dict())
     self.target_net.eval()
     self.optimizer = optim.RMSprop(self.policy_net.parameters())
Example #3
    def __init__(self, mem_size, epsilon, mini_batch_size, learning_rate, gamma):

        self.epsilon = epsilon
        self.mini_batch_size = mini_batch_size
        self.gamma = gamma

        self.update_counter = 0

        self.net = nn.Sequential(
            nn.Linear(2, 128),
            nn.ReLU(),
            nn.Linear(128, 128),
            nn.ReLU(),
            nn.Linear(128, 3)
        ).float()

        self.net_target = copy.deepcopy(self.net)

        self.net = self.net.cuda()
        self.net_target = self.net_target.cuda()

        # self.net_target = nn.Sequential(
        #     nn.Linear(2, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 128),
        #     nn.ReLU(),
        #     nn.Linear(128, 3)
        # ).float()

        self.replay_memory = ReplayMemory(max_size=mem_size)

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.net.parameters(), lr=learning_rate)
Example #4
    def __init__(
        self,
        player_id: int = 1,
        name: str = "Ugo",
        batch_size: int = 128,
        gamma: float = 0.98,
        memory_size: int = 40000,
    ) -> None:
        """Initialization for the DQN agent

        Args:
            player_id (int, optional): Side of the board on which to play. Defaults to 1.
            name (str, optional): Name of the player. Defaults to "Ugo".
            batch_size (int, optional): Batch size of the update. Defaults to 128.
            gamma (float, optional): Gamma discount factor for the update. Defaults to 0.98.
            memory_size (int, optional): Experience memory capacity. Defaults to 40000.
        """
        # list of parameters of the agent
        self.player_id = player_id
        self.name = name
        self.batch_size = batch_size  # size of batch for update
        self.gamma = gamma  # discount factor
        self.memory_size = memory_size  # size of replay memory
        self.memory = ReplayMemory(self.memory_size,
                                   train_buffer_capacity=4,
                                   test_buffer_capacity=4)

        # networks
        self.policy_net = DQN(action_space_dim=3,
                              hidden_dim=256).to(torch.device(device))
        self.target_net = DQN(action_space_dim=3,
                              hidden_dim=256).to(torch.device(device))
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
Example #5
 def __init__(self,
              env_name,
              state_space,
              n_actions,
              replay_buffer_size=500000,
              batch_size=32,
              hidden_size=64,
              gamma=0.99):
     self.env_name = env_name
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
     self.train_device = device
     self.n_actions = n_actions
     self.state_space_dim = state_space
     if "CartPole" in self.env_name:
         self.policy_net = CartpoleDQN(state_space, n_actions, 4)
         self.target_net = CartpoleDQN(state_space, n_actions, 4)
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
     elif "WimblepongVisualSimpleAI-v0" in self.env_name:
         self.policy_net = Policy(state_space, n_actions, 4)
         self.target_net = Policy(state_space, n_actions, 4)
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
     else:
         raise ValueError(
             "Wrong environment. An agent has not been specified for %s" %
             env_name)
     self.memory = ReplayMemory(replay_buffer_size)
     self.batch_size = batch_size
     self.gamma = gamma
Example #6
 def __init__(self,
              env_name,
              state_space,
              n_actions,
              replay_buffer_size=50000,
              batch_size=32,
              hidden_size=12,
              gamma=0.98):
     self.env_name = env_name
     self.n_actions = n_actions
     self.state_space_dim = state_space
     if "CartPole" in self.env_name:
         self.policy_net = CartpoleDQN(state_space, n_actions, hidden_size)
         self.target_net = CartpoleDQN(state_space, n_actions, hidden_size)
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-3)
     elif "LunarLander" in self.env_name:
         self.policy_net = LunarLanderDQN(state_space, n_actions,
                                          hidden_size)
         self.target_net = LunarLanderDQN(state_space, n_actions,
                                          hidden_size)
         self.target_net.load_state_dict(self.policy_net.state_dict())
         self.target_net.eval()
         self.optimizer = optim.Adam(self.policy_net.parameters(), lr=5e-4)
     else:
         raise ValueError(
             "Wrong environment. An agent has not been specified for %s" %
             env_name)
     self.memory = ReplayMemory(replay_buffer_size)
     self.batch_size = batch_size
     self.gamma = gamma
Example #7
    def __init__(self, ob_sp, act_sp, alow, ahigh, writer, args):
        self.args = args
        self.alow = alow
        self.ahigh = ahigh
        self.policy = Policy_net(ob_sp, act_sp)
        self.policy_targ = Policy_net(ob_sp, act_sp)
        self.qnet = Q_net(ob_sp, act_sp)
        self.qnet_targ = Q_net(ob_sp, act_sp)

        self.policy.to(device)
        self.qnet.to(device)
        self.policy_targ.to(device)
        self.qnet_targ.to(device)
        self.MSE_loss = nn.MSELoss()
        self.noise = OUNoise(1, 1)

        hard_update(self.policy_targ, self.policy)
        hard_update(self.qnet_targ, self.qnet)

        self.p_optimizer = optim.Adam(self.policy.parameters(), lr=LR)
        self.q_optimizer = optim.Adam(self.qnet.parameters(), lr=LR)
        self.memory = ReplayMemory(int(1e6))
        self.epsilon_scheduler = LinearSchedule(E_GREEDY_STEPS,
                                                FINAL_STD,
                                                INITIAL_STD,
                                                warmup_steps=WARMUP_STEPS)
        self.n_steps = 0
        self.n_updates = 0
        self.writer = writer
Example #8
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=32,
                 memory_capacity=100000,
                 tau=1e-2,
                 eps=0.1,
                 lr=0.00001,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.eps = eps
        self.lr = lr
        self.render = render

        # Same weights for target network as for original network
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim)

        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim)

        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)

        self.n_episodes = n_episodes

        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)

        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': []
        })
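The constructor above copies the target networks' weights once and stores tau, but the soft update itself is not shown; below is a minimal sketch of the Polyak-averaging step that tau is typically used for (a hypothetical helper, not code from this repository):

import torch.nn as nn

def soft_update(target_net, source_net, tau):
    # Blend each target parameter towards its online counterpart:
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    for target_param, param in zip(target_net.parameters(), source_net.parameters()):
        target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# e.g. soft_update(self.actor_target, self.actor, self.tau) after every learning step
target, source = nn.Linear(4, 2), nn.Linear(4, 2)
soft_update(target, source, tau=1e-2)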
Example #9
    def __init__(self,
                 REPLAY_MEM_SIZE=10000,
                 BATCH_SIZE=40,
                 GAMMA=0.98,
                 EPS_START=1,
                 EPS_END=0.12,
                 EPS_STEPS=300,
                 LEARNING_RATE=0.001,
                 INPUT_DIM=24,
                 HIDDEN_DIM=120,
                 ACTION_NUMBER=3,
                 TARGET_UPDATE=10,
                 MODEL='ddqn',
                 DOUBLE=True):

        self.REPLAY_MEM_SIZE = REPLAY_MEM_SIZE
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_STEPS = EPS_STEPS
        self.LEARNING_RATE = LEARNING_RATE
        self.INPUT_DIM = INPUT_DIM
        self.HIDDEN_DIM = HIDDEN_DIM
        self.ACTION_NUMBER = ACTION_NUMBER
        self.TARGET_UPDATE = TARGET_UPDATE
        self.MODEL = MODEL  # deep Q-network (dqn) or dueling deep Q-network (ddqn)
        self.DOUBLE = DOUBLE  # whether or not to use a 'Double' model (acts as regularization)
        self.TRAINING = True  # set to False during testing so that random actions are not picked
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        print("Agent is using device:\t" + str(self.device))
        '''elif self.MODEL == 'lin_ddqn':
            self.policy_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DuelingDQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'lin_dqn':
            self.policy_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = DQN(self.INPUT_DIM, self.HIDDEN_DIM, self.ACTION_NUMBER).to(self.device)
        '''

        if self.MODEL == 'ddqn':
            self.policy_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDuelingDQN(
                self.INPUT_DIM, self.ACTION_NUMBER).to(self.device)
        elif self.MODEL == 'dqn':
            self.policy_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)
            self.target_net = ConvDQN(self.INPUT_DIM,
                                      self.ACTION_NUMBER).to(self.device)

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(),
                                    lr=self.LEARNING_RATE)
        self.memory = ReplayMemory(self.REPLAY_MEM_SIZE)
        self.steps_done = 0
        self.training_cumulative_reward = []
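The DOUBLE flag above selects between the vanilla and the Double-DQN target; a minimal sketch of the difference (illustrative only, assuming batched tensors and ignoring terminal-state masking, not code from this repository):

import torch

def td_target(rewards, next_states, policy_net, target_net, gamma, double):
    with torch.no_grad():
        if double:
            # Double DQN: the policy net selects the greedy action,
            # the target net evaluates it (reduces overestimation bias).
            next_actions = policy_net(next_states).argmax(dim=1, keepdim=True)
            next_q = target_net(next_states).gather(1, next_actions).squeeze(1)
        else:
            # Vanilla DQN: the target net both selects and evaluates.
            next_q = target_net(next_states).max(dim=1)[0]
    return rewards + gamma * next_q

policy_net, target_net = torch.nn.Linear(24, 3), torch.nn.Linear(24, 3)
print(td_target(torch.zeros(5), torch.randn(5, 24), policy_net, target_net, 0.98, True))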
Example #10
 def __init__(self, name, env):
     self.name = name
     self.env = env
     self.eps = 0.005
     self.max_timesteps = 10000
     self.explore_noise = 0.5
     self.batch_size = 32
     self.discount = 0.99
     self.tau = 0.005
     self.max_episode_steps = 200
     self.memory = ReplayMemory(10000)
Example #11
 def __init__(self,
              num_actions,
              gamma=0.98,
              memory_size=5000,
              batch_size=32):
     self.scaler = None
     self.featurizer = None
     self.q_functions = None
     self.gamma = gamma
     self.batch_size = batch_size
     self.num_actions = num_actions
     self.memory = ReplayMemory(memory_size)
     self.initialize_model()
Example #12
 def __init__(self, model, env, **kwargs):
     Agent.__init__(self, **kwargs)
     self.update_step = 0
     self.eps = self.EPS_START
     self.global_step = 0
     self.model = model
     self.target_model = copy.deepcopy(model)
     self.in_size = model.in_size
     self.out_size = model.out_size
     self.memory = ReplayMemory(self.REPLAY_CAPACITY)
     self.opt = torch.optim.Adam(self.model.parameters(), lr=self.LR)
     self.env = env
     self.container = Container(self.model.SAVE_MODEL_NAME)
Example #13
    def __init__(self, env, policy_network, value_network, alpha=.003, gamma=.99, memory_size=10000, batch_size=64, use_cuda=True):
        DeepAgent.__init__(self, env, alpha, gamma, use_cuda)

        # Network prep
        device              = torch.device('cuda' if use_cuda else 'cpu')
        self.policy_network = policy_network.to(device)
        self.value_network  = value_network.to(device)
        self.policy_opt     = optim.Adam(self.policy_network.parameters(), lr=alpha)
        self.value_opt      = optim.Adam(self.value_network.parameters(), lr=alpha)

        # Experience replay prep
        self.memory         = ReplayMemory(max_size=memory_size)
        self.batch_size     = batch_size
Example #14
 def __init__(self,
              q_models,
              target_model,
              hyperbolic,
              k,
              gamma,
              model_params,
              replay_buffer_size,
              batch_size,
              inp_dim,
              lr,
              no_models,
              act_space,
              hidden_size,
              loss_type,
              target_update=False):
     super(Agent, self).__init__()
     if hyperbolic:
         self.q_models = DQN(state_space_dim=inp_dim,
                             action_space_dim=act_space,
                             hidden=hidden_size,
                             no_models=no_models)
         self.target_models = DQN(state_space_dim=inp_dim,
                                  action_space_dim=act_space,
                                  hidden=hidden_size,
                                  no_models=no_models)
         self.target_models.load_state_dict(self.q_models.state_dict())
         self.target_models.eval()
     else:
         self.q_models = q_models
     self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=lr)
     self.hyperbolic = hyperbolic
     self.n_actions = model_params.act_space
     self.k = k
     # self.gammas = torch.tensor(np.linspace(0, 1, self.q_models.no_models + 1), dtype=torch.float)[1:]
     self.gammas = np.sort(
         np.random.uniform(0, 1, self.q_models.no_models + 1))
     self.gammas = np.append(self.gammas, 0.98)
     self.gammas = torch.tensor(np.sort(self.gammas))
     self.memory = ReplayMemory(replay_buffer_size)
     self.batch_size = batch_size
     self.inp_dim = inp_dim
     self.device = torch.device(
         "cuda:0" if torch.cuda.is_available() else "cpu")
     self.target_models.to(self.device)
     self.q_models.to(self.device)
     self.gammas = self.gammas.to(self.device)
     self.loss_type = loss_type
     self.criterion = nn.MSELoss()
     self.use_target_network = target_update
Example #15
    def __init__(self, device, state_size, action_size, folder, config):

        self.folder = folder
        self.config = config
        self.device = device
        self.memory = ReplayMemory(self.config["MEMORY_CAPACITY"])

        self.state_size = state_size
        self.action_size = action_size

        self.critic = Critic(self.state_size, self.action_size, self.device,
                             self.config)
        self.actor = Actor(self.state_size, self.action_size, self.device,
                           self.config)
Example #16
 def __init__(self, model, env, demo_memory, **kwargs):
     DQNAgent.__init__(self, model, env, **kwargs)
     self.EXPERT_MARGIN = kwargs.pop("expert_margin", 0.8)
     self.DEMO_PER = kwargs.pop("demo_percent", 0.3)
     self.N_STEP = kwargs.pop("n_step", 5)
     self.LAMBDA_1 = kwargs.pop("lambda_1", 0.1)
     self.LAMBDA_2 = kwargs.pop("lambda_2", 0.5)
     self.LAMBDA_3 = kwargs.pop("lambda_3", 0)
     self.memory = ReplayMemory(self.REPLAY_CAPACITY, self.N_STEP,
                                self.GAMMA)
     self.demo_memory = demo_memory
     self.demo_memory.n_step = self.N_STEP
     self.demo_memory.gamma = self.GAMMA
     self.is_pre_train = False
Example #17
    def __init__(self, env, action_list, actors, critics, old_actors,
                 old_critics, args, device):
        self.device = device
        self.env = env
        self.n_players = len(actors)
        self.action_list = action_list
        self.action_space_size = len(action_list)
        self.actors = [actor.to(device) for actor in actors]
        self.critics = [critic.to(device) for critic in critics]
        self.old_actors = [old_actor.to(device) for old_actor in old_actors]
        self.old_critics = [
            old_critic.to(device) for old_critic in old_critics
        ]
        self.old_actors = old_actors
        self.old_critics = old_critics
        # self.max_memory_size = args.max_memory_size
        self.replay_memory = ReplayMemory(max_memory_size=args.max_memory_size)
        self.episodes_before_training = args.episodes_before_training
        self.n_episodes = args.n_episodes
        self.episode_max_length = args.episode_max_length
        self.batch_size = args.batch_size
        self.save_interval = args.save_interval

        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.tau = args.tau

        self.critic_loss = nn.MSELoss()
        self.lr = args.lr
        self.actor_optimizers = [
            Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
            for model in self.actors
        ]
        self.critic_optimizers = [
            Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
            for model in self.critics
        ]

        # save checkpoints
        # self.model_dir = args.model_dir
        # if not os.path.exists(self.model_dir):
        #     os.makedirs(self.model_dir)
        # self.save_interval = args.save_interval

        # log
        self.k = 500  # moving average window size
        self.writer = SummaryWriter(args.log_dir)
        if not os.path.exists(args.log_dir):
            os.makedirs(args.log_dir)
Example #18
 def __init__(self,
              default_reward,
              name,
              color,
              env,
              agent_type,
              features_n,
              memory_capacity,
              init_value=0.0,
              batch_size=64,
              gamma=0.99,
              eps_start=0.9,
              eps_end=0.01,
              eps_decay=50,
              need_reload=False,
              reload_path=None,
              need_exploit=True):
     super(EGreedyAgent, self).__init__((0, 0),
                                        default_reward=default_reward,
                                        color=color,
                                        env=env,
                                        name=name,
                                        default_type=agent_type,
                                        default_value=init_value)
     self.actions_n = env.action_space.n
     # discounted value
     self.gamma = gamma
     self.batch_size = batch_size
     self.eps_start = eps_start
     self.eps_end = eps_end
     self.eps_decay = eps_decay
     self.features_n = features_n
     self.memory_capacity = memory_capacity
     self.memory = ReplayMemory(self.memory_capacity)
     self.steps_count = 0
     self.device = 'cpu'
     # for evaluate Q_value
     self.policy_net = DQN(self.features_n, self.actions_n, 50, 50, 50)
     # evaluate Q_target
     self.target_net = DQN(self.features_n, self.actions_n, 50, 50, 50)
     if need_reload:
         self.restore(reload_path)
      # give the target net the same parameters as the policy net
     self.target_net.eval()
     self.target_net.load_state_dict(self.policy_net.state_dict())
     self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.001)
     self.save_file_path = './model/'
     self.need_exploit = need_exploit
Example #19
def main():
    num_digits = 4
    state_size = 128
    embedding_dim = 8

    model = StateModel(num_digits, state_size, embedding_dim)
    
    env = gym.make("GuessNumEnv-v0")
    
    episodes = 100
    max_episode_len = 100

    replay_memory = ReplayMemory(1000)

    for ep in range(episodes):
        state, reward, done = env.reset()

        state = torch.from_numpy(state)
        action = torch.argmax(model((state[:, :-2].unsqueeze(0).long(), state[:, -2:].unsqueeze(0).float())), dim=-1) + 1 # Plus one because the action is composed of the numbers between 1 and 9
        
        next_state, reward, done = env.step(action.numpy().reshape(-1,))
        t = Transition(state=state, next_state=next_state, reward=reward, action=action)
        env.render()
        print(reward, done)
        break
Example #20
    def __init__(self, args):

        # which Atari environment to load (an OpenAI Gym environment id)
        self.env_id = "PongNoFrameskip-v4"
        # create the environment
        self.env = Environment(self.env_id)

        # part of the q-value formula
        self.discount_factor = 0.99
        self.batch_size = 64
        # how often to update the network (backpropagation)
        self.update_frequency = 4
        # how often to synchronize with the target network
        self.target_network_update_freq = 1000

        # keeps track of the frames for training, and retrieves them in batches 
        self.agent_history_length = 4
        self.memory = ReplayMemory(capacity=10000, batch_size=self.batch_size)

        # two neural networks. One for main and one for target
        self.main_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        self.target_network = PongNetwork(num_actions=self.env.get_action_space_size(), agent_history_length=self.agent_history_length)
        
        # Adam optimizer with standard settings
        self.optimizer = Adam(learning_rate=1e-4, epsilon=1e-6)
        # we start with a high exploration rate then slowly decrease it
        self.init_explr = 1.0
        self.final_explr = 0.1
        self.final_explr_frame = 1000000
        self.replay_start_size = 10000

        # metrics for the loss 
        self.loss = tf.keras.losses.Huber()
        # running mean of the training loss
        self.loss_metric = tf.keras.metrics.Mean(name="loss")
        # running mean of the predicted Q-values
        self.q_metric = tf.keras.metrics.Mean(name="Q_value")

        # maximum number of frames to train for; training will likely stop earlier
        self.training_frames = int(1e7)

        # path to save the checkpoints, logs and the weights
        self.checkpoint_path = "./checkpoints/" + args.run_name
        self.tensorboard_writer = tf.summary.create_file_writer(self.checkpoint_path + "/runs/")
        self.print_log_interval = 10
        self.save_weight_interval = 10
        self.env.reset()
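The exploration comments above describe a rate that starts at init_explr and is slowly decreased towards final_explr over final_explr_frame frames; a minimal sketch of such a linear schedule (an assumed formulation, not code from this repository):

def linear_epsilon(frame, init_explr=1.0, final_explr=0.1, final_explr_frame=1000000):
    # Interpolate linearly from init_explr down to final_explr, then stay flat.
    fraction = min(frame / final_explr_frame, 1.0)
    return init_explr + fraction * (final_explr - init_explr)

print(linear_epsilon(0))        # 1.0
print(linear_epsilon(500000))   # ~0.55
print(linear_epsilon(2000000))  # ~0.1 (clamped at the floor)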
Example #21
 def __init__(self, q_models, target_model, hyperbolic, k, gamma,
              model_params, replay_buffer_size, batch_size, inp_dim, lr):
     super(Agent, self).__init__()
     if hyperbolic:
         self.q_models = torch.nn.ModuleList(q_models)
         self.target_models = torch.nn.ModuleList(target_model)
     else:
         self.q_models = q_models
         self.target_models = target_model
     self.optimizer = optim.RMSprop(self.q_models.parameters(), lr=1e-5)
     self.hyperbolic = hyperbolic
     self.n_actions = model_params.act_space
     self.k = k
     self.gamma = gamma
     self.memory = ReplayMemory(replay_buffer_size)
     self.batch_size = batch_size
     self.inp_dim = inp_dim
Example #22
def generate_memory(size, game='Pendulum'):

    if game.startswith('Pendulum'):
        env = PendulumWrapper()
    elif game.startswith('LunarLander'):
        env = LunarWrapper()

    memory = ReplayMemory(100000)

    for i in range(size):
        s = env.reset()
        a = env.action_space.sample()
        s_, r, d, _ = env.step(a)

        memory.push(s, a, r, s_, 1 - int(d))

    return memory
Example #23
 def __init__(self,
              state_space,
              n_actions,
              replay_buffer_size=50000,
              batch_size=32,
              hidden_size=12,
              gamma=0.98):
     self.n_actions = n_actions
     self.state_space_dim = state_space
     self.policy_net = DQN(state_space, n_actions, hidden_size)
     self.target_net = DQN(state_space, n_actions, hidden_size)
     self.target_net.load_state_dict(self.policy_net.state_dict())
     self.target_net.eval()
     self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=1e-3)
     self.memory = ReplayMemory(replay_buffer_size)
     self.batch_size = batch_size
     self.gamma = gamma
Example #24
    def __init__(self,
                 learning_rate,
                 gamma,
                 state_shape,
                 actions,
                 batch_size,
                 epsilon_initial=0.9,
                 epsilon_decay=1e-3,
                 epsilon_final=0.01,
                 replay_buffer_capacity=1000000,
                 model_name='dqn_model.h5',
                 model_dir='models/dqn_model',
                 ckpt_dir='models/dqn_model/checkpoints',
                 log_dir='logs'):
        """Initialize DQN agent

        Args:
            learning_rate (float): Optimizer learning rate
            gamma (float): Discount factor in Bellman equation
            state_shape (np.shape): Shape of state space of the environment
            actions (int): Number of actions
            batch_size (int): Size of batch from which agent would learn
            epsilon_initial (float): Initial value of epsilon
            epsilon_decay (float): Decay rate of epsilon
            epsilon_final (float): Final value of epsilon after complete decay
            replay_buffer_capacity (int): Maximum size of experience replay
                                          buffer
            model_name (str): Name of the model file to save/load
            model_dir (str): Directory in which model file is stored
            ckpt_dir (str): Model Checkpoint directory
            log_dir (str): Directory where tensorflow logs are stored
        """
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.actions = actions
        self.batch_size = batch_size
        self.epsilon = epsilon_initial
        self.epsilon_decay = epsilon_decay
        self.epsilon_final = epsilon_final
        self.buffer = ReplayMemory(replay_buffer_capacity, state_shape)
        self.q_network = self._get_model()

        self.model_file = f'{model_dir}/{model_name}'
        self.checkpoint_dir = ckpt_dir
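The docstring above describes epsilon decaying from epsilon_initial towards epsilon_final at rate epsilon_decay; one common reading is a per-step subtraction clamped at the floor (a sketch under that assumption, not necessarily how this repository decays it):

def decay_epsilon(epsilon, epsilon_decay=1e-3, epsilon_final=0.01):
    # Subtract a fixed amount per learning step, never going below the final value.
    return max(epsilon - epsilon_decay, epsilon_final)

epsilon = 0.9
for _ in range(1000):
    epsilon = decay_epsilon(epsilon)
print(epsilon)  # clamped at the floor of 0.01 after enough steps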
Example #25
    def __init__(self,
                 policy_cls,
                 env,
                 verbose=0,
                 replay_memory_capacity=100000):
        super(OffPolicyRLModel, self).__init__(policy_cls,
                                               env,
                                               verbose=verbose)

        self.replay_memory = ReplayMemory(capacity=replay_memory_capacity)
Example #26
class TransitionSaver:
    def __init__(self):
        self.processor = PreprocessImage(None)
        self.memory = ReplayMemory()
        self.transitions = []
        self.index = 0
        self.nsteps = 10

    def new_episode(self, first_state):
        self.state = self.processor._observation(first_state)

    def add_transition(self, action, next_state, reward, done):
        if not done and self.index < self.nsteps:
            next_state = self.processor._observation(next_state)
            self.transitions.insert(0, Transition(self.state, self.add_noop(action), next_state, torch.FloatTensor([reward]), torch.zeros(1)))

            transitions = []
            gamma = 1
            for trans in self.transitions:
                transitions.append(trans._replace(n_reward= trans.n_reward + gamma * reward))
                gamma = gamma * GAMMA
            self.transitions = transitions
        else:
            for trans in self.transitions:
                self.memory.push(trans)
            self.transitions = []
        self.state = next_state
    
    def add_noop(self, actions):
        actions.insert(0, 0)
        actions = torch.LongTensor(actions)
        actions[0] = (1 - actions[1:].max(0)[0])[0]
        return actions.max(0)[1]

    def save(self, fname):
        with open(fname, 'wb') as memory_file:
            pickle.dump(self.memory, memory_file)
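TransitionSaver above folds each new reward, discounted by an accumulated gamma, into the most recently buffered transitions; the same n-step bookkeeping can be sketched in isolation as follows (illustrative names and values, not the repository's ReplayMemory):

from collections import deque

GAMMA = 0.99

def nstep_transition(window, gamma=GAMMA):
    # Collapse a window of (state, action, reward, next_state) steps into one
    # n-step transition: first state/action, discounted reward sum, last next_state.
    state, action = window[0][0], window[0][1]
    n_reward = sum(gamma ** i * step[2] for i, step in enumerate(window))
    return state, action, n_reward, window[-1][3]

window = deque(maxlen=3)  # n = 3
for step in [("s0", 0, 1.0, "s1"), ("s1", 1, 0.0, "s2"), ("s2", 0, 2.0, "s3")]:
    window.append(step)
print(nstep_transition(window))  # ('s0', 0, 1.0 + 0.99**2 * 2.0, 's3')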
Example #27
 def __init__(self,
              x,
              y,
              r,
              color,
              agent_type,
              features_n,
              actions_n,
              discounted_value,
              memory_capacity=4096,
              batch_size=512,
              learning_rate=0.0001,
              need_restore=False):
     super(EGreedyAgent, self).__init__(x, y, r, color, agent_type)
     self.gamma = discounted_value
     self.features_n = features_n
     self.actions_n = actions_n
     self.lr = learning_rate
     self.save_file_path = 'model/dqn.pkl'
     self.device = 'cpu'
     self.policy_net = DQNet(self.features_n, self.actions_n)
     self.target_net = DQNet(self.features_n, self.actions_n)
      # give the target net the same parameters as the policy net
     self.target_net.eval()
     self.target_net.load_state_dict(self.policy_net.state_dict())
     self.optimizer = optim.RMSprop(self.policy_net.parameters(),
                                    lr=self.lr)
     self.memory = []
     self.eps_start = 0.9
     self.eps_end = 0.05
     self.eps_decay = 5000
     self.steps_count = 0
     self.batch_size = batch_size
     self.memory = ReplayMemory(memory_capacity)
     self.need_exploit = True
     if need_restore:
         self.restore()
Example #28
 def __init__(self,
              state_space,
              n_actions,
              replay_buffer_size=50000,
              batch_size=32,
              hidden_size=64,
              gamma=0.99):
     self.n_actions = n_actions
     self.state_space_dim = state_space
     self.policy_net = GenericNetwork(state_space,
                                      n_actions,
                                      hidden_size,
                                      name='dqn_network_')
     self.target_net = GenericNetwork(state_space,
                                      n_actions,
                                      hidden_size,
                                      name='target_dqn_network_')
     self.target_net.load_state_dict(self.policy_net.state_dict())
     self.target_net.eval()
     self.memory = ReplayMemory(replay_buffer_size)
     self.batch_size = batch_size
     self.gamma = gamma
     self.action = {}
     self.j = 0
Example #29
def test_arb(arb_env, modules_list, n_epi=250, max_steps=500):
    s_dim, a_dim = 16, 4
    n_modules = len(modules_list)

    pi_tensors = get_pi(modules_list)
    arb = Arbitrator().to(device)
    returns = []
    all_rets = []
    memory = ReplayMemory(10000)
    for epi in range(n_epi):
        arb_env.reset()
        r_list = []
        steps = 0
        while steps < max_steps:
            state = get_state_vector(arb_env.cur_state)
            coeff = arb(state)
            pi_k = torch.zeros(s_dim, a_dim)
            for m in range(n_modules):
                pi_k += coeff[0][m] * pi_tensors[m]
            a = np.random.choice(
                4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
            s, a, s_, r, done = arb_env.step(a)
            r_list.append(r)
            reward = torch.FloatTensor([r], device=device)
            next_state = get_state_vector(s_)
            steps += 1
            memory.push(state, torch.FloatTensor([a], device=device),
                        next_state, reward)

            if done:
                state = get_state_vector(arb_env.cur_state)
                coeff = arb(state)
                pi_k = torch.zeros(s_dim, a_dim)
                for m in range(n_modules):
                    pi_k += coeff[0][m] * pi_tensors[m]

                a = np.random.choice(
                    4, p=pi_k[arb_env.cur_state].detach().cpu().numpy())
                # state = get_state_vector(arb_env.cur_state)
                next_state = state
                r = 100.
                steps += 1
                reward = torch.FloatTensor([r], device=device)
                r_list.append(r)
                memory.push(state, torch.FloatTensor([a], device=device),
                            next_state, reward)
                break

        rets = []
        return_so_far = 0
        for t in range(len(r_list) - 1, -1, -1):
            return_so_far = r_list[t] + 0.9 * return_so_far
            rets.append(return_so_far)
        # The returns are stored backwards in time, so we need to reverse them
        rets = list(reversed(rets))
        all_rets.extend(rets)
        print("epi {} over".format(epi))
        if epi % 7 == 0:
            arb.optimize(memory, pi_tensors, torch.FloatTensor(all_rets))
            all_rets = []
            memory = ReplayMemory(10000)
        returns.append(sum(r_list))

    return returns
Example #30
def run_dq_pole(num_episodes):
    logg = logging.getLogger(f"c.{__name__}.run_dq_pole")
    logg.debug(f"Start run_dq_pole")

    env = gym.make("CartPole-v0").unwrapped

    plt.ion()

    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logg.debug(f"Using {device} as device")

    #  show_frame(env)

    # hyperparameters
    BATCH_SIZE = 128
    GAMMA = 0.999
    EPS_START = 0.9
    EPS_END = 0.05
    EPS_DECAY = 200
    TARGET_UPDATE = 10

    env.reset()
    # Get screen size so that we can initialize layers correctly based on shape
    # returned from AI gym. Typical dimensions at this point are close to 3x40x90
    # which is the result of a clamped and down-scaled render buffer in get_screen()
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net = DQN(screen_height, screen_width, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)

    steps_done = 0

    # main training loop. At the beginning we reset the environment and
    # initialize the state Tensor. Then, we sample an action, execute it,
    # observe the next screen and the reward (always 1), and optimize our model
    # once. When the episode ends (our model fails), we restart the loop.

    #  num_episodes = 50
    episode_durations = []

    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action
            action = select_action(
                state,
                n_actions,
                steps_done,
                device,
                policy_net,
                EPS_START,
                EPS_END,
                EPS_DECAY,
            )
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe new state
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory
            memory.push(state, action, next_state, reward)

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the target network)
            optimize_model(BATCH_SIZE, memory, device, policy_net, target_net,
                           GAMMA, optimizer)
            if done:
                episode_durations.append(t + 1)
                plot_durations(episode_durations)
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

    print("Complete")
    env.render()
    # remember to close the env to avoid 'sys.meta_path is None' errors at interpreter shutdown
    env.close()
    plt.ioff()
    plt.show()
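All of these examples assume a ReplayMemory with a push/sample interface; a minimal deque-based sketch in the style of the PyTorch DQN tutorial is shown below (an assumption about the interface, the individual repositories differ in details such as stored fields and n-step support):

import random
from collections import deque, namedtuple

Transition = namedtuple("Transition", ("state", "action", "next_state", "reward"))

class ReplayMemory:
    def __init__(self, capacity):
        # Oldest transitions are evicted automatically once capacity is reached.
        self.buffer = deque(maxlen=capacity)

    def push(self, *args):
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        # Uniform sampling breaks the temporal correlation between consecutive steps.
        return random.sample(self.buffer, batch_size)

    def __len__(self):
        return len(self.buffer)

memory = ReplayMemory(10000)
memory.push("s", 0, "s_next", 1.0)
print(len(memory))  # 1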
Example #31
    # memory settings
    max_memory_size = 100000
    min_memory_size = 1000  # number needed before model training starts

    epsilon_rate = (epsilon - epsilon_min) / epsilon_steps

    # PLE takes our game and the state_preprocessor. It will process the state for our agent.
    game = Catcher(width=128, height=128)
    env = PLE(game, fps=60, state_preprocessor=nv_state_preprocessor)

    agent = Agent(env, batch_size, num_frames, frame_skip, lr,
                  discount, rng, optimizer="sgd_nesterov")
    agent.build_model()

    memory = ReplayMemory(max_memory_size, min_memory_size)

    env.init()

    for epoch in range(1, num_epochs + 1):
        steps, num_episodes = 0, 0
        losses, rewards = [], []
        env.display_screen = False

        # training loop
        while steps < num_steps_train:
            episode_reward = 0.0
            agent.start_episode()

            while not env.game_over() and steps < num_steps_train:
                state = env.getGameState()