    def __init__(self,
                 actions,
                 name="Q-learning",
                 alpha=0.1,
                 gamma=0.99,
                 epsilon=0.1,
                 explore="uniform",
                 anneal=False):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes the explore policy.
            anneal (bool): If True, anneal the learning rate and exploration term over time.
        '''
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self,
                       name=name + name_ext,
                       actions=actions,
                       gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = 0  # Optimistic alternative: 1 / (1 - self.gamma)
        self.explore = explore

        # Q Function:
        self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
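
For context, this constructor is the tabular Q-learning agent from simple_rl. A minimal usage sketch, assuming the enclosing class is QLearningAgent and four string-valued grid actions (both assumptions, since only the __init__ is shown above):

from simple_rl.agents import QLearningAgent  # assumed import path

agent = QLearningAgent(actions=["up", "down", "left", "right"],
                       alpha=0.05,
                       epsilon=0.2,
                       explore="softmax",  # name becomes "Q-learning-softmax"
                       anneal=True)

# Unseen (state, action) pairs fall back to default_q via the nested defaultdict:
print(agent.q_func["s0"]["up"])  # -> 0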

Example 2

    def __init__(self,
                 states,
                 state_map,
                 actions,
                 par_tensor,
                 times,
                 gamma=0.95,
                 horizon=2,
                 name="Optimal",
                 greedy=False):
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        self.states = states
        self.state_map = state_map
        self.horizon = horizon
        self.greedy = greedy
        self.times = times
        self.par_tensor = par_tensor
        self.reset()
        print(self.par_tensor)  # Debug: inspect the learned parameter tensor.
        self.policy = defaultdict(type(self.actions[0]))  # Defaults to the action type's empty value.
        self.update_all()

Example 3
 def __init__(self, actions, gamma=0.95, horizon=3, s_a_threshold=2, name="RMax-h"):
     name = name + str(horizon) if name[-2:] == "-h" else name  # e.g. "RMax-h" -> "RMax-h3"
     Agent.__init__(self, name=name, actions=actions, gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.s_a_threshold = s_a_threshold
     self.reset()

Example 4
 def __init__(self, policy, name=NAME):
     '''
     Args:
         policy (func): A function mapping states to actions (S -> A).
     '''
     Agent.__init__(self, name=name, actions=[])
     self.policy = policy
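
Because policy is just a function from states to actions, instantiation is a one-liner. A sketch, assuming the enclosing class is named FixedPolicyAgent (an assumption; only the __init__ is shown):

agent = FixedPolicyAgent(policy=lambda state: "right")  # always act "right"
print(agent.policy("s0"))  # -> "right"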

Example 5
    def __init__(self,
                 states,
                 state_map,
                 actions,
                 times,
                 gamma=0.95,
                 horizon=3,
                 s_a_threshold=2,
                 name="RMax",
                 greedy=False):
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
        self.rmax = 1.0
        self.states = states
        self.state_map = state_map
        self.horizon = horizon
        self.s_a_threshold = s_a_threshold
        self.greedy = greedy
        self.reset()
        self.times = 0
        self.max_times = times

        s_len = len(self.states)
        shape = (len(self.actions), s_len, s_len + 1)  # one s x (s + 1) slice per action
        self.par_tensor = np.zeros(shape)
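
R-Max explores through optimism: a (state, action) pair is only trusted after s_a_threshold visits, and untrusted pairs are valued as if they paid rmax forever. A standalone sketch of that default value and known-ness test (the counts dict is hypothetical; the agent above tracks visits in its own structures):

rmax, gamma, s_a_threshold = 1.0, 0.95, 2

# Receiving rmax on every step, discounted by gamma, sums to a geometric series:
optimistic_q = rmax / (1 - gamma)  # = 20.0 for these values

def is_known(counts, state, action, threshold=s_a_threshold):
    # A pair counts as "known" once it has been sampled enough times.
    return counts.get((state, action), 0) >= threshold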

Example 6

 def end_of_episode(self):
     '''
     Summary:
         Resets the agent's prior pointers.
     '''
     if self.anneal:
         self._anneal()
     Agent.end_of_episode(self)

Example 7
 def __init__(self,
              actions,
              name="LinUCB",
              rand_init=True,
              context_size=1,
              alpha=1.5):
     '''
     Args:
         actions (list): Contains a string for each action.
         name (str): Denotes the name of the agent.
         rand_init (bool): If True, randomly initialize the per-action model.
         context_size (int): Dimensionality of the context vectors.
         alpha (float): Uncertainty parameter.
     '''
     Agent.__init__(self, name, actions)
     self.alpha = alpha
     self.context_size = context_size
     self.prev_context = None
     self.step_number = 0
     self.rand_init = rand_init
     self._init_action_model(rand_init)
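
For reference, the score LinUCB assigns each action is the ridge-regression estimate plus an upper-confidence bonus (Li et al., 2010). A self-contained sketch of scoring one action; A and b are the usual per-action LinUCB statistics, not variables from the snippet above:

import numpy as np

def linucb_score(A, b, x, alpha=1.5):
    # A: d x d design matrix (I + sum of x x^T); b: reward-weighted context sum;
    # x: the current d-dimensional context.
    A_inv = np.linalg.inv(A)
    theta = A_inv.dot(b)  # ridge-regression coefficient estimate
    bonus = alpha * np.sqrt(x.dot(A_inv).dot(x))  # confidence width
    return theta.dot(x) + bonus

d = 3
A, b = np.eye(d), np.zeros(d)  # fresh statistics for a single action
x = np.array([1.0, 0.5, -0.2])
print(linucb_score(A, b, x))  # bonus dominates before any updates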

Example 8

 def __init__(self, states, state_map, actions, use_tensor=True, rank=2, mu=0.1,
              gamma=0.95, horizon=3, s_a_threshold=2, rho=0.7, beta=0.2, name="tensor", 
              greedy=True, strict=True, origin_tensor=None, os=False):
     Agent.__init__(self, name=name, actions=actions, gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.states = states
     self.state_map = state_map
     self.s_a_threshold = s_a_threshold
     self.use_tensor = use_tensor
     self.rank = rank
     self.mu = mu
     self.greedy = greedy
     self.strict = strict
     self.rho = rho
     self.beta = beta
     self.reset()
     self.times = 0
     self.origin_tensor = origin_tensor
     self.os = os

Example 9

    def __init__(self, actions, init_q=None, name="Delayed-Q", gamma=0.99, m=5, epsilon1=0.1):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            init_q (2d list): Initial Q function. AU(s, a) in Strehl et al., 2006.
            name (str): Denotes the name of the agent.
            gamma (float): Discount factor.
            m (int): Number of samples required before an attempted update.
            epsilon1 (float): Update bonus term (epsilon_1 in Strehl et al., 2006).
        '''
        # Set initial q func.
        self.rmax = 1  # TODO: set/get function
        init_q = defaultdict(lambda : defaultdict(lambda: self.rmax / (1 - gamma))) if init_q is None else init_q

        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.step_number = 0

        # TODO: Here we assume that init_q has a Q-value for every (s, a) pair.
        self.q_func = copy.deepcopy(init_q)
        self.default_q_func = copy.deepcopy(init_q)

        self.AU = defaultdict(lambda: defaultdict(lambda: 0.0))  # used for attempted updates
        self.l = defaultdict(lambda: defaultdict(lambda: 0))  # counters
        self.b = defaultdict(lambda: defaultdict(lambda: 0))  # beginning timestep of attempted update
        self.LEARN = defaultdict(lambda: defaultdict(lambda: True))  # whether learning is allowed for (s, a)
        for x in init_q:
            for y in init_q[x]:
                self.AU[x][y] = 0.0  # AU(s, a) <- 0
                self.l[x][y] = 0  # l(s, a) <- 0
                self.b[x][y] = 0  # b(s, a) <- 0
                self.LEARN[x][y] = False

        # TODO: Add code to calculate m and epsilon1 from epsilon and delta.
        # m and epsilon1 should be set according to epsilon and delta in order to be PAC-MDP.
        self.m = m
        self.epsilon1 = epsilon1
        
        self.tstar = 0  # time of most recent action value change
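
The AU/l counters above drive the attempted-update rule of Delayed Q-learning (Strehl et al., 2006): accumulate m sampled targets, then lower Q(s, a) only when the averaged evidence beats the current estimate by at least 2 * epsilon1. A sketch of that core rule, omitting the LEARN/b/tstar bookkeeping that gates when attempts may begin (function and argument names are mine):

def attempted_update(q, AU, l, m, epsilon1, s, a, target):
    # target = r + gamma * max_a' q[s'][a'], supplied by the caller.
    AU[s][a] += target
    l[s][a] += 1
    if l[s][a] == m:
        # Only lower the (optimistic) estimate when the drop is significant.
        if q[s][a] - AU[s][a] / m >= 2 * epsilon1:
            q[s][a] = AU[s][a] / m + epsilon1
        AU[s][a], l[s][a] = 0.0, 0  # start the next attempt from scratch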

Example 10

    def __init__(self, actions, n_states, sess, name="actor-critic"):
        name = "policy_gradient" if name == "" else name
        Agent.__init__(self, name=name, actions=actions)

        self.reset()
        self.sess = sess
        self.n_states = n_states
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = .995
        self.gamma = .95
        self.tau = .125

        self.memory = deque(maxlen=2000)
        self.actor_state_input, self.actor_model = self.create_actor_model()
        _, self.target_actor_model = self.create_actor_model()

        self.actor_critic_grad = tf.placeholder(
            tf.float32, [None, len(self.actions)])  # where we will feed de/dC (from critic)

        actor_model_weights = self.actor_model.trainable_weights
        self.actor_grads = tf.gradients(
            self.actor_model.output, actor_model_weights,
            -self.actor_critic_grad)  # dC/dA (from actor)
        grads = list(zip(self.actor_grads, actor_model_weights))
        self.optimize = tf.train.AdamOptimizer(
            self.learning_rate).apply_gradients(grads)

        self.critic_state_input, self.critic_action_input, \
         self.critic_model = self.create_critic_model()

        _, _, self.target_critic_model = self.create_critic_model()

        self.critic_grads = tf.gradients(
            self.critic_model.output, self.critic_action_input
        )  # where we calculate de/dC for feeding above

        # Initialize for later gradient calculations
        self.sess.run(tf.global_variables_initializer())
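
The tau set above is the soft-update coefficient for the target networks: each training step, the target weights are nudged toward the online weights by Polyak averaging. A sketch with Keras-style models (this helper is an assumption, not shown in the snippet):

def soft_update_target(model, target_model, tau=0.125):
    # target <- tau * online + (1 - tau) * target
    new_weights = [tau * w + (1.0 - tau) * tw
                   for w, tw in zip(model.get_weights(),
                                    target_model.get_weights())]
    target_model.set_weights(new_weights)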

Example 11

 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
     Agent.reset(self)

Example 12

 def __init__(self, actions, name=""):
     name = "Random" if name == "" else name
     Agent.__init__(self, name=name, actions=actions)

Example 13

 def end_of_episode(self):
     '''
     Summary:
         Resets the agent's prior pointers.
     '''
     Agent.end_of_episode(self)

Example 14

 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     self.q_func = copy.deepcopy(self.default_q_func)
     Agent.reset(self)

Example 15

 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     self.q_funcs = {"A": defaultdict(lambda: defaultdict(lambda: self.default_q)),
                     "B": defaultdict(lambda: defaultdict(lambda: self.default_q))}
     Agent.reset(self)
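
The paired "A"/"B" tables are the hallmark of Double Q-learning (van Hasselt, 2010): each step updates one randomly chosen table, using its greedy action but the other table's value, which de-biases the max. A sketch of that update (names are mine; the snippet above only shows the reset):

import random

def double_q_update(q_funcs, s, a, r, s_next, actions, alpha=0.1, gamma=0.99):
    # Pick the table to update; evaluate the greedy action with the other one.
    which, other = ("A", "B") if random.random() < 0.5 else ("B", "A")
    best_a = max(actions, key=lambda ap: q_funcs[which][s_next][ap])
    target = r + gamma * q_funcs[other][s_next][best_a]
    q_funcs[which][s][a] += alpha * (target - q_funcs[which][s][a])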