Example #1
    def __init__(self,
                 actions,
                 name=NAME,
                 learning_rate=1e-4,
                 x_dim=21,
                 y_dim=16,
                 eps_start=1.0,
                 eps_decay=0.0000001,
                 eps_end=0.1,
                 num_channels=3,
                 should_train=True,
                 from_checkpoint=None,
                 player_id=1):
        Agent.__init__(self, name=name, actions=[])
        self.learning_rate = learning_rate
        self.x_dim, self.y_dim = x_dim, y_dim
        self.actions, self.num_actions = actions, len(actions)
        self.hidden_layers = [32, 32]
        self.mainQN = QNetwork(learning_rate=self.learning_rate,
                               num_actions=self.num_actions,
                               x_dim=self.x_dim,
                               y_dim=self.y_dim,
                               num_channels=num_channels)
        self.targetQN = QNetwork(learning_rate=self.learning_rate,
                                 num_actions=self.num_actions,
                                 x_dim=self.x_dim,
                                 y_dim=self.y_dim,
                                 num_channels=num_channels)
        self.sess = tf.Session()
        self.experience_buffer = ExperienceBuffer(buffer_size=10e5)
        self.prev_state, self.prev_action = None, None
        self.epsilon, self.epsilon_decay, self.epsilon_end = eps_start, eps_decay, eps_end
        self.curr_step, self.total_steps = 0, 0
        self.curr_episode = 0
        self.update_freq = 100
        self.batch_size = 32
        self.update_target = 100
        self.should_train = should_train
        self.should_save, self.save_every = True, 100000
        self.print_loss, self.print_every = True, 10000
        self.saver = tf.train.Saver()
        self.action_counts = np.zeros(self.num_actions)
        # Parameters for updating target network.
        tau = 0.001

        # TODO: Update to support player_id > 2.
        # NOTE: This is a bit of a hack to update the variables in the target
        # network. It can be fixed by using variable scopes and TensorFlow >= 1.4,
        # whose tf.trainable_variables() accepts a scope argument.
        if player_id == 2:
            vs = tf.trainable_variables()
            self.target_ops = updateTargetGraph(vs[len(vs) // 2:], tau)
        else:
            self.target_ops = updateTargetGraph(tf.trainable_variables(), tau)
        self.sess.run(tf.global_variables_initializer())

        # Load model from a checkpoint
        if from_checkpoint is not None:
            self.saver.restore(self.sess, from_checkpoint)
            print('Restored model from checkpoint: {}'.format(from_checkpoint))
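The updateTargetGraph helper called above is not part of the snippet. Below is a minimal sketch of how such a soft-update builder is commonly written for this TF1 style of code, assuming the main network's variables come before the target network's in the list that is passed in; the updateTarget companion is likewise an illustrative assumption, not the original source.

import tensorflow as tf

def updateTargetGraph(tf_vars, tau):
    # Assumes the first half of tf_vars belongs to the main network and the
    # second half to the target network.
    total_vars = len(tf_vars)
    op_holder = []
    for idx, var in enumerate(tf_vars[0:total_vars // 2]):
        target_var = tf_vars[idx + total_vars // 2]
        op_holder.append(target_var.assign(
            tau * var.value() + (1.0 - tau) * target_var.value()))
    return op_holder

def updateTarget(op_holder, sess):
    # Run periodically (e.g. every update_target steps) to move the target
    # network a fraction tau toward the main network.
    for op in op_holder:
        sess.run(op)
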
Example #2
 def __init__(self, actions, gamma=0.95, horizon=3, s_a_threshold=2, name="RMax-h"):
     name = name + str(horizon) if name[-2:] == "-h" else name
     Agent.__init__(self, name=name, actions=actions, gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.s_a_threshold = s_a_threshold
     self.reset()
Example #3
    def __init__(self, actions, name=NAME, learning_rate=1e-4,  x_dim=210, y_dim=160, eps_start=1.0, eps_decay=0.0000001, eps_end=0.1, num_channels=3, should_train=True, from_checkpoint=None, player_id=1):
        Agent.__init__(self, name=name, actions=[])
        self.learning_rate = learning_rate
        self.x_dim, self.y_dim = x_dim, y_dim
        self.actions, self.num_actions = actions, len(actions)
        self.hidden_layers = [32, 32]
        self.num_channels = num_channels
        self.eps_start, self.epsilon_decay, self.epsilon_end = eps_start, eps_decay, eps_end
        self.should_train = should_train
        self.reset()

        # Parameters for updating target network.
        tau = 0.001

        # TODO: Update to support player_id > 2.
        # NOTE: This is a bit of a hack to update the variables in the target
        # network. It can be fixed by using variable scopes and TensorFlow >= 1.4,
        # whose tf.trainable_variables() accepts a scope argument.
        if player_id == 2:
            vs = tf.trainable_variables()
            self.target_ops = update_target_graph(vs[len(vs)//2:], tau)
        else:
            self.target_ops = update_target_graph(tf.trainable_variables(), tau)

        # Load model from a checkpoint
        if from_checkpoint is not None:
            self.saver.restore(self.sess, from_checkpoint)
            print('Restored model from checkpoint: {}'.format(from_checkpoint))
    def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False, custom_q_init=None, default_q=0):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
            custom_q_init (defaultdict{state, defaultdict{action, float}}): a dictionary of dictionaries storing the initial q-values. Can be used for potential shaping (Wiewiora, 2003)
            default_q (float): the default value to initialize every entry in the q-table with [by default, set to 0.0]
        '''
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = default_q # 0 # 1 / (1 - self.gamma)
        self.explore = explore
        self.custom_q_init = custom_q_init

        # Q Function:
        if self.custom_q_init:
            self.q_func = self.custom_q_init
        else:
            self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
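As the docstring notes, custom_q_init can seed the Q-table for potential-based shaping (Wiewiora, 2003). A short usage sketch; the enclosing class is assumed to be a QLearningAgent, and the states and potential function below are placeholders.

from collections import defaultdict

def potential(state):
    # Hypothetical shaping potential over states.
    return 1.0 if state == "near_goal" else 0.0

actions = ["up", "down", "left", "right"]
shaped_q = defaultdict(lambda: defaultdict(float))
for s in ["start", "near_goal"]:
    for a in actions:
        shaped_q[s][a] = potential(s)

# Unseeded states fall back to the defaultdict's 0.0, matching default_q=0.
# agent = QLearningAgent(actions, custom_q_init=shaped_q, default_q=0.0)
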
Example #5
    def __init__(self,
                 actions,
                 name="Q-learning",
                 alpha=0.1,
                 gamma=0.99,
                 epsilon=0.1,
                 explore="uniform",
                 anneal=False):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
        '''
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self,
                       name=name + name_ext,
                       actions=actions,
                       gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = 0  #1 / (1 - self.gamma)
        self.explore = explore

        # Q Function:
        self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
Example #6
    def __init__(self,
                 states,
                 state_map,
                 actions,
                 gamma=0.95,
                 horizon=3,
                 name="FMRL",
                 thres_sm=5,
                 thres_lg=10,
                 t1=6,
                 model_gap=0.4,
                 greedy=False,
                 xi=0.2):
        name = name
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        self.horizon = horizon
        self.changed1 = False
        self.changed2 = False
        self.thres_sm = thres_sm
        self.thres_lg = thres_lg
        self.epsilon = 0.3
        self.states = states
        self.state_map = state_map
        self.greedy = greedy
        self.t1 = t1
        self.xi = xi
        self.model_gap = model_gap
        self.has_incorp = False  # for phase 2: whether past groups have already been incorporated
        self.single_agent = None
        self.groups = []
        self.flag = []
        self.count = -1  # how many tasks we have learned
        self.reset()
Example #7
    def __init__(self,
                 actions,
                 default_q=1.0 / (1.0 - 0.99),
                 name="Updating-delayed-Q-learning",
                 gamma=0.99,
                 m=1,
                 epsilon1=0.1,
                 qstar_transfer=False,
                 num_sample_tasks=20,
                 sample_with_q=False):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            default_q (float): Default (initial) Q-value used for every (s, a) pair.
            name (str): Denotes the name of the agent.
            gamma (float): discount factor
            m (float): Number of samples for updating Q-value
            epsilon1 (float): Learning rate
        '''
        # name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
        self.rmax = 1  # TODO: set/get function

        # Set/initialize parameters and other relevant classwide data
        self.step_number = 0

        # TODO: Here we assume that init_q has Qvalue for every (s, a) pair.
        self.q_func = defaultdict(lambda: defaultdict(lambda: default_q))
        self.init_q_func = defaultdict(lambda: defaultdict(lambda: default_q))

        self.AU = defaultdict(lambda: defaultdict(lambda: 0.0))  # accumulators for attempted updates
        self.l = defaultdict(lambda: defaultdict(lambda: 0))  # sample counters
        self.b = defaultdict(lambda: defaultdict(lambda: 0))  # beginning timestep of attempted update
        self.LEARN = defaultdict(lambda: defaultdict(lambda: False))  # whether updates are attempted for (s, a)
        # for x in init_q:
        #     for y in init_q[x]:
        #         self.AU[x][y] = 0.0  # AU(s, a) <- 0
        #         self.l[x][y] = 0  # l(s, a) <- 0
        #         self.b[x][y] = 0  # b(s, a) <- 0
        #         self.LEARN[x][y] = False

        # TODO: Add a code to calculate m and epsilon1 from epsilon and delta.
        # m and epsilon1 should be set according to epsilon and delta in order to be PAC-MDP.
        self.m = m
        self.epsilon1 = epsilon1

        self.tstar = 0  # time of most recent action value change
        self.task_number = 0
        self.default_q = default_q
        self.num_sample_tasks = num_sample_tasks
        self.qstar_transfer = qstar_transfer
        self.sample_with_q = sample_with_q

        if self.sample_with_q:
            self.q_agent = QLearningAgent(actions,
                                          gamma=self.gamma,
                                          default_q=self.default_q)
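The update rule that uses AU, l, b and LEARN is not shown here. Below is a rough sketch of the attempted-update bookkeeping from Strehl et al. (2006) that these structures support; the function name, the argument layout, and the exact handling of b(s, a) relative to tstar are illustrative and vary slightly between presentations.

def delayed_q_step(q, AU, l, b, LEARN, s, a, target, t, tstar, m, epsilon1):
    # target = r + gamma * max_a' q[s'][a'] for the observed transition.
    if not LEARN[s][a]:
        if b[s][a] < tstar:
            LEARN[s][a] = True  # some Q-value changed since the last failed attempt
        return tstar
    if l[s][a] == 0:
        b[s][a] = t
    AU[s][a] += target
    l[s][a] += 1
    if l[s][a] == m:
        if q[s][a] - AU[s][a] / m >= 2 * epsilon1:
            q[s][a] = AU[s][a] / m + epsilon1  # successful attempted update
            tstar = t
        elif b[s][a] >= tstar:
            LEARN[s][a] = False  # stop attempting until another Q-value changes
        AU[s][a], l[s][a] = 0.0, 0
    return tstar
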
Example #8
    def __init__(self,
                 actions,
                 states,
                 reward_func,
                 initial_safe_states,
                 initial_safe_actions,
                 similarity_function,
                 analagous_state_function,
                 transition_support_function=None,
                 gamma=0.99,
                 vi_horizon=100,
                 name='safe-agent',
                 beta_T=0.5,
                 tau=0.1,
                 update_frequency=100,
                 use_sparse_matrices=False):

        self.use_sparse_matrices = use_sparse_matrices
        self.gamma = gamma
        self.vi_horizon = vi_horizon
        self.beta_T = beta_T
        self.tau = tau
        self.update_frequency = update_frequency

        self.states = states
        self.num_states = len(states)
        self.actions = actions
        self.num_actions = len(actions)
        self.s0 = None

        self.step_number = 0

        self.state_to_id = dict()
        for s_id, state in enumerate(self.states):
            self.state_to_id[state] = s_id
        self.action_to_id = dict()
        for i, a in enumerate(self.actions):
            self.action_to_id[a] = i
        self.initial_safe_sa = np.zeros([self.num_states, self.num_actions],
                                        dtype=bool)
        for state in initial_safe_states:
            s = self.state_to_id[state]
            for action in initial_safe_actions(state):
                a = self.action_to_id[action]
                self.initial_safe_sa[s, a] = 1

        self.transition_table = AnalogousStateTransitionTable(
            actions=actions,
            similarity_function=similarity_function,
            analagous_state_function=analagous_state_function,
            initial_safe_sa=self.initial_safe_sa,
            reward_func=reward_func,
            states=states,
            support_function=transition_support_function,
            use_sparse_matrices=self.use_sparse_matrices,
            beta_T=beta_T)

        self.z_safe = self.initial_safe_sa

        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
Example #9
 def __init__(self, actions, gamma=0.95, horizon=3, s_a_threshold=2, name="RMax-h"):
     name = name + str(horizon) if name[-2:] == "-h" else name
     Agent.__init__(self, name=name, actions=actions, gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.s_a_threshold = s_a_threshold
     self.reset()
Example #10
    def __init__(self, actions, name=NAME, learning_rate=1e-4,  x_dim=210, y_dim=160, eps_start=1.0, eps_decay=0.0000001, eps_end=0.1, num_channels=3, should_train=True, from_checkpoint=None, player_id=1):
        Agent.__init__(self, name=name, actions=[])
        self.learning_rate = learning_rate
        self.x_dim, self.y_dim = x_dim, y_dim
        self.actions, self.num_actions = actions, len(actions)
        self.hidden_layers = [32, 32]
        self.num_channels = num_channels
        self.eps_start, self.epsilon_decay, self.epsilon_end = eps_start, eps_decay, eps_end
        self.should_train = should_train
        self.reset()

        # Parameters for updating target network.
        tau = 0.001

        # TODO: Update to support player_id > 2.
        # NOTE: This is a bit of a hack to update the variables in the target
        # network. It can be fixed by using variable scopes and TensorFlow >= 1.4,
        # whose tf.trainable_variables() accepts a scope argument.
        if player_id == 2:
            vs = tf.trainable_variables()
            self.target_ops = update_target_graph(vs[len(vs)//2:], tau)
        else:
            self.target_ops = update_target_graph(tf.trainable_variables(), tau)

        # Load model from a checkpoint
        if from_checkpoint is not None:
            self.saver.restore(self.sess, from_checkpoint)
            print('Restored model from checkpoint: {}'.format(from_checkpoint))
Example #11
 def __init__(self, policy, name=NAME):
     '''
     Args:
         policy (func: S ---> A)
     '''
     Agent.__init__(self, name=name, actions=[])
     self.policy = policy
Example #12
    def __init__(self,
                 sess=None,
                 obs_dim=None,
                 num_actions=0,
                 buffer_size=100000,
                 gamma=0.99,
                 epsilon=0.05,
                 learning_rate=0.001,
                 tau=0.001,
                 conv=False,
                 name=NAME):
        Agent.__init__(self, name=name, actions=range(num_actions))

        if sess is None:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
        else:
            self.sess = sess

        self.obs_dim = obs_dim
        self.num_actions = num_actions
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.learning_rate = learning_rate
        self.tau = tau

        self.update_freq = 1
        self.batch_size = 64

        self.conv = conv

        self.mainQ = QNetwork(sess=self.sess,
                              learning_rate=self.learning_rate,
                              obs_dim=self.obs_dim,
                              num_actions=self.num_actions,
                              conv=self.conv,
                              name=name + "_main_q")
        self.targetQ = QNetwork(sess=self.sess,
                                learning_rate=self.learning_rate,
                                obs_dim=self.obs_dim,
                                num_actions=self.num_actions,
                                conv=self.conv,
                                name=name + "_target_q")

        self.network_params = tf.trainable_variables(scope=self.name + "_main_q")
        self.target_network_params = tf.trainable_variables(scope=self.name + "_target_q")
        self.update_target_params = [
            self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) +
                tf.multiply(self.target_network_params[i], 1.0 - self.tau))
            for i in range(len(self.target_network_params))
        ]

        self.saver = tf.train.Saver(self.network_params +
                                    self.target_network_params)

        self.reset()
Example #13
 def end_of_episode(self):
     '''
     Summary:
          Resets the agent's prior pointers.
     '''
     if self.anneal:
         self._anneal()
     Agent.end_of_episode(self)
Example #14
 def __init__(self, name, actions, gamma=0.99):
     '''
     Args:
         name (str)
         actions (list)
          gamma (float)
     '''
     Agent.__init__(self, name, actions, gamma)
Example #15
 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     if self.custom_q_init:
         self.q_func = self.custom_q_init
     else:
         self.q_func = defaultdict(lambda : defaultdict(lambda: self.default_q))
     Agent.reset(self)
Example #16
 def __init__(self, name, actions, gamma=0.99):
     '''
     Args:
         name (str)
         actions (list)
          gamma (float)
     '''
     Agent.__init__(self, name, actions, gamma)
Example #17
 def end_of_episode(self):
     '''
     Summary:
          Resets the agent's prior pointers.
     '''
     if self.anneal:
         self._anneal()
     Agent.end_of_episode(self)
Example #18
 def __init__(self, actions, gamma=0.95, horizon=4, s_a_threshold=1):
     Agent.__init__(self,
                    name="rmax-h" + str(horizon),
                    actions=actions,
                    gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.s_a_threshold = s_a_threshold
     self.reset()
Example #19
 def reset(self, mdp=None):
     self.step_number = 0
     self.episode_number = 0
     if mdp is not None:
         self.update_init_q_function(mdp)
     if self.task_number < self.num_sample_tasks:
         self.q_func = defaultdict(
             lambda: defaultdict(lambda: self.default_q))
     else:
         self.q_func = copy.deepcopy(self.default_q_func)
     self.task_number = self.task_number + 1
     Agent.reset(self)
Example #20
 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     # print "#####################################"
     # print "Reset", self.name, "Q-function"
     # # print self.q_func
     # for x in self.q_func:
     #     print (x)
     #     for y in self.q_func[x]:
     #         print (y, ':', self.q_func[x][y])
     self.update_init_q_function()
     self.q_func = copy.deepcopy(self.init_q_func)
     Agent.reset(self)
Example #21
    def __init__(self,
                 sess=None,
                 obs_dim=None,
                 action_dim=None,
                 action_bound=None,
                 buffer_size=100000,
                 batch_size=64,
                 name=NAME,
                 actor_rate=0.0001,
                 critic_rate=0.001,
                 tau=0.001,
                 should_train=True,
                 from_checkpoint=None,
                 gamma=0.99):
        # TODO: Use a shared experience buffer?

        Agent.__init__(self, name=name, actions=[])

        assert (type(obs_dim) is int)
        assert (type(action_dim) is int)
        assert (action_bound is not None)
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.buffer_size = buffer_size

        self.gamma = gamma

        self.should_train = should_train

        # Fixed parameters
        self.update_freq = 1  # 64
        self.batch_size = batch_size
        self.should_save, self.save_every = True, 100000
        self.print_loss, self.print_every = True, 10000

        if sess is None:
            self.sess = tf.Session()
        else:
            self.sess = sess
        self.actor = Actor(sess=self.sess, obs_dim=self.obs_dim, action_dim=self.action_dim, \
                           action_bound=self.action_bound, learning_rate=actor_rate, tau=tau, batch_size=self.batch_size, name=name)

        self.critic = Critic(sess=self.sess, obs_dim=self.obs_dim, action_dim=self.action_dim, \
                             learning_rate=critic_rate, tau=tau, name=name)

        self.actor_noise = ActorNoise(mu=np.zeros(self.action_dim), sigma=0.3)

        self.total_reward = 0

        self.reset()
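The ActorNoise class constructed above is not shown. A minimal sketch of Ornstein-Uhlenbeck exploration noise, the usual choice for DDPG-style agents; everything beyond the mu and sigma arguments is an assumption.

import numpy as np

class ActorNoise(object):
    def __init__(self, mu, sigma=0.3, theta=0.15, dt=0.01):
        self.mu, self.sigma, self.theta, self.dt = mu, sigma, theta, dt
        self.x = np.copy(mu)

    def __call__(self):
        # dx = theta * (mu - x) * dt + sigma * sqrt(dt) * N(0, I)
        self.x = self.x + self.theta * (self.mu - self.x) * self.dt \
            + self.sigma * np.sqrt(self.dt) * np.random.normal(size=self.mu.shape)
        return self.x

# Typical use inside act(): action = actor.predict(state) + actor_noise()
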
 def __init__(self, actions, name="LinUCB", rand_init=True, context_size=1, alpha=1.5):
     '''
     Args:
         actions (list): Contains a string for each action.
         name (str)
         context_size (int)
         alpha (float): Uncertainty parameter.
     '''
     Agent.__init__(self, name, actions)
     self.alpha = alpha
     self.context_size = context_size
     self.prev_context = None
     self.step_number = 0
     self._init_action_model(rand_init)
Example #23
    def __init__(self,
                 actions,
                 init_q=None,
                 name="Delayed-Q",
                 gamma=0.99,
                 m=5,
                 epsilon1=0.1):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            init_q (2d list): Initial Q function. AU(s, a) in Strehl et al 2006.
            name (str): Denotes the name of the agent.
            gamma (float): discount factor
            m (float): Number of samples for updating Q-value
            epsilon1 (float): Learning rate
        '''
        # Set initial q func.
        self.rmax = 1  # TODO: set/get function
        init_q = defaultdict(lambda: defaultdict(lambda: self.rmax / (1 - gamma))) if init_q is None else init_q

        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.step_number = 0

        # TODO: Here we assume that init_q has Qvalue for every (s, a) pair.
        self.q_func = copy.deepcopy(init_q)
        self.default_q_func = copy.deepcopy(init_q)

        self.AU = defaultdict(lambda: defaultdict(lambda: 0.0))  # accumulators for attempted updates
        self.l = defaultdict(lambda: defaultdict(lambda: 0))  # sample counters
        self.b = defaultdict(lambda: defaultdict(lambda: 0))  # beginning timestep of attempted update
        self.LEARN = defaultdict(lambda: defaultdict(lambda: True))  # whether updates are attempted for (s, a)
        for x in init_q:
            for y in init_q[x]:
                self.AU[x][y] = 0.0  # AU(s, a) <- 0
                self.l[x][y] = 0  # l(s, a) <- 0
                self.b[x][y] = 0  # b(s, a) <- 0
                self.LEARN[x][y] = False

        # TODO: Add a code to calculate m and epsilon1 from epsilon and delta.
        # m and epsilon1 should be set according to epsilon and delta in order to be PAC-MDP.
        self.m = m
        self.epsilon1 = epsilon1

        self.tstar = 0  # time of most recent action value change
Example #24
    def __init__(self,
                 states,
                 actions,
                 epsilon=0.1,
                 gamma=0.99,
                 vi_horizon=100,
                 name='unsafe-agent'):

        self.transition_table = TransitionTable(states, actions)
        self.epsilon = epsilon
        self.gamma = gamma
        self.vi_horizon = vi_horizon

        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
 def __init__(self, actions, name="LinUCB", rand_init=True, context_size=1, alpha=1.5):
     '''
     Args:
         actions (list): Contains a string for each action.
         name (str)
         context_size (int)
         alpha (float): Uncertainty parameter.
     '''
     Agent.__init__(self, name, actions)
     self.alpha = alpha
     self.context_size = context_size
     self.prev_context = None
     self.step_number = 0
     self.rand_init = rand_init
     self._init_action_model(rand_init)
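_init_action_model is not included in either LinUCB snippet. A sketch of the standard per-action ridge-regression statistics and the UCB score from Li et al. (2010); the data layout and the rand_init handling below are assumptions.

import numpy as np

def init_action_model(actions, context_size, rand_init=True):
    # One (A, b) pair per action: A is d x d, b has length d.
    model = {}
    for a in actions:
        b = (np.random.normal(scale=0.01, size=context_size)
             if rand_init else np.zeros(context_size))
        model[a] = {"A": np.eye(context_size), "b": b}
    return model

def linucb_scores(model, context, alpha=1.5):
    # p_a = theta_a^T x + alpha * sqrt(x^T A_a^{-1} x), with theta_a = A_a^{-1} b_a.
    scores = {}
    for a, stats in model.items():
        A_inv = np.linalg.inv(stats["A"])
        theta = A_inv.dot(stats["b"])
        scores[a] = float(theta.dot(context) +
                          alpha * np.sqrt(context.dot(A_inv).dot(context)))
    return scores

# context should be a length-context_size numpy array, e.g. np.ones(1).
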
Example #26
 def __init__(self,
              actions,
              gamma=0.95,
              horizon=4,
              s_a_threshold=1,
              name="RMax-h"):
     name = name + str(horizon) if name[-2:] == "-h" else name
     Agent.__init__(self, name=name, actions=actions, gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.s_a_threshold = s_a_threshold
     # self.init_q_func = None
     self.init_q_func = defaultdict(lambda: defaultdict(lambda: 1.0 / (1.0 - gamma)))
     self.reset()
Example #27
 def __init__(self,
              actions,
              gamma=0.95,
              horizon=4,
              s_a_threshold=1,
              num_sample_eps=20,
              name="UpdatingRMax-h"):
     name = name + str(horizon) if name[-2:] == "-h" else name
     Agent.__init__(self, name=name, actions=actions, gamma=gamma)
     self.rmax = 1.0
     self.horizon = horizon
     self.s_a_threshold = s_a_threshold
     # trans_qmax stores the maximum qvalue a (s, a) pair achieved so far.
     self.init_q_func = defaultdict(lambda: defaultdict(lambda: 0.0))
     self.cur_eps = 0
     self.num_sample_eps = num_sample_eps
     self.reset()
Example #28
    def __init__(self,
                 actions,
                 name="qlearner",
                 alpha=0.05,
                 gamma=0.99,
                 epsilon=0.1,
                 explore=" ",
                 anneal=False,
                 mdp=None):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
        '''
        explore = " "
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self,
                       name=name + name_ext,
                       actions=actions,
                       gamma=gamma)
        if mdp is None:
            raise ValueError('DataMDP not defined')
        else:
            self.mdp = mdp
        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = 0.0
        self.mdp = mdp

        # Q Function:
        # Key: state
        # Val: dict
        #   Key: action
        #   Val: q-value
        self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))

        # Choose explore type.
        self.explore = explore
Example #29
    def __init__(self, sess=None, obs_dim=None, action_dim=None, action_bound=None, num_actions=None, num_options=0, gamma=0.99, epsilon=0.05, tau=0.001, name=NAME):
        # TODO: Implement an interface for discrete action space
        Agent.__init__(self, name=name, actions=[])

        if sess is None:
            self.sess = tf.Session()
        else:
            self.sess = sess
        self.obs_dim = obs_dim
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.num_actions = num_actions
        if self.num_actions is None:
            self.continuous_action = True
        else:
            self.continuous_action = False

        self.epsilon = epsilon
        self.gamma = gamma
        self.update_freq = 1
        self.batch_size = 64
        self.tau = tau

        self.option_b_size = 16
        self.option_freq = 16

        self.num_options = num_options

        self.curr_instances = 0

        # TODO: How can I abstract the high-level control policy?
        # TODO: How can I implement a low-level control policy using the linearSARSA?
        self.high_control_main = QNetwork(self.sess, obs_dim=self.obs_dim, num_options=self.num_options, learning_rate=0.00001, name=self.name+"_high_main")
        self.high_control_target = QNetwork(self.sess, obs_dim=self.obs_dim, num_options=self.num_options, learning_rate=0.00001, name=self.name+"_high_target")


        self.network_params = tf.trainable_variables(scope=self.name + "_high_main")
        self.target_network_params = tf.trainable_variables(scope=self.name + "_high_target")
        self.update_target_params = [
            self.target_network_params[i].assign(
                tf.multiply(self.network_params[i], self.tau) +
                tf.multiply(self.target_network_params[i], 1.0 - self.tau))
            for i in range(len(self.target_network_params))
        ]


        self.reset()
Example #30
    def __init__(self,
                 actions,
                 name="Updating-Q-learning",
                 alpha=0.05,
                 gamma=0.99,
                 epsilon=0.1,
                 explore="uniform",
                 anneal=False,
                 default_q=1.0 / (1.0 - 0.99),
                 num_sample_tasks=20):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
        '''
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self,
                       name=name + name_ext,
                       actions=actions,
                       gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = default_q

        # Q Function:
        # Key: state
        # Val: dict
        #   Key: action
        #   Val: q-value
        self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
        self.default_q_func = copy.deepcopy(self.q_func)

        # Choose explore type.
        self.explore = explore

        self.task_number = 0
        self.num_sample_tasks = num_sample_tasks
Example #31
    def __init__(self,
                 actions,
                 gamma=0.95,
                 s_a_threshold=2,
                 epsilon_one=0.99,
                 max_reward=1.0,
                 name="RMax",
                 custom_q_init=None):
        self.name = name
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
        self.rmax = max_reward
        self.s_a_threshold = s_a_threshold
        self.custom_q_init = custom_q_init
        self.reset()
        self.custom_q_init = custom_q_init
        self.gamma = gamma
        self.epsilon_one = epsilon_one

        if self.custom_q_init:
            self.q_func = self.custom_q_init
        else:
            self.q_func = defaultdict(lambda: defaultdict(lambda: self.rmax))
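The reset() and update logic that consumes s_a_threshold is not part of this snippet. A sketch of the counts-based model an R-Max style agent typically keeps, where only (s, a) pairs visited at least s_a_threshold times are treated as known; class and attribute names here are illustrative, not taken from the source.

from collections import defaultdict

class CountsModel(object):
    def __init__(self, s_a_threshold=2):
        self.s_a_threshold = s_a_threshold
        self.n_sa = defaultdict(lambda: defaultdict(int))       # visit counts n(s, a)
        self.r_sum = defaultdict(lambda: defaultdict(float))    # accumulated rewards
        self.n_sas = defaultdict(
            lambda: defaultdict(lambda: defaultdict(int)))      # transition counts n(s, a, s')

    def update(self, s, a, r, s_next):
        # Stop counting once the pair is known; the empirical model is then frozen.
        if self.n_sa[s][a] < self.s_a_threshold:
            self.n_sa[s][a] += 1
            self.r_sum[s][a] += r
            self.n_sas[s][a][s_next] += 1

    def is_known(self, s, a):
        # Unknown pairs keep the optimistic default (self.rmax in the q_func above).
        return self.n_sa[s][a] >= self.s_a_threshold
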
Example #32
    def __init__(self, states, state_map, actions, gamma=0.95, horizon=3,
                 init_threshold=2, name="RMax", greedy=False):
        name = name
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)
        self.rmax = 1.0
        self.horizon = horizon
        self.init_threshold = init_threshold
        self.greedy = greedy
        
        self.states = states
        self.state_map = state_map
        self.actions = actions
        self.action_map = {}
        k = 0

        #Define the id of actions in the list.
        for a in self.actions:
            self.action_map[a] = k
            k += 1
#        print(self.state_map)
#        print(self.action_map)
        self.reset()
    def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False):
        '''
        Args:
            actions (list): Contains strings denoting the actions.
            name (str): Denotes the name of the agent.
            alpha (float): Learning rate.
            gamma (float): Discount factor.
            epsilon (float): Exploration term.
            explore (str): One of {softmax, uniform}. Denotes explore policy.
        '''
        name_ext = "-" + explore if explore != "uniform" else ""
        Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma)

        # Set/initialize parameters and other relevant classwide data
        self.alpha, self.alpha_init = alpha, alpha
        self.epsilon, self.epsilon_init = epsilon, epsilon
        self.step_number = 0
        self.anneal = anneal
        self.default_q = 0 #1 / (1 - self.gamma)
        self.explore = explore

        # Q Function:
        self.q_func = defaultdict(lambda : defaultdict(lambda: self.default_q))
Example #34
    def __init__(self,
                 states,
                 state_map,
                 actions,
                 gamma=0.95,
                 horizon=3,
                 name="TempLe",
                 thres_sm=5,
                 thres_lg=10,
                 pattern_gap=0.4,
                 greedy=True,
                 with_grouping=False,
                 t1=0,
                 model_gap=0.4,
                 flag_tol=3):
        name = name
        Agent.__init__(self, name=name, actions=actions, gamma=gamma)

        self.horizon = horizon
        self.thres_sm = thres_sm
        self.thres_lg = thres_lg
        self.states = states
        self.state_map = state_map
        self.greedy = greedy
        self.pattern_gap = pattern_gap
        self.patterns = []
        self.single_agent = None
        self.with_grouping = with_grouping

        if self.with_grouping:
            self.t1 = t1
            self.groups = []
            self.model_gap = model_gap
            self.flag_tol = flag_tol

        self.count = 0
        self.reset()
Example #35
 def reset(self, mdp=None):
     self.step_number = 0
     self.episode_number = 0
     # print "#####################################"
     # print "Reset", self.name, "Q-function"
     # # print self.q_func
     # for x in self.q_func:
     #     print (x)
     #     for y in self.q_func[x]:
     #         print (y, ':', self.q_func[x][y])
     if mdp is not None:
         self.update_init_q_function(mdp)
     if self.task_number >= self.num_sample_tasks:
         for x in self.init_q_func:
             for y in self.init_q_func[x]:
                 assert (self.init_q_func[x][y] >= -0.001)
         self.q_func = copy.deepcopy(self.init_q_func)
     else:
         self.q_func = defaultdict(
             lambda: defaultdict(lambda: self.default_q))
     self.task_number = self.task_number + 1
     if self.sample_with_q:
         self.q_agent.reset()
     Agent.reset(self)
Example #36
    def __init__(self,
                 sess=None,
                 obs_dim=None,
                 obs_bound=None,
                 action_dim=None,
                 action_bound=None,
                 num_actions=None,
                 num_options=0,
                 gamma=0.99,
                 epsilon=0.0,
                 tau=0.001,
                 high_method='linear',
                 low_method='linear',
                 f_func='fourier',
                 batch_size=32,
                 buffer_size=32,
                 low_update_freq=1,
                 option_batch_size=32,
                 option_buffer_size=32,
                 high_update_freq=10,
                 option_freq=256,
                 option_min_steps=512,
                 init_all=True,
                 init_around_goal=True,
                 init_dist=0.9,
                 term_dist=0.1,
                 bidirectional=False,
                 name=NAME):
        # TODO: Implement an interface for discrete action space
        Agent.__init__(self, name=name, actions=[])

        if sess is None:
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True  # TODO: conv dumps error without this
            self.sess = tf.Session(config=config)
        else:
            self.sess = sess
        self.obs_dim = obs_dim
        self.obs_bound = obs_bound
        self.action_dim = action_dim
        self.action_bound = action_bound
        self.num_actions = num_actions
        # if self.num_actions is None:
        #     self.continuous_action = True
        # else:
        #     self.continuous_action = False

        self.epsilon = epsilon
        self.gamma = gamma
        self.batch_size = batch_size
        self.buffer_size = buffer_size  # TODO: Let's test online learning first.
        self.low_update_freq = low_update_freq
        self.tau = tau
        self.init_around_goal = init_around_goal
        self.init_dist = init_dist
        self.term_dist = term_dist

        # TODO: Should we use this as an initialization process?
        if num_options == 1:
            # Never update the high-level policy when there are no options.
            self.high_update_freq = 1000000000000000000
        else:
            self.high_update_freq = high_update_freq
        self.option_batch_size = option_batch_size
        self.option_buffer_size = option_buffer_size  # Online setting
        self.option_freq = option_freq
        self.option_min_steps = option_min_steps

        self.num_options = num_options
        self.init_all = init_all
        self.bidirectional = bidirectional

        self.default_options = []

        self.curr_instances = 0
        self.generated_options = dict()

        self.high_method = high_method
        self.low_method = low_method
        self.f_func = f_func

        if self.high_method == 'linear':
            # low_bound = np.asarray([0.0, 0.0, -2.0, -2.0])
            # up_bound = np.asarray([1.0, 1.0, 2.0, 2.0])
            features = Fourier(state_dim=obs_dim, bound=obs_bound, order=3)
            self.high_control = LinearQAgent(actions=range(self.num_options),
                                             feature=features,
                                             name=self.name + "_high")
        elif self.high_method == 'sarsa':
            # low_bound = np.asarray([0.0, 0.0, -2.0, -2.0])
            # up_bound = np.asarray([1.0, 1.0, 2.0, 2.0])
            features = Fourier(state_dim=obs_dim, bound=obs_bound, order=3)
            self.high_control = LinearQAgent(actions=range(self.num_options),
                                             feature=features,
                                             sarsa=True,
                                             name=self.name + "_high")
        elif self.high_method == 'dqn':
            self.high_control = DQNAgent(sess=self.sess,
                                         obs_dim=obs_dim,
                                         num_actions=self.num_options,
                                         buffer_size=0,
                                         gamma=self.gamma,
                                         epsilon=self.epsilon,
                                         learning_rate=0.001,
                                         tau=self.tau,
                                         name=self.name + "_high")
        elif self.high_method == 'rand':
            self.high_control = RandomAgent(range(self.num_options),
                                            name=self.name + "_high")
        else:
            assert (False)

        self.reset()
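The Fourier feature class handed to LinearQAgent is not shown. A minimal sketch of the usual Fourier basis (Konidaris et al., 2011), assuming bound is a (low, high) pair used to normalize states into [0, 1]^d and that the agent calls a feature() method; both are assumptions about the missing class.

import itertools
import numpy as np

class Fourier(object):
    def __init__(self, state_dim, bound, order=3):
        self.low = np.asarray(bound[0], dtype=float)
        self.high = np.asarray(bound[1], dtype=float)
        # Every coefficient vector c in {0, ..., order}^state_dim.
        self.coeffs = np.array(list(itertools.product(range(order + 1),
                                                      repeat=state_dim)))

    def feature(self, state):
        # phi_c(s) = cos(pi * c . s_norm), with s_norm scaled into [0, 1]^d.
        s_norm = (np.asarray(state, dtype=float) - self.low) / (self.high - self.low)
        return np.cos(np.pi * self.coeffs.dot(s_norm))
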
Example #37
 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     self.q_func = defaultdict(lambda : defaultdict(lambda: self.default_q))
     Agent.reset(self)
Example #38
 def reset(self):
     self.step_number = 0
     self.episode_number = 0
     self.q_funcs = {"A":defaultdict(lambda : defaultdict(lambda: self.default_q)), \
                     "B":defaultdict(lambda : defaultdict(lambda: self.default_q))}
     Agent.reset(self)
 def __init__(self, actions, name=""):
     name = "policy_gradient" if name is "" else name
     Agent.__init__(self, name=name, actions=actions)
 def __init__(self, actions, name=""):
     name = "Random" if name is "" else name
     Agent.__init__(self, name=name, actions=actions)