def __init__(self, actions, name=NAME, learning_rate=1e-4, x_dim=210, y_dim=160,
             eps_start=1.0, eps_decay=0.0000001, eps_end=0.1, num_channels=3,
             should_train=True, from_checkpoint=None, player_id=1):
    Agent.__init__(self, name=name, actions=[])
    self.learning_rate = learning_rate
    self.x_dim, self.y_dim = x_dim, y_dim
    self.actions, self.num_actions = actions, len(actions)
    self.hidden_layers = [32, 32]
    self.num_channels = num_channels
    self.eps_start, self.epsilon_decay, self.epsilon_end = eps_start, eps_decay, eps_end
    self.should_train = should_train
    self.reset()

    # Parameters for updating the target network.
    tau = 0.001
    # TODO: Update to support player_id > 2.
    # NOTE: This is a bit of a hack to update the variables in the target
    # network. It can be fixed by using scopes and TensorFlow 1.4, which takes
    # a scope argument in tf.trainable_variables().
    if player_id == 2:
        vs = tf.trainable_variables()
        self.target_ops = update_target_graph(vs[len(vs) // 2:], tau)
    else:
        self.target_ops = update_target_graph(tf.trainable_variables(), tau)

    # Load the model from a checkpoint if one is given.
    if from_checkpoint is not None:
        self.saver.restore(self.sess, from_checkpoint)
        print('Restored model from checkpoint: {}'.format(from_checkpoint))
def __init__(self, actions, name=NAME, learning_rate=1e-4, x_dim=21, y_dim=16,
             eps_start=1.0, eps_decay=0.0000001, eps_end=0.1, num_channels=3,
             should_train=True, from_checkpoint=None, player_id=1):
    Agent.__init__(self, name=name, actions=[])
    self.learning_rate = learning_rate
    self.x_dim, self.y_dim = x_dim, y_dim
    self.actions, self.num_actions = actions, len(actions)
    self.hidden_layers = [32, 32]
    self.mainQN = QNetwork(learning_rate=self.learning_rate, num_actions=self.num_actions,
                           x_dim=self.x_dim, y_dim=self.y_dim, num_channels=num_channels)
    self.targetQN = QNetwork(learning_rate=self.learning_rate, num_actions=self.num_actions,
                             x_dim=self.x_dim, y_dim=self.y_dim, num_channels=num_channels)
    self.sess = tf.Session()
    self.experience_buffer = ExperienceBuffer(buffer_size=10e5)
    self.prev_state, self.prev_action = None, None
    self.epsilon, self.epsilon_decay, self.epsilon_end = eps_start, eps_decay, eps_end
    self.curr_step, self.total_steps = 0, 0
    self.curr_episode = 0
    self.update_freq = 100
    self.batch_size = 32
    self.update_target = 100
    self.should_train = should_train
    self.should_save, self.save_every = True, 100000
    self.print_loss, self.print_every = True, 10000
    self.saver = tf.train.Saver()
    self.action_counts = np.zeros(self.num_actions)

    # Parameters for updating the target network.
    tau = 0.001
    # TODO: Update to support player_id > 2.
    # NOTE: This is a bit of a hack to update the variables in the target
    # network. It can be fixed by using scopes and TensorFlow 1.4, which takes
    # a scope argument in tf.trainable_variables().
    if player_id == 2:
        vs = tf.trainable_variables()
        self.target_ops = updateTargetGraph(vs[len(vs) // 2:], tau)
    else:
        self.target_ops = updateTargetGraph(tf.trainable_variables(), tau)

    self.sess.run(tf.global_variables_initializer())

    # Load the model from a checkpoint if one is given.
    if from_checkpoint is not None:
        self.saver.restore(self.sess, from_checkpoint)
        print('Restored model from checkpoint: {}'.format(from_checkpoint))
def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False, custom_q_init=None, default_q=0): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. custom_q_init (defaultdict{state, defaultdict{action, float}}): a dictionary of dictionaries storing the initial q-values. Can be used for potential shaping (Wiewiora, 2003) default_q (float): the default value to initialize every entry in the q-table with [by default, set to 0.0] ''' name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = default_q # 0 # 1 / (1 - self.gamma) self.explore = explore self.custom_q_init = custom_q_init # Q Function: if self.custom_q_init: self.q_func = self.custom_q_init else: self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = 0 #1 / (1 - self.gamma) self.explore = explore # Q Function: self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q))
def __init__(self, states, state_map, actions, gamma=0.95, horizon=3, name="FMRL", thres_sm=5, thres_lg=10, t1=6, model_gap=0.4, greedy=False, xi=0.2): name = name Agent.__init__(self, name=name, actions=actions, gamma=gamma) self.horizon = horizon self.changed1 = False self.changed2 = False self.thres_sm = thres_sm self.thres_lg = thres_lg self.epsilon = 0.3 self.states = states self.state_map = state_map self.greedy = greedy self.t1 = t1 self.xi = xi self.model_gap = model_gap self.has_incorp = False # for phase 2: whether has already incorporated past groups self.single_agent = None self.groups = [] self.flag = [] self.count = -1 # how many tasks we have learned self.reset()
def __init__(self, actions, default_q=1.0 / (1.0 - 0.99), name="Updating-delayed-Q-learning", gamma=0.99, m=1, epsilon1=0.1, qstar_transfer=False, num_sample_tasks=20, sample_with_q=False): ''' Args: actions (list): Contains strings denoting the actions. init_q (2d list): Initial Q function. AU(s, a) in Strehl et al 2006. name (str): Denotes the name of the agent. gamma (float): discount factor m (float): Number of samples for updating Q-value epsilon1 (float): Learning rate ''' # name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name, actions=actions, gamma=gamma) self.rmax = 1 # TODO: set/get function # Set/initialize parameters and other relevant classwide data self.step_number = 0 # TODO: Here we assume that init_q has Qvalue for every (s, a) pair. self.q_func = defaultdict(lambda: defaultdict(lambda: default_q)) self.init_q_func = defaultdict(lambda: defaultdict(lambda: default_q)) self.AU = defaultdict( lambda: defaultdict(lambda: 0.0)) # used for attempted updates self.l = defaultdict(lambda: defaultdict(lambda: 0)) # counters self.b = defaultdict(lambda: defaultdict(lambda: 0) ) # beginning timestep of attempted update self.LEARN = defaultdict(lambda: defaultdict(lambda: False) ) # beginning timestep of attempted update # for x in init_q: # for y in init_q[x]: # self.AU[x][y] = 0.0 # AU(s, a) <- 0 # self.l[x][y] = 0 # l(s, a) <- 0 # self.b[x][y] = 0 # b(s, a) <- 0 # self.LEARN[x][y] = False # TODO: Add a code to calculate m and epsilon1 from epsilon and delta. # m and epsilon1 should be set according to epsilon and delta in order to be PAC-MDP. self.m = m self.epsilon1 = epsilon1 self.tstar = 0 # time of most recent action value change self.task_number = 0 self.default_q = default_q self.num_sample_tasks = num_sample_tasks self.qstar_transfer = qstar_transfer self.sample_with_q = sample_with_q if self.sample_with_q: self.q_agent = QLearningAgent(actions, gamma=self.gamma, default_q=self.default_q)
def __init__(self, actions, states, reward_func, initial_safe_states, initial_safe_actions,
             similarity_function, analagous_state_function, transition_support_function=None,
             gamma=0.99, vi_horizon=100, name='safe-agent', beta_T=0.5, tau=0.1,
             update_frequency=100, use_sparse_matrices=False):
    self.use_sparse_matrices = use_sparse_matrices
    self.gamma = gamma
    self.vi_horizon = vi_horizon
    self.beta_T = beta_T
    self.tau = tau
    self.update_frequency = update_frequency
    self.states = states
    self.num_states = len(states)
    self.actions = actions
    self.num_actions = len(actions)
    self.s0 = None
    self.step_number = 0

    # Map each state and action to an integer id.
    self.state_to_id = dict()
    for s_id, state in enumerate(self.states):
        self.state_to_id[state] = s_id
    self.action_to_id = dict()
    for i, a in enumerate(self.actions):
        self.action_to_id[a] = i

    # Mark the (state, action) pairs that are known to be safe a priori.
    self.initial_safe_sa = np.zeros([self.num_states, self.num_actions], dtype=bool)
    for state in initial_safe_states:
        s = self.state_to_id[state]
        for action in initial_safe_actions(state):
            a = self.action_to_id[action]
            self.initial_safe_sa[s, a] = 1

    self.transition_table = AnalogousStateTransitionTable(
        actions=actions,
        similarity_function=similarity_function,
        analagous_state_function=analagous_state_function,
        initial_safe_sa=self.initial_safe_sa,
        reward_func=reward_func,
        states=states,
        support_function=transition_support_function,
        use_sparse_matrices=self.use_sparse_matrices,
        beta_T=beta_T)
    self.z_safe = self.initial_safe_sa

    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
def __init__(self, actions, gamma=0.95, horizon=3, s_a_threshold=2, name="RMax-h"): name = name + str(horizon) if name[-2:] == "-h" else name Agent.__init__(self, name=name, actions=actions, gamma=gamma) self.rmax = 1.0 self.horizon = horizon self.s_a_threshold = s_a_threshold self.reset()
def __init__(self, policy, name=NAME):
    '''
    Args:
        policy (func: S ---> A)
    '''
    Agent.__init__(self, name=name, actions=[])
    self.policy = policy
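# Usage sketch (illustrative, not part of the original source): the constructor above only
# stores a callable mapping states to actions. The class name `FixedPolicyAgent` is an
# assumption.
def always_up_policy(state):
    # A trivial fixed policy that ignores the state.
    return "up"

fixed_agent = FixedPolicyAgent(policy=always_up_policy)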
def __init__(self, sess=None, obs_dim=None, num_actions=0, buffer_size=100000, gamma=0.99,
             epsilon=0.05, learning_rate=0.001, tau=0.001, conv=False, name=NAME):
    Agent.__init__(self, name=name, actions=range(num_actions))

    if sess is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
    else:
        self.sess = sess

    self.obs_dim = obs_dim
    self.num_actions = num_actions
    self.buffer_size = buffer_size
    self.gamma = gamma
    self.epsilon = epsilon
    self.learning_rate = learning_rate
    self.tau = tau
    self.update_freq = 1
    self.batch_size = 64
    self.conv = conv

    self.mainQ = QNetwork(sess=self.sess, learning_rate=self.learning_rate, obs_dim=self.obs_dim,
                          num_actions=self.num_actions, conv=self.conv, name=name + "_main_q")
    self.targetQ = QNetwork(sess=self.sess, learning_rate=self.learning_rate, obs_dim=self.obs_dim,
                            num_actions=self.num_actions, conv=self.conv, name=name + "_target_q")

    self.network_params = tf.trainable_variables(scope=self.name + "_main_q")
    self.target_network_params = tf.trainable_variables(scope=self.name + "_target_q")

    # Soft-update ops: target <- tau * main + (1 - tau) * target.
    self.update_target_params = \
        [self.target_network_params[i].assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(self.target_network_params[i], 1.0 - self.tau))
         for i in range(len(self.target_network_params))]

    self.saver = tf.train.Saver(self.network_params + self.target_network_params)

    self.reset()
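# Usage sketch (illustrative, not part of the original source): constructing the TF1-style
# DQN agent above with an explicit session. The class name `DQNAgent` is an assumption;
# QNetwork is assumed to be importable from the same module.
import tensorflow as tf

sess = tf.Session()
dqn_agent = DQNAgent(sess=sess, obs_dim=4, num_actions=2, conv=False, name="cartpole_dqn")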
def __init__(self, name, actions, gamma=0.99):
    '''
    Args:
        name (str)
        actions (list)
        gamma (float)
    '''
    Agent.__init__(self, name, actions, gamma)
def __init__(self, actions, gamma=0.95, horizon=4, s_a_threshold=1):
    Agent.__init__(self, name="rmax-h" + str(horizon), actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.horizon = horizon
    self.s_a_threshold = s_a_threshold
    self.reset()
def __init__(self, sess=None, obs_dim=None, action_dim=None, action_bound=None,
             buffer_size=100000, batch_size=64, name=NAME, actor_rate=0.0001,
             critic_rate=0.001, tau=0.001, should_train=True, from_checkpoint=None,
             gamma=0.99):
    # TODO: Use a shared experience buffer?
    Agent.__init__(self, name=name, actions=[])

    assert (type(obs_dim) is int)
    assert (type(action_dim) is int)
    assert (action_bound is not None)

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.action_bound = action_bound
    self.buffer_size = buffer_size
    self.gamma = gamma
    self.should_train = should_train

    # Fixed parameters
    self.update_freq = 1  # 64
    self.batch_size = batch_size
    self.should_save, self.save_every = True, 100000
    self.print_loss, self.print_every = True, 10000

    if sess is None:
        self.sess = tf.Session()
    else:
        self.sess = sess

    self.actor = Actor(sess=self.sess, obs_dim=self.obs_dim, action_dim=self.action_dim,
                       action_bound=self.action_bound, learning_rate=actor_rate, tau=tau,
                       batch_size=self.batch_size, name=name)
    self.critic = Critic(sess=self.sess, obs_dim=self.obs_dim, action_dim=self.action_dim,
                         learning_rate=critic_rate, tau=tau, name=name)

    self.actor_noise = ActorNoise(mu=np.zeros(self.action_dim), sigma=0.3)

    self.total_reward = 0

    self.reset()
def __init__(self, actions, init_q=None, name="Delayed-Q", gamma=0.99, m=5, epsilon1=0.1): ''' Args: actions (list): Contains strings denoting the actions. init_q (2d list): Initial Q function. AU(s, a) in Strehl et al 2006. name (str): Denotes the name of the agent. gamma (float): discount factor m (float): Number of samples for updating Q-value epsilon1 (float): Learning rate ''' # Set initial q func. self.rmax = 1 # TODO: set/get function init_q = defaultdict(lambda: defaultdict(lambda: self.rmax / ( 1 - gamma))) if init_q is None else init_q Agent.__init__(self, name=name, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.step_number = 0 # TODO: Here we assume that init_q has Qvalue for every (s, a) pair. self.q_func = copy.deepcopy(init_q) self.default_q_func = copy.deepcopy(init_q) self.AU = defaultdict( lambda: defaultdict(lambda: 0.0)) # used for attempted updates self.l = defaultdict(lambda: defaultdict(lambda: 0)) # counters self.b = defaultdict(lambda: defaultdict(lambda: 0) ) # beginning timestep of attempted update self.LEARN = defaultdict(lambda: defaultdict(lambda: True) ) # beginning timestep of attempted update for x in init_q: for y in init_q[x]: self.AU[x][y] = 0.0 # AU(s, a) <- 0 self.l[x][y] = 0 # l(s, a) <- 0 self.b[x][y] = 0 # b(s, a) <- 0 self.LEARN[x][y] = False # TODO: Add a code to calculate m and epsilon1 from epsilon and delta. # m and epsilon1 should be set according to epsilon and delta in order to be PAC-MDP. self.m = m self.epsilon1 = epsilon1 self.tstar = 0 # time of most recent action value change
def __init__(self, actions, name="LinUCB", rand_init=True, context_size=1, alpha=1.5): ''' Args: actions (list): Contains a string for each action. name (str) context_size (int) alpha (float): Uncertainty parameter. ''' Agent.__init__(self, name, actions) self.alpha = alpha self.context_size = context_size self.prev_context = None self.step_number = 0 self._init_action_model(rand_init)
def __init__(self, states, actions, epsilon=0.1, gamma=0.99, vi_horizon=100, name='unsafe-agent'):
    self.transition_table = TransitionTable(states, actions)
    self.epsilon = epsilon
    self.gamma = gamma
    self.vi_horizon = vi_horizon
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
def __init__(self, actions, gamma=0.95, horizon=4, s_a_threshold=1, name="RMax-h"): name = name + str(horizon) if name[-2:] == "-h" else name Agent.__init__(self, name=name, actions=actions, gamma=gamma) self.rmax = 1.0 self.horizon = horizon self.s_a_threshold = s_a_threshold # self.init_q_func = None self.init_q_func = defaultdict(lambda: defaultdict(lambda: 1.0 / (1.0 - gamma))) self.reset()
def __init__(self, actions, name="LinUCB", rand_init=True, context_size=1, alpha=1.5): ''' Args: actions (list): Contains a string for each action. name (str) context_size (int) alpha (float): Uncertainty parameter. ''' Agent.__init__(self, name, actions) self.alpha = alpha self.context_size = context_size self.prev_context = None self.step_number = 0 self.rand_init = rand_init self._init_action_model(rand_init)
def __init__(self, actions, gamma=0.95, horizon=4, s_a_threshold=1, num_sample_eps=20,
             name="UpdatingRMax-h"):
    name = name + str(horizon) if name[-2:] == "-h" else name
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.horizon = horizon
    self.s_a_threshold = s_a_threshold
    # init_q_func stores the maximum Q-value each (s, a) pair has achieved so far.
    self.init_q_func = defaultdict(lambda: defaultdict(lambda: 0.0))
    self.cur_eps = 0
    self.num_sample_eps = num_sample_eps
    self.reset()
def __init__(self, actions, name="Updating-Q-learning", alpha=0.05, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False, default_q=1.0 / (1.0 - 0.99), num_sample_tasks=20): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = default_q # Q Function: # Key: state # Val: dict # Key: action # Val: q-value self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q)) self.default_q_func = copy.deepcopy(self.q_func) # Choose explore type. self.explore = explore self.task_number = 0 self.num_sample_tasks = num_sample_tasks
def __init__(self, actions, name="qlearner", alpha=0.05, gamma=0.99, epsilon=0.1, explore=" ", anneal=False, mdp=None): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' explore = " " name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) if mdp == None: raise ValueError('DataMDP not defined') else: self.mdp = mdp # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = 0.0 self.mdp = mdp # Q Function: # Key: state # Val: dict # Key: action # Val: q-value self.q_func = defaultdict(lambda: defaultdict(lambda: self.default_q)) # Choose explore type. self.explore = explore
def __init__(self, sess=None, obs_dim=None, action_dim=None, action_bound=None,
             num_actions=None, num_options=0, gamma=0.99, epsilon=0.05, tau=0.001, name=NAME):
    # TODO: Implement an interface for discrete action spaces.
    Agent.__init__(self, name=name, actions=[])

    if sess is None:
        self.sess = tf.Session()
    else:
        self.sess = sess

    self.obs_dim = obs_dim
    self.action_dim = action_dim
    self.action_bound = action_bound
    self.num_actions = num_actions
    if self.num_actions is None:
        self.continuous_action = True
    else:
        self.continuous_action = False

    self.epsilon = epsilon
    self.gamma = gamma
    self.update_freq = 1
    self.batch_size = 64
    self.tau = tau
    self.option_b_size = 16
    self.option_freq = 16

    self.num_options = num_options
    self.curr_instances = 0

    # TODO: How can I abstract the high-level control policy?
    # TODO: How can I implement a low-level control policy using the linearSARSA?
    self.high_control_main = QNetwork(self.sess, obs_dim=self.obs_dim, num_options=self.num_options,
                                      learning_rate=0.00001, name=self.name + "_high_main")
    self.high_control_target = QNetwork(self.sess, obs_dim=self.obs_dim, num_options=self.num_options,
                                        learning_rate=0.00001, name=self.name + "_high_target")

    self.network_params = tf.trainable_variables(scope=self.name + "_high_main")
    self.target_network_params = tf.trainable_variables(scope=self.name + "_high_target")

    # Soft-update ops: target <- tau * main + (1 - tau) * target.
    self.update_target_params = \
        [self.target_network_params[i].assign(
            tf.multiply(self.network_params[i], self.tau) +
            tf.multiply(self.target_network_params[i], 1.0 - self.tau))
         for i in range(len(self.target_network_params))]

    self.reset()
def __init__(self, actions, gamma=0.95, s_a_threshold=2, epsilon_one=0.99, max_reward=1.0,
             name="RMax", custom_q_init=None):
    self.name = name
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.rmax = max_reward
    self.s_a_threshold = s_a_threshold
    self.custom_q_init = custom_q_init
    self.reset()
    self.gamma = gamma
    self.epsilon_one = epsilon_one

    if self.custom_q_init:
        self.q_func = self.custom_q_init
    else:
        self.q_func = defaultdict(lambda: defaultdict(lambda: self.rmax))
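# Usage sketch (illustrative, not part of the original source): an R-Max agent that treats
# each (s, a) pair optimistically until it has been visited s_a_threshold times. The class
# name `RMaxAgent` is an assumption.
rmax_agent = RMaxAgent(actions=["up", "down", "left", "right"], gamma=0.95, s_a_threshold=2)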
def __init__(self, states, state_map, actions, gamma=0.95, horizon=3, init_threshold=2,
             name="RMax", greedy=False):
    Agent.__init__(self, name=name, actions=actions, gamma=gamma)
    self.rmax = 1.0
    self.horizon = horizon
    self.init_threshold = init_threshold
    self.greedy = greedy
    self.states = states
    self.state_map = state_map
    self.actions = actions

    # Define the id of each action in the list.
    self.action_map = {}
    k = 0
    for a in self.actions:
        self.action_map[a] = k
        k += 1
    # print(self.state_map)
    # print(self.action_map)

    self.reset()
def __init__(self, actions, name="Q-learning", alpha=0.1, gamma=0.99, epsilon=0.1, explore="uniform", anneal=False): ''' Args: actions (list): Contains strings denoting the actions. name (str): Denotes the name of the agent. alpha (float): Learning rate. gamma (float): Discount factor. epsilon (float): Exploration term. explore (str): One of {softmax, uniform}. Denotes explore policy. ''' name_ext = "-" + explore if explore != "uniform" else "" Agent.__init__(self, name=name + name_ext, actions=actions, gamma=gamma) # Set/initialize parameters and other relevant classwide data self.alpha, self.alpha_init = alpha, alpha self.epsilon, self.epsilon_init = epsilon, epsilon self.step_number = 0 self.anneal = anneal self.default_q = 0 #1 / (1 - self.gamma) self.explore = explore # Q Function: self.q_func = defaultdict(lambda : defaultdict(lambda: self.default_q))
def __init__(self, states, state_map, actions, gamma=0.95, horizon=3, name="TempLe", thres_sm=5, thres_lg=10, pattern_gap=0.4, greedy=True, with_grouping=False, t1=0, model_gap=0.4, flag_tol=3): name = name Agent.__init__(self, name=name, actions=actions, gamma=gamma) self.horizon = horizon self.thres_sm = thres_sm self.thres_lg = thres_lg self.states = states self.state_map = state_map self.greedy = greedy self.pattern_gap = pattern_gap self.patterns = [] self.single_agent = None self.with_grouping = with_grouping if self.with_grouping: self.t1 = t1 self.groups = [] self.model_gap = model_gap self.flag_tol = flag_tol self.count = 0 self.reset()
def __init__(self, actions, name=""): name = "policy_gradient" if name is "" else name Agent.__init__(self, name=name, actions=actions)
def __init__(self, sess=None, obs_dim=None, obs_bound=None, action_dim=None, action_bound=None,
             num_actions=None, num_options=0, gamma=0.99, epsilon=0.0, tau=0.001,
             high_method='linear', low_method='linear', f_func='fourier', batch_size=32,
             buffer_size=32, low_update_freq=1, option_batch_size=32, option_buffer_size=32,
             high_update_freq=10, option_freq=256, option_min_steps=512, init_all=True,
             init_around_goal=True, init_dist=0.9, term_dist=0.1, bidirectional=False, name=NAME):
    # TODO: Implement an interface for discrete action spaces.
    Agent.__init__(self, name=name, actions=[])

    if sess is None:
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True  # TODO: conv dumps error without this
        self.sess = tf.Session(config=config)
    else:
        self.sess = sess

    self.obs_dim = obs_dim
    self.obs_bound = obs_bound
    self.action_dim = action_dim
    self.action_bound = action_bound
    self.num_actions = num_actions
    # if self.num_actions is None:
    #     self.continuous_action = True
    # else:
    #     self.continuous_action = False

    self.epsilon = epsilon
    self.gamma = gamma
    self.batch_size = batch_size
    self.buffer_size = buffer_size
    # TODO: Let's test online learning first.
    self.low_update_freq = low_update_freq
    self.tau = tau

    self.init_around_goal = init_around_goal
    self.init_dist = init_dist
    self.term_dist = term_dist

    # TODO: Should we use this as an initialization process?
    if num_options == 1:
        # Never update the high-level policy if there are no options.
        self.high_update_freq = 1000000000000000000
    else:
        self.high_update_freq = high_update_freq

    self.option_batch_size = option_batch_size
    self.option_buffer_size = option_buffer_size

    # Online setting
    self.option_freq = option_freq
    self.option_min_steps = option_min_steps

    self.num_options = num_options
    self.init_all = init_all
    self.bidirectional = bidirectional
    self.default_options = []
    self.curr_instances = 0
    self.generated_options = dict()

    self.high_method = high_method
    self.low_method = low_method
    self.f_func = f_func

    if self.high_method == 'linear':
        # low_bound = np.asarray([0.0, 0.0, -2.0, -2.0])
        # up_bound = np.asarray([1.0, 1.0, 2.0, 2.0])
        features = Fourier(state_dim=obs_dim, bound=obs_bound, order=3)
        self.high_control = LinearQAgent(actions=range(self.num_options), feature=features,
                                         name=self.name + "_high")
    elif self.high_method == 'sarsa':
        # low_bound = np.asarray([0.0, 0.0, -2.0, -2.0])
        # up_bound = np.asarray([1.0, 1.0, 2.0, 2.0])
        features = Fourier(state_dim=obs_dim, bound=obs_bound, order=3)
        self.high_control = LinearQAgent(actions=range(self.num_options), feature=features,
                                         sarsa=True, name=self.name + "_high")
    elif self.high_method == 'dqn':
        self.high_control = DQNAgent(sess=self.sess, obs_dim=obs_dim, num_actions=self.num_options,
                                     buffer_size=0, gamma=self.gamma, epsilon=self.epsilon,
                                     learning_rate=0.001, tau=self.tau, name=self.name + "_high")
    elif self.high_method == 'rand':
        self.high_control = RandomAgent(range(self.num_options), name=self.name + "_high")
    else:
        assert (False)

    self.reset()
def __init__(self, actions, name=""): name = "Random" if name is "" else name Agent.__init__(self, name=name, actions=actions)
def __init__(self, actions, name=""): name = "Random" if name is "" else name # print('type(actions)', type(actions)) Agent.__init__(self, name=name, actions=[]) self.actions = actions
def __init__(self, actions, name=""): name = "random" if name is "" else name Agent.__init__(self, name=name, actions=actions)
def __init__(self, state_size, action_size, seed, device, lr_actor=LRA, lr_critic=LRC,
             batch_size=BATCH_SIZE, tensor_log=False, writer=None, name="Global-DDPG-Agent"):
    self.state_size = state_size
    self.action_size = action_size
    self.actor_learning_rate = lr_actor
    self.critic_learning_rate = lr_critic
    self.batch_size = batch_size

    self.seed = random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    self.device = device
    self.tensor_log = tensor_log
    self.name = name

    self.noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(action_size))

    self.actor = Actor(state_size, action_size, device=device)
    self.critic = Critic(state_size, action_size, device=device)
    self.target_actor = Actor(state_size, action_size, device=device)
    self.target_critic = Critic(state_size, action_size, device=device)

    # Initialize the actor target network.
    for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
        target_param.data.copy_(param.data)

    # Initialize the critic target network.
    for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
        target_param.data.copy_(param.data)

    self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=lr_critic, weight_decay=1e-2)
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)

    self.replay_buffer = ReplayBuffer(buffer_size=BUFFER_SIZE,
                                      name_buffer="{}_replay_buffer".format(name))
    self.epsilon = 1.0

    # Tensorboard logging
    self.writer = None
    if tensor_log:
        self.writer = writer if writer is not None else SummaryWriter()

    self.n_learning_iterations = 0
    self.n_acting_iterations = 0

    Agent.__init__(self, name, [], gamma=GAMMA)
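# Usage sketch (illustrative, not part of the original source): the PyTorch DDPG agent above
# on a small continuous-control problem. The class name `DDPGAgent` is an assumption, and
# LRA, LRC, BATCH_SIZE, BUFFER_SIZE, and GAMMA are assumed to be module-level constants
# defined alongside the class.
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ddpg_agent = DDPGAgent(state_size=8, action_size=2, seed=0, device=device)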
def __init__(self, state_size, action_size, trained_options, seed, device, name="DQN-Agent", eps_start=1., tensor_log=False, lr=LR, use_double_dqn=False, gamma=GAMMA, loss_function="huber", gradient_clip=None, evaluation_epsilon=0.05, writer=None): self.state_size = state_size self.action_size = action_size self.trained_options = trained_options self.learning_rate = lr self.use_ddqn = use_double_dqn self.gamma = gamma self.loss_function = loss_function self.gradient_clip = gradient_clip self.evaluation_epsilon = evaluation_epsilon self.seed = random.seed(seed) self.tensor_log = tensor_log self.device = device # Q-Network self.policy_network = QNetwork(state_size, action_size, seed).to(self.device) self.target_network = QNetwork(state_size, action_size, seed).to(self.device) self.optimizer = optim.Adam(self.policy_network.parameters(), lr=lr) # Replay memory self.replay_buffer = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, self.device) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Epsilon strategy self.epsilon_schedule = GlobalEpsilonSchedule( eps_start) if "global" in name.lower() else OptionEpsilonSchedule( eps_start) self.epsilon = eps_start self.num_executions = 0 # Number of times act() is called (used for eps-decay) # Debugging attributes self.num_updates = 0 self.num_epsilon_updates = 0 if self.tensor_log: self.writer = SummaryWriter() if writer is None else writer print("\nCreating {} with lr={} and ddqn={} and buffer_sz={}\n".format( name, self.learning_rate, self.use_ddqn, BUFFER_SIZE)) Agent.__init__(self, name, range(action_size), GAMMA)