def __init__(self, cfg_parser: ConfigurationManager):
    """Set up the agent and register its loss for training.

    The base initializer is given ``QuantileRegressionAgent.head``; the
    "AGENT" configuration section is then parsed against
    ``QuantileRegressionAgent.required_params``, the loss op is built and
    handed to ``self.prepare``.

    Args:
        cfg_parser: Configuration manager the agent settings are read from.
    """
    agent_cls = QuantileRegressionAgent
    super().__init__(cfg_parser, agent_cls.head)

    self.cfg = cfg_parser.parse_and_return_dictionary(
        "AGENT",
        agent_cls.required_params,
    )

    # Build the loss once, keep it on the instance, then pass it on.
    loss_op = self.build_loss_op()
    self.loss = loss_op
    self.prepare(loss_op)
def __init__(self, cfg_parser: ConfigurationManager, agent: Agent):
    """Instantiate the exploration strategy named in the configuration.

    Reads the "POLICY.EXPLORATION_STRATEGY" section and stores the chosen
    strategy object in ``self.policy``.

    Args:
        cfg_parser: Configuration manager the strategy settings are read from.
        agent: Agent handed to the strategy constructor.

    Raises:
        NotImplementedError: If "TYPE" is neither "EPSILON_GREEDY" nor
            "SOFTMAX".
    """
    strategy_cfg = cfg_parser.parse_and_return_dictionary(
        "POLICY.EXPLORATION_STRATEGY",
        Policy.required_params,
    )
    strategy_type = strategy_cfg["TYPE"]

    if strategy_type == "EPSILON_GREEDY":
        self.policy = EpsilonGreedy(cfg_parser, agent)
        return

    if strategy_type == "SOFTMAX":
        self.policy = SoftMax(agent)
        return

    raise NotImplementedError
def __init__(self, config_parser: ConfigurationManager,
             net: network.GeneralNetwork):
    """Build the head and normalise its distribution with a softmax.

    The base initializer is expected to have created ``self.q_dist``; it is
    replaced here by its softmax over the last axis.

    Args:
        config_parser: Configuration manager the "HEAD" section is read from.
        net: Network the head is attached to (forwarded to the base class).
    """
    super().__init__(config_parser, net)

    head_cls = SoftmaxFixedAtomsDistributionalHead
    self.cfg = config_parser.parse_and_return_dictionary(
        "HEAD",
        head_cls.required_params,
    )

    # Overwrite the parent's q_dist with its softmax over the last axis.
    raw_dist = self.q_dist
    self.q_dist = tf.nn.softmax(
        raw_dist,
        axis=-1,
        name="state_action_value_dist",
    )
def __init__(self, env: Environment, config_parser: ConfigurationManager):
    """Wrap ``env.env`` and determine the list of available actions.

    ``self.actions`` is taken from "ACTION_SPECIFICATIONS" in the
    "ENVIRONMENT" section when present; otherwise it is
    ``[0, ..., DEFAULT_NUM_ACTIONS - 1]`` from the parsed JSON.

    Args:
        env: Environment whose ``env`` attribute is the wrapped gym env.
        config_parser: Configuration manager the settings are read from.
    """
    gym.Wrapper.__init__(self, env.env)

    self.cfg = config_parser.parse_and_return_dictionary(
        "ENVIRONMENT",
        NetworkActionToEnvAction.required_params,
    )

    if "ACTION_SPECIFICATIONS" not in self.cfg:
        # No explicit specification: fall back to a plain index range.
        default_count = config_parser.parsed_json["DEFAULT_NUM_ACTIONS"]
        self.actions = list(range(default_count))
    else:
        self.actions = self.cfg["ACTION_SPECIFICATIONS"]
def __init__(self, config_parser: ConfigurationManager):
    """Read the environment settings and derive the number of actions.

    ``self.num_actions`` is the length of "ACTION_SPECIFICATIONS" when that
    key is configured, otherwise "DEFAULT_NUM_ACTIONS" from the parsed JSON.
    ``self.greedy_action`` starts out as ``None``.

    Args:
        config_parser: Configuration manager the settings are read from.
    """
    self.cfg = config_parser.parse_and_return_dictionary(
        "ENVIRONMENT",
        Head.required_params,
    )

    if "ACTION_SPECIFICATIONS" not in self.cfg:
        self.num_actions = config_parser.parsed_json["DEFAULT_NUM_ACTIONS"]
    else:
        action_specs = self.cfg["ACTION_SPECIFICATIONS"]
        self.num_actions = len(action_specs)

    self.greedy_action = None
def __init__(self, cfg_parser: ConfigurationManager, head):
    """Create the session, networks, replay memory and input placeholders.

    Args:
        cfg_parser: Configuration manager. It is also written to: the
            "NUM_ACTIONS" entry is set from the train network before the
            "AGENT" section is parsed.
        head: Head passed to
            ``build_train_and_target_general_network_with_head``.
    """
    super().__init__()
    # NOTE(review): imports are function-local, presumably to avoid circular
    # imports between packages -- confirm before hoisting them.
    from util.util import build_train_and_target_general_network_with_head, get_session
    self.sess = get_session(cfg_parser)
    # The builder returns six values: train/target base networks and heads,
    # the copy op and the saver.
    self.train_network_base, self.train_network, \
        self.target_network_base, self.target_network, self.copy_op, self.saver = \
        build_train_and_target_general_network_with_head(head, cfg_parser)
    from memory.experience_replay import ExperienceReplay
    self.experience_replay = ExperienceReplay(cfg_parser)
    self.cfg_parser = cfg_parser
    from function_approximator.head import QNetworkHead
    # Annotation-only statements: they assign nothing and only declare the
    # expected type of the two heads.
    self.train_network: QNetworkHead
    self.target_network: QNetworkHead
    # Publish the action count through the configuration before the "AGENT"
    # section is parsed.
    cfg_parser["NUM_ACTIONS"] = self.train_network.num_actions
    self.cfg = cfg_parser.parse_and_return_dictionary(
        "AGENT", BaseDQNBasedAgent.required_params)
    # Not set here; left as None at construction time.
    self.train_step = None
    # Rank-1 placeholders with an unspecified length.
    self.action_placeholder = tf.placeholder(name="action",
                                             dtype=tf.int32,
                                             shape=[
                                                 None,
                                             ])
    self.reward_placeholder = tf.placeholder(name="reward",
                                             dtype=tf.float32,
                                             shape=(None, ))
    # TODO: Optimize memory uint8 -> bool (check if casting works to float)
    self.terminal_placeholder = tf.placeholder(name="terminal",
                                               dtype=tf.uint8,
                                               shape=(None, ))
    # Python-side call counters, both starting at zero.
    self.predict_calls = 0
    self.train_calls = 0
    # Non-trainable int32 graph variable, initialised to zero.
    self.num_updates = tf.Variable(initial_value=0,
                                   dtype=tf.int32,
                                   trainable=False)
    # 0..B-1, where B is the dynamic first dimension of the train network's
    # input placeholder.
    self.batch_dim_range = tf.range(tf.shape(self.train_network_base.x)[0],
                                    dtype=tf.int32)
    self.policy = None
def __init__(self, cfg_parser: ConfigurationManager):
    """Set up the agent, its fixed support and its loss.

    After the base initializer runs with ``CategoricalAgent.head``, the
    "AGENT" section is parsed and "NB_ATOMS" is copied in from
    "HEAD.NB_ATOMS". ``self.Z`` holds NB_ATOMS evenly spaced values from
    V_MIN to V_MAX and ``self.delta_z`` the spacing between them. The loss
    op is then built and passed to ``self.prepare``.

    Args:
        cfg_parser: Configuration manager the agent settings are read from.
    """
    agent_cls = CategoricalAgent
    super().__init__(cfg_parser, agent_cls.head)

    self.cfg = cfg_parser.parse_and_return_dictionary(
        "AGENT",
        agent_cls.required_params,
    )
    # The atom count lives in the head's section; mirror it here.
    self.cfg["NB_ATOMS"] = self.cfg_parser["HEAD.NB_ATOMS"]

    support, spacing = np.linspace(
        self.cfg["V_MIN"],
        self.cfg["V_MAX"],
        self.cfg["NB_ATOMS"],
        retstep=True,
    )
    self.Z = support
    self.delta_z = spacing

    loss_op = self.build_loss_op()
    self.loss = loss_op
    self.prepare(loss_op)
def __init__(self, config_parser: ConfigurationManager):
    """Build the network body: input, conv stack, flatten, dense stack.

    The observation shape comes from "STATE_DIMENSIONS" in the "NETWORK"
    section when present, otherwise from "DEFAULT_OBS_DIMS" in the parsed
    JSON. Layers are created in the order they are listed in
    "CONVOLUTIONAL_LAYERS_SPEC" and "DENSE_LAYERS_SPEC"; both lists may be
    empty.

    Attributes set:
        x: float32 input placeholder of shape ``(None, *obs_shape)``.
        conv_outputs: Outputs of the convolutional layers, in order.
        flattened_conv_output: Flattened last conv output (only set when at
            least one conv layer is configured).
        dense_outputs: Outputs of the dense layers, in order.
        last_op: Output of the last layer that was built. With no dense
            layers this is the flattened conv output, or ``x`` itself when
            there are no conv layers either.

    Args:
        config_parser: Configuration manager the settings are read from.
    """
    self.cfg = config_parser.parse_and_return_dictionary(
        "NETWORK", GeneralNetwork.required_params)

    if "STATE_DIMENSIONS" in self.cfg:
        obs_shape = [int(i) for i in self.cfg["STATE_DIMENSIONS"]]
    else:
        obs_shape = config_parser.parsed_json["DEFAULT_OBS_DIMS"]

    # Input
    self.x = tf.placeholder(name="state",
                            dtype=tf.float32,
                            shape=(None, *obs_shape))

    # `last_out` always holds the most recent tensor in the chain.
    last_out = self.x

    # Convolutional layers (names are 1-based: conv_layer_1, conv_layer_2, ...)
    self.conv_outputs = []
    for idx, conv_spec in enumerate(self.cfg["CONVOLUTIONAL_LAYERS_SPEC"],
                                    start=1):
        last_out = layers.conv2d(name="conv_layer_" + str(idx),
                                 inputs=last_out,
                                 filters=conv_spec["filters"],
                                 kernel_size=conv_spec["kernel_size"],
                                 strides=conv_spec["strides"],
                                 activation=tf.nn.relu)
        self.conv_outputs.append(last_out)

    if self.conv_outputs:
        # Flatten
        self.flattened_conv_output = tf.layers.flatten(
            name="conv_output_flattener", inputs=self.conv_outputs[-1])
        last_out = self.flattened_conv_output

    # Hidden (dense) layers (names are 1-based: fc_layer_1, fc_layer_2, ...)
    self.dense_outputs = []
    for idx, units in enumerate(self.cfg["DENSE_LAYERS_SPEC"], start=1):
        last_out = layers.dense(name="fc_layer_" + str(idx),
                                inputs=last_out,
                                units=units,
                                activation=tf.nn.relu)
        self.dense_outputs.append(last_out)

    # Previously `self.dense_outputs[-1]`, which raised IndexError for an
    # empty DENSE_LAYERS_SPEC; `last_out` is identical when dense layers
    # exist and falls back to the preceding tensor when they do not.
    self.last_op = last_out
def get_session(cfg_params: ConfigurationManager):
    """Create a ``tf.Session`` configured from the "TENSORFLOW" section.

    Recognised optional keys:
        ALLOW_GPU_GROWTH: Truthy to enable GPU memory growth, falsy to
            disable it. Growth is enabled when the key is absent.
        INTRA_OP_PARALLELISM: Value for ``intra_op_parallelism_threads``.
        INTER_OP_PARALLELISM: Value for ``inter_op_parallelism_threads``.

    Args:
        cfg_params: Configuration manager the settings are read from.

    Returns:
        A new ``tf.Session`` using the assembled ``tf.ConfigProto``.
    """
    required_params = []
    tf_params = cfg_params.parse_and_return_dictionary("TENSORFLOW",
                                                       required_params)

    config = tf.ConfigProto()

    # The original condition was inverted: an explicit true left growth
    # disabled and an explicit false enabled it. Honour the configured value
    # and keep the previous default (enabled) when the key is missing.
    if "ALLOW_GPU_GROWTH" in tf_params:
        config.gpu_options.allow_growth = bool(tf_params["ALLOW_GPU_GROWTH"])
    else:
        config.gpu_options.allow_growth = True

    if "INTRA_OP_PARALLELISM" in tf_params:
        config.intra_op_parallelism_threads = tf_params["INTRA_OP_PARALLELISM"]
    if "INTER_OP_PARALLELISM" in tf_params:
        config.inter_op_parallelism_threads = tf_params["INTER_OP_PARALLELISM"]

    return tf.Session(config=config)
def __init__(self, config_parser: ConfigurationManager,
             net: network.GeneralNetwork):
    """Attach a fixed-atom distributional output to ``net``.

    A single dense layer on ``net.last_op`` emits
    ``num_actions * NB_ATOMS`` values, which are reshaped to
    ``[-1, num_actions, NB_ATOMS]``. ``self.q`` is the mean over the atom
    axis and ``self.greedy_action`` the int32 argmax of ``self.q`` over the
    action axis (squeezed).

    Args:
        config_parser: Configuration manager the "HEAD" section is read from.
        net: Network whose ``last_op`` feeds the head.
    """
    super().__init__(config_parser)

    self.cfg = config_parser.parse_and_return_dictionary(
        "HEAD",
        FixedAtomsDistributionalHead.required_params,
    )
    atoms_per_action = self.cfg["NB_ATOMS"]

    # One flat vector per state holding every action's distribution.
    self.flattened_dist = layers.dense(
        inputs=net.last_op,
        units=self.num_actions * atoms_per_action,
        activation=None,
        name="flattened_dists",
    )

    # Split the flat vector back into one row of atoms per action.
    per_action_shape = [-1, self.num_actions, atoms_per_action]
    self.q_dist = tf.reshape(
        self.flattened_dist,
        per_action_shape,
        name="per_action_dist",
    )

    self.q = tf.reduce_mean(self.q_dist, axis=-1)

    best_action = tf.argmax(self.q, axis=-1)
    self.greedy_action = tf.cast(tf.squeeze(best_action), dtype=tf.int32)
def __init__(self, config_parser: ConfigurationManager):
    """Create a bounded replay memory.

    ``self.memory`` is a ``deque`` whose ``maxlen`` is
    "EXPERIENCE_REPLAY_SIZE" from the "EXPERIENCE_REPLAY" section.

    Args:
        config_parser: Configuration manager the settings are read from.
    """
    replay_cfg = config_parser.parse_and_return_dictionary(
        "EXPERIENCE_REPLAY",
        ExperienceReplay.required_params,
    )
    self.cfg = replay_cfg

    capacity = replay_cfg["EXPERIENCE_REPLAY_SIZE"]
    self.memory = deque(maxlen=capacity)
def __init__(self, config_parser: ConfigurationManager,
             net: network.GeneralNetwork):
    """Build the head on top of ``net.last_op`` for two sets of tau samples.

    Two tau tensors are produced (``get_uniform_dist`` and
    ``distorted_expectation``), passed through the same cosine embedding
    layer, combined with ``psi`` and mapped to per-action values by the same
    output layer. ``self.q`` and ``self.greedy_action`` come from the
    distorted branch; ``self.q_dist`` and ``self.q_undistorted`` from the
    uniform one.

    Args:
        config_parser: Configuration manager; the "HEAD" section is read here
            and the parser is also forwarded to ``distorted_expectation``.
        net: Network whose ``last_op`` is used as ``psi``.
    """
    super().__init__(config_parser)
    self.cfg = config_parser.parse_and_return_dictionary(
        "HEAD", IQNHead.required_params)
    self.psi = net.last_op
    # Scalar int32 placeholder; fed at run time.
    self.num_samples = tf.placeholder(dtype=tf.int32,
                                      shape=[],
                                      name="num_samples")
    # Preprocessed tau (choose number of samples and pass through beta as necessary)
    from action_policy.distorted_expectation import distorted_expectation, get_uniform_dist
    self.uniform_tau = get_uniform_dist(psi=self.psi,
                                        N_placeholder=self.num_samples)
    self.distorted_tau = distorted_expectation(
        config_parser, psi=self.psi, N_placeholder=self.num_samples)
    import math as m
    pi = tf.constant(m.pi)
    # A single Dense layer object, called on both branches below.
    cos_embed = tf.layers.Dense(units=self.cfg["EMBEDDING_SIZE"],
                                activation=tf.nn.relu,
                                name="cosine_embedding")
    # 'bn,j->bnj' is the outer product of each tau (b, n) with the indices
    # 0..EMBEDDING_SIZE-1; it is scaled by pi, passed through cos, then
    # through the embedding layer.
    self.distorted_tau_phi = cos_embed(
        tf.cos(
            tf.einsum(
                'bn,j->bnj', self.distorted_tau,
                tf.range(self.cfg["EMBEDDING_SIZE"], dtype=tf.float32)) *
            pi))
    # 'bnj,bj->bnj' multiplies every sample's embedding elementwise by psi,
    # so psi's last dimension must equal EMBEDDING_SIZE.
    mul_distorted = tf.einsum('bnj,bj->bnj', self.distorted_tau_phi,
                              self.psi)
    # Same computation for the uniform tau samples.
    self.uniform_tau_phi = cos_embed(
        tf.cos(
            tf.einsum(
                'bn,j->bnj', self.uniform_tau,
                tf.range(self.cfg["EMBEDDING_SIZE"], dtype=tf.float32)) *
            pi))
    mul_uniform = tf.einsum('bnj,bj->bnj', self.uniform_tau_phi, self.psi)
    # Output layer, also a single object called on both branches. The
    # transpose swaps the last two axes so the sample axis comes last.
    q_dist_layer = tf.layers.Dense(units=self.num_actions,
                                   activation=None,
                                   name="q_dist")
    self.q_dist = tf.transpose(q_dist_layer(mul_uniform), perm=[0, 2, 1])
    # Mean over the sample axis.
    self.q_undistorted = tf.reduce_mean(self.q_dist, axis=-1)
    self.q_dist_distorted = tf.transpose(q_dist_layer(mul_distorted),
                                         perm=[0, 2, 1])
    self.q = tf.reduce_mean(self.q_dist_distorted, axis=-1)
    # Argmax over the action axis of the distorted values, squeezed and cast
    # to int32.
    self.greedy_action = tf.cast(tf.squeeze(tf.argmax(self.q, axis=-1)),
                                 dtype=tf.int32)