def __init__(self, agent_parameters: AgentParameters, has_target: bool, has_global: bool, name: str,
             spaces: SpacesDefinition, replicated_device=None, worker_device=None):
    """
    Construct the global, online and (optionally) target copies of one named network.

    :param agent_parameters: the agent parameters. The parameters of this specific network are read from
                             agent_parameters.network_wrappers[name]
    :param has_target: when True, a non-trainable target network is constructed as well
    :param has_global: when True, a global network is constructed and passed to the local networks
    :param name: the key of this network in agent_parameters.network_wrappers
    :param spaces: the spaces definition, forwarded as-is to every constructed network
    :param replicated_device: the device(s) the global network is constructed with
    :param worker_device: the device(s) the online and target networks are constructed with
    """
    self.ap = agent_parameters
    self.network_parameters = self.ap.network_wrappers[name]
    self.has_target = has_target
    self.has_global = has_global
    self.name = name
    self.sess = None

    # select the framework specific constructor, refusing frameworks whose import has failed
    framework = self.network_parameters.framework
    if framework == Frameworks.tensorflow:
        if "tensorflow" in failed_imports:
            raise Exception('Install tensorflow before using it as framework')
        general_network = GeneralTensorFlowNetwork.construct
    elif framework == Frameworks.mxnet:
        if "mxnet" in failed_imports:
            raise Exception('Install mxnet before using it as framework')
        general_network = GeneralMxnetNetwork.construct
    else:
        raise Exception("{} Framework is not supported"
                        .format(Frameworks().to_string(self.network_parameters.framework)))

    variable_scope = "{}/{}".format(self.ap.full_name_id, name)

    def build(network_name, devices, global_network, is_local, is_trainable):
        # all the network copies share the scope, the agent parameters and the spaces
        return general_network(variable_scope=variable_scope,
                               devices=force_list(devices),
                               agent_parameters=agent_parameters,
                               name=network_name,
                               global_network=global_network,
                               network_is_local=is_local,
                               spaces=spaces,
                               network_is_trainable=is_trainable)

    # Global network - the main network shared between threads
    self.global_network = None
    if self.has_global:
        # we assign the parameters of this network on the parameters server
        self.global_network = build('{}/global'.format(name), replicated_device,
                                    global_network=None, is_local=False, is_trainable=True)

    # Online network - local copy of the main network used for playing
    self.online_network = None
    self.online_network = build('{}/online'.format(name), worker_device,
                                global_network=self.global_network, is_local=True, is_trainable=True)

    # Target network - a local, slow updating network used for stabilizing the learning
    self.target_network = None
    if self.has_target:
        self.target_network = build('{}/target'.format(name), worker_device,
                                    global_network=self.global_network, is_local=True, is_trainable=False)
def post_training_commands(self):
    """
    Run the off-policy training steps that follow an on-policy training step.

    The entropy coefficient of output head 1 is set to 0 and the loss weights of all the output heads are
    switched to the configured SIL loss weights for the duration of the off-policy steps. Both are restored
    to their regular values afterwards.
    """
    online = self.networks['main'].online_network
    entropy_head = online.output_heads[1]
    wrapper_params = self.ap.network_wrappers['main']

    def assign_loss_weights(weights_per_head):
        # push one loss weight entry into every output head of the online network
        for head_idx, head in enumerate(online.output_heads):
            online.set_variable_value(head.set_loss_weight,
                                      force_list(weights_per_head[head_idx]),
                                      head.loss_weight_placeholder)

    # remove entropy regularization
    online.set_variable_value(entropy_head.set_beta, 0, entropy_head.beta_placeholder)

    # set the loss weights to the SIL loss weights
    assign_loss_weights(wrapper_params.sil_loss_weights)

    # sil training
    for _ in range(self.ap.algorithm.off_policy_training_steps_per_on_policy_training_steps):
        self.train_off_policy()

    # add back entropy regularization
    online.set_variable_value(entropy_head.set_beta, self.ap.algorithm.beta_entropy,
                              entropy_head.beta_placeholder)

    # recover the regular loss weights
    assign_loss_weights(wrapper_params.loss_weights)
def train_on_batch(self, inputs, targets, scaler=1., additional_fetches=None, importance_weights=None):
    """
    Given a batch of examples and targets, runs a forward pass & backward pass and then applies the gradients

    :param inputs: The input for the network
    :param targets: The targets corresponding to the input batch
    :param scaler: A scaling factor that allows rescaling the gradients before applying them
    :param additional_fetches: Optional tensors to fetch during the training process. A single (non-list)
                               fetch is accepted and is wrapped in a list
    :param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
                               error of this sample. If it is not given, the samples losses won't be scaled
    :return: The value returned by accumulate_gradients for this batch
    """
    if additional_fetches is None:
        additional_fetches = []
    # force_list returns the wrapped value rather than modifying its argument, so the result has to be kept
    # (it was previously discarded, which left a single fetch unwrapped)
    additional_fetches = force_list(additional_fetches)
    loss = self.accumulate_gradients(inputs, targets, additional_fetches=additional_fetches,
                                     importance_weights=importance_weights)
    self.apply_and_reset_gradients(self.accumulated_gradients, scaler)
    return loss
def step(self, action: Union[List[ActionType], ActionType]) -> List[EnvResponse]:
    """
    Make a single step in the environment using the given action

    :param action: an action to use for stepping the environment. Should follow the definition of the action space.
                   Passing None repeats the previously done action.
    :return: the environment response as returned in get_last_env_response
    :raises ValueError: if a given action is not contained in its action space after clipping
    """
    if action is None:
        # Allow passing None to repeat the previously done action. This has to be decided before the
        # clipping / scaling loop: that loop always rebinds `action` to a list, which made the None check
        # that used to follow it unreachable. The stored action was already clipped and scaled when it
        # was first taken, so it is reused as is.
        action = self.last_action
    else:
        clipped_and_scaled_action = list()
        for agent_action, action_space in zip(force_list(action), force_list(self.action_space)):
            agent_action = action_space.clip_action_to_space(agent_action)
            if action_space and not action_space.contains(agent_action):
                raise ValueError(
                    "The given action does not match the action space definition. "
                    "Action = {}, action space definition = {}".format(agent_action, action_space))
            if hasattr(action_space, 'scale_action_space') and action_space.scale_action_space:
                agent_action = action_space.scale_action_values(agent_action)
            clipped_and_scaled_action.append(agent_action)
        action = clipped_and_scaled_action

    # store the last agent action done
    self.last_action = action

    self.current_episode_steps_counter += 1
    if self.phase != RunPhase.UNDEFINED:
        self.total_steps_counter += 1

    # act
    self._take_action(action)

    # observe
    self._update_state()

    # accumulate the per-agent reward of this step into the running per-agent episode totals
    self.total_reward_in_current_episode = [
        total_reward_in_current_episode + reward
        for total_reward_in_current_episode, reward in zip(self.total_reward_in_current_episode, self.reward)
    ]

    # one response per (state, reward, done) triplet; goal and info are shared between the responses
    self.last_env_response = [
        EnvResponse(next_state=state, reward=reward, game_over=done, goal=self.goal, info=self.info)
        for state, reward, done in zip(self.state, self.reward, self.done)
    ]

    return self.last_env_response
def _build_module(self):
    """
    Build the optional pre-LSTM layers followed by a single LSTM layer.

    self.state_in: tuple of placeholders containing the initial state
    self.state_out: tuple of output state

    todo: it appears that the shape of the output is batch, feature
    the code here seems to be slicing off the first element in the batch
    which would definitely be wrong. need to double check the shape
    """
    self.layers.append(self.input)

    # optionally insert some layers before the LSTM
    for layer_idx, build_layer in enumerate(self.layers_params):
        new_layers = build_layer(self.layers[-1], name='fc{}'.format(layer_idx), is_training=self.is_training)
        self.layers.extend(force_list(new_layers))

    # add the LSTM layer
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(self.number_of_lstm_cells, state_is_tuple=True)
    c_shape = (1, lstm_cell.state_size.c)
    h_shape = (1, lstm_cell.state_size.h)

    # zero valued initial state, kept as numpy arrays
    self.c_init = np.zeros(c_shape, np.float32)
    self.h_init = np.zeros(h_shape, np.float32)
    self.state_init = [self.c_init, self.h_init]

    # placeholders through which the initial state is fed
    self.c_in = tf.placeholder(tf.float32, list(c_shape))
    self.h_in = tf.placeholder(tf.float32, list(h_shape))
    self.state_in = (self.c_in, self.h_in)

    last_layer = self.layers[-1]
    rnn_in = tf.expand_dims(last_layer, [0])
    step_size = tf.shape(last_layer)[:1]
    initial_state = tf.nn.rnn_cell.LSTMStateTuple(self.c_in, self.h_in)
    lstm_outputs, lstm_state = tf.nn.dynamic_rnn(lstm_cell,
                                                 rnn_in,
                                                 initial_state=initial_state,
                                                 sequence_length=step_size,
                                                 time_major=False)

    final_c, final_h = lstm_state
    self.state_out = (final_c[:1, :], final_h[:1, :])
    self.output = tf.reshape(lstm_outputs, [-1, self.number_of_lstm_cells])
def last_env_response(self, val: Union[List[EnvResponse], EnvResponse]):
    """
    Store the last environment response. The value is passed through force_list before it is stored.

    :param val: the last environment response, either a single response or a list of responses
    """
    responses = force_list(val)
    self._last_env_response = responses
def should_dump_video_of_the_current_episode(self, episode_terminated=False):
    """
    Decide whether a video of the current episode should be dumped.

    :param episode_terminated: forwarded to should_dump of every configured video dump method
    :return: False if no video dump methods are configured. Otherwise True only if every configured method
             agrees to dump; the methods are queried in order and querying stops at the first refusal
    """
    dump_methods = self.visualization_parameters.video_dump_methods
    if not dump_methods:
        return False

    # every method receives all the instance attributes as keyword arguments
    return all(method.should_dump(episode_terminated, **self.__dict__)
               for method in force_list(dump_methods))
def _build_module(self) -> None:
    """
    Builds the graph of the module
    This method is called early on from __call__. It is expected to store the graph in self.output.

    The input is first divided by self.input_rescaling, then self.input_offset is subtracted from it, and
    then it is optionally clipped, before being passed through the configured layers and flattened.

    :return: None
    """
    # NOTE: for image inputs, we expect the data format to be of type uint8, so to be memory efficient. we chose not
    #  to implement the rescaling as an input filters.observation.observation_filter, as this would have caused the
    #  input to the network to be float, which is 4x more expensive in memory.
    #  thus causing each saved transition in the memory to also be 4x more pricier.
    rescaled_input = self.input / self.input_rescaling
    input_layer = rescaled_input - self.input_offset

    # clip input using the given range
    if self.input_clipping is not None:
        clip_min, clip_max = self.input_clipping[0], self.input_clipping[1]
        input_layer = tf.clip_by_value(input_layer, clip_min, clip_max)

    self.layers.append(input_layer)

    for layer_idx, build_layer in enumerate(self.layers_params):
        layer_name = '{}_{}'.format(build_layer.__class__.__name__, layer_idx)
        new_layers = build_layer(input_layer=self.layers[-1], name=layer_name, is_training=self.is_training)
        self.layers.extend(force_list(new_layers))

    self.output = tf.contrib.layers.flatten(self.layers[-1])
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
             head_idx: int = 0, loss_weight: float = 1., is_local: bool = True,
             activation_function: str = 'relu'):
    """
    Initialize the attributes of a head. Nothing is built here.

    :param agent_parameters: the agent parameters; the parameters of the network this head belongs to are
                             read from agent_parameters.network_wrappers[network_name]
    :param spaces: the spaces definition, stored as is
    :param network_name: the key of the network this head belongs to in agent_parameters.network_wrappers
    :param head_idx: the index of this head, stored as is
    :param loss_weight: the loss weight(s) of this head. Stored after being passed through force_list
    :param is_local: stored as is
    :param activation_function: stored as is
    """
    # configuration given by the caller
    self.ap = agent_parameters
    self.spaces = spaces
    self.network_name = network_name
    self.network_parameters = agent_parameters.network_wrappers[self.network_name]
    self.head_idx = head_idx
    self.is_local = is_local
    self.activation_function = activation_function
    self.loss_weight = force_list(loss_weight)

    # defaults
    self.name = "head"
    self.return_type = None

    # containers which start out empty (each one is a separate list)
    self.input = []
    self.output = []
    self.target = []
    self.importance_weight = []
    self.loss = []
    self.loss_type = []
    self.regularizations = []
def __init__(self, agent_parameters: AgentParameters, spaces: SpacesDefinition, network_name: str,
             head_idx: int = 0, loss_weight: float = 1., is_local: bool = True,
             activation_function: str = 'relu', dense_layer=Dense):
    """
    Initialize the attributes of a head. The only graph element created here is the loss weight variable.

    :param agent_parameters: the agent parameters; the parameters of the network this head belongs to are
                             read from agent_parameters.network_wrappers[network_name]
    :param spaces: the spaces definition, stored as is
    :param network_name: the key of the network this head belongs to in agent_parameters.network_wrappers
    :param head_idx: the index of this head, stored as is
    :param loss_weight: the loss weight(s) of this head. The values are converted to floats and held in a
                        non-trainable variable which belongs to the local variables collection
    :param is_local: stored as is
    :param activation_function: stored as is
    :param dense_layer: the dense layer class. None selects Dense, any other value is passed through
                        convert_layer_class
    """
    # configuration given by the caller
    self.ap = agent_parameters
    self.spaces = spaces
    self.network_name = network_name
    self.network_parameters = agent_parameters.network_wrappers[self.network_name]
    self.head_idx = head_idx
    self.is_local = is_local
    self.activation_function = activation_function

    # defaults
    self.name = "head"
    self.return_type = None

    # containers which start out empty (each one is a separate list)
    self.input = []
    self.output = []
    self.target = []
    self.importance_weight = []
    self.loss = []
    self.loss_type = []
    self.regularizations = []

    initial_loss_weights = [float(weight) for weight in force_list(loss_weight)]
    self.loss_weight = tf.Variable(initial_loss_weights,
                                   trainable=False,
                                   collections=[tf.GraphKeys.LOCAL_VARIABLES])

    self.dense_layer = Dense if dense_layer is None else convert_layer_class(dense_layer)
def _build_module(self, input_layer):
    """
    Stack the layers listed in self.scheme on top of the given input and finish with a dense layer which
    has self.num_actions units. The dense layer is stored in self.output.

    :param input_layer: the input to the first layer of the scheme
    """
    self.layers.append(input_layer)

    for layer_idx, build_layer in enumerate(self.scheme):
        layer_name = '{}_{}'.format(build_layer.__class__.__name__, layer_idx)
        new_layers = build_layer(input_layer=self.layers[-1], name=layer_name)
        self.layers.extend(force_list(new_layers))

    output_layer = self.dense_layer(self.num_actions)(self.layers[-1], name='output')
    self.layers.append(output_layer)
    self.output = self.layers[-1]
def train_value_network(self, dataset, epochs):
    """
    Train the critic network on the given dataset for the given number of epochs.

    :param dataset: the transitions to train on. They are wrapped in a single Batch
    :param epochs: the number of passes over the dataset
    :return: the mean (over axis 0) of the first loss value returned by accumulate_gradients for every
             mini-batch of every epoch
    """
    collected_losses = []
    batch = Batch(dataset)
    critic = self.networks['critic']
    network_keys = self.ap.network_wrappers['critic'].input_embedders_parameters.keys()

    # * Found not to have any impact *
    # add a timestep to the observation
    # current_states_with_timestep = self.concat_state_and_timestep(dataset)

    mix_fraction = self.ap.algorithm.value_targets_mix_fraction
    total_returns = batch.n_step_discounted_rewards(True)

    for _ in range(epochs):
        # LBFGS consumes the whole batch at once, any other optimizer works on mini-batches
        curr_batch_size = batch.size
        if critic.online_network.optimizer_type != 'LBFGS':
            curr_batch_size = self.ap.network_wrappers['critic'].batch_size

        for batch_idx in range(batch.size // curr_batch_size):
            # split to batches for first order optimization techniques
            begin = batch_idx * curr_batch_size
            end = (batch_idx + 1) * curr_batch_size
            current_states_batch = {
                key: value[begin:end] for key, value in batch.states(network_keys).items()
            }
            total_return_batch = total_returns[begin:end]

            old_policy_values = force_list(critic.target_network.predict(current_states_batch).squeeze())

            if critic.online_network.optimizer_type != 'LBFGS':
                targets = total_return_batch
            else:
                current_values = critic.online_network.predict(current_states_batch)
                targets = current_values * (1 - mix_fraction) + total_return_batch * mix_fraction

            # the old values are given only to those extra inputs which the online network actually declares
            inputs = copy.copy(current_states_batch)
            for value_idx, old_value in enumerate(old_policy_values):
                input_name = 'output_0_{}'.format(value_idx)
                if input_name in critic.online_network.inputs:
                    inputs[input_name] = old_value

            value_loss = critic.online_network.accumulate_gradients(inputs, targets)

            critic.apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters, DistributedTaskParameters):
                critic.apply_gradients_to_global_network()
            critic.online_network.reset_accumulated_gradients()

            collected_losses.append([value_loss[0]])

    return np.mean(collected_losses, 0)
def filter(self, unfiltered_data: Union[EnvResponse, List[EnvResponse], Transition, List[Transition]],
           update_internal_state: bool=True, deep_copy: bool=True) -> Union[List[EnvResponse], List[Transition]]:
    """
    A wrapper around _filter which first copies the env_response so that we don't change the original one
    This function should not be updated!

    :param unfiltered_data: the input data. Either a single EnvResponse / Transition or a list of them
    :param update_internal_state: should the filter's internal state change due to this call
    :param deep_copy: if True, the input is deep copied before it is filtered. Otherwise only a shallow copy
                      of every item is made
    :return: the filtered data, always as a list. An empty input results in an empty list
    :raises Exception: if this filter is a reference filter
    :raises ValueError: if the data is neither of type EnvResponse nor of type Transition
    """
    if self.i_am_a_reference_filter:
        raise Exception("The filter being used is a reference filter. It is not to be used directly. "
                        "Instead get a duplicate from it by calling __call__.")

    # Normalize to a list BEFORE copying. The shallow copy branch iterates over the input, which used to
    # fail for a single (non-list) EnvResponse / Transition even though the signature allows one.
    unfiltered_list = force_list(unfiltered_data)
    if deep_copy:
        filtered_data = copy.deepcopy(unfiltered_list)
    else:
        filtered_data = [copy.copy(t) for t in unfiltered_list]

    # nothing to filter - avoid indexing into an empty list below
    if not filtered_data:
        return filtered_data

    # TODO: implement observation space validation
    # filter observations
    if isinstance(filtered_data[0], Transition):
        state_objects_to_filter = [[f.state for f in filtered_data],
                                   [f.next_state for f in filtered_data]]
    elif isinstance(filtered_data[0], EnvResponse):
        state_objects_to_filter = [[f.next_state for f in filtered_data]]
    else:
        raise ValueError("unfiltered_data should be either of type EnvResponse or Transition. ")

    for state_object_list in state_objects_to_filter:
        for observation_name, filters in self._observation_filters.items():
            # only the observations which are present in the first state are filtered
            if observation_name not in state_object_list[0].keys():
                continue
            for observation_filter in filters.values():
                data_to_filter = [state_object[observation_name] for state_object in state_object_list]
                if observation_filter.supports_batching:
                    filtered_observations = observation_filter.filter(
                        data_to_filter, update_internal_state=update_internal_state)
                else:
                    filtered_observations = [
                        observation_filter.filter(data_point, update_internal_state=update_internal_state)
                        for data_point in data_to_filter
                    ]

                # write the results back so that the next filter in the chain operates on them
                for i, state_object in enumerate(state_object_list):
                    state_object[observation_name] = filtered_observations[i]

    # filter reward
    for reward_filter in self._reward_filters.values():
        if reward_filter.supports_batching:
            filtered_rewards = reward_filter.filter([f.reward for f in filtered_data], update_internal_state)
            for d, filtered_reward in zip(filtered_data, filtered_rewards):
                d.reward = filtered_reward
        else:
            for d in filtered_data:
                d.reward = reward_filter.filter(d.reward, update_internal_state)

    return filtered_data
def _build_module(self):
    """
    Stack the layers listed in self.layers_params on top of self.input. The last layer is stored in
    self.output.
    """
    self.layers.append(self.input)

    for layer_idx, build_layer in enumerate(self.layers_params):
        layer_name = '{}_{}'.format(build_layer.__class__.__name__, layer_idx)
        new_layers = build_layer(self.layers[-1], name=layer_name, is_training=self.is_training)
        self.layers.extend(force_list(new_layers))

    self.output = self.layers[-1]
def __call__(self, input_layer):
    """
    Wrapper for building the module graph including scoping and loss creation

    :param input_layer: the input to the graph
    :return: for a local head - the outputs, the targets, the inputs and the importance weight.
             otherwise - the outputs and the inputs
    """
    list_valued_attributes = ('output', 'target', 'input', 'loss_type', 'loss', 'regularizations')

    with tf.variable_scope(self.get_name(), initializer=tf.contrib.layers.xavier_initializer()):
        self._build_module(input_layer)

        # _build_module may leave single elements in these attributes, so make sure they are all lists
        for attribute_name in list_valued_attributes:
            setattr(self, attribute_name, force_list(getattr(self, attribute_name)))

        if self.is_local:
            self.set_loss()
        self._post_build()

    if not self.is_local:
        return self.output, self.input
    return self.output, self.target, self.input, self.importance_weight
def _build_module(self):
    """
    Build self.num_streams separate stacks of the layers listed in self.layers_params, all of them on top
    of self.input. self.output holds the last layer of every stream, in stream order.
    """
    self.output = []
    layers_per_stream = len(self.layers_params)

    for stream_idx in range(self.num_streams):
        # the index used in the layer names keeps counting from one stream to the next
        first_layer_idx = stream_idx * layers_per_stream
        stream_layers = [self.input]
        for layer_idx, build_layer in enumerate(self.layers_params):
            layer_name = '{}_{}'.format(build_layer.__class__.__name__, layer_idx + first_layer_idx)
            new_layers = build_layer(stream_layers[-1], name=layer_name, is_training=self.is_training)
            stream_layers.extend(force_list(new_layers))
        self.output.append(stream_layers[-1])
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
                                network_name: str):
    """
    Convert the given state(s) into a dictionary of batched input arrays for the given network.
    i.e. if we have several inputs states, stack all observations together, measurements together, etc.

    :param states: a single state or a list of states
    :param network_name: the network whose input embedder names select the entries to batch
    :return: a dictionary from input name to an array holding the matching entry of every state
    """
    # convert to batch so we can run it through the network
    state_list = force_list(states)
    input_names = self.ap.network_wrappers[network_name].input_embedders_parameters.keys()

    # there are cases (e.g. ddpg) where the state does not contain all the information needed for running
    # through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
    # addition to the current_state, so that all the inputs of the network will be filled). this is why
    # inputs which are missing from the first state are skipped
    return {
        input_name: np.array([np.array(state[input_name]) for state in state_list])
        for input_name in input_names
        if input_name in state_list[0].keys()
    }
def prepare_batch_for_inference(self, states: Union[Dict[str, np.ndarray], List[Dict[str, np.ndarray]]],
                                network_name: str) -> Dict[str, np.array]:
    """
    Convert curr_state into input tensors tensorflow is expecting. i.e. if we have several inputs states, stack all
    observations together, measurements together, etc.

    :param states: A list of environment states, where each one is a dict mapping from an observation name to its
                   corresponding observation. A single state is accepted as well
    :param network_name: The agent network name to prepare the batch for. this is needed in order to extract only
                         the observation relevant for the network from the states.
    :return: A dictionary containing a list of values from all the given states for each of the observations
    """
    # convert to batch so we can run it through the network
    state_list = force_list(states)
    embedder_names = self.ap.network_wrappers[network_name].input_embedders_parameters.keys()

    def stack(observation_name):
        # one array per state, stacked into a single batch array
        return np.array([np.array(state[observation_name]) for state in state_list])

    # there are cases (e.g. ddpg) where the state does not contain all the information needed for running
    # through the network and this has to be added externally (e.g. ddpg where the action needs to be given in
    # addition to the current_state, so that all the inputs of the network will be filled)
    return {name: stack(name) for name in embedder_names if name in state_list[0].keys()}
def accumulate_gradients(self,
                         inputs: Dict[str, np.ndarray],
                         targets: List[np.ndarray],
                         additional_fetches: List[Tuple[int, str]] = None,
                         importance_weights: np.ndarray = None,
                         no_accumulation: bool = False) -> Tuple[float, List[float], float, list]:
    """
    Runs a forward & backward pass, clips gradients if needed and accumulates them into the accumulation

    :param inputs: environment states (observation, etc.) as well extra inputs required by loss. Shape of ndarray
        is (batch_size, observation_space_size) or (batch_size, observation_space_size, stack_size)
    :param targets: targets required by loss (e.g. sum of discounted rewards)
    :param additional_fetches: additional fetches to calculate and return. Each fetch is specified as (int, str)
        tuple of head-type-index and fetch-name. The tuple is obtained from each head. None (the default) means
        that there are no additional fetches.
    :param importance_weights: ndarray of shape (batch_size,) to multiply with batch loss.
        NOTE: this argument is accepted but it is not read anywhere in this implementation.
    :param no_accumulation: if True, set gradient values to the new gradients, otherwise sum with previously
        calculated gradients
    :return: tuple of total_loss, losses, norm_unclipped_grads, fetched_tensors
        total_loss (float): sum of all head losses
        losses (list of float): list of all losses. The order is list of target losses followed by list of
            regularization losses. The specifics of losses is dependant on the network parameters
            (number of heads, etc.)
        norm_unclipped_grads (float): global norm of all gradients before any gradient clipping is applied
        fetched_tensors: all values for additional_fetches
    """
    if self.accumulated_gradients is None:
        self.reset_accumulated_gradients()

    embedders = [emb.embedder_name for emb in self.model.nets[0].input_embedders]
    nd_inputs = tuple(nd.array(inputs[emb]) for emb in embedders)

    assert self.middleware.__class__.__name__ != 'LSTMMiddleware', "LSTM middleware not supported"

    targets = force_list(targets)

    # additional_fetches defaults to None, which can't be iterated over. Treat it as "no fetches" instead
    # of failing with a TypeError inside the recorded scope
    if additional_fetches is None:
        additional_fetches = []

    with autograd.record():
        out_per_head = utils.split_outputs_per_head(self.model(*nd_inputs), self.model.output_heads)
        tgt_per_loss = utils.split_targets_per_loss(targets, self.losses)

        losses = list()
        regularizations = list()
        # pair every fetch key with its result, which is None until the matching head is processed
        additional_fetches = [(k, None) for k in additional_fetches]

        for h, h_loss, h_out, l_tgt in zip(self.model.output_heads, self.losses, out_per_head, tgt_per_loss):
            l_in = utils.get_loss_agent_inputs(inputs, head_type_idx=h.head_type_idx, loss=h_loss)
            # Align arguments with loss.loss_forward and convert to NDArray
            l_args = utils.to_mx_ndarray(utils.align_loss_args(h_out, l_in, l_tgt, h_loss))
            # Calculate loss and all auxiliary outputs
            loss_outputs = utils.loss_output_dict(utils.to_list(h_loss(*l_args)), h_loss.output_schema)
            if LOSS_OUT_TYPE_LOSS in loss_outputs:
                losses.extend(loss_outputs[LOSS_OUT_TYPE_LOSS])
            if LOSS_OUT_TYPE_REGULARIZATION in loss_outputs:
                regularizations.extend(loss_outputs[LOSS_OUT_TYPE_REGULARIZATION])
            # Set additional fetches
            for i, fetch in enumerate(additional_fetches):
                head_type_idx, fetch_name = fetch[0]  # fetch key is a tuple of (head_type_index, fetch_name)
                if head_type_idx == h.head_type_idx:
                    assert fetch[1] is None  # sanity check that fetch is None
                    additional_fetches[i] = (fetch[0], loss_outputs[fetch_name])

        # Total loss is losses and regularization (NOTE: order is important)
        total_loss_list = losses + regularizations
        total_loss = nd.add_n(*total_loss_list)

    # Calculate gradients
    total_loss.backward()

    assert self.optimizer_type != 'LBFGS', 'LBFGS not supported'

    # allreduce gradients from all contexts
    self.trainer.allreduce_grads()

    # Calculate global norm of gradients
    # FIXME global norm is returned even when not used for clipping! Is this necessary?
    # FIXME global norm might be calculated twice if clipping method is global norm
    norm_unclipped_grads = utils.global_norm(self._model_grads)

    # Clip gradients
    if self.network_parameters.clip_gradients:
        utils.clip_grad(
            self._model_grads,
            clip_method=self.network_parameters.gradients_clipping_method,
            clip_val=self.network_parameters.clip_gradients,
            inplace=True)

    # Update self.accumulated_gradients depending on no_accumulation flag
    if no_accumulation:
        for acc_grad, model_grad in zip(self.accumulated_gradients, self._model_grads):
            acc_grad[:] = model_grad
    else:
        for acc_grad, model_grad in zip(self.accumulated_gradients, self._model_grads):
            acc_grad += model_grad

    # result of additional fetches
    fetched_tensors = [fetch[1] for fetch in additional_fetches]

    # convert everything to numpy or scalar before returning
    result = utils.asnumpy_or_asscalar((total_loss, total_loss_list, norm_unclipped_grads, fetched_tensors))
    return result
def train_policy_network(self, dataset, epochs):
    """
    Train the actor network on the given dataset for the given number of epochs.

    In every epoch the dataset is consumed in consecutive mini-batches whose size is the actor's batch_size.
    Items which do not fill a whole mini-batch at the end of the dataset are not used.

    :param dataset: the transitions to train on. It is sliced and every slice is wrapped in a Batch
    :param epochs: the number of passes over the dataset
    :return: the mean (over axis 0) of the total loss over the mini-batches of the last epoch
    """
    # NOTE(review): when epochs is 0 `loss` stays a list, so the loss['fetch_result'] lookups after the
    # loop would fail - confirm that callers always pass epochs >= 1
    loss = []
    for j in range(epochs):
        # the statistics are collected per epoch, they are reset at the beginning of every epoch
        loss = {
            'total_loss': [],
            'policy_losses': [],
            'unclipped_grads': [],
            'fetch_result': []
        }
        #shuffle(dataset)
        for i in range(
                len(dataset) // self.ap.network_wrappers['actor'].batch_size):
            batch = Batch(
                dataset[i * self.ap.network_wrappers['actor'].batch_size:(i + 1) *
                        self.ap.network_wrappers['actor'].batch_size])

            network_keys = self.ap.network_wrappers[
                'actor'].input_embedders_parameters.keys()

            advantages = batch.info('advantage')
            actions = batch.actions()
            # for non-discrete action spaces, 1D actions get a trailing axis added
            if not isinstance(self.spaces.action, DiscreteActionSpace) and len(
                    actions.shape) == 1:
                actions = np.expand_dims(actions, -1)

            # get old policy probabilities and distribution
            old_policy = force_list(
                self.networks['actor'].target_network.predict(
                    batch.states(network_keys)))

            # calculate gradients and apply on both the local policy network and on the global policy network
            fetches = [
                self.networks['actor'].online_network.output_heads[0].
                kl_divergence, self.networks['actor'].online_network.
                output_heads[0].entropy
            ]

            inputs = copy.copy(batch.states(network_keys))
            inputs['output_0_0'] = actions

            # old_policy_distribution needs to be represented as a list, because in the event of discrete controls,
            # it has just a mean. otherwise, it has both a mean and standard deviation
            for input_index, input in enumerate(old_policy):
                inputs['output_0_{}'.format(input_index + 1)] = input

            total_loss, policy_losses, unclipped_grads, fetch_result =\
                self.networks['actor'].online_network.accumulate_gradients(
                    inputs, [advantages], additional_fetches=fetches)

            # the gradients are applied and then reset after every mini-batch
            self.networks['actor'].apply_gradients_to_online_network()
            if isinstance(self.ap.task_parameters,
                          DistributedTaskParameters):
                self.networks['actor'].apply_gradients_to_global_network()

            self.networks[
                'actor'].online_network.reset_accumulated_gradients()

            loss['total_loss'].append(total_loss)
            loss['policy_losses'].append(policy_losses)
            loss['unclipped_grads'].append(unclipped_grads)
            loss['fetch_result'].append(fetch_result)

            self.unclipped_grads.add_sample(unclipped_grads)

        # average the statistics of this epoch over its mini-batches
        for key in loss.keys():
            loss[key] = np.mean(loss[key], 0)

        # NOTE(review): the learning rate is read from the 'critic' network although this method trains
        # the 'actor', and the variable is looked up through self.ap.learning_rate - confirm both are intended
        if self.ap.network_wrappers['critic'].learning_rate_decay_rate != 0:
            curr_learning_rate = self.networks[
                'critic'].online_network.get_variable_value(
                    self.ap.learning_rate)
            self.curr_learning_rate.add_sample(curr_learning_rate)
        else:
            curr_learning_rate = self.ap.network_wrappers[
                'critic'].learning_rate

        # log training parameters
        # fetch_result follows the order of `fetches`: index 0 is the KL divergence, index 1 is the entropy
        screen.log_dict(OrderedDict([
            ("Surrogate loss", loss['policy_losses'][0]),
            ("KL divergence", loss['fetch_result'][0]),
            ("Entropy", loss['fetch_result'][1]),
            ("training epoch", j),
            ("learning_rate", curr_learning_rate)
        ]), prefix="Policy training")

    # only the statistics of the last epoch are stored and returned
    self.total_kl_divergence_during_training_process = loss[
        'fetch_result'][0]
    self.entropy.add_sample(loss['fetch_result'][1])
    self.kl_divergence.add_sample(loss['fetch_result'][0])

    return loss['total_loss']
def accumulate_gradients(self, inputs, targets, additional_fetches=None, importance_weights=None,
                         no_accumulation=False):
    """
    Runs a forward pass & backward pass, clips gradients if needed and accumulates them into the accumulation
    placeholders

    :param additional_fetches: Optional tensors to fetch during gradients calculation. It is concatenated to
                               the list of training fetches, so it is expected to be a list
    :param inputs: The input batch for the network
    :param targets: The targets corresponding to the input batch
    :param importance_weights: A coefficient for each sample in the batch, which will be used to rescale the loss
                               error of this sample. If it is not given, the samples losses won't be scaled
    :param no_accumulation: If is set to True, the gradients in the accumulated gradients placeholder will be
                            replaced by the newly calculated gradients instead of accumulating the new gradients.
                            This can speed up the function runtime by around 10%.
    :return: For any optimizer other than LBFGS - a tuple of the total loss, the individual losses, the norm of
             the unclipped gradients and the values of the additional fetches.
             For LBFGS - the list [0]
    """

    if self.accumulated_gradients is None:
        self.reset_accumulated_gradients()

    # feed inputs
    if additional_fetches is None:
        additional_fetches = []
    feed_dict = self.create_feed_dict(inputs)

    #var_list = self.create_variable_list()

    # feed targets
    targets = force_list(targets)
    for placeholder_idx, target in enumerate(targets):
        feed_dict[self.targets[placeholder_idx]] = target

    # feed importance weights
    # when importance_weights is None, force_list is expected to result in [None], which is handled by the
    # `is None` check below - presumably force_list wraps non-list values, confirm against its definition
    importance_weights = force_list(importance_weights)
    for placeholder_idx, target_ph in enumerate(targets):
        if len(importance_weights) <= placeholder_idx or importance_weights[placeholder_idx] is None:
            # no weights were given for this target - weigh all the samples equally
            importance_weight = np.ones(target_ph.shape[0])
        else:
            importance_weight = importance_weights[placeholder_idx]
        # keep the first axis and add singleton axes until the rank of the weights matches the rank of the target
        importance_weight = np.reshape(importance_weight, (-1,) + (1,)*(len(target_ph.shape)-1))

        feed_dict[self.importance_weights[placeholder_idx]] = importance_weight

    if self.optimizer_type != 'LBFGS':

        # feed the lstm state if necessary
        if self.middleware.__class__.__name__ == 'LSTMMiddleware':
            # we can't always assume that we are starting from scratch here can we?
            feed_dict[self.middleware.c_in] = self.middleware.c_init
            feed_dict[self.middleware.h_in] = self.middleware.h_init

        # the concatenation creates a new list, so the += below does not modify self.train_fetches
        fetches = self.train_fetches + additional_fetches
        if self.ap.visualization.tensorboard:
            fetches += [self.merged]

        # get grads
        result = self.sess.run(fetches, feed_dict=feed_dict)
        # NOTE(review): result[-1] is the merged summary only when it was appended to the fetches above,
        # i.e. when the tensorboard flag is set - confirm that train_writer is never set without that flag
        if hasattr(self, 'train_writer') and self.train_writer is not None:
            self.train_writer.add_summary(result[-1], self.sess.run(self.global_step))

        # extract the fetches
        norm_unclipped_grads, grads, total_loss, losses = result[:4]
        if self.middleware.__class__.__name__ == 'LSTMMiddleware':
            # presumably the LSTM state is the 5th entry of self.train_fetches - verify where it is built
            (self.curr_rnn_c_in, self.curr_rnn_h_in) = result[4]
        fetched_tensors = []
        if len(additional_fetches) > 0:
            fetched_tensors = result[self.additional_fetches_start_idx:self.additional_fetches_start_idx +
                                                                  len(additional_fetches)]

        # accumulate the gradients
        for idx, grad in enumerate(grads):
            if no_accumulation:
                self.accumulated_gradients[idx] = grad
            else:
                self.accumulated_gradients[idx] += grad

        return total_loss, losses, norm_unclipped_grads, fetched_tensors

    else:
        # LBFGS: the optimizer runs the minimization by itself and nothing is accumulated in this branch
        self.optimizer.minimize(session=self.sess, feed_dict=feed_dict)

        return [0]
# if no arg is given if len(sys.argv) == 1: parser.print_help() exit(0) dir_prefix = args.dir_prefix preset = args.preset levels = args.level.split(',') if args.level is not None else [None] num_seeds = args.seeds num_workers = args.num_workers gpu = [int(gpu) for gpu in args.gpu.split(',')] level_as_sub_dir = args.level_as_sub_dir processes = [] gpu_list = force_list(gpu) curr_gpu_idx = 0 for level in levels: for seed in range(num_seeds): # select the next gpu for this run set_gpu(gpu_list[curr_gpu_idx]) command = [ 'python3', 'rl_coach/coach.py', '-ns', '-p', '{}'.format(preset), '--seed', '{}'.format(seed), '-n', '{}'.format(num_workers) ] if dir_prefix != "": dir_prefix += "_" if args.use_cpu: command.append("-c")
def _build_module(self):
    """
    Builds the embedder graph on top of self.input and stores the final tensor in self.output.

    The fixed part is a stack of five convolutional groups (2x64, 2x128, 3x256, 3x512 and 3x512
    filters, all 3x3 with 'same' padding and ReLU activation - the layout of the VGG-16
    convolutional configuration), each followed by a max pooling layer, and finally a flatten
    layer. The layers described by self.layers_params are then applied on top of it, in order.
    Every intermediate tensor is appended to self.layers.
    """
    self.layers.append(self.input)

    self.activation_function = tf.nn.relu
    initializer = tf.keras.initializers.VarianceScaling(scale=2.0)
    window_size = (3, 3)

    # (number of filters, number of consecutive conv layers) for each group
    conv_groups = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))
    for num_filters, num_convs in conv_groups:
        for _ in range(num_convs):
            conv = Conv2D(num_filters, window_size, padding='same',
                          activation=self.activation_function,
                          kernel_initializer=initializer)
            self.layers.append(conv(self.layers[-1]))
        # every group ends with a max pooling layer
        self.layers.append(MaxPooling2D()(self.layers[-1]))

    self.layers.append(Flatten()(self.layers[-1]))

    # layers requested through the scheme / layers parameters
    for idx, layer_params in enumerate(self.layers_params):
        print(idx, layer_params)
        layer_name = '{}_{}'.format(layer_params.__class__.__name__, idx)
        built = layer_params(self.layers[-1],
                             name=layer_name,
                             is_training=self.is_training,
                             kernel_initializer=initializer,
                             activation=self.activation_function)
        # a layer description may produce one tensor or several
        self.layers.extend(force_list(built))

    self.output = self.layers[-1]
def __init__(self, run_phases: Union[RunPhase, List[RunPhase]]):
    """
    :param run_phases: a single run phase or a list of run phases. The value is normalized
                       through force_list before being stored in self.run_phases
    """
    normalized_phases = force_list(run_phases)
    self.run_phases = normalized_phases
def __init__(self, params: Union[List, int]):
    """
    :param params: list of [num_output_neurons]. The value is normalized through force_list
                   before being stored in self.params
    """
    normalized_params = force_list(params)
    self.params = normalized_params
def __init__(self, params: List):
    """
    :param params: list of [num_output_neurons]. The value is normalized through force_list
                   before being stored in self.params

    self.sigma0 is always initialized to 0.5.
    """
    normalized_params = force_list(params)
    self.params = normalized_params
    self.sigma0 = 0.5
def __init__(self, level: LevelSelection,
             seed: int, frame_skip: int, human_control: bool, custom_reward_threshold: Union[int, float],
             visualization_parameters: VisualizationParameters,
             server_height: int, server_width: int, camera_height: int, camera_width: int,
             verbose: bool, experiment_suite: ExperimentSuite, config: str, episode_max_time: int,
             allow_braking: bool, quality: CarlaEnvironmentParameters.Quality,
             cameras: List[CameraTypes], weather_id: List[int], experiment_path: str,
             separate_actions_for_throttle_and_brake: bool,
             num_speedup_steps: int, max_speed: float, **kwargs):
    """
    Starts a CARLA server, connects a client to it, loads the scene settings and defines the
    state and action spaces of the environment.

    :param level: name of the level; its upper-cased form is used as a key into CarlaLevel
    :param seed: seed for the vehicles and pedestrians. If None, the seeds are randomized
                 (only relevant when the hard coded settings are used)
    :param frame_skip: forwarded to the base class constructor
    :param human_control: forwarded to the base class constructor. When self.human_control is
                          set, the action space is replaced by a discrete keyboard-driven one
    :param custom_reward_threshold: forwarded to the base class constructor
    :param visualization_parameters: forwarded to the base class constructor
    :param server_height: stored as self.server_height
    :param server_width: stored as self.server_width
    :param camera_height: height used for the cameras and for the image observation spaces
    :param camera_width: width used for the cameras and for the image observation spaces
    :param verbose: stored as self.verbose
    :param experiment_suite: if given, the conditions of its first experiment are loaded into
                             the client instead of self.settings
    :param config: path to a settings file. If given, its content is used as the settings,
                   otherwise hard coded settings are built
    :param episode_max_time: stored as self.episode_max_time
    :param allow_braking: stored as self.allow_braking
    :param quality: quality level; its value is passed to the hard coded settings
    :param cameras: cameras to add to the hard coded settings
    :param weather_id: a weather id or a list of ids. One of them is picked at random for the
                       hard coded settings
    :param experiment_path: stored as self.experiment_path
    :param separate_actions_for_throttle_and_brake: if True the action space is
                                                    (steer, gas, brake), otherwise
                                                    (steer, gas_and_brake)
    :param num_speedup_steps: stored as self.num_speedup_steps
    :param max_speed: stored as self.max_speed
    :param kwargs: accepted and ignored
    """
    super().__init__(level, seed, frame_skip, human_control, custom_reward_threshold, visualization_parameters)

    # server configuration
    self.server_height = server_height
    self.server_width = server_width
    self.port = get_open_port()
    self.host = 'localhost'
    self.map_name = CarlaLevel[level.upper()].value['map_name']
    self.map_path = CarlaLevel[level.upper()].value['map_path']
    self.experiment_path = experiment_path

    # client configuration
    self.verbose = verbose
    self.quality = quality
    self.cameras = cameras
    self.weather_id = weather_id
    self.episode_max_time = episode_max_time
    self.allow_braking = allow_braking
    self.separate_actions_for_throttle_and_brake = separate_actions_for_throttle_and_brake
    self.camera_width = camera_width
    self.camera_height = camera_height

    # setup server settings
    self.experiment_suite = experiment_suite
    self.config = config
    if self.config:
        # load settings from file
        with open(self.config, 'r') as fp:
            self.settings = fp.read()
    else:
        # hard coded settings
        self.settings = CarlaSettings()
        self.settings.set(
            SynchronousMode=True,
            SendNonPlayerAgentsInfo=False,
            NumberOfVehicles=15,
            NumberOfPedestrians=30,
            WeatherId=random.choice(force_list(self.weather_id)),
            QualityLevel=self.quality.value,
            SeedVehicles=seed,
            SeedPedestrians=seed)
        if seed is None:
            self.settings.randomize_seeds()

        # cameras are only added to the hard coded settings object; settings read from a
        # config file are a plain string
        self.settings = self._add_cameras(self.settings, self.cameras, self.camera_width, self.camera_height)

    # open the server
    self.server = self._open_server()

    # suppress log records of severity ERROR (40) and below from here on
    logging.disable(logging.ERROR)

    # open the client
    self.game = CarlaClient(self.host, self.port, timeout=99999999)
    self.game.connect()
    if self.experiment_suite:
        self.current_experiment_idx = 0
        self.current_experiment = self.experiment_suite.get_experiments()[self.current_experiment_idx]
        self.scene = self.game.load_settings(self.current_experiment.conditions)
    else:
        self.scene = self.game.load_settings(self.settings)

    # get available start positions
    self.positions = self.scene.player_start_spots
    self.num_positions = len(self.positions)
    self.current_start_position_idx = 0
    self.current_pose = 0

    # state space
    self.state_space = StateSpace({
        "measurements": VectorObservationSpace(4, measurements_names=["forward_speed", "x", "y", "z"])
    })
    for camera in self.scene.sensors:
        self.state_space[camera.name] = ImageObservationSpace(
            shape=np.array([self.camera_height, self.camera_width, 3]),
            high=255)

    # action space
    if self.separate_actions_for_throttle_and_brake:
        self.action_space = BoxActionSpace(shape=3, low=np.array([-1, 0, 0]), high=np.array([1, 1, 1]),
                                           descriptions=["steer", "gas", "brake"])
    else:
        self.action_space = BoxActionSpace(shape=2, low=np.array([-1, -1]), high=np.array([1, 1]),
                                           descriptions=["steer", "gas_and_brake"])

    # human control
    if self.human_control:
        # convert continuous action space to discrete
        self.steering_strength = 0.5
        self.gas_strength = 1.0
        self.brake_strength = 0.5
        # TODO: reverse order of actions
        # NOTE(review): the targets below are written as [gas_and_brake, steer], while the box
        # action spaces above list steer first - confirm against the code that applies actions.
        # Within this table a positive first component is gas and a negative one is brake
        # (see 'GAS' and 'BRAKE'), so the braking turns must use -brake_strength as well.
        self.action_space = PartialDiscreteActionSpaceMap(
            target_actions=[[0., 0.],
                            [0., -self.steering_strength],
                            [0., self.steering_strength],
                            [self.gas_strength, 0.],
                            [-self.brake_strength, 0],
                            [self.gas_strength, -self.steering_strength],
                            [self.gas_strength, self.steering_strength],
                            [-self.brake_strength, -self.steering_strength],
                            [-self.brake_strength, self.steering_strength]],
            descriptions=['NO-OP', 'TURN_LEFT', 'TURN_RIGHT', 'GAS', 'BRAKE',
                          'GAS_AND_TURN_LEFT', 'GAS_AND_TURN_RIGHT',
                          'BRAKE_AND_TURN_LEFT', 'BRAKE_AND_TURN_RIGHT']
        )

        # map keyboard keys to actions
        for idx, action in enumerate(self.action_space.descriptions):
            if action in key_map:
                self.key_to_action[key_map[action]] = idx

    self.num_speedup_steps = num_speedup_steps
    self.max_speed = max_speed

    # measurements
    self.autopilot = None
    self.planner = Planner(self.map_name)

    # env initialization
    self.reset_internal_state(True)

    # render
    if self.is_rendered:
        image = self.get_rendered_image()
        # the screen is created as (width, height) of the rendered image
        self.renderer.create_screen(image.shape[1], image.shape[0])