def augment_fn(states: list):
    """Randomly augments the 'state_image' of a prepared batch of states: color jitter, blur,
       salt-and-pepper/Gaussian noise, cutout, and coarse dropout, each applied with a
       probability (and strength) scaled by `alpha`.
    """
    state = prepare(states)
    image = utils.to_float(state['state_image'])

    if alpha > 0.0:
        # color jitter
        if aug.tf_chance(seed=seed) < alpha:
            image = rl.augmentations.simclr.color_jitter(image, strength=alpha, seed=seed)

        # blur
        if aug.tf_chance(seed=seed) < 0.25 * alpha:
            blur_size = 3 if aug.tf_chance(seed=seed) >= 0.5 else 5
            image = aug.tf_gaussian_blur(image, size=blur_size, seed=seed)

        # noise
        if aug.tf_chance(seed=seed) < 0.2 * alpha:
            image = aug.tf_salt_and_pepper_batch(image, amount=0.1)

        if aug.tf_chance(seed=seed) < 0.33 * alpha:
            image = aug.tf_gaussian_noise_batch(image, amount=0.10, std=0.075, seed=seed)

        image = aug.tf_normalize_batch(image)

        # cutout
        if aug.tf_chance(seed=seed) < 0.15 * alpha:
            image = aug.tf_cutout_batch(image, size=6, seed=seed)

        # coarse dropout
        if aug.tf_chance(seed=seed) < 0.15 * alpha:
            image = aug.tf_coarse_dropout_batch(image, size=81, amount=0.04, seed=seed)

    state['state_image'] = image
    return state
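
# A minimal, self-contained sketch (an assumption, not the project's `aug` API) of the gating
# pattern used in `augment_fn` above: each augmentation fires with a probability proportional
# to `alpha`, and `alpha` also scales its strength. Only standard TensorFlow ops are used here;
# `sketch_augment` and its parameters are illustrative names.
import tensorflow as tf

def sketch_augment(image: tf.Tensor, alpha: float = 0.5, seed=None) -> tf.Tensor:
    """Applies brightness jitter and Gaussian noise, each gated by an `alpha`-scaled coin flip."""
    def chance():
        # uniform sample in [0, 1), analogous to `aug.tf_chance`
        return tf.random.uniform((), minval=0.0, maxval=1.0, seed=seed)

    if alpha > 0.0:
        # brightness jitter with probability `alpha`, strength scaled by `alpha`
        if chance() < alpha:
            image = tf.image.random_brightness(image, max_delta=0.2 * alpha, seed=seed)

        # additive Gaussian noise with probability `0.33 * alpha`
        if chance() < 0.33 * alpha:
            image = image + tf.random.normal(tf.shape(image), stddev=0.075, seed=seed)

    return tf.clip_by_value(image, 0.0, 1.0)

# usage: images are float tensors in [0, 1] with shape (batch, height, width, channels)
# augmented = sketch_augment(tf.random.uniform((8, 90, 120, 3)), alpha=0.5, seed=42)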
def filter_throttle(s, a, r):
    mask = a[:, 0] >= 0.0
    s = {k: utils.to_float(v)[mask] for k, v in s.items()}
    return s, a[mask], r[tf.concat([mask, [True]], axis=0)]
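
# Note on the `r[tf.concat([mask, [True]], axis=0)]` pattern used by `filter_throttle` (and by
# `mask_reward` below): each trace appears to store one more reward than actions (a trailing
# terminal reward), so a boolean mask built over actions is extended with a final `True` to
# always keep that last entry. A small stand-alone illustration, assuming that off-by-one layout:
import tensorflow as tf

actions = tf.constant([[1.0, 0.0], [-0.5, 0.1], [0.8, -0.2]])  # 3 actions: (throttle, steer)
rewards = tf.constant([0.1, 0.2, 0.3, 0.4])                    # 3 + 1 rewards

mask = actions[:, 0] >= 0.0                                    # keep positive-throttle steps
kept = rewards[tf.concat([mask, [True]], axis=0)]              # -> [0.1, 0.3, 0.4]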
def explore_traces(traces_dir: str, amount=64, seed=None):
    """Debugging/exploration utility: loads traces and accumulates a steering-balanced set of
       transitions (left/center/right), then stops at a `breakpoint()` for inspection.
    """
    import tensorflow as tf
    amounts = dict(left=amount, right=amount, center=amount)

    def filter_throttle(s, a, r):
        mask = a[:, 0] >= 0.0
        s = {k: utils.to_float(v)[mask] for k, v in s.items()}
        return s, a[mask], r[tf.concat([mask, [True]], axis=0)]

    def shuffle_trace(s: dict, a, r):
        indices = tf.range(start=0, limit=tf.shape(a)[0], dtype=tf.int32)
        indices = tf.random.shuffle(indices)

        for k, v in s.items():
            s[k] = tf.gather(v, indices)

        a = tf.gather(a, indices)
        r = tf.gather(r, tf.concat([indices, [tf.shape(r)[0] - 1]], axis=0))
        return s, a, r

    def mask_reward(r, mask):
        return r[tf.concat([mask, [True]], axis=0)]

    def filter_steering(s, a, r, t=0.1):
        masks = dict(left=a[:, 1] <= -t, right=a[:, 1] >= t,
                     center=(a[:, 1] > -t) & (a[:, 1] < t))
        filtered_data = []

        for k in ['left', 'center', 'right']:
            mask = masks[k]
            taken = int(min(amounts[k], tf.reduce_sum(tf.cast(mask, tf.int32))))
            amounts[k] -= taken

            filtered_data.append(
                dict(state={k: v[mask][:taken] for k, v in s.items()},
                     action=a[mask][:taken],
                     reward=mask_reward(r, mask)[:taken]))

        return filtered_data

    random.seed(seed)
    data = None

    while sum(map(lambda k_: amounts[k_], amounts)) > 0:
        for j, trace in enumerate(utils.load_traces(traces_dir)):
            print(f'trace-{j}')
            print('amounts:', amounts)

            state, action, reward, _ = utils.unpack_trace(trace)
            state, action, reward = filter_throttle(state, utils.to_float(action), reward)
            state, action, reward = shuffle_trace(state, action, reward)
            f_data = filter_steering(state, action, reward)

            if data is None:
                data = f_data
            else:
                for i, d in enumerate(f_data):
                    data[i]['state'] = utils.concat_dict_tensor(data[i]['state'], d['state'])
                    data[i]['action'] = tf.concat([data[i]['action'], d['action']], axis=0)
                    data[i]['reward'] = tf.concat([data[i]['reward'], d['reward']], axis=0)

            if sum(map(lambda k_: amounts[k_], amounts)) <= 0:
                break

    for i, d in enumerate(data):
        print(i, d['action'].shape)

    d = dict(state=utils.concat_dict_tensor(*list(d['state'] for d in data)),
             action=tf.concat(list(d['action'] for d in data), axis=0),
             reward=tf.concat(list(d['reward'] for d in data), axis=0))
    breakpoint()
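
# Illustrative sketch of the steering split performed by `filter_steering` above: with a
# threshold `t`, steering <= -t counts as "left", >= t as "right", and everything in between
# as "center"; the same number of samples is then drawn from each group to balance the batch.
# `steering_masks` is a hypothetical helper, not part of the project.
import tensorflow as tf

def steering_masks(actions: tf.Tensor, t: float = 0.1) -> dict:
    """`actions` has shape (N, 2+): column 0 is throttle/brake, column 1 is steering."""
    steer = actions[:, 1]
    return dict(left=steer <= -t,
                right=steer >= t,
                center=(steer > -t) & (steer < t))

# usage:
# a = steering_masks(tf.constant([[0.5, -0.3], [0.7, 0.05], [0.2, 0.4]]))
# -> left: [True, False, False], center: [False, True, False], right: [False, False, True]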
def imitation_objective(self, batch, validation=False):
    """Imitation learning objective with a `concordance loss`, i.e. a loss that encourages the
       network to make consistent predictions on augmented and non-augmented batches of data.
    """
    states, aug_states, speed, similarity = batch
    true_actions = utils.to_float(states['action'])
    true_values = states['value']

    # predictions on NON-augmented and AUGMENTED states
    policy, value = self.network.imitation_predict(states)
    policy_aug, value_aug = self.network.imitation_predict(aug_states)

    # actions, values, speed, and similarities
    actions, actions_aug = utils.to_float(policy['actions']), utils.to_float(policy_aug['actions'])
    values, values_aug = value['value'], value_aug['value']

    pi_speed, pi_speed_aug = policy['speed'], policy_aug['speed']
    v_speed, v_speed_aug = value['speed'], value_aug['speed']

    pi_similarity, pi_similarity_aug = policy['similarity'], policy_aug['similarity']
    v_similarity, v_similarity_aug = value['similarity'], value_aug['similarity']

    if not validation:
        self.log_actions(actions_pred_imitation=actions, actions_pred_aug_imitation=actions_aug)
        self.log(values_pred_imitation=values, values_pred_aug_imitation=values_aug,
                 speed_pi=pi_speed, speed_pi_aug=pi_speed_aug,
                 speed_v=v_speed, speed_v_aug=v_speed_aug,
                 similarity_pi=pi_similarity, similarity_pi_aug=pi_similarity_aug,
                 similarity_v=v_similarity, similarity_v_aug=v_similarity_aug)

    # policy loss: sum of per-action MAE, averaged over plain and augmented predictions
    loss_policy = (tf.reduce_mean(tf.reduce_sum(tf.abs(true_actions - actions), axis=1)) +
                   tf.reduce_mean(tf.reduce_sum(tf.abs(true_actions - actions_aug), axis=1))) / 2.0

    loss_value = (tf.reduce_mean(losses.MSE(y_true=true_values, y_pred=values)) +
                  tf.reduce_mean(losses.MSE(y_true=true_values, y_pred=values_aug))) / 2.0

    loss_speed_policy = (tf.reduce_mean(losses.MSE(y_true=speed, y_pred=pi_speed)) +
                         tf.reduce_mean(losses.MSE(y_true=speed, y_pred=pi_speed_aug))) / 2.0

    loss_speed_value = (tf.reduce_mean(losses.MSE(y_true=speed, y_pred=v_speed)) +
                        tf.reduce_mean(losses.MSE(y_true=speed, y_pred=v_speed_aug))) / 2.0

    loss_similarity_policy = (tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=pi_similarity)) +
                              tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=pi_similarity_aug))) / 2.0

    loss_similarity_value = (tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=v_similarity)) +
                             tf.reduce_mean(losses.MSE(y_true=similarity, y_pred=v_similarity_aug))) / 2.0

    # concordance loss: make both predictions as close as possible
    concordance_policy = (tf.reduce_mean(losses.MSE(actions, actions_aug)) +
                          tf.reduce_mean(losses.MSE(pi_speed, pi_speed_aug)) +
                          tf.reduce_mean(losses.MSE(pi_similarity, pi_similarity_aug))) / 3.0

    concordance_value = (tf.reduce_mean(losses.MSE(values, values_aug)) +
                         tf.reduce_mean(losses.MSE(v_speed, v_speed_aug)) +
                         tf.reduce_mean(losses.MSE(v_similarity, v_similarity_aug))) / 3.0

    # total loss
    total_loss_policy = \
        loss_policy + self.aux * (loss_speed_policy + loss_similarity_policy) + self.delta * concordance_policy

    total_loss_value = \
        loss_value + self.aux * (loss_speed_value + loss_similarity_value) + self.eta * concordance_value

    if not validation:
        self.log(loss_policy=loss_policy, loss_value=loss_value,
                 loss_speed_policy=loss_speed_policy, loss_similarity_policy=loss_similarity_policy,
                 loss_speed_value=loss_speed_value, loss_similarity_value=loss_similarity_value,
                 loss_concordance_policy=concordance_policy, loss_concordance_value=concordance_value,
                 # loss_steer=steer_penalty, loss_throttle=throttle_penalty, loss_entropy=entropy_penalty
                 )

    return total_loss_policy, total_loss_value
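
# Minimal stand-alone sketch of the concordance term described in the docstring above: the
# predictions on a clean batch and on its augmented counterpart are pulled together with an
# MSE penalty (here for a single prediction head; the objective above averages this over the
# action, speed, and similarity heads). `concordance_loss` is an illustrative name, not the
# project's API.
import tensorflow as tf
from tensorflow.keras import losses

def concordance_loss(pred_clean: tf.Tensor, pred_aug: tf.Tensor) -> tf.Tensor:
    """Mean squared difference between predictions on clean vs. augmented inputs."""
    return tf.reduce_mean(losses.MSE(pred_clean, pred_aug))

# usage:
# actions     = tf.constant([[0.9, 0.1], [0.5, -0.2]])   # predictions on clean states
# actions_aug = tf.constant([[0.8, 0.1], [0.6, -0.1]])   # predictions on augmented states
# penalty = concordance_loss(actions, actions_aug)       # added to the total loss, scaled by `delta`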
def imitation_prepare_data(self, batch_size: int, traces_dir: str, num_traces: int, shuffle=False,
                           offset=0) -> (dict, int):
    """Loads data from traces and builds a batch with balanced actions
       (e.g. the same amount of left, center, and right steering).
    """
    def filter_throttle(s, a, r):
        mask = a[:, 0] >= 0.0
        s = {_k: utils.to_float(v)[mask] for _k, v in s.items()}
        return s, a[mask], r[tf.concat([mask, [True]], axis=0)]

    def shuffle_trace(s: dict, a, r):
        indices = tf.range(start=0, limit=tf.shape(a)[0], dtype=tf.int32)
        indices = tf.random.shuffle(indices)

        for _k, v in s.items():
            s[_k] = tf.gather(v, indices)

        a = tf.gather(a, indices)
        r = tf.gather(r, tf.concat([indices, [tf.shape(r)[0] - 1]], axis=0))
        return s, a, r

    def mask_reward(r, mask):
        return r[tf.concat([mask, [True]], axis=0)]

    def filter_steering(s, a, r, t=0.1):
        masks = dict(left=a[:, 1] <= -t, right=a[:, 1] >= t,
                     center=(a[:, 1] > -t) & (a[:, 1] < t))
        filtered_data = []

        for k in ['left', 'center', 'right']:
            mask = masks[k]
            taken = int(min(amounts[k], tf.reduce_sum(tf.cast(mask, tf.int32))))
            amounts[k] -= taken

            filtered_data.append(dict(state={k: v[mask][:taken] for k, v in s.items()},
                                      action=a[mask][:taken],
                                      reward=mask_reward(r, mask)[:taken]))
        return filtered_data

    amounts = dict(left=batch_size, right=batch_size, center=batch_size)
    data = None
    k = offset

    while sum(map(lambda k_: amounts[k_], amounts)) > 0:
        for j, trace in enumerate(utils.load_traces(traces_dir, max_amount=num_traces, shuffle=shuffle,
                                                    offset=0 if self.seed is None else offset)):
            k += 1
            trace = utils.unpack_trace(trace, unpack=False)

            states, actions = trace['state'], utils.to_float(trace['action'])
            rewards = utils.to_float(trace['reward'])

            states['speed'] = utils.to_tensor(trace['info_speed'], expand_axis=-1)
            states['similarity'] = utils.to_tensor(trace['info_similarity'], expand_axis=-1)
            states['state_command'] = self.convert_command(states['state_command'])

            # compute (decomposed) returns
            returns = utils.rewards_to_go(rewards, discount=self.gamma)

            states: dict
            states['returns_base'], \
            states['returns_exp'] = tf.map_fn(fn=utils.decompose_number, elems=utils.to_float(returns),
                                              dtype=(tf.float32, tf.float32))

            states, actions, rewards = filter_throttle(states, actions, rewards)
            states, actions, rewards = shuffle_trace(states, actions, rewards)
            f_data = filter_steering(states, actions, rewards)

            if data is None:
                data = f_data
            else:
                for i, d in enumerate(f_data):
                    # for i in left, center, right...
                    data[i]['state'] = utils.concat_dict_tensor(data[i]['state'], d['state'])
                    data[i]['action'] = tf.concat([data[i]['action'], d['action']], axis=0)
                    data[i]['reward'] = tf.concat([data[i]['reward'], d['reward']], axis=0)

            if sum(map(lambda k_: amounts[k_], amounts)) <= 0:
                break

    # concat left, center, and right parts together
    return dict(state=utils.concat_dict_tensor(*list(d['state'] for d in data)),
                action=tf.concat(list(d['action'] for d in data), axis=0),
                reward=tf.concat(list(d['reward'] for d in data), axis=0)), k
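
# Illustrative sketch of what `utils.rewards_to_go` is assumed to compute above: discounted
# returns-to-go, G_t = r_t + gamma * G_{t+1}, over a 1-D reward sequence. This is a common
# definition, not a copy of the project's helper.
import tensorflow as tf

def rewards_to_go_sketch(rewards, discount: float = 0.99) -> tf.Tensor:
    """Backward pass accumulating the discounted sum of future rewards at every step."""
    returns = []
    g = 0.0
    for r in reversed(list(rewards)):
        g = float(r) + discount * g
        returns.append(g)
    return tf.constant(returns[::-1], dtype=tf.float32)

# usage: rewards_to_go_sketch([1.0, 1.0, 1.0], discount=0.5) -> [1.75, 1.5, 1.0]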