def score(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list,
          video_csv_list, add_data=False):
    """
    Wrapper for the base scoring function
    :param expert_evaluation:
    :param expert_trajectory:
    :param streaming_enviroment:
    :param trace_list: Which traces did we evaluate
    :param video_csv_list: Which videos did we evaluate
    :param add_data:
    :return:
    """
    expert_trajectory.convert_list()
    behavioural_cloning_trace_generator_testing = TrajectoryVideoStreaming(self, streaming_enviroment,
                                                                           trace_list=trace_list,
                                                                           video_csv_list=video_csv_list)
    state_t = np.array([self.classifier.extract_features_observation(state_t)
                        for state_t, _, _ in tqdm(expert_trajectory.trajectory_list, desc='transforming')])
    state_t = pd.DataFrame(state_t, columns=self.classifier.extract_features_names())
    self.impute_NaN_inplace(state_t)
    expert_action = expert_trajectory.trajectory_action_t_arr
    approx_action = self.classifier.predict(state_t)
    expert_action = expert_action.ravel()
    behavioural_cloning_evaluation, behavioural_cloning_evaluation_trajectory = \
        behavioural_cloning_trace_generator_testing.create_trajectories(random_action_probability=0, cores_avail=1)
    return self.score_comparison(expert_evaluation=expert_evaluation, expert_trajectory=expert_trajectory,
                                 expert_action=expert_action,
                                 approx_evaluation=behavioural_cloning_evaluation,
                                 approx_trajectory=behavioural_cloning_evaluation_trajectory,
                                 approx_action=approx_action, add_data=add_data)
def __init__(self, abr_name, max_quality_change, deterministic,
             past_measurement_dimensions, future_measurements_dimensions, cloning_epochs,
             drop_prob=0.1, hidden_dim=32, batch_size_cloning=64, validation_split=0.2,
             scaling_factor_sigma=1, future_reward_discount=0.99, model_iterations=20,
             cores_avail=1, rde_distill_epochs=20, pretrain=True, balanced=False):
    """
    Random Expert Distillation (RED) style imitation policy.
    :param abr_name:
    :param max_quality_change:
    :param deterministic:
    :param validation_split:
    """
    super().__init__(abr_name, max_quality_change, deterministic)
    self.rde_distill_epochs = rde_distill_epochs
    self.cores_avail = cores_avail
    self.model_iterations = model_iterations
    self.future_measurements_dimensions = future_measurements_dimensions
    self.validation_split = validation_split
    self.future_reward_discount = future_reward_discount
    self.pretrain = pretrain
    self.balanced = balanced
    self.past_measurement_dimensions = past_measurement_dimensions
    self.n_actions = max_quality_change * 2 + 1
    self.hidden_dim = hidden_dim
    self.drop_prob = drop_prob
    self.value_history = None
    self.policy_history = None
    self.pretrain_history = None
    self.value_history_last = None
    self.policy_history_last = None
    self.batch_size_cloning = batch_size_cloning
    self.cloning_epochs = cloning_epochs
    self.trajectory_dummy = Trajectory()
    self.policy_network = KerasGAIL(past_measurement_dimensions=self.past_measurement_dimensions,
                                    future_measurements_dimensions=self.future_measurements_dimensions,
                                    hidden_dim=self.hidden_dim,
                                    action_dimension=self.n_actions,
                                    drop_prob=self.drop_prob)
    self.scaling_factor_sigma = scaling_factor_sigma
    self.bc_cloning_network = KerasEmbedder(past_measurement_dimensions=self.past_measurement_dimensions,
                                            future_measurements_dimensions=self.future_measurements_dimensions,
                                            hidden_dim=hidden_dim,
                                            embedding_dimension=self.n_actions,
                                            drop_prob=drop_prob)
    self.rnd_cloning_network = KerasEmbedder(past_measurement_dimensions=self.past_measurement_dimensions,
                                             future_measurements_dimensions=self.future_measurements_dimensions,
                                             hidden_dim=hidden_dim,
                                             embedding_dimension=self.n_actions,
                                             drop_prob=drop_prob)
    self.value_model = KerasValue(past_measurement_dimensions=self.past_measurement_dimensions,
                                  future_measurements_dimensions=self.future_measurements_dimensions,
                                  hidden_dim=self.hidden_dim,
                                  drop_prob=self.drop_prob)
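# The two KerasEmbedder networks above follow the Random Expert Distillation (RED) setup:
# rnd_cloning_network is a fixed, randomly initialised embedder and bc_cloning_network is
# distilled onto it using expert data. A minimal sketch of the resulting per-sample reward,
# assuming both expose a Keras .predict() on [state_t, state_t_future] as elsewhere in this
# file (the helper name is illustrative, not part of this project):
def _red_reward_sketch(bc_cloning_model, rnd_cloning_model, state_t, state_t_future, sigma):
    # reward(s) = exp(-sigma * ||f_distilled(s) - f_random(s)||^2): close to 1.0 where the
    # distilled network reproduces the random embedding, i.e. near the expert distribution.
    distilled = bc_cloning_model.predict([state_t, state_t_future])
    random_embedding = rnd_cloning_model.predict([state_t, state_t_future])
    squared_error = np.square(distilled - random_embedding).mean(axis=-1)
    return np.exp(-sigma * squared_error).flatten()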
def score(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list,
          video_csv_list, add_data=False):
    expert_trajectory.convert_list()
    behavioural_cloning_trace_generator_testing = TrajectoryVideoStreaming(self, streaming_enviroment,
                                                                           trace_list=trace_list,
                                                                           video_csv_list=video_csv_list)
    state_t = np.array([self.abr_policy_learner.extract_features_observation(state_t)
                        for state_t, _, _ in tqdm(expert_trajectory.trajectory_list, desc='transforming')])
    state_t = pd.DataFrame(state_t, columns=self.abr_policy_learner.extract_features_names())
    expert_action = expert_trajectory.trajectory_action_t_arr
    approx_action = self.abr_policy_learner.predict(state_t)
    behavioural_cloning_evaluation, behavioural_cloning_evaluation_trajectory = \
        behavioural_cloning_trace_generator_testing.create_trajectories(random_action_probability=0, cores_avail=1)
    return self.score_comparison(expert_evaluation=expert_evaluation, expert_trajectory=expert_trajectory,
                                 expert_action=expert_action,
                                 approx_evaluation=behavioural_cloning_evaluation,
                                 approx_trajectory=behavioural_cloning_evaluation_trajectory,
                                 approx_action=approx_action, add_data=add_data)
def clone_from_trajectory(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list, video_csv_list, log_steps=False): logging_iteration = 0 # Select the training/validation traces self.policy_history = None trace_list = np.array(trace_list) video_csv_list = np.array(video_csv_list) expert_evaluation = np.array(expert_evaluation) train_idx, test_idx = train_test_split(np.arange(len(expert_evaluation)), test_size=self.validation_split, random_state=RANDOM_SEED) trace_video_pair_list = [f.name for f in expert_evaluation[train_idx]] expert_trajectory_train = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_train.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[test_idx]] self.fit_clustering_scorer(expert_trajectory) ########### if self.weight_samples: self.fit_value_function(to_imitate_evaluation=expert_evaluation[train_idx], to_imitate_trajectory=expert_trajectory_train) advantage = [] for index in train_idx: advantage += list(self.estimate_advantage_frame(expert_evaluation[index], trace_list[index], video_csv_list[index], streaming_enviroment)) advantage = np.array(advantage).flatten() advantage = advantage + np.min( advantage) # We smooth the estimate so that the low advantages are a bit bolstered assert (advantage < 0).sum() == 0, 'advantage should be non negative everywhere' #### estimate advantage on the training samples expert_trajectory_test = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) state_t = np.array([self.classifier.extract_features_observation(state_t) for state_t, _, _ in tqdm(expert_trajectory_train.trajectory_list, desc='transforming')]) state_t = pd.DataFrame(state_t, columns=self.classifier.extract_features_names()) self.impute_NaN_inplace(state_t) expert_action = expert_trajectory_train.trajectory_action_t_arr if self.weight_samples: self.classifier.fit(state_t, expert_action.ravel(), sample_weight=advantage) else: self.classifier.fit(state_t, expert_action.ravel()) if self.policy_history is None: self.policy_history, behavioural_cloning_evaluation = self.score(expert_evaluation[test_idx], expert_trajectory_test, streaming_enviroment, trace_list[test_idx], video_csv_list[test_idx], add_data=False) weight_filepaths = [] for cloning_iteration in range(self.iterations): behavioural_cloning_trace_generator_testing = TrajectoryVideoStreaming(self, streaming_enviroment, trace_list=trace_list, video_csv_list=video_csv_list) behavioural_cloning_evaluation, behavioural_cloning_evaluation_trajectory = behavioural_cloning_trace_generator_testing.create_trajectories( random_action_probability=0,cores_avail=1) behavioural_cloning_evaluation_trajectory.convert_list() transformed_observations = self.transform_trajectory(behavioural_cloning_evaluation_trajectory) sample_weights_new = self.clustering_scorer.predict(transformed_observations) state_t_new = np.array([self.classifier.extract_features_observation(state_t) for state_t, _, _ in tqdm(behavioural_cloning_evaluation_trajectory.trajectory_list, desc='transforming')]) state_t_new = np.array(state_t_new[sample_weights_new == 1.]) state_t_new = pd.DataFrame(state_t_new, columns=self.classifier.extract_features_names()) state_t = state_t.append(state_t_new) action_new = behavioural_cloning_evaluation_trajectory.trajectory_action_t_arr[sample_weights_new == 1.] 
expert_action = np.array(list(expert_action) + list(action_new)) self.classifier.fit(state_t, expert_action.ravel()) weight_filepath = self.rnd_id + '_policy_network_iteration_%d.h5' % cloning_iteration with open(weight_filepath, 'wb') as output_file: dill.dump(self.classifier, output_file) weight_filepaths.append(weight_filepath) best_iteration = self.opt_policy_opt_operator(self.policy_history[self.opt_policy_value_name]) with open(weight_filepaths[best_iteration], 'rb') as input_file: self.classifier = dill.load(input_file)
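# A condensed sketch of the data-aggregation step performed in each cloning iteration above:
# new on-policy samples are kept only if the clustering scorer marks them as expert-like
# (prediction == 1.0), then appended to the running dataset before refitting the classifier.
# The helper name and argument layout are illustrative, not part of this file.
def _aggregate_expert_like_samples(classifier, clustering_scorer, state_df, actions,
                                   new_state_df, new_actions, transformed_new_obs):
    keep = clustering_scorer.predict(transformed_new_obs) == 1.0
    state_df = pd.concat([state_df, new_state_df[keep]], ignore_index=True)
    actions = np.concatenate([np.asarray(actions).ravel(), np.asarray(new_actions)[keep].ravel()])
    classifier.fit(state_df, actions)
    return state_df, actions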
class BehavioralCloningIterative(ABRPolicy): def __init__(self, abr_name, max_quality_change, deterministic, past_measurement_dimensions, future_measurements_dimensions, cloning_epochs, drop_prob=0.1, hidden_dim=32, batch_size_cloning=64, validation_split=0.2, cores_avail=1, balanced=False): """ Behavioral Cloning for Keras (GRU) policy :param abr_name: :param max_quality_change: :param deterministic: :param past_measurement_dimensions: :param future_measurements_dimensions: :param cloning_epochs: :param drop_prob: :param hidden_dim: :param batch_size_cloning: :param validation_split: :param cores_avail: :param balanced: """ super().__init__(abr_name, max_quality_change, deterministic) self.cores_avail = cores_avail self.future_measurements_dimensions = future_measurements_dimensions self.validation_split = validation_split self.balanced = balanced self.past_measurement_dimensions = past_measurement_dimensions self.n_actions = max_quality_change * 2 + 1 self.hidden_dim = hidden_dim self.drop_prob = drop_prob self.policy_network = KerasPolicy(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=hidden_dim, action_dimension=self.n_actions, drop_prob=drop_prob) self.policy_history = None self.batch_size_cloning = batch_size_cloning self.cloning_epochs = cloning_epochs self.trajectory_dummy = Trajectory() def copy(self): copy_ = BehavioralCloningIterative(self.abr_name, self.max_quality_change, self.deterministic, self.past_measurement_dimensions, self.future_measurements_dimensions, self.cloning_epochs, self.drop_prob, self.hidden_dim, self.batch_size_cloning, self.validation_split, self.cores_avail) tmp_file_name = self.randomString(self.rnd_string_length) + 'tmp_id' self.policy_network.model.save_weights(filepath=tmp_file_name) copy_.policy_network.model.load_weights(tmp_file_name) os.remove(tmp_file_name) return copy_ def next_quality(self, observation, reward): current_level = observation['current_level'][-1] streaming_enviroment = observation['streaming_environment'] observation = self.trajectory_dummy.scale_observation( observation) # This is important as the learned representation is also scaled state_t = [v for k, v in sorted( observation.items()) if 'streaming_environment' != k and 'future' not in k] state_t = np.array(state_t).T state_t = np.expand_dims(state_t, axis=0) state_t_future = [v for k, v in sorted( observation.items()) if 'streaming_environment' != k and 'future' in k] state_t_future = np.array(state_t_future).T state_t_future = np.expand_dims(state_t_future, axis=0) action_prob = self.policy_network.model.predict([state_t, state_t_future]) self.likelihood_last_decision_val = max(action_prob) if self.deterministic: next_quality_switch_idx = np.argmax(action_prob) else: probability = action_prob next_quality_switch_idx = np.random.choice(np.arange(len(probability)), size=1, p=probability) next_quality = np.clip(current_level + self.quality_change_arr[next_quality_switch_idx], a_min=0, a_max=streaming_enviroment.max_quality_level) return next_quality def likelihood_last_decision(self): return self.likelihood_last_decision_val def reset(self): pass def reset_learning(self): self.policy_history = None self.policy_network = KerasPolicy(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, action_dimension=self.n_actions, drop_prob=self.drop_prob) def calculate_reference_reward(self, 
expert_evaluation, test_idx): return [frame.reward.mean() for frame in [expert_evaluation[i].streaming_session_evaluation for i in test_idx]] def clone_from_trajectory(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list, video_csv_list, log_steps=False): self.reset_learning() self.policy_history = None self.fit_clustering_scorer(expert_trajectory) trace_list = np.array(trace_list) video_csv_list = np.array(video_csv_list) expert_evaluation = np.array(expert_evaluation) train_idx, test_idx = train_test_split(np.arange(len(expert_evaluation)), test_size=self.validation_split * 2.) test_idx, validation_idx = train_test_split(test_idx, test_size=0.5) trace_video_pair_list = [f.name for f in expert_evaluation[train_idx]] expert_trajectory_train = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_train.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[test_idx]] expert_trajectory_test = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_test.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[validation_idx]] expert_trajectory_validation = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_validation.convert_list() state_t_training = expert_trajectory_train.trajectory_state_t_arr state_t_future_training = expert_trajectory_train.trajectory_state_t_future action_training = to_categorical(expert_trajectory_train.trajectory_action_t_arr, self.n_actions) state_t_testing = expert_trajectory_test.trajectory_state_t_arr state_t_future_testing = expert_trajectory_test.trajectory_state_t_future action_testing = to_categorical(expert_trajectory_test.trajectory_action_t_arr, self.n_actions) validation_data = ([state_t_testing, state_t_future_testing], action_testing) weight_filepaths = [] keras_class_weighting = None self.fit_clustering_scorer(expert_trajectory) if self.balanced: keras_class_weighting = class_weight.compute_class_weight('balanced', np.unique(action_training.argmax(1)), action_training.argmax(1)) for cloning_iteration in tqdm(range(self.cloning_epochs), desc='Cloning Epochs'): history = self.policy_network.model.fit([state_t_training, state_t_future_training], action_training, validation_data=validation_data, epochs=1, verbose=0, class_weight=keras_class_weighting).history if self.policy_history is None: self.policy_history = history else: for k, v in history.items(): self.policy_history[k] += history[k] scoring_history, behavioural_cloning_evaluation = self.score(expert_evaluation[validation_idx], expert_trajectory_validation, streaming_enviroment, trace_list[validation_idx], video_csv_list[validation_idx]) if log_steps: logging_folder = 'logging_%s' % self.abr_name if not os.path.exists(logging_folder): os.makedirs(logging_folder) with open(os.path.join(logging_folder, 'logging_iteration_%d' % cloning_iteration), 'wb') as output_file: dill.dump(behavioural_cloning_evaluation, output_file) for k, v in scoring_history.items(): if k in self.policy_history: self.policy_history[k] += scoring_history[k] else: self.policy_history[k] = scoring_history[k] weight_filepath = self.rnd_id + '_policy_network_iteration_%d.h5' % cloning_iteration self.policy_network.model.save_weights(filepath=weight_filepath) weight_filepaths.append(weight_filepath) best_iteration = self.opt_policy_opt_operator(self.policy_history[self.opt_policy_value_name]) 
self.policy_network.model.load_weights(weight_filepaths[best_iteration]) logger.info('Restoring best iteration %d' % best_iteration) for path in weight_filepaths: os.remove(path) def save_model(self, weight_filepath): self.policy_network.model.save_weights(filepath=weight_filepath) def load_model(self, weight_filepath): self.policy_network.model.load_weights(weight_filepath) def score(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list, video_csv_list, add_data=False): expert_trajectory.convert_list() behavioural_cloning_trace_generator_testing = TrajectoryVideoStreaming(self, streaming_enviroment, trace_list=trace_list, video_csv_list=video_csv_list) state_t_testing = expert_trajectory.trajectory_state_t_arr state_t_future_testing = expert_trajectory.trajectory_state_t_future expert_action = expert_trajectory.trajectory_action_t_arr approx_action = self.policy_network.model.predict([state_t_testing, state_t_future_testing]).argmax(-1) expert_action = expert_action.ravel() behavioural_cloning_evaluation, behavioural_cloning_evaluation_trajectory = behavioural_cloning_trace_generator_testing.create_trajectories( random_action_probability=0, cores_avail=1) return self.score_comparison(expert_evaluation=expert_evaluation, expert_trajectory=expert_trajectory, expert_action=expert_action, approx_evaluation=behavioural_cloning_evaluation, approx_trajectory=behavioural_cloning_evaluation_trajectory, approx_action=approx_action, add_data=add_data)
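# Small illustration of the 'balanced' class weighting used in clone_from_trajectory above:
# rare quality-switch actions receive larger weights than the dominant "keep quality" action.
# The call mirrors the positional class_weight.compute_class_weight(...) usage in this file;
# the helper name and the stand-in action indices are illustrative.
def _balanced_class_weight_demo():
    example_actions = np.array([2, 2, 2, 1, 3, 2, 0, 4, 2, 2])  # stand-in action indices
    # With these counts the majority class 2 gets ~0.33 and each rare class gets 2.0.
    return class_weight.compute_class_weight('balanced', np.unique(example_actions), example_actions)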
def clone_from_trajectory(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment,
                          trace_list, video_csv_list, log_steps=False):
    """
    Main imitation routine: learn to reproduce the expert's action in each observed situation.
    :param expert_evaluation:
    :param expert_trajectory:
    :param streaming_enviroment:
    :param trace_list:
    :param video_csv_list:
    :param log_steps:
    :return:
    """
    logging_iteration = 0
    # Select the training/validation traces
    self.policy_history = None
    trace_list = np.array(trace_list)
    video_csv_list = np.array(video_csv_list)
    expert_evaluation = np.array(expert_evaluation)
    train_idx, test_idx = train_test_split(np.arange(len(expert_evaluation)),
                                           test_size=self.validation_split, random_state=RANDOM_SEED)
    trace_video_pair_list = [f.name for f in expert_evaluation[train_idx]]
    expert_trajectory_train = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list)
    expert_trajectory_train.convert_list()
    trace_video_pair_list = [f.name for f in expert_evaluation[test_idx]]
    self.fit_clustering_scorer(expert_trajectory)
    if self.weight_samples:
        self.fit_value_function(to_imitate_evaluation=expert_evaluation[train_idx],
                                to_imitate_trajectory=expert_trajectory_train)
        advantage = []
        # Add the advantage estimate to the training data
        for index in train_idx:
            advantage += list(self.estimate_advantage_frame(expert_evaluation[index], trace_list[index],
                                                            video_csv_list[index], streaming_enviroment))
        advantage = np.array(advantage).flatten()
        # We smooth the estimate so that the low advantages are bolstered a bit
        advantage = advantage + np.min(advantage)
        # No negative sample weights allowed
        assert (advantage < 0).sum() == 0, 'advantage should be non negative everywhere'
    # Estimate advantage on the training samples
    expert_trajectory_test = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list)
    state_t = np.array([self.classifier.extract_features_observation(state_t)
                        for state_t, _, _ in tqdm(expert_trajectory_train.trajectory_list, desc='transforming')])
    state_t = pd.DataFrame(state_t, columns=self.classifier.extract_features_names())
    self.impute_NaN_inplace(state_t)
    expert_action = expert_trajectory_train.trajectory_action_t_arr
    if self.weight_samples:
        self.classifier.fit(state_t, expert_action.ravel(), sample_weight=advantage)
        if log_steps:
            logging_folder = 'logging_%s' % self.abr_name
            if not os.path.exists(logging_folder):
                os.makedirs(logging_folder)
            with open(os.path.join(logging_folder, 'advantage_distribution'), 'wb') as output_file:
                dill.dump(advantage, output_file)
    else:
        self.classifier.fit(state_t, expert_action.ravel())
    if self.policy_history is None:
        self.policy_history, behavioural_cloning_evaluation = self.score(expert_evaluation[test_idx],
                                                                         expert_trajectory_test,
                                                                         streaming_enviroment,
                                                                         trace_list[test_idx],
                                                                         video_csv_list[test_idx],
                                                                         add_data=False)
        if log_steps:
            with open(os.path.join(logging_folder, 'logging_iteration_%d' % logging_iteration),
                      'wb') as output_file:
                dill.dump(behavioural_cloning_evaluation, output_file)
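# Minimal, self-contained illustration of the sample-weighted fit used above with a
# scikit-learn estimator; the feature frame, labels and weights are stand-ins for
# state_t, expert_action and the (shifted) advantage estimates, and the helper name
# is illustrative only.
def _sample_weighted_fit_demo():
    from sklearn.tree import DecisionTreeClassifier
    features = pd.DataFrame({'buffer': [4.0, 2.0, 0.5], 'throughput': [3.1, 1.2, 0.4]})
    labels = np.array([1, 0, 0])
    weights = np.array([1.5, 1.0, 0.2])  # higher-advantage samples count more in the fit
    return DecisionTreeClassifier().fit(features, labels, sample_weight=weights)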
class GAILPPO(ABRPolicy): def __init__(self, abr_name, max_quality_change, deterministic, past_measurement_dimensions, future_measurements_dimensions, cloning_epochs, drop_prob=0.1, hidden_dim=32, batch_size_cloning=64, validation_split=0.2, pretrain=False, pretrain_max_epochs=20, random_action_probability=0.9, random_action_probability_decay=0.75, future_reward_discount=0.99, adverserial_max_epochs=20, cores_avail=1, balanced=False): """ https://arxiv.org/abs/1606.03476 with PPO as reward learning function :param abr_name: :param max_quality_change: :param deterministic: Distribution over actions or take the best action proposed :param past_measurement_dimensions: how many past measurements do we consider :param future_measurements_dimensions: how many future measurements do we consider :param cloning_epochs: :param drop_prob: :param hidden_dim: :param batch_size_cloning: :param validation_split: :param pretrain: Behavioral cloning before we start training the network :param pretrain_max_epochs: How many epochs do we pretrain :param random_action_probability: Exploration probability :param random_action_probability_decay: Exploration probability decay :param future_reward_discount: gamma in the reward function :param adverserial_max_epochs: How many epochs do we run one training for a cloning epoch :param cores_avail: :param balanced: Do we balance for the actions """ super().__init__(abr_name, max_quality_change, deterministic) self.cores_avail = cores_avail self.future_reward_discount = future_reward_discount self.random_action_probability_decay = random_action_probability_decay self.random_action_probability = random_action_probability self.pretrain = pretrain self.past_measurement_dimensions = past_measurement_dimensions self.n_actions = max_quality_change * 2 + 1 self.hidden_dim = hidden_dim self.drop_prob = drop_prob self.future_measurements_dimensions = future_measurements_dimensions self.discriminator = KerasDiscriminator(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, action_dimension=self.n_actions, drop_prob=self.drop_prob) self.gail_model = KerasGAIL(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, action_dimension=self.n_actions, drop_prob=self.drop_prob) self.value_model = KerasValue(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, drop_prob=self.drop_prob) self.pretrain_history = None self.discriminator_history = None self.value_history = None self.policy_history = None self.pretrain_history_last = None self.discriminator_history_last = None self.value_history_last = None self.policy_history_last = None self.pretrain_max_epochs = pretrain_max_epochs self.adverserial_max_epochs = adverserial_max_epochs self.batch_size_cloning = batch_size_cloning self.cloning_epochs = cloning_epochs self.trajectory_dummy = Trajectory() self.validation_split = validation_split self.balanced = balanced def copy(self): copy_ = GAILPPO(self.abr_name, self.max_quality_change, self.deterministic, self.past_measurement_dimensions, self.future_measurements_dimensions, self.cloning_epochs, self.drop_prob, self.hidden_dim, self.batch_size_cloning, self.validation_split, self.pretrain, self.pretrain_max_epochs, self.random_action_probability, self.random_action_probability_decay, 
self.adverserial_max_epochs, self.cores_avail) tmp_file_name = self.randomString(self.rnd_string_length) + 'tmp_id' self.gail_model.policy_model.model.save_weights(filepath=tmp_file_name) copy_.gail_model.policy_model.model.load_weights(tmp_file_name) os.remove(tmp_file_name) return copy_ def reset_learning(self): self.pretrain_history = None self.discriminator_history = None self.value_history = None self.policy_history = None self.pretrain_history_last = None self.discriminator_history_last = None self.value_history_last = None self.policy_history_last = None self.discriminator = KerasDiscriminator(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, action_dimension=self.n_actions, drop_prob=self.drop_prob) self.gail_model = KerasGAIL(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, action_dimension=self.n_actions, drop_prob=self.drop_prob) self.value_model = KerasValue(past_measurement_dimensions=self.past_measurement_dimensions, future_measurements_dimensions=self.future_measurements_dimensions, hidden_dim=self.hidden_dim, drop_prob=self.drop_prob) def next_quality(self, observation, reward): current_level = observation['current_level'][-1] streaming_enviroment = observation['streaming_environment'] observation = self.trajectory_dummy.scale_observation( observation) # This is important as the learned representation is also scaled state_t = [v for k, v in sorted( observation.items()) if 'streaming_environment' != k and 'future' not in k] state_t = np.array(state_t).T state_t = np.expand_dims(state_t, axis=0) state_t_future = [v for k, v in sorted( observation.items()) if 'streaming_environment' != k and 'future' in k] state_t_future = np.array(state_t_future).T state_t_future = np.expand_dims(state_t_future, axis=0) action_prob = self.gail_model.policy_model.model.predict([state_t, state_t_future]) self.likelihood_last_decision_val = action_prob if self.deterministic: next_quality_switch_idx = np.argmax(action_prob) else: probability = action_prob.flatten() next_quality_switch_idx = np.random.choice(np.arange(len(probability)), size=1, p=probability) next_quality = np.clip(current_level + self.quality_change_arr[next_quality_switch_idx], a_min=0, a_max=streaming_enviroment.max_quality_level) return next_quality def likelihood_last_decision(self): return self.likelihood_last_decision_val def reset(self): pass def split_input_data(self, expert_evaluation, expert_trajectory): train_idx, test_test2_idx = train_test_split(np.arange(len(expert_evaluation)), test_size=self.validation_split * 2.) 
test_idx, test2_idx = train_test_split(test_test2_idx, test_size=self.validation_split) state_train_idx = np.array( [expert_trajectory.trajectory_sample_association[expert_evaluation[idx].name] for idx in train_idx]).flatten() state_test_idx = np.array( [expert_trajectory.trajectory_sample_association[expert_evaluation[idx].name] for idx in test_idx]).flatten() return train_idx, test_idx, test2_idx, state_train_idx, state_test_idx def calculate_reference_reward(self, expert_evaluation, test_idx): return [frame.reward.mean() for frame in [expert_evaluation[i].streaming_session_evaluation for i in test_idx]] def clone_from_trajectory(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list, video_csv_list, log_steps=False): self.reset_learning() self.fit_clustering_scorer(expert_trajectory) # Select the training/validation traces trace_list = np.array(trace_list) video_csv_list = np.array(video_csv_list) expert_evaluation = np.array(expert_evaluation) train_idx, test_idx = train_test_split(np.arange(len(expert_evaluation)), test_size=self.validation_split * 2.) test_idx, validation_idx = train_test_split(test_idx, test_size=0.5) trace_video_pair_list = [f.name for f in expert_evaluation[train_idx]] expert_trajectory_train = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_train.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[test_idx]] expert_trajectory_test = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_test.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[validation_idx]] expert_trajectory_validation = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_validation.convert_list() state_t_training = expert_trajectory_train.trajectory_state_t_arr state_t_future_training = expert_trajectory_train.trajectory_state_t_future action_training = to_categorical(expert_trajectory_train.trajectory_action_t_arr, self.n_actions) state_t_testing = expert_trajectory_test.trajectory_state_t_arr state_t_future_testing = expert_trajectory_test.trajectory_state_t_future action_testing = to_categorical(expert_trajectory_test.trajectory_action_t_arr, self.n_actions) weight_filepaths = [] behavioural_cloning_trace_generator_training = TrajectoryVideoStreaming(self, streaming_enviroment, trace_list=trace_list[train_idx], video_csv_list=video_csv_list[ train_idx]) keras_class_weighting = None if self.balanced: keras_class_weighting = class_weight.compute_class_weight('balanced', np.unique(action_training.argmax(1)), action_training.argmax(1)) if self.pretrain: history = self.gail_model.policy_model.model.fit( [state_t_training, state_t_future_training], action_training, validation_data=([state_t_testing, state_t_future_testing], action_testing), epochs=self.pretrain_max_epochs, verbose=0, callbacks=self.early_stopping, class_weight=keras_class_weighting).history self.pretrain_history_last = history.copy() self.pretrain_history = self.keep_last_entry(history) for cloning_iteration in tqdm(range(self.cloning_epochs), desc='Cloning Epochs'): # -------------------------------------------------------------------------------------------------- # Train Discriminator behavioural_cloning_training_evaluation, behavioural_cloning_training_trajectory = behavioural_cloning_trace_generator_training.create_trajectories( random_action_probability=0) 
behavioural_cloning_training_trajectory.convert_list() training_trajectory_state_t = behavioural_cloning_training_trajectory.trajectory_state_t_arr training_trajectory_state_t_future = behavioural_cloning_training_trajectory.trajectory_state_t_future behavioural_action = behavioural_cloning_training_trajectory.trajectory_action_t_arr behavioural_action_likelihood = behavioural_cloning_training_trajectory.trajectory_likelihood train_idx_clone, test_idx_clone = train_test_split(np.arange(len(training_trajectory_state_t)), test_size=self.validation_split) behavioral_action = to_categorical(behavioural_action, num_classes=self.n_actions) state_t_train = np.vstack([training_trajectory_state_t[train_idx_clone], state_t_training]) state_t_future_train = np.vstack( [training_trajectory_state_t_future[train_idx_clone], state_t_future_training]) action_train = np.vstack([behavioral_action[train_idx_clone], action_training]) target_label_train = to_categorical(np.vstack([0] * len(train_idx_clone) + [1] * len(action_training)), num_classes=2) state_t_validation = np.vstack([training_trajectory_state_t[test_idx_clone], state_t_testing]) state_t_future_validation = np.vstack( [training_trajectory_state_t_future[test_idx_clone], state_t_future_testing]) action_validation = np.vstack([behavioral_action[test_idx_clone], action_testing]) target_label_validation = to_categorical(np.vstack([0] * len(test_idx_clone) + [1] * len(action_testing)), num_classes=2) validation_data_discriminator = ( [state_t_validation, state_t_future_validation, action_validation], target_label_validation) data_train = [state_t_train, state_t_future_train, action_train] history = self.discriminator.model.fit(data_train, target_label_train, validation_data=validation_data_discriminator, epochs=self.adverserial_max_epochs, verbose=0).history # Repeated early stopping callback introduce errors self.discriminator_history_last = history.copy() history = self.keep_last_entry(history) if self.discriminator_history is None: self.discriminator_history = history else: for k, v in history.items(): self.discriminator_history[k] += history[k] data_predict_discriminator = [training_trajectory_state_t, training_trajectory_state_t_future, behavioral_action] discriminator_prediction = self.discriminator.model.predict(data_predict_discriminator)[:, 1] reward = np.log(discriminator_prediction) # Scales to 1.0 as recommended # Train the value net future_reward_obtained = [] i_start = 0 i_end = 0 for evaluation_dataframe in behavioural_cloning_training_evaluation: i_end += len(evaluation_dataframe.streaming_session_evaluation) reward_transform = list(reward[i_start:i_end]) # We ignore the last reward obtained as we don't have a corresponding state for i in range(1, len(reward_transform))[::-1]: exponent = (len(reward_transform) - i) reward_transform[i - 1] += reward_transform[i] * self.future_reward_discount ** exponent future_reward_obtained += reward_transform i_start = i_end future_reward_obtained = np.array(future_reward_obtained).reshape((-1, 1)) future_reward_predicted = self.value_model.model.predict( [training_trajectory_state_t, training_trajectory_state_t_future]) history = self.value_model.model.fit([training_trajectory_state_t, training_trajectory_state_t_future], future_reward_obtained, validation_split=0.2, epochs=self.adverserial_max_epochs, verbose=0).history self.value_history_last = history.copy() history = self.keep_last_entry(history) if self.value_history is None: self.value_history = history else: for k, v in history.items(): 
self.value_history[k] += history[k] estimated_advantage = future_reward_obtained - future_reward_predicted estimated_advantage = estimated_advantage # -------------------------------------------------------------------------------------------------------- # Fit with the PPO loss # print(np.mean(self.gail_model.policy_model.concatenate_informations.get_weights())) # print('---------' * 10) # print('---------' * 10) history = self.gail_model.gail_training_model.fit( [training_trajectory_state_t, training_trajectory_state_t_future, estimated_advantage, behavioural_action_likelihood], behavioral_action, validation_split=self.validation_split, epochs=self.adverserial_max_epochs, verbose=0, shuffle=True).history # print(np.mean(self.gail_model.policy_model.concatenate_informations.get_weights())) # print('=========' * 10) # print('=========' * 10) self.policy_history_last = history.copy() history = self.keep_last_entry(history) if self.policy_history is None: self.policy_history = history else: for k, v in history.items(): self.policy_history[k] += history[k] scoring_history, behavioural_cloning_evaluation = self.score(expert_evaluation[validation_idx], expert_trajectory_validation, streaming_enviroment, trace_list[validation_idx], video_csv_list[validation_idx]) if log_steps: logging_folder = 'logging_%s' % self.abr_name if not os.path.exists(logging_folder): os.makedirs(logging_folder) with open(os.path.join(logging_folder, 'logging_iteration_%d' % cloning_iteration), 'wb') as output_file: dill.dump(behavioural_cloning_evaluation, output_file) for k, v in scoring_history.items(): if k in self.policy_history: self.policy_history[k] += scoring_history[k] else: self.policy_history[k] = scoring_history[k] weight_filepath = self.rnd_id + '_policy_network_iteration_%d.h5' % cloning_iteration self.gail_model.policy_model.model.save_weights(filepath=weight_filepath) weight_filepaths.append(weight_filepath) best_iteration = self.opt_policy_opt_operator(self.policy_history[self.opt_policy_value_name]) self.gail_model.policy_model.model.load_weights(weight_filepaths[best_iteration]) logger.info('Restoring best iteration %d' % best_iteration) for path in weight_filepaths: os.remove(path) def save_model(self, weight_filepath): self.gail_model.policy_model.model.save_weights(filepath=weight_filepath) def load_model(self, weight_filepath): self.gail_model.policy_model.model.load_weights(weight_filepath) def score(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list, video_csv_list, add_data=False): expert_trajectory.convert_list() behavioural_cloning_trace_generator_testing = TrajectoryVideoStreaming(self, streaming_enviroment, trace_list=trace_list, video_csv_list=video_csv_list) state_t_testing = expert_trajectory.trajectory_state_t_arr state_t_future_testing = expert_trajectory.trajectory_state_t_future approx_action = self.gail_model.policy_model.model.predict([state_t_testing, state_t_future_testing]).argmax(-1) expert_action = expert_trajectory.trajectory_action_t_arr behavioural_cloning_evaluation, behavioural_cloning_evaluation_trajectory = behavioural_cloning_trace_generator_testing.create_trajectories( random_action_probability=0, cores_avail=1) return self.score_comparison(expert_evaluation=expert_evaluation, expert_trajectory=expert_trajectory, expert_action=expert_action, approx_evaluation=behavioural_cloning_evaluation, approx_trajectory=behavioural_cloning_evaluation_trajectory, approx_action=approx_action, add_data=add_data)
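# For reference: the in-place loop above that fills `future_reward_obtained` accumulates
# discounted future rewards per streaming session. Below is the conventional return-to-go
# recursion G_t = r_t + gamma * G_{t+1} with gamma = future_reward_discount; it is a
# reference sketch, not a drop-in replacement for the project's own accumulation scheme.
def _discounted_return_to_go(rewards, gamma):
    returns = list(rewards)
    for t in range(len(returns) - 2, -1, -1):
        returns[t] += gamma * returns[t + 1]
    return np.array(returns).reshape((-1, 1))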
def clone_from_trajectory(self, expert_evaluation, expert_trajectory: Trajectory, streaming_enviroment, trace_list, video_csv_list, log_steps=False): self.reset_learning() self.fit_clustering_scorer(expert_trajectory) trace_list = np.array(trace_list) video_csv_list = np.array(video_csv_list) expert_evaluation = np.array(expert_evaluation) train_idx, test_idx = train_test_split(np.arange(len(expert_evaluation)), test_size=self.validation_split * 2.) test_idx, validation_idx = train_test_split(test_idx, test_size=0.5) trace_video_pair_list = [f.name for f in expert_evaluation[train_idx]] expert_trajectory_train = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_train.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[test_idx]] expert_trajectory_test = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_test.convert_list() trace_video_pair_list = [f.name for f in expert_evaluation[validation_idx]] expert_trajectory_validation = expert_trajectory.extract_trajectory(trace_video_pair_list=trace_video_pair_list) expert_trajectory_validation.convert_list() state_t_training = expert_trajectory_train.trajectory_state_t_arr state_t_future_training = expert_trajectory_train.trajectory_state_t_future action_training = to_categorical(expert_trajectory_train.trajectory_action_t_arr, self.n_actions) state_t_testing = expert_trajectory_test.trajectory_state_t_arr state_t_future_testing = expert_trajectory_test.trajectory_state_t_future action_testing = to_categorical(expert_trajectory_test.trajectory_action_t_arr, self.n_actions) ############################################################################################################### #### Fit first network random_prediction_training = self.rnd_cloning_network.model.predict([state_t_training, state_t_future_training]) random_prediction_testing = self.rnd_cloning_network.model.predict([state_t_testing, state_t_future_testing]) testing_data = ([state_t_testing, state_t_future_testing], random_prediction_testing) self.pretrain_distill_history = self.bc_cloning_network.model.fit( [state_t_training, state_t_future_training], random_prediction_training, validation_data=testing_data, epochs=self.rde_distill_epochs, verbose=0, shuffle=True, callbacks=self.early_stopping).history trained_prediction_training = self.bc_cloning_network.model.predict([state_t_training, state_t_future_training]) scaling_factor = np.random.random(size=100) * 100 # Pick the hyperparameter randomly rewards = [np.exp(-fact * (np.square(trained_prediction_training - random_prediction_training)).mean( axis=-1)).flatten().mean() for fact in scaling_factor] self.scaling_factor_sigma = scaling_factor[np.argmin(np.abs(np.array(rewards) - 1.0))] print('Choosen Scaling factor %.2f' % self.scaling_factor_sigma) red_trajectory_generator_training = TrajectoryVideoStreaming(self, streaming_enviroment, trace_list=trace_list[train_idx], video_csv_list=video_csv_list[train_idx]) keras_class_weighting = None if self.balanced: keras_class_weighting = class_weight.compute_class_weight('balanced', np.unique(action_training.argmax(1)), action_training.argmax(1)) weight_filepaths = [] if self.pretrain: self.pretrain_bc_history = self.policy_network.policy_model.model.fit( [state_t_training, state_t_future_training], action_training, validation_data=([state_t_testing, state_t_future_testing], action_testing), epochs=self.rde_distill_epochs, verbose=0, callbacks=self.early_stopping, 
                class_weight=keras_class_weighting).history
        for cloning_iteration in tqdm(range(self.cloning_epochs), desc='Cloning Epochs'):
            # Iterations of the RED algorithm
            training_evaluation, training_trajectories = red_trajectory_generator_training.create_trajectories(
                random_action_probability=0)
            training_trajectories.convert_list()
            state_t_training_sampled = training_trajectories.trajectory_state_t_arr
            state_t_future_training_sampled = training_trajectories.trajectory_state_t_future
            action_sampled = to_categorical(training_trajectories.trajectory_action_t_arr,
                                            num_classes=self.n_actions)
            action_likelihood_sampled = training_trajectories.trajectory_likelihood
            bc_clone_prediction = self.bc_cloning_network.model.predict(
                [state_t_training_sampled, state_t_future_training_sampled])
            random_prediction = self.rnd_cloning_network.model.predict(
                [state_t_training_sampled, state_t_future_training_sampled])
            # RED reward: r(s) = exp(-sigma_1 * || f_hat_theta(s) - f_theta(s) ||_2^2)
            reward = np.exp(-self.scaling_factor_sigma * np.square(bc_clone_prediction - random_prediction).mean(
                axis=-1)).flatten()  # sigma was calibrated above so that the mean reward is ~1.0, as recommended
            # Train the value net
            future_reward_obtained = []
            i_start = 0
            i_end = 0
            for evaluation_dataframe in training_evaluation:
                i_end += len(evaluation_dataframe.streaming_session_evaluation)
                reward_transform = list(reward[i_start:i_end])
                # We ignore the last reward obtained as we don't have a corresponding state
                for i in range(1, len(reward_transform))[::-1]:
                    exponent = (len(reward_transform) - i)
                    reward_transform[i - 1] += reward_transform[i] * self.future_reward_discount ** exponent
                future_reward_obtained += reward_transform
                i_start = i_end
            future_reward_obtained = np.array(future_reward_obtained).reshape((-1, 1))
            future_reward_predicted = self.value_model.model.predict(
                [state_t_training_sampled, state_t_future_training_sampled])
            # No early stopping here: re-using the callback across repeated fit() calls introduces errors
            history = self.value_model.model.fit([state_t_training_sampled, state_t_future_training_sampled],
                                                 future_reward_obtained,
                                                 validation_split=0.2, epochs=self.model_iterations,
                                                 verbose=0, shuffle=True).history
            self.value_history_last = history.copy()
            history = self.keep_last_entry(history)
            if self.value_history is None:
                self.value_history = history
            else:
                for k, v in history.items():
                    self.value_history[k] += history[k]
            estimated_advantage = future_reward_obtained - future_reward_predicted
            history = self.policy_network.gail_training_model.fit(
                [state_t_training_sampled, state_t_future_training_sampled,
                 estimated_advantage, action_likelihood_sampled],
                action_sampled,
                validation_split=self.validation_split, epochs=self.model_iterations,
                verbose=0, shuffle=True).history
            self.policy_history_last = history.copy()
            history = self.keep_last_entry(history)
            if self.policy_history is None:
                self.policy_history = history
            else:
                for k, v in history.items():
                    self.policy_history[k] += history[k]
            scoring_history, behavioural_cloning_evaluation = self.score(expert_evaluation[validation_idx],
                                                                         expert_trajectory_validation,
                                                                         streaming_enviroment,
                                                                         trace_list[validation_idx],
                                                                         video_csv_list[validation_idx])
            if log_steps:
                logging_folder = 'logging_%s' % self.abr_name
                if not os.path.exists(logging_folder):
                    os.makedirs(logging_folder)
                with open(os.path.join(logging_folder,
                                       'logging_iteration_%d' % cloning_iteration), 'wb') as output_file:
                    dill.dump(behavioural_cloning_evaluation, output_file)
            for k, v in scoring_history.items():
                if k in self.policy_history:
                    self.policy_history[k] += scoring_history[k]
                else:
                    self.policy_history[k] = scoring_history[k]
            weight_filepath = self.rnd_id + '_policy_network_iteration_%d.h5' % cloning_iteration
            self.policy_network.policy_model.model.save_weights(filepath=weight_filepath)
            weight_filepaths.append(weight_filepath)

        best_iteration = self.opt_policy_opt_operator(self.policy_history[self.opt_policy_value_name])
        self.policy_network.policy_model.model.load_weights(weight_filepaths[best_iteration])
        for path in weight_filepaths:
            os.remove(path)
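
# --------------------------------------------------------------------------------------------------------
# Illustration only: a self-contained sketch of the RED-style reward used in the cloning loop above and of
# the sigma calibration performed before it. `distilled_pred` / `random_pred` stand for the outputs of the
# distilled (bc_cloning_network) and fixed random (rnd_cloning_network) embedders; the helper names are
# hypothetical and not part of the class above.
import numpy as np


def red_reward(distilled_pred, random_pred, sigma):
    # r(s) = exp(-sigma * ||f_hat(s) - f(s)||^2), with the squared error averaged over the embedding dimension
    return np.exp(-sigma * np.square(distilled_pred - random_pred).mean(axis=-1))


def calibrate_sigma(distilled_pred, random_pred, n_candidates=100):
    # Randomly sampled candidates; keep the one whose mean reward on the expert states is closest to 1.0
    candidates = np.random.random(size=n_candidates) * 100
    mean_rewards = np.array([red_reward(distilled_pred, random_pred, s).mean() for s in candidates])
    return candidates[np.argmin(np.abs(mean_rewards - 1.0))]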

    def transform_csv(self, to_transform_path):
        """
        Transform a recorded streaming session into an evaluation log and an expert trajectory
        :param to_transform_path: path to the raw_dataframe.csv of the recorded session
        :return: (logging_list, trajectory_object)
        """
        self.reset()
        trace_video_pair_name = to_transform_path.split('/')[-2]
        client_logger_file = to_transform_path.replace(
            'raw_dataframe.csv', 'local_client_state_logger.csv')
        client_logger_file = pd.read_csv(client_logger_file, index_col=0)
        client_logger_file['buffer_level'] = client_logger_file[
            'buffered_until'] - client_logger_file['played_until']
        client_logger_file['time_elapsed'] = pd.to_timedelta(
            client_logger_file.timestamp_s - client_logger_file.timestamp_s.iloc[0], unit='s')
        client_logger_file['fps'] = client_logger_file.decodedFrames.diff()
        decoded_frame_unit = client_logger_file[['time_elapsed', 'fps']]
        client_logger_file = client_logger_file.drop('fps', axis=1)
        decoded_frame_unit = decoded_frame_unit.set_index('time_elapsed')
        decoded_frame_unit = decoded_frame_unit.resample(
            '%ds' % self.fps_smoothing_s).sum() / float(self.fps_smoothing_s)
        decoded_frame_unit['fps'] = decoded_frame_unit['fps'].map(
            self.map_experimental_fps_avail_fps)
        decoded_frame_unit = decoded_frame_unit.reset_index()
        client_logger_file = pd.merge_asof(client_logger_file, decoded_frame_unit,
                                           on='time_elapsed', direction='nearest')
        client_logger_file['current_level'] = (
            client_logger_file['videoWidth'] * client_logger_file['videoHeight'] * client_logger_file['fps']).map(
            self.map_resolution_n_pixels)
        client_logger_file = client_logger_file.sort_values('timestamp_s')
        ### We have a quality level progression which we need to simulate
        quality_level_progression_list = []
        insertion_points = (self.video_information_csv.time_s - self.video_information_csv.seg_len_s).values
        for n_segment, points in enumerate(insertion_points):
            client_logger_index = np.searchsorted(
                client_logger_file['played_until'], points)
            if points >= client_logger_file['played_until'].iloc[-1]:
                break
            quality_level_progression_list.append(
                client_logger_file['current_level'].iloc[client_logger_index])
        trajectory_object = Trajectory()
        trajectory_object.new_trace_video_pair_name(trace_video_pair_name)
        previous_quality = 0
        current_quality = 0
        previous_observation = self.generate_observation_dictionary()
        previous_likelihood = np.zeros((1, len(self.quality_change_arr)))
        previous_likelihood[0, len(self.quality_change_arr) // 2] = 1.  # One-hot on the central 'no switch' action
        del previous_observation[
            'streaming_environment']  # We can't make use of this in the trajectory
        video_finished = False
        quality_iteration_idx = 0
        logging_list = []
        while not video_finished:
            observation, reward, video_finished, info = self.get_video_chunk(
                current_quality)
            switch = current_quality - previous_quality
            action_idx = self.map_switch_idx(switch)
            previous_quality = current_quality
            if video_finished or quality_iteration_idx >= len(
                    quality_level_progression_list):
                # Add the last observation
                del observation[
                    'streaming_environment']  # We can't make use of this in the trajectory
                trajectory_object.add_trajectory_triple(
                    previous_observation, observation, action_idx)
                trajectory_object.add_likelihood(previous_likelihood, False)
                break
            current_quality = quality_level_progression_list[
                quality_iteration_idx]
            # Map the recorded quality level onto the closest level we can actually reach
            # given the maximum allowed quality switch per chunk
            switch = int(
                np.clip(current_quality - previous_quality,
                        a_min=-self.max_switch_allowed,
                        a_max=self.max_switch_allowed))
            current_quality = previous_quality + switch
            del observation[
                'streaming_environment']  # We can't make use of this in the trajectory
            trajectory_object.add_trajectory_triple(previous_observation, observation, action_idx)
            trajectory_object.add_likelihood(previous_likelihood, False)
            previous_likelihood = np.zeros((1, len(self.quality_change_arr)))
            previous_likelihood[0, action_idx] = 1.
            previous_observation = observation
            quality_iteration_idx += 1
        streaming_session_evaluation = pd.DataFrame(
            self.return_log_state(), columns=self.get_logging_columns())
        logging_list.append(
            StreamingSessionEvaluation(
                streaming_session_evaluation=streaming_session_evaluation,
                name=trace_video_pair_name,
                max_buffer_length_s=self.buffer_threshold_ms / 1000.,
                max_switch_allowed=self.max_switch_allowed))
        return logging_list, trajectory_object
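
# --------------------------------------------------------------------------------------------------------
# Illustration only: a standalone sketch (with synthetic data) of the fps smoothing performed above. The
# cumulative decodedFrames counter is differenced, resampled into fixed windows, and divided by the window
# length to obtain a frames-per-second estimate; fps_smoothing_s = 2 is an assumed value for illustration.
import pandas as pd

fps_smoothing_s = 2
log = pd.DataFrame({'timestamp_s': [0.0, 0.5, 1.0, 1.5, 2.0, 2.5, 3.0],
                    'decodedFrames': [0, 12, 25, 37, 50, 62, 75]})
log['time_elapsed'] = pd.to_timedelta(log.timestamp_s - log.timestamp_s.iloc[0], unit='s')
log['fps'] = log.decodedFrames.diff()
fps_estimate = (log[['time_elapsed', 'fps']]
                .set_index('time_elapsed')
                .resample('%ds' % fps_smoothing_s)
                .sum() / float(fps_smoothing_s))
print(fps_estimate)  # per-window fps estimate; the first window is biased low by the NaN from diff()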

    def transform_csv(self, to_transform_path):
        self.reset()
        trace_video_pair_name = to_transform_path.split('/')[-1]
        evaluation_dataframe = pd.read_csv(to_transform_path, index_col=0)
        quality_level_progression_list = evaluation_dataframe[
            'quality_level_chosen'].values
        trajectory_object = Trajectory()
        trajectory_object.new_trace_video_pair_name(trace_video_pair_name)
        previous_quality = 0
        current_quality = 0
        previous_observation = self.generate_observation_dictionary()
        previous_likelihood = np.zeros((1, len(self.quality_change_arr)))
        previous_likelihood[0, len(self.quality_change_arr) // 2] = 1.  # One-hot on the central 'no switch' action
        del previous_observation[
            'streaming_environment']  # We can't make use of this in the trajectory
        video_finished = False
        quality_iteration_idx = 0
        logging_list = []
        while not video_finished:
            observation, reward, video_finished, info = self.get_video_chunk(
                current_quality)
            switch = current_quality - previous_quality
            action_idx = self.map_switch_idx(switch)
            previous_quality = current_quality
            if video_finished or quality_iteration_idx >= len(
                    quality_level_progression_list):
                # Add the last observation
                del observation[
                    'streaming_environment']  # We can't make use of this in the trajectory
                trajectory_object.add_trajectory_triple(
                    previous_observation, observation, action_idx)
                trajectory_object.add_likelihood(previous_likelihood, False)
                break
            current_quality = quality_level_progression_list[
                quality_iteration_idx]
            # Map the recorded quality level onto the closest level we can actually reach
            # given the maximum allowed quality switch per chunk
            switch = int(
                np.clip(current_quality - previous_quality,
                        a_min=-self.max_switch_allowed,
                        a_max=self.max_switch_allowed))
            current_quality = previous_quality + switch
            del observation[
                'streaming_environment']  # We can't make use of this in the trajectory
            trajectory_object.add_trajectory_triple(previous_observation, observation, action_idx)
            trajectory_object.add_likelihood(previous_likelihood, False)
            previous_likelihood = np.zeros((1, len(self.quality_change_arr)))
            previous_likelihood[0, action_idx] = 1.
            previous_observation = observation
            quality_iteration_idx += 1
        streaming_session_evaluation = pd.DataFrame(
            self.return_log_state(), columns=self.get_logging_columns())
        logging_list.append(
            StreamingSessionEvaluation(
                streaming_session_evaluation=streaming_session_evaluation,
                name=trace_video_pair_name,
                max_buffer_length_s=self.buffer_threshold_ms / 1000.,
                max_switch_allowed=self.max_switch_allowed))
        return logging_list, trajectory_object
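
# --------------------------------------------------------------------------------------------------------
# Illustration only: a standalone sketch (hypothetical helper, not part of the classes above) of the greedy
# mapping both transform_csv variants perform step by step: an arbitrary recorded quality progression is
# replayed under the max-switch-per-chunk constraint, clipping each requested jump to the allowed range.
import numpy as np


def feasible_progression(recorded_levels, max_switch_allowed, start_quality=0):
    feasible = []
    previous_quality = start_quality
    for target in recorded_levels:
        switch = int(np.clip(target - previous_quality,
                             a_min=-max_switch_allowed, a_max=max_switch_allowed))
        previous_quality = previous_quality + switch
        feasible.append(previous_quality)
    return feasible

# Example: feasible_progression([3, 3, 0], max_switch_allowed=1) -> [1, 2, 1]
# (large jumps in the recorded progression are only approached one allowed switch at a time)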