# NOTE: imports inferred from usage in this module. `validations`,
# `conversions`, `object_generator_utils`, `object_generator`, and
# `data_manager` are assumed to be project-local modules; unqualified helpers
# such as `transpose_data` and `get_data_and_label_tensor` are assumed to be
# defined elsewhere in this file.
import numpy as np
import pandas as pd

import conversions
import data_manager
import object_generator
import object_generator_utils
import validations


def get_transposed_data(data: dict):
    validations.validate_data_dict_keys(data)

    for key in data['data']:
        transposed_data = [transpose_data(datum) for datum in data['data'][key]]
        data['data'][key] = tuple(transposed_data)

    return data

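
# Hedged sketch: `transpose_data` is defined elsewhere in this module. The toy
# helper below only illustrates the assumed semantics -- flipping a
# sequence-major list of rows into a feature-major list of columns -- and is
# not the project's implementation.
def _transpose_data_sketch(rows):
    # e.g. [[1, 2], [3, 4], [5, 6]] -> [[1, 3, 5], [2, 4, 6]]
    return [list(col) for col in zip(*rows)]
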
def get_sub_sampled_sequences(data: dict):
    validations.validate_all_data_present_in_data_dict(data)
    validations.validate_data_dict_keys(data)
    new_data = object_generator_utils.get_empty_data_dict()

    for key in data['data']:
        key_set = find_set_for_key(data, key)
        if key_set:
            sub_sample_sequences(data['data'][key], key, key_set, new_data)

    return new_data

def tensorify_data_gru_d(data: dict, cuda_enabled=False):
    """
    @param data: Data dictionary that needs to be converted to tensors in the
        GRU-D style of data.
    @param cuda_enabled: If true, converts the data into CUDA tensors.
    @return: Data dictionary of tensors that can be used for training.
    """
    validations.validate_data_dict_keys(data)
    validations.validate_all_data_present_in_data_dict(data)

    for key in data['data'].keys():
        data['data'][key] = get_data_and_label_tensor(data, key, cuda_enabled)

    return data

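
# Hedged usage sketch: shows the intended call pattern, assuming `torch` is
# available and `data` is a validated data dict; nothing here is specific to
# this project beyond tensorify_data_gru_d itself.
def _example_tensorify(data: dict):
    import torch  # local import: torch is not otherwise used at module level
    return tensorify_data_gru_d(data, cuda_enabled=torch.cuda.is_available())
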
def data_debug_string(data: dict, seq_limit):
    """
    @param data: Data in the classic dictionary format.
    @param seq_limit: Integer value that limits the length of each printed
        sequence.
    @return: None. Prints a small sample slice of the data for the first key.
    """
    validations.validate_data_dict_keys(data)
    first_key = next(iter(data['data'].keys()))
    print('first_key: ', first_key)

    for idx, datum in enumerate(data['data'][first_key]):
        # Index 3 is assumed to hold the label; everything before it is a
        # sequence, so only a slice of it is printed.
        if idx == 3:
            print("Label: ", datum)
        else:
            print(datum[:seq_limit])

def evaluate_multitask_lstm_learner(data,
                                    key_set: str,
                                    multitask_learner_model,
                                    classification_criterion,
                                    optimizer=None,
                                    use_histogram=False):
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)

    total_classification_loss = 0
    labels = []
    predictions = []
    users = []

    # No optimizer means we are evaluating, not training.
    if not optimizer:
        multitask_learner_model.eval()
    else:
        multitask_learner_model.train()

    for key in data[key_set]:
        student_id = conversions.extract_student_id_from_key(key)
        student_key = 'student_' + str(student_id)
        actual_data, covariate_data, histogram_data, train_label = data['data'][key]

        actual_data = actual_data[0].unsqueeze(0)
        if use_histogram:
            actual_data = histogram_data.unsqueeze(0)

        y_pred = multitask_learner_model(student_key, actual_data, covariate_data)
        classification_loss = classification_criterion(y_pred, train_label)
        total_classification_loss += classification_loss.item()

        # Check if training.
        if optimizer:
            multitask_learner_model.zero_grad()
            classification_loss.backward()
            optimizer.step()

        labels.append(train_label)
        y_pred_squeezed = y_pred.squeeze(0)
        _, max_idx = y_pred_squeezed.max(0)
        predictions.append(max_idx)
        users.append(student_id)

    return total_classification_loss, labels, predictions, users

def convert_logical_not_missing_flags(data):
    validations.validate_data_dict_keys(data)
    new_dict = {}
    data_dict = {}
    new_dict['train_ids'] = data['train_ids']
    new_dict['val_ids'] = data['val_ids']
    new_dict['test_ids'] = data['test_ids']

    for key in data['data'].keys():
        mutable_data = list(data['data'][key])
        # Index 1 holds the missing flags; invert the 0/1 flags elementwise.
        mutable_data[1] = np.logical_not(
            np.array(data['data'][key][1])).astype(int).tolist()
        data_dict[key] = tuple(mutable_data)

    new_dict['data'] = data_dict

    return new_dict

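
# Worked example of the flag inversion above: np.logical_not flips the 0/1
# missing flags elementwise.
def _example_invert_flags():
    flags = [0, 1, 0]
    return np.logical_not(np.array(flags)).astype(int).tolist()  # [1, 0, 1]
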
def get_statistics_on_data_dict(data: dict, feature_list: list):
    """
    @attention Statistics returned are
        ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max'].
    @param data: Data in the classic dictionary format.
    @param feature_list: Feature list for the data.
    @return: Statistics on the whole data, and the raw concatenated data.
    """
    validations.validate_data_dict_keys(data)
    validations.validate_all_data_present_in_data_dict(data)

    # Concatenate the per-key sequences once (DataFrame.append was removed in
    # pandas 2.0, so pd.concat is used instead).
    frames = [pd.DataFrame(data['data'][key][0]) for key in data['data']]
    df_for_statistics = pd.concat(frames, ignore_index=True)

    if not data_manager.FLATTEN_SEQUENCE_TO_COLS:
        df_for_statistics.columns = feature_list
    df_for_statistics.replace(to_replace=-1, value=np.nan, inplace=True)

    return df_for_statistics.describe(percentiles=[0.25, 0.5, 0.75]), df_for_statistics

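
# Hedged usage sketch: the feature names below are hypothetical stand-ins;
# `feature_list` must match the per-step feature count whenever sequences are
# not flattened to columns.
def _example_statistics(data: dict):
    feature_list = ['heart_rate', 'steps']  # hypothetical feature names
    stats, raw_df = get_statistics_on_data_dict(data, feature_list)
    print(stats.loc[['mean', 'std']])
    return stats, raw_df
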
def add_mean_vector_to_data(data: dict):
    validations.validate_data_dict_keys(data)
    validations.validate_data_dict_data_len(data)

    for key in data['data']:
        data_list = list(data['data'][key])
        feature_data = data_list[0]
        missing_flags = data_list[1]
        time_delta = data_list[2]
        label = data_list[3]

        # Per-feature mean over each series, given its missing flags.
        mean_vector = [0] * len(feature_data)
        for i in range(len(feature_data)):
            mean_vector[i] = get_mean_for_series(feature_data[i], missing_flags[i])

        data_tuple = (feature_data, missing_flags, time_delta, mean_vector, label)
        data['data'][key] = data_tuple

    return data

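
# Hedged sketch: `get_mean_for_series` is defined elsewhere in this module.
# The helper below only illustrates the assumed behaviour -- a mean over the
# entries whose flag marks them as observed -- under the assumption that
# flag == 1 means "present" after convert_logical_not_missing_flags.
def _mean_for_series_sketch(series, flags):
    observed = [x for x, flag in zip(series, flags) if flag]
    return sum(observed) / len(observed) if observed else 0
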
def evaluate_set(data, key_set: str, model, criterion, optimizer=None,
                 train_covariates=False):
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)

    total_loss = 0
    labels = []
    predictions = []

    # No optimizer means we are evaluating, not training.
    if not optimizer:
        model.eval()
    else:
        model.train()

    for key in data[key_set]:
        actual_data, covariate_data, train_label = data['data'][key]
        y_pred = model(actual_data, covariate_data) if train_covariates else model(actual_data)
        y_pred_unsqueezed = y_pred.unsqueeze(0)
        loss = criterion(y_pred_unsqueezed, train_label)
        total_loss += loss.item()

        # Check if training; `criterion` is always set, so the optimizer alone
        # decides.
        if optimizer:
            model.zero_grad()
            loss.backward()
            optimizer.step()

        labels.append(train_label)
        _, max_idx = y_pred.max(0)
        predictions.append(max_idx)

    return total_loss, labels, predictions

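
# Hedged follow-up sketch: the labels and predictions returned above are
# assumed to be single-element torch tensors, so a plain accuracy can be
# computed by unwrapping them with .item().
def _accuracy_from_evaluation(labels, predictions):
    correct = sum(int(label.item()) == int(pred.item())
                  for label, pred in zip(labels, predictions))
    return correct / len(labels) if labels else 0.0
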
def evaluate_multitask_learner(data,
                               key_set: str,
                               num_classes,
                               multitask_learner_model,
                               reconstruction_criterion,
                               classification_criterion,
                               device,
                               optimizer=None,
                               alpha=1,
                               beta=1,
                               use_histogram=False,
                               histogram_seq_len=None,
                               ordinal_regression=False,
                               use_covariates=True):
    validations.validate_data_dict_keys(data)
    validate_key_set_str(key_set)

    total_reconstruction_loss = 0
    total_classification_loss = 0
    total_joint_loss = 0
    labels = []
    predictions = []
    users = []

    # No optimizer means we are evaluating, not training.
    if not optimizer:
        multitask_learner_model.eval()
    else:
        multitask_learner_model.train()

    for key in data[key_set]:
        student_id = conversions.extract_student_id_from_key(key)
        student_key = 'student_' + str(student_id)
        actual_data, covariate_data, histogram_data, train_label = data['data'][key]

        if ordinal_regression:
            train_label_vector = get_target_vector_for_ordinal_regression(
                train_label, num_classes, device)

        actual_data = actual_data[0].unsqueeze(0)
        if use_histogram:
            if histogram_seq_len:
                # Truncate the histogram to at most `histogram_seq_len` steps.
                histogram_data = histogram_data[:min(histogram_seq_len, len(histogram_data))]
            actual_data = histogram_data.unsqueeze(0)

        decoded_output, y_pred = multitask_learner_model(
            student_key, actual_data, covariate_data if use_covariates else None)

        # Decoded output is `None` if training on only co-variates.
        reconstruction_loss = (reconstruction_criterion(actual_data, decoded_output)
                               if decoded_output is not None
                               else object_generator.get_tensor_on_correct_device([0]))
        total_reconstruction_loss += reconstruction_loss.item()

        if ordinal_regression:
            classification_loss = classification_criterion(y_pred, train_label_vector)
        else:
            classification_loss = classification_criterion(y_pred, train_label)
        total_classification_loss += classification_loss.item()

        # Joint multi-task objective: weighted sum of the two losses.
        joint_loss = alpha * reconstruction_loss + beta * classification_loss
        total_joint_loss += joint_loss.item()

        # Check if training.
        if optimizer:
            multitask_learner_model.zero_grad()
            joint_loss.backward()
            optimizer.step()

        labels.append(train_label)
        predicted_class = get_predicted_class(y_pred, ordinal_regression=ordinal_regression)
        predictions.append(predicted_class)
        users.append(student_id)

    return (total_joint_loss, total_reconstruction_loss, total_classification_loss,
            labels, predictions, users)

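
# Hedged usage sketch of a train/validation epoch driven by
# evaluate_multitask_learner. The criteria and optimizer below are assumed
# choices standing in for whatever this project actually instantiates; the
# key-set names follow the 'train_ids'/'val_ids' convention used above.
def _example_epoch(data, model, device, num_classes):
    import torch
    reconstruction_criterion = torch.nn.L1Loss()             # assumed choice
    classification_criterion = torch.nn.CrossEntropyLoss()   # assumed choice
    optimizer = torch.optim.Adam(model.parameters())

    # Passing the optimizer puts the model in training mode; omitting it
    # evaluates the set instead.
    train_stats = evaluate_multitask_learner(
        data, 'train_ids', num_classes, model,
        reconstruction_criterion, classification_criterion, device,
        optimizer=optimizer)
    val_stats = evaluate_multitask_learner(
        data, 'val_ids', num_classes, model,
        reconstruction_criterion, classification_criterion, device)
    return train_stats, val_stats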