import copy
import os

import torch
import tqdm
from sklearn import metrics

from src.bin import statistics
from src.data_manager import cross_val
from src.models.multitask_learning import multitask_autoencoder
from src.utils.read_utils import read_pickle

# The remaining project modules used below; these import paths are assumptions
# based on the package layout above and may need adjusting.
from src import data_manager, definitions
from src.bin import plotting, tensorify
from src.utils import conversions, write_utils
from src.experiments.multitask_learning import helper

feature_list = data_manager.FEATURE_LIST
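
# sklearn's precision_recall_fscore_support returns the tuple
# (precision, recall, f-score, support); with average='weighted' the scalar
# weighted F1 sits at index 2.
F_SCORE_INDEX = 2


# list_mean is called below but never imported; a minimal local definition
# (an assumption about the intended helper) is provided here.
def list_mean(values):
    """Return the arithmetic mean of a list of numbers."""
    return sum(values) / len(values)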

############ Pickle ############
data_file_path = os.path.join(
    definitions.DATA_DIR, 'training_data/shuffled_splits',
    'training_date_normalized_shuffled_splits_select_features_no_prev_stress_2.pkl'
)
data = read_pickle(data_file_path)
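# Assumed pickle layout, inferred from the indexing below: data['data'] maps a
# (student, sequence) key to a tuple whose index 0 holds the raw features,
# index 4 the histogram features, and index definitions.COVARIATE_DATA_IDX the
# covariates; split ids are supplied separately as 'train_ids'/'val_ids'.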

############ Stats #############
print(statistics.get_train_test_val_label_counts_from_raw_data(data))

################################## Init ##################################
use_histogram = True
autoencoder_bottle_neck_feature_size = 128
autoencoder_num_layers = 1
alpha, beta = 0.001, 1
decay = 0.0001
first_key = next(iter(data['data'].keys()))
if use_histogram:
    # Index 4 of each sequence tuple is assumed to hold the histogram features.
    num_features = len(data['data'][first_key][4][0])
else:
    num_features = len(data['data'][first_key][0][0])
num_covariates = len(data['data'][first_key][definitions.COVARIATE_DATA_IDX])
shared_hidden_layer_size = 256
user_dense_layer_hidden_size = 64
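
# NOTE: the module-level defaults above are convenient for interactive runs;
# inside search_multitask_auto_encoder they are re-derived for each
# configuration via helper.get_params_from_model.
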
def search_multitask_auto_encoder(hyper_parameters_list: list, data: dict):
    """Grid search over hyperparameter configurations for the multitask
    autoencoder, using k-fold cross-validation stratified by students, and
    pickle the per-configuration scores."""
    splits = cross_val.get_k_fold_cross_val_splits_stratified_by_students(data)
    student_list = conversions.extract_distinct_student_ids_from_keys(
        data['data'].keys())
    tensorified_data = tensorify.tensorify_data_gru_d(
        copy.deepcopy(data), torch.cuda.is_available())

    final_scores_for_each_config = []

    print("Label Distribution")
    print(statistics.get_train_test_val_label_counts_from_raw_data(data))

    for model_params_no, model_params in enumerate(hyper_parameters_list):
        print(
            "###################### Param Config No: {} ########################"
            .format(model_params_no))
        print("Params: ", model_params)

        (use_histogram, autoencoder_bottle_neck_feature_size,
         autoencoder_num_layers, alpha, beta, decay, num_features,
         num_covariates, shared_hidden_layer_size,
         user_dense_layer_hidden_size, num_classes, learning_rate, n_epochs,
         shared_layer_dropout_prob, user_head_dropout_prob, class_weights,
         device) = helper.get_params_from_model(model_params, data)

        best_val_scores = []

        for split_no, split in enumerate(splits):

            print("Split {}".format(split_no))

            best_split_score = -1

            tensorified_data['train_ids'] = split["train_ids"]
            tensorified_data['val_ids'] = split["val_ids"]
            tensorified_data['test_ids'] = []

            model, reconstruction_criterion, classification_criterion, optimizer = helper.init_multitask_autoencoder_learner(
                num_features, autoencoder_bottle_neck_feature_size,
                autoencoder_num_layers, shared_hidden_layer_size,
                user_dense_layer_hidden_size, num_classes, num_covariates,
                shared_layer_dropout_prob, user_head_dropout_prob,
                learning_rate, decay, class_weights, student_list)

            total_loss_over_epochs, scores_over_epochs = (
                plotting.get_empty_stat_over_n_epoch_dictionaries())
            reconstruction_loss_over_epochs = copy.deepcopy(
                total_loss_over_epochs)
            classification_loss_over_epochs = copy.deepcopy(
                total_loss_over_epochs)

            for epoch in tqdm.tqdm(range(n_epochs)):

                train_stats, val_stats = helper.train_for_one_epoch(
                    tensorified_data, num_classes, model,
                    reconstruction_criterion, classification_criterion, device,
                    optimizer, alpha, beta, use_histogram)
                (train_total_loss, train_total_reconstruction_loss,
                 train_total_classification_loss, train_labels, train_preds,
                 train_users) = train_stats
                (val_total_loss, val_total_reconstruction_loss,
                 val_total_classification_loss, val_labels, val_preds,
                 val_users) = val_stats

                ######## Appending losses ########
                total_loss_over_epochs['train_loss'].append(train_total_loss)
                total_loss_over_epochs['val_loss'].append(val_total_loss)

                reconstruction_loss_over_epochs['train_loss'].append(
                    train_total_reconstruction_loss)
                reconstruction_loss_over_epochs['val_loss'].append(
                    val_total_reconstruction_loss)

                classification_loss_over_epochs['train_loss'].append(
                    train_total_classification_loss)
                classification_loss_over_epochs['val_loss'].append(
                    val_total_classification_loss)

                ######## Appending Metrics ########
                train_label_list = conversions.tensor_list_to_int_list(
                    train_labels)
                train_pred_list = conversions.tensor_list_to_int_list(
                    train_preds)
                val_label_list = conversions.tensor_list_to_int_list(
                    val_labels)
                val_pred_list = conversions.tensor_list_to_int_list(val_preds)

                # Weighted F1 for this epoch (the scalar at F_SCORE_INDEX of
                # the precision_recall_fscore_support tuple).
                train_scores = metrics.precision_recall_fscore_support(
                    train_label_list, train_pred_list,
                    average='weighted')[F_SCORE_INDEX]
                val_scores = metrics.precision_recall_fscore_support(
                    val_label_list, val_pred_list,
                    average='weighted')[F_SCORE_INDEX]

                scores_over_epochs['train_scores'].append(train_scores)
                scores_over_epochs['val_scores'].append(val_scores)

                if val_scores > best_split_score:
                    best_split_score = val_scores

            best_val_scores.append(best_split_score)

        avg_val_score = list_mean(best_val_scores)
        final_scores_for_each_config.append((avg_val_score, model_params))

        print("Average score for current configuration: {}".format(
            avg_val_score))

    grid_search_details_file_path = os.path.join(definitions.DATA_DIR,
                                                 "grid_search_details.pkl")
    write_utils.data_structure_to_pickle(final_scores_for_each_config,
                                         grid_search_details_file_path)
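

# ---------------------------------------------------------------------------
# Usage sketch: a minimal, hypothetical driver for the search above. The exact
# shape of each configuration is dictated by helper.get_params_from_model; the
# dict keys below are illustrative assumptions, not the project's canonical
# config format.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    hypothetical_param_grid = [{
        'use_histogram': use_histogram,
        'autoencoder_bottle_neck_feature_size':
            autoencoder_bottle_neck_feature_size,
        'autoencoder_num_layers': autoencoder_num_layers,
        'alpha': alpha,
        'beta': beta,
        'decay': decay,
        'shared_hidden_layer_size': shared_hidden_layer_size,
        'user_dense_layer_hidden_size': user_dense_layer_hidden_size,
    }]
    search_multitask_auto_encoder(hypothetical_param_grid, data)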