# Example #1
# 0
def runBoostingRegressorWithSubstrings_and_Times(amount_of_runs, host_name,
                                                 root_name, passw_root,
                                                 database_name, query):
    """Run the boosting-regressor experiment using substring and time features.

    Fetches submission data from the database, then repeats the experiment
    ``amount_of_runs`` times: sample 10% of the users for verification, train
    per-category boosting trees on the remaining 90%, predict Prolog and
    Haskell scores for the held-out users, and accumulate pass/fail accuracy
    and deviation statistics.

    Parameters:
        amount_of_runs: number of experiment repetitions.
        host_name, root_name, passw_root, database_name: database connection
            parameters forwarded to Database_Functions.
        query: SQL query string that yields the raw submission dataframe.

    Returns a list:
        [mean count of users with both languages' pass/fail correct,
         mean count of correct Prolog pass/fail,
         mean count of correct Haskell pass/fail,
         mean per-language average deviation,
         number of predictions per run,
         mean combined average deviation,
         DataFrame of predicted/actual grades accumulated over all runs].
    """
    total_true = 0  # the amount of correctly predicted pass/fail of the sum of both languages.
    total_prolog = 0  # the amount of correctly predicted pass/fail of prolog.
    total_haskell = 0  # the amount of correctly predicted pass/fail of haskell.
    total_avg_deviation = 0  # the sum of the average deviation of each run.
    total_avg_deviation_both = 0  # same, but for both languages combined.
    length_prediction_list = 1  # the amount of predictions made each run.

    query_result = Database_Functions.query_database_dataframe(
        host_name, root_name, passw_root, database_name,
        query)  # this is a dataframe with the needed data
    # preprocessing_2 presumably splits off substring-frequency and timing
    # data alongside the cleaned dataframe — TODO confirm against its def.
    query_result, big_dict, time_dict = preprocessing_2(query_result)

    # NOTE: pandasql resolves the table name inside the query string from
    # locals(), so the local variable MUST be named `query_result` here.
    query_result = pandasql.sqldf(Queries.get_query_09_1819_df("query_result"),
                                  locals())

    grades = query_result[['user_id', 'score_prolog',
                           'score_haskell']].drop_duplicates(subset='user_id')
    # this is a dataframe with all user_id's and all scores
    grades.reset_index(
        drop=True, inplace=True
    )  # we reset the number index of the dataframe (purely cosmetics)
    # Only categories belonging to language 1 are used for the big boosting
    # tree below — NOTE(review): the k-cross variant handles both languages;
    # confirm this asymmetry is intentional.
    possible_categories = query_result.query(
        'language==1')['category'].unique()
    # gras = query result + Time Dict.
    query_result = integrate_times_into_df(time_dict, query_result)
    # selecting only prolog as cat
    # possible_categories = query_result['category'].unique()

    # preprocessing(host_name, root_name, passw_root, database_name, Queries.get_query_06_)
    big_result_list = []
    for x in range(
            amount_of_runs):  # in this loop the experiment gets repeated
        print("run number " + str(x))
        verification_df = grades.sample(
            frac=0.1)  # this is a random selection of 10% of the dataframe
        train_df = grades.drop(
            verification_df.index
        )  # we drop the sample that we have selected to retain 90% to train

        training_users = set(train_df['user_id'].tolist()
                             )  # a set of all selected training-users
        verification_users = set(verification_df['user_id'].tolist())
        relevant_subset, total_freq_subset = get_relevant_subset(
            training_users, big_dict)
        trees, frequency_list_df_training = TreeConstructor.create_trees_with_subsets(
            train_df, relevant_subset, total_freq_subset)
        # Rows of query_result belonging to the training users; keeps the
        # original index so the complement can be taken with drop() below.
        data_points_training_df = query_result.iloc[np.where(
            query_result.user_id.isin(training_users))]
        # we have one boosting trees per category from create_trees_with_subsets, we now predict one score per
        # user and append this to the dataframe.
        data_points_training_df = add_freq_predictions_to_df(
            trees, data_points_training_df, frequency_list_df_training)
        frequency_list_df_ver = make_frequency_list_df(big_dict,
                                                       verification_users,
                                                       total_freq_subset)

        # A dataframe of all submissions of the selected users.
        data_points_verification_df = query_result.drop(
            data_points_training_df.index)
        # we drop the selected training data to form the verification data
        data_points_verification_df = add_freq_predictions_to_df(
            trees, data_points_verification_df, frequency_list_df_ver)
        my_boosting_trees = TreeConstructor.build_big_boostingtree_with_dataframe(
            data_points_training_df, possible_categories)
        # this function returns a dictionary containing the trained decision-trees having the categories as key.

        predicted_list, actual_verification = TreeConstructor.make_boosting_predictions_with_grades_in_df(
            my_boosting_trees, data_points_verification_df,
            possible_categories)
        #  this function returns two lists containing lists of grades in float. Predictions and Actual grades to compare
        #        for x in range(len(predicted_list)):
        #            print(predicted_list[x][0])
        #            print(actual_verification[x])
        pass_fail_result = pass_fail_boosting2(predicted_list,
                                               actual_verification)
        # here we calculate all data we need
        deviation = average_deviation_boosting2(predicted_list,
                                                actual_verification)
        total_avg_deviation += deviation[0]
        total_avg_deviation_both += deviation[1]
        # pass_fail_result entries look like ((prolog_ok, haskell_ok), both_ok)
        # given the indexing below — TODO confirm against pass_fail_boosting2.
        total_true += sum([x[1] for x in pass_fail_result])
        total_prolog += sum([x[0][0] for x in pass_fail_result])
        total_haskell += sum([x[0][1] for x in pass_fail_result])
        #

        # we add all the parameters because at the end we will divide it by the total amount of runs
        if length_prediction_list != len(pass_fail_result):
            length_prediction_list = len(pass_fail_result)
        # predicted_list[x][0] has a .tolist() — presumably a numpy array with
        # one predicted grade per language; verify against TreeConstructor.
        big_result_list += [
            predicted_list[x][0].tolist() + actual_verification[x]
            for x in range(len(predicted_list))
        ]
    df = DataFrame(big_result_list,
                   columns=[
                       "Predicted Prolog", "Predicted Haskell",
                       "Actual Prolog", "Actual Haskell"
                   ])
    return [
        total_true / amount_of_runs, total_prolog / amount_of_runs,
        total_haskell / amount_of_runs, total_avg_deviation / amount_of_runs,
        length_prediction_list, total_avg_deviation_both / amount_of_runs, df
    ]
def runBoostingRegressorWithSubstrings_and_Times_k_cross_validation(
        amount_of_runs, k, grades, query_result, big_dict):
    """Run the substrings+times boosting experiment with k-fold cross-validation.

    Repeats ``amount_of_runs`` times: the user/grade table is split into k
    folds; for each fold, per-category boosting trees are trained on the
    other k-1 folds and Prolog (language==2) and Haskell (language==1)
    scores are predicted for the held-out fold.

    Parameters:
        amount_of_runs: number of full k-fold repetitions.
        k: number of cross-validation folds.
        grades: DataFrame with one row per user (user_id plus scores).
        query_result: DataFrame with all submissions (incl. time features).
        big_dict: substring-frequency data keyed per user — presumably the
            structure produced by preprocessing_2; confirm against caller.

    Returns:
        DataFrame with columns ["Predicted Prolog", "Predicted Haskell",
        "Actual Prolog", "Actual Haskell"], one row per verified user.
    """
    columns = [
        "Predicted Prolog", "Predicted Haskell", "Actual Prolog",
        "Actual Haskell"
    ]
    # Collect per-fold frames and concatenate once at the end: repeatedly
    # calling concat inside the loop is quadratic in the number of rows.
    # The leading empty frame preserves the column layout when no fold
    # produces data.
    result_frames = [DataFrame(columns=columns)]

    ################################################################## CROSS VALIDATION
    for i in range(
            amount_of_runs):  # in this loop the experiment gets repeated
        print("ST Run number " + str(i + 1))

        # split_dataset presumably returns a list of k user-fold dataframes.
        # (The original annotated this as `alldata: []`, a list literal that
        # is not a valid type annotation; dropped.)
        alldata = split_dataset(grades, k)

        for x in range(k):
            print("K Run number" + str(x + 1))
            # Fold x becomes the verification set; the rest is training data.
            (train_df, verification_df) = get_remaining_dataset(alldata, x)
            #################################################################
            training_users = set(train_df['user_id'].tolist()
                                 )  # a set of all selected training-users
            verification_users = set(verification_df['user_id'].tolist())
            relevant_subset, total_freq_subset = get_relevant_subset(
                training_users, big_dict)
            trees, frequency_list_df_training = TreeConstructor.create_trees_with_subsets(
                train_df, relevant_subset, total_freq_subset)
            # Submissions of the training users; keeps the original index so
            # the verification complement can be taken with drop() below.
            data_points_training_df = query_result.iloc[np.where(
                query_result.user_id.isin(training_users))]
            # One boosting tree per category from create_trees_with_subsets;
            # append one predicted score per user to the dataframe.
            data_points_training_df = add_freq_predictions_to_df(
                trees, data_points_training_df, frequency_list_df_training)
            frequency_list_df_ver = make_frequency_list_df(
                big_dict, verification_users, total_freq_subset)

            # A dataframe of all submissions of the selected users: dropping
            # the training rows leaves the verification data.
            data_points_verification_df = query_result.drop(
                data_points_training_df.index)
            data_points_verification_df = add_freq_predictions_to_df(
                trees, data_points_verification_df, frequency_list_df_ver)

            language_lists_prediction = []
            language_lists_actual = []
            # Process language 2 (Prolog) first, then 1 (Haskell), so the
            # appended lists line up with the output columns below.
            for language in range(2, 0, -1):
                possible_categories = query_result.query(
                    'language==' + str(language))['category'].unique()

                # Dictionary of trained decision trees keyed by category.
                my_boosting_trees = TreeConstructor.build_big_boostingtree_with_dataframe(
                    data_points_training_df, possible_categories)

                predicted_list, actual_verification = TreeConstructor.make_boosting_predictions_with_grades_in_df(
                    my_boosting_trees, data_points_verification_df,
                    possible_categories, language)

                # language % 2 selects element 0 for Prolog (language 2) and
                # element 1 for Haskell (language 1) of each prediction pair.
                predicted_list = [p[0][language % 2] for p in predicted_list]
                language_lists_prediction.append(predicted_list)
                language_lists_actual.append(actual_verification)

            # Pair consecutive (Prolog, Haskell) result lists into one frame.
            for xx in range(0, len(language_lists_prediction), 2):
                dfx = DataFrame(
                    {'Predicted Prolog': language_lists_prediction[xx]})
                dfx['Predicted Haskell'] = language_lists_prediction[xx + 1]
                dfx['Actual Prolog'] = language_lists_actual[xx]
                dfx['Actual Haskell'] = language_lists_actual[xx + 1]
                result_frames.append(dfx)

            ## END INNER K-CROSS VALIDATION LOOP

        # END AMOUNt_OF_RUNS LOOP
    return concat(result_frames)