def runBoostingRegressorWithSubstrings_and_Times(amount_of_runs, host_name, root_name, passw_root, database_name, query): total_true = 0 # the amount of correctly predicted pass/fail of the sum of both languages. total_prolog = 0 # the amount of correctly predicted pass/fail of prolog. total_haskell = 0 # the amount of correctly predicted pass/fail of haskell. total_avg_deviation = 0 # the sum of the average deviation of each run. total_avg_deviation_both = 0 length_prediction_list = 1 # the amount of predictions made each run. query_result = Database_Functions.query_database_dataframe( host_name, root_name, passw_root, database_name, query) # this is a dataframe with the needed data query_result, big_dict, time_dict = preprocessing_2(query_result) query_result = pandasql.sqldf(Queries.get_query_09_1819_df("query_result"), locals()) grades = query_result[['user_id', 'score_prolog', 'score_haskell']].drop_duplicates(subset='user_id') # this is a dataframe with all user_id's and all scores grades.reset_index( drop=True, inplace=True ) # we reset the number index of the dataframe (purely cosmetics) possible_categories = query_result.query( 'language==1')['category'].unique() # gras = query result + Time Dict. 
query_result = integrate_times_into_df(time_dict, query_result) # selecting only prolog as cat # possible_categories = query_result['category'].unique() # preprocessing(host_name, root_name, passw_root, database_name, Queries.get_query_06_) big_result_list = [] for x in range( amount_of_runs): # in this loop the experiment gets repeated print("run number " + str(x)) verification_df = grades.sample( frac=0.1) # this is a random selection of 10% of the dataframe train_df = grades.drop( verification_df.index ) # we drop the sample that we have selected to retain 90% to train training_users = set(train_df['user_id'].tolist() ) # a set of all selected training-users verification_users = set(verification_df['user_id'].tolist()) relevant_subset, total_freq_subset = get_relevant_subset( training_users, big_dict) trees, frequency_list_df_training = TreeConstructor.create_trees_with_subsets( train_df, relevant_subset, total_freq_subset) data_points_training_df = query_result.iloc[np.where( query_result.user_id.isin(training_users))] # we have one boosting trees per category from create_trees_with_subsets, we now predict one score per # user and append this to the dataframe. data_points_training_df = add_freq_predictions_to_df( trees, data_points_training_df, frequency_list_df_training) frequency_list_df_ver = make_frequency_list_df(big_dict, verification_users, total_freq_subset) # A dataframe of all submissions of the selected users. data_points_verification_df = query_result.drop( data_points_training_df.index) # we drop the selected training data to form the verification data data_points_verification_df = add_freq_predictions_to_df( trees, data_points_verification_df, frequency_list_df_ver) my_boosting_trees = TreeConstructor.build_big_boostingtree_with_dataframe( data_points_training_df, possible_categories) # this function returns a dictionary containing the trained decision-trees having the categories as key. 
predicted_list, actual_verification = TreeConstructor.make_boosting_predictions_with_grades_in_df( my_boosting_trees, data_points_verification_df, possible_categories) # this function returns two lists containing lists of grades in float. Predictions and Actual grades to compare # for x in range(len(predicted_list)): # print(predicted_list[x][0]) # print(actual_verification[x]) pass_fail_result = pass_fail_boosting2(predicted_list, actual_verification) # here we calculate all data we need deviation = average_deviation_boosting2(predicted_list, actual_verification) total_avg_deviation += deviation[0] total_avg_deviation_both += deviation[1] total_true += sum([x[1] for x in pass_fail_result]) total_prolog += sum([x[0][0] for x in pass_fail_result]) total_haskell += sum([x[0][1] for x in pass_fail_result]) # # we add all the parameters because at the end we will divide it by the total amount of runs if length_prediction_list != len(pass_fail_result): length_prediction_list = len(pass_fail_result) big_result_list += [ predicted_list[x][0].tolist() + actual_verification[x] for x in range(len(predicted_list)) ] df = DataFrame(big_result_list, columns=[ "Predicted Prolog", "Predicted Haskell", "Actual Prolog", "Actual Haskell" ]) return [ total_true / amount_of_runs, total_prolog / amount_of_runs, total_haskell / amount_of_runs, total_avg_deviation / amount_of_runs, length_prediction_list, total_avg_deviation_both / amount_of_runs, df ]
def runBoostingRegressorWithSubstrings_and_Times_k_cross_validation(
        amount_of_runs, k, grades, query_result, big_dict):
    """K-fold cross-validated variant of the boosting-regressor experiment.

    Repeats a full k-fold cross validation `amount_of_runs` times.  In each
    fold the remaining k-1 folds train the boosting trees; predictions are
    made per language and the predicted/actual Prolog and Haskell grades of
    the fold's verification users are appended to one result dataframe.

    Parameters:
        amount_of_runs: number of complete k-fold cross validations to run.
        k: number of folds.
        grades: dataframe with one row per user (user_id plus final scores).
        query_result: preprocessed submission dataframe (incl. timing data).
        big_dict: substring-frequency dictionary from the preprocessing step.

    Returns:
        DataFrame with columns "Predicted Prolog", "Predicted Haskell",
        "Actual Prolog", "Actual Haskell" — one row per verified user,
        accumulated over all folds and runs.
    """
    df = DataFrame(columns=[
        "Predicted Prolog", "Predicted Haskell", "Actual Prolog",
        "Actual Haskell"
    ])
    ################################################################## CROSS VALIDATION
    for i in range(
            amount_of_runs):  # in this loop the experiment gets repeated
        print("ST Run number " + str(i + 1))
        alldata: list = split_dataset(grades, k)  # the k folds of the user dataframe
        for x in range(k):
            print("K Run number" + str(x + 1))
            # Fold x is the verification set; the other folds form the training set.
            (train_df, verification_df) = get_remaining_dataset(alldata, x)
            #################################################################
            training_users = set(train_df['user_id'].tolist()
                                 )  # a set of all selected training users
            verification_users = set(verification_df['user_id'].tolist())
            relevant_subset, total_freq_subset = get_relevant_subset(
                training_users, big_dict)
            trees, frequency_list_df_training = TreeConstructor.create_trees_with_subsets(
                train_df, relevant_subset, total_freq_subset)
            # All submissions of the training users.
            data_points_training_df = query_result.iloc[np.where(
                query_result.user_id.isin(training_users))]
            # create_trees_with_subsets gives one boosting tree per category; here
            # we predict one score per user and append it to the dataframe.
            data_points_training_df = add_freq_predictions_to_df(
                trees, data_points_training_df, frequency_list_df_training)
            frequency_list_df_ver = make_frequency_list_df(
                big_dict, verification_users, total_freq_subset)
            # Everything that is not training data becomes verification data.
            data_points_verification_df = query_result.drop(
                data_points_training_df.index)
            data_points_verification_df = add_freq_predictions_to_df(
                trees, data_points_verification_df, frequency_list_df_ver)
            language_lists_prediction = []  # per language: list of predicted grades
            language_lists_actual = []  # per language: list of actual grades
            # Language 2 first, then 1 — presumably Prolog then Haskell (TODO
            # confirm against the data encoding).  The pairing loop below
            # relies on exactly this append order.
            for language in range(2, 0, -1):
                possible_categories = query_result.query(
                    'language==' + str(language))['category'].unique()
                # Dictionary of trained boosting trees, keyed by category.
                my_boosting_trees = TreeConstructor.build_big_boostingtree_with_dataframe(
                    data_points_training_df, possible_categories)
                predicted_list, actual_verification = TreeConstructor.make_boosting_predictions_with_grades_in_df(
                    my_boosting_trees, data_points_verification_df,
                    possible_categories, language)
                # Keep only the score of the current language (index 0 for
                # language 2, index 1 for language 1).
                predicted_list = [x[0][language % 2] for x in predicted_list]
                language_lists_prediction.append(predicted_list)
                language_lists_actual.append(actual_verification)
                """
                pass_fail_result = [(predicted_list[x] >= 5 and actual_verification[x] >= 5) or
                                    (predicted_list[x] < 5 and actual_verification[x] < 5)
                                    for x in range(len(predicted_list))]
                # here we calculate all data we need
                total_avg_deviation += sum([abs(predicted_list[x] - actual_verification[x])
                                            for x in range(len(predicted_list))]) / len(predicted_list)
                if (language == 1):
                    total_haskell += sum(pass_fail_result)
                else:
                    total_prolog += sum(pass_fail_result)
                """
            # Pair the per-language lists (language 2 at xx, language 1 at
            # xx + 1) and append them to the overall result dataframe.
            for xx in range(0, len(language_lists_prediction), 2):
                dfx = DataFrame(
                    {'Predicted Prolog': language_lists_prediction[xx]})
                dfx['Predicted Haskell'] = language_lists_prediction[xx + 1]
                dfx['Actual Prolog'] = language_lists_actual[xx]
                dfx['Actual Haskell'] = language_lists_actual[xx + 1]
                df = concat([df, dfx])
            ## END INNER K-CROSS VALIDATION LOOP
        # END AMOUNT_OF_RUNS LOOP
    return df