import data_work


def main():

    #run each of the five data sets
    data_sets = ["basic_data",
                 "basic_data_only_finishers",
                 "basic_data_clean_lecture",
                 "basic_data_piazza",
                 "basic_data_piazza_only_finishers"]

    for data_set in data_sets:

        #load the converted data
        headings, data = data_work.load_data(data_set + ".csv",
                                             conversion_function=data_work.convert_survey_data,
                                             max_records=None)

        #draw the correlation matrix
        visualize_correlelations(data, headings, data_set)

        #draw scatterplots in matrix form
        scatterplot_matrix(data, headings, data_set)

        #generate the stats
        generate_stats(data, headings, data_set)

        #draw charts for the relationships between variables
        draw_charts(data, headings, data_set)

        #reload the unaltered data
        headings, data = data_work.load_data(data_set + ".csv",
                                             conversion_function=None,
                                             max_records=None)

        #draw some frequency histograms
        draw_histograms(data, headings, data_set)

    return
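#The plotting helpers called above (visualize_correlelations, scatterplot_matrix,
#generate_stats, draw_charts, draw_histograms) are defined elsewhere in this
#module. As an illustration only, a minimal sketch of the correlation plot might
#look like the following; the numpy/matplotlib calls, the function body, and the
#output file name are assumptions here, not the actual implementation.
import os

import numpy as np
import matplotlib.pyplot as plt


def visualize_correlelations_sketch(data, headings, data_set):
    #compute the pairwise correlation matrix (columns are the variables)
    correlations = np.corrcoef(data, rowvar=0)

    #draw the matrix as a heatmap with a colorbar
    figure, axes = plt.subplots()
    image = axes.matshow(correlations, vmin=-1, vmax=1)
    figure.colorbar(image)

    #label both axes with the variable names
    axes.set_xticks(range(len(headings)))
    axes.set_xticklabels(headings, rotation=90, fontsize=4)
    axes.set_yticks(range(len(headings)))
    axes.set_yticklabels(headings, fontsize=4)

    #save one image per data set (output path is an assumption)
    figure.savefig(data_set + "_correlations.png")
    plt.close(figure)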
import csv
import os

from sklearn.preprocessing import PolynomialFeatures

import data_work


def main():

    #create a dictionary of feature setups
    #select the appropriate columns for each feature set
    feature_dict = {
        #"Full" : header.tolist(),
        "Lecture Views": ['overal_lecture_views', 'total_lecture_time'],
        "Piazza Use": ['mid_on_piazza', 'final_on_piazza', 'piazza_posts',
                       'piazza_days', 'piazza_views'],
        "Lecture Pace": [
            'lecture_1_pace_Late', 'lecture_1_pace_On-time', 'lecture_1_pace_Unknown',
            'lecture_2_pace_Late', 'lecture_2_pace_On-time', 'lecture_2_pace_Unknown',
            'lecture_3_pace_Late', 'lecture_3_pace_On-time', 'lecture_3_pace_Unknown',
            'lecture_4_pace_Early', 'lecture_4_pace_Late', 'lecture_4_pace_On-time', 'lecture_4_pace_Unknown',
            'lecture_5_pace_Early', 'lecture_5_pace_Late', 'lecture_5_pace_On-time', 'lecture_5_pace_Unknown',
            'lecture_6_pace_Early', 'lecture_6_pace_Late', 'lecture_6_pace_On-time', 'lecture_6_pace_Unknown',
            'lecture_7_pace_Early', 'lecture_7_pace_Late', 'lecture_7_pace_On-time', 'lecture_7_pace_Unknown',
            'lecture_8_pace_Early', 'lecture_8_pace_Late', 'lecture_8_pace_On-time', 'lecture_8_pace_Unknown',
            'lecture_9_pace_Early', 'lecture_9_pace_Late', 'lecture_9_pace_On-time', 'lecture_9_pace_Unknown',
            'lecture_10_pace_Early', 'lecture_10_pace_Late', 'lecture_10_pace_On-time', 'lecture_10_pace_Unknown',
            'lecture_11_pace_Early', 'lecture_11_pace_Late', 'lecture_11_pace_On-time', 'lecture_11_pace_Unknown',
            'lecture_12_pace_Early', 'lecture_12_pace_Late', 'lecture_12_pace_On-time', 'lecture_12_pace_Unknown',
            'lecture_13_pace_Early', 'lecture_13_pace_Late', 'lecture_13_pace_On-time', 'lecture_13_pace_Unknown',
            'lecture_14_pace_Early', 'lecture_14_pace_Late', 'lecture_14_pace_On-time', 'lecture_14_pace_Unknown',
            'lecture_15_pace_Early', 'lecture_15_pace_Late', 'lecture_15_pace_On-time', 'lecture_15_pace_Unknown',
            'lecture_16_pace_Early', 'lecture_16_pace_Late', 'lecture_16_pace_On-time', 'lecture_16_pace_Unknown',
            'lecture_17_pace_Early', 'lecture_17_pace_Late', 'lecture_17_pace_On-time', 'lecture_17_pace_Unknown',
            'lecture_18_pace_Early', 'lecture_18_pace_Late', 'lecture_18_pace_On-time', 'lecture_18_pace_Unknown',
            'lecture_19_pace_Early', 'lecture_19_pace_Late', 'lecture_19_pace_On-time', 'lecture_19_pace_Unknown',
            'lecture_20_pace_Early', 'lecture_20_pace_Late', 'lecture_20_pace_On-time', 'lecture_20_pace_Unknown',
            'lecture_21_pace_Early', 'lecture_21_pace_Late', 'lecture_21_pace_On-time', 'lecture_21_pace_Unknown',
            'lecture_22_pace_Early', 'lecture_22_pace_Late', 'lecture_22_pace_On-time', 'lecture_22_pace_Unknown',
            'lecture_23_pace_Early', 'lecture_23_pace_Late', 'lecture_23_pace_On-time', 'lecture_23_pace_Unknown',
            'lecture_24_pace_Early', 'lecture_24_pace_Late', 'lecture_24_pace_On-time', 'lecture_24_pace_Unknown',
            'lecture_25_pace_Early', 'lecture_25_pace_Late', 'lecture_25_pace_On-time', 'lecture_25_pace_Unknown',
            'lecture_26_pace_Early', 'lecture_26_pace_Late', 'lecture_26_pace_On-time', 'lecture_26_pace_Unknown',
            'overall_pace_Early', 'overall_pace_Late', 'overall_pace_On-time', 'overall_pace_Unknown'],
        "Classmate Contact": [
            'qtr_on_piazza', 'qtr_email', 'qtr_hipchat', 'qrt_gplus',
            'qtr_other_chat', 'qtr_phone', 'qtr_facebook', 'qtr_in_person',
            'mid_on_piazza', 'mid_email', 'mid_hipchat', 'qrt_gplus',
            'mid_other_chat', 'mid_phone', 'mid_facebook', 'mid_in_person',
            'final_on_piazza', 'final_email', 'final_hipchat', 'qrt_gplus',
            'final_other_chat', 'final_phone', 'final_facebook', 'final_in_person'],
        "Lecture Amount": [
            'total_lecture_time', 'overal_lecture_views',
            'lecture_1_views', 'lecture_2_views', 'lecture_3_views', 'lecture_4_views',
            'lecture_5_views', 'lecture_6_views', 'lecture_7_views', 'lecture_8_views',
            'lecture_9_views', 'lecture_10_views', 'lecture_11_views', 'lecture_12_views',
            'lecture_13_views', 'lecture_14_views', 'lecture_15_views', 'lecture_16_views',
            'lecture_17_views', 'lecture_18_views', 'lecture_19_views', 'lecture_20_views',
            'lecture_21_views', 'lecture_22_views', 'lecture_23_views', 'lecture_24_views',
            'lecture_25_views', 'lecture_26_views'],
        "Prior Experience": [
            'formal_class_prog_taken', 'C', 'C#', 'C++', 'Java', 'JavaScript',
            'Lisp', 'Objective C', 'Perl', 'PHP', 'Python', 'Ruby', 'Shell',
            'Swift', 'Visual Basic', 'Other (specify below)', 'years_programming',
            'prior_omscs_classes_completed', 'occupation', 'highest_education',
            'besides_KBAI_how_many_classes', 'moocs_completed_outside_OMSCS'],
        "Self Assessment": [
            'qtr_proj1_confidence_neither confident nor unconfident', 'qtr_proj1_confidence_no answer',
            'qtr_proj1_confidence_somewhat confident', 'qtr_proj1_confidence_somewhat unconfident',
            'qtr_proj1_confidence_very confident', 'qtr_proj1_confidence_very unconfident',
            'qtr_proj2_confidence_neither confident nor unconfident', 'qtr_proj2_confidence_no answer',
            'qtr_proj2_confidence_somewhat confident', 'qtr_proj2_confidence_somewhat unconfident',
            'qtr_proj2_confidence_very confident', 'qtr_proj2_confidence_very unconfident',
            'mid_proj2_confidence_neither confident nor unconfident', 'mid_proj2_confidence_no answer',
            'mid_proj2_confidence_somewhat confident', 'mid_proj2_confidence_somewhat unconfident',
            'mid_proj2_confidence_very confident', 'mid_proj2_confidence_very unconfident',
            'mid_proj3_confidence_neither confident nor unconfident', 'mid_proj3_confidence_no answer',
            'mid_proj3_confidence_somewhat confident', 'mid_proj3_confidence_somewhat unconfident',
            'mid_proj3_confidence_very confident', 'mid_proj3_confidence_very unconfident',
            'final_proj3_confidence_neither confident nor unconfident', 'final_proj3_confidence_no answer',
            'final_proj3_confidence_somewhat confident', 'final_proj3_confidence_somewhat unconfident',
            'final_proj3_confidence_very confident', 'final_proj3_confidence_very unconfident']}

    #add the assignment and project scores in as variables to test too
    expanded_feature_dict = feature_dict.copy()

    for feature_set_name, select_columns in feature_dict.iteritems():

        #create the new feature set names
        feature_plus_a1 = feature_set_name + " After A1"
        feature_plus_a2 = feature_set_name + " After A2"
        feature_plus_a3 = feature_set_name + " After A3"

        #add the new columns
        select_columns_plus_a1 = select_columns + ["Assig_1_full_40", "proj_1_100"]
        select_columns_plus_a2 = select_columns_plus_a1 + ["Assig_2_full_40", "proj_2_100"]
        select_columns_plus_a3 = select_columns_plus_a2 + ["Assig_3_full_40", "proj_3_100"]

        #add to the dictionary
        expanded_feature_dict[feature_plus_a1] = select_columns_plus_a1
        expanded_feature_dict[feature_plus_a2] = select_columns_plus_a2
        expanded_feature_dict[feature_plus_a3] = select_columns_plus_a3

    #replace the dictionary
    feature_dict = expanded_feature_dict.copy()

    #list the data sources
    data_sources = ["basic_data",
                    "basic_data_only_finishers",
                    "basic_data_clean_lecture",
                    "basic_data_piazza",
                    "basic_data_piazza_only_finishers"]

    #create a csv for the results (results_location is defined at module level)
    with open(os.path.join(results_location, 'regression_results.csv'), 'wb') as output_file:

        #establish the csv writer
        writer = csv.writer(output_file, delimiter=',')

        for data_source in data_sources:

            print("\n\n------------------")
            print("Data Set - %s" % data_source)
            print("------------------")

            #this section determines the R^2 scores of the regressors
            writer.writerow(["R^2 Scores"])
            writer.writerow(["Dataset - %s" % data_source])

            #load the data from the csv
            header, data = data_work.load_data(data_source + ".csv",
                                               conversion_function=data_work.convert_survey_data,
                                               max_records=None)

            #create the column headings
            writer.writerow(["Feature", "Decision Tree", "Boosting", "Random Forest",
                             "Linear Regression", "Support Vector Machine", "", "Feature Details"])

            #loop through all the feature set combos
            for feature_set_name, select_columns in feature_dict.iteritems():

                print("\n\n------------------")
                print("Feature Set - %s" % feature_set_name)
                print("------------------")

                #get the data subset
                header_subset, data_subset = data_work.select_data_columns(
                    header, data, column_names=['course_grade'] + select_columns)

                #first run on the full set
                #assumes the first column is y
                X_train, X_test, y_train, y_test = data_work.divide_for_training(data_subset)

                #remove the label header
                header_subset = header_subset[1:]

                #scale the data
                X_train, X_test = data_work.scale_features(X_train, X_test)

                #test all the regressors on the linear features to start
                run_regressors(X_train, y_train, X_test, y_test, header_subset, writer,
                               data_source + "-" + feature_set_name + " Linear")

                #for degree in [2, 3, 4]:
                for degree in [2]:

                    #convert the features to polynomials
                    poly = PolynomialFeatures(degree=degree)
                    X_train_poly = poly.fit_transform(X_train)
                    #reuse the transform fitted on the training data
                    X_test_poly = poly.transform(X_test)

                    #test all the regressors on the polynomial features
                    run_regressors(X_train_poly, y_train, X_test_poly, y_test, header_subset, writer,
                                   data_source + "-" + feature_set_name + " Poly %i" % degree)

                ##test individually
                #for i in range(0, X_train.shape[1]):
                #    run_regressors(X_train[:, i, np.newaxis], y_train, X_test[:, i, np.newaxis], y_test, header_subset[i + 1, np.newaxis], writer)

    return