Code example #1
0
File: descriptive_stats.py  Project: dhegberg/euclid
def main():
    """Produce descriptive statistics for each survey data set.

    For every data set: load the converted data and generate correlation
    visualisations, a scatterplot matrix, summary statistics and
    relationship charts; then reload the raw (unconverted) data and draw
    frequency histograms.
    """

    # the csv data sets to analyse
    dataset_names = [
        "basic_data",
        "basic_data_only_finishers",
        "basic_data_clean_lecture",
        "basic_data_piazza",
        "basic_data_piazza_only_finishers",
    ]

    for name in dataset_names:

        # load the survey-converted data
        headings, data = data_work.load_data(
            name + ".csv",
            conversion_function=data_work.convert_survey_data,
            max_records=None)

        # correlation visualisations
        visualize_correlelations(data, headings, name)

        # scatterplots in matrix form
        scatterplot_matrix(data, headings, name)

        # summary statistics
        generate_stats(data, headings, name)

        # charts for the relationships between variables
        draw_charts(data, headings, name)

        # reload the same file without any conversion
        headings, data = data_work.load_data(
            name + ".csv", conversion_function=None, max_records=None)

        # frequency histograms on the raw values
        draw_histograms(data, headings, name)

    return
Code example #2
0
File: descriptive_stats.py  Project: njpayne/euclid
def main():
    """Run descriptive statistics over every survey data set.

    Each data set is processed twice: once converted (for stats and
    charts) and once raw (for frequency histograms).
    """

    # names of the csv files to process
    sources = ("basic_data", "basic_data_only_finishers",
               "basic_data_clean_lecture", "basic_data_piazza",
               "basic_data_piazza_only_finishers")

    for source in sources:

        # converted data for the statistics and charts
        headings, data = data_work.load_data(
            source + ".csv",
            conversion_function=data_work.convert_survey_data,
            max_records=None)

        # summary statistics
        generate_stats(data, headings, source)

        # charts for the relationships between variables
        draw_charts(data, headings, source)

        # unaltered data for the histograms
        headings, data = data_work.load_data(
            source + ".csv", conversion_function=None, max_records=None)

        # frequency histograms
        draw_histograms(data, headings, source)

    return
Code example #3
0
File: main.py  Project: dhegberg/euclid
def main():
    """Run the regression experiments for every data source.

    Builds a dictionary of feature-column sets (plus expanded variants
    that cumulatively append the assignment/project scores), then for
    each data source trains and scores the regressors on every feature
    set, writing the R^2 results to regression_results.csv.
    """

    #create a dictionary of feature setups
    #select the appropriate columns
    feature_dict = {
        #"Full" : header.tolist(),
        "Lecture Views" : ['overal_lecture_views', 'total_lecture_time'],
        "Piazza Use" : ['mid_on_piazza', 'final_on_piazza', 'piazza_posts', 'piazza_days', 'piazza_views'],
        "Lecture Pace" : ['lecture_1_pace_Late', 'lecture_1_pace_On-time', 'lecture_1_pace_Unknown', 'lecture_2_pace_Late', 'lecture_2_pace_On-time', 'lecture_2_pace_Unknown', 'lecture_3_pace_Late', 'lecture_3_pace_On-time', 'lecture_3_pace_Unknown', 'lecture_4_pace_Early', 'lecture_4_pace_Late', 'lecture_4_pace_On-time', 'lecture_4_pace_Unknown', 'lecture_5_pace_Early', 'lecture_5_pace_Late', 'lecture_5_pace_On-time', 'lecture_5_pace_Unknown', 'lecture_6_pace_Early', 'lecture_6_pace_Late', 'lecture_6_pace_On-time', 'lecture_6_pace_Unknown', 'lecture_7_pace_Early', 'lecture_7_pace_Late', 'lecture_7_pace_On-time', 'lecture_7_pace_Unknown', 'lecture_8_pace_Early', 'lecture_8_pace_Late', 'lecture_8_pace_On-time', 'lecture_8_pace_Unknown', 'lecture_9_pace_Early', 'lecture_9_pace_Late', 'lecture_9_pace_On-time', 'lecture_9_pace_Unknown', 'lecture_10_pace_Early', 'lecture_10_pace_Late', 'lecture_10_pace_On-time', 'lecture_10_pace_Unknown', 'lecture_11_pace_Early', 'lecture_11_pace_Late', 'lecture_11_pace_On-time', 'lecture_11_pace_Unknown', 'lecture_12_pace_Early', 'lecture_12_pace_Late', 'lecture_12_pace_On-time', 'lecture_12_pace_Unknown', 'lecture_13_pace_Early', 'lecture_13_pace_Late', 'lecture_13_pace_On-time', 'lecture_13_pace_Unknown', 'lecture_14_pace_Early', 'lecture_14_pace_Late', 'lecture_14_pace_On-time', 'lecture_14_pace_Unknown', 'lecture_15_pace_Early', 'lecture_15_pace_Late', 'lecture_15_pace_On-time', 'lecture_15_pace_Unknown', 'lecture_16_pace_Early', 'lecture_16_pace_Late', 'lecture_16_pace_On-time', 'lecture_16_pace_Unknown', 'lecture_17_pace_Early', 'lecture_17_pace_Late', 'lecture_17_pace_On-time', 'lecture_17_pace_Unknown', 'lecture_18_pace_Early', 'lecture_18_pace_Late', 'lecture_18_pace_On-time', 'lecture_18_pace_Unknown', 'lecture_19_pace_Early', 'lecture_19_pace_Late', 'lecture_19_pace_On-time', 'lecture_19_pace_Unknown', 'lecture_20_pace_Early', 'lecture_20_pace_Late', 'lecture_20_pace_On-time', 'lecture_20_pace_Unknown', 
'lecture_21_pace_Early', 'lecture_21_pace_Late', 'lecture_21_pace_On-time', 'lecture_21_pace_Unknown', 'lecture_22_pace_Early', 'lecture_22_pace_Late', 'lecture_22_pace_On-time', 'lecture_22_pace_Unknown', 'lecture_23_pace_Early', 'lecture_23_pace_Late', 'lecture_23_pace_On-time', 'lecture_23_pace_Unknown', 'lecture_24_pace_Early', 'lecture_24_pace_Late', 'lecture_24_pace_On-time', 'lecture_24_pace_Unknown', 'lecture_25_pace_Early', 'lecture_25_pace_Late', 'lecture_25_pace_On-time', 'lecture_25_pace_Unknown', 'lecture_26_pace_Early', 'lecture_26_pace_Late', 'lecture_26_pace_On-time', 'lecture_26_pace_Unknown', 'overall_pace_Early', 'overall_pace_Late', 'overall_pace_On-time', 'overall_pace_Unknown'],
        "Classmate Contact" : ['qtr_on_piazza', 'qtr_email', 'qtr_hipchat', 'qrt_gplus', 'qtr_other_chat', 'qtr_phone', 'qtr_facebook', 'qtr_in_person', 'mid_on_piazza', 'mid_email', 'mid_hipchat', 'qrt_gplus', 'mid_other_chat', 'mid_phone', 'mid_facebook', 'mid_in_person', 'final_on_piazza', 'final_email', 'final_hipchat', 'qrt_gplus', 'final_other_chat', 'final_phone', 'final_facebook', 'final_in_person'],
        "Lecture Amount" : ['total_lecture_time', 'overal_lecture_views', 'lecture_1_views', 'lecture_2_views', 'lecture_3_views', 'lecture_4_views', 'lecture_5_views', 'lecture_6_views', 'lecture_7_views', 'lecture_8_views', 'lecture_9_views', 'lecture_10_views', 'lecture_11_views', 'lecture_12_views', 'lecture_13_views', 'lecture_14_views', 'lecture_15_views', 'lecture_16_views', 'lecture_17_views', 'lecture_18_views', 'lecture_19_views', 'lecture_20_views', 'lecture_21_views', 'lecture_22_views', 'lecture_23_views', 'lecture_24_views', 'lecture_25_views', 'lecture_26_views'],
        "Prior Experience" : ['formal_class_prog_taken', 'C', 'C#', 'C++', 'Java', 'JavaScript', 'Lisp', 'Objective C', 'Perl', 'PHP', 'Python', 'Ruby', 'Shell', 'Swift', 'Visual Basic', 'Other (specify below)', 'years_programming', 'prior_omscs_classes_completed', 'occupation', 'highest_education', 'besides_KBAI_how_many_classes', 'moocs_completed_outside_OMSCS'],
        "Self Assesment" : ['qtr_proj1_confidence_neither confident nor unconfident', 'qtr_proj1_confidence_no answer', 'qtr_proj1_confidence_somewhat confident', 'qtr_proj1_confidence_somewhat unconfident', 'qtr_proj1_confidence_very confident', 'qtr_proj1_confidence_very unconfident', 'qtr_proj2_confidence_neither confident nor unconfident', 'qtr_proj2_confidence_no answer', 'qtr_proj2_confidence_somewhat confident', 'qtr_proj2_confidence_somewhat unconfident', 'qtr_proj2_confidence_very confident', 'qtr_proj2_confidence_very unconfident', 'mid_proj2_confidence_neither confident nor unconfident', 'mid_proj2_confidence_no answer', 'mid_proj2_confidence_somewhat confident', 'mid_proj2_confidence_somewhat unconfident', 'mid_proj2_confidence_very confident', 'mid_proj2_confidence_very unconfident', 'mid_proj3_confidence_neither confident nor unconfident', 'mid_proj3_confidence_no answer', 'mid_proj3_confidence_somewhat confident', 'mid_proj3_confidence_somewhat unconfident', 'mid_proj3_confidence_very confident', 'mid_proj3_confidence_very unconfident', 'final_proj3_confidence_neither confident nor unconfident', 'final_proj3_confidence_no answer', 'final_proj3_confidence_somewhat confident', 'final_proj3_confidence_somewhat unconfident', 'final_proj3_confidence_very confident', 'final_proj3_confidence_very unconfident']
    }

    #add the assignments in as variables to test too
    #(iterate the original dict and fill a copy so the dict is not mutated while iterating)
    expanded_feature_dict = feature_dict.copy()
    #.items() rather than the Python 2-only .iteritems() so this runs on Python 3
    for feature_set_name, select_columns in feature_dict.items():
        #create the new feature set names
        feature_plus_a1 = feature_set_name + " After A1"
        feature_plus_a2 = feature_set_name + " After A2"
        feature_plus_a3 = feature_set_name + " After A3"
        #add the new columns cumulatively (A2 includes A1, A3 includes A2)
        select_columns_plus_a1 = select_columns + ["Assig_1_full_40", "proj_1_100"]
        select_columns_plus_a2 = select_columns_plus_a1 + ["Assig_2_full_40", "proj_2_100"]
        select_columns_plus_a3 = select_columns_plus_a2 + ["Assig_3_full_40", "proj_3_100"]
        #add to the dictionary
        expanded_feature_dict[feature_plus_a1] = select_columns_plus_a1
        expanded_feature_dict[feature_plus_a2] = select_columns_plus_a2
        expanded_feature_dict[feature_plus_a3] = select_columns_plus_a3

    #replace the dictionary
    feature_dict = expanded_feature_dict.copy()

    #list the data sources
    data_sources = [
        "basic_data",
        "basic_data_only_finishers",
        "basic_data_clean_lecture",
        "basic_data_piazza",
        "basic_data_piazza_only_finishers"
        ]

    #create csv for results
    #text mode with newline='' is required for csv.writer on Python 3
    #(the previous 'wb' binary mode only worked on Python 2)
    with open(os.path.join(results_location, 'regression_results.csv'), 'w', newline='') as output_file:

        #establish the csv writer
        writer = csv.writer(output_file, delimiter=',')

        for data_source in data_sources:

            print("\n\n------------------")
            print("Data Set - %s" % data_source)
            print("------------------")

            #this section determines R^2 scores of the regressors
            writer.writerow(["R^2 Scores"])

            writer.writerow(["Dataset - %s" % data_source])

            #load the data from the csv
            header, data = data_work.load_data(data_source + ".csv", conversion_function = data_work.convert_survey_data, max_records = None)

            #create headings
            writer.writerow(["Feature", "Decision Tree", "Boosting", "Random Forest", "Linear Regression", "Support Vector Machine", "", "Feature Details"])

            #loop through all the feature set combos
            for feature_set_name, select_columns in feature_dict.items():

                print("\n\n------------------")
                print("Feature Set - %s" % feature_set_name)
                print("------------------")

                #get the data subset; course_grade is prepended as the label column
                header_subset, data_subset = data_work.select_data_columns(header, data, column_names = ['course_grade'] + select_columns)

                #first run on the full set
                #assumes first column is Y
                X_train, X_test, y_train, y_test = data_work.divide_for_training(data_subset)

                #remove the label header
                header_subset = header_subset[1 : ]

                #scale the data
                X_train, X_test = data_work.scale_features(X_train, X_test)

                #test all to start
                run_regressors(X_train, y_train, X_test, y_test, header_subset, writer, data_source + "-" + feature_set_name + " Linear")

                #for degree in [2, 3, 4]:
                for degree in [2]:
                    #convert to polynomials
                    poly = PolynomialFeatures(degree=degree)
                    X_train_poly = poly.fit_transform(X_train)
                    #transform (not fit_transform) so the test set reuses the
                    #expansion fitted on the training set
                    X_test_poly = poly.transform(X_test)

                    #test all in poly
                    run_regressors(X_train_poly , y_train, X_test_poly, y_test, header_subset, writer, data_source + "-" + feature_set_name + " Poly %i" % degree)

                ##test individually
                #for i in range(0, X_train.shape[1]):
                #    run_regressors(X_train[:, i,np.newaxis], y_train, X_test[:, i,np.newaxis], y_test, header_subset[i + 1, np.newaxis], writer)


    return
Code example #4
0
File: main.py  Project: njpayne/euclid
def main():
    """Run the regression experiments for every data source.

    Builds a dictionary of feature-column sets, then for each data
    source trains and scores the regressors on every feature set,
    writing the R^2 results to regression_results.csv.
    """

    #create a dictionary of feature setups
    #select the appropriate columns
    feature_dict = {
        #"Full" : header.tolist(),
        "Lecture Views": ['overal_lecture_views', 'total_lecture_time'],
        "Piazza Use": [
            'mid_on_piazza', 'final_on_piazza', 'piazza_posts', 'piazza_days',
            'piazza_views'
        ],
        "Lecture Pace": [
            'lecture_1_pace_Late', 'lecture_1_pace_On-time',
            'lecture_1_pace_Unknown', 'lecture_2_pace_Late',
            'lecture_2_pace_On-time', 'lecture_2_pace_Unknown',
            'lecture_3_pace_Late', 'lecture_3_pace_On-time',
            'lecture_3_pace_Unknown', 'lecture_4_pace_Early',
            'lecture_4_pace_Late', 'lecture_4_pace_On-time',
            'lecture_4_pace_Unknown', 'lecture_5_pace_Early',
            'lecture_5_pace_Late', 'lecture_5_pace_On-time',
            'lecture_5_pace_Unknown', 'lecture_6_pace_Early',
            'lecture_6_pace_Late', 'lecture_6_pace_On-time',
            'lecture_6_pace_Unknown', 'lecture_7_pace_Early',
            'lecture_7_pace_Late', 'lecture_7_pace_On-time',
            'lecture_7_pace_Unknown', 'lecture_8_pace_Early',
            'lecture_8_pace_Late', 'lecture_8_pace_On-time',
            'lecture_8_pace_Unknown', 'lecture_9_pace_Early',
            'lecture_9_pace_Late', 'lecture_9_pace_On-time',
            'lecture_9_pace_Unknown', 'lecture_10_pace_Early',
            'lecture_10_pace_Late', 'lecture_10_pace_On-time',
            'lecture_10_pace_Unknown', 'lecture_11_pace_Early',
            'lecture_11_pace_Late', 'lecture_11_pace_On-time',
            'lecture_11_pace_Unknown', 'lecture_12_pace_Early',
            'lecture_12_pace_Late', 'lecture_12_pace_On-time',
            'lecture_12_pace_Unknown', 'lecture_13_pace_Early',
            'lecture_13_pace_Late', 'lecture_13_pace_On-time',
            'lecture_13_pace_Unknown', 'lecture_14_pace_Early',
            'lecture_14_pace_Late', 'lecture_14_pace_On-time',
            'lecture_14_pace_Unknown', 'lecture_15_pace_Early',
            'lecture_15_pace_Late', 'lecture_15_pace_On-time',
            'lecture_15_pace_Unknown', 'lecture_16_pace_Early',
            'lecture_16_pace_Late', 'lecture_16_pace_On-time',
            'lecture_16_pace_Unknown', 'lecture_17_pace_Early',
            'lecture_17_pace_Late', 'lecture_17_pace_On-time',
            'lecture_17_pace_Unknown', 'lecture_18_pace_Early',
            'lecture_18_pace_Late', 'lecture_18_pace_On-time',
            'lecture_18_pace_Unknown', 'lecture_19_pace_Early',
            'lecture_19_pace_Late', 'lecture_19_pace_On-time',
            'lecture_19_pace_Unknown', 'lecture_20_pace_Early',
            'lecture_20_pace_Late', 'lecture_20_pace_On-time',
            'lecture_20_pace_Unknown', 'lecture_21_pace_Early',
            'lecture_21_pace_Late', 'lecture_21_pace_On-time',
            'lecture_21_pace_Unknown', 'lecture_22_pace_Early',
            'lecture_22_pace_Late', 'lecture_22_pace_On-time',
            'lecture_22_pace_Unknown', 'lecture_23_pace_Early',
            'lecture_23_pace_Late', 'lecture_23_pace_On-time',
            'lecture_23_pace_Unknown', 'lecture_24_pace_Early',
            'lecture_24_pace_Late', 'lecture_24_pace_On-time',
            'lecture_24_pace_Unknown', 'lecture_25_pace_Early',
            'lecture_25_pace_Late', 'lecture_25_pace_On-time',
            'lecture_25_pace_Unknown', 'lecture_26_pace_Early',
            'lecture_26_pace_Late', 'lecture_26_pace_On-time',
            'lecture_26_pace_Unknown', 'overall_pace_Early',
            'overall_pace_Late', 'overall_pace_On-time', 'overall_pace_Unknown'
        ],
        "Classmate Contact": [
            'qtr_on_piazza', 'qtr_email', 'qtr_hipchat', 'qrt_gplus',
            'qtr_other_chat', 'qtr_phone', 'qtr_facebook', 'qtr_in_person',
            'mid_on_piazza', 'mid_email', 'mid_hipchat', 'qrt_gplus',
            'mid_other_chat', 'mid_phone', 'mid_facebook', 'mid_in_person',
            'final_on_piazza', 'final_email', 'final_hipchat', 'qrt_gplus',
            'final_other_chat', 'final_phone', 'final_facebook',
            'final_in_person'
        ],
        "Lecture Amount": [
            'total_lecture_time', 'overal_lecture_views', 'lecture_1_views',
            'lecture_2_views', 'lecture_3_views', 'lecture_4_views',
            'lecture_5_views', 'lecture_6_views', 'lecture_7_views',
            'lecture_8_views', 'lecture_9_views', 'lecture_10_views',
            'lecture_11_views', 'lecture_12_views', 'lecture_13_views',
            'lecture_14_views', 'lecture_15_views', 'lecture_16_views',
            'lecture_17_views', 'lecture_18_views', 'lecture_19_views',
            'lecture_20_views', 'lecture_21_views', 'lecture_22_views',
            'lecture_23_views', 'lecture_24_views', 'lecture_25_views',
            'lecture_26_views'
        ],
        "Prior Experience": [
            'formal_class_prog_taken', 'C', 'C#', 'C++', 'Java', 'JavaScript',
            'Lisp', 'Objective C', 'Perl', 'PHP', 'Python', 'Ruby', 'Shell',
            'Swift', 'Visual Basic', 'Other (specify below)',
            'years_programming', 'prior_omscs_classes_completed', 'occupation',
            'highest_education', 'besides_KBAI_how_many_classes',
            'moocs_completed_outside_OMSCS'
        ],
        "Self Assesment": [
            'qtr_proj1_confidence_neither confident nor unconfident',
            'qtr_proj1_confidence_no answer',
            'qtr_proj1_confidence_somewhat confident',
            'qtr_proj1_confidence_somewhat unconfident',
            'qtr_proj1_confidence_very confident',
            'qtr_proj1_confidence_very unconfident',
            'qtr_proj2_confidence_neither confident nor unconfident',
            'qtr_proj2_confidence_no answer',
            'qtr_proj2_confidence_somewhat confident',
            'qtr_proj2_confidence_somewhat unconfident',
            'qtr_proj2_confidence_very confident',
            'qtr_proj2_confidence_very unconfident',
            'mid_proj2_confidence_neither confident nor unconfident',
            'mid_proj2_confidence_no answer',
            'mid_proj2_confidence_somewhat confident',
            'mid_proj2_confidence_somewhat unconfident',
            'mid_proj2_confidence_very confident',
            'mid_proj2_confidence_very unconfident',
            'mid_proj3_confidence_neither confident nor unconfident',
            'mid_proj3_confidence_no answer',
            'mid_proj3_confidence_somewhat confident',
            'mid_proj3_confidence_somewhat unconfident',
            'mid_proj3_confidence_very confident',
            'mid_proj3_confidence_very unconfident',
            'final_proj3_confidence_neither confident nor unconfident',
            'final_proj3_confidence_no answer',
            'final_proj3_confidence_somewhat confident',
            'final_proj3_confidence_somewhat unconfident',
            'final_proj3_confidence_very confident',
            'final_proj3_confidence_very unconfident'
        ]
    }

    #list the data sources
    data_sources = [
        "basic_data", "basic_data_only_finishers", "basic_data_clean_lecture",
        "basic_data_piazza", "basic_data_piazza_only_finishers"
    ]

    #create csv for results
    #text mode with newline='' is required for csv.writer on Python 3
    #(the previous 'wb' binary mode only worked on Python 2)
    with open(os.path.join(results_location, 'regression_results.csv'),
              'w', newline='') as output_file:

        #establish the csv writer
        writer = csv.writer(output_file, delimiter=',')

        for data_source in data_sources:

            print("\n\n------------------")
            print("Data Set - %s" % data_source)
            print("------------------")

            #this section determines R^2 scores of the regressors
            writer.writerow(["R^2 Scores"])

            writer.writerow(["Dataset - %s" % data_source])

            #load the data from the csv
            header, data = data_work.load_data(
                data_source + ".csv",
                conversion_function=data_work.convert_survey_data,
                max_records=None)

            #create headings
            writer.writerow([
                "Feature", "Decision Tree", "Boosting", "Random Forest",
                "Linear Regression", "Support Vector Machine", "",
                "Feature Details"
            ])

            #loop through all the feature set combos
            #.items() rather than the Python 2-only .iteritems() so this
            #runs on Python 3
            for feature_set_name, select_columns in feature_dict.items():

                print("\n\n------------------")
                print("Feature Set - %s" % feature_set_name)
                print("------------------")

                #get the data subset; course_grade is prepended as the label
                header_subset, data_subset = data_work.select_data_columns(
                    header,
                    data,
                    column_names=['course_grade'] + select_columns)

                #first run on the full set
                #assumes first column is Y
                X_train, X_test, y_train, y_test = data_work.divide_for_training(
                    data_subset)

                #remove the label header
                header_subset = header_subset[1:]

                #scale the data
                X_train, X_test = data_work.scale_features(X_train, X_test)

                #test all to start
                run_regressors(
                    X_train, y_train, X_test, y_test, header_subset, writer,
                    data_source + "-" + feature_set_name + " Linear")

                #for degree in [2, 3, 4]:
                for degree in [2]:
                    #convert to polynomials
                    poly = PolynomialFeatures(degree=degree)
                    X_train_poly = poly.fit_transform(X_train)
                    #transform (not fit_transform) so the test set reuses the
                    #expansion fitted on the training set
                    X_test_poly = poly.transform(X_test)

                    #test all in poly
                    run_regressors(
                        X_train_poly, y_train, X_test_poly, y_test,
                        header_subset, writer, data_source + "-" +
                        feature_set_name + " Poly %i" % degree)

                ##test individually
                #for i in range(0, X_train.shape[1]):
                #    run_regressors(X_train[:, i,np.newaxis], y_train, X_test[:, i,np.newaxis], y_test, header_subset[i + 1, np.newaxis], writer)

    return