コード例 #1
0
def usLRtoPredict():
    """Fit a linear regression per exam data set and print in-sample metrics.

    For each exam id / target pair the file ``npsm//<eid>-<target>`` is
    loaded through ``getData``. A ``LinearRegression`` is fitted on every
    column except column 0 and then asked to predict the very same rows it
    was fitted on, so the reported numbers are training-set figures.
    One pipe-delimited row is printed per data file.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    targets = ["programStateRecord"]

    # NOTE(review): the header names 4 columns, the separator 3, and every
    # data row below prints 7 values -- confirm the intended table layout.
    print('|数据名称|预测|r^2|spearman|')
    print('|-|-|-|')

    for exam_id in exam_ids:
        for target_name in targets:
            dataFileName = exam_id + "-" + target_name
            rows, header, raw_scores = getData("npsm//" + dataFileName)

            # Use all columns from index 1 onwards as features.
            # Column subsets tried earlier: [1, 8], [1, 4, 8], [1, 2, 3, 9].
            feature_columns = list(range(1, len(header)))
            features = getSerevalColumn(rows, feature_columns)

            # Turn the scores into an (n, 1) column vector.
            score_column = np.array(raw_scores).reshape(len(raw_scores), 1)

            model = LinearRegression()
            model.fit(features, score_column)
            predicted = model.predict(features)

            # Metrics are evaluated in the same order they are printed.
            within_half = getprecisionWithTorlerate(score_column, predicted,
                                                    0.5)
            within_one_half = getprecisionWithTorlerate(
                score_column, predicted, 1.5)
            within_two_half = getprecisionWithTorlerate(
                score_column, predicted, 2.5)
            r_squared = r2_score(score_column, predicted)
            rank_corr = spearmanr(score_column, predicted)
            within_five = getprecisionWithTorlerate(score_column, predicted,
                                                    5)

            print("|", dataFileName, "|", within_half, "|", within_one_half,
                  "|", within_two_half, "|", r_squared, "|", rank_corr, "|",
                  within_five)
コード例 #2
0
def useLRtoPredictScore(targetFileName, exam_mark=None, needNorm=True):
    """Fit a linear regression on one feature file and print in-sample metrics.

    Args:
        targetFileName: Name of the feature file, joined onto ``exam_mark``.
        exam_mark: Directory-like prefix of the file; falls back to the
            module-level ``DATA_MARK`` when ``None``.
        needNorm: When true, the feature matrix is passed through
            ``normizeMatrix`` before fitting.

    The model is evaluated on the same rows it was fitted on. Nothing is
    returned; a blank line and then one line of metrics are printed.
    """
    mark = DATA_MARK if exam_mark is None else exam_mark
    records, header = load_data_from_file(os.path.join(mark, targetFileName))

    # Column 0 of each record is the key into the final-score map.
    final_scores = get_final_score_map()
    scores = [final_scores[row[0]] for row in records]

    # Every remaining column is used as a feature.
    features = getSerevalColumn(records, list(range(1, len(header))))
    if needNorm:
        features = normizeMatrix(features)

    model = LinearRegression(fit_intercept=True)
    model.fit(features, scores)
    predicted = model.predict(features)

    print()

    # Metrics are evaluated in the same order they are printed.
    within_half = getprecisionWithTorlerate(predicted, scores, 0.5)
    within_one_half = getprecisionWithTorlerate(predicted, scores, 1.5)
    within_two_half = getprecisionWithTorlerate(predicted, scores, 2.5)
    rank_corr = spearmanr(predicted, scores)
    r_squared = r2_score(scores, predicted)
    print(within_half, within_one_half, within_two_half, rank_corr,
          r_squared)
コード例 #3
0
def getDataAndScore(featureFileName, exam_mark=None, needHeader=False):
    """Load a feature file and pair each record with its final score.

    Args:
        featureFileName: Name of the feature file, joined onto ``exam_mark``.
        exam_mark: Directory-like prefix of the file; falls back to the
            module-level ``DATA_MARK`` when ``None``.
        needHeader: When true, the header array is returned as a third item.

    Returns:
        ``(feature_matrix, score_list)``, or
        ``(feature_matrix, score_list, header)`` if ``needHeader`` is true.
        The feature matrix holds every column except column 0; the score
        list is looked up in ``get_final_score_map()`` using column 0 of
        each record as the key.
    """
    mark = DATA_MARK if exam_mark is None else exam_mark
    records, header = load_data_from_file(os.path.join(mark, featureFileName))

    # Keep all columns after the first one as features.
    features = getSerevalColumn(records, list(range(1, len(header))))

    final_scores = get_final_score_map()
    scores = [final_scores[row[0]] for row in records]

    if not needHeader:
        return features, scores
    return features, scores, header
コード例 #4
0
def tryAllFeatureCompositeWithSVM():
    """Score every feature combination produced by ``GetColCombination``.

    Data comes from ``load_routine_data(True)``. For each combination size
    from 1 up to the number of header entries, each combination handed out
    by the generator is evaluated with a degree-2 polynomial ``SVC``:
    a 5-fold shuffled ``StratifiedKFold`` cross-validation is repeated 10
    times and the mean of the 10 fold-means is printed after the
    combination itself.
    """
    data_matrix, scoreArray, header = load_routine_data(True)
    classifier = SVC(kernel="poly", decision_function_shape="ovr", degree=2)

    combinations = GetColCombination(header)

    for size in range(1, len(header) + 1):
        combinations.setCompositeNum(size)

        while True:
            found, columns = combinations.getNextComposite()
            if not found:
                break

            print(columns, end=" : ")
            column_indexes = getTargetColumnList(header, columns)
            subset = getSerevalColumn(data_matrix, column_indexes)

            # A fresh shuffled splitter is built for each of the 10 runs.
            run_means = [
                cross_val_score(classifier,
                                subset,
                                scoreArray,
                                cv=StratifiedKFold(5, shuffle=True)).mean()
                for _ in range(10)
            ]
            print(np.array(run_means).mean())
コード例 #5
0
def usLRtoPredictWithKFold():
    """Cross-validate a single-feature linear regression per exam data set.

    For each exam id / target pair the file ``npsm//<eid>-<target>`` is
    loaded through ``getData``. Only column 8 is used as a feature. A
    ``LinearRegression`` is scored with ``lr_precision`` using a shuffled
    5-fold ``StratifiedKFold``, repeated 10 times; the mean of the 10
    fold-means is printed as one pipe-delimited row per data file.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    targets = ["programStateRecord"]

    # NOTE(review): the header names 4 columns, the separator 3, and every
    # data row below prints 2 values -- confirm the intended table layout.
    print('|数据名称|预测|5分|10分|')
    print('|-|-|-|')

    for exam_id in exam_ids:
        for target_name in targets:
            dataFileName = exam_id + "-" + target_name
            rows, _header, raw_scores = getData("npsm//" + dataFileName)

            # Single-column feature matrix (column 8 only).
            features = getSerevalColumn(rows, [8])
            # Turn the scores into an (n, 1) column vector.
            score_column = np.array(raw_scores).reshape(len(raw_scores), 1)

            model = LinearRegression(fit_intercept=True)

            # NOTE(review): StratifiedKFold stratifies on the target values;
            # presumably the scores are discrete -- confirm.
            # A fresh shuffled splitter is built for each of the 10 runs.
            run_means = [
                model_selection.cross_val_score(
                    model,
                    features,
                    score_column,
                    cv=model_selection.StratifiedKFold(5, shuffle=True),
                    scoring=lr_precision).mean()
                for _ in range(10)
            ]
            print("|", dataFileName, "|", np.array(run_means).mean(), "|")
コード例 #6
0
def searchAddOneFeatureOneTime(mark):
    """Add features one at a time and plot the resulting SVM score.

    Args:
        mark: Passed to ``getDataAndScore`` as the exam mark for the
            ``"concatfeature"`` file.

    Starting with the first entry of the ordered feature list, the prefix
    of the list is extended by one feature per step. At each step the
    matching columns are selected, ``useSVMtoPredictScore`` is called, and
    its result is printed and recorded. Finally the recorded values are
    plotted against the number of features used.
    """
    # Ordered feature list. Compared with the full list tried earlier, the
    # following names are no longer included: firstCodeTimeFromStart,
    # totalCount, longDeleteCount, codeBS, scoreUp, codeBU, scoreRemainZero,
    # debugCount, debugTime, debugErrorCount, useDebug, hasBuildError;
    # totalLength and pasteCount were removed in advance.
    feature_array = [
        "saveInterval",
        "programTime",
        "codeIntervalCount",
        "saveCount",
        "buildInterval",
        "score",
        "codeTime",
        "successCount",
        "testCount",
        "scoreRemainMiddle",
        "avgRemoveErrorTime",
        "failCount",
        "scoreDown",
        "keepError",
        "generateError",
        "codeBE",
        "scoreRemainHigh",
    ]

    dataArray, scoreArray, headerArray = getDataAndScore("concatfeature",
                                                         mark,
                                                         needHeader=True)
    # Drop the first header entry before the names are used for column
    # lookup below.
    del headerArray[0]

    feature_counts = []
    precisions = []
    # Add features step by step until all of them are included.
    for step in range(1, len(feature_array) + 1):
        selected_names = feature_array[:step]
        column_indexes = getTargetColumnList(headerArray, selected_names)
        subset = getSerevalColumn(dataArray, column_indexes)
        precision = useSVMtoPredictScore(subset, scoreArray)
        print("%d : %.4f" % (step, precision))
        feature_counts.append(step)
        precisions.append(precision)

    plt.figure()
    plt.plot(feature_counts, precisions)
    plt.show()