def useDecisionTreeToClassify(featureFileName,
                              exam_mark=None,
                              needOutputpdf=False,
                              max_depth=None):
    if exam_mark is None:
        exam_mark = DATA_MARK

    _feature_matrix, _score_array, headerArray = getDataAndScore(
        featureFileName, exam_mark, needHeader=True)

    _dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=2)
    _dt.fit(_feature_matrix, _score_array)

    # Render the tree to a PDF via Graphviz
    if needOutputpdf:
        feature_names = [headerArray[featureIndex]
                         for featureIndex in range(1, len(headerArray))]

        dot_data = export_graphviz(_dt,
                                   out_file=None,
                                   feature_names=feature_names,
                                   filled=True,
                                   rounded=True,
                                   special_characters=True)

        graph = graphviz.Source(dot_data, directory='out/')
        graph.render(exam_mark + "-decisionTree")
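# A minimal, self-contained sketch (not part of the original pipeline) showing
# the same export_graphviz / graphviz.Source flow on synthetic data. Rendering
# to PDF additionally requires the Graphviz binaries to be installed.
def _sketch_renderDecisionTree():
    from sklearn.datasets import make_classification
    from sklearn.tree import DecisionTreeClassifier, export_graphviz
    import graphviz

    X, y = make_classification(n_samples=80, n_features=4, random_state=0)
    dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=2).fit(X, y)
    dot_data = export_graphviz(dt,
                               out_file=None,
                               feature_names=["f0", "f1", "f2", "f3"],
                               filled=True,
                               rounded=True,
                               special_characters=True)
    graphviz.Source(dot_data, directory='out/').render("sketch-decisionTree")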
def useElaticNettoPredictScoreWithKFold(targetFileName,
                                        exam_mark=None,
                                        needNorm=True):
    if exam_mark is None:
        exam_mark = DATA_MARK

    featureMatrix, _score_array = getDataAndScore(targetFileName, exam_mark)

    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)

    _lr = ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                       l1_ratio=[1e-4, .01, .1, .5, .9, .99],
                       max_iter=5000,
                       cv=model_selection.StratifiedKFold(5, shuffle=True))
    precision_array = []
    for _index in range(10):
        _scores = model_selection.cross_val_score(
            _lr,
            featureMatrix,
            _score_array,
            cv=model_selection.StratifiedKFold(5, shuffle=True),
            scoring=lr_precision)
        precision_array.append(_scores.mean())
    print(np.array(precision_array).mean())
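# A hedged, self-contained sketch of the repeated cross-validation pattern used
# above, on synthetic regression data. Plain KFold and the default R^2 score
# stand in for the project's StratifiedKFold / lr_precision scorer.
def _sketch_elasticNetCV():
    import numpy as np
    from sklearn.datasets import make_regression
    from sklearn.linear_model import ElasticNetCV
    from sklearn.model_selection import KFold, cross_val_score

    X, y = make_regression(n_samples=120, n_features=8, noise=5.0,
                           random_state=0)
    model = ElasticNetCV(alphas=[0.001, 0.01, 0.1, 1, 10],
                         l1_ratio=[.01, .1, .5, .9, .99],
                         max_iter=5000)
    run_means = [cross_val_score(model, X, y,
                                 cv=KFold(5, shuffle=True)).mean()
                 for _ in range(10)]
    print(np.mean(run_means))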
def useGaussianNBtoPredictScore(featureFileName, exam_mark=None):
    featureMatrix, scoreCol = getDataAndScore(featureFileName, exam_mark)
    precision_array = []
    for _index in range(10):
        clf = GaussianNB()
        scores = cross_val_score(clf, featureMatrix, scoreCol,
                                 cv=StratifiedKFold(5, shuffle=True))
        precision_array.append(scores.mean())
    # print(precision_array)
    print(np.array(precision_array).mean())
def useKNNtoPredictScore(featureFileName, exam_mark=None, neighbour=3):
    featureMatrix, scoreCol = getDataAndScore(featureFileName, exam_mark)
    featureMatrix = StandardScaler().fit_transform(featureMatrix)
    precision_array = []
    for _index in range(10):
        clf = KNeighborsClassifier(n_neighbors=neighbour)
        scores = cross_val_score(clf, featureMatrix, scoreCol,
                                 cv=StratifiedKFold(5, shuffle=True))
        precision_array.append(scores.mean())
    # print(precision_array)
    print(np.array(precision_array).mean())
def getRefResult(exam_mark):
    # The selected features are nearly identical across runs, so the result of
    # the first run is used directly here.
    dataMatrix, scoreArray, header = getDataAndScore("concatfeature", exam_mark,
                                                     needHeader=True)
    ranking = useREFToSelectFeature(exam_mark)

    print("[", end=" ")
    for _index, _value in enumerate(ranking):
        if _value == 1:
            print(" \"" + header[_index + 1] + "\",", end="")
    print("]")
def useREFToSelectFeature(examMark):
    # Load the feature data
    dataMatrix, scoreArray = getDataAndScore("concatfeature", examMark)

    # Create the RFE object and rank each feature
    clf = RandomForestClassifier(n_estimators=1000, max_depth=None)
    # rfe = RFECV(estimator=clf, step=1, cv=StratifiedKFold(5, shuffle=True), n_jobs=-1)
    rfe = RFE(estimator=clf, step=1)
    rfe.fit(dataMatrix, scoreArray)
    return rfe.ranking_
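# A small, self-contained sketch (synthetic data) of the RFE ranking pattern:
# features with ranking_ == 1 are the ones the eliminator keeps.
def _sketch_rfeRanking():
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.feature_selection import RFE

    X, y = make_classification(n_samples=100, n_features=10, n_informative=4,
                               random_state=0)
    rfe = RFE(estimator=RandomForestClassifier(n_estimators=50), step=1,
              n_features_to_select=4)
    rfe.fit(X, y)
    print(rfe.ranking_)   # 1 marks a selected feature
    print(rfe.support_)   # boolean mask of the selected features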
def useSVMtoPredictScore(featureFileName, exam_mark=None, kernel="rbf",
                         decision_function_shape="ovr"):
    featureMatrix, scoreCol = getDataAndScore(featureFileName, exam_mark)
    featureMatrix = StandardScaler().fit_transform(featureMatrix)
    precision_array = []
    for _index in range(10):
        _svc = SVC(kernel=kernel,
                   decision_function_shape=decision_function_shape,
                   degree=2)

        score_array = cross_val_score(_svc, featureMatrix, scoreCol,
                                      cv=StratifiedKFold(5, shuffle=True))
        # print(score_array.mean())
        precision_array.append(score_array.mean())

    # print(precision_array)
    print(np.array(precision_array).mean())
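# A hedged sketch of the same SVC evaluation with the scaler wrapped in a
# Pipeline, so standardisation is re-fit inside each CV fold instead of once on
# the full matrix (synthetic data; not the project's loader).
def _sketch_svcPipeline():
    from sklearn.datasets import make_classification
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=120, n_features=6, random_state=0)
    model = make_pipeline(StandardScaler(),
                          SVC(kernel="rbf", decision_function_shape="ovr"))
    scores = cross_val_score(model, X, y, cv=StratifiedKFold(5, shuffle=True))
    print(scores.mean())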
def useKMeansToPredict(featureFileName, exam_mark=None):
    featureMatrix, scoreCol, headerArray = getDataAndScore(featureFileName,
                                                           exam_mark,
                                                           needHeader=True)
    precision_array = []
    clusterMap = {"saveInterval": 2, "score": 3, "scoreRemainMiddle": 2}
    for _index in range(len(headerArray)):
        if headerArray[_index] in clusterMap:
            clf = KMeans(n_clusters=clusterMap[headerArray[_index]])
            dataArray = getOneColumn(featureMatrix, _index)
            dataArray = np.array(dataArray).reshape(len(dataArray), 1)

            clf.fit(dataArray)
            # Build the prediction model: map each cluster label to the rank of
            # its centre when the centres are sorted in ascending order
            center_array = clf.cluster_centers_
            label_map = {}
            for centerIndex, center in enumerate(center_array):
                label_map[centerIndex] = center[0]

            center_array = center_array.reshape(len(center_array))
            center_array = sorted(center_array.tolist())
            for label in label_map:
                label_map[label] = center_array.index(label_map[label])

            # Predict
            grade_predict_array = clf.predict(dataArray)

            # Evaluate accuracy via the confusion counts
            current_score_map = score_map[headerArray[_index]]
            t_t = 0
            t_f = 0
            f_t = 0
            f_f = 0
            for record_index in range(len(grade_predict_array)):
                grade_predict = label_map[grade_predict_array[record_index]]
                true_grade = current_score_map[scoreCol[record_index]]

                if true_grade == 1 and grade_predict == 1:
                    t_t += 1
                if true_grade == 1 and grade_predict == 0:
                    t_f += 1
                if true_grade == 0 and grade_predict == 1:
                    f_t += 1
                if true_grade == 0 and grade_predict == 0:
                    f_f += 1

            print(headerArray[_index], " : ", t_t, t_f, f_t, f_f)
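# A compact, self-contained sketch of the cluster-to-grade mapping above:
# 1-D values are clustered with KMeans and each cluster label is replaced by
# the rank of its centre, so labels become ordered grades (synthetic data).
def _sketch_kmeansGrades():
    import numpy as np
    from sklearn.cluster import KMeans

    values = np.array([5, 7, 6, 40, 45, 42, 90, 95, 92]).reshape(-1, 1)
    km = KMeans(n_clusters=3, n_init=10).fit(values)
    order = np.argsort(km.cluster_centers_.ravel())   # centres, ascending
    label_to_grade = {int(label): int(rank)
                      for rank, label in enumerate(order)}
    grades = [label_to_grade[int(lbl)] for lbl in km.predict(values)]
    print(grades)   # low values -> 0, middle -> 1, high -> 2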
def useObserveToPredict(featureFileName, exam_mark=None):
    featureMatrix, scoreCol, headerArray = getDataAndScore(featureFileName,
                                                           exam_mark,
                                                           needHeader=True)
    precision_array = []
    gapMap = {
        "saveInterval": [60],
        "score": [40, 80],
        "scoreRemainMiddle": [80],
        "scoreUp": [40, 80]
    }
    for _index in range(len(headerArray)):
        if headerArray[_index] in gapMap:
            dataArray = getOneColumn(featureMatrix, _index)
            dataArray = np.array(dataArray).reshape(len(dataArray), 1)

            # Build the prediction model: turn each percentile cut-off into the
            # corresponding value of the sorted feature column
            sortArray = sorted(dataArray)
            gap = gapMap[headerArray[_index]]
            for gap_index in range(len(gap)):
                gap[gap_index] = sortArray[int(gap[gap_index] / 100 *
                                               len(sortArray))]

            # Predict: the grade is the index of the first threshold the value
            # does not exceed
            predict = []
            for record_index in range(len(dataArray)):
                predict_val = len(gap)
                for gap_index, gap_value in enumerate(gap):
                    if dataArray[record_index] <= gap_value:
                        predict_val = gap_index
                        break
                predict.append(predict_val)

            # Evaluate accuracy
            precision = 0.0
            current_score_map = score_map[headerArray[_index]]
            for record_index in range(len(dataArray)):
                true_grade = current_score_map[scoreCol[record_index]]
                if true_grade == predict[record_index]:
                    precision += 1

            print(headerArray[_index],
                  "%.4f" % (precision / len(dataArray)))
def useLRtoPredictScoreWithKFold(targetFileName,
                                 exam_mark=None,
                                 needNorm=True):
    if exam_mark is None:
        exam_mark = DATA_MARK

    featureMatrix, _score_array = getDataAndScore(targetFileName, exam_mark)

    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)

    _lr = LinearRegression(fit_intercept=True)
    precision_array = []
    for _index in range(10):
        _scores = model_selection.cross_val_score(
            _lr,
            featureMatrix,
            _score_array,
            cv=model_selection.StratifiedKFold(5, shuffle=True),
            scoring=lr_precision)
        precision_array.append(_scores.mean())
    print(np.array(precision_array).mean())
def useDecisionTreeToClassifyWithKFold(featureFileName,
                                       exam_mark=None,
                                       max_depth=None):
    _feature_matrix, _score_array = getDataAndScore(featureFileName, exam_mark)

    # kf = KFold(n_splits=5,shuffle=True);
    # accurate_array = [];
    # for train_index_array, test_index_array in kf.split(_feature_matrix):
    #     X_train = [];
    #     X_test = [];
    #     y_train = [];
    #     y_test = [];
    #     for train_index in train_index_array:
    #         X_train.append(_feature_matrix[train_index]);
    #         y_train.append(_score_array[train_index]);
    #
    #     for test_index in test_index_array:
    #         X_test.append(_feature_matrix[test_index]);
    #         y_test.append(_score_array[test_index])
    #
    #     _dt = DecisionTreeClassifier(max_depth=max_depth);
    #     _dt.fit(X_train, y_train);
    #     score = _dt.score(X_test,y_test);
    #
    #     accurate_array.append(score);
    # print(np.array(accurate_array).mean())

    precision_array = []
    for _index in range(10):
        _dt = DecisionTreeClassifier(max_depth=max_depth)
        score_array = cross_val_score(_dt,
                                      _feature_matrix,
                                      _score_array,
                                      cv=StratifiedKFold(5, shuffle=True))
        precision_array.append(score_array.mean())
    # print(precision_array);
    print(np.array(precision_array).mean())
def useLassotoPredictScoreWithKFold(targetFileName,
                                    exam_mark=None,
                                    needNorm=True):
    if exam_mark is None:
        exam_mark = DATA_MARK

    featureMatrix, _score_array = getDataAndScore(targetFileName, exam_mark)

    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)

    _lr = LassoCV(alphas=[0.01, 0.05, 0.1, 0.5, 1, 10],
                  cv=model_selection.StratifiedKFold(5, shuffle=True),
                  tol=1e-4)
    precision_array = []
    for _index in range(10):
        _scores = model_selection.cross_val_score(
            _lr,
            featureMatrix,
            _score_array,
            cv=model_selection.StratifiedKFold(5, shuffle=True),
            scoring=lr_precision)
        precision_array.append(_scores.mean())
    print(np.array(precision_array).mean())
def searchAddOneFeatureOneTime(mark):
    # #"firstCodeTimeFromStart","saveInterval","pasteCount","buildInterval","codeBU","codeBS","scoreUp","successCount","debugTime","debugCount","debugErrorCount",
    # #"failCount","codeBE", "keepError","scoreRemainHigh","useDebug","codeTime","scoreRemainZero","hasBuildError",
    # feature_array = ["codeIntervalCount","totalLength","programTime","avgRemoveErrorTime",
    #                  "testCount","saveCount","longDeleteCount","score",
    #                  "scoreRemainMiddle","generateError","scoreDown","totalCount",
    #                 ];

    # feature_array = ["saveInterval","programTime","totalLength","codeTime","firstCodeTimeFromStart",
    #  "pasteCount","codeIntervalCount","saveCount","longDeleteCount","buildInterval",
    #  "codeBU","score","codeBS","testCount","successCount",
    #  "scoreUp","totalCount","scoreRemainZero","scoreRemainMiddle","avgRemoveErrorTime",
    #  "debugCount","debugTime","debugErrorCount","failCount","codeBE",
    #  "scoreDown","keepError","generateError","useDebug","hasBuildError",
    #  "scoreRemainHigh",
    # ];
    #5"firstCodeTimeFromStart", 16"totalCount",7"longDeleteCount",10"codeBS", 12 "scoreUp",
    #12 "codeBU",13"scoreRemainZero",14"debugCount",14"debugTime",14"debugErrorCount",18"useDebug",19"hasBuildError"
    #"totalLength","pasteCount", 提前移除
    feature_array = [
        "saveInterval",
        "programTime",
        "codeIntervalCount",
        "saveCount",
        "buildInterval",
        "score",
        "codeTime",
        "successCount",
        "testCount",
        "scoreRemainMiddle",
        "avgRemoveErrorTime",
        "failCount",
        "scoreDown",
        "keepError",
        "generateError",
        "codeBE",
        "scoreRemainHigh",
    ]

    # feature_array = ["buildInterval","saveInterval","codeIntervalCount","totalLength","programTime","codeTime",
    #                  "avgRemoveErrorTime","testCount",
    #                  "saveCount","scoreRemainMiddle",
    #                  "score","successCount","pasteCount",
    #                 ];

    # feature_array = ["codeIntervalCount","totalLength", "programTime","longDeleteCount",
    #               "avgRemoveErrorTime","testCount","saveCount","scoreRemainMiddle","score","scoreDown","generateError","totalCount",
    #              ];

    dataArray, scoreArray, headerArray = getDataAndScore("concatfeature",
                                                         mark,
                                                         needHeader=True)
    del headerArray[0]

    x_array = []
    y_array = []
    # Add one feature at a time until every feature has been included
    for _count in range(len(feature_array)):
        target_feature_name_array = feature_array[:_count + 1]
        # print(target_feature_name_array)
        indexList = getTargetColumnList(headerArray, target_feature_name_array)
        # print(indexList)
        featureMatrix = getSerevalColumn(dataArray, indexList)
        # Note: this call expects a useSVMtoPredictScore variant that takes a
        # prebuilt (featureMatrix, scoreArray) pair and returns the mean
        # precision, unlike the file-based version defined above.
        precision = useSVMtoPredictScore(featureMatrix, scoreArray)
        print("%d : %.4f" % (_count + 1, precision))
        x_array.append(_count + 1)
        y_array.append(precision)

    plt.figure()
    plt.plot(x_array, y_array)
    plt.show()
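# A self-contained sketch of the incremental-feature curve built above: add one
# feature at a time (in a fixed order), score each prefix with cross-validated
# SVC accuracy, and plot accuracy against the number of features used
# (synthetic data; the real code reads the project's feature files).
def _sketch_addOneFeatureCurve():
    import matplotlib.pyplot as plt
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    from sklearn.svm import SVC

    X, y = make_classification(n_samples=150, n_features=10, n_informative=5,
                               random_state=0)
    counts, accuracies = [], []
    for k in range(1, X.shape[1] + 1):
        scores = cross_val_score(SVC(kernel="rbf"), X[:, :k], y,
                                 cv=StratifiedKFold(5, shuffle=True))
        counts.append(k)
        accuracies.append(scores.mean())

    plt.figure()
    plt.plot(counts, accuracies)
    plt.xlabel("number of features")
    plt.ylabel("mean CV accuracy")
    plt.show()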