Example #1
0
def usLRtoPredictWithExpDef():
    """Print, per data file, how precisely a linear model recovers the grade.

    For every algorithm / exam / target combination the file
    ``<algorithm>//<exam>-<algorithm>-<target>`` is loaded with
    ``getStudentData``.  The values of column 2 are translated into grades
    through the map returned by ``getGradeMap()`` and a ``LinearRegression``
    is fitted with column 1 as its only feature.  The model is evaluated on
    the very samples it was fitted on, and the value returned by
    ``getprecisionWithTorlerate`` (tolerance 0.5) is printed as one row of a
    markdown table.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["finalscore"]

    grade_of_score = getGradeMap()
    print('|数据名称|预测准确率|')
    print('|-|-|')

    for al in algorithms:
        for eid in exam_ids:
            for target in targets:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)

                # Translate each column-2 value into its grade.
                grades = [
                    grade_of_score[raw]
                    for raw in getOneColumn(student_data, 2)
                ]
                grade_col = np.array(grades).reshape(len(grades), 1)

                # The regressor expects a 2-D (samples, features) input.
                feature = getOneColumn(student_data, 1)
                feature_col = np.array(feature).reshape(len(feature), 1)

                model = LinearRegression(fit_intercept=True)
                model.fit(feature_col, grade_col)
                predicted = model.predict(feature_col)

                print("|", data_file_name, "|",
                      getprecisionWithTorlerate(predicted, grade_col, 0.5),
                      "|")
Example #2
0
def scorefinalScoreWithExpDef():
    """Print the r^2 of a grade-on-feature linear regression per data file.

    Column 2 of each data file is mapped to grades via ``getGradeMap()`` and
    regressed on column 1.  ``LinearRegression.score`` is evaluated on the
    same samples used for fitting, and the result is printed as a markdown
    table row.
    """
    grade_lookup = getGradeMap()

    print('|数据名称|r^2|')
    print('|-|-|')
    regressor = LinearRegression()

    for al in ("EQ", "Watwin"):
        for target in ("finalscore",):
            for eid in ("e1", "e2", "e3", "e4"):
                dataFileName = "-".join((eid, al, target))
                records = getStudentData(al + "//" + dataFileName)

                # Replace every column-2 value by its grade, in place.
                grades = getOneColumn(records, 2)
                for position, raw in enumerate(grades):
                    grades[position] = grade_lookup[raw]
                targets_2d = np.array(grades).reshape(len(grades), 1)

                features = getOneColumn(records, 1)
                features_2d = np.array(features).reshape(len(features), 1)

                # The single model instance is re-fitted for every file.
                regressor.fit(features_2d, targets_2d)
                r_squared = regressor.score(features_2d, targets_2d)
                print("|", dataFileName, "|", r_squared, "|")
def calculateCorrelation(data_name):
    """Print ``pearsonr`` of each data column (from index 1 on) vs. the score.

    :param data_name: identifier handed to ``getData``, which supplies the
        data matrix, its header and the score array.
    """
    matrix, header, scores = getData(data_name)

    for column_index, label in enumerate(header):
        # Column 0 is never correlated.
        if column_index == 0:
            continue
        feature = getOneColumn(matrix, column_index)
        print(label, pearsonr(feature, scores))
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Print the features whose MIC against the final score is above average.

    For every file in ``dataFileArray`` the MIC (via ``MINE``) between each
    feature column (index 1 onwards) and the students' final scores is
    stored under the feature's header name; a later file overwrites an
    earlier entry with the same name.  Features whose MIC exceeds the mean
    of all collected values are printed in descending MIC order.

    :param dataFileArray: iterable of data file names.
    :param data_mark: directory prefix of the files; ``DATA_MARK`` if None.
    :param neadNorm: when truthy, scores and features are passed through
        ``normizeDataSet`` before the MIC is computed.
    """
    mic_by_feature = {}
    for dataFileName in dataFileArray:
        if data_mark is None:
            data_mark = DATA_MARK
        records, header = load_data_from_file(
            os.path.join(data_mark, dataFileName))

        # The first field of each record keys into the final-score map.
        final_scores = get_final_score_map()
        scores = [final_scores[record[0]] for record in records]
        if neadNorm:
            scores = normizeDataSet(scores)

        estimator = MINE()
        for column_index in range(1, len(header)):
            column = getOneColumn(records, column_index)
            if neadNorm:
                column = normizeDataSet(column)
            estimator.compute_score(column, scores)
            mic_by_feature[header[column_index]] = estimator.mic()

    cutoff = np.mean(list(mic_by_feature.values()))
    ranked = sorted(mic_by_feature.items(),
                    key=lambda item: item[1],
                    reverse=True)
    for feature_name, mic in ranked:
        if mic > cutoff:
            print(feature_name, mic)
Example #5
0
def calculateMean(data_mark=None):
    """Print per-grade means of the ``saveInterval`` and ``score`` features.

    Loads ``concatfeature`` from ``data_mark``, groups each selected feature
    column by the student's final score (looked up through
    ``get_final_score_map()``) and prints the mean of every group
    ``0 .. SCORE_FOLD - 1`` with two decimals.

    :param data_mark: directory prefix of the file; ``DATA_MARK`` if None.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))

    final_scores = get_final_score_map()
    grades = [final_scores[record[0]] for record in student_data]

    for column_index in range(1, len(headerArray)):
        feature_name = headerArray[column_index]
        if feature_name not in ("saveInterval", "score"):
            continue

        column = getOneColumn(student_data, column_index)

        # Split the feature values by grade.
        values_by_grade = {}
        for position, grade in enumerate(grades):
            values_by_grade.setdefault(grade, []).append(column[position])

        print(feature_name)
        # NOTE(review): raises KeyError when a grade below SCORE_FOLD has no
        # student.
        for grade in range(SCORE_FOLD):
            print(grade, "%.2f" % np.mean(values_by_grade[grade]))
Example #6
0
def getGradeMap():
    """Return a dict mapping each distinct final score to a grade 0-3.

    Column 1 of the ``finalscore`` data is sorted ascending.  The grade of a
    score is decided by the rank of its first occurrence: ranks below 40% of
    the sample count get grade 0, below 50% grade 1, below 70% grade 2 and
    everything else grade 3.
    """
    ordered_scores = sorted(getOneColumn(getStudentData("finalscore"), 1))
    sample_count = len(ordered_scores)

    # Rank cut-offs corresponding to the 40th / 50th / 70th percentiles.
    cutoffs = [float(percent) * sample_count / 100 for percent in (40, 50, 70)]

    grade_of_score = {}
    for rank, score in enumerate(ordered_scores):
        if score in grade_of_score:
            # Only the first (lowest-rank) occurrence decides the grade.
            continue
        # The cut-offs ascend, so the grade equals the number of cut-offs
        # the rank has already reached.
        grade_of_score[score] = sum(rank >= cutoff for cutoff in cutoffs)
    return grade_of_score
Example #7
0
def scoreAll():
    """Print the r^2 of a final-score-on-feature regression per data file.

    For every algorithm / target / exam combination, the final score of each
    record is looked up in ``get_final_score_map()`` by ``str(record[0])``
    and regressed on column 1 of the file.  The r^2 is computed on the same
    samples used for fitting and printed as a markdown table row.
    """
    print('|数据名称|r^2|')
    print('|-|-|')
    regressor = LinearRegression()

    for al in ("EQ", "Watwin"):
        for target in ("score", "finalscore"):
            for eid in ("e1", "e2", "e3", "e4"):
                dataFileName = "-".join((eid, al, target))
                student_data = getStudentData(al + "//" + dataFileName)

                final_score_map = get_final_score_map()
                scores = [
                    final_score_map[str(record[0])] for record in student_data
                ]
                scores_2d = np.array(scores).reshape(len(scores), 1)

                features = getOneColumn(student_data, 1)
                features_2d = np.array(features).reshape(len(features), 1)

                regressor.fit(features_2d, scores_2d)
                r_squared = regressor.score(features_2d, scores_2d)
                print("|", dataFileName, "|", r_squared, "|")
Example #8
0
def calculate_pearson():
    """Print Pearson and Spearman statistics of every column vs. the score.

    Data comes from ``load_routine_data(True)``.  One line is printed per
    header entry: the header name, then the two values returned by
    ``pearsonr`` and the two values returned by ``spearmanr``.
    """
    data_matrix, scoreArray, header = load_routine_data(True)

    for column_index, column_name in enumerate(header):
        column = getOneColumn(data_matrix, column_index)
        pearson_r, pearson_p = pearsonr(column, scoreArray)
        spearman_rho, spearman_p = spearmanr(column, scoreArray)
        print(column_name, pearson_r, pearson_p, spearman_rho, spearman_p)
def normizeMatrixZscore(matrix):
    """Normalise every column of ``matrix`` in place.

    Each column is extracted with ``getOneColumn``, passed through
    ``normizeDataSetZscore`` and written back cell by cell.

    :param matrix: row-major 2-D sequence supporting ``matrix[row][col]``
        assignment; the column count is taken from the first row.
    :return: the same ``matrix`` object after modification.  An empty
        matrix is returned unchanged.
    """
    # An empty matrix has no first row to measure, so there is nothing to
    # normalise (previously this raised IndexError).
    if len(matrix) == 0:
        return matrix

    # Imported at call time, as in the original code.
    from common.DataHelper import getOneColumn

    row_count = len(matrix)
    for column_index in range(len(matrix[0])):
        normalized = normizeDataSetZscore(getOneColumn(matrix, column_index))
        for row_index in range(row_count):
            matrix[row_index][column_index] = normalized[row_index]
    return matrix
Example #10
0
def draw_scatter():
    """Show one scatter plot per data column against the score array.

    Data comes from ``load_routine_data(True)``.  Each plot is drawn in its
    own figure, titled with the column's header name, and displayed with
    ``plt.show()`` before the next column is processed.
    """
    data_matrix, scoreArray, header = load_routine_data(True)

    for column_index, column_name in enumerate(header):
        column = getOneColumn(data_matrix, column_index)

        plt.figure()
        plt.scatter(column, scoreArray)
        plt.title(column_name)
        plt.show()
def showWatwinAndScore():
    """Scatter-plot columns 1 and 2 of the module-level ``_student_data``.

    Both series share an x axis running from 1 to the number of records.
    Column 1 is drawn with red ``x`` markers, column 2 with cyan ``+``
    markers, and the figure is then displayed.
    """
    first_series = getOneColumn(_student_data, 1)
    second_series = getOneColumn(_student_data, 2)
    positions = np.arange(1, len(_student_data) + 1)

    plt.figure()

    # Draw both series as scatter plots on the same axes.
    plt.scatter(positions, first_series, c='r', marker='x')
    plt.scatter(positions, second_series, color='c', marker='+')

    # Display the finished figure.
    plt.show()
Example #12
0
def drawDataDistribution(dataFileName, data_mark=None):
    """Save a bar chart of the value distribution of every feature column.

    Each column (index 1 onwards) is split into ``_BOX_COUNT`` equal-width
    bins between its minimum and maximum.  The chart is written to
    ``OUT_ROOT_PATH/<data_mark>/distribution/<feature name>``.

    :param dataFileName: file name below ``data_mark``.
    :param data_mark: directory prefix of the file; ``DATA_MARK`` if None.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, dataFileName))

    for column_index in range(1, len(headerArray)):
        values = getOneColumn(student_data, column_index)
        highest, lowest = getMaxAndMin(values)
        bin_width = (highest - lowest) / _BOX_COUNT

        # Bin edges are accumulated by repeated addition of the width.
        edges = [lowest]
        for _ in range(_BOX_COUNT):
            edges.append(edges[-1] + bin_width)
        upper_edges = edges[1:]
        # NOTE(review): labels read as half-open "[a,b)" while the counting
        # below uses "<=" on the upper edge.
        labels = [
            "[%.2f,%.2f)" % (left, right)
            for left, right in zip(edges, upper_edges)
        ]

        counts = [0] * _BOX_COUNT
        for value in values:
            for slot, upper in enumerate(upper_edges):
                if value <= upper:
                    counts[slot] += 1
                    break

        # Values that matched no bin are put into the last one.
        counts[_BOX_COUNT - 1] += len(values) - sum(counts)

        positions = range(_BOX_COUNT)
        plot.bar(positions, counts)
        plot.xticks(positions, labels, rotation=10, fontsize=8)
        for x_pos, count in zip(positions, counts):
            plot.text(x_pos, count + 0.05, str(count),
                      ha='center', va='bottom')

        title = headerArray[column_index]
        plot.title(title)
        output_dir = OUT_ROOT_PATH + "/" + data_mark + "/distribution/"
        checkThenMkdirs(output_dir)
        plot.savefig(output_dir + title)
        plot.clf()
Example #13
0
def useKMeansToPredict(featureFileName, exam_mark=None):
    """Cluster selected features with K-means and print a confusion count.

    For each of ``saveInterval``, ``score`` and ``scoreRemainMiddle`` found
    in the header, the column is clustered on its own.  Clusters are ranked
    by ascending centre value and that rank is used as the predicted grade.
    The true grade comes from the module-level ``score_map`` entry for the
    feature.  Four counts are printed per feature, in the order
    (true 1, predicted 1), (true 1, predicted 0), (true 0, predicted 1),
    (true 0, predicted 0); any other grade combination is not counted.

    :param featureFileName: passed through to ``getDataAndScore``.
    :param exam_mark: passed through to ``getDataAndScore``.
    """
    featureMatrix, scoreCol, headerArray = getDataAndScore(
        featureFileName, exam_mark, needHeader=True)
    cluster_counts = {"saveInterval": 2, "score": 3, "scoreRemainMiddle": 2}

    for column_index, feature_name in enumerate(headerArray):
        if feature_name not in cluster_counts:
            continue

        kmeans = KMeans(n_clusters=cluster_counts[feature_name])
        column = getOneColumn(featureMatrix, column_index)
        samples = np.array(column).reshape(len(column), 1)
        kmeans.fit(samples)

        # Build the prediction model: cluster label -> rank of its centre.
        centers = kmeans.cluster_centers_
        ordered_centers = sorted(centers.reshape(len(centers)).tolist())
        rank_of_label = {
            label: ordered_centers.index(center[0])
            for label, center in enumerate(centers)
        }

        # Predict.
        predicted_labels = kmeans.predict(samples)

        # Tally predictions against the true grades.
        grade_of_score = score_map[feature_name]
        true1_pred1 = true1_pred0 = true0_pred1 = true0_pred0 = 0
        for position in range(len(predicted_labels)):
            predicted = rank_of_label[predicted_labels[position]]
            actual = grade_of_score[scoreCol[position]]

            if actual == 1:
                if predicted == 1:
                    true1_pred1 += 1
                elif predicted == 0:
                    true1_pred0 += 1
            elif actual == 0:
                if predicted == 1:
                    true0_pred1 += 1
                elif predicted == 0:
                    true0_pred0 += 1

        print(feature_name, " : ",
              true1_pred1, true1_pred0, true0_pred1, true0_pred0)
Example #14
0
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Save a scatter plot of every feature column against the final score.

    The plots are written to
    ``OUT_ROOT_PATH/DATA_MARK/scatterWithScore/<feature>-score[...]``.

    :param dataFileName: file name below ``DATA_MARK``.
    :param needNorminize: when truthy, scores and features go through
        ``normizeDataSet`` and ``-nominized`` is appended to the title.
    :param out_mark: optional suffix appended to the output file name only.
    :return: None
    """
    student_data, headerArray = load_data_from_file(
        os.path.join(DATA_MARK, dataFileName))

    final_scores = get_final_score_map(None)
    scores = [final_scores[record[0]] for record in student_data]
    if needNorminize:
        scores = normizeDataSet(scores)

    # Walk over all features (column 0 is skipped).
    for column_index in range(1, len(headerArray)):
        feature_name = headerArray[column_index]
        values = getOneColumn(student_data, column_index)
        if needNorminize:
            values = normizeDataSet(dataSetA=values)

        plot.scatter(scores, values, s=2)
        title = feature_name + "-score"
        if needNorminize:
            title = title + "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(feature_name)

        output_dir = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(output_dir)
        # The suffix affects the file name but not the plotted title.
        file_stem = title if out_mark is None else title + "-" + out_mark
        plot.savefig(output_dir + file_stem)
        plot.clf()
Example #15
0
def useObserveToPredict(featureFileName, exam_mark=None):
    """Predict grades from fixed percentile cut points and print the accuracy.

    For each configured feature found in the header, the cut values are read
    from the sorted column at the configured percentiles.  A sample is
    assigned the index of the first cut value it does not exceed, or the
    number of cut values when it exceeds all of them.  The prediction is
    compared with the true grade taken from the module-level ``score_map``
    entry for that feature, and the share of matches is printed with four
    decimals.

    :param featureFileName: passed through to ``getDataAndScore``.
    :param exam_mark: passed through to ``getDataAndScore``.
    """
    featureMatrix, scoreCol, headerArray = getDataAndScore(featureFileName,
                                                           exam_mark,
                                                           needHeader=True)
    percentile_cuts = {
        "saveInterval": [60],
        "score": [40, 80],
        "scoreRemainMiddle": [80],
        "scoreUp": [40, 80]
    }
    for column_index, feature_name in enumerate(headerArray):
        if feature_name not in percentile_cuts:
            continue

        column = getOneColumn(featureMatrix, column_index)
        values = np.array(column).reshape(len(column))

        # Build the prediction model.  The thresholds go into a fresh list so
        # the percentile table is never overwritten; a header name occurring
        # twice would otherwise reuse threshold values as percentiles.
        ordered = sorted(values)
        thresholds = [
            ordered[int(percent / 100 * len(ordered))]
            for percent in percentile_cuts[feature_name]
        ]

        # Predict.
        predicted = []
        for value in values:
            grade = len(thresholds)
            for rank, threshold in enumerate(thresholds):
                if value <= threshold:
                    grade = rank
                    break
            predicted.append(grade)

        # Measure the share of correct predictions.
        grade_of_score = score_map[feature_name]
        hits = 0.0
        for position in range(len(values)):
            if grade_of_score[scoreCol[position]] == predicted[position]:
                hits += 1

        print(feature_name, "%.4f" % (hits / len(values)))
def split_score_to_k_fold(N=10):
    """Replace each final score by its fold index and write the result out.

    Column 1 of the ``finalscore`` data is sorted ascending and cut into
    ``N`` equally sized rank ranges.  Every distinct score is assigned the
    fold of its first (lowest-rank) occurrence.  The matrix, with column 1
    replaced by the fold index, is written as comma-separated text to
    ``OUT_ROOT_PATH/finalscore-<N>``: first the header row, then the header
    type row, then one line per record.

    :param N: number of folds; must be non-zero.
    """
    matrix, header, header_types = load_data_from_file("finalscore",
                                                       needType=True)
    ordered_scores = sorted(getOneColumn(matrix, 1))
    fold_size = len(ordered_scores) / N

    # Equal scores are adjacent after sorting, so a membership test keeps
    # the first occurrence without needing a sentinel "previous" value
    # (the old sentinel of -1 skipped a genuine lowest score of -1).
    fold_of_score = {}
    for rank, score in enumerate(ordered_scores):
        if score not in fold_of_score:
            fold_of_score[score] = int(rank / fold_size)

    for row in matrix:
        row[1] = fold_of_score[row[1]]

    out_file_path = os.path.join(OUT_ROOT_PATH, "finalscore-" + str(N))

    # The context manager closes the file even if a write fails.
    with open(out_file_path, "w") as out_file:
        out_file.write(",".join(header) + "\n")
        out_file.write(",".join(header_types) + "\n")
        for row in matrix:
            out_file.write(",".join(str(cell) for cell in row) + "\n")
Example #17
0
def calculateT(data_mark=None):
    """Print, per feature, the grade pairs whose means differ significantly.

    The ``concatfeature`` data is grouped by final score.  For every pair of
    grades ``i < j`` below ``SCORE_FOLD`` a Levene test decides whether the
    t-test assumes equal variances; the pair is printed when the t-test
    p-value is at most 0.05.

    :param data_mark: directory prefix of the file; ``DATA_MARK`` if None.
    :return: None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))

    final_scores = get_final_score_map()
    grades = [final_scores[record[0]] for record in student_data]

    for column_index in range(1, len(headerArray)):
        column = getOneColumn(student_data, column_index)

        # Split the feature values by grade.
        values_by_grade = {}
        for position, grade in enumerate(grades):
            values_by_grade.setdefault(grade, []).append(column[position])

        print(headerArray[column_index])
        for low in range(SCORE_FOLD):
            for high in range(low + 1, SCORE_FOLD):
                first, second = values_by_grade[low], values_by_grade[high]
                _, levene_p = levene(first, second)
                # Equal variances are assumed unless Levene rejects them.
                assume_equal = not (levene_p <= 0.05)
                _, p_value = ttest_ind(first, second, equal_var=assume_equal)

                if p_value <= 0.05:
                    print(low, high)
Example #18
0
def usePearsonrCalAll():
    """Print ``pearsonr`` of final score vs. column 1 for every data file.

    For every algorithm / target / exam combination, the final score of each
    record is looked up in ``get_final_score_map()`` by ``str(record[0])``.
    The two values returned by ``pearsonr`` are printed as a markdown table
    row.
    """
    print('|数据名称|相关系数|p|')
    print('|-|-|-|')

    for al in ("EQ", "Watwin"):
        for target in ("score", "finalscore"):
            for eid in ("e1", "e2", "e3", "e4"):
                dataFileName = "-".join((eid, al, target))
                student_data = getStudentData(al + "//" + dataFileName)

                final_score_map = get_final_score_map()
                scores = [
                    final_score_map[str(record[0])] for record in student_data
                ]

                features = getOneColumn(student_data, 1)
                coefficient, p_value = pearsonr(scores, features)
                print("|", dataFileName, "|", coefficient, "|", p_value, "|")
Example #19
0
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    """Print ``spearmanr`` of every feature column against the final score.

    One line is printed per feature: its header name followed by the two
    values returned by ``spearmanr``.

    :param dataFileName: file name below ``data_mark``.
    :param data_mark: directory prefix of the file; ``DATA_MARK`` if None.
    :param neadNorm: when truthy, scores and features are passed through
        ``normizeDataSet`` first.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, dataFileName))

    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in student_data]
    if neadNorm:
        scores = normizeDataSet(scores)

    for column_index in range(1, len(headerArray)):
        column = getOneColumn(student_data, column_index)
        if neadNorm:
            column = normizeDataSet(column)
        rho, p_value = spearmanr(column, scores)
        print(headerArray[column_index], rho, p_value)
Example #20
0
def calculateF(data_mark=None):
    """Print the features whose grade groups differ under one-way ANOVA.

    The ``concatfeature`` data is grouped by final score.  A feature is
    skipped silently when the Levene test p-value is at most 0.05.
    Otherwise ``f_oneway`` is run over all groups and the feature is printed
    with its two returned values when the p-value is at most 0.05.

    :param data_mark: directory prefix of the file; ``DATA_MARK`` if None.
    :return: None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))

    final_scores = get_final_score_map()
    grades = [final_scores[record[0]] for record in student_data]

    for column_index in range(1, len(headerArray)):
        column = getOneColumn(student_data, column_index)

        # Split the feature values by grade.
        values_by_grade = {}
        for position, grade in enumerate(grades):
            values_by_grade.setdefault(grade, []).append(column[position])
        samples = list(values_by_grade.values())

        _, levene_p = levene(*samples)
        if levene_p <= 0.05:
            # Levene rejected equal variances; ANOVA is not run.
            continue

        f_statistic, anova_p = f_oneway(*samples)
        if anova_p <= 0.05:
            print(headerArray[column_index], f_statistic, anova_p)
Example #21
0
def usLRtoPredict():
    """Print linear-regression quality metrics for every data file.

    For every algorithm / exam / target combination, the final score of each
    record (looked up in ``get_final_score_map()`` by ``str(record[0])``) is
    regressed on column 1 of the file.  The model is evaluated on the same
    samples it was fitted on.  Each markdown table row holds the file name,
    ``getprecisionWithTorlerate`` at tolerances 0.5, 1.5 and 2.5, the
    ``r2_score`` and the ``spearmanr`` result.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]

    # Header and delimiter rows must have as many cells as each data row
    # (six), otherwise the output is not a valid markdown table.
    print('|数据名称|预测|5分|10分|r^2|spearman|')
    print('|-|-|-|-|-|-|')

    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)

                final_score_map = get_final_score_map()
                scores = [
                    final_score_map[str(record[0])] for record in student_data
                ]
                scoreArray = np.array(scores).reshape(len(scores), 1)

                # The regressor expects a 2-D (samples, features) input.
                features = getOneColumn(student_data, 1)
                watwinArray = np.array(features).reshape(len(features), 1)

                _lr = LinearRegression(fit_intercept=True)
                _lr.fit(watwinArray, scoreArray)
                y_predicted = _lr.predict(watwinArray)

                print("|", dataFileName, "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray,
                                                0.5), "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray,
                                                1.5), "|",
                      getprecisionWithTorlerate(y_predicted, scoreArray,
                                                2.5), "|",
                      r2_score(scoreArray, y_predicted), "|",
                      spearmanr(y_predicted, scoreArray), "|")
Example #22
0
def usLRtoPredictWithKFold():
    """Print 10-fold cross-validated regression metrics for every data file.

    For every algorithm / exam / target combination, the final score of each
    record (looked up in ``get_final_score_map()`` by ``str(record[0])``) is
    regressed on column 1 of the file.  The samples are split with a
    shuffled 10-fold ``KFold``; a fresh ``LinearRegression`` is fitted on
    each training part and evaluated on the held-out part.  Each markdown
    table row holds the file name followed by the fold means of
    ``getprecisionWithTorlerate`` at tolerances 0.5, 1.5 and 2.5 and of
    ``r2_score``.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    algorithm_List = ["EQ", "Watwin"]
    target_List = ["finalscore"]

    # Header and delimiter rows must have as many cells as each data row
    # (five), otherwise the output is not a valid markdown table.
    print('|数据名称|预测|5分|10分|r^2|')
    print('|-|-|-|-|-|')

    for al in algorithm_List:
        for eid in examId_List:
            for target in target_List:
                dataFileName = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + dataFileName)

                final_score_map = get_final_score_map()
                scores = [
                    final_score_map[str(record[0])] for record in student_data
                ]
                scoreArray = np.array(scores).reshape(len(scores), 1)

                features = getOneColumn(student_data, 1)
                watwinArray = np.array(features).reshape(len(features), 1)

                kf = KFold(n_splits=10, shuffle=True)
                accurate_array = []
                within_5_array = []
                within_10_array = []
                r_2_array = []

                for train_index_array, test_index_array in kf.split(
                        watwinArray):
                    # Index arrays select the fold's rows directly.
                    X_train = watwinArray[train_index_array]
                    y_train = scoreArray[train_index_array]
                    X_test = watwinArray[test_index_array]
                    y_test = scoreArray[test_index_array]

                    _lr = LinearRegression(fit_intercept=True)
                    _lr.fit(X_train, y_train)
                    y_predicted = _lr.predict(X_test)

                    # Prediction first, truth second, for every tolerance --
                    # the same order the non-cross-validated variant uses.
                    accurate_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 0.5))
                    within_5_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 1.5))
                    within_10_array.append(
                        getprecisionWithTorlerate(y_predicted, y_test, 2.5))
                    r_2_array.append(r2_score(y_test, y_predicted))

                print("|", dataFileName, "|",
                      np.mean(accurate_array), "|",
                      np.mean(within_5_array), "|",
                      np.mean(within_10_array), "|",
                      np.mean(r_2_array), "|")