def usLRtoPredictWithExpDef():
    """Fit a linear regression on grade buckets per data file and print accuracy.

    For every algorithm / exam / target combination the file
    ``<algorithm>//<exam>-<algorithm>-<target>`` is loaded through
    ``getStudentData``.  Column 2 is translated to a grade with the map
    returned by ``getGradeMap()``, column 1 is the single regressor, and the
    model is evaluated on the same rows it was fitted on.  One markdown table
    row is printed per data file.

    Raises:
        KeyError: if a value in column 2 is not a key of the grade map.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["finalscore"]
    grade_map = getGradeMap()
    print('|数据名称|预测准确率|')
    print('|-|-|')
    for al in algorithms:
        for eid in exam_ids:
            for target in targets:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)
                # Grades come solely from the grade map; the final-score map
                # is not needed by this function, so it is not loaded here.
                grades = [grade_map[raw]
                          for raw in getOneColumn(student_data, 2)]
                y = np.array(grades).reshape(len(grades), 1)
                feature = getOneColumn(student_data, 1)
                X = np.array(feature).reshape(len(feature), 1)
                model = LinearRegression(fit_intercept=True)
                model.fit(X, y)
                predicted = model.predict(X)
                print("|", data_file_name, "|",
                      getprecisionWithTorlerate(predicted, y, 0.5), "|")
def scorefinalScoreWithExpDef():
    """Print the regression score of feature column 1 against grade buckets.

    Iterates algorithm -> target -> exam, loads each
    ``<algorithm>//<exam>-<algorithm>-<target>`` file with ``getStudentData``,
    maps column 2 through ``getGradeMap()`` and prints, as a markdown table
    row, the value returned by ``LinearRegression.fit(...).score(...)`` on the
    same data.  A single ``LinearRegression`` instance is re-fitted for every
    file.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["finalscore"]
    grade_map = getGradeMap()
    print('|数据名称|r^2|')
    print('|-|-|')
    model = LinearRegression()
    for al in algorithms:
        for target in targets:
            for eid in exam_ids:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)
                grades = [grade_map[raw]
                          for raw in getOneColumn(student_data, 2)]
                y = np.array(grades).reshape(len(grades), 1)
                feature = getOneColumn(student_data, 1)
                X = np.array(feature).reshape(len(feature), 1)
                fitted = model.fit(X, y)
                r_squared = fitted.score(X, y)
                print("|", data_file_name, "|", r_squared, "|")
def calculateCorrelation(data_name):
    """Print ``pearsonr`` of every feature column against the score array.

    ``getData(data_name)`` supplies the data matrix, its header and the score
    array.  Column 0 is skipped; for each remaining column the header name and
    the raw ``pearsonr`` result are printed on one line.
    """
    matrix, headers, scores = getData(data_name)
    for col in range(1, len(headers)):
        column_values = getOneColumn(matrix, col)
        print(headers[col], pearsonr(column_values, scores))
def calculateMIC(dataFileArray, data_mark=None, neadNorm=False):
    """Print the features whose MIC against the final score is above average.

    Every file in ``dataFileArray`` (resolved under ``data_mark``, which
    defaults to ``DATA_MARK``) is loaded; each feature column (column 0 is
    used as the key into the final-score map) is scored against the students'
    final scores with ``MINE().compute_score`` / ``mic()``.  Results from all
    files are collected in one dict keyed by header name, so a header that
    occurs in several files keeps the value from the last one.

    After all files are processed, the features are printed in descending MIC
    order, restricted to those strictly above the mean MIC.

    :param dataFileArray: iterable of data file names.
    :param data_mark: directory prefix; ``DATA_MARK`` when ``None``.
    :param neadNorm: when true, both the score array and every feature column
        are passed through ``normizeDataSet`` first.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    mic_by_feature = {}
    for data_file in dataFileArray:
        path = os.path.join(data_mark, data_file)
        student_data, headers = load_data_from_file(path)
        final_scores = get_final_score_map()
        scores = [final_scores[record[0]] for record in student_data]
        if neadNorm:
            scores = normizeDataSet(scores)
        mine = MINE()
        for col in range(1, len(headers)):
            values = getOneColumn(student_data, col)
            if neadNorm:
                values = normizeDataSet(values)
            mine.compute_score(values, scores)
            mic_by_feature[headers[col]] = mine.mic()
    ranked = sorted(mic_by_feature.items(),
                    key=lambda item: item[1], reverse=True)
    threshold = np.mean(list(mic_by_feature.values()))
    for header, mic in ranked:
        if mic > threshold:
            print(header, mic)
def calculateMean(data_mark=None):
    """Print the per-fold mean of the ``saveInterval`` and ``score`` features.

    Loads ``<data_mark>/concatfeature`` (``data_mark`` defaults to
    ``DATA_MARK``), looks up each student's fold in the final-score map using
    column 0 as key, groups the feature values by fold and prints, for folds
    ``0 .. SCORE_FOLD - 1``, the fold index and its mean formatted to two
    decimals.

    Raises:
        KeyError: if a student is missing from the final-score map, or if a
            fold in ``range(SCORE_FOLD)`` has no students.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    path = os.path.join(data_mark, "concatfeature")
    student_data, headers = load_data_from_file(path)
    final_scores = get_final_score_map()
    folds = [final_scores[record[0]] for record in student_data]
    for col in range(1, len(headers)):
        if headers[col] not in ["saveInterval", "score"]:
            continue
        values = getOneColumn(student_data, col)
        # Split the feature values by score fold.
        by_fold = {}
        for row, fold in enumerate(folds):
            by_fold.setdefault(fold, []).append(values[row])
        print(headers[col])
        for fold in range(SCORE_FOLD):
            print(fold, "%.2f" % np.array(by_fold[fold]).mean())
def getGradeMap():
    """Return a dict mapping each distinct final score to a grade 0-3.

    Column 1 of the ``"finalscore"`` data is sorted ascending.  A score's
    grade is decided by the rank of its first occurrence in that ordering:
    rank below 40% of the record count gives 0, below 50% gives 1, below 70%
    gives 2, anything else 3.
    """
    scores = sorted(getOneColumn(getStudentData("finalscore"), 1))
    total = len(scores)
    cutoffs = [float(percent) * total / 100 for percent in (40, 50, 70)]
    grade_of = {}
    for rank, score in enumerate(scores):
        if score in grade_of:
            # Only the first (lowest-rank) occurrence decides the grade.
            continue
        # The cutoffs are ascending, so the number of cutoffs already reached
        # is exactly the grade index.
        grade_of[score] = sum(1 for cutoff in cutoffs if rank >= cutoff)
    return grade_of
def scoreAll():
    """Print the regression score of feature column 1 against final scores.

    Iterates algorithm -> target -> exam, loads each
    ``<algorithm>//<exam>-<algorithm>-<target>`` file with ``getStudentData``
    and looks up every student's final score by ``str(row[0])``.  The value
    returned by ``LinearRegression.fit(...).score(...)`` on the same data is
    printed as a markdown table row.  A single ``LinearRegression`` instance
    is re-fitted for every file.

    Raises:
        KeyError: if a student id is missing from the final-score map.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["score", "finalscore"]
    print('|数据名称|r^2|')
    print('|-|-|')
    model = LinearRegression()
    # The final-score map does not depend on the data file, so load it once
    # instead of once per algorithm/target/exam combination.
    final_score_map = get_final_score_map()
    for al in algorithms:
        for target in targets:
            for eid in exam_ids:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)
                scores = [final_score_map[str(line[0])]
                          for line in student_data]
                y = np.array(scores).reshape(len(scores), 1)
                feature = getOneColumn(student_data, 1)
                X = np.array(feature).reshape(len(feature), 1)
                r_squared = model.fit(X, y).score(X, y)
                print("|", data_file_name, "|", r_squared, "|")
def calculate_pearson():
    """Print Pearson and Spearman statistics of every routine-data column.

    ``load_routine_data(True)`` supplies the data matrix, the score array and
    the header.  For each header entry one line is printed: the header name,
    the two values returned by ``pearsonr`` and the two values returned by
    ``spearmanr`` for that column against the score array.
    """
    data_matrix, scores, header = load_routine_data(True)
    for col, name in enumerate(header):
        column = getOneColumn(data_matrix, col)
        pearson_r, pearson_p = pearsonr(column, scores)
        spearman_r, spearman_p = spearmanr(column, scores)
        print(name, pearson_r, pearson_p, spearman_r, spearman_p)
def normizeMatrixZscore(matrix):
    """Normalise every column of ``matrix`` in place and return it.

    Each column is extracted with ``getOneColumn``, passed through
    ``normizeDataSetZscore`` and written back row by row, so the caller's
    ``matrix`` object is mutated.

    :param matrix: sequence of equally long, mutable rows.
    :return: the same ``matrix`` object; an empty matrix is returned
        unchanged instead of raising ``IndexError``.
    """
    if len(matrix) == 0:
        # No rows means no columns to inspect; ``matrix[0]`` would fail.
        return matrix
    # Imported at call time, as in the original implementation.
    from common.DataHelper import getOneColumn
    for col in range(len(matrix[0])):
        normalized = normizeDataSetZscore(getOneColumn(matrix, col))
        for row_index, row in enumerate(matrix):
            row[col] = normalized[row_index]
    return matrix
def draw_scatter():
    """Show one scatter plot per routine-data column against the score array.

    ``load_routine_data(True)`` supplies the data matrix, the score array and
    the header.  Each column gets its own figure, titled with the header
    name, and is displayed with ``plt.show()``.
    """
    data_matrix, scores, header = load_routine_data(True)
    for col, name in enumerate(header):
        column = getOneColumn(data_matrix, col)
        plt.figure()
        plt.scatter(column, scores)
        plt.title(name)
        plt.show()
def showWatwinAndScore():
    """Scatter-plot columns 1 and 2 of the module-level ``_student_data``.

    The x axis is the 1-based record position.  Column 1 is drawn with red
    ``x`` markers and column 2 with cyan ``+`` markers in the same figure,
    which is then shown.
    """
    first_series = getOneColumn(_student_data, 1)
    second_series = getOneColumn(_student_data, 2)
    positions = np.arange(1, len(_student_data) + 1)
    plt.figure()
    # Draw both series as scatter plots on the shared x axis.
    plt.scatter(positions, first_series, c='r', marker='x')
    plt.scatter(positions, second_series, color='c', marker='+')
    # Display the finished figure.
    plt.show()
def drawDataDistribution(dataFileName, data_mark=None):
    """Save a bar-chart histogram for every feature column of a data file.

    The value range of each feature (columns 1 and up) is split into
    ``_BOX_COUNT`` equally wide bins.  A value is counted in the first bin
    whose upper edge is greater than or equal to it; values matched by no bin
    are added to the last bin.  Each chart is saved as
    ``OUT_ROOT_PATH/<data_mark>/distribution/<feature name>``.

    :param dataFileName: file name below ``data_mark``.
    :param data_mark: directory prefix; ``DATA_MARK`` when ``None``.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, dataFileName))
    bin_positions = range(_BOX_COUNT)
    for col in range(1, len(headerArray)):
        values = getOneColumn(student_data, col)
        highest, lowest = getMaxAndMin(values)
        bin_width = (highest - lowest) / _BOX_COUNT

        # Build the upper edges and axis labels by accumulating the width.
        labels = []
        upper_edges = []
        edge = lowest
        for _ in bin_positions:
            lower, edge = edge, edge + bin_width
            upper_edges.append(edge)
            labels.append("[%.2f,%.2f)" % (lower, edge))

        counts = [0] * _BOX_COUNT
        for value in values:
            for position, upper in enumerate(upper_edges):
                if value <= upper:
                    counts[position] += 1
                    break
        # Values that matched no bin are assigned to the last bin.
        counts[_BOX_COUNT - 1] += len(values) - sum(counts)

        plot.bar(bin_positions, counts)
        plot.xticks(bin_positions, labels, rotation=10, fontsize=8)
        for position, count in zip(bin_positions, counts):
            plot.text(position, count + 0.05, str(count),
                      ha='center', va='bottom')
        title = headerArray[col]
        plot.title(title)
        parentPath = OUT_ROOT_PATH + "/" + data_mark + "/distribution/"
        checkThenMkdirs(parentPath)
        plot.savefig(parentPath + title)
        plot.clf()
def useKMeansToPredict(featureFileName, exam_mark=None):
    """Cluster selected features with KMeans and print a 2x2 confusion count.

    Only the features ``saveInterval``, ``score`` and ``scoreRemainMiddle``
    are processed, each with its own cluster count.  Cluster labels are
    replaced by the rank of their centre (smallest centre -> 0) and compared
    with the true grade taken from the module-level ``score_map`` entry for
    that feature.  For every feature one line is printed with the counts of
    (true 1, predicted 1), (true 1, predicted 0), (true 0, predicted 1) and
    (true 0, predicted 0); other grade combinations are not counted.

    :param featureFileName: forwarded to ``getDataAndScore``.
    :param exam_mark: forwarded to ``getDataAndScore``.
    """
    featureMatrix, scoreCol, headerArray = getDataAndScore(
        featureFileName, exam_mark, needHeader=True)
    cluster_counts = {"saveInterval": 2, "score": 3, "scoreRemainMiddle": 2}
    for col, feature_name in enumerate(headerArray):
        if feature_name not in cluster_counts:
            continue
        kmeans = KMeans(n_clusters=cluster_counts[feature_name])
        column = getOneColumn(featureMatrix, col)
        samples = np.array(column).reshape(len(column), 1)
        kmeans.fit(samples)

        # Build the prediction model: cluster label -> rank of its centre.
        centers = kmeans.cluster_centers_
        ordered_centers = sorted(centers.reshape(len(centers)).tolist())
        rank_of_label = {}
        for label, center in enumerate(centers):
            rank_of_label[label] = ordered_centers.index(center[0])

        # Predict.
        predicted_labels = kmeans.predict(samples)

        # Tally the predictions against the true grades.
        grade_of_score = score_map[feature_name]
        true_pos = true_neg_pred = false_pos = false_neg_pred = 0
        for row in range(len(predicted_labels)):
            predicted_grade = rank_of_label[predicted_labels[row]]
            true_grade = grade_of_score[scoreCol[row]]
            if true_grade == 1 and predicted_grade == 1:
                true_pos += 1
            if true_grade == 1 and predicted_grade == 0:
                true_neg_pred += 1
            if true_grade == 0 and predicted_grade == 1:
                false_pos += 1
            if true_grade == 0 and predicted_grade == 0:
                false_neg_pred += 1
        print(feature_name, " : ",
              true_pos, true_neg_pred, false_pos, false_neg_pred)
def drawDataWithScorePic(dataFileName, needNorminize=False, out_mark=None):
    """Save a score-versus-feature scatter plot for every feature column.

    The data file is read from below ``DATA_MARK``; each student's score is
    looked up in the final-score map by column 0.  Plots are written to
    ``OUT_ROOT_PATH/<DATA_MARK>/scatterWithScore/``.

    :param dataFileName: file name below ``DATA_MARK``.
    :param needNorminize: when true, the scores and every feature column are
        passed through ``normizeDataSet`` and ``-nominized`` is appended to
        the plot title.
    :param out_mark: optional suffix appended to the output file name only
        (the plot title is not affected).
    :return: None
    """
    student_data, headerArray = load_data_from_file(
        os.path.join(DATA_MARK, dataFileName))
    final_scores = get_final_score_map(None)
    scores = [final_scores[record[0]] for record in student_data]
    if needNorminize:
        scores = normizeDataSet(scores)

    # Walk over every feature column.
    for col in range(1, len(headerArray)):
        feature_name = headerArray[col]
        values = getOneColumn(student_data, col)
        if needNorminize:
            values = normizeDataSet(dataSetA=values)
        plot.scatter(scores, values, s=2)

        title = feature_name + "-score"
        if needNorminize:
            title += "-nominized"
        plot.title(title)
        plot.xlabel("score")
        plot.ylabel(feature_name)

        parentPath = OUT_ROOT_PATH + "/" + DATA_MARK + "/scatterWithScore/"
        checkThenMkdirs(parentPath)
        # The suffix is added after plot.title(), so it only changes the
        # saved file name.
        if out_mark is not None:
            title += "-" + out_mark
        plot.savefig(parentPath + title)
        plot.clf()
def useObserveToPredict(featureFileName, exam_mark=None):
    """Predict grades from fixed percentile cut points and print the accuracy.

    For each of the features ``saveInterval``, ``score``,
    ``scoreRemainMiddle`` and ``scoreUp`` the configured percentages are
    turned into thresholds by indexing into the sorted feature values.  A
    record is assigned the index of the first threshold it does not exceed,
    or the number of thresholds when it exceeds them all.  The share of
    records whose prediction equals the true grade (from the module-level
    ``score_map`` entry of that feature) is printed with four decimals.

    :param featureFileName: forwarded to ``getDataAndScore``.
    :param exam_mark: forwarded to ``getDataAndScore``.
    """
    featureMatrix, scoreCol, headerArray = getDataAndScore(
        featureFileName, exam_mark, needHeader=True)
    percentile_cuts = {
        "saveInterval": [60],
        "score": [40, 80],
        "scoreRemainMiddle": [80],
        "scoreUp": [40, 80]
    }
    for col, feature_name in enumerate(headerArray):
        if feature_name not in percentile_cuts:
            continue
        column = getOneColumn(featureMatrix, col)
        samples = np.array(column).reshape(len(column), 1)

        # Build the prediction model: replace each percentage, in place, by
        # the feature value found at that position of the sorted samples.
        ordered = sorted(samples)
        thresholds = percentile_cuts[feature_name]
        thresholds[:] = [
            ordered[(int)(percent / 100 * len(ordered))]
            for percent in thresholds
        ]

        # Predict: index of the first threshold that is not exceeded.
        predicted = [
            next((position
                  for position, threshold in enumerate(thresholds)
                  if sample <= threshold),
                 len(thresholds))
            for sample in samples
        ]

        # Measure the accuracy.
        hits = 0.0
        grade_of_score = score_map[feature_name]
        for row in range(len(samples)):
            if grade_of_score[scoreCol[row]] == predicted[row]:
                hits += 1
        print(feature_name, "%.4f" % (hits / len(samples)))
def split_score_to_k_fold(N=10):
    """Replace every final score by its fold index and write the result.

    Column 1 of the ``"finalscore"`` data is sorted ascending and cut into
    ``N`` equally sized rank ranges.  Each distinct score is assigned the
    fold of its first (lowest-rank) occurrence, so equal scores always share
    a fold.  The matrix, with column 1 replaced by the fold index, is written
    as comma-separated text to ``OUT_ROOT_PATH/finalscore-<N>``, preceded by
    the header line and the header-type line.

    :param N: number of folds.
    :raises ZeroDivisionError: if ``N`` is 0.
    """
    matrix, headers, header_types = load_data_from_file(
        "finalscore", needType=True)
    ordered_scores = sorted(getOneColumn(matrix, 1))
    fold_size = len(ordered_scores) / N

    # Membership in the dict, rather than comparison with a "previous score"
    # sentinel, so that no real score value (e.g. -1) can be skipped.
    fold_of_score = {}
    for rank, score in enumerate(ordered_scores):
        if score not in fold_of_score:
            fold_of_score[score] = int(rank / fold_size)

    for line in matrix:
        line[1] = fold_of_score[line[1]]

    out_file_path = os.path.join(OUT_ROOT_PATH, "finalscore-" + str(N))
    # The context manager closes the file even if a write fails.
    with open(out_file_path, "w") as out_file:
        out_file.write(",".join(headers) + "\n")
        out_file.write(",".join(header_types) + "\n")
        for line in matrix:
            out_file.write(",".join(str(value) for value in line) + "\n")
def calculateT(data_mark=None):
    """Print the pairs of score folds whose feature means differ significantly.

    Loads ``<data_mark>/concatfeature``, groups every feature's values by the
    student's fold (final-score map keyed by column 0) and, for each pair of
    folds in ``range(SCORE_FOLD)``, runs ``levene`` to choose between the
    equal-variance and unequal-variance form of ``ttest_ind``.  The feature
    name is always printed; a fold pair is printed only when the t-test
    p-value is at most 0.05.

    :param data_mark: directory prefix; ``DATA_MARK`` when ``None``.
    :return: None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))
    final_scores = get_final_score_map()
    folds = [final_scores[record[0]] for record in student_data]
    for col in range(1, len(headerArray)):
        values = getOneColumn(student_data, col)
        # Split the feature values by score fold.
        by_fold = {}
        for row, fold in enumerate(folds):
            by_fold.setdefault(fold, []).append(values[row])
        print(headerArray[col])
        for first in range(SCORE_FOLD):
            for second in range(first + 1, SCORE_FOLD):
                group_a = by_fold[first]
                group_b = by_fold[second]
                _, levene_p = levene(group_a, group_b)
                # Variances are treated as equal unless levene rejects at 5%.
                same_variance = not (levene_p <= 0.05)
                _, p_value = ttest_ind(group_a, group_b,
                                       equal_var=same_variance)
                if p_value <= 0.05:
                    print(first, second)
def usePearsonrCalAll():
    """Print ``pearsonr`` of final score versus feature column 1 per data file.

    Iterates algorithm -> target -> exam, loads each
    ``<algorithm>//<exam>-<algorithm>-<target>`` file with ``getStudentData``
    and looks up every student's final score by ``str(row[0])``.  The two
    values returned by ``pearsonr(scores, column 1)`` are printed as a
    markdown table row.

    Raises:
        KeyError: if a student id is missing from the final-score map.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["score", "finalscore"]
    print('|数据名称|相关系数|p|')
    print('|-|-|-|')
    # The final-score map does not depend on the data file, so load it once
    # instead of once per algorithm/target/exam combination.
    final_score_map = get_final_score_map()
    for al in algorithms:
        for target in targets:
            for eid in exam_ids:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)
                scores = [final_score_map[str(line[0])]
                          for line in student_data]
                feature = getOneColumn(student_data, 1)
                coefficient, p_value = pearsonr(scores, feature)
                print("|", data_file_name, "|", coefficient, "|",
                      p_value, "|")
def calculateSpearman(dataFileName, data_mark=None, neadNorm=False):
    """Print ``spearmanr`` of every feature column against the final score.

    :param dataFileName: file name below ``data_mark``.
    :param data_mark: directory prefix; ``DATA_MARK`` when ``None``.
    :param neadNorm: when true, the score array and every feature column are
        passed through ``normizeDataSet`` first.

    For each feature (columns 1 and up) one line is printed: the header name
    followed by the two values returned by ``spearmanr``.
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headers = load_data_from_file(
        os.path.join(data_mark, dataFileName))
    final_scores = get_final_score_map()
    scores = [final_scores[record[0]] for record in student_data]
    if neadNorm:
        scores = normizeDataSet(scores)
    for col in range(1, len(headers)):
        values = getOneColumn(student_data, col)
        if neadNorm:
            values = normizeDataSet(values)
        coefficient, p_value = spearmanr(values, scores)
        print(headers[col], coefficient, p_value)
def calculateF(data_mark=None):
    """Print the features whose values differ significantly across score folds.

    Loads ``<data_mark>/concatfeature`` and groups every feature's values by
    the student's fold (final-score map keyed by column 0).  A feature whose
    groups are rejected by ``levene`` (p <= 0.05) is skipped silently;
    otherwise ``f_oneway`` is run over the groups and the feature name, the
    statistic and the p-value are printed when that p-value is at most 0.05.

    :param data_mark: directory prefix; ``DATA_MARK`` when ``None``.
    :return: None
    """
    if data_mark is None:
        data_mark = DATA_MARK
    student_data, headerArray = load_data_from_file(
        os.path.join(data_mark, "concatfeature"))
    final_scores = get_final_score_map()
    folds = [final_scores[record[0]] for record in student_data]
    for col in range(1, len(headerArray)):
        values = getOneColumn(student_data, col)
        # Split the feature values by score fold.
        by_fold = {}
        for row, fold in enumerate(folds):
            by_fold.setdefault(fold, []).append(values[row])
        groups = list(by_fold.values())
        _, levene_p = levene(*groups)
        if levene_p <= 0.05:
            # Levene rejected the groups at 5%; no F-test for this feature.
            continue
        f_value, p_value = f_oneway(*groups)
        if p_value <= 0.05:
            print(headerArray[col], f_value, p_value)
def usLRtoPredict():
    """Fit a linear regression per data file and print in-sample metrics.

    For every algorithm / exam / target combination the file
    ``<algorithm>//<exam>-<algorithm>-<target>`` is loaded through
    ``getStudentData``; each student's final score is looked up by
    ``str(row[0])`` and regressed on column 1.  The model is evaluated on the
    rows it was fitted on.  Each markdown table row contains the data file
    name, ``getprecisionWithTorlerate`` at tolerances 0.5, 1.5 and 2.5, the
    ``r2_score`` and the ``spearmanr`` result.

    The header and separator rows list one cell per printed value so the
    output forms a well-formed markdown table.

    Raises:
        KeyError: if a student id is missing from the final-score map.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["finalscore"]
    # Six cells per row: name, three tolerances, r^2 and spearman.
    print('|数据名称|预测|5分|10分|r^2|spearman|')
    print('|-|-|-|-|-|-|')
    # The final-score map does not depend on the data file, so load it once.
    final_score_map = get_final_score_map()
    for al in algorithms:
        for eid in exam_ids:
            for target in targets:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)
                scores = [final_score_map[str(line[0])]
                          for line in student_data]
                y = np.array(scores).reshape(len(scores), 1)
                feature = getOneColumn(student_data, 1)
                X = np.array(feature).reshape(len(feature), 1)
                model = LinearRegression(fit_intercept=True)
                model.fit(X, y)
                predicted = model.predict(X)
                print("|", data_file_name, "|",
                      getprecisionWithTorlerate(predicted, y, 0.5), "|",
                      getprecisionWithTorlerate(predicted, y, 1.5), "|",
                      getprecisionWithTorlerate(predicted, y, 2.5), "|",
                      r2_score(y, predicted), "|",
                      spearmanr(predicted, y), "|")
def usLRtoPredictWithKFold():
    """Cross-validate a linear regression per data file and print mean metrics.

    For every algorithm / exam / target combination the file
    ``<algorithm>//<exam>-<algorithm>-<target>`` is loaded through
    ``getStudentData``; each student's final score is looked up by
    ``str(row[0])`` and regressed on column 1 using a shuffled 10-fold
    ``KFold`` split.  Each markdown table row contains the data file name and
    the mean over the folds of ``getprecisionWithTorlerate`` at tolerances
    0.5, 1.5 and 2.5 and of ``r2_score``.

    The split is shuffled without a fixed seed, so results vary between runs.

    Raises:
        KeyError: if a student id is missing from the final-score map.
    """
    exam_ids = ["e1", "e2", "e3", "e4"]
    algorithms = ["EQ", "Watwin"]
    targets = ["finalscore"]
    # Five cells per row: name, three tolerances and r^2.
    print('|数据名称|预测|5分|10分|r^2|')
    print('|-|-|-|-|-|')
    # The final-score map does not depend on the data file, so load it once.
    final_score_map = get_final_score_map()
    for al in algorithms:
        for eid in exam_ids:
            for target in targets:
                data_file_name = eid + "-" + al + "-" + target
                student_data = getStudentData(al + "//" + data_file_name)
                scores = [final_score_map[str(line[0])]
                          for line in student_data]
                y = np.array(scores).reshape(len(scores), 1)
                feature = getOneColumn(student_data, 1)
                X = np.array(feature).reshape(len(feature), 1)

                kf = KFold(n_splits=10, shuffle=True)
                exact = []
                within_5 = []
                within_10 = []
                r_squared = []
                for train_indices, test_indices in kf.split(X):
                    X_train = [X[i] for i in train_indices]
                    y_train = [y[i] for i in train_indices]
                    X_test = [X[i] for i in test_indices]
                    y_test = [y[i] for i in test_indices]

                    model = LinearRegression(fit_intercept=True)
                    model.fit(X_train, y_train)
                    predicted = model.predict(X_test)

                    # Same (predicted, actual, tolerance) argument order for
                    # every tolerance, matching the other callers.
                    exact.append(
                        getprecisionWithTorlerate(predicted, y_test, 0.5))
                    within_5.append(
                        getprecisionWithTorlerate(predicted, y_test, 1.5))
                    within_10.append(
                        getprecisionWithTorlerate(predicted, y_test, 2.5))
                    r_squared.append(r2_score(y_test, predicted))

                print("|", data_file_name, "|",
                      np.array(exact).mean(), "|",
                      np.array(within_5).mean(), "|",
                      np.array(within_10).mean(), "|",
                      np.array(r_squared).mean(), "|")