def usLRtoPredict():
    """Fit a LinearRegression per exam/target pair on the full data set and
    print a markdown table of in-sample precision, r^2 and spearman stats.

    NOTE(review): the model is fit and evaluated on the same data, so the
    printed numbers are training-set scores, not generalization estimates.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    target_List = ["programStateRecord"]
    # The data row below prints 7 cells, so header and separator must carry
    # 7 cells each for the markdown table to render (the original separator
    # had only 3 and the header 4, producing a broken table).
    print('|数据名称|0.5|1.5|2.5|r^2|spearman|5|')
    print('|-|-|-|-|-|-|-|')
    for eid in examId_List:
        for target in target_List:
            dataFileName = eid + "-" + target
            _data_matrix, _data_header, _score_array = getData(
                "npsm//" + dataFileName)
            # Use every feature column; column 0 holds the record key.
            _feature_matrix = getSerevalColumn(
                _data_matrix, list(range(1, len(_data_header))))
            _score_array = np.array(_score_array).reshape(len(_score_array), 1)
            _lr = LinearRegression()
            _lr.fit(_feature_matrix, _score_array)
            y_predicted = _lr.predict(_feature_matrix)
            print("|", dataFileName, "|",
                  getprecisionWithTorlerate(_score_array, y_predicted, 0.5), "|",
                  getprecisionWithTorlerate(_score_array, y_predicted, 1.5), "|",
                  getprecisionWithTorlerate(_score_array, y_predicted, 2.5), "|",
                  r2_score(_score_array, y_predicted), "|",
                  spearmanr(_score_array, y_predicted), "|",
                  getprecisionWithTorlerate(_score_array, y_predicted, 5))
def useLRtoPredictScore(targetFileName, exam_mark=None, needNorm=True):
    """Fit a LinearRegression on one feature file and print in-sample
    precision / spearman / r^2 statistics.

    Parameters
    ----------
    targetFileName : str
        Feature file name, joined under *exam_mark*.
    exam_mark : str, optional
        Data-set directory mark; defaults to the module-level DATA_MARK.
    needNorm : bool
        When True, normalize the feature matrix before fitting.
    """
    if exam_mark is None:
        exam_mark = DATA_MARK
    _file_Relative_Path = os.path.join(exam_mark, targetFileName)
    student_data, headerArray = load_data_from_file(_file_Relative_Path)
    _score_map = get_final_score_map()
    # Column 0 of each record is the key used to look up the final score.
    _score_array = [_score_map[record[0]] for record in student_data]
    # All remaining columns are features.
    targetFeatureIndexArray = list(range(1, len(headerArray)))
    featureMatrix = getSerevalColumn(student_data, targetFeatureIndexArray)
    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)
    _lr = LinearRegression(fit_intercept=True)
    _lr.fit(featureMatrix, _score_array)
    y_predicted = _lr.predict(featureMatrix)
    print()
    print(getprecisionWithTorlerate(y_predicted, _score_array, 0.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 1.5),
          getprecisionWithTorlerate(y_predicted, _score_array, 2.5),
          spearmanr(y_predicted, _score_array),
          r2_score(_score_array, y_predicted))
def getDataAndScore(featureFileName, exam_mark=None, needHeader=False):
    """Load a feature file and pair it with the final-score array.

    Parameters
    ----------
    featureFileName : str
        Feature file name, joined under *exam_mark*.
    exam_mark : str, optional
        Data-set directory mark; defaults to the module-level DATA_MARK.
    needHeader : bool
        When True, also return the header array of the feature file.

    Returns
    -------
    (feature_matrix, score_array) or (feature_matrix, score_array, header)
    """
    if exam_mark is None:
        exam_mark = DATA_MARK
    _file_Relative_Path = os.path.join(exam_mark, featureFileName)
    student_data, headerArray = load_data_from_file(_file_Relative_Path)
    # All columns except column 0 (the record key) are features.
    _feature_matrix = getSerevalColumn(
        student_data, list(range(1, len(headerArray))))
    _score_map = get_final_score_map()
    # Column 0 of each record is the key used to look up the final score.
    _score_array = [_score_map[record[0]] for record in student_data]
    if needHeader:
        return _feature_matrix, _score_array, headerArray
    else:
        return _feature_matrix, _score_array
def tryAllFeatureCompositeWithSVM():
    """Exhaustively evaluate every feature-column combination with a
    degree-2 polynomial SVM and print each combination's mean
    cross-validated score (10 repetitions of stratified 5-fold CV)."""
    data_matrix, scoreArray, header = load_routine_data(True)
    classifier = SVC(kernel="poly", decision_function_shape="ovr", degree=2)
    combo_generator = GetColCombination(header)
    # Try combination sizes from a single column up to all columns.
    for combo_size in range(1, len(header) + 1):
        combo_generator.setCompositeNum(combo_size)
        while True:
            found, column_names = combo_generator.getNextComposite()
            if not found:
                break
            print(column_names, end=" : ")
            column_indices = getTargetColumnList(header, column_names)
            subset = getSerevalColumn(data_matrix, column_indices)
            # Repeat CV ten times to smooth out shuffle noise.
            run_means = []
            for _ in range(10):
                fold_scores = cross_val_score(
                    classifier, subset, scoreArray,
                    cv=StratifiedKFold(5, shuffle=True))
                run_means.append(fold_scores.mean())
            print(np.array(run_means).mean())
def usLRtoPredictWithKFold():
    """Cross-validate a single-feature LinearRegression for each exam/target
    pair and print the mean score as a markdown table row.

    Scoring uses the project-level `lr_precision` scorer over a stratified
    5-fold split, repeated 10 times to smooth the shuffle noise.

    NOTE(review): the table header advertises more columns ("5分", "10分")
    than the single mean value printed per row — presumably left over from a
    removed per-tolerance report; confirm the intended columns.
    """
    examId_List = ["e1", "e2", "e3", "e4"]
    target_List = ["programStateRecord"]
    print('|数据名称|预测|5分|10分|')
    print('|-|-|-|')
    for eid in examId_List:
        for target in target_List:
            dataFileName = eid + "-" + target
            _data_matrix, _data_header, _score_array = getData(
                "npsm//" + dataFileName)
            # Only feature column 8 is used in this variant.
            _feature_matrix = getSerevalColumn(_data_matrix, [8])
            _score_array = np.array(_score_array).reshape(len(_score_array), 1)
            _lr = LinearRegression(fit_intercept=True)
            precision_array = []
            for _ in range(10):
                _scores = model_selection.cross_val_score(
                    _lr, _feature_matrix, _score_array,
                    cv=model_selection.StratifiedKFold(5, shuffle=True),
                    scoring=lr_precision)
                precision_array.append(_scores.mean())
            print("|", dataFileName, "|",
                  np.array(precision_array).mean(), "|")
def searchAddOneFeatureOneTime(mark):
    """Greedy forward feature evaluation: grow the feature set one column at
    a time (in the fixed order below), score each prefix with the SVM
    predictor, print the results and plot precision vs. feature count.

    Parameters
    ----------
    mark : str
        Data-set mark forwarded to getDataAndScore.
    """
    # Candidate features in evaluation order. Per the original notes, columns
    # that hurt precision in earlier manual rounds (e.g. "totalLength",
    # "pasteCount") were already removed from this list.
    feature_array = [
        "saveInterval", "programTime", "codeIntervalCount", "saveCount",
        "buildInterval", "score", "codeTime", "successCount", "testCount",
        "scoreRemainMiddle", "avgRemoveErrorTime", "failCount", "scoreDown",
        "keepError", "generateError", "codeBE", "scoreRemainHigh",
    ]
    dataArray, scoreArray, headerArray = getDataAndScore(
        "concatfeature", mark, needHeader=True)
    # Drop the key-column header so name lookups align with the feature
    # matrix, which excludes column 0.
    del headerArray[0]
    x_array = []
    y_array = []
    # Add one feature at a time until the whole list is used.
    for size in range(1, len(feature_array) + 1):
        chosen_features = feature_array[:size]
        indexList = getTargetColumnList(headerArray, chosen_features)
        featureMatrix = getSerevalColumn(dataArray, indexList)
        precision = useSVMtoPredictScore(featureMatrix, scoreArray)
        print("%d : %.4f" % (size, precision))
        x_array.append(size)
        y_array.append(precision)
    plt.figure()
    plt.plot(x_array, y_array)
    plt.show()