def useDecisionTreeToClassify(featureFileName, exam_mark=None, needOutputpdf=False, max_depth=None):
    """Fit a decision tree classifier on a feature file and optionally render the tree.

    :param featureFileName: feature file name forwarded to getDataAndScore
    :param exam_mark: exam identifier forwarded to getDataAndScore
    :param needOutputpdf: when True, export the fitted tree to a PDF via graphviz
    :param max_depth: maximum tree depth (None = unlimited)
    """
    _feature_matrix, _score_array, headerArray = getDataAndScore(
        featureFileName, exam_mark, needHeader=True)
    _dt = DecisionTreeClassifier(max_depth=max_depth, min_samples_leaf=2)
    _dt.fit(_feature_matrix, _score_array)
    # Render the tree image.
    if needOutputpdf:
        # headerArray[0] is not a feature column, hence the offset of 1.
        feature_names = [headerArray[i] for i in range(1, len(headerArray))]
        # BUG FIX: feature_names was previously computed but discarded
        # (feature_names=None was passed), so nodes were labelled X[i].
        dot_data = export_graphviz(_dt, out_file=None,
                                   feature_names=feature_names,
                                   filled=True, rounded=True,
                                   special_characters=True)
        graph = graphviz.Source(dot_data, directory='out/')
        # BUG FIX: the undefined name `mark` raised NameError here;
        # the intended variable is `exam_mark`.
        graph.render(exam_mark + "-decisionTree")
def useElaticNettoPredictScoreWithKFold(targetFileName, exam_mark=None, needNorm=True):
    """Evaluate ElasticNetCV with 10 repeats of stratified 5-fold CV and print the mean score."""
    if exam_mark is None:
        exam_mark = DATA_MARK
    featureMatrix, _score_array = getDataAndScore(targetFileName, exam_mark)
    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)
    regressor = ElasticNetCV(
        alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
        l1_ratio=[1e-4, .01, .1, .5, .9, .99],
        max_iter=5000,
        cv=model_selection.StratifiedKFold(5, shuffle=True))
    run_means = []
    for _ in range(10):
        fold_scores = model_selection.cross_val_score(
            regressor, featureMatrix, _score_array,
            cv=model_selection.StratifiedKFold(5, shuffle=True),
            scoring=lr_precision)
        run_means.append(fold_scores.mean())
    print(np.array(run_means).mean())
def useGaussianNBtoPredictScore(featureFileName, exam_mark=None):
    """Print mean accuracy of Gaussian Naive Bayes over 10 repeats of stratified 5-fold CV."""
    featureMatrix, scoreCol = getDataAndScore(featureFileName, exam_mark)
    run_means = []
    for _ in range(10):
        model = GaussianNB()
        fold_scores = cross_val_score(model, featureMatrix, scoreCol,
                                      cv=StratifiedKFold(5, shuffle=True))
        run_means.append(fold_scores.mean())
    print(np.array(run_means).mean())
def useKNNtoPredictScore(featureFileName, exam_mark=None, neighbour=3):
    """Print mean accuracy of k-nearest-neighbours over 10 repeats of stratified 5-fold CV.

    :param neighbour: number of neighbours for the classifier
    """
    featureMatrix, scoreCol = getDataAndScore(featureFileName, exam_mark)
    # KNN is distance based, so standardize the features first.
    featureMatrix = StandardScaler().fit_transform(featureMatrix)
    run_means = []
    for _ in range(10):
        model = KNeighborsClassifier(neighbour)
        fold_scores = cross_val_score(model, featureMatrix, scoreCol,
                                      cv=StratifiedKFold(5, shuffle=True))
        run_means.append(fold_scores.mean())
    print(np.array(run_means).mean())
def getRefResult(exam_mark):
    """Print (as a bracketed list) the header names of the features RFE ranked 1.

    Since the selected features are nearly identical across runs, a single
    ranking run is used directly.
    """
    dataMatrix, scoreArray, header = getDataAndScore("concatfeature", exam_mark, needHeader=True)
    ranking = useREFToSelectFeature(exam_mark)
    print("[", end=" ")
    for pos, rank in enumerate(ranking):
        if rank == 1:
            # header[0] is not a feature column, hence the +1 offset.
            print(" \"" + header[pos + 1] + "\",", end="")
    print("]")
def useREFToSelectFeature(examMark):
    """Rank features with RFE driven by a random forest; return the ranking array (1 = selected)."""
    dataMatrix, scoreArray = getDataAndScore("concatfeature", examMark)
    forest = RandomForestClassifier(n_estimators=1000, max_depth=None)
    selector = RFE(estimator=forest, step=1)
    selector.fit(dataMatrix, scoreArray)
    return selector.ranking_
def useSVMtoPredictScore(featureFileName, exam_mark=None, kernel="rbf", decision_function_shape="ovr"):
    """Evaluate an SVC with 10 repeats of stratified 5-fold CV; print and return the mean accuracy.

    :param featureFileName: either a feature file name (loaded via getDataAndScore)
        or an already-built feature matrix — in the latter case ``exam_mark`` is
        taken to be the matching score array (this is the form
        searchAddOneFeatureOneTime uses).
    :param kernel: SVC kernel
    :param decision_function_shape: SVC multi-class strategy
    :return: mean accuracy over the 10 repeats

    BUG FIX: previously only printed the mean and returned None, although
    searchAddOneFeatureOneTime consumes the return value.
    """
    if isinstance(featureFileName, str):
        featureMatrix, scoreCol = getDataAndScore(featureFileName, exam_mark)
    else:
        # Generalization: allow a (featureMatrix, scoreArray) pair directly.
        featureMatrix, scoreCol = featureFileName, exam_mark
    featureMatrix = StandardScaler().fit_transform(featureMatrix)
    precision_array = []
    for _ in range(10):
        _svc = SVC(kernel=kernel,
                   decision_function_shape=decision_function_shape,
                   degree=2)
        score_array = cross_val_score(_svc, featureMatrix, scoreCol,
                                      cv=StratifiedKFold(5, shuffle=True))
        precision_array.append(score_array.mean())
    mean_precision = np.array(precision_array).mean()
    print(mean_precision)
    return mean_precision
def useKMeansToPredict(featureFileName,exam_mark=None):
    """Cluster selected feature columns with KMeans and print a 2x2 confusion
    count (t_t, t_f, f_t, f_f) of the cluster-derived grade vs the true grade.

    :param featureFileName: feature file name forwarded to getDataAndScore
    :param exam_mark: exam identifier forwarded to getDataAndScore
    """
    featureMatrix, scoreCol,headerArray= getDataAndScore(featureFileName,exam_mark,needHeader=True);
    precision_array = [];  # NOTE(review): never filled or read — appears to be dead
    # Feature name -> number of clusters to fit for that column.
    clusterMap = {"saveInterval":2,"score":3,"scoreRemainMiddle":2}
    for _index in range(0,headerArray.__len__()):
        if headerArray[_index] in clusterMap :
            clf = KMeans(n_clusters=clusterMap[headerArray[_index]]);
            dataArray = getOneColumn(featureMatrix,_index);
            # KMeans expects a 2-D array: one sample per row, single feature.
            dataArray = np.array(dataArray).reshape(dataArray.__len__(),1);
            clf.fit(dataArray);
            # Build the prediction model: remap each cluster label to the rank
            # of its center value, so 0 = smallest center, 1 = next, ...
            center_array = clf.cluster_centers_
            label_map = {} ;
            for centerIndex,center in enumerate(center_array):
                label_map[centerIndex] = center[0];
            center_array =center_array.reshape(center_array.__len__());
            center_array = sorted(center_array.tolist());
            for label in label_map :
                label_map[label] = center_array.index(label_map[label]);
            # Predict a cluster label for every record.
            grade_predict_array = clf.predict(dataArray);
            # Measure accuracy against the grade derived from the raw score.
            # NOTE(review): score_map presumably maps feature name ->
            # {score: grade in {0, 1}} — verify against its definition.
            current_score_map = score_map[headerArray[_index]];
            t_t = 0; t_f = 0; f_t = 0; f_f = 0;
            for record_index in range(grade_predict_array.__len__()):
                grade_predict = label_map[grade_predict_array[record_index]];
                true_grade = current_score_map[scoreCol[record_index]];
                # Confusion counts: (true grade, predicted grade).
                if true_grade == 1 and grade_predict ==1 : t_t += 1;
                if true_grade == 1 and grade_predict ==0 : t_f += 1;
                if true_grade == 0 and grade_predict == 1: f_t += 1;
                if true_grade == 0 and grade_predict == 0: f_f += 1;
            print( headerArray[_index] , " : ",t_t,t_f,f_t,f_f);
def useObserveToPredict(featureFileName, exam_mark=None):
    """For each feature with hand-picked percentile cut points, bucket every
    record by threshold and print the fraction that matches the true grade."""
    featureMatrix, scoreCol, headerArray = getDataAndScore(featureFileName, exam_mark, needHeader=True)
    precision_array = []
    # Feature name -> percentile positions (in percent) of its cut points.
    gapMap = {
        "saveInterval": [60],
        "score": [40, 80],
        "scoreRemainMiddle": [80],
        "scoreUp": [40, 80]
    }
    for col, name in enumerate(headerArray):
        if name not in gapMap:
            continue
        column = getOneColumn(featureMatrix, col)
        column = np.array(column).reshape(len(column), 1)
        # Turn percentile positions into concrete threshold values.
        ordered = sorted(column)
        cuts = gapMap[name]
        for k in range(len(cuts)):
            cuts[k] = ordered[int(cuts[k] / 100 * len(ordered))]
        # Bucket each record by the first threshold it falls under
        # (records above every threshold get the top bucket).
        predictions = []
        for row in range(len(column)):
            bucket = len(cuts)
            for k, threshold in enumerate(cuts):
                if column[row] <= threshold:
                    bucket = k
                    break
            predictions.append(bucket)
        # Accuracy against the grade derived from the raw score.
        hits = 0.0
        grade_of = score_map[name]
        for row in range(len(column)):
            if grade_of[scoreCol[row]] == predictions[row]:
                hits += 1
        print(name, "%.4f" % (hits / len(column)))
def useLRtoPredictScoreWithKFold(targetFileName, exam_mark=None, needNorm=True):
    """Evaluate plain linear regression with 10 repeats of stratified 5-fold CV and print the mean score."""
    if exam_mark is None:
        exam_mark = DATA_MARK
    featureMatrix, _score_array = getDataAndScore(targetFileName, exam_mark)
    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)
    regressor = LinearRegression(fit_intercept=True)
    run_means = []
    for _ in range(10):
        fold_scores = model_selection.cross_val_score(
            regressor, featureMatrix, _score_array,
            cv=model_selection.StratifiedKFold(5, shuffle=True),
            scoring=lr_precision)
        run_means.append(fold_scores.mean())
    print(np.array(run_means).mean())
def useDecisionTreeToClassifyWithKFold(featureFileName, exam_mark=None, max_depth=None):
    """Print mean accuracy of a decision tree over 10 repeats of stratified 5-fold CV.

    :param featureFileName: feature file name forwarded to getDataAndScore
    :param exam_mark: exam identifier forwarded to getDataAndScore
    :param max_depth: maximum tree depth (None = unlimited)

    CLEANUP: removed the large commented-out hand-rolled KFold implementation;
    cross_val_score performs the same evaluation.
    """
    _feature_matrix, _score_array = getDataAndScore(featureFileName, exam_mark)
    precision_array = []
    for _ in range(10):
        _dt = DecisionTreeClassifier(max_depth=max_depth)
        score_array = cross_val_score(_dt, _feature_matrix, _score_array,
                                      cv=StratifiedKFold(5, shuffle=True))
        precision_array.append(score_array.mean())
    print(np.array(precision_array).mean())
def useLassotoPredictScoreWithKFold(targetFileName, exam_mark=None, needNorm=True):
    """Evaluate LassoCV with 10 repeats of stratified 5-fold CV and print the mean score."""
    if exam_mark is None:
        exam_mark = DATA_MARK
    featureMatrix, _score_array = getDataAndScore(targetFileName, exam_mark)
    if needNorm:
        featureMatrix = normizeMatrix(featureMatrix)
    regressor = LassoCV(alphas=[0.01, 0.05, 0.1, 0.5, 1, 10],
                        cv=model_selection.StratifiedKFold(5, shuffle=True),
                        tol=1e-4)
    run_means = []
    for _ in range(10):
        fold_scores = model_selection.cross_val_score(
            regressor, featureMatrix, _score_array,
            cv=model_selection.StratifiedKFold(5, shuffle=True),
            scoring=lr_precision)
        run_means.append(fold_scores.mean())
    print(np.array(run_means).mean())
def searchAddOneFeatureOneTime(mark):
    """Forward feature curve: add candidate features one at a time in the fixed
    order below, evaluate an SVM at each prefix, and plot accuracy vs. count.

    :param mark: exam identifier forwarded to getDataAndScore

    CLEANUP: removed several large commented-out alternative feature orderings;
    the active list below is the one actually used.
    NOTE(review): this call site expects useSVMtoPredictScore to accept a
    (feature matrix, score array) pair and to return the mean precision —
    confirm that function's signature.
    """
    # Candidate features in the order they are added. Features found useless in
    # earlier experiments (e.g. "totalLength", "pasteCount") were dropped.
    feature_array = [
        "saveInterval", "programTime", "codeIntervalCount", "saveCount",
        "buildInterval", "score", "codeTime", "successCount", "testCount",
        "scoreRemainMiddle", "avgRemoveErrorTime", "failCount", "scoreDown",
        "keepError", "generateError", "codeBE", "scoreRemainHigh",
    ]
    dataArray, scoreArray, headerArray = getDataAndScore("concatfeature", mark, needHeader=True)
    # Drop the non-feature leading column so header indexes align with data columns.
    del headerArray[0]
    x_array = []
    y_array = []
    # Grow the feature set one feature at a time and score each prefix.
    for _count in range(len(feature_array)):
        target_feature_name_array = feature_array[:_count + 1]
        indexList = getTargetColumnList(headerArray, target_feature_name_array)
        featureMatrix = getSerevalColumn(dataArray, indexList)
        precision = useSVMtoPredictScore(featureMatrix, scoreArray)
        print("%d : %.4f" % (_count + 1, precision))
        x_array.append(_count + 1)
        y_array.append(precision)
    plt.figure()
    plt.plot(x_array, y_array)
    plt.show()