# NOTE(review): this fragment was reconstructed from a whitespace-collapsed
# source; the nesting below follows the statement order and the parallel
# structure used elsewhere in the script -- confirm against the original file.

# --- Disabled experiment, kept for reference ---------------------------------
# X_train, X_test = seqRowOfLabelData[train_index], seqRowOfLabelData[test_index]
# y_train, y_test = labelVec[train_index], labelVec[test_index]
# #KNN
## neigh = KNeighborsClassifier(n_neighbors=7, metric = 'cosine', weights = 'distance')
# clf.fit(X_train, y_train)
# labelPredict = clf.predict(X_test)
# tmpAccuracyScoreList2.append(accuracy_score(y_test,labelPredict))
# sumarizeAccuracy2.append(np.average(tmpAccuracyScoreList2))
##

# Sweep over k (5..49). For each k, load the feature ordering stored in the
# matching "knnGraph_<k>.npy" file and reorder the feature rows accordingly.
for k_KNN_feature_selection in range(5, 50):  # k neighbor seq in knn graph
    # Parenthesised print: same output on Python 2 for a single argument,
    # and valid syntax on Python 3.
    print(k_KNN_feature_selection)

    # Path pieces are kept exactly as in the original (including the doubled
    # slashes) so the same files are resolved.
    fileName = ("..//outputfile//" + version + "//" + testVer
                + "//knnGraph_" + str(k_KNN_feature_selection) + ".npy")

    # presumably feature indexes ordered by Laplacian score -- verify against
    # algFile.loadSortedLaplaFeatureIndexes
    sortedLaplaFeatureIndexes = np.array(
        algFile.loadSortedLaplaFeatureIndexes(fileName))

    # Fancy-index the whole matrix at once instead of the older per-row form:
    # sortedFeatureRowMat = np.array([featureRowMatrix[index] for index in sortedLaplaFeatureIndexes])
    sortedFeatureRowMat = np.array(featureRowMatrix)[sortedLaplaFeatureIndexes]

    # Per-k accumulators; they are reset on every iteration of the sweep.
    sumarizeAccuracy = []
    sumarizeAccuracy2 = []

    #Laplacian score
    for numOfFeature in kFeatures:
        print(numOfFeature)

        # build seqRowMat: keep the first numOfFeature feature rows and
        # transpose so that rows become sequences and columns become features.
        seqRowMatrix = np.transpose(sortedFeatureRowMat[:numOfFeature])

        # Only the first numOfLabelData rows are used from here on.
        seqRowOfLabelData = seqRowMatrix[:numOfLabelData]

        # #build classifier
        # neigh = KNeighborsClassifier(n_neighbors=11, metric = 'cosine', weights = 'distance')
for train_index, test_index in loo.split(seqRowOfLabelData): X_train, X_test = seqRowOfLabelData[train_index], seqRowOfLabelData[test_index] y_train, y_test = labelVec[train_index], labelVec[test_index] #KNN neigh = KNeighborsClassifier(n_neighbors=9, metric = 'cosine', weights = 'distance') neigh.fit(X_train, y_train) labelPredict = neigh.predict(X_test) tmpAccuracyScoreList2.append(accuracy_score(y_test,labelPredict)) sumarizeAccuracy2.append(np.average(tmpAccuracyScoreList2)) for k_KNN_Graph_Preprocess in range(129,130): #k neightbor seq in knn graph print k_KNN_Graph_Preprocess fileName = "..//outputfile//"+version+ "//"+testVer+"//knnGraph_"+str(k_KNN_Graph_Preprocess) + ".npy" sortedLaplaFeatureIndexes = np.array(algFile.loadSortedLaplaFeatureIndexes(fileName)) # sortedFeatureRowMat = np.array([featureRowMatrix[index] for index in sortedLaplaFeatureIndexes]) sortedFeatureRowMat = np.array(featureRowMatrix)[sortedLaplaFeatureIndexes] sumarizeAccuracy = [] #Laplacian score for numOfFeature in kFeatures: # get seqRowMat seqRowMatrix = np.transpose(sortedFeatureRowMat[:numOfFeature]) seqRowOfLabelData = seqRowMatrix[:numOfLabelData] #leave one out tmpAccuracyScoreList = [] for train_index, test_index in loo.split(seqRowOfLabelData):
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# NOTE(review): this fragment was reconstructed from a whitespace-collapsed
# source; statement order is unchanged -- confirm layout against the original.
if __name__ == '__main__':
    # Wall-clock start of the run.
    startTime = datetime.now()

    # Feature matrix is read from the HDF5 encoding file (read-only).
    # NOTE(review): fileh is not closed anywhere in the visible code -- confirm
    # it is closed once featureRowMatrix is no longer needed.
    fileh = tables.open_file("../outputfile/encodingFile.h5", mode="r")
    featureRowMatrix = fileh.root.featureRowMatrix
    # Second dimension of the feature-row matrix
    # (presumably one column per sequence -- TODO confirm).
    sizeOfSeqList = featureRowMatrix.shape[1]

    # Labels for the labelled subset of the data.
    labelDict = algFile.readLabelDictFromFile()
    labelVec = np.array(classifierFile.convertLabelDict2List(labelDict))
    numOfLabelData = len(labelVec)

    # Feature counts to evaluate: 1 .. 174.
    kFeatures = range(1, 175)
    accuracyScoreList = []
    accuracyScoreList2 = []

    # Reorder the feature rows by the stored index ordering
    # (presumably sorted by Laplacian score -- verify against algFile).
    sortedLaplaFeatureIndexes = algFile.loadSortedLaplaFeatureIndexes()
    sortedFeatureRowMat = np.array(
        [featureRowMatrix[index] for index in sortedLaplaFeatureIndexes])
    sumarizeAccuracy = []
    sumarizeAccuracy2 = []

    #variance
    seqRowMatrixFull = algFile.readSeqRowMatFromFile()
    # Threshold evaluates to 0.8 * (1 - 0.8) = 0.16.
    # NOTE(review): this is the p * (1 - p) form -- presumably the features are
    # boolean; confirm.
    sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
    newSeqRowMatrix = sel.fit_transform(seqRowMatrixFull)
    # Only the first numOfLabelData rows are kept for the labelled subset.
    newSeqRowOfLabelData = newSeqRowMatrix[:numOfLabelData]

    # 20-fold splitter, created here for later use.
    kf = KFold(n_splits=20)

    #Laplacian score
    for numOfFeature in kFeatures:
        # Parenthesised print: same output on Python 2 for a single argument,
        # and valid syntax on Python 3.
        print(numOfFeature)
from sklearn.feature_selection import VarianceThreshold from sklearn.model_selection import KFold import matplotlib.pyplot as plt if __name__ == '__main__': startTime = datetime.now() fileh = tables.open_file("../outputfile/encodingFile.h5", mode="r") featureRowMatrix = fileh.root.featureRowMatrix sizeOfSeqList = featureRowMatrix.shape[1] labelDict = algFile.readLabelDictFromFile() labelVec = np.array(classifierFile.convertLabelDict2List(labelDict)) numOfLabelData = len(labelVec) kFeatures = range(1, 200) accuracyScoreList = [] accuracyScoreList2 = [] sortedLaplaFeatureIndexes = algFile.loadSortedLaplaFeatureIndexes( "3_3.npy") sortedFeatureRowMat = np.array( [featureRowMatrix[index] for index in sortedLaplaFeatureIndexes]) sumarizeAccuracy = [] sumarizeAccuracy2 = [] #variance seqRowMatrixFull = algFile.readSeqRowMatFromFile() # sel = VarianceThreshold(threshold=(.6 * (1 - .6))) sel = VarianceThreshold(threshold=0.2) newSeqRowMatrix = sel.fit_transform(seqRowMatrixFull) newSeqRowOfLabelData = newSeqRowMatrix[:numOfLabelData] loo = LeaveOneOut() #Laplacian score for numOfFeature in kFeatures: