def loadData():
    """Load the balanced, integer-encoded records as features and target.

    Reads ``../processedData/balancedDataConvertedToInteger.csv`` through
    ``dataIO.getDataMatrixFromCSV``, treats the first row as the column
    header and splits the remaining rows into a feature matrix and the
    ICD10 target column.

    Returns:
        tuple: ``(X, Y, columnNames)`` where

        * ``X`` is ``dataIO.convertDatatoFloat`` applied to the data rows
          with columns 0, 24, 29 and 30 removed,
        * ``Y`` is ``dataIO.convertDatatoFloat`` applied to column 24,
        * ``columnNames`` is the header with the same four columns removed
          (flattened to 1-D by ``np.delete`` because no axis is given).
    """
    print('##############loadData()###############')
    #fileName = '../processedData/1000Records.csv'
    balancedDataConvertedToIntegerFile = '../processedData/balancedDataConvertedToInteger.csv'

    # Column 24 is the ICD10 code (the output). Besides it we drop
    # Id (0), NumberOfEntityAxisConditions (29) and
    # NumberOfRecordAxisConditions (30). The list is defined once so the
    # feature matrix and the header labels can never get out of sync.
    outputColumn = 24
    droppedColumns = [0, outputColumn, 29, 30]

    dataMatrix = dataIO.getDataMatrixFromCSV(
        balancedDataConvertedToIntegerFile)

    # Separate the header row (kept as a one-row slice) from the data rows.
    columnNames = dataMatrix[0:1]
    dataMatrix = dataMatrix[1:]

    # NOTE(review): the boolean passed to convertDatatoFloat presumably
    # tells it whether the input is a 2-D matrix -- confirm in dataIO.
    Y = dataIO.convertDatatoFloat(dataMatrix[:, outputColumn], False)
    print('Conversion to float completed')

    X = np.delete(dataMatrix, droppedColumns, axis=1)
    columnNames = np.delete(columnNames, droppedColumns)
    X = dataIO.convertDatatoFloat(X, True)
    print('##############loadData() completed###############')
    return X, Y, columnNames
def getOnlyDiseaseData(inputFileName, outputFileName):
    """Keep only the rows whose ICD10 code starts with a letter A through R.

    The matrix is loaded with ``dataIO.getDataMatrixFromCSV``; every row whose
    column 24 matches the pattern ``[A-R]+`` at its start is written to
    ``outputFileName`` as CSV (``,`` delimiter, ``|`` quote character,
    minimal quoting) and collected.

    Args:
        inputFileName: CSV file to read.
        outputFileName: CSV file to (over)write with the matching rows.

    Returns:
        numpy.ndarray: the matching rows, in their original order.
    """
    allRows = dataIO.getDataMatrixFromCSV(inputFileName)
    # ICD10Code = column 24; sliced before the output file is opened so a bad
    # input never truncates an existing output file.
    codeColumn = allRows[:, 24]
    keptRows = []
    diseaseCode = re.compile('[A-R]+')

    with open(outputFileName, 'w', newline='') as outFile:
        rowWriter = csv.writer(outFile,
                               delimiter=',',
                               quotechar='|',
                               quoting=csv.QUOTE_MINIMAL)
        for row, code in zip(allRows, codeColumn):
            if diseaseCode.match(code) is None:
                continue
            keptRows.append(row)
            rowWriter.writerow(row)

    return np.array(keptRows)
import DataIOFactory
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
import FeatureSelection_Chi2
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.neighbors import KNeighborsRegressor
import ResultAnalyzer
import PCA

# Script preamble: load the balanced data set, peel off the header row,
# shuffle the records and split them into the class column (24) and the
# feature columns (everything except 0 and 24).
'''reading balanced data from csv file'''
balancedData = DataIOFactory.getDataMatrixFromCSV(
    "./categorizedData/balancedData.csv")
''' extracting feature labels and class label'''
# The first row holds the column names.
columns_name = np.array(balancedData[0])
# Drop the labels of the id column (0) and the class column (24).
features_label = np.delete(columns_name, [0, 24], axis=None)
# Remove the header row so that only data records remain.
balancedData = np.delete(balancedData, (0), axis=0)
'''shuffling the data'''
# NOTE(review): presumably returns the same rows in random order -- confirm
# in DataIOFactory.matrixShuffling.
balancedData = DataIOFactory.matrixShuffling(balancedData)
'''Column 24, icd10 is our class and the rest are features '''
clss = balancedData[:, 24]
# Disabled: clss = np.delete(clss, (0), axis=0)
# (it would strip the header label, but the header row is already removed
# from balancedData above).
print("class lenght:", len(clss))
# clss is 1-D; indexing the converted result with [:, None] appends an axis
# so that `classes` is 2-D.
classes = DataIOFactory.classDiscreteValueConverToDecimal(
    clss)[:, None]
print('class shape', classes.shape)
'''select the columns as input features - all columns but 0 and 24'''
features = np.delete(balancedData, [0, 24], axis=1)
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
import math
from operator import itemgetter
from sklearn.metrics import precision_recall_fscore_support
from matplotlib import colors
from PIL.ImageColor import getcolor
from DataIOFactory import balancingData

# Script fragment: load the raw death records, split off the header row and
# shuffle the remaining records.
# NOTE(review): `DataIOFactory` and `np` are used below but are not imported
# in the lines visible here -- presumably imported earlier in the file.
'''reading data from csv file and only getting first 100,000 instances for this experiment'''
# NOTE(review): no 100,000-row limit is applied by the statements visible
# here; confirm whether the loader or later code does the truncation.
data2DMat = DataIOFactory.getDataMatrixFromCSV("./DeathRecords.csv")
print('type: ', type(data2DMat), ' shape: ', data2DMat.shape)
''' extracting feature labels and class label'''
# The first row holds the column names.
columns_name = np.array(data2DMat[0])
# Drop the labels of the id column (0) and the class column (24).
features_label = np.delete(columns_name, [0, 24], axis=None)
# Remove the header row so that only data records remain.
data2DMat = np.delete(data2DMat, (0), axis=0)
print('whole data shape ', data2DMat.shape)
'''shuffling the data'''
# NOTE(review): presumably returns the same rows in random order -- confirm
# in DataIOFactory.matrixShuffling.
shuffeledMat = DataIOFactory.matrixShuffling(data2DMat)
''' this dataset is about mortality and death could have different causes. in this part, we are only taking the data instances that are dealing with a type of disease based on the column: Icd10 imbalanced data '''
f1Score = f1_score(testY, predictedY, average='weighted') print('accuracy:', accuracy) print('confusionMatrix: ', confusionMatrix) print('f1Score: ', f1Score) print( '################# classifyUsingKNNCentroid() finished ##################' ) print("--- %s seconds ---" % (time.time() - start_time)) balancedDataConvertedToIntegerFile = '../processedData/balancedDataConvertedToInteger.csv' fileName = '../processedData/DeathRecordsConvertedToInteger.csv' dataMatrix = dataIO.getDataMatrixFromCSV(fileName) '''remove the column_Header/label_of_the_column from the data''' columnNames = dataMatrix[0:1] dataMatrix = dataMatrix[1:] '''shuffle data''' dataIO.shuffleData(dataMatrix) '''get ICD10 code as output''' Y = dataMatrix[:, 24] '''remove OutputCoulumn and Those below mentioned columns: Id NumberOfEntityAxisConditions NumberOfRecordAxisConditions''' X = np.delete(dataMatrix, [0, 24, 29, 30], axis=1) columnNames = np.delete(columnNames, [0, 24, 29, 30])
def changeStringToInteger(fileName, cols, hasColumnHeader):
    """Replace single-letter categorical codes with integer codes.

    Loads the CSV at ``fileName`` through ``dataIO.getDataMatrixFromCSV`` and
    overwrites every supported column listed in ``cols`` with its integer
    encoding:

    * 6  (Sex): F=0, anything else=1
    * 15 (Marital Status): S=0 (never married), M=1 (married), W=2 (widowed),
      D=3 (divorced), U=4 (unknown)
    * 18 (InjuryAtWork): Y=0, N=1, U=2 (unknown)
    * 20 (Method Of Disposition): B=0 (burial), C=1 (cremation), O=2 (other),
      U/E/R=3 (unknown)
    * 21 (Autopsy): Y=0, N=1, U=2 (unknown)

    A value that is not listed for its column is mapped to that column's
    fallback code (the "unknown" code; 1 for Sex).

    Args:
        fileName: path of the CSV file to load.
        cols: iterable of column indexes to convert. Indexes that have no
            encoding table are ignored; repeated indexes are encoded once.
        hasColumnHeader: when true, the first row is treated as the header
            and is not part of the returned matrix.

    Returns:
        The data matrix (without the header row when ``hasColumnHeader`` is
        true) with the requested columns replaced by their integer codes.
    """
    print('################# changeStringToInteger() started##################')
    start_time = time.time()

    # column index -> (letter-to-code table, code for any other value)
    encodings = {
        6: ({'F': 0}, 1),
        15: ({'S': 0, 'M': 1, 'W': 2, 'D': 3, 'U': 4}, 4),
        18: ({'Y': 0, 'N': 1, 'U': 2}, 2),
        20: ({'B': 0, 'C': 1, 'O': 2, 'U': 3, 'E': 3, 'R': 3}, 3),
        21: ({'Y': 0, 'N': 1, 'U': 2}, 2),
    }

    dataMatrixWithLabel = dataIO.getDataMatrixFromCSV(fileName)
    # Without a header every row is data; leaving dataMatrix unassigned in
    # that case would make the rest of the function fail.
    if hasColumnHeader:
        dataMatrix = dataMatrixWithLabel[1:]
    else:
        dataMatrix = dataMatrixWithLabel

    # Encode first, write back afterwards: every column is translated from
    # the untouched input, so a column listed twice is never re-encoded from
    # its own output.
    encodedColumns = {}
    for i in cols:
        if i not in encodings or i in encodedColumns:
            continue
        codeTable, fallback = encodings[i]
        # The fallback keeps the encoded list the same length as the column;
        # a shorter list could not be assigned back into the matrix.
        encodedColumns[i] = [
            codeTable.get(rowData, fallback) for rowData in dataMatrix[:, i]
        ]

    # Replace only the columns that were actually requested.
    for i, encoded in encodedColumns.items():
        dataMatrix[:, i] = encoded

    print(
        '################# changeStringToInteger() finished ##################'
    )
    print("--- %s seconds ---" % (time.time() - start_time))
    return dataMatrix