# Example #1
0
def loadData():
    """Load the balanced, integer-encoded data set and split it into X and Y.

    The CSV ``../processedData/balancedDataConvertedToInteger.csv`` is read
    through ``dataIO.getDataMatrixFromCSV``. Its first row is treated as the
    column header. Column 24 (the ICD10 code) becomes the target, and
    columns 0, 24, 29 and 30 are removed from the feature matrix. According
    to the original author's note, the removed non-target columns are Id,
    NumberOfEntityAxisConditions and NumberOfRecordAxisConditions.

    Returns:
        tuple: ``(X, Y, columnNames)`` where ``X`` is the feature matrix
        after ``dataIO.convertDatatoFloat(X, True)``, ``Y`` is column 24
        after ``dataIO.convertDatatoFloat(Y, False)`` and ``columnNames``
        holds the header labels of the columns kept in ``X``.
    """
    print('##############loadData()###############')
    balancedDataConvertedToIntegerFile = '../processedData/balancedDataConvertedToInteger.csv'
    # Target column plus the columns that must not be used as features.
    droppedColumns = [0, 24, 29, 30]

    dataMatrix = dataIO.getDataMatrixFromCSV(
        balancedDataConvertedToIntegerFile)

    # The first row holds the column labels; keep it apart from the records.
    columnNames = dataMatrix[0:1]
    dataMatrix = dataMatrix[1:]

    # Column 24 (ICD10 code) is the output.
    Y = dataIO.convertDatatoFloat(dataMatrix[:, 24], False)
    print('Conversion to float completed')

    X = np.delete(dataMatrix, droppedColumns, axis=1)
    # No axis is given on purpose: np.delete flattens the one-row header
    # slice, so the result is a flat array of the remaining labels.
    columnNames = np.delete(columnNames, droppedColumns)

    X = dataIO.convertDatatoFloat(X, True)
    print('##############loadData() completed###############')
    return X, Y, columnNames
# Example #2
0
def getOnlyDiseaseData(inputFileName, outputFileName):
    """Copy the rows whose ICD10 code begins with a letter A-R to a new CSV.

    The input matrix is loaded with ``dataIO.getDataMatrixFromCSV``. Every
    row, including row 0 (no header row is stripped here), is tested on its
    column-24 value. Matching rows are written to ``outputFileName`` in
    their original order and also collected for the return value.

    Args:
        inputFileName: Path of the CSV file to filter.
        outputFileName: Path of the CSV file to (over)write with the
            matching rows.

    Returns:
        numpy.ndarray: The matching rows, built with ``np.array``.
    """
    allRows = dataIO.getDataMatrixFromCSV(inputFileName)

    # Column 24 holds the ICD10 code.
    codeColumn = allRows[:, 24]

    # match() anchors at the start of the string, so only the leading
    # character(s) of the code decide whether a row is kept.
    diseasePattern = re.compile('[A-R]+')

    keptRows = []
    with open(outputFileName, 'w', newline='') as outputFile:
        rowWriter = csv.writer(outputFile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        for rowIndex, code in enumerate(codeColumn):
            if not diseasePattern.match(code):
                continue
            matchedRow = allRows[rowIndex]
            keptRows.append(matchedRow)
            rowWriter.writerow(matchedRow)

    return np.array(keptRows)
import DataIOFactory
import numpy as np
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
import FeatureSelection_Chi2
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets
from sklearn.neighbors import KNeighborsRegressor
import ResultAnalyzer
import PCA
# Read the balanced data set from its CSV file.
balancedData = DataIOFactory.getDataMatrixFromCSV(
    "./categorizedData/balancedData.csv")

# The first row carries the column names. The feature labels are those names
# without column 0 (id) and column 24 (class).
columns_name = np.array(balancedData[0])
features_label = np.delete(columns_name, [0, 24], axis=None)

# Drop the header row so that only data records remain.
balancedData = np.delete(balancedData, 0, axis=0)

# Shuffle the records.
balancedData = DataIOFactory.matrixShuffling(balancedData)

# Column 24 (icd10) is the class; the remaining columns are features.
clss = balancedData[:, 24]
print("class lenght:", len(clss))

# clss is a 1-D array; indexing the converted result with [:, None] turns
# it into a 2-D array.
classes = DataIOFactory.classDiscreteValueConverToDecimal(clss)[:, None]
print('class shape', classes.shape)

# Input features: every column except 0 and 24.
features = np.delete(balancedData, [0, 24], axis=1)
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif
import math
from operator import itemgetter
from sklearn.metrics import precision_recall_fscore_support
from matplotlib import colors
from PIL.ImageColor import getcolor
from DataIOFactory import balancingData
# Read the death records from the CSV file.
# NOTE(review): the original note says only the first 100,000 instances are
# used for this experiment, but no truncation happens in these lines; confirm
# where (or whether) that limit is applied.
data2DMat = DataIOFactory.getDataMatrixFromCSV("./DeathRecords.csv")
print('type: ', type(data2DMat), ' shape: ', data2DMat.shape)

# The first row carries the column names. The feature labels are those names
# without column 0 (id) and column 24 (class).
columns_name = np.array(data2DMat[0])
features_label = np.delete(columns_name, [0, 24], axis=None)

# Drop the header row so that only data records remain.
data2DMat = np.delete(data2DMat, 0, axis=0)
print('whole data shape  ', data2DMat.shape)

# Shuffle the records.
shuffeledMat = DataIOFactory.matrixShuffling(data2DMat)

# This data set is about mortality, and death can have different causes.
# The next part only takes the data instances that deal with a type of
# disease, based on the Icd10 column (imbalanced data).
    f1Score = f1_score(testY, predictedY, average='weighted')

    print('accuracy:', accuracy)
    print('confusionMatrix: ', confusionMatrix)
    print('f1Score: ', f1Score)

    print(
        '################# classifyUsingKNNCentroid() finished ##################'
    )
    print("--- %s seconds ---" % (time.time() - start_time))


balancedDataConvertedToIntegerFile = '../processedData/balancedDataConvertedToInteger.csv'
fileName = '../processedData/DeathRecordsConvertedToInteger.csv'

dataMatrix = dataIO.getDataMatrixFromCSV(fileName)

# Separate the header row (column labels) from the data records.
columnNames = dataMatrix[0:1]
dataMatrix = dataMatrix[1:]

# Shuffle the records. The return value is discarded, so shuffleData
# presumably works in place - TODO confirm against dataIO.
dataIO.shuffleData(dataMatrix)

# Column 24 (ICD10 code) is the output.
Y = dataMatrix[:, 24]

# Remove the output column and, per the original note, the Id,
# NumberOfEntityAxisConditions and NumberOfRecordAxisConditions columns.
X = np.delete(dataMatrix, [0, 24, 29, 30], axis=1)
columnNames = np.delete(columnNames, [0, 24, 29, 30])
# Example #6
0
def changeStringToInteger(fileName, cols, hasColumnHeader):
    """Replace the categorical string codes of selected columns by integers.

    The matrix is loaded with ``dataIO.getDataMatrixFromCSV``. Only the
    column indices listed below are encoded; any other index in ``cols`` is
    ignored. A value that is not in a column's table gets that column's
    fallback code, so every encoded column keeps one entry per row.

    ======  ====================  ==============================  ========
    column  meaning               codes                           fallback
    ======  ====================  ==============================  ========
    6       Sex                   F=0 (M=1)                       1
    15      Marital status        S=0, M=1, W=2, D=3, U=4         4
    18      Injury at work        Y=0, N=1, U=2                   2
    20      Method of disposition B=0, C=1, O=2, U/E/R=3          3
    21      Autopsy               Y=0, N=1, U=2                   2
    ======  ====================  ==============================  ========

    Args:
        fileName: Path of the CSV file to load.
        cols: Iterable of column indices to encode.
        hasColumnHeader: When true, the first row is treated as a header
            and left out of the result.

    Returns:
        The data matrix (without the header row when ``hasColumnHeader`` is
        true) with the requested columns overwritten by their codes. It is
        a slice of the loaded matrix, so the loaded matrix may share the
        changes.
    """
    print('################# changeStringToInteger() started##################')
    start_time = time.time()
    dataMatrixWithLabel = dataIO.getDataMatrixFromCSV(fileName)

    # Strip the header row only when there is one. Without the else branch
    # dataMatrix would be unbound for header-less files.
    if hasColumnHeader:
        dataMatrix = dataMatrixWithLabel[1:]
    else:
        dataMatrix = dataMatrixWithLabel

    # column index -> (code table, code used for any value not in the table)
    encodings = {
        6: ({'F': 0}, 1),
        15: ({'S': 0, 'M': 1, 'W': 2, 'D': 3, 'U': 4}, 4),
        18: ({'Y': 0, 'N': 1, 'U': 2}, 2),
        20: ({'B': 0, 'C': 1, 'O': 2, 'U': 3, 'E': 3, 'R': 3}, 3),
        21: ({'Y': 0, 'N': 1, 'U': 2}, 2),
    }

    # Encode every requested column from the still-unmodified matrix first,
    # so a column listed twice in cols is not encoded from its own output.
    encodedColumns = {}
    for i in cols:
        if i not in encodings:
            continue
        codes, fallback = encodings[i]
        encodedColumns[i] = [
            codes.get(rowData, fallback) for rowData in dataMatrix[:, i]
        ]

    # Write back only the columns that were actually converted.
    for i, encoded in encodedColumns.items():
        dataMatrix[:, i] = encoded

    print('################# changeStringToInteger() finished ##################')
    print("--- %s seconds ---" % (time.time() - start_time))
    return dataMatrix