# Example script: read the diabetes sample data, compare a linear model with a
# random forest, and plot feature importances and ROC curves.
# NOTE: the DevelopSupervisedModel import path below is assumed; adjust it to
# match your installed healthcareai package.
import time

import pandas as pd

from healthcareai import DevelopSupervisedModel


def main():
    t0 = time.time()

    # CSV snippet for reading data into dataframe
    df = pd.read_csv(
        'healthcareai/tests/fixtures/DiabetesClinicalSampleData.csv',
        na_values=['None'])

    # SQL snippet for reading data into dataframe
    # import pyodbc
    # cnxn = pyodbc.connect("""SERVER=localhost;
    #                          DRIVER={SQL Server Native Client 11.0};
    #                          Trusted_Connection=yes;
    #                          autocommit=True""")
    #
    # df = pd.read_sql(
    #     sql="""SELECT
    #            *
    #            FROM [SAM].[dbo].[HCPyDiabetesClinical]""",
    #     con=cnxn)
    #
    # # Set None string to be None type
    # df.replace(['None'], [None], inplace=True)

    # Look at the data that has been pulled in
    print(df.head())
    print(df.dtypes)

    # Drop columns that won't help machine learning
    df.drop(['PatientID', 'InTestWindowFLG'], axis=1, inplace=True)

    # Step 1: compare two models
    o = DevelopSupervisedModel(modeltype='classification',
                               df=df,
                               predictedcol='ThirtyDayReadmitFLG',
                               graincol='PatientEncounterID',  # OPTIONAL
                               impute=True,
                               debug=False)

    # Run the linear model
    o.linear(cores=1)

    # Run the random forest model
    o.random_forest(cores=1, tune=True)

    # Look at the RF feature importance rankings
    o.plot_rffeature_importance(save=False)

    # Create ROC plot to compare the two models
    o.plot_roc(debug=False, save=False)

    print('\nTime:\n', time.time() - t0)
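# Hypothetical convenience guard (not part of the original snippet): it lets the
# example above be executed directly as a script, assuming the imports shown
# above are in place.
if __name__ == '__main__':
    main()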
class TestLinearDevTuneFalse(unittest.TestCase):
    def setUp(self):
        df = pd.read_csv(fixture('HCPyDiabetesClinical.csv'),
                         na_values=['None'])

        # Drop uninformative columns
        df.drop(['PatientID', 'InTestWindowFLG'], axis=1, inplace=True)

        np.random.seed(42)
        self.o = DevelopSupervisedModel(modeltype='classification',
                                        df=df,
                                        predictedcol='ThirtyDayReadmitFLG',
                                        impute=True)
        self.o.linear(cores=1)

    def runTest(self):
        self.assertAlmostEqual(np.round(self.o.au_roc, 6), 0.672075)

    def tearDown(self):
        del self.o
class TestRFDevTuneFalse(unittest.TestCase):
    def setUp(self):
        df = pd.read_csv(fixture('DiabetesClinicalSampleData.csv'),
                         na_values=['None'])

        # Drop uninformative columns
        df.drop(['PatientID', 'InTestWindowFLG'], axis=1, inplace=True)

        # Convert numeric columns to factor/category columns
        np.random.seed(42)
        self.o = DevelopSupervisedModel(modeltype='classification',
                                        df=df,
                                        predictedcol='ThirtyDayReadmitFLG',
                                        impute=True)
        self.o.random_forest(cores=1)

    def runTest(self):
        self.assertAlmostEqual(np.round(self.o.au_roc, 6), 0.965070)

    def tearDown(self):
        del self.o
class TestRFDevTuneTrue2ColError(unittest.TestCase):
    def setUp(self):
        cols = ['ThirtyDayReadmitFLG', 'SystolicBPNBR', 'LDLNBR']
        df = pd.read_csv(fixture('HCPyDiabetesClinical.csv'),
                         na_values=['None'],
                         usecols=cols)

        np.random.seed(42)
        self.o = DevelopSupervisedModel(modeltype='classification',
                                        df=df,
                                        predictedcol='ThirtyDayReadmitFLG',
                                        impute=True)

    def runTest(self):
        self.assertRaises(ValueError,
                          lambda: self.o.random_forest(cores=1, tune=True))

    def tearDown(self):
        del self.o
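# If these test cases live in a single module, a standard unittest entry point
# (assumed here, not part of the original excerpts) lets them run directly:
if __name__ == '__main__':
    unittest.main()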