Esempio n. 1
0
    def setUp(self):
        """Prepare an untrained classification model on a three-column frame.

        Only ``ThirtyDayReadmitFLG`` (the predicted column), ``SystolicBPNBR``
        and ``LDLNBR`` are read from the fixture CSV; the literal string
        'None' is parsed as a missing value. NumPy's global RNG is seeded
        before the model is built and the model is stored on ``self.o``.
        """
        target = 'ThirtyDayReadmitFLG'
        wanted_columns = [target, 'SystolicBPNBR', 'LDLNBR']
        csv_path = fixture('HCPyDiabetesClinical.csv')
        frame = pd.read_csv(csv_path,
                            usecols=wanted_columns,
                            na_values=['None'])

        # Seed immediately before constructing the model.
        np.random.seed(42)
        self.o = DevelopSupervisedModel(
            modeltype='classification',
            df=frame,
            predictedcol=target,
            impute=True,
        )
Esempio n. 2
0
def main():
    """Example script: compare a linear model with a random forest.

    Loads the diabetes sample CSV, prints a preview and the column dtypes,
    drops two columns, builds a ``DevelopSupervisedModel`` for
    classification, runs the linear and (tuned) random forest models, calls
    the feature-importance and ROC plotting helpers with ``save=False``, and
    finally prints the elapsed wall-clock time in seconds.
    """
    started = time.time()

    # CSV snippet for reading data into dataframe
    # (the literal string 'None' is parsed as a missing value)
    csv_path = 'healthcareai/tests/fixtures/DiabetesClinicalSampleData.csv'
    df = pd.read_csv(csv_path, na_values=['None'])

    # SQL snippet for reading data into dataframe
    # import pyodbc
    # cnxn = pyodbc.connect("""SERVER=localhost;
    #                          DRIVER={SQL Server Native Client 11.0};
    #                          Trusted_Connection=yes;
    #                          autocommit=True""")
    #
    # df = pd.read_sql(
    #     sql="""SELECT
    #            *
    #            FROM [SAM].[dbo].[HCPyDiabetesClinical]""",
    #     con=cnxn)
    #
    # # Set None string to be None type
    # df.replace(['None'],[None],inplace=True)

    # Inspect what was loaded: first rows, then the inferred column types
    print(df.head())
    print(df.dtypes)

    # Remove columns that won't help machine learning
    df = df.drop(['PatientID', 'InTestWindowFLG'], axis=1)

    # Step 1: compare two models
    model = DevelopSupervisedModel(modeltype='classification',
                                   df=df,
                                   predictedcol='ThirtyDayReadmitFLG',
                                   graincol='PatientEncounterID',  # OPTIONAL
                                   impute=True,
                                   debug=False)

    # Linear model first, then the random forest with tuning enabled
    model.linear(cores=1)
    model.random_forest(cores=1, tune=True)

    # RF feature importance rankings
    model.plot_rffeature_importance(save=False)

    # ROC plot comparing the two models
    model.plot_roc(debug=False, save=False)

    elapsed = time.time() - started
    print('\nTime:\n', elapsed)
Esempio n. 3
0
    def setUp(self):
        """Build a classification model and run its linear model.

        Reads the fixture CSV (the literal string 'None' is parsed as a
        missing value), drops the ``PatientID`` and ``InTestWindowFLG``
        columns, seeds NumPy's global RNG, stores the model on ``self.o``
        and calls ``linear`` on it with a single core.
        """
        csv_path = fixture('HCPyDiabetesClinical.csv')
        frame = pd.read_csv(csv_path, na_values=['None'])

        # These two columns are treated as uninformative and removed.
        frame = frame.drop(['PatientID', 'InTestWindowFLG'], axis=1)

        # Seed immediately before constructing and running the model.
        np.random.seed(42)
        model = DevelopSupervisedModel(
            modeltype='classification',
            df=frame,
            predictedcol='ThirtyDayReadmitFLG',
            impute=True,
        )
        self.o = model
        self.o.linear(cores=1)
    def setUp(self):
        """Build a classification model and run its random forest.

        Reads the sample-data fixture CSV (the literal string 'None' is
        parsed as a missing value), drops the ``PatientID`` and
        ``InTestWindowFLG`` columns, seeds NumPy's global RNG, stores the
        model on ``self.o`` and calls ``random_forest`` with a single core.
        """
        csv_path = fixture('DiabetesClinicalSampleData.csv')
        frame = pd.read_csv(csv_path, na_values=['None'])

        # These two columns are treated as uninformative and removed.
        frame = frame.drop(['PatientID', 'InTestWindowFLG'], axis=1)

        # Seed immediately before constructing and running the model.
        np.random.seed(42)
        model = DevelopSupervisedModel(
            modeltype='classification',
            df=frame,
            predictedcol='ThirtyDayReadmitFLG',
            impute=True,
        )
        self.o = model
        self.o.random_forest(cores=1)
Esempio n. 5
0
class TestLinearDevTuneFalse(unittest.TestCase):
    """Check the AU ROC reported after running the linear model."""

    def setUp(self):
        """Load the fixture data, build the model and run ``linear``."""
        csv_path = fixture('HCPyDiabetesClinical.csv')
        frame = pd.read_csv(csv_path, na_values=['None'])

        # These two columns are treated as uninformative and removed.
        frame = frame.drop(['PatientID', 'InTestWindowFLG'], axis=1)

        # Seed immediately before constructing and running the model.
        np.random.seed(42)
        self.o = DevelopSupervisedModel(
            modeltype='classification',
            df=frame,
            predictedcol='ThirtyDayReadmitFLG',
            impute=True,
        )
        self.o.linear(cores=1)

    def runTest(self):
        """Compare ``au_roc``, rounded to six decimals, with the pinned value."""
        rounded_auc = np.round(self.o.au_roc, 6)
        self.assertAlmostEqual(rounded_auc, 0.672075)

    def tearDown(self):
        """Drop the reference to the model built in ``setUp``."""
        del self.o
class TestRFDevTuneFalse(unittest.TestCase):
    """Check the AU ROC reported after running the random forest model."""

    def setUp(self):
        """Load the sample data, build the model and run ``random_forest``."""
        csv_path = fixture('DiabetesClinicalSampleData.csv')
        frame = pd.read_csv(csv_path, na_values=['None'])

        # These two columns are treated as uninformative and removed.
        frame = frame.drop(['PatientID', 'InTestWindowFLG'], axis=1)

        # Seed immediately before constructing and running the model.
        np.random.seed(42)
        self.o = DevelopSupervisedModel(
            modeltype='classification',
            df=frame,
            predictedcol='ThirtyDayReadmitFLG',
            impute=True,
        )
        self.o.random_forest(cores=1)

    def runTest(self):
        """Compare ``au_roc``, rounded to six decimals, with the pinned value."""
        rounded_auc = np.round(self.o.au_roc, 6)
        self.assertAlmostEqual(rounded_auc, 0.965070)

    def tearDown(self):
        """Drop the reference to the model built in ``setUp``."""
        del self.o
Esempio n. 7
0
class TestRFDevTuneTrue2ColError(unittest.TestCase):
    """Expect ``ValueError`` when tuning a random forest on two features.

    The frame holds the predicted column plus ``SystolicBPNBR`` and
    ``LDLNBR`` only.
    """

    def setUp(self):
        """Build an untrained classification model on the three-column frame."""
        target = 'ThirtyDayReadmitFLG'
        wanted_columns = [target, 'SystolicBPNBR', 'LDLNBR']
        csv_path = fixture('HCPyDiabetesClinical.csv')
        frame = pd.read_csv(csv_path,
                            usecols=wanted_columns,
                            na_values=['None'])

        # Seed immediately before constructing the model.
        np.random.seed(42)
        self.o = DevelopSupervisedModel(
            modeltype='classification',
            df=frame,
            predictedcol=target,
            impute=True,
        )

    def runTest(self):
        """Running the tuned random forest must raise ``ValueError``."""
        with self.assertRaises(ValueError):
            self.o.random_forest(cores=1, tune=True)

    def tearDown(self):
        """Drop the reference to the model built in ``setUp``."""
        del self.o