Esempio n. 1
0
def get_final_ds(data_patient, data_pcr, path_referential):
    """Build the final dataset by joining cleaned PCR results with patients.

    The patient data is deduplicated through ``script_dd.detect_duplicates``,
    the PCR data is cleaned through ``get_cleaned_df_pcr``, and both are
    left-joined on ``patient_id`` with the PCR rows on the left side. Rows
    that are not usable for the statistical analysis are then dropped.

    Args:
        data_patient: Raw patient data, forwarded unchanged to
            ``script_dd.detect_duplicates``.
        data_pcr: Raw Covid-19 PCR test data, forwarded unchanged to
            ``get_cleaned_df_pcr``.
        path_referential: Forwarded unchanged to
            ``script_dd.detect_duplicates``.

    Returns:
        The merged DataFrame restricted to rows where ``patient_id``,
        ``date_of_birth`` and ``postcode`` are not null, ``postcode`` is not
        0, and ``state`` is not ``"UNKNOWN"``.
    """
    # Patient data frame after removing duplicated patients.
    df_patient_dedup = script_dd.detect_duplicates(data_patient,
                                                   path_referential)

    # PCR data frame after the data-cleaning step.
    df_cleaned_pcr = get_cleaned_df_pcr(data_pcr)

    # Attach the (deduplicated) patient attributes to every PCR result.
    df_final_ds = pd.merge(df_cleaned_pcr,
                           df_patient_dedup,
                           on='patient_id',
                           how='left')

    # To keep the statistical analysis reliable, drop every row with a
    # missing or invalid patient_id, date_of_birth or postcode, as well as
    # rows whose state is unknown.
    # The comparisons MUST be parenthesised: `&` binds tighter than `!=`, so
    # `a & b & s != 0` would be evaluated as `(a & b & s) != 0`.
    is_usable = (
        df_final_ds['patient_id'].notnull()
        & df_final_ds['date_of_birth'].notnull()
        & df_final_ds['postcode'].notnull()
        & (df_final_ds['postcode'] != 0)
        & (df_final_ds['state'] != "UNKNOWN")
    )

    return df_final_ds[is_usable]
def test_get_corrected_age_wdob():
    """Rows without a usable date_of_birth must come back with NaT and NaN age.

    The first row has a missing date_of_birth; the second carries 19451796,
    which presumably fails date parsing (month "17" if read as YYYYMMDD).
    """
    columns = ['patient_id', 'postcode', 'state', 'date_of_birth', 'age']
    input_rows = [
        [378167, 2428, 'NSW', np.nan, 31],
        [427069, 6000, 'WA', 19451796, 75],
    ]
    expected_rows = [
        [378167, 2428, 'NSW', np.datetime64('NaT'), np.nan],
        [427069, 6000, 'WA', np.datetime64('NaT'), np.nan],
    ]

    patients = pd.DataFrame(input_rows, columns=columns)
    actual = detect_duplicates(patients, path_referential)
    expected = pd.DataFrame(expected_rows, columns=columns)

    # Indexes are reset on both sides so only the row content is compared.
    assert_frame_equal(
        left=expected.reset_index(drop=True),
        right=actual.reset_index(drop=True),
        check_dtype=False,
    )
def test_delete_duplicates_nv():
    """Rows whose patient_id is NaN or None must be removed from the result."""
    columns = ['patient_id', 'postcode', 'state', 'date_of_birth', 'age']
    input_rows = [
        [104136, 812, 'NT', 19801222, 40],
        [np.nan, 4221, 'QLD', 19771201, 43],
        [None, 2087, 'NSW', 19330815, 87],
    ]
    # Only the row with a real patient_id is expected to survive.
    expected_rows = [
        [104136, 812, 'NT', datetime.datetime(1980, 12, 22), 40],
    ]

    patients = pd.DataFrame(input_rows, columns=columns)
    actual = detect_duplicates(patients, path_referential)
    expected = pd.DataFrame(expected_rows, columns=columns)

    assert_frame_equal(
        left=expected.reset_index(drop=True),
        right=actual.reset_index(drop=True),
        check_dtype=False,
    )
def test_get_corrected_age():
    """A missing, empty or inconsistent age must be replaced by a corrected one.

    NOTE(review): the expected ages (39, 114, 117) match the birth dates only
    for a reference year of 2020 -- confirm how detect_duplicates picks its
    reference date, otherwise this test may be time-dependent.
    """
    columns = ['patient_id', 'postcode', 'state', 'date_of_birth', 'age']
    input_rows = [
        [100064, 4208, 'QLD', 19810905.0, np.nan],
        [100215, 6107, 'WA', 19061018.0, ''],
        [100363, 3029, 'VIC', 19030606.0, 32],
    ]
    expected_rows = [
        [100064, 4208, 'QLD', datetime.datetime(1981, 9, 5), 39],
        [100215, 6107, 'WA', datetime.datetime(1906, 10, 18), 114],
        [100363, 3029, 'VIC', datetime.datetime(1903, 6, 6), 117],
    ]

    patients = pd.DataFrame(input_rows, columns=columns)
    actual = detect_duplicates(patients, path_referential)
    expected = pd.DataFrame(expected_rows, columns=columns)

    assert_frame_equal(
        left=expected.reset_index(drop=True),
        right=actual.reset_index(drop=True),
        check_dtype=False,
    )
def test_get_corrected_state():
    """A wrong, blank or missing state must be replaced by a corrected one.

    The expected states ('WA', 'NSW', 'SA') differ from the input values
    ('qld', ' ', NaN); presumably they are looked up from the postcode via
    the referential -- verify against detect_duplicates.
    """
    columns = ['patient_id', 'postcode', 'state', 'date_of_birth', 'age']
    input_rows = [
        [100390, 6155, 'qld', 19160912, 104],
        [100559, 2400, ' ', 19570220, 63],
        [100901, 5333, np.nan, 19750207, 45],
    ]
    expected_rows = [
        [100390, 6155, 'WA', datetime.datetime(1916, 9, 12), 104],
        [100559, 2400, 'NSW', datetime.datetime(1957, 2, 20), 63],
        [100901, 5333, 'SA', datetime.datetime(1975, 2, 7), 45],
    ]

    patients = pd.DataFrame(input_rows, columns=columns)
    actual = detect_duplicates(patients, path_referential)
    expected = pd.DataFrame(expected_rows, columns=columns)

    assert_frame_equal(
        left=expected.reset_index(drop=True),
        right=actual.reset_index(drop=True),
        check_dtype=False,
    )
def test_delete_duplicates():
    """Three rows sharing one patient_id must collapse into one clean row.

    The inputs mix a truncated state ('QL'), junk states, a 'null' postcode,
    a string postcode, an invalid date and a non-numeric age; the single
    expected row carries the cleaned values.
    """
    columns = ['patient_id', 'postcode', 'state', 'date_of_birth', 'age']
    input_rows = [
        [771155, 4210, 'QL', 19790108, np.nan],
        [771155, 'null', 'nsss', 19991892.0, 'age of the patient'],
        [771155, '4210', 'nsss', np.nan, 32],
    ]
    expected_rows = [
        [771155, 4210, 'QLD', datetime.datetime(1979, 1, 8), 41],
    ]

    patients = pd.DataFrame(input_rows, columns=columns)
    actual = detect_duplicates(patients, path_referential)
    expected = pd.DataFrame(expected_rows, columns=columns)

    # check_dtype=False: only values are compared, not column dtypes.
    assert_frame_equal(
        left=expected.reset_index(drop=True),
        right=actual.reset_index(drop=True),
        check_dtype=False,
    )