Beispiel #1
0
def prepare_for_learning(data,
                         data_struct,
                         variables_to_incl,
                         variables_to_exclude,
                         goal,
                         group_by_record=True,
                         use_outcome=None,
                         additional_fn=None,
                         use_imputation=True):
    """Build the feature matrix ``x`` and target ``y`` used for model fitting.

    Steps, in order: compute the outcomes and append them to ``data``,
    optionally collapse to the last row per ``'Record Id'``, split into
    features/target, apply the include/exclude variable selection, drop
    constant columns, optionally impute, fill the remaining gaps with 0,
    cast to float and (unless the goal is ``'survival'``) drop samples
    without a target value.

    Args:
        data: Input DataFrame; must contain a ``'Record Id'`` column.
        data_struct: Study structure description, forwarded unchanged to
            ``calculate_outcomes``, ``select_variables`` and
            ``impute_missing_values``.
        variables_to_incl: Mapping with a ``'Field Variable Name'`` entry
            listing the variables to keep. The caller's object is NOT
            modified; ``'hospital'`` is appended on a private copy.
        variables_to_exclude: Passed to ``is_in_columns`` to determine which
            columns of ``x`` are dropped.
        goal: Indexable with at least two items; ``goal[0]`` and ``goal[1]``
            are logged and ``goal[0] == 'survival'`` disables the
            missing-target filter.
        group_by_record: When true, keep only the last row per record id.
        use_outcome: Unused by this function; kept for interface
            compatibility.
        additional_fn: Unused by this function; kept for interface
            compatibility.
        use_imputation: When true, run ``impute_missing_values`` on ``x``
            before the zero fill.

    Returns:
        Tuple ``(x, y, data, hospital, records)``. ``hospital`` and
        ``records`` are taken from ``x`` before the missing-target filter,
        so they can contain rows that are no longer present in ``x``/``y``.
    """
    import copy  # function-scope: only needed for the defensive copy below

    outcomes, used_columns = calculate_outcomes(data, data_struct)
    data = pd.concat([data, outcomes], axis=1)

    # Group per record id. ``outcomes`` must be grouped first because it
    # relies on the still-ungrouped ``data['Record Id']`` as the key.
    # ``axis`` is omitted: 0 is the default and the keyword is deprecated.
    if group_by_record:
        outcomes = (outcomes.groupby(by=data['Record Id'])
                            .last()
                            .reset_index(drop=True))
        data = (data.groupby(by='Record Id')
                    .last()
                    .reset_index(drop=False))

    x, y, _ = select_x_y(data, outcomes, used_columns, goal)

    # Select variables to include in prediction. Work on a shallow copy and
    # build a NEW list so the caller's configuration does not accumulate an
    # extra 'hospital' entry on every call.
    variables_to_incl = copy.copy(variables_to_incl)
    variables_to_incl['Field Variable Name'] = \
        variables_to_incl['Field Variable Name'] + ['hospital']
    x = select_variables(x, data_struct, variables_to_incl)

    # Select variables to exclude
    x = x.drop(is_in_columns(variables_to_exclude, x), axis=1)

    # Set the identifier columns aside; they are returned separately and
    # must not be used as features.
    hospital = x.loc[:, 'hospital']
    records = x.loc[:, 'Record Id']
    x = x.drop(['hospital', 'Record Id'], axis=1)

    # Remove columns without information (at most one distinct value)
    x = x.loc[:, x.nunique() > 1]

    if use_imputation:
        x = impute_missing_values(x, data_struct)

    # Whatever is still missing becomes 0 (per the original author:
    # 0 means "missing" or "no").
    x = x.fillna(0)

    x = x.astype(float)
    print('LOG: Using <{}:{}> as y.'.format(goal[0], goal[1]))
    print('LOG: Selected {} variables for predictive model'.format(
        x.columns.size))

    # Remove samples with missing y
    if goal[0] != 'survival':
        has_y = y.notna()
        x = x.loc[has_y, :]
        y = y.loc[has_y]

    return x, y, data, hospital, records
def prepare_for_learning(data,
                         data_struct,
                         variables_to_incl,
                         goal,
                         group_by_record=True,
                         use_outcome=None,
                         additional_fn=None):
    """Build the feature matrix ``x`` and target ``y`` used for model fitting.

    Computes the outcomes, appends them to ``data``, optionally collapses to
    the last row per ``'Record Id'``, splits into features/target, keeps only
    the requested variables, drops constant columns and fills missing values
    with 0.

    Args:
        data: Input DataFrame; must contain a ``'Record Id'`` column.
        data_struct: Study structure description, forwarded unchanged to
            ``calculate_outcomes`` and ``select_variables``.
        variables_to_incl: Variable selection forwarded to
            ``select_variables``.
        goal: Forwarded to ``select_x_y`` to choose the target.
        group_by_record: When true, keep only the last row per record id.
        use_outcome: Unused by this function; kept for interface
            compatibility.
        additional_fn: Unused by this function; kept for interface
            compatibility.

    Returns:
        Tuple ``(x, y, data)``. When ``group_by_record`` is true the record
        id is dropped from ``data`` (``reset_index(drop=True)``).
    """
    # Get all outcomes and make them available as columns of ``data``
    outcomes, used_columns = calculate_outcomes(data, data_struct)
    data = pd.concat([data, outcomes], axis=1)

    # Group per record id. ``outcomes`` must be grouped first because it
    # relies on the still-ungrouped ``data['Record Id']`` as the key.
    # ``axis`` is omitted: 0 is the default and the keyword is deprecated.
    if group_by_record:
        outcomes = (outcomes.groupby(by=data['Record Id'])
                            .last()
                            .reset_index(drop=True))
        data = (data.groupby(by='Record Id')
                    .last()
                    .reset_index(drop=True))

    x, y, outcome_name = select_x_y(data, outcomes, used_columns, goal=goal)

    # Select variables to include in prediction
    x = select_variables(x, data_struct, variables_to_incl)
    # Select variables to exclude
    #       TODO: used_columns (can't include columns used to calculate the
    #             outcome) and any other columns

    # Remove columns without information (at most one distinct value)
    x = x.loc[:, x.nunique() > 1]

    # Fill missing values with 0 (per the original author, as far as known
    # 0 means "missing" or "no")
    x = x.fillna(0)

    print('LOG: Using <{}> as y.'.format(outcome_name))
    print('LOG: Selected {} variables for predictive model'.format(
        x.columns.size))

    return x, y, data
def prepare_for_learning(data, data_struct, variables_to_incl,
                         variables_to_exclude, goal,
                         group_by_record=True, use_outcome=None,
                         additional_fn=None,
                         remove_records_threshold_above=1,
                         remove_features_threshold_above=0.5,
                         pcr_corona_confirmed_only=True):
    """Build the feature matrix ``x`` and target ``y`` used for model fitting.

    Steps, in order: compute the outcomes and append them to ``data``,
    optionally collapse to the last row per ``'Record Id'``, split into
    features/target, drop samples without a target (unless the goal is
    ``'survival'``), optionally restrict to confirmed patients, apply the
    include/exclude variable selection, drop features and records with too
    many missing values, merge small hospitals into one pseudo-center,
    anonymise hospital names and drop constant columns.

    Args:
        data: Input DataFrame; must contain a ``'Record Id'`` column and,
            when ``pcr_corona_confirmed_only`` is true, the columns
            ``'Coronavirus'``, ``'pcr_pos'``, ``'corads_admission_cat_4'``
            and ``'corads_admission_cat_5'``.
        data_struct: Study structure description, forwarded unchanged to
            ``calculate_outcomes`` and ``select_variables``.
        variables_to_incl: Mapping with a ``'Field Variable Name'`` entry
            listing the variables to keep. The caller's object is NOT
            modified; ``'hospital'`` is appended on a private copy.
        variables_to_exclude: Variables to drop. The caller's object is NOT
            modified; ``'Coronavirus'`` and ``'pcr_pos'`` are appended on a
            new list.
        goal: Indexable with at least two items; ``goal[0]`` and ``goal[1]``
            are logged and ``goal[0] == 'survival'`` disables the
            missing-target filter.
        group_by_record: When true, keep only the last row per record id.
        use_outcome: Unused by this function; kept for interface
            compatibility.
        additional_fn: Unused by this function; kept for interface
            compatibility.
        remove_records_threshold_above: Drop records whose fraction of
            missing features is strictly above this value; ``None`` skips
            the step. With the default of 1 no record can exceed the
            threshold.
        remove_features_threshold_above: Drop features whose fraction of
            missing values is strictly above this value; ``None`` skips the
            step.
        pcr_corona_confirmed_only: When true, keep only rows flagged by the
            confirmation mask described in the body.

    Returns:
        Tuple ``(x, y, data, hospital, records, days_until_death)``.
        ``hospital`` and ``records`` are taken from ``x`` after all row
        filters. ``days_until_death`` is computed before the missing-value
        filters and is restricted to rows with an outcome in the selected
        time frame, so it is not necessarily row-aligned with ``x``.
    """
    import copy  # function-scope: only needed for the defensive copy below

    outcomes, used_columns = calculate_outcomes(data, data_struct)
    data = pd.concat([data, outcomes], axis=1)

    # Group per record id. ``outcomes`` must be grouped first because it
    # relies on the still-ungrouped ``data['Record Id']`` as the key.
    # ``axis`` is omitted: 0 is the default and the keyword is deprecated.
    if group_by_record:
        outcomes = (outcomes.groupby(by=data['Record Id'])
                            .last()
                            .reset_index(drop=True))
        data = (data.groupby(by='Record Id')
                    .last()
                    .reset_index(drop=False))

    x, y, _ = select_x_y(data, outcomes, used_columns, goal)

    # Remove samples with missing y
    if goal[0] != 'survival':
        has_y = y.notna()
        x = x.loc[has_y, :]
        y = y.loc[has_y]

    # Include only patients with a confirmed covid infection.
    # NOTE(review): the original comment stated that patients selected on
    # CORADS alone are excluded, yet the mask below also accepts CORADS
    # category 4 or 5. The behaviour is left unchanged - confirm which of
    # the two is intended.
    if pcr_corona_confirmed_only:
        is_confirmed_patient = (
            (data['Coronavirus'] == 1)
            | (data['pcr_pos'] == 1)
            | (data['corads_admission_cat_4'] == 1)
            | (data['corads_admission_cat_5'] == 1))
        x = x.loc[is_confirmed_patient, :]
        y = y.loc[is_confirmed_patient]

    # Select variables to include in prediction. Build NEW containers rather
    # than extending in place, so the caller's configuration does not grow
    # by one entry on every call.
    variables_to_incl = copy.copy(variables_to_incl)
    variables_to_incl['Field Variable Name'] = \
        variables_to_incl['Field Variable Name'] + ['hospital']
    variables_to_exclude = variables_to_exclude + ['Coronavirus', 'pcr_pos']
    x = select_variables(x, data_struct,
                         variables_to_incl,
                         variables_to_exclude)

    # Select time frame. All masks are aligned on x.index explicitly.
    days_until_death = outcomes.loc[x.index, 'Days until death'].copy()
    days_until_discharge = outcomes.loc[x.index, 'Days until discharge'].copy()
    has_death_week = (days_until_death >= 0) & (days_until_death <= 7)
    has_disch_week = (days_until_discharge >= 0) & (days_until_discharge <= 7)
    still_admitted = outcomes.loc[
        x.index, 'Levend dag 21 maar nog in het ziekenhuis - totaal'] == 1
    outcome_in_week = has_death_week | has_disch_week | still_admitted

    # Only days_until_death is restricted to the time frame; x and y are
    # intentionally left unfiltered here (that filter was disabled upstream).
    days_until_death = days_until_death[outcome_in_week]

    # Drop features with too many missing
    if remove_features_threshold_above is not None:
        threshold = remove_features_threshold_above
        has_too_many_missing = (x.isna().sum(axis=0) / x.shape[0]) > threshold
        x = x.loc[:, ~has_too_many_missing]
        print('LOG: dropped features: {}, due to more than {}% missing'
              .format(has_too_many_missing.loc[has_too_many_missing].index.to_list(),
                      threshold*100))

    # Remove records with too many missing
    if remove_records_threshold_above is not None:
        threshold = remove_records_threshold_above
        has_too_many_missing = (x.isna().sum(axis=1) / x.shape[1]) > threshold
        print('LOG: Dropped {} records, due to more than {}% missing'
              .format(has_too_many_missing.sum(), threshold*100))
        x = x.loc[~has_too_many_missing, :]
        y = y.loc[~has_too_many_missing]

    # Combine and rename hospitals: every hospital with fewer than `cutoff`
    # records is merged into a single pseudo-center.
    name_combined = 'Center com'
    cutoff = 100
    hospital = x.loc[:, 'hospital'].copy()
    counts = hospital.value_counts()
    is_small = counts < cutoff
    small_hospitals = counts[is_small].index
    # Quick hack for correct sorting: the placeholder sorts after the real
    # names, so the combined center ends up last.
    hospital.loc[hospital.isin(small_hospitals)] = 'zzzzz'
    print('LOG: Hospitals to be combined: {}'.format(list(small_hospitals)))
    print('LOG: Combined {} hospitals to a single hospital of size n={}'
          .format(is_small.sum(), counts[is_small].sum()))

    # Replace the remaining names by 'Center <i>' in sorted order; the
    # placeholder is mapped to the combined name afterwards.
    unique_hospitals = np.sort(hospital.unique())
    hosp_change_dict = dict(zip(
        unique_hospitals,
        ['Center '+str(i) for i in range(unique_hospitals.size)]))
    hosp_change_dict['zzzzz'] = name_combined
    hospital = hospital.replace(hosp_change_dict)

    # Set the identifier columns aside and remove columns without
    # information (at most one distinct value)
    records = x.loc[:, 'Record Id']
    x = x.drop(['hospital', 'Record Id'], axis=1)
    x = x.loc[:, x.nunique() > 1]

    print('LOG: Using <{}:{}> as y.'.format(goal[0], goal[1]))
    print('LOG: Selected {} variables for predictive model'
          .format(x.columns.size))

    explore_data(x, y)

    return x, y, data, hospital, records, days_until_death