def save_pop(data_df,
             outPath,
             static_filename,
             pop_size_int,
             static_data_schema,
             host=None):
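    """ Save the static population table to disk

    Sanitizes data_df against static_data_schema and writes it to
    outPath/static_filename as a CSV. pop_size_int and host appear to be
    unused here, presumably kept for signature parity with the other
    save_* helpers.
    """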
    # Serialize the sanitized static population table to disk
    csv_fpath = os.path.join(outPath, static_filename)
    save_sanitized_df_to_csv(csv_fpath, data_df, static_data_schema)

    return data_df
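# Hypothetical usage sketch (paths and variable names are illustrative only):
#   static_df = save_pop(static_df, '/tmp/mimic_out', 'static_data.csv',
#                        pop_size, static_data_schema)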
def save_outcome(data,
                 dbname,
                 schema_name,
                 outPath,
                 outcome_filename,
                 outcome_hd5_filename,
                 outcome_columns_filename,
                 outcome_schema,
                 host=None,
                 password=None):
    """ Retrieve outcomes from DB and save to disk

    The ventilation and vasopressor duration tables are already in the DB, so
    pull the start and stop times from there! :)

    Returns
    -------
    Y : Pandas dataframe
        Obeys the outcomes data spec
    """
    icuids_to_keep = get_values_by_name_from_df_column_or_index(
        data, 'icustay_id')
    icuids_to_keep = {str(s) for s in icuids_to_keep}
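    # IDs are stringified so they can be interpolated directly into the SQL
    # "in (...)" clauses below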

    # Re-index by icustay_id and parse the timestamps so that intime and
    # outtime can be subtracted directly
    data = data.reset_index()
    data = data.set_index('icustay_id')
    data['intime'] = pd.to_datetime(data['intime'])
    data['outtime'] = pd.to_datetime(data['outtime'])
    icustay_timediff_tmp = data['outtime'] - data['intime']
    # Length of each stay in whole hours, indexed by icustay_id
    icustay_timediff = pd.Series(
        [td.days * 24 + td.seconds // 3600 for td in icustay_timediff_tmp],
        index=data.index.values)
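    # e.g. a Timedelta of 1 day 10:30:00 yields 1 * 24 + 10 = 34 whole hours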

    # Set up access to the PSQL db
    query_args = {'dbname': dbname}
    if host is not None:
        query_args['host'] = host
    if password is not None:
        query_args['password'] = password
    con = psycopg2.connect(**query_args)
    cur = con.cursor()

    # Query on ventilation data
    cur.execute('SET search_path to ' + schema_name)
    query = """
    select i.subject_id, i.hadm_id, v.icustay_id, v.ventnum, v.starttime, v.endtime
    FROM icustay_detail i
    INNER JOIN ventdurations v ON i.icustay_id = v.icustay_id
    where v.icustay_id in ({icuids})
    and v.starttime between i.intime and i.outtime
    and v.endtime between i.intime and i.outtime;
    """.format(icuids=','.join(icuids_to_keep))
    vent_data = pd.read_sql_query(query, con)
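    # continuous_outcome_processing and add_outcome_indicators (defined
    # elsewhere in this module) are assumed to expand each [starttime,
    # endtime] interval into one row per hour of the stay with a 0/1 'on'
    # indicator, which is then renamed to the intervention's column name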
    vent_data = continuous_outcome_processing(vent_data, data,
                                              icustay_timediff)
    vent_data = vent_data.apply(add_outcome_indicators)
    vent_data.rename(columns={'on': 'vent'}, inplace=True)
    vent_data = vent_data.reset_index()

    # Get the patients without the intervention too, so they are included
    # with all-zero indicator columns
    ids_with = vent_data['icustay_id']
    ids_with = set(map(int, ids_with))
    ids_all = set(map(int, icuids_to_keep))
    ids_without = (ids_all - ids_with)

    # Create a placeholder dataframe with all-zero vent entries for those stays
    out_data = data.copy(deep=True)
    out_data = out_data.reset_index()
    out_data = out_data.set_index('icustay_id')
    out_data = out_data.loc[out_data.index.isin(ids_without)]
    out_data = out_data.reset_index()
    out_data = out_data[['subject_id', 'hadm_id', 'icustay_id']]
    out_data['max_hours'] = out_data['icustay_id'].map(icustay_timediff)

    # Create all 0 column for vent
    out_data = out_data.groupby('icustay_id')
    out_data = out_data.apply(add_blank_indicators)
    out_data.rename(columns={'on': 'vent'}, inplace=True)
    out_data = out_data.reset_index()

    # Concatenate all the data vertically
    Y = pd.concat([
        vent_data[['subject_id', 'hadm_id', 'icustay_id', 'hours_in', 'vent']],
        out_data[['subject_id', 'hadm_id', 'icustay_id', 'hours_in', 'vent']]
    ],
                  axis=0)
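    # Y now holds one row per (subject_id, hadm_id, icustay_id, hours_in)
    # across the whole population, with vent = 1 during ventilation hours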

    # Start merging all other interventions
    table_names = [
        'vasopressordurations', 'adenosinedurations', 'dobutaminedurations',
        'dopaminedurations', 'epinephrinedurations', 'isupreldurations',
        'milrinonedurations', 'norepinephrinedurations',
        'phenylephrinedurations', 'vasopressindurations'
    ]
    column_names = [
        'vaso', 'adenosine', 'dobutamine', 'dopamine', 'epinephrine',
        'isuprel', 'milrinone', 'norepinephrine', 'phenylephrine',
        'vasopressin'
    ]

    # TODO(mmd): This section doesn't work. What is its purpose?
    for t, c in zip(table_names, column_names):
        # TOTAL VASOPRESSOR DATA
        cur.execute('SET search_path to ' + schema_name)
        query = """
        select i.subject_id, i.hadm_id, v.icustay_id, v.vasonum, v.starttime, v.endtime
        FROM icustay_detail i
        INNER JOIN {table} v ON i.icustay_id = v.icustay_id
        where v.icustay_id in ({icuids})
        and v.starttime between i.intime and i.outtime
        and v.endtime between i.intime and i.outtime;
        """.format(icuids=','.join(icuids_to_keep), table=t)
        new_data = pd.read_sql_query(query, con)
        new_data = continuous_outcome_processing(new_data, data,
                                                 icustay_timediff)
        new_data = new_data.apply(add_outcome_indicators)
        new_data.rename(columns={'on': c}, inplace=True)
        new_data = new_data.reset_index()
        # c may be missing from new_data if we are only extracting a subset
        # of the population in which c was never performed
        if c not in new_data:
            print('Column {} not in data.'.format(c))
            continue

        Y = Y.merge(
            new_data[['subject_id', 'hadm_id', 'icustay_id', 'hours_in', c]],
            on=['subject_id', 'hadm_id', 'icustay_id', 'hours_in'],
            how='left')

        # Fill stays without this intervention with 0 and cast to int
        Y.fillna(0, inplace=True)
        Y[c] = Y[c].astype(int)
        Y = Y.reset_index(drop=True)
        print('Extracted {} from {}'.format(c, t))

    tasks = ["colloid_bolus", "crystalloid_bolus", "nivdurations"]
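    # colloid_bolus and crystalloid_bolus are point events recorded at a
    # single charttime, so their query aliases charttime as both starttime
    # and endtime; nivdurations carries real start/end intervals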

    for task in tasks:
        cur.execute('SET search_path to ' + schema_name)
        if task == 'nivdurations':
            query = """
            select i.subject_id, i.hadm_id, v.icustay_id, v.starttime, v.endtime
            FROM icustay_detail i
            INNER JOIN {table} v ON i.icustay_id = v.icustay_id
            where v.icustay_id in ({icuids})
            and v.starttime between i.intime and i.outtime
            and v.endtime between i.intime and i.outtime;
            """.format(icuids=','.join(icuids_to_keep), table=task)
        else:
            query = """
            select i.subject_id, i.hadm_id, v.icustay_id, v.charttime AS starttime, 
                   v.charttime AS endtime
            FROM icustay_detail i
            INNER JOIN {table} v ON i.icustay_id = v.icustay_id
            where v.icustay_id in ({icuids})
            and v.charttime between intime and outtime
            """.format(icuids=','.join(icuids_to_keep), table=task)
        new_data = pd.read_sql_query(query, con=con)
        if new_data.shape[0] == 0:
            continue
        new_data = continuous_outcome_processing(new_data, data,
                                                 icustay_timediff)
        new_data = new_data.apply(add_outcome_indicators)
        new_data.rename(columns={'on': task}, inplace=True)
        new_data = new_data.reset_index()
        Y = Y.merge(
            new_data[['subject_id', 'hadm_id', 'icustay_id', 'hours_in',
                      task]],
            on=['subject_id', 'hadm_id', 'icustay_id', 'hours_in'],
            how='left')

        # Fill stays without this task with 0 and cast to int
        Y.fillna(0, inplace=True)
        Y[task] = Y[task].astype(int)
        Y = Y.reset_index(drop=True)
        print('Extracted ' + task)

    # TODO: ADD THE RBC/PLT/PLASMA DATA
    # TODO: ADD DIALYSIS DATA
    # TODO: ADD INFECTION DATA

    cur.close()
    con.close()

    Y = Y.filter(
        items=['subject_id', 'hadm_id', 'icustay_id', 'hours_in', 'vent'] +
        column_names + tasks)
    Y.subject_id = Y.subject_id.astype(int)
    Y.icustay_id = Y.icustay_id.astype(int)
    Y.hours_in = Y.hours_in.astype(int)
    Y.vent = Y.vent.astype(int)
    Y.vaso = Y.vaso.astype(int)
    y_id_cols = ID_COLS + ['hours_in']
    Y = Y.sort_values(y_id_cols)
    Y.set_index(y_id_cols, inplace=True)
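    # ID_COLS is assumed to be ['subject_id', 'hadm_id', 'icustay_id'], so Y
    # ends up indexed by (subject_id, hadm_id, icustay_id, hours_in)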

    print('Shape of Y : ', Y.shape)


    # Turn back into columns
    df = Y.reset_index()
    df = sanitize_df(df, outcome_schema)
    csv_fpath = os.path.join(outPath, outcome_filename)
    save_sanitized_df_to_csv(csv_fpath, df, outcome_schema)

    # Record the column names after the three ID columns
    col_names = list(df.columns.values)
    col_names = col_names[3:]
    with open(os.path.join(outPath, outcome_columns_filename), 'w') as f:
        f.write('\n'.join(col_names))

    # TODO(mmd): Why does df have the index? Is sanitize making multiindex?
    # SAVE THE DATA AS A PANDAS OBJECT
    # TODO(mike hughes): Why writing out Y after you've separately sanitized df?
    Y.to_hdf(os.path.join(outPath, outcome_hd5_filename), 'Y')
    return df