def clean_notes(df_notes, args, config, client):
    """Clean each chunk of notes on the dask cluster.

    The notes are split into chunks of `args.chunk_size` rows, and each chunk
    is handed to `process_chunk_clean_row`. The worker results are discarded,
    so the worker is expected to persist its own output.
    """
    g_notes = pd_utils.split_df(df_notes, chunk_size=args.chunk_size)

    _ = dask_utils.apply_groups(g_notes,
                                client,
                                process_chunk_clean_row,
                                config,
                                progress_bar=True)

    return None
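

# The per-chunk worker dispatched above, `process_chunk_clean_row`, is defined
# elsewhere in this module. The sketch below is only an illustration of the
# contract `clean_notes` relies on: the worker receives (df_chunk, config),
# persists its own output, and returns nothing. The 'text' column and the
# 'scratch_dir' config key are assumptions made for this example.
def _example_clean_chunk(df_chunk: pd.DataFrame, config: dict) -> None:
    df_chunk = df_chunk.copy()

    # normalize case and whitespace in the note text
    df_chunk['text'] = (df_chunk['text']
        .str.lower()
        .str.replace(r'\s+', ' ', regex=True)
        .str.strip())

    # each worker writes its own shard; the first row label keeps the name unique
    out = "{}/cleaned_notes_{}.csv".format(config['scratch_dir'], df_chunk.index[0])
    df_chunk.to_csv(out, index=False)

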
def load_raw_ts_data(df_listfile, args, config, client) -> pd.DataFrame:
    """Load the raw time series data for the episodes in `df_listfile`.

    Each chunk of the listfile is processed on the cluster by `process_chunk`,
    and the resulting data frames are concatenated into a single data frame.
    """
    chunks = pd_utils.split_df(df_listfile, chunk_size=args.chunk_size)
    stacked_dfs = dask_utils.apply_groups(chunks,
                                          client,
                                          process_chunk,
                                          config,
                                          progress_bar=True)

    stacked_df = pd.concat(stacked_dfs)
    stacked_df = stacked_df.reset_index(drop=True)

    return stacked_df
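

# A hedged sketch of how these helpers might be driven end to end. The dask
# client setup uses the standard dask.distributed API; the listfile path, the
# number of workers, and the `args`/`config` contents are assumptions made for
# this example.
def _example_run_load(listfile_path: str, args, config: dict) -> pd.DataFrame:
    from dask.distributed import Client, LocalCluster

    cluster = LocalCluster(n_workers=4)
    client = Client(cluster)

    df_listfile = pd.read_csv(listfile_path)
    stacked_df = load_raw_ts_data(df_listfile, args, config, client)

    client.close()
    cluster.close()

    return stacked_df

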
def create_all_combined_records(df_episodes, args, config, client):
    """Create the combined record for each episode.

    Each chunk of episodes is processed on the cluster by
    `process_chunk_final_record`, and the per-chunk data frames are
    concatenated into a single data frame.
    """
    g_episodes = pd_utils.split_df(df_episodes, chunk_size=args.chunk_size)

    all_record_dfs = dask_utils.apply_groups(g_episodes,
                                             client,
                                             process_chunk_final_record,
                                             args,
                                             config,
                                             progress_bar=True)

    df_all_records = pd.concat(all_record_dfs)
    return df_all_records
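

# In contrast to `process_chunk_clean_row`, the worker used here must return a
# data frame so the per-chunk results can be concatenated. The sketch below
# only illustrates that contract; the real `process_chunk_final_record` and the
# fields it produces are defined elsewhere in this module, and the column added
# here is purely illustrative.
def _example_final_record_chunk(df_chunk: pd.DataFrame, args, config: dict) -> pd.DataFrame:
    df_records = df_chunk.copy()

    # attach whatever per-episode fields the final record needs
    df_records['processed'] = True

    return df_records

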
def create_bow(df_notes, args, config, client):
    """Transform the notes into bag-of-words representations.

    The previously fit count vectorizer is loaded from disk and passed, along
    with each chunk of notes, to `process_chunk_transform`. The worker results
    are discarded, so the worker is expected to persist its own output.
    """
    g_notes = pd_utils.split_df(df_notes, chunk_size=args.chunk_size)

    f = mp_filenames.get_mimic_notes_count_vectorizer_filename(
        config['analysis_basepath'])

    icv_fit_load = joblib.load(f)

    # clear the callable used for loading tokens before passing the vectorizer to the workers
    icv_fit_load.get_tokens = None

    _ = dask_utils.apply_groups(g_notes,
                                client,
                                process_chunk_transform,
                                config,
                                icv_fit_load,
                                progress_bar=True)
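

# A sketch of the transform worker's expected shape. The real
# `process_chunk_transform` is defined elsewhere in this module; this example
# assumes the vectorizer follows the usual sklearn convention (documents in,
# sparse counts out) and that the 'text' column and 'scratch_dir' config key
# exist. As above, the worker persists its own output, so nothing is returned.
def _example_transform_chunk(df_chunk: pd.DataFrame, config: dict, icv_fit) -> None:
    bow = icv_fit.transform(df_chunk['text'])

    out = "{}/bow_{}.jpkl".format(config['scratch_dir'], df_chunk.index[0])
    joblib.dump(bow, out)

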
def create_count_vectorizer(df_notes, args, config, client):
    """Fit a count vectorizer on all of the notes.

    An independent `IncrementalCountVectorizer` is fit on each chunk of notes
    on the cluster; the fit vectorizers are then merged, and the merged
    vectorizer is written to disk.
    """
    g_notes = pd_utils.split_df(df_notes, chunk_size=args.chunk_size)

    # create independent vectorizers for each group
    fit_icvs = dask_utils.apply_groups(g_notes,
                                       client,
                                       process_chunk_count_vectorizer,
                                       config,
                                       progress_bar=True)

    # merge them
    icv_fit = IncrementalCountVectorizer.merge(fit_icvs,
                                               min_df=config['min_df'],
                                               max_df=config['max_df'],
                                               get_tokens=get_tokens)

    # and write to disk
    f = mp_filenames.get_mimic_notes_count_vectorizer_filename(
        config['analysis_basepath'])

    shell_utils.ensure_path_to_file_exists(f)
    joblib.dump(icv_fit, f)
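

# A minimal sketch of reloading the merged vectorizer written above, for
# example in a downstream script. `get_tokens` is re-attached after loading
# because `create_bow` clears it before passing the vectorizer to the workers.
# This helper is illustrative and not part of the pipeline.
def _example_load_count_vectorizer(config: dict):
    f = mp_filenames.get_mimic_notes_count_vectorizer_filename(
        config['analysis_basepath'])

    icv_fit = joblib.load(f)
    icv_fit.get_tokens = get_tokens

    return icv_fit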