Ejemplo n.º 1
0
def context_sources_to_dfs(excel_dirpath, context_sources=CONTEXT_SOURCES):
    """Collects the data frames for the configured context sources.

    Every Excel file listed for ``excel_dirpath`` is read, and each sheet
    whose key is also a key of ``context_sources`` is kept. When more than
    one file provides the same key, the file read last wins.

    :param excel_dirpath: Location passed to ``list_excel_files``.
    :param context_sources: Mapping whose keys name the sheets to collect;
        only the keys are used here.
    :return: Dict of source key -> loaded sheet, for the keys that were found.
    """
    found_dfs = {}
    for filepath in list_excel_files(excel_dirpath):
        sheet_dfs = read_excel_to_dataframes(filepath)
        for key in context_sources.keys():
            if key in sheet_dfs:
                found_dfs[key] = sheet_dfs[key]
    return found_dfs
Ejemplo n.º 2
0
def prepare_media_links_df(excel_dirpath, project_uuid, all_contexts_df):
    """Builds the media link dataframe from the 'Media' Excel file(s).

    Only files whose path contains the text 'Media' are read. If several
    files qualify, the result computed from the last one is returned; if no
    file qualifies, the return value is None.

    :param excel_dirpath: Location passed to ``list_excel_files``.
    :param project_uuid: Passed through to ``prepare_media_links_from_dfs``.
    :param all_contexts_df: Passed through to
        ``prepare_media_links_from_dfs``.
    :return: Whatever ``prepare_media_links_from_dfs`` returned for the last
        matching file, or None.
    """
    links_df = None
    for filepath in list_excel_files(excel_dirpath):
        if 'Media' in filepath:
            sheet_dfs = read_excel_to_dataframes(filepath)
            links_df = prepare_media_links_from_dfs(
                project_uuid,
                sheet_dfs,
                all_contexts_df,
            )
    return links_df
Ejemplo n.º 3
0
def prepare_catalog(project_uuid, excel_dirpath):
    """Loads the 'Catalog' Excel file(s) and cleans the attributes sheet.

    Only files whose path contains the text 'Catalog' are read. In each one
    the ``CATALOG_ATTRIBUTES_SHEET`` sheet is passed through
    ``drop_empty_cols``, ``update_multivalue_columns`` and
    ``clean_up_multivalue_cols`` (in that order) and stored back under the
    same key. If several files qualify, the sheets of the last one are
    returned.

    :param project_uuid: Not used by this function.
    :param excel_dirpath: Location passed to ``list_excel_files``.
    :return: The sheets of the last matching file with the cleaned attributes
        sheet, or None when no file matched.
    """
    catalog_dfs = None
    for filepath in list_excel_files(excel_dirpath):
        if 'Catalog' in filepath:
            catalog_dfs = read_excel_to_dataframes(filepath)
            catalog_dfs[CATALOG_ATTRIBUTES_SHEET] = clean_up_multivalue_cols(
                update_multivalue_columns(
                    drop_empty_cols(catalog_dfs[CATALOG_ATTRIBUTES_SHEET])
                )
            )
    return catalog_dfs
Ejemplo n.º 4
0
def prep_field_tables(excels_filepath,
                      project_uuid,
                      year,
                      field_data_preps=None):
    """Prepares main field created data tables.

    Reads every Excel file listed for ``excels_filepath``. For each sheet
    named in ``field_data_preps`` that is present in a file, the sheet is
    cleaned (empty columns dropped, multi-value columns updated and cleaned)
    and then adjusted by the optional steps enabled in that sheet's config:

    - ``child_context_cols``: run ``prepare_trench_contexts``.
    - ``tb_new_title``: add a new trench book title column.
    - ``tb_doc_type``: a ``(column, value)`` pair; the column is set to the
      value on every row.
    - ``tb_entry_year``: name of a column that is set to ``year``.
    - ``tb_doc_type_root``: set ``subject_uuid_source`` and run
      ``add_trench_book_parents``.

    :param excels_filepath: Location passed to ``list_excel_files``.
    :param project_uuid: Passed to ``add_trench_book_parents``.
    :param year: Year used for trench contexts and trench book entries.
    :param field_data_preps: Mapping of sheet name -> config dict. Defaults
        to ``FIELD_DATA_PREPS``. It is not modified by this function.
    :return: Dict of sheet name -> copy of that sheet's config with an added
        ``'dfs'`` key holding the sheets of the file it was read from (with
        the prepared sheet stored under the sheet name). If several files
        contain the same sheet, the last file read wins.
    """
    if field_data_preps is None:
        field_data_preps = FIELD_DATA_PREPS
    field_config_dfs = {}
    for excel_filepath in list_excel_files(excels_filepath):
        dfs = read_excel_to_dataframes(excel_filepath)
        for act_sheet, config in field_data_preps.items():
            if act_sheet not in dfs:
                # Not applicable.
                continue
            df_f = drop_empty_cols(dfs[act_sheet])
            df_f = update_multivalue_columns(df_f)
            df_f = clean_up_multivalue_cols(
                df_f, skip_cols=SKIP_MULTI_VALUE_REDACTIONS)
            if 'child_context_cols' in config:
                df_f = prepare_trench_contexts(
                    df_f,
                    year,
                    child_context_cols=config['child_context_cols'])
            if config.get('tb_new_title') is not None:
                # Do a Trench book specific change, making a new
                # title column.
                df_f = add_make_new_trench_book_title_column(
                    df_f, config['tb_new_title'])
            if config.get('tb_doc_type') is not None:
                # Every row of this sheet gets the same document type
                # value in the configured column.
                doc_type_col, doc_type = config.get('tb_doc_type')
                df_f[doc_type_col] = doc_type
            if config.get('tb_entry_year') is not None:
                # Add the Trench Book entry year.
                entry_year_col = config.get('tb_entry_year')
                df_f[entry_year_col] = year
            if config.get('tb_doc_type_root') is not None:
                df_f['subject_uuid_source'] = UUID_SOURCE_KOBOTOOLBOX
                df_f = add_trench_book_parents(df_f, project_uuid, year,
                                               config)
            dfs[act_sheet] = df_f
            # Attach the loaded sheets to a shallow copy so the caller's
            # config (by default the module-level FIELD_DATA_PREPS) is not
            # mutated and does not keep data frames alive between calls.
            sheet_config = dict(config)
            sheet_config['dfs'] = dfs
            field_config_dfs[act_sheet] = sheet_config
    return field_config_dfs
Ejemplo n.º 5
0
def make_all_export_media_df(excels_dirpath,
                             media_cols_endswith=None,
                             new_file_prefixes=None):
    """Make a dataframe of all media in all export files.

    Each Excel file listed for ``excels_dirpath`` is read and turned into a
    media dataframe by ``make_dfs_media_df``; files for which that returns
    None are skipped. Every media dataframe gets a ``source_file`` column
    (the Excel file's base name) and a ``new_filename`` column (``filename``
    passed through ``revise_filename``). For each entry of
    ``new_file_prefixes`` whose key the Excel file name starts with, the
    prefix is prepended to ``new_filename`` and, when
    ``MEDIA_SOURCE_COMPOSITION_TYPES`` has a truthy value for that key, it is
    written to the 'Type of Composition Subject' column.

    The per-file frames are concatenated, rows without a ``new_filename``
    are removed, and duplicates on ``new_filename`` are dropped.

    :param excels_dirpath: Location passed to ``list_excel_files``.
    :param media_cols_endswith: Passed through to ``make_dfs_media_df``.
    :param new_file_prefixes: Mapping of Excel file name start -> filename
        prefix. Defaults to ``MEDIA_SOURCE_FILE_PREFIXS``.
    :return: The combined media dataframe, or None when no media was found.
    :raises RuntimeError: If, after de-duplication, the number of distinct
        ``filename`` or ``new_filename`` values differs from the row count.
    """
    if new_file_prefixes is None:
        new_file_prefixes = MEDIA_SOURCE_FILE_PREFIXS
    df_all_media_list = []
    for excel_filepath in list_excel_files(excels_dirpath):
        excel_file = os.path.basename(excel_filepath)
        dfs = read_excel_to_dataframes(excel_filepath)
        df_media = make_dfs_media_df(dfs,
                                     media_cols_endswith=media_cols_endswith)
        if df_media is None:
            continue
        df_media['source_file'] = excel_file
        df_media['new_filename'] = df_media['filename'].apply(revise_filename)
        for file_start, prefix in new_file_prefixes.items():
            if not excel_file.startswith(file_start):
                continue
            df_media['new_filename'] = prefix + df_media['new_filename']
            composition_type = MEDIA_SOURCE_COMPOSITION_TYPES.get(file_start)
            if composition_type:
                df_media['Type of Composition Subject'] = composition_type
        df_all_media_list.append(df_media)
    if not df_all_media_list:
        return None
    df_all_media = pd.concat(df_all_media_list)
    if df_all_media.empty:
        return None
    df_all_media = df_all_media[df_all_media['new_filename'].notnull()]
    # Reassign rather than drop in place: the frame above is derived by
    # filtering, and in-place edits on such frames can raise pandas'
    # SettingWithCopyWarning.
    df_all_media = df_all_media.drop_duplicates(subset=['new_filename'])
    expected_len = len(df_all_media.index)
    filename_count = len(df_all_media['filename'].unique())
    new_filename_count = len(df_all_media['new_filename'].unique())
    # After de-duplication each row must have its own filename and its own
    # new filename; anything else means two sources collided.
    if new_filename_count != expected_len or filename_count != expected_len:
        raise RuntimeError(
            'Expected {}, but have {} filenames, and {} new-filenames'.format(
                expected_len, filename_count, new_filename_count))
    return df_all_media