Example #1
0
def merge_checklists(
    summary_base: Any, sector_files: List[Any],
    stem_to_colname: Union[dict, List[str]], taxonomy: Taxonomy,
    local_translation_context: LocalTranslationContext
) -> Tuple[pd.DataFrame, List[str], List[str]]:
    # Easier to use single column summary_base, but this will transform it if needed
    if isinstance(summary_base, Path):
        template = read_excel_or_csv_path(summary_base)
        # Create a single column master for summary
        summary_base = recombine_transformed_checklist(template, taxonomy)
    elif isinstance(summary_base, pd.DataFrame):
        summary_base = summary_base

    base_has_adult_col = 'Ad' in summary_base.columns
    base_has_immature_col = 'Im' in summary_base.columns
    has_adult_col = False
    has_immature_col = False

    # Start of big processing loop
    summary = summary_base.copy()
    sector_unique = 1
    sector_cols = []
    for idx, fpath in enumerate(sector_files):
        try:
            if isinstance(fpath, Path):
                sector_col = stem_to_colname.get(fpath.stem, None)
            else:
                sector_col = stem_to_colname[idx]
        except Exception as ee:
            print(ee, idx, fpath)
            sector_col = None

        if not sector_col:
            sector_col = f'X{sector_unique}'
            sector_unique += 1

        sector_cols.append(sector_col)
        print(f'Processing {sector_col}')

        summary_common_names = summary.CommonName.values
        summary_common_names_lower = [
            xs.lower() for xs in summary_common_names
        ]
        if isinstance(fpath, Path):
            checklist = read_excel_or_csv_path(fpath)
            # Only Excel files would be double column. CSV files could be hand made,
            # so clean them up. Double translation takes a long time, so avoid when
            # possible
            if fpath.suffix == '.xlsx':
                checklist = recombine_transformed_checklist(
                    checklist, taxonomy)
            else:
                cleaned_common_names = clean_common_names(
                    checklist.CommonName, taxonomy, local_translation_context)
                checklist.CommonName = cleaned_common_names
            # print(checklist.Total)
            xdtypes = {'CommonName': str, 'Total': int}
            checklist = checklist.astype(dtype=xdtypes)

            # so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
            # summary.NACC_SORT_ORDER = pd.Series(so).fillna(taxonomy.INVALID_NACC_SORT_ORDER)

        else:  # isinstance(summary_base, pd.DataFrame):
            checklist = fpath

        # Drop any rows with a blank CommonName. This can occur if the checklist is a summary
        # report with a 'Total' row at the bottom, and 'Total' is not a valid species
        checklist = checklist[checklist.CommonName != '']

        # Sector checklists may have added species not on the template
        checklist['cnlower'] = [xs.lower() for xs in checklist.CommonName]
        checklist_common_names_lower = set(
            [xs.lower() for xs in checklist.CommonName])
        names_to_add = checklist_common_names_lower - set(
            summary_common_names_lower)
        if not names_to_add == set():
            species_to_add = taxonomy.filter_species(list(names_to_add))
            if len(species_to_add) > 0:
                print(f'Added species: {species_to_add}')
            # Fix capitalization
            names_to_add = clean_common_names(list(names_to_add), taxonomy,
                                              local_translation_context)
            blank_row = pd.Series([''] * len(summary.columns),
                                  index=summary.columns)
            rows_to_add = []
            for cn in names_to_add:
                row = blank_row.copy()
                row['CommonName'] = cn
                if cn.lower() in species_to_add:
                    row['Rare'] = 'X'
                total = checklist[checklist.cnlower ==
                                  cn.lower()]['Total'].values[0]
                row[sector_col] = total
                rows_to_add.append(row)

            summary = summary.append(rows_to_add, ignore_index=True)

        #
        has_adult_col = 'Ad' in checklist.columns
        has_immature_col = 'Im' in checklist.columns

        summary[sector_col] = 0  # 'Total' field for this sector

        if has_adult_col:
            ad_col = f'Ad-{sector_col}'
            summary[ad_col] = 0

        if has_immature_col:
            im_col = f'Im-{sector_col}'
            summary[im_col] = 0

        # # S
        # # Fill in total for existing names
        # already_present_names = set(summary_common_names) & set(checklist.CommonName)
        # for cn in set(checklist.CommonName):
        #     total = checklist[checklist.CommonName == cn]['Total'].values[0]
        #     summary.loc[summary.CommonName == cn, sector_col] = total

        # print(summary.shape, len(summary_common_names_lower))
        summary_common_names_lower = [xs.lower() for xs in summary.CommonName]

        summary['cnlower'] = summary_common_names_lower
        for ix, row in checklist.iterrows():
            # if row.Total:
            #     print(row)
            total = row.FrozenTotal if 'FrozenTotal' in checklist.columns else row.Total
            mask = summary.cnlower == row.cnlower
            summary.loc[mask, sector_col] = total

        summary.drop(['cnlower'], axis=1, inplace=True)
        #     if has_adult_col:
        #         adult_total = checklist[checklist.CommonName == cn]['Ad'].values[0]
        #         summary.loc[summary.CommonName == cn, ad_col] = adult_total
        #
        #     if has_immature_col:
        #         immature_total = checklist[checklist.CommonName == cn]['Im'].values[0]
        #         summary.loc[summary.CommonName == cn, im_col] = immature_total

    # Fill in zeros for missing sector_col values; may have blanks if species added
    # for col in sector_cols:
    #     summary[col] = summary[col].apply(pd.to_numeric).fillna(0)

    # Do sums for Ad/Im columns. Ad == 'Adult/White'
    if base_has_adult_col:
        ad_cols = [xs for xs in summary.columns if xs.startswith('Ad-')]
        summary['Ad'] = summary[ad_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    if base_has_immature_col:
        im_cols = [xs for xs in summary.columns if xs.startswith('Im-')]
        summary['Im'] = summary[im_cols].apply(
            pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    # Look up Group and TaxonOrder for anything missing these (may have been added species)

    for idx, row in summary.iterrows():
        record = taxonomy.find_local_name_row(row['CommonName'])
        if record is not None:
            summary.at[idx, 'TaxonOrder'] = record.TAXON_ORDER
            summary.at[idx, 'Group'] = record.SPECIES_GROUP
            so = record.NACC_SORT_ORDER if record.NACC_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'NACC_SORT_ORDER'] = so
            so = record.ABA_SORT_ORDER if record.ABA_SORT_ORDER != 0 else \
                taxonomy.INVALID_NACC_SORT_ORDER
            summary.at[idx, 'ABA_SORT_ORDER'] = so
            summary.at[idx, 'Category'] = record.Category

    # Re-sort by TaxonOrder
    # Must sort before creating formulae for Total
    so = pd.to_numeric(summary.NACC_SORT_ORDER, errors='coerce')
    summary.NACC_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)
    so = pd.to_numeric(summary.ABA_SORT_ORDER, errors='coerce')
    summary.ABA_SORT_ORDER = pd.Series(so).fillna(
        taxonomy.INVALID_NACC_SORT_ORDER)

    try:
        summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(
            drop=True)
    except TypeError as te:
        print(te)
        traceback.print_exc(file=sys.stdout)
        return summary

    # Now set the overall total field:
    #     sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    # summary['Total'] = summary[sector_cols].apply(pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    col_letters = excel_columns()
    #     team_start_col = col_letters[len(base_columns)]
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    # Filter out any missing columns
    std_columns = [col for col in std_columns if col in summary.columns]
    # team_start_col = col_letters[index_of_first_subtotal_column(summary)]
    sector_start_col = col_letters[len(std_columns)]
    sector_end_col = col_letters[len(summary.columns) - 1]
    total_formula = [
        f'=SUM(${sector_start_col}{ix}:${sector_end_col}{ix})'
        for ix in range(2, summary.shape[0] + 2)
    ]
    summary['Total'] = total_formula

    # Add last row for Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[sector_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    summary = summary.append(totals_row, ignore_index=True)

    cols_to_drop = [
        col for col in summary.columns
        if (col.startswith('Ad-') or col.startswith('Im-'))
    ]
    summary.drop(labels=cols_to_drop, axis=1, inplace=True)

    summary.rename(columns={
        'Ad': 'Adult/White',
        'Im': 'Immature/Blue'
    },
                   inplace=True)

    # Re-order columns
    # print(sector_cols)
    # print(summary.columns)

    new_col_order = [
        col for col in [
            'Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
            'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
        ] if col in summary.columns
    ]
    new_col_order.extend(sector_cols)
    summary = summary[new_col_order]

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph'
    ]

    if 'Adult/White' in summary.columns:
        if summary['Adult/White'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Adult/White')
    if 'Immature/Blue' in summary.columns:
        if summary['Immature/Blue'].apply(pd.to_numeric).fillna(0).sum() == 0:
            cols_to_hide.append('Immature/Blue')

    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    return summary, cols_to_hide, cols_to_highlight