def write_basic_spreadsheet(df: pd.DataFrame, fpath: Path,
                            column_widths: dict,
                            columns_to_center: List[str]):
    if df.empty:
        return None

    df = df.fillna('')
    default_column_width = 14  # if not specified in column_widths
    xsheet_name = 'Basic'

    with pd.ExcelWriter(fpath.as_posix(), engine='xlsxwriter') as writer:
        df.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        worksheet = writer.sheets[xsheet_name]
        xlfmts = add_workbook_formats(workbook)

        # https://stackoverflow.com/questions/43991505/xlsxwriter-set-global-font-size
        workbook.formats[0].set_font_size(14)  # to make it readable when printed

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}
        excel_letters = excel_columns()
        for ix, col in enumerate(df.columns):
            col_letter = excel_letters[ix]
            col_info = {}
            fmt = xlfmts[Xlformat.CENTER] if (
                columns_to_center is not None and col in columns_to_center) else None
            col_info['format'] = fmt
            col_info['width'] = column_widths.get(col, default_column_width)
            col_info['xl_col_letter'] = f'{col_letter}'
            col_infos[col] = col_info

        colspec = pd.DataFrame(col_infos).T

        # Set the column widths and formats, e.g. 'C:C'
        for _, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)

        # Make the sheet banded
        make_sheet_banded(worksheet, df)
        worksheet.freeze_panes(1, 0)  # Freeze the first row.

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])
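# Usage sketch for write_basic_spreadsheet (illustrative only; not called from
# this module). The species data and the output path below are made up.
def _example_write_basic_spreadsheet():
    df = pd.DataFrame({
        'CommonName': ['Acorn Woodpecker', 'Oak Titmouse'],
        'Total': [12, 7],
    })
    write_basic_spreadsheet(df,
                            Path('/tmp/basic-example.xlsx'),  # hypothetical path
                            column_widths={'CommonName': 40, 'Total': 8},
                            columns_to_center=['Total'])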
def extract_from_audubon_results(fpath: Path) -> Tuple[pd.DataFrame, dict]:
    # Kinda hacky; works for 121st count results (2020 season)
    # Use local call to open_workbook to suppress the annoying xlrd warning
    # "WARNING *** file size ..." (#37)
    encoding_override = None
    with open(fpath, 'rb') as fp:
        book = open_workbook(file_contents=fp.read(),
                             encoding_override=encoding_override)

    sheet = book.sheet_by_index(0)

    # Count code is in cell $AC$6, name in $L$6, date in $AO$6
    count_info_row = 5
    count_code_col = excel_columns().index('AC')
    count_name_col = excel_columns().index('L')
    count_date_col = excel_columns().index('AO')
    circle_code = sheet.cell_value(rowx=count_info_row, colx=count_code_col)
    circle_name = sheet.cell_value(rowx=count_info_row, colx=count_name_col)
    count_date = sheet.cell_value(rowx=count_info_row, colx=count_date_col)

    col_g = sheet.col_values(excel_columns().index('G'))
    col_s = sheet.col_values(excel_columns().index('S'))

    # Scan down column G to find the boundaries of the species list
    species_index = col_g.index('Species')
    total_individuals_index = col_g.index('Total Individuals')
    species = col_g[species_index + 1: total_individuals_index]
    totals = pd.Series(col_s[species_index + 1: total_individuals_index]).apply(
        str_with_comma_to_num).fillna(0).astype(int)

    # print(circle_name, circle_code, count_date)
    # print(f'Species count: {len(species)}')

    circle_info = {'Name': circle_name, 'Code': circle_code, 'Date': count_date}
    df = pd.DataFrame(list(zip(species, totals)),
                      columns=['CommonName', 'Total'])

    return df, circle_info
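# Usage sketch for extract_from_audubon_results (illustrative only). The input
# path is hypothetical; the file must follow the Audubon results layout
# described above (121st count, 2020 season).
def _example_extract_from_audubon_results():
    fpath = Path('/tmp/audubon-121st-results.xls')  # hypothetical input file
    df, circle_info = extract_from_audubon_results(fpath)
    print(circle_info['Name'], circle_info['Code'], circle_info['Date'])
    print(df.head())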
def write_possible_translations_spreadsheet(translations_df, translations_xl_path):
    if translations_df.empty:
        return None

    # Columns:
    # LocalSpeciesName eBirdSpeciesName levd match_whole_line regex circle AltName1 Lev2 AltName2 ...
    col_widths = [50, 50, 12, 16, 12, 12, 30]
    for ix in range(MAX_SPECIES_ALTERNATIVES - 1):
        col_widths.append(30)
        col_widths.append(10)

    xsheet_name = 'Possible Translations'
    sheet_names = banded_sheets = [xsheet_name]

    center_cols = [
        col for col in translations_df.columns if col.startswith('lev')
    ]
    for col in ['match_whole_line', 'regex', 'circle']:
        center_cols.append(col)

    with pd.ExcelWriter(translations_xl_path, engine='xlsxwriter') as writer:
        translations_df.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)
        excel_letters = excel_columns()

        for sheet_num, sheet_name in enumerate(sheet_names):
            worksheet = writer.sheets[sheet_name]

            # Set the column widths and formats.
            widths = col_widths
            col_vals = translations_df.columns.values  # df_columns[sheet_num].values
            for ix, wid in enumerate(widths):
                col_letter = excel_letters[ix]
                fmt = xlfmts[Xlformat.CENTER] if col_vals[ix] in center_cols else None
                worksheet.set_column(f'{col_letter}:{col_letter}', wid, fmt)

            if sheet_name in banded_sheets:
                make_sheet_banded(worksheet, translations_df)

            # Write the column headers with the defined format.
            for col_num, value in enumerate(col_vals):
                worksheet.write(0, col_num, value, xlfmts[Xlformat.HEADER])
def format_col_if_other_col(checklist, worksheet, cols_to_format: List[str],
                            condition_cols: List[str], xformat,
                            to_match: str = 'X'):
    # Conditionally format cols_to_format based on the value in condition_cols,
    # e.g. make the CommonName background yellow if the CountSpecial column is set.
    # Assumes xformat has already been added to the workbook.
    # condition_cols are the "other" columns.
    xl_last_data_row = checklist.shape[0] + 1  # plus 1 because data starts at row 2

    # Add variants with trailing blanks to handle two-column layouts (Double).
    # For example, both 'Adult' and 'Adult ' could be present as columns.
    cols_to_format.extend([xs + ' ' for xs in cols_to_format])
    condition_cols.extend([xs + ' ' for xs in condition_cols])

    cols_to_format_idxs = [
        idx for idx, xs in enumerate(checklist.columns) if xs in cols_to_format
    ]
    # Which columns do we look for an 'X' (to_match) in? These are the condition columns.
    x_cols_idxs = [
        idx for idx, xs in enumerate(checklist.columns) if xs in condition_cols
    ]

    col_letters = excel_columns()
    for cn_idx, cond_idx in zip(cols_to_format_idxs, x_cols_idxs):
        col2fmt_letter = col_letters[cn_idx]
        cond_col_letter = col_letters[cond_idx]
        to_format_cells = f'{col2fmt_letter}2:{col2fmt_letter}{xl_last_data_row}'
        criteria_cells = f'{cond_col_letter}2'
        # Quote to_match so Excel compares against the literal string,
        # e.g. =EXACT(H2,"X"), rather than a defined name.
        criteria = f'=EXACT({criteria_cells},"{to_match}")'
        # print(f'criteria: {criteria}')
        worksheet.conditional_format(to_format_cells, {
            'type': 'formula',
            'criteria': criteria,
            'format': xformat
        })
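# Usage sketch for format_col_if_other_col (illustrative only): bold the
# CommonName cell whenever the matching Rare cell contains 'X'. The data and
# output path are made up; any format added to the same workbook works here
# (the real callers pass formats from add_workbook_formats).
def _example_format_col_if_other_col():
    checklist = pd.DataFrame({
        'CommonName': ["Lewis's Woodpecker", 'Acorn Woodpecker'],
        'Rare': ['X', ''],
    })
    with pd.ExcelWriter('/tmp/format-example.xlsx', engine='xlsxwriter') as writer:
        checklist.to_excel(writer, index=False, sheet_name='Example')
        worksheet = writer.sheets['Example']
        bold = writer.book.add_format({'bold': True})
        format_col_if_other_col(checklist, worksheet,
                                ['CommonName'], ['Rare'], bold, to_match='X')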
def create_ebird_summary(summary_base: pd.DataFrame,
                         personal_checklists: pd.DataFrame,
                         checklist_meta: pd.DataFrame, circle_abbrev,
                         parameters: Parameters, sector_name: str,
                         taxonomy: Taxonomy,
                         output_path) -> Tuple[Any, List[str]]:
    # Each checklist becomes a column in the summary sheet
    # Start of big processing loop
    summary = summary_base.copy()
    # team_cols = set()
    summary_common_names = list(summary.CommonName.values)
    checklist_meta = checklist_meta.copy()[checklist_meta.sharing != 'secondary']
    checklist_meta.sort_values(
        by=['location_group', 'locId', 'obsDt', 'groupId', 'Name'],
        na_position='first', inplace=True)
    sector_subids = list(personal_checklists.subId.values)
    sector_checklist_meta = checklist_meta[checklist_meta.subId.isin(sector_subids)]

    # Group CommonName Rare Total TaxonOrder NACC_SORT_ORDER
    summary['FrozenTotal'] = 0  # placeholder
    if 'Category' not in summary.columns:
        summary['Category'] = create_category_column(summary, taxonomy)
    std_columns = [
        'Group', 'CommonName', 'Rare', 'Total', 'FrozenTotal', 'Category',
        'TaxonOrder', 'NACC_SORT_ORDER', 'ABA_SORT_ORDER'
    ]
    summary = summary[std_columns]

    # Sector checklists may have added species not on the template.
    # Add rows for these new species.
    additional_rare = []
    names_to_add = set(personal_checklists.CommonName.values) - set(summary_common_names)
    if names_to_add:
        # print(f'Need to add: {names_to_add}')
        # blank_row = pd.Series([''] * len(summary.columns), index=summary.columns)
        rows_to_add = []
        for common_name in names_to_add:
            row_rarity = create_row_for_missing_species(common_name, summary, taxonomy)
            if row_rarity is None:
                continue
            row, rarity = row_rarity
            rows_to_add.append(row)
            if rarity:
                additional_rare.append(common_name)
        summary = summary.append(rows_to_add, ignore_index=True)
        summary_common_names.extend(list(names_to_add))

    # Re-sort by TaxonOrder.
    # Sorting has to be done before we create any formulae for Totals.
    summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(drop=True)

    # Use the order from checklist_meta and add a column to summary for each checklist
    # personal_columns = []
    for subid in sector_checklist_meta.subId.values:
        pcsub = personal_checklists[personal_checklists.subId == subid]
        species_totals = []
        for common_name in summary.CommonName.values:
            species_row = pcsub[pcsub.CommonName == common_name]
            species_total = 0 if species_row.empty else species_row.Total.values[0]
            species_totals.append(species_total)
        # Add the column to summary
        summary[subid] = species_totals

    # Don't think we need the filter any more, since that was done above
    rare_species = filter_additional_rare(taxonomy, additional_rare)
    if len(rare_species) > 0:
        print(f' Requires rare bird form: {", ".join(rare_species)} [not on master list]')

    # Re-sort by TaxonOrder.
    # Sorting has to be done before we create any formulae for Totals.
    summary = summary.sort_values(by=['NACC_SORT_ORDER']).reset_index(drop=True)

    # We don't rename columns until right before we create the Excel file
    team_cols = sector_checklist_meta.subId.values

    # The complexity here is because we can have cases where a single birder birded
    # near-duplicate locations. This means location_group is e.g. L13065376+L13065792
    # but each of these checklists should be considered separate (use SUM not MAX).
    # Example in CAMP 2020/Rancho San Carlos:
    # L13065376-S78154180-09:24-Jeff Manker | L13065792-S78156572-10:10-Jeff Manker |
    # L13065792-S78184574-10:44-Jeff Manker
    mask = sector_checklist_meta.location_group.isnull()
    usemaxtmp = sector_checklist_meta[~mask]
    single_birder_locids = set()
    for locgrp, grp in usemaxtmp.groupby(['location_group']):
        # print(locgrp)
        if len(set(grp.Name)) == 1:  # Same birder but possible location dups
            single_birder_locids |= set(grp.locId.values)
    mask_single = checklist_meta.locId.isin(single_birder_locids)
    mask |= mask_single

    use_sum_locids = sector_checklist_meta[mask].locId.values
    # Remove duplicates but keep in order
    use_max_locids = list(dict.fromkeys(sector_checklist_meta[~mask].locId.values))

    # These are the columns we can just total up
    use_sum_subids = sector_checklist_meta[
        sector_checklist_meta.locId.isin(use_sum_locids)].subId.values
    use_sum_total = summary[use_sum_subids].apply(
        pd.to_numeric).fillna(0).sum(axis=1).astype(int)

    use_max_total = 0
    # ToDo: logic is duplicated below
    for locid in use_max_locids:
        # subIds are the column names right now
        # subids = sector_checklist_meta[sector_checklist_meta.locId == locid].subId.values
        mask = [(lg.startswith(locid) if lg is not None else False)
                for lg in checklist_meta.location_group.values]
        subids = checklist_meta[mask].subId.values
        # This can be empty if it is not the first in a set of duplicate locations
        if len(subids) == 0:
            continue
        # print(locid, subids)
        max_vals = summary[subids].apply(
            pd.to_numeric).fillna(0).max(axis=1).astype(int)
        use_max_total += max_vals

    summary_total = use_sum_total + use_max_total
    # print(sum(summary_total))

    # Values computed by formulae are only evaluated after a workbook has been opened
    # and saved by Excel. This means that if we create these files but never open them,
    # the Total field will show up as 0 (a string formula converted to numeric).
    # Add this so that service_merge/merge_checklists has an actual value to use.
    # ToDo: fix summary_total to use SUM/MAX
    summary['FrozenTotal'] = summary_total

    # Actually, make Total a formula
    # Has to be after sorting
    # base_columns = ['Group', 'CommonName', 'Rare', 'Total', 'TaxonOrder']
    # Group CommonName Rare Total Ad Im CountSpecial
    # =SUM($F5:$Q5)
    col_letters = excel_columns()
    # std_columns = ['Group', 'CommonName', 'Rare', 'Total', 'Category', 'TaxonOrder',
    #                'NACC_SORT_ORDER']
    sum_start_index = len(std_columns)
    sum_end_index = len(std_columns) + len(use_sum_locids) - 1
    sum_start_col = col_letters[sum_start_index]
    sum_end_col = col_letters[sum_end_index]
    # Start template for total with non-duplicate columns
    sum_formula_template = f'=SUM(${sum_start_col}INDEX:${sum_end_col}INDEX)'

    header_cell_groups = []
    max_formula_totals = []
    max_formula = None
    for locid in use_max_locids:
        # subIds are the column names right now
        # subids = sector_checklist_meta[sector_checklist_meta.locId == locid].subId.values
        mask = [(lg.startswith(locid) if lg is not None else False)
                for lg in checklist_meta.location_group.values]
        subids = checklist_meta[mask].subId.values
        # This can be empty if it is not the first in a set of duplicate locations
        if len(subids) == 0:
            continue
        max_start_index = list(summary.columns).index(subids[0])
        max_end_index = list(summary.columns).index(subids[-1])
        max_start_col = col_letters[max_start_index]
        max_end_col = col_letters[max_end_index]
        max_formula_template = f'MAX(${max_start_col}INDEX:${max_end_col}INDEX)'
        max_formula_totals.append(max_formula_template)
        # Collect up the header cells so we can color different groups
        header_cell_group = f'${max_start_col}1:${max_end_col}1'
        header_cell_groups.append(header_cell_group)

    if len(max_formula_totals):
        max_formula = '+'.join(max_formula_totals)

    total_formula = []
    for ix in range(2, summary.shape[0] + 2):
        sft = sum_formula_template.replace('INDEX', str(ix))
        tf_sum = f'{sft}'
        if max_formula is None:
            total_formula.append(tf_sum)
        else:
            mft = max_formula.replace('INDEX', str(ix))
            tf_max = f'{mft}'
            total_formula.append(tf_sum + '+' + tf_max)
    # print(f' {total_formula[0]}')
    summary['Total'] = total_formula

    # Add a last row for the grand Total and each Sector total
    totals_row = pd.Series([''] * len(summary.columns), index=summary.columns)
    totals_row['Group'] = 'Totals'
    totals_row['TaxonOrder'] = 99999
    totals_row['NACC_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER
    totals_row['ABA_SORT_ORDER'] = taxonomy.INVALID_NACC_SORT_ORDER

    # Formula for Grand Total, e.g. =SUM($D$2:$D$245)
    total_col_letter = col_letters[std_columns.index('Total')]
    total_formula = f'=SUM(${total_col_letter}2:${total_col_letter}{summary.shape[0] + 1})'
    totals_row.Total = total_formula

    # sector_cols = [xs for xs in summary.columns if xs.startswith('Sector')]
    sector_totals = summary[team_cols].apply(
        pd.to_numeric).fillna(0).sum(axis=0).astype(int)
    for col, st in sector_totals.items():
        totals_row[col] = st

    summary = summary.append(totals_row, ignore_index=True)

    # Rename columns to a more human readable form
    newcols = create_personal_checklist_columns(sector_checklist_meta)
    summary.rename(columns=newcols, inplace=True)

    # Don't hide 'Rare' since this will be frequently used in a filter
    cols_to_hide = [
        'D', 'Difficulty', 'Adult', 'Immature', 'W-morph', 'B-Morph', 'Ad',
        'Im', 'CountSpecial', 'FrozenTotal'
    ]
    cols_to_highlight = list(
        set(summary.columns) & {'Total', 'Adult/White', 'Immature/Blue'})

    outname = output_path / f'{circle_abbrev}-EBird-Summary-{sector_name}.xlsx'
    write_final_checklist_spreadsheet(summary, outname, parameters.parameters,
                                      additional_sheets=None,
                                      cols_to_hide=cols_to_hide,
                                      cols_to_highlight=cols_to_highlight,
                                      header_cell_groups=header_cell_groups)

    return summary, rare_species
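# Shape of the per-row 'Total' formula generated above (illustrative; the actual
# column letters depend on how many subId columns follow the std_columns). With
# two independent ("sum") checklists in columns J:K and one group of
# duplicate-location checklists in columns L:N, row 2 would get:
#   =SUM($J2:$K2)+MAX($L2:$N2)
# 'FrozenTotal' holds the same quantity precomputed in pandas, since Excel only
# evaluates the formula once the workbook has been opened and saved.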
def write_ground_truths(truths, out_path: Path):
    if truths.empty:
        return None

    truths = truths.copy()
    xsheet_name = 'Ground Truths'

    # Columns
    # name Category ABED-1 ABED-1v ABED-2 ABED-2v ABED-3 ABED-3v ABED-4 ABED-4v...
    cols_to_center = truths.columns.drop('name').drop('Category')  # everything else is centered
    column_widths = {'name': 40, 'Category': 10}
    for col in cols_to_center:
        column_widths[col] = 5 if col.endswith('v') else 11
    numeric_cols = cols_to_center
    text_cols = ['name', 'Category']  # force text, otherwise Excel may interpret values as formulas

    xl_last_data_row = truths.shape[0] + 1  # plus 1 because data starts at row 2

    with pd.ExcelWriter(out_path.as_posix(), engine='xlsxwriter') as writer:
        truths.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}
        col_letters = excel_columns()
        for ix, col in enumerate(truths.columns):
            col_letter = col_letters[ix]
            col_info = {}
            # col_info['hide'] = col in real_cols_to_hide
            # col_info['highlight'] = col in real_cols_to_highlight
            if col in numeric_cols:
                fmt = xlfmts[Xlformat.NUMERIC_CENTERED]
            elif col in text_cols:
                fmt = xlfmts[Xlformat.TEXT]
            else:
                fmt = xlfmts[Xlformat.CENTER] if col in cols_to_center else None
            col_info['format'] = fmt
            col_info['width'] = column_widths.get(col, 10)
            col_info['xl_col_letter'] = f'{col_letter}'
            col_infos[col] = col_info
        colspec = pd.DataFrame(col_infos).T
        # ----------------------------------------------

        worksheet = writer.sheets[xsheet_name]
        title = 'Ground Truths'
        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title}')
        # footer_fmt = f'&C&12&"Times New Roman,Regular"'
        # worksheet.set_footer(f'{footer_fmt}Region: {region} Party: {party} Date: {dtcstr}')

        # https://xlsxwriter.readthedocs.io/page_setup.html
        # A common requirement is to fit the printed output to n pages wide but
        # have the height be as long as necessary
        # worksheet.fit_to_pages(1, 0)

        # Highlight cells whose value is TRUE
        last_column_letter = col_letters[truths.shape[1] - 1]
        v_total_cell_range = f'C2:{last_column_letter}{xl_last_data_row}'
        worksheet.conditional_format(v_total_cell_range, {
            'type': 'cell',
            'criteria': 'equal to',
            'value': True,
            'format': xlfmts[Xlformat.GREEN]
        })

        # Set the column widths and formats, e.g. 'C:C'
        for _, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)
        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        # for ix, col_info in colspec[colspec.hide].iterrows():
        #     xl_col_letter = col_info['xl_col_letter']
        #     worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None, None, {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, truths)

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            # fmt = col_info['format']
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])
def write_nlp_statistics(nlp_statistics, stats_path: Path):
    if nlp_statistics.empty:
        return None

    nlp_statistics = nlp_statistics.copy()
    xsheet_name = 'ParsePDF Statistics'

    # Columns
    # ['family', 'unknown', 'intersections', 'line_token_count', 'line', 'original_line', 'guess',
    #  'levd', 'line_len', 'lev_len_pct', 'species_inferred', 'is_species_line',
    #  'guess_correct', 'source']
    cols_to_center = [
        'is_group', 'non_avian', 'intersections', 'line_token_count', 'levd',
        'tx_line_len', 'species_inferred', 'is_species_line', 'guess_correct',
        'source'
    ]
    column_widths = {
        'classification': 20, 'original_line': 45, 'transformed_line': 45,
        'species': 45, 'closest_match': 45, 'is_group': 12, 'non_avian': 12,
        'intersections': 12, 'line_token_count': 12, 'levd': 11,
        'tx_line_len': 11, 'lev_len_pct': 11, 'species_inferred': 14,
        'exact_match': 14, 'verified': 14, 'source': 14
    }
    numeric_cols = ['intersections', 'line_token_count', 'levd', 'line_len']
    text_cols = [
        'classification', 'transformed_line', 'original_line', 'species',
        'closest_match'
    ]  # force text, otherwise Excel may interpret original_line as a formula

    with pd.ExcelWriter(stats_path.as_posix(), engine='xlsxwriter') as writer:
        nlp_statistics.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}
        excel_letters = excel_columns()
        for ix, col in enumerate(nlp_statistics.columns):
            col_letter = excel_letters[ix]
            col_info = {}
            # col_info['hide'] = col in real_cols_to_hide
            # col_info['highlight'] = col in real_cols_to_highlight
            if col in numeric_cols:
                fmt = xlfmts[Xlformat.NUMERIC_CENTERED]
            elif col == 'lev_len_pct':
                fmt = xlfmts[Xlformat.PERCENTAGE]
            elif col in text_cols:
                fmt = xlfmts[Xlformat.TEXT]
            else:
                fmt = xlfmts[Xlformat.CENTER] if col in cols_to_center else None
            col_info['format'] = fmt
            col_info['width'] = column_widths.get(col, 10)
            col_info['xl_col_letter'] = f'{col_letter}'
            col_infos[col] = col_info
        colspec = pd.DataFrame(col_infos).T
        # ----------------------------------------------

        worksheet = writer.sheets[xsheet_name]
        title = 'NLP Statistics'
        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title}')
        # footer_fmt = f'&C&12&"Times New Roman,Regular"'
        # worksheet.set_footer(f'{footer_fmt}Region: {region} Party: {party} Date: {dtcstr}')

        # https://xlsxwriter.readthedocs.io/page_setup.html
        # A common requirement is to fit the printed output to n pages wide but have the
        # height be as long as necessary
        # worksheet.fit_to_pages(1, 0)

        # rare_name_cells = f'G2:G{xl_last_data_row}'
        # rarity_criteria = '=EXACT(H2,"X")'
        # worksheet.conditional_format(rare_name_cells,
        #     {'type': 'formula', 'criteria': rarity_criteria, 'format': format_rare})

        # Set the column widths and formats, e.g. 'C:C'
        for _, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)
        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        # for ix, col_info in colspec[colspec.hide].iterrows():
        #     xl_col_letter = col_info['xl_col_letter']
        #     worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None, None, {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, nlp_statistics)

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            # fmt = col_info['format']
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])
def write_final_checklist_spreadsheet(checklist, checklist_path: Path,
                                      parameters: dict,
                                      additional_sheets: Optional[List[dict]],
                                      cols_to_hide: list = None,
                                      cols_to_highlight: list = None,
                                      header_cell_groups: List[str] = None):
    # The checklist is the filled-in local_checklist.
    # It may be wrapped to a two column (printing) format.
    if cols_to_highlight is None:
        cols_to_highlight = ['Total']
    if cols_to_hide is None:
        cols_to_hide = ['Group', 'R', 'TaxonOrder']
    if checklist.empty:
        return None

    checklist = checklist.copy()
    xsheet_name = 'Final Checklist'

    # Columns
    # Group, CommonName, R, Total, TaxonOrder, Group_, CommonName_, R_, Total_, TaxonOrder_
    #   A        B        C    D       E         F         G        H     I        J
    real_cols_to_hide = [
        x for x in checklist.columns if x.rstrip() in cols_to_hide
    ] if cols_to_hide else []
    real_cols_to_highlight = [
        x for x in checklist.columns if x.rstrip() in cols_to_highlight
    ] if cols_to_highlight else []

    cols_to_center = [
        'R', 'Total', 'TaxonOrder', 'Rare', 'Category', 'NACC_SORT_ORDER',
        'ABA_SORT_ORDER'
    ]
    stripped_widths = {
        'Group': 20, 'CommonName': 40, 'R': 5, 'Total': 7, 'TaxonOrder': 8,
        'LocalName': 35, 'may_need_writeup': 35, 'Rare': 10, 'D': 3,
        'Adult': 6, 'Immature': 6, 'W-morph': 6, 'B-Morph': 6,
        'Difficulty': 6, 'Adult/White': 11, 'Immature/Blue': 11, 'Ad': 3,
        'Im': 3, 'CountSpecial': 3, 'Category': 10, 'NACC_SORT_ORDER': 8,
        'ABA_SORT_ORDER': 8
    }

    xl_last_data_row = checklist.shape[0] + 1  # plus 1 because data starts at row 2

    fill_values = {
        'Group': '', 'CommonName': '', 'Rare': '', 'TaxonOrder': 99999,
        'Group ': '', 'CommonName ': '', 'Rare ': '', 'TaxonOrder ': 99999
    }
    checklist = checklist.fillna(value=fill_values)

    # Non-standard columns are probably sector names
    standard_cols_base = [
        'CommonName', 'LocalName', 'Group', 'Category', 'TaxonOrder',
        'NACC_SORT_ORDER', 'ABA_SORT_ORDER', 'Total'
    ]
    standard_cols = standard_cols_base.copy()
    for col in standard_cols_base:
        standard_cols.append(col + ' ')
    non_standard_cols = [
        col for col in checklist.columns if col not in standard_cols
    ]
    for col in non_standard_cols:
        cols_to_center.append(col)
        if not stripped_widths.get(col, None):
            stripped_widths[col] = 14

    try:
        intcols = [col for col in checklist.columns if col.startswith('Taxon')]
        for col in intcols:
            checklist[col] = pd.to_numeric(checklist[col], errors='coerce')
            # checklist = checklist.astype({col: 'int32'}, errors='ignore')
    except Exception as ee:
        print(f'Failed to set type of column "{col}" to numeric', ee)
        checklist.to_csv(checklist_path.parent / 'failure-checklist.csv', index=False)
        unknown_idxs = checklist.index[checklist[col] == 'UNKNOWN']
        display(checklist.loc[unknown_idxs])
        traceback.print_exc(file=sys.stdout)
        # pass

    checklist = checklist.astype({'Total': str})

    with pd.ExcelWriter(checklist_path.as_posix(), engine='xlsxwriter') as writer:
        checklist.to_excel(writer, index=False, sheet_name=xsheet_name)

        # Get the xlsxwriter workbook and worksheet objects.
        workbook = writer.book
        xlfmts = add_workbook_formats(workbook)

        # https://stackoverflow.com/questions/43991505/xlsxwriter-set-global-font-size
        workbook.formats[0].set_font_size(14)  # to make it readable when printed

        # ----------------------------------------------
        # Populate col_infos with formatting information
        col_infos = {}
        col_letters = excel_columns()
        assert len(checklist.columns) <= len(col_letters)  # 702
        for ix, col in enumerate(checklist.columns):
            stripped_col = col.rstrip()
            col_letter = col_letters[ix]
            col_info = {
                'hide': col in real_cols_to_hide,
                'highlight': col in real_cols_to_highlight,
                'format': xlfmts[Xlformat.CENTER] if col in cols_to_center else None,
                'width': stripped_widths.get(stripped_col, 10),
                'xl_col_letter': f'{col_letter}'
            }
            col_infos[col] = col_info
        colspec = pd.DataFrame(col_infos).T
        # ----------------------------------------------

        worksheet = writer.sheets[xsheet_name]

        date_of_count = parameters['CountDate']
        dtcount = datetime.strptime(date_of_count, '%Y-%m-%d')
        dtcstr = dtcount.strftime("%d %b %Y")
        yr = dtcount.strftime("%Y")

        title = parameters.get('FinalChecklistTitle', '')
        worksheet.set_header(f'&C&16&"Times New Roman,Regular"{title} {yr}')

        region = parameters['CircleAbbrev']
        party = parameters['CircleID']
        footer_fmt = '&C&12&"Times New Roman,Regular"'
        worksheet.set_footer(f'{footer_fmt}Region: {region} Party: {party} Date: {dtcstr}')

        # print(f'parameters: {parameters}')
        page_breaks = parameters.get('page_breaks', None)
        if page_breaks:
            # When splitting into 2 columns, the page breaks are known
            print(f'page_breaks: {page_breaks}')
            worksheet.set_h_pagebreaks(page_breaks)
        else:
            # https://xlsxwriter.readthedocs.io/page_setup.html
            # A common requirement is to fit the printed output to n pages wide but
            # have the height be as long as necessary
            # print('fitting to 1 page wide')
            worksheet.fit_to_pages(1, 0)

        # Highlight numbers > 0 for species count
        for ix, col_info in colspec[colspec.highlight].iterrows():
            xl_col_letter = col_info['xl_col_letter']
            v_total_cell_range = f'{xl_col_letter}2:{xl_col_letter}{xl_last_data_row}'
            worksheet.conditional_format(v_total_cell_range, {
                'type': 'cell',
                'criteria': '>',
                'value': 0,
                'format': xlfmts[Xlformat.GREEN]
            })

        excel_letters = excel_columns()

        # Make CommonName bold if the Rare column is set
        cols_to_bold_idxs = [
            idx for idx, xs in enumerate(checklist.columns)
            if xs.startswith('CommonName')
        ]
        rare_cols_idxs = [
            idx for idx, xs in enumerate(checklist.columns)
            if xs.startswith('Rare')
        ]
        for cn_idx, ra_idx in zip(cols_to_bold_idxs, rare_cols_idxs):
            letter = excel_letters[cn_idx]
            letter_rare = excel_letters[ra_idx]
            format_rare = workbook.add_format({'bold': True})  # 'bg_color': '#FFC7CE',
            rare_name_cells = f'{letter}2:{letter}{xl_last_data_row}'
            rarity_criteria_cells = f'{letter_rare}2'
            rarity_criteria = f'=EXACT({rarity_criteria_cells},"X")'
            # print(f'rarity_criteria: {rarity_criteria}')
            worksheet.conditional_format(rare_name_cells, {
                'type': 'formula',
                'criteria': rarity_criteria,
                'format': format_rare
            })

        # Make CommonName background yellow if the CountSpecial column is set
        format_col_if_other_col(checklist, worksheet, ['CommonName'],
                                ['CountSpecial'], xlfmts[Xlformat.COUNTSPECIAL])
        # format_col_if_other_col(checklist, worksheet, col_to_format, condition_cols, xformat)

        # Color the 'D' (Difficulty) column based on the value in the 'Difficulty' column
        xformats = [
            xlfmts[idx]
            for idx in [Xlformat.EASY, Xlformat.MARGINAL, Xlformat.DIFFICULT]
        ]
        for to_match, xformat in zip(['E', 'M', 'D'], xformats):
            format_col_if_other_col(checklist, worksheet, ['D'], ['Difficulty'],
                                    xformat, to_match)

        # Color the 'Ad' and 'Im' columns based on values in the 'Adult' and 'Immature' columns
        format_col_if_other_col(checklist, worksheet, ['Ad', 'Im'],
                                ['Adult', 'Immature'], xlfmts[Xlformat.AGE], 'X')

        # Highlight the 'Ad', 'Im' columns if there are non-zero values in 'W-morph', 'B-Morph'.
        # The 'Ad', 'Im' columns are overloaded here since there is no species overlap.
        format_col_if_other_col(checklist, worksheet, ['Ad', 'Im'],
                                ['W-morph', 'B-Morph'], xlfmts[Xlformat.MORPH], 'X')

        # Italicize non-species
        try:
            if 'Category' in checklist.columns:
                cols_to_italicize_idxs = [
                    idx for idx, xs in enumerate(checklist.columns)
                    if xs.startswith('CommonName')
                ]
                category_cols_idxs = [
                    idx for idx, xs in enumerate(checklist.columns)
                    if xs.startswith('Category')
                ]
                for cn_idx, ca_idx in zip(cols_to_italicize_idxs, category_cols_idxs):
                    letter = excel_letters[cn_idx]
                    letter_category = excel_letters[ca_idx]
                    common_name_cells = f'{letter}2:{letter}{xl_last_data_row}'
                    category_criteria_cells = f'{letter_category}2'
                    # category_criteria = f'=EXACT({category_criteria_cells},"slash")'
                    category_criteria = f'={category_criteria_cells}<>"species"'
                    # print(f'category_criteria: {category_criteria}')
                    worksheet.conditional_format(common_name_cells, {
                        'type': 'formula',
                        'criteria': category_criteria,
                        'format': xlfmts[Xlformat.ITALIC]
                    })
        except Exception as ee:
            print(ee)
            print(checklist.columns)
            print(category_cols_idxs)
            traceback.print_exc(file=sys.stdout)
            raise

        # rare_name_cells = f'G2:G{xl_last_data_row}'
        # rarity_criteria = '=EXACT(H2,"X")'
        # worksheet.conditional_format(rare_name_cells,
        #     {'type': 'formula', 'criteria': rarity_criteria, 'format': format_rare})

        # Set the column widths and formats, e.g. 'C:C'
        for _, col_info in colspec.iterrows():
            xl_col_letter = col_info['xl_col_letter']
            wid = col_info['width']
            fmt = col_info['format']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', wid, fmt)

        # https://xlsxwriter.readthedocs.io/worksheet.html#set_column
        for ix, col_info in colspec[colspec.hide].iterrows():
            xl_col_letter = col_info['xl_col_letter']
            worksheet.set_column(f'{xl_col_letter}:{xl_col_letter}', None, None,
                                 {'hidden': 1})

        # Make the sheet banded
        make_sheet_banded(worksheet, checklist)

        # Set the height, and other properties, of the header row
        # row (int) – The worksheet row (zero indexed).
        # height (float) – The row height.
        worksheet.set_row(0, 70, None, None)
        worksheet.freeze_panes(1, 0)  # Freeze the first row.

        # header_cell_groups
        if header_cell_groups is not None:
            for ix, header_cell_group in enumerate(header_cell_groups):
                category_criteria = '=True'
                # print(header_cell_group, ix, fmt)
                worksheet.conditional_format(header_cell_group, {
                    'type': 'formula',
                    'criteria': category_criteria,
                    'format': choose_format_accent(xlfmts, ix)
                })

        # Write the column headers with the defined format.
        for col, col_info in colspec.iterrows():
            # fmt = col_info['format']
            col_num = list(colspec.index).index(col)
            worksheet.write(0, col_num, col, xlfmts[Xlformat.HEADER])

        # ***
        if additional_sheets is not None:
            for sheet_info in additional_sheets:
                # print(sheet_info['sheet_name'])
                df = sheet_info['data']
                df.to_excel(writer, index=False, sheet_name=sheet_info['sheet_name'])
                worksheet = writer.sheets[sheet_info['sheet_name']]
                make_sheet_banded(worksheet, df)
                center_cols = sheet_info['to_center']
                for col, wid in sheet_info['widths'].items():
                    col_index = list(df.columns).index(col)
                    col_letter = excel_letters[col_index]
                    fmt = xlfmts[Xlformat.CENTER] if col in center_cols else None
                    worksheet.set_column(f'{col_letter}:{col_letter}', wid, fmt)
                # worksheet.set_row(0, 70, None, None)
                worksheet.freeze_panes(1, 0)  # Freeze the first row.
                # Set the header format
                worksheet.write_row(0, 0, list(df.columns), xlfmts[Xlformat.HEADER])
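# Usage sketch for write_final_checklist_spreadsheet (illustrative only). The
# checklist rows, parameter values, and output path are made up; 'parameters'
# only needs the keys read above (CountDate, CircleAbbrev, CircleID, and
# optionally FinalChecklistTitle and page_breaks).
def _example_write_final_checklist_spreadsheet():
    checklist = pd.DataFrame({
        'Group': ['Woodpeckers', 'Woodpeckers'],
        'CommonName': ['Acorn Woodpecker', "Nuttall's Woodpecker"],
        'Rare': ['', ''],
        'Total': [12, 3],
        'TaxonOrder': [10789, 10822],  # hypothetical taxon order values
    })
    params = {
        'CountDate': '2020-12-19',
        'CircleAbbrev': 'CACR',        # hypothetical circle abbreviation
        'CircleID': 'Example Party',
        'FinalChecklistTitle': 'Example CBC Final Checklist',
    }
    write_final_checklist_spreadsheet(checklist,
                                      Path('/tmp/final-checklist.xlsx'),
                                      params,
                                      additional_sheets=None,
                                      cols_to_hide=['Group', 'TaxonOrder'],
                                      cols_to_highlight=['Total'])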