Example #1
    def load_excel_to_df(argsdict: dict, filename_list: list,
                         column_names_list: list):
        """
        Reads CVR Excel files and saves them as a pandas dataframe.
        Combines multiple CVR files, assuming their columns are identical.
        Renames unnamed columns by duplicating the last named column name.
        This is specific to ES&S CVR files.
        """
        for idx, file_name in enumerate(filename_list):
            utils.sts(f"Reading cvr file {file_name}...")
            if not idx:
                #CVR.data_frame = pd.read_excel(file, engine='xlrd')
                CVR.data_frame = DB.load_data(dirname='archives',
                                              name=file_name,
                                              user_format=True)
            else:
                # df = pd.read_excel(file, engine='xlrd')
                df = DB.load_data(dirname='archives',
                                  name=file_name,
                                  user_format=True)
                # DataFrame.append was removed in pandas 2.x; use pd.concat instead.
                CVR.data_frame = pd.concat([CVR.data_frame, df], ignore_index=True)

        if argsdict.get('convert_cvr_image_cells_to_writein', False):
            CVR.set_cells_with_images_to_writeins(argsdict['cvr'])

        if column_names_list:
            utils.sts(
                "replacing column names with replacement column names provided."
            )
            # use the replacement column headers instead of those provided.
            orig_col_names = CVR.data_frame.columns
            if len(orig_col_names) != len(column_names_list):
                utils.sts(
                    "replacement column header list length does not match the CVR header"
                )
                sys.exit(1)
            # we will replace any "blank" col names with "Unnamed: XXX" so we can remove them later.
            for i, orig_col_name in enumerate(orig_col_names):
                if re.match(r'Unnamed:', orig_col_name):
                    column_names_list[i] = orig_col_name
            CVR.data_frame.columns = column_names_list

        utils.sts("Checking for duplicate column names.")
        # at this point, there should be no duplicate column names.
        num_unique_names = len(set(CVR.data_frame.columns))
        num_names = len(list(CVR.data_frame.columns))
        if num_unique_names != num_names:
            utils.sts("Column names are duplicated")
            sys.exit(1)

        utils.sts(
            "Replacing columns with 'Unnamed' with prior named column name.")
        CVR.data_frame.columns = CVR.rename_unnamed(
            list(CVR.data_frame.columns))
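# The docstring above says unnamed columns are renamed "by duplicating the last
# named column name", via CVR.rename_unnamed(). That helper is not shown here, so
# the following is a minimal standalone sketch of the assumed behavior, not the
# codebase's actual implementation:

def rename_unnamed(column_list: list) -> list:
    """Replace each 'Unnamed: NNN' column name with the nearest prior named column."""
    renamed = []
    last_named = ''
    for col in column_list:
        if col.startswith('Unnamed:'):
            renamed.append(last_named)      # duplicate the last real column name
        else:
            renamed.append(col)
            last_named = col
    return renamed

# e.g. rename_unnamed(['Contest A', 'Unnamed: 5', 'Contest B'])
#      -> ['Contest A', 'Contest A', 'Contest B']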
def cmpcvr_by_one_tasklist(argsdict, tasklist_name):
    """ This is the primary function to be run inside lambda for cmpcvr.
    
        tasklist_name is like "{archive_root}_chunk_{chunk_idx}"
    """
    # set s3 vs local mode -- this would probably be better done long before this point.
    DB.set_DB_mode()        

    contests_dod = DB.load_data('styles', 'contests_dod.json')
    if CVR.data_frame.empty:
        CVR.load_cvrs_to_df(argsdict)
    
    #        marks/chunks/{archive_root}_chunk_{chunk_idx}.csv           # individual marks chunks. These are kept for cmpcvr


    if not DB.file_exists(file_name=tasklist_name+'.csv', dirname='marks', subdir="chunks"):
        utils.sts(f"Logic Error: no marks df missing: {tasklist_name}")
        traceback.print_stack()
        sys.exit(1)

    audit_df = DB.load_data(dirname='marks', subdir="chunks", name=tasklist_name, format='.csv')
    
    #---------------------------------------
    # primary call of this function performs chunk comparison
    
    overvotes_results, disagreed_results, blank_results = compare_chunk_with_cvr(
        argsdict=argsdict,
        contests_dod=contests_dod,
        cvr_df=CVR.data_frame,
        audit_df=audit_df,
        chunk_name=tasklist_name,
        )
    #---------------------------------------
    """
        cmpcvr/chunks/disagreed_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr disagreed chunks
        cmpcvr/chunks/overvotes_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr overvote chunks
    """
        
       
    DB.save_data(data_item=disagreed_results, 
        dirname='cmpcvr', subdir='chunks', 
        name=f"disagreed-{tasklist_name}.csv")

    DB.save_data(data_item=overvotes_results, 
        dirname='cmpcvr', subdir='chunks', 
        name=f"overvotes-{tasklist_name}.csv")

    DB.save_data(data_item=blank_results, 
        dirname='cmpcvr', subdir='chunks', 
        name=f"blanks-{tasklist_name}.csv")
Example #3
def get_replacement_cvr_header(argsdict: dict) -> list:
    """
    :param argsdict: Dict of arguments passed on script input.
    """
    utils.sts("Loading EIF...", 3)

    eif_filename = argsdict.get('eif')

    eif_df = DB.load_data(dirname='EIFs', name=eif_filename, user_format=True)

    eif_df = check_table(eif_df,
                         table_name=eif_filename,
                         required_columns_list=EIF_REQUIRED_COLS,
                         strip_cols=EIF_STRIP_COLS)

    cvr_replacement_header_list = list(eif_df['official_contest_name'])
    expected_initial_cvr_cols = argsdict.get(
        'initial_cvr_cols', ['Cast Vote Record', 'Precinct', 'Ballot Style'])
    if not all(item in cvr_replacement_header_list
               for item in expected_initial_cvr_cols):
        expected_cols = ','.join(expected_initial_cvr_cols)
        utils.sts(
            f"ERROR: CVR does not have the expected fields in the header {expected_cols}",
            0)
        sys.exit(1)
    return cvr_replacement_header_list
def read_settings_csv_file(dirname, name, argspecs_dod, name_field='name', value_field='value'):
    """ reads settings with columns name_field and value_field into dict[name] = value
    """

    inputdict = {}  
    error_flag = False
    if not name:
        return {}

    print(f"Input file specified. Reading input from file '{name}'...")
    
    # need to be able to load from s3 or local.
    settings_df = DB.load_data(dirname='input_files', name=name, format='.csv', user_format=True, s3flag=False)
    
    settings_lod = settings_df.to_dict(orient='records')
    
    for setting_dict in settings_lod:
        name = setting_dict[name_field].strip(' ')
        
        if name not in argspecs_dod:
            print (f"{name_field} '{name}' not supported.")
            error_flag = True
            continue
            
        add_value_of_type(
            inputdict, 
            name=name, 
            spec_type=argspecs_dod[name]['type'], 
            valstr=setting_dict[value_field]
            )
            
    if error_flag:
        sys.exit(1)
            
    return inputdict
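# add_value_of_type() is called above but not shown. A minimal sketch of what it
# plausibly does, assuming spec_type names a basic Python type; the real helper
# may support more types and stricter validation.

def add_value_of_type(inputdict: dict, name: str, spec_type: str, valstr: str):
    """Coerce valstr according to spec_type and store it in inputdict[name]."""
    valstr = valstr.strip()
    if spec_type == 'int':
        inputdict[name] = int(valstr)
    elif spec_type == 'float':
        inputdict[name] = float(valstr)
    elif spec_type == 'bool':
        inputdict[name] = valstr.lower() in ('1', 'true', 'yes')
    elif spec_type == 'list':
        inputdict[name] = [item.strip() for item in valstr.split(';') if item.strip()]
    else:
        inputdict[name] = valstr            # default: keep the raw string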
Example #5
def load_bof_df(argsdict):
    """returns conversions for ballot options.
    This function implements the Ballot Options File (BOF)
    """
    bof_columns = [
        'official_contest_name',
        # official contest name used as a means to look up the ballot option.
        'official_option',
        # one option per record used as a second index to look up the ballot option
        'ballot_option',
        # ballot options as shown on the ballot, and only provided if the ballot
        # option differs from the official option.
    ]
    bof_filename = argsdict.get('bof')
    if not bof_filename:
        return None

    bof_df = DB.load_data(dirname='EIFs',
                          name=bof_filename,
                          silent_error=False,
                          user_format=True)

    bof_df = check_table(bof_df,
                         table_name=bof_filename,
                         required_columns_list=bof_columns,
                         strip_cols=bof_columns)

    utils.sts(f"BOF {bof_filename} loaded.")
    return bof_df
def load_one_marks_df(df_file):
    """
    A prior operation creates a separate NNNNN_marks_df.csv file for each ballot;
    this loads one such .csv file into a dataframe.
    This supports incremental operation.
    """
    #utils.sts(f"Loading df chunk {df_file}")
    #marks_df = DB.load_df(name=df_file, dirname='results')
    marks_df = DB.load_data(dirname='marks', name=df_file, format='.csv')
    return marks_df
Example #7
def gen_style_filepaths(style_num):
    #style_dict = DB.load_style(**{'name': style_num})
    style_dict = DB.load_data(dirname='styles',
                              subdir=style_num,
                              name=f'{style_num}_style',
                              silent_error=True)

    try:
        return style_dict['filepaths']
    except TypeError:
        # style_dict is None when the style file could not be loaded.
        return None
Example #8
def get_manual_styles_to_contests(argsdict,
                                  save=True,
                                  silent_error=False) -> dict:
    """
    :manual_styles_to_contests_path str: Path to CSV file with contests and styles table.
    :return: Dict with keys of styles and values of contests list.
    
    @@TODO: This is a user-generated file and should NOT use load_data().
    """
    manual_styles_to_contests_filename = argsdict.get(
        'manual_styles_to_contests_filename')
    if not manual_styles_to_contests_filename:
        return None

    manual_styles_to_contests = {}

    # check in both EIFs and config for this file.
    mstc_df = DB.load_data('EIFs',
                           name=manual_styles_to_contests_filename,
                           format='.csv',
                           silent_error=silent_error)
    if mstc_df is None:
        mstc_df = DB.load_data('config',
                               name=manual_styles_to_contests_filename,
                               format='.csv',
                               silent_error=silent_error)
        if mstc_df is None:
            return None

    for col in mstc_df.columns[1:]:
        stripped_col = col.strip(' ')
        # the following will only work if blank entries are not already changed to ''
        # would be better if we actually detected '1' or 1 vs. '0' or 0 or blank;
        # see the sketch after this function.

        #manual_styles_to_contests[col] = mstc_df.dropna(subset={col})['contest'].tolist()

        manual_styles_to_contests[stripped_col] = mstc_df.loc[
            mstc_df[col].str.contains('1'), 'contest'].tolist()

    return manual_styles_to_contests
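# As the comment above notes, matching on str.contains('1') is fragile. A sketch
# of the more robust detection it suggests, treating '1'/1 as selected and
# '0'/0/blank as not selected; cell_is_selected is a hypothetical helper name.

def cell_is_selected(cell) -> bool:
    """Return True when a styles-to-contests cell marks the contest as included."""
    if cell is None or (isinstance(cell, float) and cell != cell):  # None or NaN
        return False
    return str(cell).strip() in ('1', '1.0')

# usage inside the loop above:
# manual_styles_to_contests[stripped_col] = \
#     mstc_df.loc[mstc_df[col].map(cell_is_selected), 'contest'].tolist()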
def get_style_lookup_table(argsdict: dict, s3=False):
    """
    :manual_styles_to_contests_filename str: filename to CSV file with contests and styles table.
    :return: pandas df suitable for lookup of precinct, party, and provide style_num
    """
    
    style_lookup_table_filename = argsdict.get('style_lookup_table_filename')
    if not style_lookup_table_filename:
        return None
    
    utils.sts("style lookup table specified. Loading...", 3, end='')

    style_lookup_table_df = DB.load_data('config', style_lookup_table_filename, user_format=True)
        
    utils.sts("completed", 3)
    
    return style_lookup_table_df
Example #10
def delegated_gentemplate(dirname, task_args, s3flag=None):
    args.argsdict = argsdict = task_args['argsdict']
    
    chunk_idx   = task_args['chunk_idx']
    tasklist    = task_args['filelist']         # bif segment defining ballots included 
    style_num   = task_args['group_name']
    
    if isinstance(tasklist[0], str):
        # when using individual files, tasklist[0] is the tasklist file name.
        tasklist_lod = DB.load_data(dirname='styles', subdir='tasks', name=tasklist[0], format='.csv', type='lod')
    else:
        tasklist_lod = tasklist[0]
    
    if argsdict['include_gentemplate']:
        # generate a "blank" ballot image for this style in dirname 'styles'
        generate_template_for_style_by_tasklist_lod(argsdict, tasklist_lod=tasklist_lod)
    
    style_rois_list = None
    if argsdict['include_genrois']:
        # generate rois information to dirname 'rois'
        style_rois_list = genrois.genrois_one_style(argsdict, style_num)

    if argsdict['include_maprois']:
        style_rois_map_df, error_flag = maprois.maprois_discover_style(
            argsdict,
            style_num,
            style_rois_list=style_rois_list,
            #rois_map_df=None,
            contests_dod=None,
            style_to_contests_dol=None,
            )
            
        #import pdb; pdb.set_trace()
        if error_flag or not len(style_rois_map_df.index):
            logs.exception_report(f"Failed to map style:{style_num}")
            logs.report_lambda_logfile(s3dirname='styles', chunk_name=f"{style_num}_styles_chunk_{chunk_idx}", rootname='map_report', subdir='logs_failed_maps')
        else:
            logs.report_lambda_logfile(s3dirname='styles', chunk_name=f"{style_num}_styles_chunk_{chunk_idx}", rootname='map_report', subdir='logs_good_maps')
            create_redlined_images(argsdict, style_num, style_rois_map_df)
            DB.save_data(data_item=style_rois_map_df, dirname='styles', subdir='roismap', name=f"{style_num}_roismap", format='.csv')
Example #11
def get_style_fail_to_map(style_num):

    # the cache must be declared global; otherwise the NameError probe below
    # creates a fresh local dict on every call and nothing is ever cached.
    global style_failed_to_map_dict

    try:
        style_failed_to_map_dict
    except NameError:
        style_failed_to_map_dict = {}

    if style_num not in style_failed_to_map_dict:
        # we have not checked it before.

        #style_dict = DB.load_style(name=style_num, silent_error=True)
        style_dict = DB.load_data(dirname='styles',
                                  subdir=style_num,
                                  name=f'{style_num}_style',
                                  silent_error=True)
        if style_dict is None:
            style_failed_to_map_dict[style_num] = True
        else:
            style_failed_to_map_dict[style_num] = style_dict.get(
                'style_failed_to_map', False)

    return style_failed_to_map_dict[style_num]
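# An alternative sketch of the same per-style memoization using functools.lru_cache,
# which avoids the module-global bookkeeping entirely; named plainly as a
# substitute technique, not the codebase's actual approach:

from functools import lru_cache

@lru_cache(maxsize=None)
def get_style_fail_to_map_cached(style_num) -> bool:
    style_dict = DB.load_data(dirname='styles',
                              subdir=style_num,
                              name=f'{style_num}_style',
                              silent_error=True)
    if style_dict is None:
        return True                 # a missing style file counts as failed-to-map
    return style_dict.get('style_failed_to_map', False)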
Example #12
def get_ballot_type_id_from_card_code(card_code):

    global CONV_card_code_TO_ballot_type_id_DICT

    if not CONV_card_code_TO_ballot_type_id_DICT:
        utils.sts("Recovering card_code_to_ballot_type_id_dict")
        #CONV_card_code_TO_ballot_type_id_DICT = DB.load_style(name='CONV_card_code_TO_ballot_type_id_DICT')
        CONV_card_code_TO_ballot_type_id_DICT = DB.load_data(
            dirname='styles',
            name='CONV_card_code_TO_ballot_type_id_DICT.json',
            silent_error=True)
        # if the file does not exist, then None is returned.

    try:
        ballot_type_id = CONV_card_code_TO_ballot_type_id_DICT[card_code]
    except (KeyError, TypeError):
        utils.exception_report(
            "get_ballot_type_id_from_card_code() Logic error: Could not find card_code in conv_dict"
        )
        return None

    return ballot_type_id
def get_cvr_info(argsdict):
    """ returns ballotid_to_style_dict and parsed_dominion_cvr, if available.
        THIS IS USED FOR BIF CREATION, NOT DEPRECATED.
        But 'BUILT_BALLOTID_TO_STYLE_DICT.json' is not needed.
    """

    vendor = argsdict.get('vendor')
    parsed_dominion_cvr = {}
    ballotid_to_style_dict = {}

    # first get the information ready from the CVR
    # the cvrs are not synchronized with the file archives so we need to load the entire cvr data.

    if vendor == 'Dominion' and not parsed_dominion_cvr:
        cvr_list = argsdict.get('cvr')
        if cvr_list and cvr_list[0] != '(not available)':
            utils.sts('Parsing Dominion CVRs')
            for cvr_path in [c for c in argsdict.get('cvr') if c != '(not available)']:
                # The following parses all the CVR chunks and produces a single dominion CVR.
                parsed_dominion_cvr.update(parse_dominion_cvr_chunks_to_dict(argsdict, cvr_path))
        elif argsdict.get('use_built_ballotid_to_style_dict'):
            #ballotid_to_style_dict = DB.load_style(name='BUILT_BALLOTID_TO_STYLE_DICT', silent_error=False)
            ballotid_to_style_dict = DB.load_data(dirname='styles', name='BUILT_BALLOTID_TO_STYLE_DICT.json')
           

    elif vendor == 'ES&S' and not ballotid_to_style_dict:
        utils.sts('Parsing ES&S CVRs')
        """
        To avoid loading all cvr files into one dataframe and risking memory overflow,
        styles_dict is generated from each CVR file and then merged.
        """
        utils.sts("creating ballotid to style lookup dict...", 3)

        # if no CVRs exist, this just returns empty dict.
        ballotid_to_style_dict = convert_cvr_to_styles_ess(argsdict, silent_error=True)
        
    return ballotid_to_style_dict, parsed_dominion_cvr
def genreport(argsdict):
    """
    This is a primary entry point from main.
    It processes all marks_df records and creates a report.
    algorithm:
    initialize results_dod
    process each votes_df in auditcvr directory:
        get list of unique contests
            for each contest:
                get list of unique options listed (assumes NoMarks already removed)
                for each option:
                    get total of votes for records with contest and option
                    total num_votes for the contest/option combination.
                    record in results_dod
                    print to console.
    save resultsN_json for each archive
    save results_json for all ballots included in the run.

    Note: this does not include precinct level report but is per-archive
    TODO: This should be decomposed into two steps, first to access the results,
            and second to produce a report in a some format.
    """
    utils.sts("Creating Results", 3)

    contests_dod = DB.load_data('styles', 'contests_dod.json')
    utils.sts(f"Total of {len(contests_dod)} contests.", 3)
    results_dod = {}

    dtype = {'idx': int, 'ballot_id': int, 'style_num': str, 'precinct': str,
        'option': str, 'has_indication': str, 'writein_name': str,
        'num_marks': int, 'num_votes': int, 'pixel_metric_value': int,
        'overvotes': int, 'undervotes': int, 'ssidx': int, 'delta_y': int}

    # if argsdict.get('use_lambdas'):
    #     try:
    #         columns = ['idx', 'ballot_id', 'style_num', 'precinct', 'contest', 'option',
    #                    'has_indication', 'num_marks', 'num_votes', 'pixel_metric_value',
    #                    'writein_name', 'overvotes', 'undervotes', 'ssidx', 'delta_y',
    #                    'ev_coord_str', 'ev_logical_style', 'ev_precinct_id']
    #         marks_df = pd.read_csv(f"{config_dict['RESOURCES_PATH']}{config_dict['RESULTS_PATHFRAG']}ballot_marks_df.csv",
    #                                dtype=dtype, skiprows=1, names=columns)
    #     except ValueError:
    marks_df = DB.load_data(dirname='marks', name='marks.csv', dtype=dtype)
    # else:
    #     marks_df = load_all_marks_df()
    utils.sts(f"Total of {len(marks_df.index)} records in combined marks_df")

    contests_list = list(marks_df['contest'].unique())
    num_ballots = len(marks_df['ballot_id'].unique())
    num_styles = len(marks_df['style_num'].unique())
    utils.sts(f"Total of {len(contests_list)} contests on {num_ballots} ballots with {num_styles} styles.")
    for contest in contests_list:
        contest_df = marks_df.loc[marks_df['contest'] == contest]
        # return all rows where option starts with '#contest'
        contest_headers_df = contest_df[contest_df['option'].str.match('#contest')]
        overvotes = contest_headers_df['overvotes'].sum()
        undervotes = contest_headers_df['undervotes'].sum()
        totvotes = contest_df['num_votes'].sum()
        num_contest_ballots = len(contest_headers_df.index)
        results_dod[contest] = {'overvotes': overvotes, 'undervotes': undervotes}
        print(f"\n-----------------------------------------\n{contest}")
        options_list = contests_dod[contest]['official_options_list']
        for option in options_list:
            option_df = marks_df.loc[(marks_df['contest'] == contest) & (marks_df['option'] == option)]
            option_votes = option_df['num_votes'].sum()
            print("   %20s: %8.1u  %3.2f%%" % (option, option_votes, (option_votes / totvotes) * 100))
            results_dod[contest][option] = option_votes
        writein_df = marks_df.loc[(marks_df['contest'] == contest) & (marks_df['option'].str.match('writein'))]
        writein_votes = writein_df['num_votes'].sum()
        print("   %20s: %8.1u" % ('Write-ins', writein_votes))
        results_dod[contest]['writein'] = writein_votes

        print("   %20s: %8.1u" % ('Total votes', totvotes))
        print("   %20s: %8.1u" % ('Overvotes', overvotes))
        print("   %20s: %8.1u" % ('Undervotes', undervotes))
        print("   %20s: %8.1u" % ('Contest Ballots', num_contest_ballots))
def extractvote_by_one_tasklist(
        argsdict: dict,
        tasklist_name: str,
        ):
    """ ACTIVE
    
    Extract vote from all ballots as specified in tasklist chunk in extraction_tasks folder.

    params:
    :param argsdict: provides arguments from input file or CLI such as filter specs.
    :param tasklist_name: created by f"{BIF.name}_chunk_{'%4.4u' % (chunk_index)}.csv"
            tasklist is found in the extraction_tasks folder.

    produces results/marks_{tasklist_name}

    This is the primary extraction function for lambda operation.
    
    PRIOR TO LAUNCHING THIS:
        Check availability of:
            styles/rois_map_df.csv      -- as a result of gentemplates, genrois, genmap
            styles/contests_dod.json    -- based on EIF
            

    """

    current_archive_basename = ''
    archive = None

    # set s3 vs local mode
    DB.set_DB_mode()        

    # initialize results.
    DB.BALLOT_MARKS_DF = pd.DataFrame()
    
    rois_map_df      = DB.load_data('styles', 'roismap.csv')
    contests_dod     = DB.load_data('styles', 'contests_dod.json')

    #extraction_tasks_df = DB.load_df_csv(name=tasklist_name, dirname='extraction_tasks', s3flag=argsdict['use_s3_results'])
    extraction_tasks_df = DB.load_data(dirname='marks', subdir='tasks', name=tasklist_name)

    #archives_folder_path = argsdict['archives_folder_path']

    for task_idx in range(len(extraction_tasks_df.index)):

        task_dict           = extraction_tasks_df.iloc[task_idx]
        ballot_id           = task_dict['ballot_id']
        precinct            = task_dict['precinct']
        archive_basename    = task_dict['archive_basename']

        """ has structure of BIF
            ('archive_basename', str),
            ('ballot_id', str),
            ('file_paths', str),    # note, may be semicolon separated list.
            ('cvr_file', str),
            ('precinct', str),
            ('party', str),
            ('style_num', str),
            ('card_code', str),
            ('ballot_type_id', str),
            ('sheet0', 'Int32'),                 # 0, 1 ...
            ('is_bmd', 'Int32'),
            ('style_roi_corrupted', 'Int32'),
            ('other_comments', str),
        """

        ballot_style_overrides_dict = args.get_ballot_style_overrides(argsdict)

        #ballot_id, vendor='ES&S', precinct=None, party=None, group=None, extension=None, file_paths=[]):
        # this call does nothing more than initialize the instance data
        ballot = Ballot(argsdict, 
            file_paths = re.split(r';', task_dict['file_paths']), 
            ballot_id=ballot_id, 
            precinct=precinct, 
            archive_basename=archive_basename)

        ballot.ballotdict['is_bmd'] = bool(utils.set_default_int(task_dict.get('is_bmd', 0), 0))

        if ((ballot.ballotdict['is_bmd'] and not argsdict['include_bmd_ballot_type']) or
            (not ballot.ballotdict['is_bmd'] and not argsdict['include_nonbmd_ballot_type'])):

            utils.exception_report(f"Tasklist says is_bmd is {ballot.ballotdict['is_bmd']} "
                "but argsdict does not include that type. Extract tasklists may be stale")
            continue

        if archive_basename != current_archive_basename:
            if current_archive_basename and archive:
                archive.close()
            utils.sts (f"opening archive: '{archive_basename}'...", 3)
            archive = open_archive(argsdict, archive_basename)
            current_archive_basename = archive_basename

        if not ballot.load_source_files(archive):
            string = f"EXCEPTION: Could not load source files from archive {archive_basename} offset {task_idx} for ballot_id: {ballot_id} Precinct: {precinct}"
            utils.exception_report(string)
            continue

        utils.sts(f"\n{'-'*50}\nProcessing tasklist:{tasklist_name} offset: {task_idx} ballot_id:{ballot_id}", 3)

        ballot.get_ballot_images()      # this reads images from PDFs

        #-----------------------------------------------------
        # this is the primary function call, performed for each ballot,
        # and producing a marks_df for this ballot, with one record for
        # each option.
        
        ballot_marks_df = extract_vote_from_ballot(
            argsdict, ballot, rois_map_df, contests_dod,
            ballot_style_overrides_dict,
            )
            
        # the above function makes exception reports if:
        #   1. the style cannot be read from the ballot, alignment or barcode error.
        #   2. the style failed to map.
        #-----------------------------------------------------

        if ballot_marks_df is None or not len(ballot_marks_df.index):
            continue    # not successful and exception has already been logged.

        # DataFrame.append was removed in pandas 2.x; use pd.concat instead.
        DB.BALLOT_MARKS_DF = pd.concat([DB.BALLOT_MARKS_DF, ballot_marks_df],
                                       sort=False, ignore_index=True)

    #DB.save_df_csv(name=tasklist_name, dirname='marks', df=DB.BALLOT_MARKS_DF)
    DB.save_data(data_item=DB.BALLOT_MARKS_DF, dirname='marks', subdir='chunks', name=f"marks_{tasklist_name}")
Example #16
def load_eif(argsdict):
    """returns conversions for contest names.
    This function implements the new EIF format which provides the following columns:
    # columns are defined above. 
    """
    """=================================================================
    The EIF file is used as follows:
        replace cvr columns
            when the column 'original_cvr_header' exists, then it signals replacement of the original header.
                when the cvr is read, the columns are replaced by the 
                values in the 'official_contest_name' column.
                This column also includes the initial indexing column names, 
                Cast Vote Record, Precinct, and Ballot Style, if they exist in the CVR.
                after the columns are replaced, then the official_contest_names 
                are always used when data is accessed from any cvr file.
        ballot text lookup      
            when we are mapping contests and options to ROIs as extracted from the ballot,
            we need the actual text as provided on the ballot for the best matching success.
            for that purpose, the styledict provides the contests
            included. It must be built or provided using the official_contest_names.
            Then, we need to look up the ballot text from the EIF file, which has the following components:
                ballot_contest_name
                ballot_options
                description
                writein_num
            to create this lookup, the leading rows containing any of (Cast Vote Record, Precinct, Ballot Style) are removed,
            then the list is scanned and rows with duplicate official_contest_name contents are deleted.
            the column 'original_cvr_header' is deleted (or ignored).
            look up the (first) record matching the official contest name, return the 
            ballot_contest_name, [ballot_options], description, and writein_num.
                if the ballot_contest_name is blank, the official_contest_name is used.
                if the ballot_options column is blank, the official options field is used.
                if the writein_num is blank, it defaults to the vote_for field, which defaults to 1.
                    If the writein_num is 0, it is not considered blank, and it returns 0.
                                        
    """
    utils.sts("Loading EIF...", 3)
    eif_filename = argsdict.get('eif')

    eif_df = DB.load_data(dirname='EIFs', name=eif_filename, user_format=True)

    eif_df = check_table(eif_df,
                         table_name=eif_filename,
                         required_columns_list=EIF_REQUIRED_COLS,
                         strip_cols=EIF_STRIP_COLS)

    if 'original_cvr_header' in list(eif_df.columns):
        # not needed to check this if there is no original CVR header.
        cvr_replacement_header_list = eif_df['official_contest_name'].tolist()
        expected_initial_cvr_cols = argsdict.get(
            'initial_cvr_cols',
            ['Cast Vote Record', 'Precinct', 'Ballot Style'])
        if expected_initial_cvr_cols != cvr_replacement_header_list[
                0:len(expected_initial_cvr_cols)]:
            utils.sts(
                f"ERROR: EIF list of initial_cvr_cols does not match input file setting {','.join(expected_initial_cvr_cols)}",
                0)
            sys.exit(1)

        # drop the first few rows of the dataframe, we don't need these extra CVR fields.
        contest_lookup_df = eif_df.iloc[len(expected_initial_cvr_cols):]
    else:
        contest_lookup_df = eif_df

    # eliminate all the duplicates which occur when vote_for > 1
    contest_lookup_df = contest_lookup_df.drop_duplicates(
        'official_contest_name', keep='first')
    utils.sts("EIF loaded OK.", 3)
    return contest_lookup_df
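# A minimal sketch of the ballot-text lookup the docstring above describes, run
# against the contest_lookup_df returned by load_eif(). The helper name and exact
# fallback handling are assumptions drawn from the docstring, not the codebase's
# actual lookup function.

def lookup_ballot_text(contest_lookup_df, official_contest_name: str) -> dict:
    """Return ballot text fields for a contest, applying the documented fallbacks."""

    def blank(v) -> bool:
        # None, NaN, and empty/whitespace strings are blank; note that 0 is NOT blank.
        return v is None or (isinstance(v, float) and v != v) or str(v).strip() == ''

    rows = contest_lookup_df.loc[
        contest_lookup_df['official_contest_name'] == official_contest_name]
    if not len(rows.index):
        return {}
    rec = rows.iloc[0]                          # first matching record wins
    writein_num = rec.get('writein_num')
    if blank(writein_num):                      # blank defaults to vote_for, then 1
        writein_num = rec.get('vote_for')
        if blank(writein_num):
            writein_num = 1
    ballot_contest_name = rec.get('ballot_contest_name')
    if blank(ballot_contest_name):              # blank falls back to the official name
        ballot_contest_name = official_contest_name
    return {
        'ballot_contest_name': ballot_contest_name,
        'ballot_options': rec.get('ballot_options'),
        'description': rec.get('description'),
        'writein_num': int(float(writein_num)),
    }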
def generate_cmpcvr_report(argsdict):
    discrepancy_reports = []
    report_dirpath = DB.dirpath_from_dirname('reports')
    report_path = f"{report_dirpath}Discrepancy Report for Automated Independent Audit.html"
    # cmpcvr_dirpath = DB.dirpath_from_dirname('cmpcvr')
    # cmpcvr_agreed_df = DB.load_data(dirname='cmpcvr', name='cmpcvr-agreed.csv', silent_error=True)
    # if cmpcvr_agreed_df is None:
    #     cmpcvr_agreed_df = pd.DataFrame(columns=['ballot_id', 'style', 'precinct', 'contest', 'agreed', 'blank',
    #                                              'chunk_name', 'contests_mismatch'])
    cmpcvr_disagreed_df = DB.load_data(dirname='cmpcvr',
                                       name='disagreed.csv',
                                       silent_error=True)
    if cmpcvr_disagreed_df is None:
        cmpcvr_disagreed_df = pd.DataFrame(columns=[
            'ballot_id', 'style', 'precinct', 'contest', 'agreed', 'blank',
            'chunk_name', 'contests_mismatch', 'vote_difference', 'audit_info',
            'cvr_info'
        ])
    # try:
    #     columns = ['idx', 'ballot_id', 'style', 'precinct', 'contest', 'option',
    #                'has_indication', 'num_marks', 'num_votes', 'pixel_metric_value',
    #                'writein_name', 'overvotes', 'undervotes', 'ssidx', 'delta_y',
    #                'ev_coord_str', 'ev_logical_style', 'ev_precinct_id']
    #     dtype = {'idx': int, 'ballot_id': int, 'style': int, 'precinct': str,
    #              'option': str, 'has_indication': str, 'writein_name': str,
    #              'num_marks': int, 'num_votes': int, 'pixel_metric_value': float,
    #              'overvotes': int, 'undervotes': int, 'ssidx': int, 'delta_y': int}
    #     ballot_marks_df = pd.read_csv(cmpcvr_dirpath + 'ballot_marks_df.csv', dtype=dtype, skiprows=1, names=columns)
    # except ValueError:
    #     ballot_marks_df = pd.read_csv(cmpcvr_dirpath + 'ballot_marks_df.csv')

    # the following will require that all marks_df segments are combined.
    ballot_marks_df = DB.load_data(dirname='marks',
                                   name='marks.csv',
                                   silent_error=True)

    if ballot_marks_df is None:
        # the combined marks.csv must exist before this report can be generated.
        utils.sts("marks.csv not found; cannot generate cmpcvr report.")
        return

    num_marks_ballots = len(ballot_marks_df['ballot_id'].unique())

    precincts = ballot_marks_df['precinct'].unique().tolist()
    for precinct in precincts:
        #precinct_cmpcvr_agreed_df = cmpcvr_agreed_df.loc[cmpcvr_agreed_df['precinct'] == precinct]
        precinct_cmpcvr_disagreed_df = cmpcvr_disagreed_df.loc[
            cmpcvr_disagreed_df['precinct'] == precinct]
        disagreed_rows = len(
            precinct_cmpcvr_disagreed_df['ballot_id'].unique())

        # Pass precincts in which number of disagreed ballots is smaller than the threshold.
        discrepancy = round((disagreed_rows / num_marks_ballots) * 100, 2)
        if discrepancy < argsdict.get('precinct_reporting_threshold_percent',
                                      0):
            continue
        precinct_report_path = f"{report_dirpath}Report - {precinct}.html"
        discrepancy_reports.append({
            'precinct': precinct,
            'ballots': num_marks_ballots,
            'discrepancy': discrepancy,
            'path': precinct_report_path
        })
        precinct_marks_df = ballot_marks_df.loc[ballot_marks_df['precinct'] ==
                                                precinct]
        with open(precinct_report_path, 'w') as html_file:
            doc = build_discrepancy_reports(
                precinct,
                precinct_agreed_df=None,
                precinct_disagreed_df=precinct_cmpcvr_disagreed_df,
                precinct_marks_df=precinct_marks_df)

            html_file.write(doc.render())
    with open(report_path, 'w') as html_file:
        doc = build_discrepancy_parent_report(discrepancy_reports)
        html_file.write(doc.render())
        utils.sts(os.path.abspath(report_path))
Example #18
def build_template_tasklists(argsdict):
    """ with all bif chunks created, scan them and create template_tasklists.
        each tasklist contains records from bif for ballots to be included
        in the template. These are written to template_tasklists folder.
        
        Note that this processes BIFs one at a time, rather than combining
        them all in memory, which is not scalable.
    """

    utils.sts("Building template tasklists...", 3)
    
    incomplete_style_ballots_dodf = {}      # dict keyed by style of df
    completed_eff_styles_dodf = {}          # dict keyed by effective (merged) style of df

    num_ballots_to_combine = argsdict.get('threshold', 50)

    # then following works even if bif is generated from CVR.
    # because the separate bif csv files are still produced.
    bif_names = get_biflist(fullpaths=False)

    if argsdict['merge_similar_styles']:
        
        #sheetstyle_map_dict = DB.load_json('styles', 'sheetstyle_map_dict.json', silent_error=False)
        sheetstyle_map_dict = DB.load_data(dirname='styles', name='sheetstyle_map_dict.json')

    for bif_name in bif_names:
        utils.sts(f"  Processing bif {bif_name}...", 3)
        
        BIF.load_bif(name=bif_name)
        reduced_df = BIF.df_without_corrupted_and_bmd()
        reduced_df = set_style_from_party_if_enabled(argsdict, reduced_df)
        
        style_nums_in_this_bif = list(reduced_df['style_num'].unique())
        utils.sts(f"  Found {len(style_nums_in_this_bif)} unique styles", 3)

        for style_num in style_nums_in_this_bif:
            utils.sts(f"Processing style:{style_num} ", 3, end='')
            previously_captured = 0

            eff_style = style_num
            if argsdict['merge_similar_styles']:
                eff_style = sheetstyle_map_dict[style_num[1:]]          # skip language char.
                # this is the contests-only style on per sheet basis.
                # it does not have language included. So we add the language from original style
                lang_code = style_num[0:1]                              # first char
                eff_style = "%1.1u%4.4u" % (int(lang_code), int(eff_style))

                utils.sts(f"Effective (merged) style is:{eff_style} ", 3, end='')
            
            if eff_style in completed_eff_styles_dodf:
                utils.sts(" Tasklist already created", 3)
                continue

            # first see if we were already working on this style
            if eff_style in incomplete_style_ballots_dodf:
                previously_captured = len(incomplete_style_ballots_dodf[eff_style].index)
                utils.sts(f"Previously captured {previously_captured} ", 3, end='')
            # find records with this eff_style

            style_df = reduced_df[(reduced_df['style_num'] == style_num)][0:(num_ballots_to_combine-previously_captured)]
            utils.sts(f" Just Captured {len(style_df.index)}", 3, end='')

            if previously_captured:
                style_df = pd.concat([incomplete_style_ballots_dodf[eff_style], style_df],
                                     ignore_index=True)
                utils.sts(f" Total captured {len(style_df.index)}", 3, end='')
            if len(style_df.index) >= num_ballots_to_combine:
                completed_eff_styles_dodf[eff_style] = style_df
                incomplete_style_ballots_dodf.pop(eff_style, None)
                utils.sts(" Full", 3)
            else:
                utils.sts(" Queued", 3)
                incomplete_style_ballots_dodf[eff_style] = style_df


    # skip those that have too few records, i.e. < min_ballots_required

    min_ballots_required = argsdict.get('min_ballots_required', 1)
    too_few_ballots_styles = []
    template_tasklists_dodf = {}
    for eff_style, style_df in {**completed_eff_styles_dodf, **incomplete_style_ballots_dodf}.items():
        num_records = len(style_df.index)
        if num_records < min_ballots_required:
            utils.sts(f"Style {eff_style} has too few records, {min_ballots_required} ballots are required, skipping...", 3)
            too_few_ballots_styles.append(eff_style)
            continue
        template_tasklists_dodf[eff_style] = style_df
        
    # write tasklists
    utils.sts("\n  Writing tasklists:", 3)
    if not argsdict['use_single_template_task_file']:
        for eff_style, style_df in template_tasklists_dodf.items():
            utils.sts(f"  Writing tasklists for style:{eff_style} with {'%2.2u' % (len(style_df.index))} entries ", 3, end='')
            style_df.sort_values(by=['archive_basename'], inplace=True)
            pathname = DB.save_data(data_item=style_df, dirname='styles', subdir='tasks', name=str(eff_style), format='.csv')
            utils.sts(f"to {pathname}", 3)
    else:
        template_tasklists_dolod = utils.dodf_to_dolod(template_tasklists_dodf)
        utils.sts(f"Writing combined tasklists with {'%2.2u' % (len(template_tasklists_dolod))} tasklists ", 3, end='')
        DB.save_data(data_item=template_tasklists_dolod, dirname='styles', name="template_tasklists_dolod.json")

    completed_count = len(completed_eff_styles_dodf)
    incompleted_count = len(incomplete_style_ballots_dodf)

    utils.sts(  f"Total number of styles detected: {completed_count + incompleted_count} \n"
                f"            Completed tasklists: {completed_count}\n"
                f"   Incomplete tasklists created: {incompleted_count}\n"
                f"    Styles will too-few ballots: {too_few_ballots_styles}\n"
                , 3)
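# utils.dodf_to_dolod() is used above but not shown; a one-line sketch of the
# assumed conversion from dict-of-DataFrames to dict-of-list-of-dicts, which makes
# the tasklists JSON-serializable:

def dodf_to_dolod(dodf: dict) -> dict:
    """Convert {key: DataFrame} to {key: [record dicts]}."""
    return {key: df.to_dict(orient='records') for key, df in dodf.items()}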
Example #19
def gentemplates_by_tasklists(argsdict):
    """
    ACTIVE
    This replaces the gentemplates function.
    given tasklists which exist in the tasklist folder,
    read each in turn and if the number of ballots included meet a minimum,
    process each line item in turn.
    The style is the name of the tasklist.

    Tasklists are generated by reviewing the BIF tables.
    
    Each delegation to lambdas (or performed locally) will include 
    subprocesses according to the argsdict parameters:
    
        include_gentemplate_tasks       - include the generation of tasklists prior to delegation.
        use_single_template_task_file   - means a single JSON file will be created instead of separate task files on s3
                                            and a portion of that task list will be passed to each lambda
        include_gentemplate             - for each style, combine ballots to create a base template
        include_genrois                 - generate regions of interest (ROIs) and OCR
        include_maprois                 - map the official contest names to what is read on the ballot to create roismap
        

    
    """
    styles_on_input = []
    #attempted_but_failed_styles = []   # will need to determine by looking for templates

    utils.sts('Generating style templates from a combined set of ballot images', 3)

    # this loads and parses the EIF
    contests_dod = create_contests_dod(argsdict)
    #DB.save_style(name='contests_dod', style_data=contests_dod)
    DB.save_data(data_item=contests_dod, dirname='styles', name='contests_dod.json')

    # style_to_contests_dol
    # if the CVR is available, we can get a list of styles that are associated with a ballot_type_id.
    # this may be enough to know exactly what contests are on a given ballot, but only if the 
    # style which keys this list is also directly coupled with the card_code read from the ballot.
    # In some cases, such as Dane County, WI, this is a 1:1 correspondence. But SF has a complex
    # style conversion which is nontrivial to figure out. 
    # thus, this is still needed in style discovery.

    style_to_contests_dol = DB.load_data(dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json', silent_error=True)
    if not style_to_contests_dol:
        logs.sts("CVR_STYLE_TO_CONTESTS_DICT.json not available. Trying to convert CVR to styles", 3)
        style_to_contests_dol = convert_cvr_to_styles(argsdict, silent_error=True)
        if not style_to_contests_dol:
            logs.sts("Unable to convert CVR to style_to_contests_dol, trying manual_styles_to_contests", 3)
            style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)

        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')
            
    if not style_to_contests_dol:
        logs.sts("style_to_contests_dol unavailable. full style search is required.", 3)

    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    first_pass = True

    if argsdict['use_single_template_task_file']:
        template_tasklists_dolod = DB.load_data(dirname='styles', name="template_tasklists_dolod.json")
        total_num = len(template_tasklists_dolod)
        utils.sts(f"Found {total_num} taskslists", 3)
        
        for chunk_idx, (style_num, style_lod) in enumerate(template_tasklists_dolod.items()):
            if not style_num: continue
            
            if (argsdict.get('include_style_num') and style_num not in argsdict['include_style_num']) or \
                (argsdict.get('exclude_style_num') and style_num in argsdict['exclude_style_num']):
                continue
            
            styles_on_input.append(style_num)

            if argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue
                
            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles', 
                subdir=style_num,
                chunk_idx=chunk_idx, 
                filelist=[style_lod],            # only one style per lambda chunk, but can execute gentemplate, genrois, and maprois for same style.
                group_name=style_num, 
                task_name='gentemplate', 
                incremental=False,
                )

            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)           
                first_pass = False
            # if not generate_template_for_style_by_tasklist_df(argsdict, style_num, tasklist_df):
                # attempted_but_failed_styles.append(style_num)
        
    else:    
        tasklists = DB.list_files_in_dirname_filtered(dirname='styles', subdir="tasks", file_pat=r'.*\.csv', fullpaths=False)
        total_num = len(tasklists)
        utils.sts(f"Found {total_num} taskslists", 3)

        for chunk_idx, tasklist_name in enumerate(tasklists):
            if tasklist_name == '.csv': continue
            
            style_num = os.path.splitext(os.path.basename(tasklist_name))[0]
            styles_on_input.append(style_num)

            if args.argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue
                
            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles', 
                chunk_idx=chunk_idx, 
                filelist=[tasklist_name], 
                group_name=style_num, 
                task_name='gentemplate', 
                incremental=False,
                )
            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)           
                first_pass = False

    wait_for_lambdas(argsdict, task_name='gentemplate')
    post_gentemplate_cleanup(argsdict)
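# For orientation, a hedged sketch of the argsdict flags this function consumes,
# per the docstring and the code above; values are illustrative only.

example_argsdict = {
    'use_single_template_task_file': False,  # one combined JSON tasklist vs. per-style CSVs
    'include_gentemplate': True,             # build a blank-ballot template per style
    'include_genrois': True,                 # generate regions of interest (ROIs) and OCR
    'include_maprois': True,                 # map official contest names to ballot text
    'use_lambdas': False,                    # delegate chunks to lambdas
    'one_lambda_first': True,                # run one lambda and wait before fanning out
    'incremental_gentemplate': False,        # skip styles whose templates already exist
    'include_style_num': [],                 # optional style allowlist
    'exclude_style_num': [],                 # optional style blocklist
}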
Example #20
def convert_cvr_to_styles_ess(argsdict: dict = None, silent_error: bool = False):
    """ ACTIVE -- this is used to create BIF.
        open each of the ess cvr files and create cvr_ballotid_to_style_dict
        by reading Ballot Style column.
        returns the cvr_ballotid_to_style_dict
    """
    
    if not argsdict['cvr'] or argsdict['cvr'] == ['(not available)'] or argsdict['cvr'] == ['']:
        utils.sts("CVR file not specified")
        if silent_error:
            return {}
        else:
            sys.exit(1)
    cvr_replacement_header_list = get_replacement_cvr_header(argsdict)
    master_styles_dict = {}
    cvr_ballotid_to_style_dict = {}
    for cvr_file in argsdict['cvr']:
        utils.sts(f"Processing cvr file: {cvr_file}", 3)
        #cvr_df = pd.read_excel(cvr_file, engine='xlrd')
        cvr_df = DB.load_data(dirname='archives', name=cvr_file, user_format=False)
        
        # probably all of this jazz below should be encapsulated.

        if cvr_replacement_header_list:
            # use the official contest names for column headers instead of those provided.
            orig_col_names = cvr_df.columns
            if len(orig_col_names) != len(cvr_replacement_header_list):
                utils.sts("official contest names list not right length to replace header names in CVR")
                sys.exit(1)
            # we will replace any "blank" col names with "Unnamed: XXX" so we can remove them later.
            for i, orig_col_name in enumerate(orig_col_names):
                if re.match(r'^Unnamed:', orig_col_name):
                    cvr_replacement_header_list[i] = orig_col_name
            cvr_df.columns = cvr_replacement_header_list

        # remove columns that had no names. These are when vote_for is > 1.
        dup_col_names = []
        for column in list(cvr_df.columns):
            if re.match(r'^Unnamed:', column):
                dup_col_names.append(column)
        cvr_df.drop(columns=dup_col_names, inplace=True)

        # check for duplicate contest names; all must be unique.
        if argsdict.get('check_dup_contest_names', True):
            duplicates = utils.find_duplicates(cvr_df.columns)
            if duplicates:
                string = '\n'.join(duplicates)
                utils.sts(f'Duplicate columns detected in CVR. All contest names must be unique.\n'
                          f'{string}')
                sys.exit(1)
        utils.sts('Generating cvr_to_styles_dict', 3)
        styles_dict = cvr_to_styles_dict(argsdict, cvr_df)
        
        utils.sts('Generated cvr_to_styles_dict OK', 3)
        # combine with the master_styles_dict, discarding any duplicates that might span cvr blocks.
        master_styles_dict = {**master_styles_dict, **styles_dict}
        ballotid_to_style_dict = cvr_to_ballotid_to_style_dict(cvr_df)
        cvr_ballotid_to_style_dict = {**cvr_ballotid_to_style_dict, **ballotid_to_style_dict}

    total_styles = len(master_styles_dict)

    utils.sts(f"Total of {total_styles} unique styles detected.\nWriting styles to contests dict to JSON file...", 3)

    DB.save_data(master_styles_dict, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')
    
    return cvr_ballotid_to_style_dict
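# cvr_to_ballotid_to_style_dict() is referenced above but not shown. A minimal
# sketch consistent with the docstring ("reading Ballot Style column"), assuming
# the replaced header names 'Cast Vote Record' and 'Ballot Style':

def cvr_to_ballotid_to_style_dict(cvr_df) -> dict:
    """Map each ballot id to its style, as read from the CVR dataframe."""
    return dict(zip(cvr_df['Cast Vote Record'].astype(str),
                    cvr_df['Ballot Style'].astype(str)))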