def load_source_files(self, archive=None, mode='archive'):
        """ Load this ballot's files into ballot.ballotimgdict['source_files'].
            Returns False when any file cannot be loaded, True otherwise.
            mode:
                'archive'   -- pull the files named in this ballot instance from
                               the given archive (which may be located on s3).
                'local'     -- treat filepaths as local paths of extracted files.
                's3'        -- loads files already extracted from zip archives on s3 (DEPRECATED)
            NOTE: the deprecated 's3' mode is not implemented below and falls
            through to the error exit.
        """
        # normalize file_paths to a list, in place, so callers see the list form
        paths = self.ballotdict['file_paths']
        if not isinstance(paths, list):
            paths = [paths]
            self.ballotdict['file_paths'] = paths

        for path in paths:
            if mode == 'archive':
                # entry is a dict of 'name': (name), 'bytes_array': (the file)
                entry = get_archived_file(archive, path)
                if not entry:
                    return False
            elif mode == 'local':
                with open(path, 'rb') as infile:
                    contents = infile.read()
                entry = {'name': path, 'bytes_array': contents}
            else:
                utils.exception_report(
                    f"Invalid mode:{mode} in load_source_files()")
                sys.exit(1)

            self.ballotimgdict['source_files'].append(entry)
        return True
    def get_ballot_images(self):
        """ Convert the already-loaded source files (dicts of name, bytes_array)
            into images in self.ballotimgdict['images'].
            Skips over the step of placing in source.
            Exits with an exception report on an unrecognized extension.
        """
        # dispatch table: extension -> converter for one source-file dict
        converters = {
            '.pdf': get_images_from_pdf,
            '.pbm': get_images_from_pbm,
            '.tif': get_images_from_tif,
            '.png': get_images_from_png,
        }

        self.ballotimgdict['images'] = []
        extension = self.ballotdict['extension']
        utils.sts(f"Converting images from {extension} data...", 3, end='')

        for filedict in self.ballotimgdict['source_files']:
            # look up inside the loop so an unknown extension with no
            # source files is tolerated, exactly as before.
            converter = converters.get(extension)
            if converter is None:
                utils.exception_report(
                    f"get_ballot_images(): 'extension':{extension} not recognized."
                )
                sys.exit(1)
            self.ballotimgdict['images'].extend(converter(filedict))

        utils.sts(f"{len(self.ballotimgdict['images'])} image(s) converted.",
                  3)
def fuzzy_compare_str_to_list(correct_strlist: list,
                              ocr_str: str,
                              thres: float,
                              fuzzy_compare_mode='best_of_all') -> tuple:
    """ Fuzzy-match ocr_str against each entry of correct_strlist.

        Returns (found, index, metric):
            found  -- True when the best metric exceeds thres
            index  -- offset in correct_strlist of the best match
            metric -- the best metric value
    """
    utils.sts(
        f"Comparing strlists\ncorrect '{join_remove_nl(correct_strlist)}'\n"
        f"ocr_str '{ocr_str}'", 3)
    metrics = fuzzy_metrics_str_to_list(correct_strlist, ocr_str,
                                        fuzzy_compare_mode)

    if not metrics:
        return False, 0, 0

    # find the winner before any reordering; the index must refer to the
    # original list positions.
    best = max(metrics)
    best_idx = metrics.index(best)

    if len(metrics) > 1:
        # work on a sorted copy to inspect the runner-up without
        # disturbing the original ordering.
        ranked = sorted(metrics, reverse=True)
        if ranked[0] > 0.7 and ranked[0] - ranked[1] < 0.3:
            utils.exception_report(
                f"Close fuzzy discrimination: max_metric:{ranked[0]} next_metric:{ranked[1]}\n"
                f"ocr_str:{ocr_str} correct_strlist:{', '.join(correct_strlist)}")

    return bool(best > thres), best_idx, best
def parse_s3path(s3path):
    """ Split an s3 path (same form as the s3 console uses) into components.

        format:
            s3://<bucket>/<prefix>/<basename>
        where <prefix>/<basename> is the key.

        Returns dict with keys protocol, bucket, prefix, basename, key.
        Reports and exits when the path does not match the expected form.
    """
    s3dict = {}
    match = re.search(r'(.*://)([^/]+)/(.*/)(.*)$', s3path)

    if match:
        s3dict = {
            'protocol': match[1],
            'bucket':   match[2],
            'prefix':   match[3],
            'basename': match[4],
        }
        s3dict['key'] = s3dict['prefix'] + s3dict['basename']

    # .get() keeps this safe when the regex did not match at all
    if (not match
            or s3dict.get('protocol') != 's3://'
            or not s3dict.get('bucket')
            or not s3dict.get('key')):
        utils.exception_report(f"s3_path format invalid: {s3path}")
        sys.exit(1)

    return s3dict
# Example #5
# 0
def update_CONV_card_code_TO_ballot_type_id_DICT(card_code, ballot_type_id):
    """ Record card_code -> ballot_type_id in the module-level conversion dict.
        An existing conflicting entry is reported and left unchanged.
    """
    if card_code not in CONV_card_code_TO_ballot_type_id_DICT:
        # first sighting of this card_code; record the mapping (mutation in
        # place, so no 'global' statement is needed).
        CONV_card_code_TO_ballot_type_id_DICT[card_code] = ballot_type_id
    elif CONV_card_code_TO_ballot_type_id_DICT[card_code] != ballot_type_id:
        utils.exception_report(
            f"CONV_card_code_TO_ballot_type_id_DICT is inconsistent. "
            f"card_code:{card_code} provides {CONV_card_code_TO_ballot_type_id_DICT[card_code]} instead of ballot_type_id:{ballot_type_id}"
        )
def extractvote_by_tasklists(argsdict: dict):
    """
    ACTIVE
    Replacement for the extractvotes function.

    Walks the tasklists found in the marks/tasks folder (generated from the
    BIF tables) and dispatches one extraction chunk per tasklist.
    Each tasklist creates a separate f"marks_{tasklist_name}.csv" file in the
    results folder.
    """
    logs.sts('Extracting marks from extraction tasklists', 3)

    tasklists = DB.list_files_in_dirname_filtered(
        dirname='marks', subdir='tasks', file_pat=r'^[^~].*\.csv$', fullpaths=False)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} taskslists", 3)

    if argsdict['use_lambdas']:
        LambdaTracker.clear_requests()

    for bif_idx, bifname in enumerate(get_biflist(no_ext=True)):
        archive_name = re.sub(r'_bif', '', bifname)
        # tasklists belonging to this archive
        matching_tasks = [name for name in tasklists if name.startswith(archive_name)]

        for chunk_idx, tasklist_name in enumerate(matching_tasks):
            #----------------------------------
            # This call may delegate to lambdas and return immediately when
            # 'use_lambdas' is enabled; otherwise it blocks until the chunk
            # is completed.
            build_one_chunk(argsdict,
                dirname='marks',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],
                group_name=bifname,
                task_name='extractvote',
                incremental=False)
            #----------------------------------

            # optionally prove out a single lambda end-to-end before fanning
            # out the remaining chunks.
            if chunk_idx == 0 and bif_idx == 0 and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='extractvote'):
                    utils.exception_report("task 'extractvote' failed delegation to lambdas.")
                    sys.exit(1)

    wait_for_lambdas(argsdict, task_name='extractvote')

    utils.combine_dirname_chunks_each_archive(argsdict, dirname='marks')
    logs.get_and_merge_s3_logs(dirname='marks', rootname='log', chunk_pat=r"_chunk_\d+", subdir="chunks")
    logs.get_and_merge_s3_logs(dirname='marks', rootname='exc', chunk_pat=r"_chunk_\d+", subdir="chunks")
def get_ballot_from_image_filepaths(argsdict:dict, file_paths:list, mode='archive', archive_basename=None, archive=None):
    """ given list of one or two filepaths that comprise the ballot,
        access the images and extract the style and BMD status information.
        creates ballot object and returns it.

        mode: 'local' reads extracted files from the filesystem;
              'archive' reads from the supplied open archive.
        Exits with an exception report if the source files cannot be loaded
        from the archive, or if mode is unrecognized.

        if argsdict['style_from_party'] provides a list of style nums for each party, then
            set style_num according to that but also leave card_code equal to what was read from the card.
        (that mapping is now applied after the BIF is built --
         see set_style_from_party_if_enabled)
    """
    # this call does nothing more than initialize the instance data
    ballot = Ballot(argsdict, file_paths=file_paths, archive_basename=archive_basename)

    if mode == 'local':
        ballot.load_source_files(archive=None, mode=mode)

    elif mode == 'archive':
        ballot_id = ballot.ballotdict['ballot_id']
        precinct = ballot.ballotdict['precinct']

        if not ballot.load_source_files(archive):
            string = f"EXCEPTION: Could not load source files from archive {archive_basename} for ballot_id: {ballot_id} Precinct: {precinct}"
            utils.exception_report(string)
            sys.exit(1)
    else:
        # Bug fix: previously an unknown mode fell through silently with no
        # source files loaded, and get_ballot_images() then failed in a
        # confusing way. Fail fast instead, matching load_source_files().
        utils.exception_report(f"Invalid mode:{mode} in get_ballot_from_image_filepaths()")
        sys.exit(1)

    ballot.get_ballot_images()      # this reads images from PDFs
    ballot.align_images()
    ballot.read_style_num_from_barcode(argsdict)

    if not ballot.ballotdict['card_code']:
        # here, we find that we are unable to read the style from the ballot.
        # see if this is a ballot with a barcode (i.e. a BMD ballot).
        barcodes = barcode_decode(ballot.ballotimgdict['images'][0])
        ballot.ballotdict['is_bmd'] = bool(barcodes)

    return ballot
def check_mark_dfs_vs_ballotid_dict(argsdict, cvr_ballotid_to_style_dict):
    """
    DEPRECATED. SHOULD USE BIF INSTEAD BUT REPLACEMENT IS NOT WRITTEN YET.
    Cross-checks in both directions: every marks_df extraction file should
    have a CVR record, and every CVR record should have a marks_df file.
    :param argsdict: unused (function is disabled).
    :param cvr_ballotid_to_style_dict: maps ballot_id -> style from the CVR.
    """
    # Hard-disabled: always announce the deprecation and abort the run.
    print("This function is deprecated.")
    sys.exit(1)

    # ------------------------------------------------------------------
    # Everything below this line is UNREACHABLE (kept for reference until
    # the BIF-based replacement is written).
    # ------------------------------------------------------------------
    utils.sts ("Checking all mark_dfs are in the CVR and the reverse...", 3)
    if not cvr_ballotid_to_style_dict:
        utils.sts ("Can't check if ballots are in the CVR, no cvr_ballotid_to_style_dict exists", 3)
        return 0
    
    # create a new dictionary with same keys, all False.
    found_ballotids = dict.fromkeys(cvr_ballotid_to_style_dict, False)

    missing_cvr_record_report_str = ''
    missing_marks_df_report_str = ''
    num_missing_cvr_records = 0
    
    marks_df_list = get_marks_df_list()
    
    num_marks_dfs = len(marks_df_list)
    utils.sts (f"{num_marks_dfs} extracted ballots found.", 3)

    # forward check: each extracted marks_df must appear in the CVR;
    # mark CVR entries as found along the way.
    for marks_df_name in marks_df_list:
        ballot_id = get_ballot_id_from_marks_df_name(marks_df_name)
        if ballot_id in cvr_ballotid_to_style_dict:
            found_ballotids[ballot_id] = True
        else:
            missing_cvr_record_report_str += f"Ballot marks_df_{ballot_id}.json extraction file exists but is not included in Cast Vote Records.\n"
            num_missing_cvr_records += 1

    # reverse check: any CVR entry never marked found has no marks_df file.
    total_missing_files = 0
    for key in found_ballotids:
        if not found_ballotids[key]:
            missing_marks_df_report_str += f"Ballot {key} exists in CVR files but no marks_df_{key}.json file found in the results folder.\n"
            total_missing_files += 1

    utils.sts (f"Total of {len(marks_df_list)} marks_df files found.\n"
                f"checked against {len(cvr_ballotid_to_style_dict)} CVR records.", 3)
    if missing_cvr_record_report_str:
        missing_cvr_record_report_str = f"### EXCEPTION: {num_missing_cvr_records} ballot image files exist that do not have matching marks_df files\n" + missing_cvr_record_report_str
        utils.exception_report(missing_cvr_record_report_str)
    if missing_marks_df_report_str:
        missing_marks_df_report_str = f"### EXCEPTION: {total_missing_files} CVR records exist that have no corresponding marks_df files.\n" + missing_marks_df_report_str
        utils.exception_report(missing_marks_df_report_str)
# Example #9
# 0
def dominion_ballot_type_to_external_id(ballot_type_id) -> int:
    """ Convert a BallotTypeId (from the CVR JSON file) to its ExternalId.

        The conversion is available in the file BallotTypeManifest.json but
        was hardcoded here for now (CONV_external_id_TO_ballot_type_id_DOL);
        the inverted lookup table is built lazily on first use.
    """
    global CONV_ballot_type_id_TO_external_id_DICT

    if not CONV_ballot_type_id_TO_external_id_DICT:
        # build the direct lookup table once, for speed.
        CONV_ballot_type_id_TO_external_id_DICT = utils.invert_dol_to_dict(
            CONV_external_id_TO_ballot_type_id_DOL)

    if ballot_type_id not in CONV_ballot_type_id_TO_external_id_DICT:
        utils.exception_report(
            "dominion_ballot_type_to_external_id() Logic error: Could not find ballot_type_id in conv_dict"
        )
        sys.exit(1)

    return CONV_ballot_type_id_TO_external_id_DICT[ballot_type_id]
# Example #10
# 0
def get_next_ballot_paths(index, archive, file_paths, extension=None):
    """
    given entire list of file_paths and index in archive,
    Returns (index, paths) where paths is a list of one or two filepaths that
    relate to a single ballot.
    Most ballot types (.pdf, .png, .tif) have one file per both sides but
    .pbm has two filenames per ballot (front 'F.pbm' and rear 'R.pbm').

    :param index: position in file_paths of the ballot's (front) file.
    :param archive: unused here (kept for interface compatibility).
    :param file_paths: full list of file paths in the archive.
    :param extension: optional override; derived from file_path when None.
    :raises IndexError: when index is out of range for file_paths.
    """
    # Bug fix: the original wrapped this lookup in a bare try/except-pass,
    # which left file_path unbound and produced a confusing NameError on an
    # out-of-range index. That is a caller error; let IndexError propagate.
    file_path = file_paths[index]

    # for most cases, there is only one file per ballot sheet. .pbm has two files per sheet.
    return_paths = [file_path]

    if extension is None:
        _, extension = os.path.splitext(
            file_path)  # note: extension includes '.'

    if extension == '.pbm':
        index += 1
        try:
            R_file_path = file_paths[index]
        except IndexError:  # narrowed from a bare except
            utils.exception_report(
                f"Warning: could not find rear file of .pbm file {file_path}, insufficient files."
            )
            return index - 1, return_paths

        if file_path.endswith('F.pbm') and R_file_path.endswith('R.pbm'):
            _, _, ballotid = analyze_ballot_filepath(file_path)
            _, _, R_ballotid = analyze_ballot_filepath(R_file_path)
            if ballotid == R_ballotid:
                return_paths.append(R_file_path)
            # NOTE(review): when the ballot ids differ, the rear file is
            # consumed (index stays advanced) but not returned -- confirm
            # this is the intended behavior.
        else:
            utils.exception_report(
                f"Warning: could not find rear file of .pbm file {file_path}")
            return index - 1, return_paths

    return index, return_paths
# Example #11
# 0
def read_raw_ess_barcode(image, ballot_id=''):
    """ Read the timing marks on the left edge and extract the binary code
        based on the width of the timing marks.

        image: np.array image using cv2 format.
        ballot_id: optional, used only in the exception report.
        returns card_code: hex string expressing the binary code starting at
        the top, or None if the number of decoded bits does not match
        config_dict['CODE_CHECKSUM'].

        @@TODO: read_raw_ess_barcode: calculate region based on page size rather than config values.
        @@TODO: read_raw_ess_barcode: improve robustness of conversion so it is more immune to stray marks.
    """
    roi = config_dict['CODE_ROI']
    code_img = image[roi['y']:roi["y'"], roi['x']:roi["x'"]]

    _, code_thresh = cv2.threshold(
        code_img, config_dict['THRESHOLD']['code-contours'], 255, 1)
    code_contours, _ = cv2.findContours(code_thresh, 1, cv2.CHAIN_APPROX_SIMPLE)

    bits = []
    # reversed(...) so the emitted bits start at the top (see docstring)
    for contour in reversed(code_contours):
        area = cv2.contourArea(contour)
        x, y, w, h = cv2.boundingRect(contour)
        # intensity-weight the area: darker (fuller) marks get a larger factor
        mean = sum(cv2.mean(code_img[y:y + h, x:x + w]))
        factor = (255.0 - mean + config_dict['CODE_MEAN_OFFSET']) / 255.0
        weighted_area = area * factor

        # only contours within the configured size window contribute a bit
        if roi['min-size'] <= weighted_area < roi['max-size']:
            bits.append('0' if weighted_area < config_dict['THRESHOLD']['code']
                        else '1')

    inner_code = ''.join(bits)
    if len(inner_code) != config_dict['CODE_CHECKSUM']:
        utils.exception_report(
            f"### EXCEPTION: style inner code '{inner_code}' has {len(inner_code)} bits, "
            f"expected {config_dict['CODE_CHECKSUM']}. ballot_id:{ballot_id}")
        return None
    return hex(int(inner_code, 2))
# Example #12
# 0
def get_ballot_type_id_from_card_code(card_code):
    """ Map a card_code to its ballot_type_id via the module-level
        conversion dict, lazily recovering the dict from the styles folder
        when it is empty.
        Returns None (after an exception report) when the code is unknown
        or the conversion dict could not be loaded.
    """
    global CONV_card_code_TO_ballot_type_id_DICT

    if not CONV_card_code_TO_ballot_type_id_DICT:
        utils.sts("Recovering card_code_to_ballot_type_id_dict")
        # if the file does not exist, None is returned (tolerated below).
        CONV_card_code_TO_ballot_type_id_DICT = DB.load_data(
            dirname='styles',
            name='CONV_card_code_TO_ballot_type_id_DICT.json',
            silent_error=True)

    try:
        # TypeError covers subscripting a dict that failed to load (None).
        return CONV_card_code_TO_ballot_type_id_DICT[card_code]
    except (KeyError, TypeError):
        utils.exception_report(
            "get_ballot_type_id_from_card_code() Logic error: Could not find card_code in conv_dict"
        )
        return None
# Example #13
# 0
def get_box_sizes_list(argsdict, sheet0: int = 0, page0: int = 0):
    """ Return the list of option-box size specs for the layout of
        (sheet0, page0); multi-entry lists put the narrower columns first.

    :param argsdict: run arguments, consulted for layout parameters.
    :param sheet0: zero-based sheet number.
    :param page0: zero-based page number.
    :raises SystemExit: when the configured layout type is unrecognized.
    """
    layout_params = get_layout_params(argsdict)

    # this list is universal
    box_sizes_lists_by_layout_type = {
        '3col': [layout_params['v3col']],
        '2col': [layout_params['v2col']],
        '1&2col': [layout_params['v2col'], layout_params['v1col']
                   ],  # note: list format with narrower columns first
        '1col': [layout_params['v1col']],
    }

    page_layout_type = get_page_layout_type(argsdict, sheet0, page0)
    if page_layout_type not in box_sizes_lists_by_layout_type:
        # list(...) so the message shows the names, not a dict_keys repr
        utils.exception_report(
            f"Page layout specified ('{page_layout_type}') for sheet0 {sheet0} and page0 {page0} not recognized.\n"
            f"Must be one of: {list(box_sizes_lists_by_layout_type.keys())}")
        # exit with nonzero status, consistent with the other fatal paths
        sys.exit(1)

    box_sizes_list = box_sizes_lists_by_layout_type[page_layout_type]

    return box_sizes_list
# Example #14
# 0
def get_parsed_barcode(hex_code_str: str,
                       ballot_id: str = '',
                       precinct: str = '') -> int:
    """Takes the hex representation of the left side barcode found
    on the ballot and parses it to the decimal representation.
    :param hex_code_str: Hex number value provided as string 0xHHHHHHHHHHHHH
    :param ballot_id: optional, for exception reports only.
    :param precinct: optional, for exception reports only.
    :return: Style number represented as decimal int, or None on empty
        input, parity error, or out-of-range style number.
    """
    if not hex_code_str: return None

    hex_code = int(hex_code_str,
                   0)  # base of zero causes base guessing behavior.
    # the code uses odd parity: an even count of one-bits means a misread.
    ones = bin(hex_code).count('1')
    if not ones & 1:
        # even number of ones is not allowed.
        string = f"### WARNING: parity error in card_code: " \
                f"ballot_id:'{ballot_id}' Precinct:'{precinct}'\n" \
                f"card_code:'{hex_code_str}' probably misread."
        if __name__ != "__main__":
            utils.exception_report(string)
        else:
            print(string)
        return None

    core_style_code = get_core_style_code(hex_code)
    style_num = translate_hex_code(core_style_code)

    if int(style_num) > 211:
        # (typo fixes in message: 'balot_id' -> 'ballot_id', closing quote)
        string = f"### WARNING: Known_Limitation_002: converted style number " \
                "is out of range. result is not certain but may be okay. " \
                f"ballot_id:'{ballot_id}' Precinct:'{precinct}'\n" \
                f"card_code:'{hex_code_str}' calculated style_num '{style_num}'"
        if __name__ != "__main__":
            utils.exception_report(string)
        else:
            print(string)
        return None

    # in most cases, the upper portion of card code is always 0x21
    # however, if not, then party can be encoded here.

    partydigits = hex_code_str[2:4]
    try:
        party = PARTY_DICT[partydigits]
        style_num = party * 1000 + style_num
    except KeyError:
        # Bug fix: a failed dict lookup raises KeyError, not IndexError.
        # The original 'except IndexError' never matched, so this warning
        # never fired and the KeyError propagated to the caller.
        string = f"### WARNING: upper digits of card code: encoding unexpected. " \
                f"ballot_id:'{ballot_id}' Precinct:'{precinct}'\n" \
                f"card_code '{hex_code_str}' calculated style_num '{style_num}'"
        if __name__ != "__main__":
            utils.exception_report(string)

    return style_num
def test_fuzzy_compare_permuted_strsets():
    """Smoke test: a fully permuted name set should map back with metric 1.0."""
    names = ['Bill', 'John', 'Gary', 'Mary', 'William']
    shuffled = ['William', 'John', 'Bill', 'Mary', 'Gary']
    expected = (True, 1.0, [4, 1, 0, 3, 2])
    if fuzzy_compare_permuted_strsets(names, shuffled, 0.9) != expected:
        utils.exception_report("test_fuzzy_compare_permuted_strsets error")
# Example #16
# 0
def results_for_dominion(ballot_id: str, argsdict: dict, contests_dod: dict,
                         cvr_df: pd.DataFrame) -> dict:
    """Build per-contest unified results for one ballot from a Dominion CVR.

    :param ballot_id: numeric ballot id (string), matched against the
        'Cast Vote Record' column of cvr_df.
    :param argsdict: uses 'initial_cvr_cols' (count of leading non-contest
        columns) and optional 'cvr_option_regex' (group 1 extracts the
        option name from a raw selection).
    :param contests_dod: per-contest details; each value must provide
        'official_options_list'.
    :param cvr_df: the CVR table.
    :return: dict keyed by contest name with overvote/undervote/vote tallies.
    :raises AssertionError: if the ballot is missing or has no contests.
    :raises ValueError: on unexpected CVR structure.
    """
    # Step 1: Get dict with keys of contest names and values of selected options
    row = cvr_df.loc[cvr_df['Cast Vote Record'] == int(ballot_id)].copy()
    assert not row.empty, f'Ballot {ballot_id} not found in CVR data frame'
    # Remove all columns that do not apply to this ballot_id.
    row.replace(r'^\s*$', np.nan, regex=True, inplace=True)
    row.dropna(axis='columns', how='all', inplace=True)
    cvr_contests = {}
    for contest in list(row.columns)[len(argsdict['initial_cvr_cols']):]:
        selected = row[contest]
        # Bug fix: use public isinstance checks instead of the private pandas
        # attribute `_typ`, which is not part of the public API.
        if isinstance(selected, pd.DataFrame):
            # duplicated column label yields a DataFrame; take the single row
            cvr_contests[contest] = selected.iloc[0].tolist()
        elif isinstance(selected, pd.Series):
            cvr_contests[contest] = selected.tolist()
        else:
            raise ValueError('Unrecognized data frame type')
    assert cvr_contests, f'No CVR contests found for ballot {ballot_id}'

    # Step 2: Get dict with keys of contest names and values of contest details.
    cvr_option_regex = argsdict.get('cvr_option_regex')
    cvr_unified_results = {}
    for cvr_contest, cvr_selections in cvr_contests.items():
        official_options_list = contests_dod[cvr_contest][
            'official_options_list']
        cvr_unified_results[cvr_contest] = {
            'overvotes': 0,
            'undervotes': 0,
            'num_ballots': 1,
            'tot_votes': 0,
            'writeins': 0,
            'unrecognized_selections': [],
            'votes': dict.fromkeys(official_options_list, 0)
        }
        # guard against an empty selection list (was an IndexError)
        if cvr_selections and cvr_selections[0] == 'overvote':
            # contest is overvoted, set overvote=1 and leave it at that.
            cvr_unified_results[cvr_contest]['overvotes'] = 1
            if cvr_selections.count('overvote') != len(cvr_selections):
                raise ValueError(
                    "Unexpected condition. 'overvote' should be repeated for all selections"
                )
            continue  # No more to do if this is an overvote condition.
        for selection in cvr_selections:
            if selection == 'undervote':
                cvr_unified_results[cvr_contest]['undervotes'] += 1
                continue
            if selection == 'write-in:':
                cvr_unified_results[cvr_contest]['writeins'] += 1
                cvr_unified_results[cvr_contest]['tot_votes'] += 1
                continue
            if cvr_option_regex:
                match = re.search(cvr_option_regex, selection)
                if match:
                    selection = match[1]

            # For now, assume CVR entry might be formatted as PTY first last (NNNNN)
            # remove party and number
            selection = re.sub(r'^(DEM|REP|REF|NPA|GRN|PNF|LIB)\s*', '',
                               selection)
            selection = re.sub(r'\(\d+\)$', '', selection)
            selection = selection.strip()

            # Try to find the option in the official options list.
            # If this does not work very well, we may have to add a column to BOF
            # file to provide a conversion.
            if selection not in official_options_list:
                cvr_unified_results[cvr_contest][
                    'unrecognized_selections'].append(selection)
                utils.exception_report(
                    f"EXCEPTION: selection {selection} unrecognized for {cvr_contest}"
                )
            else:
                # Can vote only one time for any one option.
                cvr_unified_results[cvr_contest]['votes'][selection] = 1
                cvr_unified_results[cvr_contest]['tot_votes'] += 1

    return cvr_unified_results
def extractvote_by_one_tasklist(
        argsdict: dict,
        tasklist_name: str,
        ):
    """ ACTIVE
    
    Extract vote from all ballots as specified in tasklist chunk in extraction_tasks folder.

    params:
    :param argsdict: provides arguments from input file or CLI such as filter specs.
    :param tasklist_name: created by f"{BIF.name}_chunk_{'%4.4u' % (chunk_index)}.csv"
            tasklist is found in extaction_tasks folder.

    produces results/marks_{tasklist_name}

    This is the primary extraction function for lambda operation.
    
    PRIOR TO LAUNCHING THIS:
        Check availability of:
            styles/rois_map_df.csv      -- as a result of gentemplates, genrois, genmap
            styles/contests_dod.json    -- based on EIF
            

    """

    # cache of the currently open archive; tasks arrive grouped by archive,
    # so the same zip is not reopened for every ballot.
    current_archive_basename = ''
    archive = None

    # set s3 vs local mode
    DB.set_DB_mode()

    # initialize results.
    # DB.BALLOT_MARKS_DF is a module-level accumulator; one set of mark
    # records is appended per successfully extracted ballot, then saved below.
    DB.BALLOT_MARKS_DF = pd.DataFrame()

    # NOTE(review): the docstring above says styles/rois_map_df.csv, but this
    # loads 'roismap.csv' -- confirm which filename is current.
    rois_map_df      = DB.load_data('styles', 'roismap.csv')
    contests_dod     = DB.load_data('styles', 'contests_dod.json')

    #extraction_tasks_df = DB.load_df_csv(name=tasklist_name, dirname='extraction_tasks', s3flag=argsdict['use_s3_results'])
    extraction_tasks_df = DB.load_data(dirname='marks', subdir='tasks', name=tasklist_name)

    #archives_folder_path = argsdict['archives_folder_path']

    # one task row per ballot
    for task_idx in range(len(extraction_tasks_df.index)):

        task_dict           = extraction_tasks_df.iloc[task_idx]
        ballot_id           = task_dict['ballot_id']
        precinct            = task_dict['precinct']
        archive_basename    = task_dict['archive_basename']

        """ has structure of BIF
            ('archive_basename', str),
            ('ballot_id', str),
            ('file_paths', str),    # note, may be semicolon separated list.
            ('cvr_file', str),
            ('precinct', str),
            ('party', str),
            ('style_num', str),
            ('card_code', str),
            ('ballot_type_id', str),
            ('sheet0', 'Int32'),                 # 0, 1 ...
            ('is_bmd', 'Int32'),
            ('style_roi_corrupted', 'Int32'),
            ('other_comments', str),
        """

        ballot_style_overrides_dict = args.get_ballot_style_overrides(argsdict)

        #ballot_id, vendor='ES&S', precinct=None, party=None, group=None, extension=None, file_paths=[]):
        # this call does nothing more than initialize the instance data
        ballot = Ballot(argsdict, 
            file_paths = re.split(r';', task_dict['file_paths']), 
            ballot_id=ballot_id, 
            precinct=precinct, 
            archive_basename=archive_basename)

        # normalize is_bmd (may be missing/NaN in the tasklist) to a bool
        ballot.ballotdict['is_bmd'] = bool(utils.set_default_int(task_dict.get('is_bmd', 0), 0))

        # skip ballots whose bmd/non-bmd type is excluded by the run arguments
        if (ballot.ballotdict['is_bmd'] and not argsdict['include_bmd_ballot_type'] or
            not ballot.ballotdict['is_bmd'] and not argsdict['include_nonbmd_ballot_type']):

            utils.exception_report(f"Tasklist says is_bmd is {ballot.ballotdict['is_bmd']} "
                "but argsdict does not include that type. Extract tasklists may be stale")
            continue

        # switch archives only when the task's archive changes
        if archive_basename != current_archive_basename:
            if current_archive_basename and archive:
                archive.close()
            utils.sts (f"opening archive: '{archive_basename}'...", 3)
            archive = open_archive(argsdict, archive_basename)
            current_archive_basename = archive_basename

        if not ballot.load_source_files(archive):
            string = f"EXCEPTION: Could not load source files from archive {archive_basename} offset {task_idx} for ballot_id: {ballot_id} Precinct: {precinct}"
            utils.exception_report(string)
            continue

        utils.sts(f"\n{'-'*50}\nProcessing tasklist:{tasklist_name} offset: {task_idx} ballot_id:{ballot_id}", 3)

        ballot.get_ballot_images()      # this reads images from PDFs

        #-----------------------------------------------------
        # this is the primary function call, performed for each ballot,
        # and producing a marks_df for this ballot, with one record for
        # each option.
        
        ballot_marks_df = extract_vote_from_ballot(
            argsdict, ballot, rois_map_df, contests_dod,
            ballot_style_overrides_dict,
            )
            
        # the above function makes exception reports if:
        #   1. the style cannot be read from the ballot, alignment or barcode error.
        #   2. the style failed to map.
        #-----------------------------------------------------

        if ballot_marks_df is None or not len(ballot_marks_df.index):
            continue    # not successful and exception has already been logged.

        # NOTE(review): DataFrame.append was removed in pandas 2.0; this line
        # requires pandas < 2.0 -- confirm the pinned pandas version.
        DB.BALLOT_MARKS_DF = DB.BALLOT_MARKS_DF.append(ballot_marks_df, sort=False, ignore_index=True)
        continue

    # persist the accumulated marks for this tasklist chunk
    #DB.save_df_csv(name=tasklist_name, dirname='marks', df=DB.BALLOT_MARKS_DF)
    DB.save_data(data_item=DB.BALLOT_MARKS_DF, dirname='marks', subdir='chunks', name=f"marks_{tasklist_name}")
def fuzzy_compare_permuted_strsets(correct_strlist,
                                   ocr_strlist,
                                   thres,
                                   fuzzy_compare_mode='best_of_all') -> tuple:
    """ Compare two sets of strings and find the best one-to-one mapping.

        Instead of testing all O(n!) permutations (120 cases at n=5,
        479 million at n=12), this builds an O(n**2) metric table:
        each ocr string is fuzzy-compared against every correct string,
        then the per-row winners are cross-checked against the per-column
        winners to confirm the mapping is consistent.

                            correct_str items
                |--------------------------------------------
        ocr_item|   0       1       2        3   ...     n-1
            0   | m[0][0] m[0][1]
            1   | m[1][0] ...
           ...  |
                |--------------------------------------------

        :param correct_strlist: reference strings (expected values).
        :param ocr_strlist: strings as read by OCR, in ballot order.
        :param thres: minimum acceptable fuzzy metric.
        :param fuzzy_compare_mode: passed through to fuzzy_metrics_str_to_list().
        :return: tuple of
            bool    -- True if a consistent mapping meets the threshold.
            metric  -- minimum metric of the best mapping, the first
                       failing metric, or 0 on an inconsistent mapping.
            map     -- list of indexes of correct_strlist that match the
                       given order of ocr_strlist ([] on failure).

        Example:
            fuzzy_compare_permuted_strsets(
                ['Bill','John','Gary','Mary','William'],
                ['William','John','Bill','Mary','Gary'],
                0.9) == (True, 1.0, [4, 1, 0, 3, 2])

        POSSIBLE IMPROVEMENT: for a given correct_strlist, cache the map
        and try it first on the next invocation with the same list,
        falling back to the full analysis on mismatch.
    """
    ocr_metrics_table = []      # list of rows; one row of metrics per ocr_str
    max_idxs = []               # per ocr_str: index of the best correct_str
    max_metrics = []            # per ocr_str: best metric found

    for ocr_str in ocr_strlist:
        ocr_metrics_list = fuzzy_metrics_str_to_list(correct_strlist, ocr_str,
                                                     fuzzy_compare_mode)
        max_metric = max(ocr_metrics_list)
        max_idx = ocr_metrics_list.index(max_metric)
        if max_metric < thres:
            # this ocr_str matches nothing in correct_strlist, so no
            # permutation can succeed -- return early.
            return False, max_metric, []
        max_idxs.append(max_idx)
        max_metrics.append(max_metric)
        ocr_metrics_table.append(ocr_metrics_list)  # create list of list.

    # For each correct_str (column), find the best-matching ocr_str (row).
    max_col_idxs = []
    for idx in range(len(correct_strlist)):
        metrics_by_col = [a[idx]
                          for a in ocr_metrics_table]  # get the idx'th column
        max_metric_by_col = max(metrics_by_col)
        max_idx_by_col = metrics_by_col.index(max_metric_by_col)
        max_col_idxs.append(max_idx_by_col)

    # The mapping is consistent only if the row-wise winners agree with
    # the column-wise winners, i.e. the inverted column map equals the row map.
    inverted_col_idxs = invert_idx_list(max_col_idxs)

    if max_idxs != inverted_col_idxs:
        utils.exception_report(
            f"### EXCEPTION: fuzzy_compare_permuted_strsets: correct_strlist:{correct_strlist}\n"
            f"cannot be mapped to ocr_strlist {ocr_strlist}\n"
            f"maxes of rows not maxes of cols.\n")
        return False, 0, []

    return True, min(max_metrics), max_idxs
def extract_vote_from_ballot(
        argsdict: dict,
        ballot: Ballot,
        rois_map_df,
        contests_dod,
        ballot_style_overrides_dict,
        #cvr_ballotid_to_style_dict, -- no longer uses this because BIF table has the information, accessed through Ballot.
        ):
    """ ACTIVE

    This function may run in AWS Lambda.
    Ballot images files have just been extracted from archive.
    :param argsdict: settings dict; uses 'vendor' and 'merged_styles' here.
    :param ballot: Ballot from which votes should be extracted.
        ballot.ballotdict['is_bmd'] should be initialized
    :param rois_map_df: DataFrame object with map of targets on all styles.
    :param contests_dod: contest definitions, passed to the BMD analyzers.
    :param ballot_style_overrides_dict: style overrides -- not referenced in
        this function body; presumably consumed downstream -- TODO confirm.
    :return: DataFrame with ballot marks info, or None on failure
        (an exception report is written before every None return).
    """
    ballot_id = ballot.ballotdict['ballot_id']

    if ballot.ballotdict['is_bmd']:
        # initialize so an unsupported vendor falls through to the failure
        # report below instead of raising NameError on an unbound name.
        ballot_marks_df = None

        if argsdict['vendor'] == 'ES&S':
            # this is ES&S Specific
            # the following function analyzes the EV ballot using OCR and
            #   ballot_marks_df also contains the barcode strings for each selection (if successful).
            ballot_marks_df = analyze_bmd_ess(argsdict, ballot, rois_map_df, contests_dod)

        elif argsdict['vendor'] == 'Dominion':
            ballot_marks_df = analyze_bmd_dominion(argsdict, ballot, rois_map_df, contests_dod)

        if ballot_marks_df is None:
            string = "### EXCEPTION: BMD ballot analysis failed.\n" \
                     + f"ballot_id: {ballot_id} Precinct: {ballot.ballotdict['precinct']}"
            utils.exception_report(string)
            return None

        return ballot_marks_df

    # otherwise, this is a nonBMD ballot

    ballot.align_images()
    style_num = ballot.read_style_num_from_barcode(argsdict)
    if not style_num:
        # the barcode conversion failed. Exception handled internal to the call above.
        return None

    style_num = one_style_from_party_if_enabled(argsdict, style_num, ballot.ballotdict['party'])
    ballot.ballotdict['style_num'] = style_num

    ballot.get_timing_marks()       # for each image, capture the timing marks to ballot instance.

    if get_style_fail_to_map(style_num):
        # note that this reads the style file keeps a list of styles so it does not have to
        # read the style file each time.
        # we can't process this ballot because we were unable to map the style.
        # this will also return true if the style is out of range.

        # first we will check if there are any merged styles
        error_flag = True
        merged_styles_str = argsdict.get('merged_styles', '')
        if merged_styles_str:
            #merged_styles = json.loads(merged_styles_str)
            # SECURITY NOTE: eval() executes arbitrary code from the settings
            # string; if settings are not fully trusted, this should be
            # replaced with ast.literal_eval().
            merged_styles = eval(merged_styles_str)
            # keys may be stored as str or int, so check both forms.
            if style_num in merged_styles or int(style_num) in merged_styles:
                eff_style_num = str(merged_styles.get(style_num, merged_styles.get(int(style_num), '')))
                utils.sts(f"INFO: Style {style_num} did not map, using merged style {eff_style_num}", 3)
                error_flag = False
                style_num = eff_style_num

        if error_flag:
            utils.exception_report(f"### EXCEPTION: style failed to map and no merged style found, ballot_id: "
                     f"{ballot_id} style: {style_num} Precinct: {ballot.ballotdict['precinct']}")
            return None

    # Get the subset of rows from the rois_map_df related to this style
    style_rois_map_df = rois_map_df.loc[rois_map_df['style_num'] == int(style_num)]

    """---------------------------------------------------------------------
    Proceed with analysis
        Given ballot object which provides the style_num and rois_map_df
        Lookup records that correspond to the style_num from rois_map_df
        For each contest and option line, access roi of ballot and interpret
        the mark. Add record to the marks_df for each contest/option pair.
        Also evaluates each contest regarding overvotes and completed num_votes
        based on the overvote status, and fills in each contest header record
        regarding overvotes, undervotes.
    """

    utils.sts(f"Style {style_num} read from ballot. Analyzing Ballot and extracting the marks...", 3)
    ballot_marks_df = analyze_images_by_style_rois_map_df(argsdict, ballot, style_rois_map_df)
    return ballot_marks_df
# Example #20
def get_layout_params(argsdict: dict):
    """ get the page_layout
        returns list of box sizes. First size will have narrowist columns
        sheet0 and page0 only needed for box_sizes to be correct.

        Builds a vendor-specific dict of layout constants (in pixels):
        column-box width/height limits ('v1col'..'v3col'), crop margins
        ('option_crop', 'full_crop'), nominal target-mark offsets
        ('norm_targ_os'), and option-block height limits. The result is
        cached in module global LAYOUT_PARAMS_DICT, so only the first
        call does any work.

        :param argsdict: settings dict; reads 'vendor', 'target_side',
            and optional 'h_max_option' override.
        :return: dict of layout parameters.
    """
    global LAYOUT_PARAMS_DICT

    # Return the cached dict if one was already built. The NameError guard
    # covers the case where the global has never been assigned -- presumably
    # LAYOUT_PARAMS_DICT is not initialized at module scope; verify.
    try:
        if LAYOUT_PARAMS_DICT:
            return LAYOUT_PARAMS_DICT
    except NameError:
        pass

    vendor = argsdict['vendor']
    layout_params = {}

    # target area should be even so we can divide by two from center.
    layout_params['target_area_w'] = 36
    layout_params['target_area_h'] = 26

    if vendor == 'ES&S':

        # for ocr_based_genrois,
        # we have the box surrounding the text, and the larger blk_region
        # where this text is found. For each roi, we can modify the blk
        # by accepting the x and w parameters, and setting the y and h
        # parameters according to typical offsets.

        # ES&S has nominal timing mark period of 55 pixels, and nom. space of 27
        # thus centerline of timing mark to first gap is 27

        layout_params['blk_y_os'] = 27
        layout_params['blk_h_nom'] = 110

        # the following parameters are related to graphics-first segmentation
        layout_params['h_max_option'] = 105
        # largest roi that could be an option block
        # this determines if the box will be cropped as option
        # (to exclude target graphic)

        layout_params['h_min_option'] = 60
        # option blocks below this are "one-liners" and could be single words
        # that should be converted with tessearact as single words.

        # Acceptable width/height ranges for 3-, 2-, and 1-column layouts.
        layout_params['v3col'] = {
            'w_col': 530,
            'w_min': 510,
            'w_max': 550,
            'h_min': 45,
            'h_max': 2500
        }
        layout_params['v2col'] = {
            'w_col': 795,
            'w_min': 760,
            'w_max': 815,
            'h_min': 45,
            'h_max': 2500
        }
        layout_params['v1col'] = {
            'w_col': 1590,
            'w_min': 1570,
            'w_max': 1610,
            'h_min': 45,
            'h_max': 500
        }

        if argsdict['target_side'] == 'left':
            # example Dane County 2018
            layout_params['option_crop'] = {
                'top_margin': 5,
                'btm_margin': 5,
                'lft_margin': 50,
                'rgt_margin': 10,
            }
            layout_params['full_crop'] = {
                'top_margin': 5,
                'btm_margin': 5,
                'lft_margin': 5,
                'rgt_margin': 5,
            }
            # best-guess of target location, based on Dane County
            # for ES&S, analyzed typical roi. 1.5" tall on screen = 54 pixels
            # this is the offset from Top-left corner of the ROI
            # x_os = 0.9" => 32
            # y_os = 0.7" => 26
            layout_params['norm_targ_os'] = {'ref': 'tl', 'x': 32, 'y': 26}
            layout_params['adjust_target_x'] = True
        else:
            # only the left target side has been characterized for ES&S.
            utils.exception_report(
                "get_layout_params: target_side: right not defined for ES&S")
            sys.exit(1)

    elif vendor == 'Dominion':
        layout_params['h_max_option'] = 105
        # largest roi that could be an option block
        # this determines if the box will be cropped as option
        # (to exclude target graphic)

        layout_params['h_min_option'] = 60
        # option blocks below this size are "one-liners"

        layout_params['v3col'] = {
            'w_col': 520,
            'w_min': 500,
            'w_max': 540,
            'h_min': 45,
            'h_max': 1600
        }
        layout_params['v2col'] = {
            'w_col': 778,
            'w_min': 758,
            'w_max': 798,
            'h_min': 45,
            'h_max': 1000
        }
        layout_params['v1col'] = {
            'w_col': 1557,
            'w_min': 1537,
            'w_max': 1577,
            'h_min': 45,
            'h_max': 500
        }

        if argsdict['target_side'] == 'left':
            # example Leon County 2018
            layout_params['option_crop'] = {
                'top_margin': 2,
                'btm_margin': 5,
                'lft_margin': 60,
                'rgt_margin': 75,
            }
            layout_params['full_crop'] = {
                'top_margin': 2,
                'btm_margin': 2,
                'lft_margin': 2,
                'rgt_margin': 5,
            }
            layout_params['norm_targ_os'] = {'ref': 'tl', 'x': 32, 'y': 26}
            layout_params['adjust_target_x'] = True

        else:
            # 'target_side' == 'right' -- example is SF.
            layout_params['option_crop'] = {
                'top_margin': 2,
                'btm_margin': 2,
                'lft_margin': 2,
                'rgt_margin': 70,
            }
            layout_params['full_crop'] = {
                'top_margin': 2,
                'btm_margin': 2,
                'lft_margin': 2,
                'rgt_margin': 5,
            }

            # target offset measured from the top-right ('tr') corner,
            # hence the negative x offset.
            layout_params['norm_targ_os'] = {'ref': 'tr', 'x': -45, 'y': 22}

            # when yes_no_in_descr == True
            # combined yes/no targets are offset from the bottom-right ('br').
            layout_params['comb_yes_targ_os'] = {
                'ref': 'br',
                'x': -45,
                'y': -62
            }
            layout_params['comb_no_targ_os'] = {
                'ref': 'br',
                'x': -45,
                'y': -22
            }
            layout_params['adjust_target_x'] = False

            layout_params['h_max_option'] = 160
            # largest roi that could be an option block
            # this determines if the box will be cropped as option
            # (to exclude target graphic)

    # NOTE(review): a vendor other than ES&S/Dominion falls through silently
    # with only the target_area defaults set -- confirm this is intended.

    # settings may override the computed h_max_option for either vendor.
    if argsdict.get('h_max_option'):
        layout_params['h_max_option'] = argsdict['h_max_option']

    LAYOUT_PARAMS_DICT = layout_params
    return layout_params
# Example #21
def dominion_build_effective_style_num(argsdict,
                                       card_code,
                                       ballot_type_id=None) -> (str, int):
    """ Dominion Ballots from SF 2020-03 use a complex style system.
        The card_code is the value on the ballot and the ballot_type_id
        is derived from the CVR JSON file and identifies broad categories
        that may indicate different contest option ordering.

        Convert the card_code and ballot_type_id to an internally used
        style_num and sheet0 number.

        The style_block cannot be used to generate the ballot_type_id.
        BIF files can be scanned to generate conversion from card_code to ballot_type_id.

        :param argsdict: settings dict; uses 'conv_card_code_to_style_num',
            'election_name', and 'non_partisan_sheet0s'.
        :param card_code: style code read from the ballot.
        :param ballot_type_id: CVR-derived type id; looked up from the
            card_code when not supplied.
        :return: (style_num, sheet0) tuple; (None, None) on failure.
    """
    style_num = card_code
    sheet0 = 0
    if not argsdict['conv_card_code_to_style_num']:
        # conversion disabled: pass the card_code through unchanged.
        return style_num, sheet0

    if not ballot_type_id:
        ballot_type_id = get_ballot_type_id_from_card_code(card_code)

    if not ballot_type_id:
        return None, None

    election_name = argsdict.get('election_name', '')

    if election_name == 'CA_San_Francisco_2020_Pri':

        if (ballot_type_id > 999):
            utils.exception_report(
                f"dominion_build_effective_style_num -- Type code out of range:{ballot_type_id}"
            )

        # dominion constructs the card_code (code on the ballot) based on the
        # party, type of ballot NP or regular), language, and sheet.
        # Also, the ballot_type_id provides different option ordering
        # the core type value is one of the following 57 combinations.

        # Lang & Sheet | Party and type (NP or not)  -- core style num is card_code % 57
        #   SP  CH  FI |DEM NPDEM   REP     AI      NPAI    PF      LIB     NPLIB   GRN     NP    sheet_lang
        #   --  --  -- |--- ------  ---     ---     ----    ---     ---     -----   ---     ---   ----------
        #   S1         |1   7       13      19      25      31      37      43      49                 0
        #   S2         |2   8       14      20      26      32      38      44      50      55         1
        #       S1     |3   9       15      21      27      33      39      45      51                 2
        #       S2     |4   10      16      22      28      34      40      46      52      56         3
        #           S1 |5   11      17      23      29      35      41      47      53                 4
        #           S2 |6   12      18      24      30      36      42      48      54      57         5
        #   --  --  -- |--- ------  ---     ---     ----    ---     ---     -----   ---     ---   ----------
        #       party->|1   2       3       4       5       6       7       8       9       0
        #   1   2   3  <-- lang

        # Note that NP ballots do not have sheet 1.
        # S2 of the same language seems to always be the same.
        #
        # For example, card_code = 22454
        # 22545 % 57 = 53. This is GRN party S1 in Tagalog (FI) language.
        #
        # ExternalId in BallotTypeManifest.json appears to be sufficient to discriminate for other reasons.
        #   Later discovered this is not true. There was a thought that the style_block i.e. card_code // 57
        #   would be the same as the ballot_type_id, but it is not, as we proved that multiple style_blocks
        #   are mapped to the same ballot_type_code.
        # Because ballot_type_id helps discrimnate between option ordering, we also need to include that.
        # ballot_type_id range is 1 to 180. We multiply that by 100 and add the core code.
        # The ballot_type_id applies to both sheets.
        # This is an internal style number used to refer to ballot styles and templates.
        # contests in each style is based on the ballot_type_id, which can be derived from this
        # style_num by dividing by 100.
        #
        # sheet1 value used in style_num is 1-based but internal sheet0 is 0-based.

        try:
            core_style = int(card_code) % 57
            #style_block = int(card_code) // 57

            sheet_lang = (core_style - 1) % 6
            if core_style > 54:
                # codes 55..57 are the NP (no-party) sheet-2 styles.
                sheet1 = 2
                lang = core_style - 54
                party = 0
            else:
                sheet1 = sheet_lang % 2 + 1
                lang = sheet_lang // 2 + 1
                party = (core_style - 1) // 6 + 1
        except (ValueError, TypeError):
            # card_code not numeric; report and fail cleanly instead of
            # falling through with sheet1/lang/party unbound (NameError).
            utils.exception_report(
                f"Could not construct effective style_num from card_code:{card_code} and ballot_type_id:{ballot_type_id}"
            )
            return None, None

        variety = dominion_ballot_type_to_external_id(ballot_type_id)
        sheet0 = sheet1 - 1

        # non-partisan sheets encode party digit 0 regardless of party.
        if str(sheet0) in argsdict.get('non_partisan_sheet0s', []):
            style_num = "%1.1u%1.1u%1.1u%1.1u%3.3u" % (lang, 0, sheet1,
                                                       variety, ballot_type_id)
        else:
            style_num = "%1.1u%1.1u%1.1u%1.1u%3.3u" % (lang, party, sheet1,
                                                       variety, ballot_type_id)

        #utils.sts(f"style_block:{style_block} ballot_type_id:{ballot_type_id}", 3)

        return str(style_num), sheet0
    elif election_name == 'FL_Leon_2018':
        pass  # use style_num and sheet0 as defined.
    else:
        utils.exception_report(
            f"dominion_build_effective_style_num not defined for this election: {election_name}."
        )
        sys.exit(1)

    return str(style_num), sheet0
def genbif_from_cvr(argsdict: dict):
    """
        If CVR files are available with style information, this 
        function can be used to generate the BIF data file.
        
        THIS RUNS VERY FAST NOW, do not neet lambdas if CVR exsists.

        For each source archive, scans the image file paths, looks up
        style / BMD information (from the parsed CVR, a ballot-id-to-style
        dict, a style lookup table, or by inspecting the ballot itself),
        builds one BIF record per ballot, and saves the result as a
        per-archive dataframe via DB.save_data().

        :param argsdict: settings dict; uses 'source', 'vendor',
            'pstyle_region', 'pstyle_pattern' among others.
        :return: None (results are written through DB).
    """

    utils.sts('Generating BIFs')

    # if cvr is provided, us it for information here.
    ballotid_to_style_dict, parsed_dominion_cvr = get_cvr_info(argsdict)

    # check to see if style lookup table is specified.
    style_lookup_table_df = get_style_lookup_table(argsdict)
    
    pstyle_region_str = argsdict.get('pstyle_region')
    pstyle_region_dict = json.loads(pstyle_region_str) if (pstyle_region_str) else None
    pstyle_pattern = argsdict.get('pstyle_pattern', '')
    vendor = argsdict.get('vendor')

    # NOTE(review): the archive opened below is never closed in this
    # function -- confirm open_archive() does not need explicit cleanup.
    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive_root = os.path.splitext(archive_basename)[0]
        archive = open_archive(argsdict, archive_basename)

        df_dict = {}        # to save time, we will build the dataframe as a dict of dict, then in one swoop create the dataframe.
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        # now scan archives for additional information.

        for index, file_path in enumerate(file_paths):
            style = card_code = ballot_type_id = ''
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)
            _, _, ballot_id = analyze_ballot_filepath(ballot_file_paths[0])

            # initialize defaults in local dict
            bifdict = {c: '' for c in BIF.get_bif_columns()}
            party = bifdict['party'] = get_party(argsdict, file_path)
            precinct = bifdict['precinct'] = get_precinct(argsdict, file_path)
            bifdict['sheet0'] = '0'
            
            #utils.sts(f"Processing {ballot_id} precinct {precinct} party {party}", 3)
            if vendor == 'Dominion':
                if parsed_dominion_cvr:
                    # CVR record available: copy style fields straight across.
                    try:
                        ballot_rec = parsed_dominion_cvr[ballot_id]
                    except KeyError:
                        bifdict['comments'] = "Couldn't find ballot id in the CVR dict"
                    else:
                        for field in ['style_num', 'cvr_name', 'card_code', 'ballot_type_id']:
                            bifdict[field] = ballot_rec[field]
                        bifdict['is_bmd'] = '1' if ballot_rec['is_bmd'] else '0'
                        bifdict['sheet0'] = str(ballot_rec['sheet0'])

                else:
                    try:
                        style_num = str(ballotid_to_style_dict[ballot_id])
                    except (KeyError, TypeError):
                        utils.exception_report(f"ballot_id {ballot_id} found in {source} but not in ballotid_to_style_dict. Skipping.")
                        continue
                    bifdict['style_num'] = bifdict['card_code'] = style_num

                # the following creates the CONV_card_code_TO_ballot_type_id_DICT
                # NOTE(review): the local ballot_type_id is still '' here
                # (only bifdict['ballot_type_id'] was set above) -- confirm
                # update_CONV...() is intended to receive ''.
                card_code = bifdict['card_code']
                
                update_CONV_card_code_TO_ballot_type_id_DICT(card_code, ballot_type_id)

            elif vendor == 'ES&S':

                is_bmd = is_archived_file_BMD_type_ess(argsdict, archive, ballot_file_paths[0])
                bifdict['is_bmd'] = '1' if is_bmd else '0'

                if ballotid_to_style_dict:
                    try:
                        style = str(ballotid_to_style_dict[int(ballot_id)])
                    except KeyError:
                        utils.exception_report(f"ballot_id {ballot_id} found in {source} but not in cvr. Skipping.")
                        continue
                    # NOTE(review): style/card_code are captured locally but
                    # never copied into bifdict in this branch -- verify the
                    # BIF record is completed elsewhere or this is a bug.
                    card_code = style
                    
                elif style_lookup_table_df is not None:
                    # style lookup table has been specified and loaded. 
                    # look up style based on party and precinct values from path.
                    #To select a row based on multiple conditions you can use &:
                    
                    try:
                        lookup_row = style_lookup_table_df.loc[(style_lookup_table_df['party'] == party) & (style_lookup_table_df['precinct'] == int(precinct))]
                    except Exception as err:
                        utils.exception_report(f"style lookup table format problem: {err}")
                        sys.exit(1)
                    if len(lookup_row) > 1:
                        utils.exception_report(f"Duplicate row values in style lookup table: {lookup_row}")
                    
                    # NOTE(review): BMD check duplicates the one performed at
                    # the top of the ES&S branch; also archive_basename,
                    # ballot_id and file_paths are only filled in on this
                    # path -- confirm other paths intend to leave them ''.
                    is_bmd = is_archived_file_BMD_type_ess(argsdict, archive, ballot_file_paths[0])
                    bifdict['is_bmd'] = '1' if is_bmd else '0'
                    bifdict['style_num'] = str(lookup_row['style_num'].values.item())
                    bifdict['archive_basename'] = archive_basename
                    bifdict['ballot_id'] = ballot_id
                    bifdict['file_paths'] = ';'.join(ballot_file_paths)
                    bifdict['card_code'] = str(lookup_row['card_code'].values.item())
                
                else:
                    # if we do not have the ballot_id_to_style dict, this happens if there is no CVR.
                    # we must determine the style and bmd status by inspection of ballots.
                    # this can be very time consuming!
                    # NOTE: should use genbif_from_ballots
                   

                    # @@ Should check to see if bif files already exist and appear to have the correct number of records.
                    bifdict = create_bif_dict_by_reading_ballot(argsdict, ballot_id, index, archive_basename, archive, ballot_file_paths,
                                                                pstyle_region_dict, pstyle_pattern)

            df_dict[index] = bifdict

        # create the dataframe all at once.
        df = pd.DataFrame.from_dict(df_dict, "index")
        DB.save_data(data_item=df, dirname='bif', name=f"{archive_root}_bif.csv")
    def align_images(self):
        """ Aligns and crops ballot images.
            Also updates determinants.
        
            card_code attribute also updated for 'Dominion' vendor

            Dispatches on vendor and file extension, delegating the actual
            alignment to alignment_utils. Mutates
            self.ballotimgdict['images'] in place and, for Dominion,
            self.ballotdict['card_code']; ES&S .pdf also sets
            self.ballotdict['determinants']. Reports (but does not raise)
            on unsupported vendor/extension combinations.
        """
        error = False
        vendor = self.ballotdict['vendor']
        extension = self.ballotdict['extension']
        ballot_id = self.ballotdict['ballot_id']

        utils.sts(f"Aligning {vendor} ballots, ballot_id:{ballot_id}...",
                  3,
                  end='')

        if vendor == 'ES&S':
            """ ES&S has two image formats:
                1. multipage PDF with up to two pages. Images are already correctly sequenced and oriented but not aligned.
                2. single-page PBM, with one PBM page per file. Two files are opened and loaded at this point
            """
            if extension == '.pdf':
                self.ballotimgdict['images'], self.ballotdict[
                    'determinants'] = alignment_utils.ess_align_images(
                        self.ballotimgdict['images'])
            elif extension == '.pbm':
                self.ballotimgdict[
                    'images'] = alignment_utils.dane2016_alignment(
                        self.ballotimgdict['images'])
            else:
                error = True
        elif vendor == 'Dominion':
            """ Dominion uses two types of image format.
                1. Combined image with front, back, and "auditmark" (graphical CVR details embedded in the image) as one long page
                2. Separate pages using multi-page TIF format.
                3. extract timing marks and card_code
            """
            if extension == '.tif':
                if len(self.ballotimgdict['images']) > 1:
                    # this is the multi-page TIF format. Align each page separately
                    # Used by more recent versions of Dominion system
                    # We do not need to align page 3 (index 2) as this is the audit mark.
                    # NOTE(review): card_code is assigned on each pass, so the
                    # value from the second page wins -- confirm both pages are
                    # expected to carry the same card_code.
                    for index in range(2):
                        #, image in enumerate(self.ballotimgdict['images']):
                        img, _, card_code = alignment_utils.dominion_alignment(
                            self.ballotimgdict['images'][index], ballot_id)

                        if img is not None:
                            self.ballotimgdict['images'][index] = img[0]
                        self.ballotdict[
                            'card_code'] = card_code  #if there was an error, card_code could be None.
                        #elif index:
                        #    del self.ballotimgdict['images'][index]
                elif len(self.ballotimgdict['images']) == 1:
                    # this is the combined format, which returns a list of images.
                    imgs, _, card_code = alignment_utils.dominion_alignment(
                        self.ballotimgdict['images'][0], ballot_id)
                    if imgs is not None:
                        self.ballotimgdict['images'] = imgs
                    self.ballotdict[
                        'card_code'] = card_code  #if there was an error, card_code could be None.
            elif extension == '.png':
                # this is the combined format, which returns a list of images.
                # (same handling as the single-image .tif case above)
                imgs, _, card_code = alignment_utils.dominion_alignment(
                    self.ballotimgdict['images'][0], ballot_id)
                if imgs is not None:
                    self.ballotimgdict['images'] = imgs
                self.ballotdict[
                    'card_code'] = card_code  #if there was an error, card_code could be None.
            else:
                error = True
        else:
            error = True
        if error:
            # unsupported vendor/extension: report only; caller proceeds
            # with the images unmodified.
            utils.exception_report(
                f"Ballot.align_images {vendor} not supported with file extension {extension}"
            )
def genbif_from_ballots(argsdict: dict):
    """
    This function is used when no cvr exists and we need to scan all the
    ballots to create bifs. This is a slow process, so we create
    tasklist for lambdas processing.

    Flow:
        1. optionally clear prior 'bif' results on s3 and the lambda tracker.
        2. for each source archive: list image files, group them into ballots,
           and split the ballot list into chunks.
        3. submit each chunk through build_one_chunk(), which may delegate
           the work to a lambda.
        4. wait for all lambdas, then combine per-chunk csv/log/exception
           files into one result file per archive.

    argsdict keys read: use_s3_results, use_lambdas, genbif_ballots_per_chunk,
    max_lambda_concurrency, genbif_chunk_limit, source, incremental_genbif,
    one_lambda_first.
    """

    if argsdict['use_s3_results']:
        # remove stale bif outputs (top level and chunk files) before regenerating
        DB.delete_dirname_files_filtered(dirname='bif', s3flag=True, file_pat=None)
        DB.delete_dirname_files_filtered(dirname='bif', subdir='chunks', s3flag=True, file_pat=None)

    # Clear lambda tracker cache
    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    max_chunk_size = argsdict.get('genbif_ballots_per_chunk', 200)
    max_concurrency = argsdict.get('max_lambda_concurrency', 1000)
    chunk_limit = argsdict.get('genbif_chunk_limit', None)
    num_archives = len(argsdict['source'])
    # spread the allowed lambda concurrency evenly across the archives
    max_concurrency = max_concurrency // num_archives

    utils.sts('Generating tasklists to scan ballots to create bifs')
    for archive_idx, source in enumerate(argsdict['source']):
        archive_basename = os.path.basename(source)
        archive = open_archive(argsdict, archive_basename) # will open on s3 directly if using s3
        file_paths = get_image_file_paths_from_archive(archive)
        utils.sts(f"Total of {len(file_paths)} image files in the archive")

        filelist = []
        for index, file_path in enumerate(file_paths):
            # group the per-page image files belonging to one ballot.
            # NOTE(review): this runs once per image file, not once per ballot —
            # presumably get_next_ballot_paths() handles the grouping without
            # duplication; confirm against its implementation.
            _, ballot_file_paths = get_next_ballot_paths(index, archive, file_paths)

            # one semicolon-joined record per ballot (all its page files)
            filelist.append( ';'.join(ballot_file_paths) )
        utils.sts(f"Total of {len(filelist)} ballots in the archive")
        archive.close()

        chunks_lol = utils.split_list_into_chunks_lol(item_list=filelist, max_chunk_size=max_chunk_size, max_concurrency=max_concurrency)
        num_chunks = len(chunks_lol)
        utils.sts(f"Split into {num_chunks} chunks with maximum of {max_chunk_size} ballots each.")
        
        # The loop below may delegate processing to lambdas.
        # Should perform consistency checks here (or before this point) to avoid any costly errors, such as:
        #   1. output bucket specified exists and is writeable.
        # It would be best to make these checks as settings file is initially processed.
        
        
        for chunk_idx, filelist in enumerate(chunks_lol):
            # chunk_limit allows a bounded trial run
            if chunk_limit and chunk_idx >= chunk_limit:
                break
            utils.sts(f"Processing chunk #{chunk_idx} with {len(filelist)} ballots", 3)
            
            build_one_chunk(
                argsdict=argsdict,
                dirname='bif',
                subdir='chunks',
                chunk_idx=chunk_idx, 
                filelist=filelist, 
                group_name=archive_basename, 
                task_name='bif',
                incremental = argsdict['incremental_genbif']
                )   # this may delegate to one lambda
            # run only the very first chunk through a single lambda and wait,
            # so a systematic failure is caught before launching the full fleet.
            if argsdict['use_lambdas'] and not archive_idx and not chunk_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='bif'):
                    utils.exception_report("task 'bif' failed delegation to lambdas.")
                    sys.exit(1)           


    wait_for_lambdas(argsdict, task_name='bif')      # @@ wait_for_lambdas should be enhanced to track specific tasks or better use SQS messaging.
    
    for archive_idx, source in enumerate(argsdict['source']):
        archive_rootname = os.path.splitext(os.path.basename(source))[0]

        dirname = 'bif'

        # merge all chunk csv files into a single {rootname}_bif.csv
        DB.combine_dirname_chunks(
            dirname=dirname, subdir='chunks', 
            dest_name=f"{archive_rootname}_{dirname}.csv", 
            file_pat=fr"{archive_rootname}_{dirname}_chunk_\d+\.csv")
            
        # also merge the per-chunk log and exception reports
        logs.get_and_merge_s3_logs(dirname='bif', rootname='log', chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='bif', rootname='exc', chunk_pat=fr'{archive_rootname}_{dirname}_chunk_\d+', subdir='chunks')
def cmpcvr_by_tasklists(argsdict: dict):
    """
    ACTIVE
    Comparison with CVR proceeds using the same chunks as were used in extraction.
    Each marks tasklist is a BIF table with information about each ballot, one per record.
    After extractvote is completed, marks_chunks folder contains marks_df.csv for each chunk.
    As the BIF table is sorted by 'cvrfile', this will reduce the size of CVR that must be loaded.

    For each archive, every extraction tasklist is submitted through
    build_one_chunk() (possibly delegated to a lambda); afterwards the
    per-chunk result, disagreed, and overvote csv files plus the log and
    exception reports are combined into per-archive files.
    """
    utils.sts('cmpcvr by tasklists', 3)

    # get the list of all extraction tasks in marks/tasks/ subfolder, without .csv extension.
    # name is like {archive_root}_chunk_{chunk_idx}.csv
    tasklists = DB.list_files_in_dirname_filtered(dirname='marks', subdir='tasks', file_pat=r'.*\.csv$', fullpaths=False, no_ext=True)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} tasklists", 3)

    use_lambdas = argsdict['use_lambdas']

    if use_lambdas:
        LambdaTracker.clear_requests()

    # The 'extraction_tasks' are ordered also according to archive_root.
    archive_rootnames = [os.path.splitext(os.path.basename(source))[0]
                         for source in argsdict['source']]

    for archive_idx, archive_rootname in enumerate(archive_rootnames):
        # process the tasklists one archive at a time.
        cmpcvr_tasks = [t for t in tasklists if t.startswith(archive_rootname)]

        for chunk_idx, tasklist_name in enumerate(cmpcvr_tasks):

            #----------------------------------
            # this call may delegate to lambdas and return immediately
            # if 'use_lambdas' is enabled.
            # otherwise, it blocks until the chunk is completed.
            # once the lambda is launched, processing continues at
            # 'delegated_cmpcvr()' below.

            build_one_chunk(argsdict,
                dirname='cmpcvr',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name], #tasklist name will be like {archive_root}_chunk_{chunk_idx}
                group_name=archive_rootname,
                task_name='cmpcvr',
                incremental=False)
            #----------------------------------

            # run the very first chunk alone and wait, so a systematic failure
            # is caught before launching the remaining lambdas.
            # BUGFIX: guard on use_lambdas as genbif_from_ballots does; the
            # check is only meaningful when lambdas are in use.
            if use_lambdas and not chunk_idx and not archive_idx and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='cmpcvr'):
                    utils.exception_report("task 'cmpcvr' failed delegation to lambdas.")
                    sys.exit(1)

    wait_for_lambdas(argsdict, task_name='cmpcvr')

    for archive_rootname in archive_rootnames:

        #cmpcvr/chunks/disagreed_{archive_root}_chunk_{chunk_idx}.csv    # individual cmpcvr disagreed chunks
        #cmpcvr/chunks/overvotes_{archive_root}_chunk_{chunk_idx}.csv # individual cmpcvr overvote chunks

        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname+'_cmpcvr.csv',
            file_pat=fr'{archive_rootname}_chunk_\d+\.csv')

        # BUGFIX: added the '_' separator so the names read {root}_disagreed.csv
        # and {root}_overvotes.csv, consistent with {root}_cmpcvr.csv above.
        # NOTE(review): previously the separator was missing; confirm no
        # downstream reader depends on the old unseparated names.
        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname+'_disagreed.csv',
            file_pat=fr'disagreed_{archive_rootname}_chunk_\d+\.csv')

        DB.combine_dirname_chunks(dirname='cmpcvr', subdir='chunks',
            dest_name=archive_rootname+'_overvotes.csv',
            file_pat=fr'overvotes_{archive_rootname}_chunk_\d+\.csv')

        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='log', chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')
        logs.get_and_merge_s3_logs(dirname='cmpcvr', rootname='exc', chunk_pat=fr'{archive_rootname}_chunk_\d+', subdir='chunks')
# Example #26
# 0
def gentemplates_by_tasklists(argsdict):
    """
    ACTIVE
    This replaces the gentemplates function.
    given tasklists which exist in the tasklist folder,
    read each in turn and if the number of ballots included meet a minimum,
    process each line item in turn.
    The style is the name of the tasklist.

    Tasklists are generated by reviewing the BIF tables.

    Each delegation to lambdas (or performed locally) will include
    subprocesses according to the argsdict parameters:

        include_gentemplate_tasks       - include the generation of tasklists prior to delegation.
        use_single_template_task_file   - means a single JSON file will be created instead of separate task files on s3
                                            and a portion of that task list will be passed to each lambda
        include_gentemplate             - for each style, combine ballots to create a base template
        include_genrois                 - generate regions of interest (ROIs) and OCR
        include_maprois                 - map the official contest names to what is read on the ballot to create roismap
    """
    styles_on_input = []
    #attempted_but_failed_styles = []   # will need to determine by looking for templates

    utils.sts('Generating style templates from a combined set of ballot images', 3)

    # this loads and parses the EIF
    contests_dod = create_contests_dod(argsdict)
    DB.save_data(data_item=contests_dod, dirname='styles', name='contests_dod.json')

    # style_to_contests_dol
    # if the CVR is available, we can get a list of styles that are associated with a ballot_type_id.
    # this may be enough to know exactly what contests are on a given ballot, but only if the
    # style which keys this list is also directly coupled with the card_code read from the ballot.
    # In some cases, such as Dane County, WI, this is a 1:1 correspondence. But SF has a complex
    # style conversion which is nontrivial to figure out.
    # thus, this is still needed in style discovery.

    style_to_contests_dol = DB.load_data(dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json', silent_error=True)
    if not style_to_contests_dol:
        logs.sts("CVR_STYLE_TO_CONTESTS_DICT.json not available. Trying to convert CVR to styles", 3)
        style_to_contests_dol = convert_cvr_to_styles(argsdict, silent_error=True)
        if not style_to_contests_dol:
            logs.sts("Unable to convert CVR to style_to_contests_dol, trying manual_styles_to_contests", 3)
            style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)

        if style_to_contests_dol:
            # cache the recovered mapping so the conversion is skipped next run
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')

    if not style_to_contests_dol:
        logs.sts("style_to_contests_dol unavailable. full style search is required.", 3)

    if argsdict.get('use_lambdas'):
        LambdaTracker.clear_requests()

    # when one_lambda_first is set, only the first delegated chunk is awaited
    first_pass = True

    if argsdict['use_single_template_task_file']:
        # all per-style tasklists live in a single JSON dict-of-lists-of-dicts
        template_tasklists_dolod = DB.load_data(dirname='styles', name="template_tasklists_dolod.json")
        total_num = len(template_tasklists_dolod)
        utils.sts(f"Found {total_num} tasklists", 3)

        for chunk_idx, (style_num, style_lod) in enumerate(template_tasklists_dolod.items()):
            if not style_num: continue

            # honor explicit include/exclude style filters
            if (argsdict.get('include_style_num') and style_num not in argsdict['include_style_num']) or \
               (argsdict.get('exclude_style_num') and style_num in argsdict['exclude_style_num']):
                continue

            styles_on_input.append(style_num)

            if argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue

            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles',
                subdir=style_num,
                chunk_idx=chunk_idx,
                filelist=[style_lod],            # only one style per lambda chunk, but can execute gentemplate, genrois, and maprois for same style.
                group_name=style_num,
                task_name='gentemplate',
                incremental=False,
                )

            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)
                first_pass = False

    else:
        # one csv tasklist file per style in styles/tasks/
        tasklists = DB.list_files_in_dirname_filtered(dirname='styles', subdir="tasks", file_pat=r'.*\.csv', fullpaths=False)
        total_num = len(tasklists)
        utils.sts(f"Found {total_num} tasklists", 3)

        for chunk_idx, tasklist_name in enumerate(tasklists):
            if tasklist_name == '.csv': continue

            style_num = os.path.splitext(os.path.basename(tasklist_name))[0]
            styles_on_input.append(style_num)

            # BUGFIX: was 'args.argsdict.get(...)'; use the local argsdict
            # parameter, consistent with the branch above.
            if argsdict.get('incremental_gentemplate', False) and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue

            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(argsdict,
                dirname='styles',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],
                group_name=style_num,
                task_name='gentemplate',
                incremental=False,
                )
            if argsdict['use_lambdas'] and first_pass and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='gentemplate'):
                    utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
                    sys.exit(1)
                first_pass = False

    wait_for_lambdas(argsdict, task_name='gentemplate')
    post_gentemplate_cleanup(argsdict)
# Example #27
# 0
def generate_template_for_style_by_tasklist_lod(argsdict: dict,
                                                tasklist_lod: list = None):
    """ ACTIVE
        This function is driven by a preselected set of ballots listed in BIF format.
        This list is prefiltered to exclude BMD ballots, and the ballots are all of the
        same physical style so they can be combined to produce a template with higher
        resolution and which largely excludes random marks. Generates a set of template images
        1. opens the files either from local zip archives or on s3 bucket (already unzipped.)
        2. aligns the images to alignment targets.
        3. reads the barcode style and checks it with the card_code (which may differ from the style_num)
        4. gets the timing marks.
        5. calls generate_style_template(), which:
            a. reviews the images and chooses the most average image in terms of stretch.
            b. discards any excessively stretched images.
            c. stretch-fixes the rest on timing-mark basis to "standard" timing marks.
            d. combines into one image.
            e. saves style information as JSON.

        Returns True on success, False on failure (including an empty tasklist).
    """
    global archive
    global current_archive_basename
    current_archive_basename = ''

    # BUGFIX: guard against an empty/None tasklist, which previously left
    # style_num and sheet0 unbound at the generate_style_template() call below
    # (NameError).
    if not tasklist_lod:
        utils.exception_report(
            "generate_template_for_style_by_tasklist_lod: empty tasklist")
        return False

    ballot_queue = []           # ballots that passed all checks, fed to template generation
    ballots_unprocessed = []    # ballot_ids rejected by the style check (diagnostic only)
    tot_failures = 0            # count of timing-mark failures

    for task_idx, item_dict in enumerate(tasklist_lod):
        archive_basename = item_dict['archive_basename']
        ballot_file_paths = re.split(r';', item_dict['file_paths'])
        sheet0 = item_dict['sheet0']            # same for all records; used after the loop
        card_code = item_dict['card_code']
        style_num = item_dict['style_num']      # will be the same for all records.

        ballot      = Ballot(argsdict, file_paths=ballot_file_paths, archive_basename=archive_basename)    # initialize and derive ballot_id, precinct, party, group, vendor
        ballot_id   = ballot.ballotdict['ballot_id']
        precinct    = ballot.ballotdict['precinct']

        utils.sts (f"gentemplate_by_tasklist for "
                    f"style_num:{style_num} "
                    f"item:{task_idx} "
                    f"in archive {archive_basename} "
                    f"ballotid:{ballot_id} "
                    f"in precinct:'{precinct}'...", 3)

        # open each archive only once; records are grouped by archive
        if archive_basename != current_archive_basename:
            if current_archive_basename:
                archive.close()
            utils.sts (f"opening archive: '{archive_basename}'...", 3)
            archive = open_archive(argsdict, archive_basename)
            current_archive_basename = archive_basename

        if not ballot.load_source_files(archive):
            utils.exception_report(f"EXCEPTION: Could not load source files from archive {archive_basename} "
                                    f"item:{task_idx} for ballot_id: {ballot_id} Precinct: {precinct}")
            continue
        ballot.get_ballot_images()
        ballot.align_images()
        read_style_num = ballot.read_style_num_from_barcode(argsdict)
        # verify barcode-derived style matches the tasklist's card_code,
        # unless style is determined by party or a lookup table instead.
        if not argsdict.get('style_from_party', None) and not argsdict.get('style_lookup_table_path', ''):
            if str(read_style_num) != str(card_code):
                utils.exception_report(f"Style {read_style_num} in ballot {ballot_id} doesn't match style card_code {card_code} from tasklist")
                ballots_unprocessed.append(ballot_id)
                continue

        ballot.get_timing_marks()       # for each image, capture the timing marks to ballot instance.
                                        # note that sometimes timing marks are not available on page 1.

        if not are_timing_marks_consistent(ballot.ballotdict['timing_marks']):
            utils.exception_report(f"EXCEPTION: Timing mark recognition failed: ballot_id: {ballot_id} Precinct: {precinct}")
            tot_failures += 1
            continue
        ballot_queue.append(ballot)

    utils.sts(f"Generating Style Template from {len(ballot_queue)} ballots (omitted {tot_failures} failed ballots)...", 3)
    if generate_style_template(argsdict, ballot_queue, style_num, sheet0):
        utils.sts(f"Style templates generation completed successfully.\n Processed a total of {len(ballot_queue)} ballots", 3)
        return True
    else:
        utils.sts("Style templates generation FAILED.", 3)
        return False
    def read_style_num_from_barcode(self, argsdict):
        """
        if ballot.style_num is defined, then use it, otherwise:
        given np.array of image, read ES&S barcode and decode it.
        return style_num as str if successful else None
        typical usage:
        style_num = read_style_from_image(image)
            may return None if there is an underlying error.

        Side effects: sets self.ballotdict['style_num'] and 'card_code'
        (plus 'pstyle_num'/'ballot_type_id' for ES&S).
        """

        logs.sts("Reading style_num from ballot barcode...", 3)
        ballot_id = self.ballotdict['ballot_id']

        # BUGFIX: ensure style_num is defined on every path. Previously, a
        # vendor other than ES&S with a falsy ballotdict['style_num'] and an
        # overrides dict not containing this ballot_id reached the final
        # 'return style_num' with the name unbound (UnboundLocalError).
        style_num = None

        ballot_style_overrides_dict = args.get_ballot_style_overrides(argsdict)

        if self.ballotdict['vendor'] == 'Dominion':
            if self.ballotdict['card_code'] is None:
                # This situation exists if there was a problem converting the barcode during alignment.

                self.ballotdict['style_num'] = None
            elif argsdict['conv_card_code_to_style_num']:
                #attempt to convert card_code to the official style_num which should match CVR style field.
                # if ballot_type_id or card_code cannot be read, then this may return None
                self.ballotdict[
                    'style_num'], _ = dominion_build_effective_style_num(
                        argsdict, self.ballotdict['card_code'])
            else:
                self.ballotdict['style_num'] = self.ballotdict['card_code']

            if self.ballotdict['style_num'] is None:
                utils.exception_report(
                    f"### EXCEPTION: card_code not read from ballot:{ballot_id}. "
                )
                return None

        elif self.ballotdict['vendor'] == 'ES&S':
            card_code = read_raw_ess_barcode(self.ballotimgdict['images'][0],
                                             ballot_id)
            self.ballotdict['card_code'] = style_num = card_code

            from utilities.bif_utils import read_pstyle_from_image_if_specd
            self.ballotdict['pstyle_num'] = read_pstyle_from_image_if_specd(
                argsdict, self.ballotimgdict['images'][0])

            # style num must be a string
            if argsdict['conv_card_code_to_style_num']:
                # converting the card_code to the style number is important to link it to the
                # style number as used on CVR. If no CVR is used, or if we are not attempting to link them
                # then using the card_code directly occurs when 'conv_card_code_to_style_num' is False
                cc_style_num = str(
                    barcode_parser.get_parsed_barcode(
                        card_code, ballot_id, self.ballotdict['precinct']))
                self.ballotdict['ballot_type_id'] = cc_style_num

            # precedence: pstyle (if enabled and read) > ballot_type_id > raw card_code
            # NOTE(review): 'ballot_type_id' is only assigned above when
            # conv_card_code_to_style_num is set — presumably the key exists
            # from earlier initialization otherwise; confirm.
            if argsdict['use_pstyle_as_style_num'] and self.ballotdict[
                    'pstyle_num']:
                self.ballotdict['style_num'] = self.ballotdict['pstyle_num']
            elif self.ballotdict['ballot_type_id']:
                self.ballotdict['style_num'] = self.ballotdict[
                    'ballot_type_id']
            else:
                self.ballotdict['style_num'] = card_code

        # fall back to any manual per-ballot style override when the barcode
        # did not yield a style_num
        if not self.ballotdict['style_num'] and ballot_style_overrides_dict:
            if ballot_id in ballot_style_overrides_dict:
                return ballot_style_overrides_dict[ballot_id]

        else:
            style_num = self.ballotdict['style_num']
        return style_num
# Example #29
# 0
def generate_style_template(argsdict: dict,
                            ballots: list,
                            style_num,
                            sheet0=0,
                            omit_ballot_images=False):
    """
    ACTIVE
    Function which takes a list of Ballot instances and generates
    a new style template with information like ballot code, number
    and regions of interests (ROI). To achieve that, function creates
    a weighted image of ballot based on a list of all passed 'ballots'
    (they should be in similar alignment and shape). Then function looks
    for ROIs and extract data contained within weighted image with OCR tool.

    Returns True on success, False if 'ballots' is empty.

    TO MOVE THIS TOWARD IMPLEMENTATION COMPATIBLE WITH LAMBDAS
    1. the caller this function should, instead of generating a list of Ballot instances
        with the image already extracted from the file, into just a list of pathnames
        to process. So the Queues.py class should be oriented to just keeping a single
        dict of list structure, where the key of the dict is the style_num, and the
        list containing the ballots pathnames that are of that style.
    2. We must add an intermediate function to make this conversion, which will
        take that list and for each ballot, open it and load the images for each file,
        and then call this function. Let's assume we call that function
        'generate_style_template_from_paths(ballot_paths: list, style_num)'
        It will be the appropriate operation type that can be ported to work on lambdas.
    3. The result of this function will be only the combined template. It will be
        reasonable to continue with the subsequent steps for this style, such as
        genrois and maprois. Those functions take the combined template plus
        EIF file information to finally generate a roismap_df for the style.
        Each roismap_df is combined together after all lambdas are completed to
        produce the roismap_df which is later used in the extraction process.
    4. Result of style generation lambda will be:
        1. list of pathnames actually used in the style generation, in case some were
            inappropriate or unusable.
        2. roismap_df for that style.
        3. combined template with redlines of the rois that are mapped to it.

    sheet value is simply added to the style dict. The sheet is used for any later
    drawing of lines which may only be appropriate for one of the sheets.
    """

    utils.sts(
        f"Generating ballot style templates for style {style_num} using {len(ballots)} ballots...",
        3)
    if not ballots:
        utils.exception_report(
            "generate_style_template: List of ballots is empty")
        return False

    # limit how many ballots are layered into the weighted template
    ballots = ballots[:config_dict[
        'LAYERS_FOR_EMPTY_BALLOT']]  # consider first ballots. Maybe better to choose ballots with least stretch
    style = Style(style_num=style_num)
    style.sheet0 = sheet0
    style.target_side = argsdict['target_side']
    style.build_from_count = len(ballots)
    style.precinct = ballots[0].ballotdict['precinct']
    style.build_from_ballots = [
        ballot.ballotdict['ballot_id'] for ballot in ballots
    ]
    weighted_images = []
    pages = range(len(ballots[0].ballotimgdict['images']))

    utils.sts("Generating the average timing marks for minimal corrections", 3)
    # reference ballot with the least stretch; others are normalized to it
    std_ballot_num = choose_unstretched_ballot(ballots)

    utils.sts("stretch_fix all ballots to std_timing_marks", 3)
    stretch_fix_ballots(argsdict, ballots, std_ballot_num)

    # first save them so we can diagnose any problem.
    if argsdict['save_checkpoint_images'] and not omit_ballot_images:
        utils.sts("Saving checkpoint images...", 3)
        #confirmed this is working to s3.
        save_style_ballot_images(ballots, style_num)

    utils.sts("Combining images to create template for each page...", 3)
    for page in pages:

        # skip pages past the first when page 1 is blank or no timing marks
        # were captured for the reference ballot
        if not (page and
                (ballots[0].ballotdict.get('p1_blank', False)
                 or not ballots[0].ballotdict.get('timing_marks', []))):

            weighted_images.append(get_weighted_image_from_page(page, ballots))

    # image templates must be saved outside style
    utils.sts("Saving style template images...", 3)
    style.filepaths = save_style_template_images(style_num, weighted_images)

    style.timing_marks = ballots[std_ballot_num].ballotdict['timing_marks']

    utils.sts("Saving style object...", 3)
    DB.save_data(data_item=vars(style),
                 dirname='styles',
                 subdir=style_num,
                 name=f'{style_num}_style')
    """
    style_dict saved at this point:
        'build_from_count':     int number of ballots included in the generation of the template
        'precinct':             str precinct designation
        'build_from_ballots':   list of ballot_ids that were used to build the template.
        'filepaths':            list of template files produced
    """
    # BUGFIX: log message typo corrected ("tamplates" -> "templates")
    utils.sts("Saved combined image templates...", 3)
    return True