def extractvote_by_tasklists(argsdict: dict):
    """Extract marks for every extraction tasklist.  ACTIVE.

    This replaces the extractvotes function.  Tasklists are the .csv files
    found in the 'tasks' subdir of the 'marks' dirname (names starting with
    '~' are ignored); they are generated by reviewing the BIF tables.
    Each tasklist creates a separate f"marks_{tasklist_name}.csv" file in the
    results folder.

    For each BIF name, the tasklists whose names start with the matching
    archive name are passed one at a time to build_one_chunk().  When all
    chunks are dispatched, the function waits for the lambdas, combines the
    chunks per archive, and merges the 'log' and 'exc' chunk logs.

    Args:
        argsdict: job settings.  Reads 'use_lambdas' and, when lambdas are in
            use, 'one_lambda_first'.

    Exits the process with status 1 if the first delegated chunk fails when
    'one_lambda_first' is set.
    """
    logs.sts('Extracting marks from extraction tasklists', 3)

    tasklists = DB.list_files_in_dirname_filtered(
        dirname='marks', subdir='tasks', file_pat=r'^[^~].*\.csv$', fullpaths=False)
    total_num = len(tasklists)
    utils.sts(f"Found {total_num} taskslists", 3)

    use_lambdas = argsdict['use_lambdas']
    if use_lambdas:
        LambdaTracker.clear_requests()
        #clear_instructions(config_d.TASKS_BUCKET, Job.get_path_name())

    # True until the first chunk has actually been dispatched.  Tracking this
    # explicitly (rather than testing chunk_idx == 0 and bif_idx == 0) keeps
    # the 'one_lambda_first' check working when the first BIF has no tasklists.
    first_pass = True

    biflist = get_biflist(no_ext=True)
    for bifname in biflist:
        archive_name = re.sub(r'_bif', '', bifname)
        genmarks_tasks = [t for t in tasklists if t.startswith(archive_name)]

        for chunk_idx, tasklist_name in enumerate(genmarks_tasks):
            #----------------------------------
            # this call may delegate to lambdas and return immediately
            # if 'use_lambdas' is enabled.
            # otherwise, it blocks until the chunk is completed.
            build_one_chunk(
                argsdict,
                dirname='marks',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],
                group_name=bifname,
                task_name='extractvote',
                incremental=False,
            )
            #----------------------------------

            # Same gating as gentemplates_by_tasklists: only wait on the first
            # delegated chunk, and only when lambdas are actually in use.
            if first_pass and use_lambdas and argsdict['one_lambda_first']:
                if not wait_for_lambdas(argsdict, task_name='extractvote'):
                    utils.exception_report("task 'extractvote' failed delegation to lambdas.")
                    sys.exit(1)
            first_pass = False

    wait_for_lambdas(argsdict, task_name='extractvote')

    utils.combine_dirname_chunks_each_archive(argsdict, dirname='marks')
    logs.get_and_merge_s3_logs(dirname='marks', rootname='log', chunk_pat=r"_chunk_\d+", subdir="chunks")
    logs.get_and_merge_s3_logs(dirname='marks', rootname='exc', chunk_pat=r"_chunk_\d+", subdir="chunks")
def lambda_report_status(task_args, request_id, status, error_info=None):
    """Write a JSON tracker record for one lambda request to S3.

    The destination path is derived from task_args and status.  The record
    holds request_id, status, error_info and task_args.  When error_info is
    truthy the serialized record is also printed (so it lands in cloudwatch).
    """
    destination = create_lambda_tracker_s3path_by_task_args(task_args, status)

    record = {
        "request_id": request_id,
        "status": status,
        "error_info": error_info,
        'task_args': task_args,
    }
    serialized = json.dumps(record)
    s3utils.write_buff_to_s3path(destination, serialized)

    if error_info:
        # echo to stdout so the failure can be tracked in cloudwatch
        print(serialized)

    logs.sts(f"Tracker file written with status='{status}'", 3)
def get_style_nums_with_templates_s3(argsdict):
    """Return list of style_nums based on styles found in S3 bucket.

    Lists objects under the S3 path of the 'styles' dirname whose key matches
    '<prefix><name>/...-template-1.png' and returns whatever
    s3utils.list_files_in_prefix_s3() yields for that pattern.

    Args:
        argsdict: job settings (not read by this function).

    Returns:
        The result of s3utils.list_files_in_prefix_s3() -- presumably one
        entry per style that has a first-page template; verify against that
        helper's handling of the capture group.
    """
    logs.sts("Compiling style_nums with templates:", 3)

    import re
    from aws_lambda import s3utils

    s3dirpath = DB.dirpath_from_dirname('styles', s3flag=True)
    prefix = s3utils.parse_s3path(s3dirpath)['prefix']

    # The prefix is a literal key path, not a pattern: escape it so characters
    # such as '.', '+' or '(' in a job name cannot alter or break the regex.
    file_pat = fr".*{re.escape(prefix)}([^/]+)/.*\-template\-1\.png$"
    style_nums = s3utils.list_files_in_prefix_s3(s3dirpath, file_pat=file_pat)

    logs.sts(f"Total of {len(style_nums)} style_nums found.", 3)
    return style_nums
def fuzzy_compare_str(correct_str, ocr_str, thres=80, justify='full', method='levdist') -> tuple:  # (bool, metric)
    """Compare a known correct string with an OCR'd string that may have mistakes.

    Args:
        correct_str: the reference text.
        ocr_str: the text produced by OCR.
        thres: match threshold.  For method 'levdist' the metric is a ratio
            in 0..1, so a threshold above 1 (such as the default 80) is
            treated as a percentage and divided by 100.  For method 'table'
            it is compared as given against compare_words().
        justify: 'left', 'right' or 'full'.  Only used by 'levdist': the OCR
            string is truncated to the shorter of the two lengths from the
            left or right end, or used whole.
        method: 'levdist', 'table' or 'regex'.

    Returns:
        Tuple (matched, metric).  metric is the Levenshtein ratio for
        'levdist' and None otherwise.  For 'regex' the first element is the
        result of the compiled pattern's match() (a match object or None),
        which callers may only rely on for truthiness.

    An unrecognized method prints a stack trace and exits with status 1.
    """
    p_correct_str = correct_str.replace("\n", " ")[:50]
    p_ocr_str = ocr_str.replace("\n", " ")  #[:50]
    logs.sts(f"fuzzy_compare_str justify: {justify}:\n"
             f"correct: '{p_correct_str}'\n"
             f"ocr: '{p_ocr_str}'")

    if method == 'regex':
        # This algorithm assumes no special characters in the correct string
        # and it is relatively greedy.  First, the correct string is scanned
        # to create a regex specifier; then the OCR'd string is compared with
        # the regex-specified string.
        regexc = make_fuzzy_regex(correct_str)
        return regexc.match(ocr_str), None

    if method == 'table':
        match_val = compare_words(correct_str, ocr_str)
        return match_val > thres, None

    if method == 'levdist':
        min_len = min(len(correct_str), len(ocr_str))
        if justify == 'left':
            local_ocr_str = ocr_str[:min_len]
        elif justify == 'right':
            # ocr_str[-0:] would be the WHOLE string, so handle zero length
            # explicitly to stay symmetric with the 'left' case.
            local_ocr_str = ocr_str[-min_len:] if min_len else ''
        else:
            local_ocr_str = ocr_str

        match_val = lev.ratio(correct_str, local_ocr_str)
        lv = "%1.5f" % match_val
        logs.sts(f" levratio = {lv}", 3)

        # lev.ratio() is in 0..1; a threshold such as the default 80 can never
        # be reached unless it is read as a percentage.
        ratio_thres = thres / 100.0 if thres > 1 else thres
        return match_val >= ratio_thres, match_val

    print(f"Logic Error: Unrecognized method:{method}\n")
    traceback.print_stack()
    sys.exit(1)
def post_gentemplate_cleanup(argsdict):
    """Merge the per-style outputs left behind by gentemplates_by_tasklists.

    Split out of gentemplates_by_tasklists so it can be tested individually.
    Chunks are normally combined, but for styles generation that is only
    needed for the roismap.

    When argsdict['include_maprois'] is set, the per-style roismap files are
    combined into 'roismap.csv' and the good/failed map reports are merged and
    counted.  The 'log' and 'exc' style logs are always merged.
    """
    # style logs look like exc_11010_styles_chunk_84.txt; the merge helper
    # downloads file_pat=fr"{rootname}_{chunk_pat}\.txt"
    styles_chunk_pat = r'\d+_styles_chunk_\d+'

    logs.sts("gentemplates_by_tasklists completed.\n", 3)

    if argsdict['include_maprois']:
        logs.sts("Combining roismap for each style into a single .csv file.", 3)
        DB.combine_dirname_chunks(
            dirname='styles',
            subdir="roismap",
            dest_name='roismap.csv',
            file_pat=r'_roismap\.csv',
        )

        # merged in this order: good maps first, then failed maps
        map_counts = []
        for report_subdir in ('logs_good_maps', 'logs_failed_maps'):
            map_counts.append(
                logs.get_and_merge_s3_logs(
                    dirname='styles',
                    rootname='map_report',
                    chunk_pat=styles_chunk_pat,
                    subdir=report_subdir,
                )
            )
        good_map_num, fail_map_num = map_counts
        logs.sts(f"{good_map_num} styles successfully mapped; {fail_map_num} styles did not fully map.", 3)

    # style logs are placed in one folder in styles
    for log_rootname in ('log', 'exc'):
        logs.get_and_merge_s3_logs(
            dirname='styles',
            rootname=log_rootname,
            chunk_pat=styles_chunk_pat,
            subdir='logs',
        )
def filter_extraction_ballots(argsdict, reduced_df):
    """Reduce a BIF-derived ballot table to the ballots selected for extraction.

    Given a df holding a reduced set of ballots from the BIF, apply the
    style and ballot-type filters from the settings:

      * 'include_style_num' / 'exclude_style_num' -- when either is given,
        keep only rows whose 'style_num' is in the mapped styles that survive
        the include/exclude lists.
      * 'include_bmd_ballot_type' -- when falsy, drop rows with is_bmd == 1.
      * 'include_nonbmd_ballot_type' -- when falsy, keep only rows with
        is_bmd == 1.

    NOTE(review): rows are only restricted to mapped styles when an include or
    exclude list is supplied; confirm that is intended.

    Args:
        argsdict: job settings.
        reduced_df: DataFrame with at least 'style_num' and 'is_bmd' columns.

    Returns:
        The filtered DataFrame.
    """
    logs.sts(f"Total number of ballots in BIF: {len(reduced_df.index)}", 3)

    all_mapped_styles = DB.get_style_nums_with_templates(argsdict)
    utils.sts(f"There are a total of {len(all_mapped_styles)} styles mapped.", 3)

    included_styles = argsdict.get('include_style_num')
    if included_styles:
        filtered_styles = [i for i in all_mapped_styles if i in included_styles]
        utils.sts(f"The settings file includes list of styles to include. Filtered to {len(filtered_styles)} styles.", 3)
    else:
        filtered_styles = all_mapped_styles

    excluded_styles = argsdict.get('exclude_style_num')
    if excluded_styles:
        filtered_styles = [i for i in filtered_styles if i not in excluded_styles]
        utils.sts(f"The settings file includes list of styles to exclude. Filtered to {len(filtered_styles)} styles.", 3)

    if excluded_styles or included_styles:
        reduced_df = reduced_df[reduced_df['style_num'].isin(filtered_styles)]

    # The counts reported below are rows of reduced_df, i.e. ballots, so the
    # messages say "ballots" (they previously said "styles").
    if not argsdict['include_bmd_ballot_type']:
        reduced_df = reduced_df.loc[reduced_df['is_bmd'] != 1]
        utils.sts(f"The settings file excludes BMD ballots. Filtered to {len(reduced_df.index)} ballots.", 3)

    if not argsdict['include_nonbmd_ballot_type']:
        reduced_df = reduced_df.loc[reduced_df['is_bmd'] == 1]
        utils.sts(f"The settings file excludes nonBMD ballots. Filtered to {len(reduced_df.index)} ballots.", 3)

    logs.sts(f"Total number of ballots after filters applied for extraction: {len(reduced_df.index)}", 3)
    return reduced_df
def gentemplates_by_tasklists(argsdict):
    """Generate style templates from the template tasklists.  ACTIVE.

    This replaces the gentemplates function.  Given tasklists which exist in
    the tasklist folder, each is processed in turn; the style is the name of
    the tasklist.  Tasklists are generated by reviewing the BIF tables.

    Each delegation to lambdas (or performed locally) will include
    subprocesses according to the argsdict parameters:

        include_gentemplate_tasks - include the generation of tasklists prior
            to delegation.
        use_single_template_task_file - a single JSON file is used instead of
            separate task files on s3, and a portion of that task list is
            passed to each lambda.
        include_gentemplate - for each style, combine ballots to create a
            base template.
        include_genrois - generate regions of interest (ROIs) and OCR.
        include_maprois - map the official contest names to what is read on
            the ballot to create roismap.

    Also read here: 'use_lambdas', 'one_lambda_first',
    'incremental_gentemplate', 'include_style_num', 'exclude_style_num'.

    Exits the process with status 1 if the first delegated chunk fails when
    'one_lambda_first' is set.
    """
    utils.sts('Generating style templates from a combined set of ballot images', 3)

    # this loads and parses the EIF
    contests_dod = create_contests_dod(argsdict)
    DB.save_data(data_item=contests_dod, dirname='styles', name='contests_dod.json')

    # style_to_contests_dol
    # if the CVR is available, we can get a list of styles that are associated
    # with a ballot_type_id.  This may be enough to know exactly what contests
    # are on a given ballot, but only if the style which keys this list is
    # also directly coupled with the card_code read from the ballot.
    # In some cases, such as Dane County, WI, this is a 1:1 correspondence.
    # But SF has a complex style conversion which is nontrivial to figure out.
    # thus, this is still needed in style discovery.
    style_to_contests_dol = DB.load_data(dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json', silent_error=True)
    if not style_to_contests_dol:
        logs.sts("CVR_STYLE_TO_CONTESTS_DICT.json not available. Trying to convert CVR to styles", 3)
        style_to_contests_dol = convert_cvr_to_styles(argsdict, silent_error=True)
        if not style_to_contests_dol:
            logs.sts("Unable to convert CVR to style_to_contests_dol, trying manual_styles_to_contests", 3)
            style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)
        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')

    if not style_to_contests_dol:
        logs.sts("style_to_contests_dol unavailable. full style search is required.", 3)

    use_lambdas = argsdict.get('use_lambdas')
    if use_lambdas:
        LambdaTracker.clear_requests()

    incremental = argsdict.get('incremental_gentemplate', False)
    # Only wait on the first dispatched chunk, and only when delegating.
    check_first_lambda = bool(use_lambdas and argsdict.get('one_lambda_first'))
    first_pass = True

    if argsdict['use_single_template_task_file']:
        template_tasklists_dolod = DB.load_data(dirname='styles', name="template_tasklists_dolod.json")
        total_num = len(template_tasklists_dolod)
        utils.sts(f"Found {total_num} taskslists", 3)

        include_styles = argsdict.get('include_style_num')
        exclude_styles = argsdict.get('exclude_style_num')

        for chunk_idx, (style_num, style_lod) in enumerate(template_tasklists_dolod.items()):
            if not style_num:
                continue
            if (include_styles and style_num not in include_styles) or \
                    (exclude_styles and style_num in exclude_styles):
                continue

            if incremental and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue

            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(
                argsdict,
                dirname='styles',
                subdir=style_num,
                chunk_idx=chunk_idx,
                # only one style per lambda chunk, but can execute gentemplate,
                # genrois, and maprois for same style.
                filelist=[style_lod],
                group_name=style_num,
                task_name='gentemplate',
                incremental=False,
            )
            if check_first_lambda and first_pass:
                _wait_for_first_gentemplate_lambda(argsdict)
            first_pass = False
    else:
        tasklists = DB.list_files_in_dirname_filtered(dirname='styles', subdir="tasks", file_pat=r'.*\.csv', fullpaths=False)
        total_num = len(tasklists)
        utils.sts(f"Found {total_num} taskslists", 3)

        for chunk_idx, tasklist_name in enumerate(tasklists):
            if tasklist_name == '.csv':
                continue
            style_num = os.path.splitext(os.path.basename(tasklist_name))[0]

            # Use the argsdict passed in (this branch previously consulted the
            # module-level args.argsdict, unlike the branch above).
            if incremental and DB.template_exists(style_num):
                utils.sts(f"Style {style_num} already generated, skipping...", 3)
                continue

            utils.sts(f"Processing template for style {style_num} #{chunk_idx}: of {total_num} ({round(100 * (chunk_idx+1) / total_num, 2)}%)")

            # the function call below will delegate to lambdas if use_lambdas is True.
            build_one_chunk(
                argsdict,
                dirname='styles',
                chunk_idx=chunk_idx,
                filelist=[tasklist_name],
                group_name=style_num,
                task_name='gentemplate',
                incremental=False,
            )
            if check_first_lambda and first_pass:
                _wait_for_first_gentemplate_lambda(argsdict)
            first_pass = False

    wait_for_lambdas(argsdict, task_name='gentemplate')
    post_gentemplate_cleanup(argsdict)


def _wait_for_first_gentemplate_lambda(argsdict):
    """Wait for the first delegated 'gentemplate' chunk; exit(1) if it failed."""
    if not wait_for_lambdas(argsdict, task_name='gentemplate'):
        utils.exception_report("task 'gentemplate' failed delegation to lambdas.")
        sys.exit(1)
def main():
    """CLI entry point: load the job settings and run the operation named by 'op'.

    The settings are parsed by args.get_args() (input file as specified on the
    CLI, using arg_specs.csv) and also stored on args.argsdict.  If 'self_test'
    is set, the self test runs first.  The 'op' setting is lower-cased and
    dispatched through the if/elif chain below.

    The paths of archives are normalized to allow the archives to be either
    local or on s3:
        'archives_folder_path'   -- path to folder on local system.
        'archives_folder_s3path' -- s3path to folder on s3.
        'source' list are basenames, without path, but including extension.

    PRIMARY API ENTRY POINTS

    Each operation relies on a job file which provides the settings as
    parameter,value in a csv file, where comments are allowed preceded by #.
    Thus the api must provide
        -i   path location of settings file -- could be file on s3.
        -op  operation string like 'genbif_from_cvr'

    Each function produces:
        log.txt               appends extensive status reports.
        exception_report.txt  appends each exception encountered (exceptions
                              to processing, not python exceptions per se),
    as well as other files noted at each operation.

    Initial implementation includes one major entry point with operation
    selection as follows:
        'genbif_from_cvr'     (fast)
        'genbif_from_ballots' (slow)
        'create_bif_report'   (fast)
        'gentemplates'        (slow)
        'genmaprois'          (somewhat slow)
        'extractvote'         (very slow)
        'genreport'           (fast)
        'cmpcvr_and_report'   (somewhat slow)
        'get_status'          (fast) - return status of slow functions;
                              described here but not implemented below.

    An unrecognized 'op' prints a message and exits with status 1.
    """
    utils.show_logo()
    print(f"\n\n{'=' * 50}")

    argsdict = args.get_args()  # parses input_file as specified in CLI using arg_specs.csv
    args.argsdict = argsdict

    print("argsdict:")
    print(pprint.pformat(argsdict))
    print(f"\n\n{'=' * 50}")

    if argsdict.get('self_test'):
        self_test.self_test(argsdict)

    # `or 'all'` also covers an 'op' key that is present but empty/None,
    # which would otherwise fail on .lower().
    op = (argsdict.get('op') or 'all').lower()

    DB.set_DB_mode()

    if op == 'copy_config_files_to_s3':
        # Copy local config files in EIFs to s3, to simulate interaction with
        # the frontend website, which will upload and place files in
        #   s3://us-east-1-audit-engine-jobs/{job_name}/config/
        # Files to be placed there: JOB settings file, EIF file, BOF file,
        # manual_styles_to_contests, style_lookup_table.
        # In local mode these are in either EIFs/ or input_files/ in the repo folder.
        DB.upload_file_dirname('config', argsdict['eif'])
        DB.upload_file_dirname('config', argsdict['bof'])
        DB.upload_file_dirname('config', argsdict['manual_styles_to_contests_filename'])
        DB.upload_file_dirname('config', argsdict['style_lookup_table_filename'])
        DB.upload_file_dirname('config', argsdict['input'], local_dirname='input_files')

    elif op == 'precheck_job_files':
        # Precheck of the job files that exist in the config folder for this
        # job on s3.  Not implemented.
        pass

    elif op == 'genbif_from_cvr':
        # If CVR file(s) are provided with style information included, build
        # "ballot information file" (BIF) data by reviewing the CVR.  May also
        # use path information of ballots in archives for precincts, groups,
        # party.  For Dominion, scan CVR JSON chunks and fill in info about
        # ballots.  Creates one .csv file for each archive in folder bif.
        # Relatively fast; typically completes in a matter of seconds.
        # Result: BIF data file ready for BIF report; log; exception report.
        genbif_from_cvr(argsdict)

    elif op == 'genbif_from_ballots':
        # If no CVR is available, scan the ballots to generate the bif.  Each
        # ballot is reviewed and style information is read from the ballots.
        # May also use path information of ballots in archives for precincts,
        # groups, party.  Can be done by lambdas and should complete within
        # minutes, but typically not during a single REST post/response.
        # Result: BIF ready to produce BIF report; separate folder for each
        # failing ballot to allow investigation; log; exception report.
        genbif_from_ballots(argsdict)

    elif op == 'create_bif_report':
        # As a result of validate_bifs or genbif_from_ballots, this report is
        # generated, or it can be generated once the BIF is built.
        # Report provides: number of ballot archives, total number of BIF
        # records, unique ballot_ids, duplicate ballot_ids, number of CVR
        # files, precincts, parties, style_nums, card_codes, ballots w/o
        # card_codes, BMD ballots, corrupted ballots (could not be read),
        # number of different sheets, number of each sheet.
        # Completes quickly and currently produces a text report to console.
        # Can provide alternative data output as JSON or HTML through a
        # command line switch.
        create_bif_report(argsdict)

    elif op == 'build_template_tasklists':
        # Scan bifs and generate template tasklists, with one tasklist csv
        # file per style.  A tasklist is the same format as bif but should not
        # be updated with any information.
        # Generally not used as REST entry point.
        build_template_tasklists(argsdict)

    elif op == 'gentemplates':
        # Requires that BIF data is available.  Used as REST entry point.
        #   1. generates template tasklists
        #   2. constructs templates by combining usually 50 ballots to
        #      improve resolution.
        # Result is a set of raw templates (PNG files), one for each style,
        # and possibly also checkpoint images including the components (up to 50).
        # Takes significant time, more than a minute per style.  Can be
        # delegated to lambdas and may complete in (# styles/1000) * time per
        # style, but still too long for a single REST POST.  For Dane County,
        # WI, with 191 styles, it still takes at least a minute.  If all
        # 10,000 styles are used in SF, time is 10 minutes.
        # Log file updated.  Report generated of result.  PNG files for
        # review of each style.
        if argsdict['include_gentemplate_tasks']:
            # sub tasks in gentemplate action - generate base templates
            build_template_tasklists(argsdict)

        gentemplates_by_tasklists(argsdict)

    elif op == 'gentemplates_only':
        # Used for debugging only, when tasklists are already generated.
        # Tasklists take only seconds to complete now.
        # NOT USED IN REST API
        gentemplates_by_tasklists(argsdict)

    elif op == 'genrois':
        # After templates are generated, each style is image-analyzed and then
        # OCR'd.  Style templates must be generated at this point to allow
        # further analysis and generation of rois.
        # Result: the json list of rois and the image for each result; a
        # report of rois generated; PNG image files with graphic outlines of
        # rois that can be reviewed by the user.
        genrois(argsdict)

    elif op == 'maprois':
        # Once rois are generated, they can be fairly quickly mapped to
        # contests and options based on information in the EIF (Election
        # Information File).  Operates at the rate of several seconds per style.
        # Result: PNG "redlines" showing the mapping of contests and options
        # to each style; map report, providing detail of where mapping may
        # have gotten off track; log.
        maprois(argsdict)

    elif op == 'genmaprois':
        # Major REST entry point.
        # The most typical operation once templates have been generated, which
        # may take time and use compute resources.  May need to be done
        # repetitively while the operator makes changes to the settings file.
        # Operator must review the map report and redlines.  Once review is
        # completed, extraction can commence.
        # Could be broken up for processing by lambdas but it is so fast now
        # that it may not be necessary.
        # Result: PNG images showing ROIs from genrois; PNG redlines showing
        # the correspondence of contests and options for each style; failures
        # copied to assist folder; map report; log.
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'get_assist_requests':
        # After genmaprois is completed, some styles may need manual
        # assistance by a human operator.  This is used in graphic-mode
        # dominant rois generation rather than OCR dominant generation.
        # Front end first requests assist requests, and the response is a list
        # of ballot_ids which need assistance: path to each template file and
        # path to the existing json file for that template.
        # NOTE this is a new function which is not implemented yet.
        pass

    elif op == 'write_new_assist_annotation':
        # The front end will implement functionality like that of
        # tools/template_edit.py, to allow the user to add rectangular
        # regions, horizontal and vertical lines, to the image.  Then, this
        # writes a new JSON annotation file.  Maybe this does not need to be
        # provided if the frontend can write to s3 directly.
        # NOTE this is a new function which is not implemented yet, but is
        # implemented for CLI operation as 'template_edit' using
        # tools/template_edit.py
        pass

    elif op == 'build_extraction_tasks':
        # Scan bifs and generate extraction tasklists, with an appropriate
        # number of ballots for each lambda.  A tasklist is the same format as
        # bif and should not be updated with any information by lambda.
        # Completes rapidly and thus is combined with actual extraction.
        build_extraction_tasks(argsdict)

    elif op == 'extractvote_only':
        # With extraction tasklists already built, go through all the ballots
        # in the archives and extract the marks into a single csv data table
        # for each tasklist, and then combine into a single csv file for each
        # archive.
        # Each tasklist is delegated to a separate lambda process.  Each
        # lambda can take up to 15 minutes to process one tasklist.  Total
        # time of this process is less than (# ballots / 200,000) * 15
        # minutes.  So for a county like SF, with 500K ballots, upper limit is
        # about 35 minutes.  LA, the largest county in the US, has about
        # 6 million ballots; upper limit is 7.5 hours.
        extractvote_by_tasklists(argsdict)

    elif op == 'extractvote':
        # Build extraction tasklists and then extract vote: performs both the
        # tasklist generation (fast) and extraction (slow) above.
        # This is the normal REST entry point.
        # Result is marks_df.csv for each archive; extraction report; log;
        # exception report.
        build_extraction_tasks(argsdict)
        extractvote_by_tasklists(argsdict)

    elif op == 'genreport':
        # Once extraction is completed, a report of results can be produced
        # independent of the voting system results, or CVR.  Can be compared
        # with high-level election results.
        # Result: summary of the election results per audit system.  Includes
        # total number of ballots not processed by the audit system due to
        # misalignment or other corruption, and not provided in archives.
        # Compares with high-level election result.
        genreport(argsdict)

    elif op == 'cmpcvr':
        # If a CVR is available and the voting system evaluation of each
        # ballot is provided, compare the audit system result with the voting
        # system cvr and provide a comprehensive result.
        # Processes each marks_df.csv that corresponds to each archive, and
        # compares each record with CVR, which is fully combined into one data
        # file by this function.
        # Result: cmpresult_n.csv for each archive n processed.  This file is
        # not combined to a single report.
        cmpcvr_by_tasklists(argsdict)

    elif op == 'gen_cmpcvr_report':
        # The result of cmpcvr is on an archive-by-archive basis and compares
        # the combined CVR, which is generally not organized by archive, with
        # the marks_df.csv which are organized by archive.  Creates a
        # ballot-by-ballot comparison result on a per-archive basis as csv
        # file.  Includes any adjudications in the determination of
        # discrepancies.
        # Result: comprehensive report of the comparison, as JSON or text;
        # JSON discrepancy list reduced to just the discrepancies.
        generate_cmpcvr_report(argsdict)

    elif op == 'cmpcvr_and_report':
        # This is a major REST entry point.
        # Compares the CVR and creates a report by combining the above two
        # functions.
        cmpcvr_by_tasklists(argsdict)
        generate_cmpcvr_report(argsdict)

    elif op == 'get_discrepancy_list':
        # New function for front end.
        # After cmpcvr is completed, a full report is created.  This provides
        # just the discrepancies to allow for adjudication in frontend UI, and
        # the existing adjudication JSON file.
        # Result: JSON list of discrepancies; log updated.
        # NOTE: THIS IS A NEW FUNCTION (not implemented)
        pass

    elif op == 'submit_adjudications':
        # Front end will implement a review of all discrepancies and provide a
        # DRE-like entry of votes as determined by review of ballot images.
        # Perhaps front end updates the adjudication file, but this function
        # may be better so the action is properly logged.
        # Results: status; log updated.
        # NOTE: THIS IS A NEW FUNCTION (not implemented)
        pass

    # =========================================================================
    #   Updates the lambda functions.
    # =========================================================================
    elif op == 'update_lambda' or op == 'update_lambdas':
        # to run this function, you must first delete the tree
        # 'lambda_deploytment' including the folder.
        branch = argsdict.get('update_branch', 's3-and-lambdas-dev')
        function_name = argsdict.get('lambda_function', 'all')
        if function_name == 'all':
            update_lambda(update_all=True, branch=branch)
        else:
            update_lambda(function_name=function_name, branch=branch)

    # =========================================================================
    #   Additional operations only used for development and CLI operation.
    # =========================================================================
    elif op == 'post_gentemplate_cleanup':
        post_gentemplate_cleanup(argsdict)

    elif op == 'get_manual_styles_to_contests':
        logs.sts("Processing manual_styles_to_contests", 3)
        style_to_contests_dol = get_manual_styles_to_contests(argsdict, silent_error=True)
        logs.sts(f"style_to_contests_dol:\n {pprint.pformat(style_to_contests_dol)}")
        if style_to_contests_dol:
            DB.save_data(data_item=style_to_contests_dol, dirname='styles', name='CVR_STYLE_TO_CONTESTS_DICT.json')

    elif op == 'web2eif':
        # Scrapes from a provided url a high-level report of results.
        # It was thought at the time that this report would provide unique
        # contest names and consistent option names, but even though they were
        # shorter and a bit better than the CVR, they also were insufficient
        # for our needs.  Thus, although this does provide a basic function,
        # it is not up to date with the current EIF format and does not
        # eliminate the need for the EIF and manual editing.
        # RESEARCH ONLY.
        web_scraper.run_scraper(url=argsdict['url'])
        sys.exit()

    elif op == 'cvr2styles':
        # DEPRECATED. Use validate_bifs or genbif_from_ballots.
        # Preprocesses an ES&S CVR file or multiple Dominion CVR files.
        # Creates two dicts:
        #   styles_dict, which provides contest list for each style_num
        #   ballotid_to_style dict, which provides style_num based on ballotid.
        # This currently only works if the CVR has a column providing the
        # style, named 'Ballot Style'.  A different approach would be needed
        # if no Ballot Style column is provided, such as creating a logical
        # style ID, perhaps a bitstring of contests, and using that as a
        # logical style identifier.  This would not match any style
        # designator on the ballot.
        # Processes multiple CVR files one at a time (scalable).
        # convert_cvr_to_styles function is in styles_from_cvr_converter.py;
        # for dominion, get_styles_to_contests_dominion is in gentemplate.py
        convert_cvr_to_styles(argsdict)

    elif op == 'gentrm':
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'tltrm':
        build_template_tasklists(argsdict)
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'alltemplates':
        # Perform all the steps to creation of templates
        genbif_from_cvr(argsdict)
        build_template_tasklists(argsdict)
        gentemplates_by_tasklists(argsdict)
        genrois(argsdict)
        maprois(argsdict)

    elif op == 'download_gentemplates':
        # download all gentemplates from s3 bucket.
        # NOT UPDATED TO NEW FILE STRUCTURE
        DB.download_entire_dirname(dirname='styles')

    elif op == 'delete_s3_results':
        # delete all results on s3 bucket.
        DB.delete_s3_results(argsdict)

    elif op == 'merge_results':
        # merge results into single csv file.
        utils.merge_results()

    elif op == 'check_extraction':
        check_extraction(argsdict)

    elif op == 'extractcmp':
        build_extraction_tasks(argsdict)
        extractvote_by_tasklists(argsdict)
        cmpcvr_by_tasklists(argsdict)

    elif op == 'plotmetrics':
        plotmetrics()

    elif op == 'evalmarks':
        evalmarks()

    elif op == 'save_failing_ballots':
        # given list of ballots in inputfile, copy the original ballot image
        # files to (jobname)/styles/(ballot_id) folders.  This function
        #   1. builds single bif table.
        #   2. looks each ballot up.
        #   3. using entry, opens the indicated archive and extracts the
        #      original file.
        #   4. saves the file in folder of jobname and ballot_id in styles,
        #      see above.
        save_failing_ballots(argsdict)

    elif op == 'reprocess_failing_ballots':
        reprocess_failing_ballots(argsdict)

    else:
        print("op value not defined ", op)
        # Non-zero status so calling scripts can detect the bad 'op'
        # (a bare sys.exit() reports success).
        sys.exit(1)
def read_style_num_from_barcode(self, argsdict):
    """ Determine the style_num of this ballot from its card_code / barcode.

        Dominion: uses self.ballotdict['card_code'] (already read elsewhere),
            optionally converted by dominion_build_effective_style_num() when
            argsdict['conv_card_code_to_style_num'] is set.
        ES&S: reads the raw barcode from the first ballot image, stores it as
            'card_code', reads 'pstyle_num', optionally derives 'ballot_type_id',
            and then chooses style_num from pstyle_num, ballot_type_id or card_code.

        Side effects: updates 'style_num' (and for ES&S 'card_code', 'pstyle_num'
        and possibly 'ballot_type_id') in self.ballotdict.

        Returns the style_num recorded in self.ballotdict, or, when that value is
        empty and the ballot is listed in the ballot style overrides, the override
        value (the override is returned but not written back to self.ballotdict).
        Returns None for a Dominion ballot whose card_code could not be read.
    """
    logs.sts("Reading style_num from ballot barcode...", 3)
    ballot_id = self.ballotdict['ballot_id']
    ballot_style_overrides_dict = args.get_ballot_style_overrides(argsdict)

    if self.ballotdict['vendor'] == 'Dominion':
        if self.ballotdict['card_code'] is None:
            # This situation exists if there was a problem converting the barcode during alignment.
            self.ballotdict['style_num'] = None
        elif argsdict['conv_card_code_to_style_num']:
            # attempt to convert card_code to the official style_num which should match CVR style field.
            # if ballot_type_id or card_code cannot be read, then this may return None
            self.ballotdict['style_num'], _ = dominion_build_effective_style_num(
                argsdict, self.ballotdict['card_code'])
        else:
            self.ballotdict['style_num'] = self.ballotdict['card_code']

        if self.ballotdict['style_num'] is None:
            utils.exception_report(
                f"### EXCEPTION: card_code not read from ballot:{ballot_id}. "
            )
            return None

    elif self.ballotdict['vendor'] == 'ES&S':
        card_code = read_raw_ess_barcode(self.ballotimgdict['images'][0], ballot_id)
        self.ballotdict['card_code'] = card_code

        from utilities.bif_utils import read_pstyle_from_image_if_specd
        self.ballotdict['pstyle_num'] = read_pstyle_from_image_if_specd(
            argsdict, self.ballotimgdict['images'][0])

        # style num must be a string
        if argsdict['conv_card_code_to_style_num']:
            # converting the card_code to the style number is important to link it to the
            # style number as used on CVR. If no CVR is used, or if we are not attempting to link them
            # then using the card_code directly occurs when 'conv_card_code_to_style_num' is False
            cc_style_num = str(
                barcode_parser.get_parsed_barcode(
                    card_code, ballot_id, self.ballotdict['precinct']))
            self.ballotdict['ballot_type_id'] = cc_style_num

        if argsdict['use_pstyle_as_style_num'] and self.ballotdict['pstyle_num']:
            self.ballotdict['style_num'] = self.ballotdict['pstyle_num']
        elif self.ballotdict['ballot_type_id']:
            self.ballotdict['style_num'] = self.ballotdict['ballot_type_id']
        else:
            self.ballotdict['style_num'] = card_code

    # Always report the value recorded on the ballot. Previously the fall-through
    # return used a local that was only bound on some paths, so a ballot with an
    # empty style_num that was missing from a non-empty override table could raise
    # UnboundLocalError instead of returning the (empty) style_num.
    style_num = self.ballotdict['style_num']
    if not style_num and ballot_style_overrides_dict:
        if ballot_id in ballot_style_overrides_dict:
            return ballot_style_overrides_dict[ballot_id]

    return style_num
def sts(string, verboselevel=0, end='\n'):
    """ Convenience alias: pass the message, verbosity level and line ending
        straight through to logs.sts() and return its result.
    """
    forwarded = (string, verboselevel, end)
    return logs.sts(*forwarded)
def wait_for_lambdas(argsdict: dict, task_name=None):  #, download_failed=False):
    """ Waits for every lambda request added to LambdaTracker.

        Note: not specific to task_name. Only one use of Lambdas at a time
        by a specific job_name. We may want to use task_name to create separate
        folders for any given task. So keep task_name for now even though we
        are not using it.

        Progress is measured by counting the tracker files found in the
        'Completed' and 'Failed' subdirs of the 'lambda_tracker' dirname and
        comparing that with the number of requests held by LambdaTracker.
        (An earlier per-request status polling scheme was removed.)

        Returns True when there is nothing to wait for (lambdas not in use or
        no requests registered) or when all requests completed without any
        'Failed' tracker file. Returns False when any 'Failed' tracker file
        exists or when the wait timed out with requests still outstanding.
    """
    # Nothing was delegated, so there is nothing that can have failed. Return True
    # (not None) so callers testing `if not wait_for_lambdas(...)` do not treat
    # the no-op case as a failure.
    if not argsdict['use_lambdas']:
        return True

    total_requests = len(LambdaTracker.lambda_requests)
    if not total_requests:
        return True

    wait = 10
    timeout = 60 * 20
    time.sleep(10)  # Just to be sure that all lambdas tracker files are on the bucket

    s3dirpath_completed = DB.dirpath_from_dirname('lambda_tracker', subdir='Completed')
    s3dirpath_failed = DB.dirpath_from_dirname('lambda_tracker', subdir='Failed')

    running_requests = total_requests
    completed_requests = 0
    failed_requests = 0
    while timeout > 0 and running_requests:
        time.sleep(wait)
        timeout -= wait

        completed_requests = len(s3utils.list_files_in_s3dirpath(s3dirpath_completed))
        failed_requests = len(s3utils.list_files_in_s3dirpath(s3dirpath_failed))
        running_requests = total_requests - completed_requests - failed_requests

        if timeout <= 0 or not running_requests:
            break
        logs.sts(f'Waiting for lambdas. Timeout (s): {timeout}. Running: {running_requests}')

    # Re-list the failures so that every failed tracker file is reported.
    failed_requests_log_list = s3utils.list_files_in_s3dirpath(s3dirpath_failed)
    for failed_request in failed_requests_log_list:
        print(f'Lambda request failed. please check cloudwatch logs for chunks: {failed_request} \n')
    all_succeeded = not failed_requests_log_list

    # Requests that never produced a tracker file within the timeout are not successes.
    timed_out = running_requests > 0
    if timed_out:
        all_succeeded = False
        logs.sts(f"Timed out waiting for lambdas; {running_requests} of {total_requests} "
                 f"requests did not report completion.", 3)

    # Guard the percentages: with no tracker files at all the divisor would be zero.
    reported_requests = completed_requests + failed_requests
    if reported_requests:
        pct_completed = round(100 * completed_requests / reported_requests, 2)
        pct_failed = round(100 * failed_requests / reported_requests, 2)
    else:
        pct_completed = pct_failed = 0.0

    summary_lead = "Lambda wait timed out" if timed_out else "All lambdas finished"
    logs.sts(f"{summary_lead}; {completed_requests} {pct_completed}% successful, "
             f"{failed_requests} {pct_failed}% failed", 3)

    return all_succeeded