def state_download(state, s3_bucket):
    config_file = Config.config_file_from_state(state=state)
    configs = Config(file_name=config_file)
    if state == "north_carolina":
        today = nc_date_grab()
        list_files = configs["data_chunk_links"]
        zipped_files = []
        for i, url in enumerate(list_files):
            target_path = "/tmp/" + state + str(i) + ".zip"
            zipped_files.append(target_path)
            response = requests.get(url, stream=True)
            # Stream each chunk to disk to avoid holding the file in memory
            with open(target_path, "wb") as handle:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:
                        handle.write(chunk)
        file_to_zip = today + ".zip"
        with zipfile.ZipFile(file_to_zip, "w") as myzip:
            for f in zipped_files:
                myzip.write(f)
        file_to_zip = FileItem(
            "NC file auto download",
            filename=file_to_zip,
            s3_bucket=s3_bucket,
        )
        loader = Loader(
            config_file=config_file, force_date=today, s3_bucket=s3_bucket
        )
        loader.s3_dump(file_to_zip, file_class=RAW_FILE_PREFIX)
    elif state == "ohio":
        today = str(ohio_get_last_updated().isoformat())[0:10]
        list_files = configs["data_chunk_links"]
        file_names = configs["data_file_names"]
        zipped_files = []
        for i, url in enumerate(list_files):
            logging.info("downloading {} file".format(url))
            target_path = "/tmp/" + state + "_" + file_names[i] + ".txt.gz"
            zipped_files.append(target_path)
            response = requests.get(url, stream=True, verify=False)
            with open(target_path, "wb") as handle:
                for chunk in response.iter_content(chunk_size=512):
                    if chunk:
                        handle.write(chunk)
            logging.info("downloaded {} file".format(url))
        file_to_zip = today + ".zip"
        logging.info("Zipping files")
        with zipfile.ZipFile(file_to_zip, "w") as myzip:
            for f in zipped_files:
                myzip.write(f)
        logging.info("Uploading")
        file_to_zip = FileItem(
            "OH file auto download",
            filename=file_to_zip,
            s3_bucket=s3_bucket,
        )
        loader = Loader(
            config_file=config_file, force_date=today, s3_bucket=s3_bucket
        )
        loader.s3_dump(file_to_zip, file_class=RAW_FILE_PREFIX)
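# A minimal usage sketch for state_download. Hedged: the bucket name below is
# hypothetical (the real bucket comes from deployment config), and the state
# YAML configs must define data_chunk_links (and data_file_names for Ohio):
#
#   state_download("north_carolina", "my-voter-bucket")
#   state_download("ohio", "my-voter-bucket")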
def convert_voter_file(state=None, local_file=None, file_date=None,
                       write_file=False):
    config_file = Config.config_file_from_state(state)
    file_date = str(datetime.datetime.strptime(file_date, "%Y-%m-%d").date())
    with Preprocessor(
        None, config_file, force_file=local_file, force_date=file_date
    ) as preprocessor:
        file_item = preprocessor.execute()
        if not write_file:
            return (
                preprocessor.output_dataframe(file_item),
                preprocessor.meta,
            )
        preprocessor.local_dump(file_item)
def __init__(self, raw_s3_file, config_file, force_date=None,
             force_file=None, testing=False, ignore_checks=False,
             s3_bucket="", **kwargs):
    # Init change begin (adding loader object)
    self.config_file_path = config_file
    self.config = Config(file_name=config_file)
    self.chunk_urls = (
        self.config[CONFIG_CHUNK_URLS]
        if CONFIG_CHUNK_URLS in self.config
        else []
    )
    if "tmp" not in os.listdir("/"):
        os.system("mkdir /tmp")
    self.file_type = self.config["file_type"]
    self.source = self.config["source"]
    self.is_compressed = False
    self.checksum = None
    self.state = self.config["state"]
    self.meta = None
    self.testing = testing
    self.ignore_checks = ignore_checks
    self.s3_bucket = s3_bucket
    if force_date is not None:
        self.download_date = parser.parse(force_date).isoformat()
    else:
        self.download_date = datetime.now().isoformat()
    if force_file is not None:
        working_file = "/tmp/voteshield_{}.tmp".format(uuid.uuid4())
        logging.info("copying {} to {}".format(force_file, working_file))
        shutil.copy2(force_file, working_file)
        self.main_file = FileItem(
            "loader_force_file",
            filename=working_file,
            s3_bucket=self.s3_bucket,
        )
    else:
        self.main_file = "/tmp/voteshield_{}.tmp".format(uuid.uuid4())
    self.temp_files = [self.main_file]
    # Init change end
    if force_date is None:
        force_date = date_from_str(raw_s3_file)
    self.raw_s3_file = raw_s3_file
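# For reference, FileItem is constructed two ways in this module (both usages
# appear in the surrounding code; no other constructor arguments are assumed
# here):
#
#   FileItem("loader_force_file", filename="/tmp/some.tmp", s3_bucket=bucket)
#   FileItem(name="state.processed", io_obj=StringIO(csv_str), s3_bucket=bucket)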
def convert_voter_file(state=None, local_file=None, file_date=None,
                       write_file=False):
    """Main Reggie function; processes a voter file, which is often more
    than one file, so will likely be a compressed file such as a .zip file.

    Parameters
    ----------
    state : string, optional
        State identifier, which is the lower-case version of the state name
        with underscores replacing spaces, by default None
    local_file : string, optional
        Path to the file to process, by default None
    file_date : string, optional
        The snapshot date in the format "YYYY-MM-DD", by default None
    write_file : bool, optional
        Whether to write the output to a CSV file, which will be
        automatically named and written to the local directory,
        by default False

    Returns
    -------
    tuple
        If `write_file` is falsey, this function returns a tuple with the
        following objects:
        - The processed voter file as a dataframe (via `output_dataframe`)
        - The metadata object
        - The preprocessor object
    """
    config_file = Config.config_file_from_state(state)
    file_date = str(datetime.datetime.strptime(file_date, "%Y-%m-%d").date())
    preprocessor = state_router(
        state,
        raw_s3_file=None,
        config_file=config_file,
        force_file=local_file,
        force_date=file_date,
    )
    preprocessor.execute()
    if not write_file:
        return (
            preprocessor.output_dataframe(preprocessor.processed_file),
            preprocessor.meta,
            preprocessor,
        )
    preprocessor.local_dump(preprocessor.processed_file)
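# Hedged usage sketch for convert_voter_file (the file path and date are
# hypothetical; a YAML config for the state must exist so that
# Config.config_file_from_state can resolve it):
#
#   df, meta, prep = convert_voter_file(
#       state="north_carolina",
#       local_file="/tmp/nc_snapshot.zip",
#       file_date="2020-01-01",
#   )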
def get_processed_s3_uploads(state, s3_bucket, testing=False):
    configs = Config(state=state)
    keys = get_s3_uploads(
        configs["state"],
        configs["file_class"],
        configs["source"],
        s3_bucket,
        testing,
    )
    return keys
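# Usage sketch (hypothetical bucket; assumes a state-level YAML config with
# state, file_class, and source keys, as read above):
#
#   keys = get_processed_s3_uploads("ohio", "my-voter-bucket")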
def execute(self):
    # Merge each district type onto the voter dataframe by PrecinctId,
    # renaming DistrictName to the human-readable column for that type
    def district_fun(df_dist, df_voter, dist_dict):
        for dist_code in dist_dict.keys():
            temp_df = df_dist[df_dist["DistrictTypeCode"] == dist_code]
            temp_df = temp_df.rename(
                columns={"DistrictName": dist_dict[dist_code]})
            df_voter = pd.merge(
                df_voter,
                temp_df[["PrecinctId", dist_dict[dist_code]]],
                how="left",
                on="PrecinctId",
            )
        df_voter.drop(columns=["PrecinctId"], inplace=True)
        return df_voter

    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()
    config = Config(file_name=self.config_file)
    new_files = self.unpack_files(file_obj=self.main_file)
    del self.main_file, self.temp_files
    gc.collect()

    # Have to use the longer whole string, not just the suffix, because the
    # history file will also match the voter file pattern
    voter_file = [f for f in new_files if "pvrdr-vrd" in f["name"]][0]
    district_file = [f for f in new_files if "pvrdr-pd" in f["name"]][0]
    history_file = [f for f in new_files if "pvrdr-vph" in f["name"]][0]

    temp_voter_id_df = pd.read_csv(
        voter_file["obj"],
        sep="\t",
        encoding="latin-1",
        usecols=["RegistrantID"],
        dtype=str,
    )
    # rewind
    voter_file["obj"].seek(0)

    voter_ids = temp_voter_id_df["RegistrantID"].unique().tolist()
    del temp_voter_id_df
    gc.collect()

    # Seed per-voter dicts from the voter file so history rows for unknown
    # voters can be skipped via KeyError below
    hist_dict = {i: np.nan for i in voter_ids}
    votetype_dict = {i: np.nan for i in voter_ids}
    del voter_ids
    gc.collect()

    # key: election; values: date and count, then sort.
    # Iterate over all_hist and map to the sparse representation.
    elect_dict = defaultdict(int)

    def dict_cols(chunk, history_dict=None, votetype_dict=None,
                  election_dict=None):
        chunk["combined_col"] = (
            chunk["ElectionType"].replace(" ", "")
            + "_"
            + chunk["ElectionDate"]
            # + "_"
            # + chunk["Method"]
        )
        chunk["election"] = (chunk["ElectionType"].replace(" ", "")
                             + "_"
                             + chunk["ElectionDate"])
        chunk.drop(
            columns=[
                "ElectionType",
                "ElectionName",
                "ElectionDate",
                "CountyCode",
            ],
            inplace=True,
        )
        for row in chunk.itertuples():
            try:
                current_li = history_dict[row.RegistrantID]
                votetype_hist = votetype_dict[row.RegistrantID]
                # throws KeyError for entries not in the voter file
                election_dict[row.election] += 1
                combined_row = row.combined_col
                if isinstance(current_li, list):
                    current_li.append(combined_row)
                    votetype_hist.append(row.Method)
                    history_dict[row.RegistrantID] = current_li
                    votetype_dict[row.RegistrantID] = votetype_hist
                else:
                    # create a list of elections even if it has length 1
                    history_dict[row.RegistrantID] = [combined_row]
                    votetype_dict[row.RegistrantID] = [row.Method]
            except KeyError:
                continue

    # Chunk size; over ~3 million or so leads to slowdown
    chunk_size = 3000000
    history_chunks = pd.read_csv(
        history_file["obj"],
        sep="\t",
        usecols=[
            "RegistrantID",
            "CountyCode",
            "ElectionDate",
            "ElectionName",
            "ElectionType",
            "Method",
        ],
        dtype=str,
        chunksize=chunk_size,
    )
    for chunk in history_chunks:
        dict_cols(chunk, hist_dict, votetype_dict, elect_dict)

    history_file["obj"].close()
    del history_file
    gc.collect()

    hist_series = pd.Series(hist_dict, name="all_history")
    del hist_dict
    gc.collect()
    votetype_series = pd.Series(votetype_dict, name="votetype_history")
    del votetype_dict
    gc.collect()

    logging.info("reading in CA voter df")
    category_list = [
        "CountyCode",
        "Suffix",
        "StreetDirPrefix",
        "AddressNumberSuffix",
        "StreetType",
        "StreetDirSuffix",
        "UnitType",
        "City",
        "State",
        "Zip",
        "Language",
        "Gender",
        "PartyCode",
        "Status",
        "VoterStatusReasonCodeDesc",
        "AssistanceRequestFlag",
        "VbmVoterType",
        "USCongressionalDistrict",
        "StateSenate",
        "Municipality",
        "StateAddr",
    ]
    # read in the header row only, to set per-column dtypes (pyarrow strings
    # for non-categorical columns)
    col_ifornia = pd.read_csv(
        voter_file["obj"], sep="\t", nrows=0, encoding="latin-1"
    ).columns.tolist()
    voter_file["obj"].seek(0)
    dtype_dict = {
        col: ("string[pyarrow]" if col not in category_list else "category")
        for col in col_ifornia
    }

    voter_df = pd.read_csv(
        voter_file["obj"],
        sep="\t",
        dtype=dtype_dict,
        encoding="latin-1",
        on_bad_lines="warn",
    )
    # Rename the address-field "State" column to "StateAddr" to avoid a
    # duplicate column name
    voter_df.rename(columns={"State": "StateAddr"}, inplace=True)
    logging.info("dataframe memory usage: {}".format(
        round((voter_df.memory_usage(deep=True).sum() / 1024**2), 2)))
    voter_file["obj"].close()
    del voter_file
    gc.collect()

    district_dict = {
        "CG": "USCongressionalDistrict",
        "SS": "StateSenate",
        "SA": "StateAssembly",
        "CI": "Municipality",
        "SU": "CountySupervisoral",
    }
    district_df = pd.read_csv(
        district_file["obj"], sep="\t", dtype="string[pyarrow]"
    )
    district_file["obj"].close()
    del district_file

    merged_districts = district_fun(
        district_df,
        voter_df[["RegistrantID", "PrecinctId"]],
        district_dict,
    )
    voter_df = voter_df.merge(
        merged_districts, left_on="RegistrantID", right_on="RegistrantID"
    )
    del merged_districts
    gc.collect()

    voter_df.set_index("RegistrantID", inplace=True)
    voter_df = voter_df.merge(hist_series, left_index=True, right_index=True)
    del hist_series
    gc.collect()
    voter_df = voter_df.merge(votetype_series, left_index=True,
                              right_index=True)
    del votetype_series
    gc.collect()

    # create the sparse history: sort elections by the date embedded in the
    # key, assign each a stable integer index, and store per-voter histories
    # as lists of those indices
    sorted_keys = sorted(elect_dict.items(),
                         key=lambda x: x[0].split("_")[1])
    sorted_codes_dict = {
        value[0]: {"index": i, "count": value[1]}
        for i, value in enumerate(sorted_keys)
    }
    sorted_codes = [x[0] for x in sorted_keys]
    voter_df["sparse_history"] = voter_df.all_history.apply(
        lambda x: [sorted_codes_dict[y]["index"] for y in x]
        if x == x  # x == x is False only for NaN
        else np.nan)

    # Begin coercion: categories are turned back into strings first
    logging.info("coercing strings")
    voter_df = self.coerce_strings(voter_df, config, category_list)
    logging.info("coercing dates")
    voter_df = self.config.coerce_dates(voter_df)
    logging.info("coercing numeric")
    voter_df = self.config.coerce_numeric(voter_df)

    voter_df = voter_df.reset_index().rename(
        columns={"index": "RegistrantID"})
    voter_csv = voter_df.to_csv(encoding="utf-8", index=False)
    del voter_df
    gc.collect()

    self.meta = {
        "message": "california_{}".format(datetime.now().isoformat()),
        "array_encoding": sorted_codes_dict,
        "array_decoding": sorted_codes,
    }
    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(voter_csv),
        s3_bucket=self.s3_bucket,
    )
def execute(self):
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()
    config = Config(file_name=self.config_file)
    new_files = self.unpack_files(file_obj=self.main_file)
    del self.main_file, self.temp_files
    gc.collect()

    voter_files = [f for f in new_files if "FVE" in f["name"]]
    election_maps = [f for f in new_files if "Election Map" in f["name"]]
    zone_codes = [f for f in new_files if "Codes" in f["name"]]
    zone_types = [f for f in new_files if "Types" in f["name"]]
    del new_files
    gc.collect()

    if not self.ignore_checks:
        # election maps need to line up with the voter files
        self.file_check(len(voter_files), len(election_maps))
    counties = config["county_names"]
    main_df = None

    # Preserve the order of the file sent, but concatenate the district and
    # election columns, which were dropped in the legacy processed file
    dfcols = (config["ordered_columns"][:-3]
              + config["district_columns"]
              + config["election_columns"]
              + config["ordered_columns"][-3:])

    # Create a mapping that returns a series based on the values across rows
    # (voters) of cells (election info). Consolidates the non-NaN values into
    # one string that can be appended as a column later, for the all_history
    # and districts columns.
    def list_map(df_sub, columns, zone_dict=None):
        def mapping(li, zone_dict=zone_dict):
            if zone_dict is None:
                li = [x for x in li if x != "nan"]
                return li
            else:
                li = [
                    zone_dict[x] for x in li
                    if x != "nan" and x in zone_dict
                ]
                return li
        return pd.Series(
            map(mapping, df_sub[columns].values.astype(str).tolist()))

    sorted_codes = []
    sorted_code_dict = defaultdict(defaultdict)
    dtypes = {col: "str" for col in dfcols}

    for idx, c in enumerate(counties):
        logging.info("Processing {} {}/{}".format(c, idx, len(counties)))
        c = format_column_name(c)
        try:
            voter_file = next(f for f in voter_files
                              if c in f["name"].lower())
            election_map = next(f for f in election_maps
                                if c in f["name"].lower())
            zones = next(f for f in zone_codes if c in f["name"].lower())
            types = next(f for f in zone_types if c in f["name"].lower())
        except StopIteration:
            continue

        df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            names=dfcols,
            error_bad_lines=False,
            dtype=dtypes,
        )
        edf = self.read_csv_count_error_lines(
            election_map["obj"],
            sep="\t",
            names=["county", "number", "title", "date"],
            error_bad_lines=False,
            dtype={
                "county": str,
                "number": str,
                "title": str,
                "date": str,
            },
        )
        zdf = self.read_csv_count_error_lines(
            zones["obj"],
            sep="\t",
            names=[
                "county_name",
                "zone_number",
                "zone_code",
                "zone_description",
            ],
            error_bad_lines=False,
        )
        tdf = self.read_csv_count_error_lines(
            types["obj"],
            sep="\t",
            names=[
                "county_name",
                "zone_number",
                "zone_short_name",
                "zone_long_name",
            ],
            error_bad_lines=False,
        )

        # Refactor note: format the election data into the format expected
        # in the original all_history column
        edf["election_list"] = edf["title"] + " " + edf["date"]

        # Gather the positional vote and district columns
        district_columns = df.columns[30:70].to_list()
        vote_columns = df.columns[70:150].to_list()

        # Create a dict of the formatted election data keyed on the index
        # number in the given file; this corresponds to the column index
        # beginning at the start of the vote columns in the dataframe.
        # The index begins at 1.
        election_map = pd.Series(edf.election_list.values,
                                 index=edf.number).to_dict()

        # Merge the zone files together to consolidate the information in
        # one dataframe
        zdf = zdf.merge(tdf, how="left", on="zone_number")

        # Format a column that contains the zone description and the name,
        # so that it matches the current district field
        zdf["combined"] = (zdf["zone_description"]
                           + " Type: "
                           + zdf["zone_long_name"])

        # Create a dict keyed on the zone code, with the long name string
        # as the value
        zone_dict = dict(zip(zdf.zone_code.astype(str), zdf.combined))

        # Gather the pairs of election columns so both can be iterated over
        # at the same time, to collect the information contained in the two
        # columns per election
        vote_column_list = list(
            zip(df.columns[70:150:2], df.columns[71:150:2]))

        # Look up the election name in the election map, then combine it
        # with the values in the party and vote-type cells for the full
        # election information. This creates a history dataframe whose
        # cells contain the election name as given in the election file,
        # the vote type (AP, A, etc.), and the party information, all
        # separated by spaces. The columns are all named
        # election_#_vote_type, but the cells contain the relevant
        # information.
        vote_hist_df = pd.DataFrame({
            i: election_map[i.split("_")[1]] + " " + df[i] + " " + df[j]
            for i, j in vote_column_list
            if i.split("_")[1] in election_map
        })

        # counts for the metadata
        counts = vote_hist_df.count()
        for i in counts.index:
            current_key = election_map[i.split("_")[1]]
            # Metadata needs to be underscore separated, not space
            current_key = "_".join(current_key.split())
            if current_key in sorted_code_dict:
                sorted_code_dict[current_key]["count"] += int(counts[i])
            else:
                current_date = edf.loc[
                    edf["number"] == i.split("_")[1]]["date"].values[0]
                new_dict_entry = defaultdict(str)
                new_dict_entry["date"] = current_date
                new_dict_entry["count"] = int(counts[i])
                sorted_code_dict[current_key] = new_dict_entry

        # Convert the dataframe to a series containing the list of
        # elections participated in, indexed on position
        vote_hist_df = list_map(vote_hist_df, vote_hist_df.columns)
        districts = list_map(df[district_columns], district_columns,
                             zone_dict)

        df["all_history"] = vote_hist_df
        df["districts"] = districts
        df.drop(vote_columns, axis=1, inplace=True)
        df.drop(district_columns, axis=1, inplace=True)

        cols_to_check = [
            col for col in list(df.columns)
            if col not in vote_columns and col not in district_columns
        ]
        self.column_check(list(df.columns), cols_to_check)

        if main_df is None:
            main_df = df
        else:
            main_df = pd.concat([main_df, df], ignore_index=True)

    del voter_files, election_maps, zone_codes, zone_types
    gc.collect()

    # Order the collected elections by date and record each one's index
    sorted_keys = sorted(sorted_code_dict.items(),
                         key=lambda x: parser.parse(x[1]["date"]))
    for index, key in enumerate(sorted_keys):
        sorted_code_dict[key[0]]["index"] = index
        sorted_codes.append(key[0])
    del sorted_keys
    gc.collect()

    logging.info("coercing")
    main_df = config.coerce_dates(main_df)
    main_df = config.coerce_numeric(
        main_df,
        extra_cols=[
            "house_number",
            "apartment_number",
            "address_line_2",
            "zip",
            "mail_address_1",
            "mail_address_2",
            "mail_zip",
            "precinct_code",
            "precinct_split_id",
            "legacy_id",
            "home_phone",
        ],
    )
    logging.info("Writing CSV")
    self.meta = {
        "message": "pennsylvania_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_code_dict),
        "array_decoding": json.dumps(sorted_codes),
    }

    csv_obj = main_df.to_csv(encoding="utf-8", index=False)
    del main_df
    gc.collect()

    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(csv_obj),
        s3_bucket=self.s3_bucket,
    )
    del csv_obj
    gc.collect()
class PreprocessNorthCarolina(Preprocessor):
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):
        if force_date is None:
            force_date = date_from_str(raw_s3_file)
        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
        self.config = Config(file_name=config_file)

    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()
        new_files = self.unpack_files(
            file_obj=self.main_file)  # array of dicts
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(new_files))
        for i in new_files:
            if ("ncvhis" in i["name"]) and (".txt" in i["name"]):
                vote_hist_file = i
            elif ("ncvoter" in i["name"]) and (".txt" in i["name"]):
                voter_file = i

        voter_df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            quotechar='"',
            encoding="latin-1",
            error_bad_lines=False,
        )
        del voter_file
        gc.collect()
        vote_hist = self.read_csv_count_error_lines(
            vote_hist_file["obj"],
            sep="\t",
            quotechar='"',
            error_bad_lines=False,
        )
        del vote_hist_file, new_files
        gc.collect()

        try:
            voter_df.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for the voter "
                         "file in North Carolina")
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(voter_df.columns),
            )
        try:
            vote_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for the history "
                         "file in North Carolina")
            raise

        # Order elections by how often they appear, most frequent first
        valid_elections, counts = np.unique(vote_hist["election_desc"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k),
            }
            for i, k in enumerate(sorted_codes)
        }
        vote_hist["array_position"] = vote_hist["election_desc"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, count_order
        gc.collect()

        voter_groups = vote_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["voting_method"].apply(list)

        voter_df = voter_df.set_index(self.config["voter_id"])
        voter_df["all_history"] = all_history
        voter_df["vote_type"] = vote_type
        del voter_groups, vote_hist, all_history, vote_type
        gc.collect()

        voter_df = self.config.coerce_strings(voter_df)
        voter_df = self.config.coerce_dates(voter_df)
        voter_df = self.config.coerce_numeric(
            voter_df,
            extra_cols=[
                "county_commiss_abbrv",
                "fire_dist_abbrv",
                "full_phone_number",
                "judic_dist_abbrv",
                "munic_dist_abbrv",
                "municipality_abbrv",
                "precinct_abbrv",
                "precinct_desc",
                "school_dist_abbrv",
                "super_court_abbrv",
                "township_abbrv",
                "township_desc",
                "vtd_abbrv",
                "vtd_desc",
                "ward_abbrv",
            ],
        )

        self.meta = {
            "message": "north_carolina_{}".format(
                datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.is_compressed = False

        csv_obj = voter_df.to_csv(encoding="utf-8", index=True)
        del voter_df
        gc.collect()
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
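# Hedged usage sketch (the S3 key and bucket are hypothetical, and passing
# s3_bucket through **kwargs assumes the Preprocessor __init__ shown earlier):
#
#   prep = PreprocessNorthCarolina(
#       raw_s3_file="voter_file/north_carolina/raw/2020-01-01.zip",
#       config_file=Config.config_file_from_state("north_carolina"),
#       s3_bucket="my-voter-bucket",
#   )
#   prep.execute()
#   processed = prep.processed_file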