def execute(self):
    """Process the Pennsylvania per-county voter files into one CSV.

    Downloads the raw archive (if sourced from S3), unpacks it into the four
    per-county file families (voter FVE files, election maps, zone codes,
    zone types), builds consolidated ``all_history`` and ``districts``
    columns for each county, concatenates all counties into a single
    DataFrame, coerces dtypes, and stores the result in
    ``self.processed_file`` plus election metadata in ``self.meta``.

    Side effects: sets ``self.main_file``, ``self.meta``,
    ``self.processed_file``; deletes large intermediates and calls
    ``gc.collect()`` aggressively to bound peak memory.
    """
    if self.raw_s3_file is not None:
        self.main_file = self.s3_download()
    config = Config(file_name=self.config_file)
    # unpack_files returns a list of dicts with (at least) "name" and "obj"
    # keys — grounded by the f["name"] / f["obj"] accesses below.
    new_files = self.unpack_files(file_obj=self.main_file)
    del self.main_file, self.temp_files
    gc.collect()
    # Partition the unpacked files by the substring conventions used in the
    # PA export: "FVE" = voter file, "Election Map", "Codes", "Types".
    voter_files = [f for f in new_files if "FVE" in f["name"]]
    election_maps = [f for f in new_files if "Election Map" in f["name"]]
    zone_codes = [f for f in new_files if "Codes" in f["name"]]
    zone_types = [f for f in new_files if "Types" in f["name"]]
    del new_files
    gc.collect()
    if not self.ignore_checks:
        # election maps need to line up to voter files?
        self.file_check(len(voter_files), len(election_maps))
    counties = config["county_names"]
    main_df = None
    # Preserving the order of the file sent, but concatenating the district
    # and election columns which were dropped in the legacy processed file.
    dfcols = (config["ordered_columns"][:-3] + config["district_columns"] +
              config["election_columns"] + config["ordered_columns"][-3:])

    # create a mapping that returns a series based on the values across rows
    # (voters) of cells (election info). Consolidates the non-nan values into
    # one list that can be appended as a column later for the all_history and
    # the districts columns.
    def list_map(df_sub, columns, zone_dict=None):
        # mapping() filters out the string "nan" (values were cast to str
        # below, so missing cells become the literal "nan"); when a
        # zone_dict is given it also translates zone codes to their
        # human-readable descriptions, dropping unknown codes.
        def mapping(li, zone_dict=zone_dict):
            if zone_dict is None:
                li = [x for x in li if x != "nan"]
                return li
            else:
                li = [
                    zone_dict[x] for x in li
                    if x != "nan" and x in zone_dict
                ]
                return li
        return pd.Series(
            map(mapping, df_sub[columns].values.astype(str).tolist()))

    sorted_codes = []
    # keyed by formatted election name -> {"date": str, "count": int, "index": int}
    sorted_code_dict = defaultdict(defaultdict)
    # Read everything as str; numeric coercion happens once at the end.
    dtypes = {col: "str" for col in dfcols}
    for idx, c in enumerate(counties):
        logging.info("Processing {} {}/{}".format(c, idx, len(counties)))
        c = format_column_name(c)
        try:
            # Match this county's four companion files by substring; a county
            # with any file missing is skipped entirely (StopIteration).
            voter_file = next(f for f in voter_files
                              if c in f["name"].lower())
            election_map = next(f for f in election_maps
                                if c in f["name"].lower())
            zones = next(f for f in zone_codes if c in f["name"].lower())
            types = next(f for f in zone_types if c in f["name"].lower())
        except StopIteration:
            continue
        df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            names=dfcols,
            error_bad_lines=False,
            dtype=dtypes,
        )
        edf = self.read_csv_count_error_lines(
            election_map["obj"],
            sep="\t",
            names=["county", "number", "title", "date"],
            error_bad_lines=False,
            dtype={
                "county": str,
                "number": str,
                "title": str,
                "date": str,
            },
        )
        zdf = self.read_csv_count_error_lines(
            zones["obj"],
            sep="\t",
            names=[
                "county_name",
                "zone_number",
                "zone_code",
                "zone_description",
            ],
            error_bad_lines=False,
        )
        tdf = self.read_csv_count_error_lines(
            types["obj"],
            sep="\t",
            names=[
                "county_name",
                "zone_number",
                "zone_short_name",
                "zone_long_name",
            ],
            error_bad_lines=False,
        )
        # Refactor note: format the election data into the format expected
        # in the original all_history column.
        edf["election_list"] = edf["title"] + " " + edf["date"]
        # Gather the positional district and vote columns.
        # NOTE(review): the 30:70 / 70:150 slices assume the fixed PA layout
        # encoded in dfcols — 40 district columns then 80 election columns;
        # confirm against config["district_columns"]/["election_columns"].
        district_columns = df.columns[30:70].to_list()
        vote_columns = df.columns[70:150].to_list()
        # create a dict of the formatted election data using the index
        # number in the given file; this corresponds to the column index
        # beginning at the start of the vote columns in the dataframe.
        # Index begins starting at 1.
        election_map = pd.Series(edf.election_list.values,
                                 index=edf.number).to_dict()
        # merge the zone files together to consolidate the information in
        # one dataframe
        zdf = zdf.merge(tdf, how="left", on="zone_number")
        # format a column field that contains the zone description and the
        # name so that it matches the current district field
        zdf["combined"] = (zdf["zone_description"] + " Type: " +
                           zdf["zone_long_name"])
        # create a dict that utilizes the zone code as the key and the long
        # name string as the value
        zone_dict = dict(zip(zdf.zone_code.astype(str), zdf.combined))
        # Gather the pairs of election columns to iterate over both at the
        # same time to collect the information contained in both of the
        # columns per election (even index = vote type, odd = party).
        vote_column_list = list(
            zip(df.columns[70:150:2], df.columns[71:150:2]))
        # get the value from the election map key for the election name,
        # then combine it with the value in the party and vote type cells
        # for the full election information.
        # Creates a history dataframe containing, as cells, the election
        # name as gathered in the election file, the vote type (AP, A etc),
        # and the party information, all separated by spaces.
        # The columns are all named election_#_vote_type but the cells
        # contain the relevant information.
        vote_hist_df = pd.DataFrame({
            i: election_map[i.split("_")[1]] + " " + df[i] + " " + df[j]
            for i, j in vote_column_list if i.split("_")[1] in election_map
        })
        # counts for the metadata: per-election participation totals,
        # accumulated across counties in sorted_code_dict.
        counts = vote_hist_df.count()
        for i in counts.index:
            current_key = election_map[i.split("_")[1]]
            # Metadata needs to be _ separated not space
            current_key = "_".join(current_key.split())
            if current_key in sorted_code_dict:
                sorted_code_dict[current_key]["count"] += int(counts[i])
            else:
                current_date = edf.loc[edf["number"] == i.split("_")
                                       [1]]["date"].values[0]
                new_dict_entry = defaultdict(str)
                new_dict_entry["date"] = current_date
                new_dict_entry["count"] = int(counts[i])
                sorted_code_dict[current_key] = new_dict_entry
        # converts the dataframe to a series that contains the list of
        # elections participated in, indexed on position
        vote_hist_df = list_map(vote_hist_df, vote_hist_df.columns)
        districts = list_map(df[district_columns], district_columns,
                             zone_dict)
        df["all_history"] = vote_hist_df
        df["districts"] = districts
        df.drop(vote_columns, axis=1, inplace=True)
        df.drop(district_columns, axis=1, inplace=True)
        # NOTE(review): vote/district columns were just dropped, so this
        # filter is a no-op safeguard — cols_to_check equals df.columns here.
        cols_to_check = [
            col for col in list(df.columns)
            if col not in vote_columns and col not in district_columns
        ]
        self.column_check(list(df.columns), cols_to_check)
        if main_df is None:
            main_df = df
        else:
            main_df = pd.concat([main_df, df], ignore_index=True)
    del voter_files, election_maps, zone_codes, zone_types
    gc.collect()
    # Assign each election a stable index ordered chronologically by its
    # parsed date; sorted_codes is the decoding list for those indices.
    sorted_keys = sorted(sorted_code_dict.items(),
                         key=lambda x: parser.parse(x[1]["date"]))
    for index, key in enumerate(sorted_keys):
        sorted_code_dict[key[0]]["index"] = index
        sorted_codes.append(key[0])
    del sorted_keys
    gc.collect()
    logging.info("coercing")
    main_df = config.coerce_dates(main_df)
    main_df = config.coerce_numeric(
        main_df,
        extra_cols=[
            "house_number",
            "apartment_number",
            "address_line_2",
            "zip",
            "mail_address_1",
            "mail_address_2",
            "mail_zip",
            "precinct_code",
            "precinct_split_id",
            "legacy_id",
            "home_phone",
        ],
    )
    logging.info("Writing CSV")
    self.meta = {
        "message": "pennsylvania_{}".format(datetime.now().isoformat()),
        "array_encoding": json.dumps(sorted_code_dict),
        "array_decoding": json.dumps(sorted_codes),
    }
    csv_obj = main_df.to_csv(encoding="utf-8", index=False)
    del main_df
    gc.collect()
    self.processed_file = FileItem(
        name="{}.processed".format(self.config["state"]),
        io_obj=StringIO(csv_obj),
        s3_bucket=self.s3_bucket,
    )
    del csv_obj
    gc.collect()
class PreprocessNorthCarolina(Preprocessor):
    """Preprocessor for the North Carolina voter and vote-history files.

    Pairs the ``ncvoter`` voter file with the ``ncvhis`` history file,
    encodes each voter's election participation as index lists into a
    frequency-sorted election code table, and emits a single processed CSV.
    """

    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):
        # Default the snapshot date to the one embedded in the S3 key.
        if force_date is None:
            force_date = date_from_str(raw_s3_file)
        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
        self.config = Config(file_name=config_file)

    def execute(self):
        """Build the processed North Carolina file.

        Reads the voter and history files, maps each election description to
        a count-ordered index, attaches per-voter ``all_history`` and
        ``vote_type`` lists, coerces dtypes, and stores the CSV in
        ``self.processed_file`` with election metadata in ``self.meta``.

        Raises:
            MissingNumColumnsError: if the voter file's column count does
                not match config["ordered_columns"].
            ValueError: re-raised if the history file's column count does
                not match config["hist_columns"].
        """
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()
        new_files = self.unpack_files(
            file_obj=self.main_file)  # array of dicts
        del self.main_file, self.temp_files
        gc.collect()
        if not self.ignore_checks:
            self.file_check(len(new_files))
        # Locate the two expected .txt members by name substring.
        # NOTE(review): if either file is absent, voter_file/vote_hist_file
        # are never bound and the reads below raise NameError — confirm the
        # upstream file_check guarantees their presence.
        for i in new_files:
            if ("ncvhis" in i["name"]) and (".txt" in i["name"]):
                vote_hist_file = i
            elif ("ncvoter" in i["name"]) and (".txt" in i["name"]):
                voter_file = i
        voter_df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            quotechar='"',
            encoding="latin-1",
            error_bad_lines=False,
        )
        del voter_file
        gc.collect()
        vote_hist = self.read_csv_count_error_lines(
            vote_hist_file["obj"],
            sep="\t",
            quotechar='"',
            error_bad_lines=False,
        )
        del vote_hist_file, new_files
        gc.collect()
        # Assigning .columns raises ValueError on a length mismatch; wrap it
        # to surface a state-specific error for the voter file.
        try:
            voter_df.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the voter file in North Carolina"
            )
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(voter_df.columns),
            )
        # The history file mismatch, by contrast, just logs and re-raises
        # the original ValueError.
        try:
            vote_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the history file in North Carolina"
            )
            raise
        # Rank elections by participation count, descending; the rank is the
        # code each election gets in the encoded history arrays.
        valid_elections, counts = np.unique(vote_hist["election_desc"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }
        vote_hist["array_position"] = vote_hist["election_desc"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, count_order
        gc.collect()
        # Collapse the history rows to one list per voter, then align them
        # onto the voter frame via the shared voter-id index.
        voter_groups = vote_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["voting_method"].apply(list)
        voter_df = voter_df.set_index(self.config["voter_id"])
        voter_df["all_history"] = all_history
        voter_df["vote_type"] = vote_type
        del voter_groups, vote_hist, all_history, vote_type
        gc.collect()
        voter_df = self.config.coerce_strings(voter_df)
        voter_df = self.config.coerce_dates(voter_df)
        voter_df = self.config.coerce_numeric(
            voter_df,
            extra_cols=[
                "county_commiss_abbrv",
                "fire_dist_abbrv",
                "full_phone_number",
                "judic_dist_abbrv",
                "munic_dist_abbrv",
                "municipality_abbrv",
                "precinct_abbrv",
                "precinct_desc",
                "school_dist_abbrv",
                "super_court_abbrv",
                "township_abbrv",
                "township_desc",
                "vtd_abbrv",
                "vtd_desc",
                "ward_abbrv",
            ],
        )
        self.meta = {
            "message": "north_carolina_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.is_compressed = False
        # index=True keeps the voter-id index as the first CSV column.
        csv_obj = voter_df.to_csv(encoding="utf-8", index=True)
        del voter_df
        gc.collect()
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()