Example #1
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):

        if force_date is None:
            force_date = date_from_str(raw_s3_file)

        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
Example #2
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):
        # Attempt to get date from file if not specifically given
        if force_date is None:
            force_date = date_from_str(raw_s3_file)

        # Call the main Preprocessor init
        super().__init__(
            raw_s3_file=raw_s3_file,
            config_file=config_file,
            force_date=force_date,
            **kwargs,
        )

        # Initialize some properties
        self.raw_s3_file = raw_s3_file
        self.processed_file = None
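A minimal usage sketch of the delegation pattern above (the class names and the hard-coded date are hypothetical stand-ins; the real date_from_str parses the date out of the S3 key):

class BasePreprocessor:
    def __init__(self, raw_s3_file=None, config_file=None,
                 force_date=None, **kwargs):
        self.force_date = force_date


class StatePreprocessor(BasePreprocessor):
    def __init__(self, raw_s3_file, config_file, force_date=None, **kwargs):
        # fall back to the date embedded in the file name
        if force_date is None:
            force_date = "2020-01-15"  # stand-in for date_from_str(raw_s3_file)
        super().__init__(raw_s3_file=raw_s3_file,
                         config_file=config_file,
                         force_date=force_date,
                         **kwargs)
        self.raw_s3_file = raw_s3_file
        self.processed_file = None


p = StatePreprocessor("voter_file_2020-01-15.zip", "state.yaml")
print(p.force_date)  # 2020-01-15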
Example #3
        def add_history(main_df):
            # also save as sparse array since so many elections are stored
            count_df = pd.DataFrame()
            for idx, hist in enumerate(self.config["hist_columns"]):
                unique_codes, counts = np.unique(
                    main_df[hist].str.replace(" ", "_").dropna().values,
                    return_counts=True,
                )
                count_df_new = pd.DataFrame(index=unique_codes,
                                            data=counts,
                                            columns=["counts_" + hist])
                count_df = pd.concat([count_df, count_df_new], axis=1)
            count_df["total_counts"] = count_df.sum(axis=1)
            unique_codes = count_df.index.values
            counts = count_df["total_counts"].values
            count_order = counts.argsort()
            unique_codes = unique_codes[count_order]
            counts = counts[count_order]
            sorted_codes = unique_codes.tolist()
            sorted_codes_dict = {
                k: {
                    "index": i,
                    "count": int(counts[i]),
                    "date": date_from_str(k),
                }
                for i, k in enumerate(sorted_codes)
            }

            def insert_code_bin(arr):
                return [sorted_codes_dict[k]["index"] for k in arr]

            main_df["all_history"] = main_df[
                self.config["hist_columns"]].apply(
                    lambda x: list(x.dropna().str.replace(" ", "_")), axis=1)
            main_df.all_history = main_df.all_history.map(insert_code_bin)
            return sorted_codes, sorted_codes_dict
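A toy, self-contained version of the sparse-history encoding above (the date_from_str lookup is omitted); codes are indexed in ascending count order, so rarer elections get lower indices:

import numpy as np

codes = ["2016_GEN", "2016_GEN", "2016_GEN",
         "2018_GEN", "2018_GEN", "2020_GEN"]
unique_codes, counts = np.unique(codes, return_counts=True)
count_order = counts.argsort()          # ascending count order
unique_codes = unique_codes[count_order]
counts = counts[count_order]

sorted_codes_dict = {
    k: {"index": i, "count": int(counts[i])}
    for i, k in enumerate(unique_codes.tolist())
}

def insert_code_bin(arr):
    return [sorted_codes_dict[k]["index"] for k in arr]

print(insert_code_bin(["2016_GEN", "2020_GEN"]))  # [2, 0]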
Example #4
def handle_date(d):
    possible_date = date_from_str(d)
    if possible_date is None:
        return ""
    return pd.to_datetime(possible_date).strftime("%m/%d/%Y")
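A quick behavior sketch, with a simplified stand-in for date_from_str (assumed here to return the first ISO-style date substring, or None):

import re
import pandas as pd

def date_from_str(s):
    # simplified stand-in for the real helper
    m = re.search(r"\d{4}-\d{2}-\d{2}", s)
    return m.group() if m else None

def handle_date(d):
    possible_date = date_from_str(d)
    if possible_date is None:
        return ""
    return pd.to_datetime(possible_date).strftime("%m/%d/%Y")

print(handle_date("voter_hist_2019-05-21.txt"))  # 05/21/2019
print(handle_date("no date here"))               # (empty string)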
Example #5
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        logging.info("preprocessing florida")
        # new_files is list of dicts, i.e. [{"name":.. , "obj": <fileobj>}, ..]
        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)
        del self.main_file, self.temp_files
        gc.collect()

        vote_history_files = []
        voter_files = []
        for i in new_files:
            if "_H_" in i["name"]:
                vote_history_files.append(i)
            elif ".txt" in i["name"]:
                voter_files.append(i)

        if not self.ignore_checks:
            self.file_check(len(voter_files))
        concat_voter_file = concat_and_delete(voter_files)
        concat_history_file = concat_and_delete(vote_history_files)
        del new_files, vote_history_files, voter_files
        gc.collect()

        logging.info("FLORIDA: loading voter history file")
        df_hist = pd.read_fwf(concat_history_file, header=None)
        try:
            df_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info("Incorrect history columns found in Florida")
            raise MissingNumColumnsError(
                "{} state history is missing columns".format(self.state),
                self.state,
                len(self.config["hist_columns"]),
                len(df_hist.columns),
            )
        del concat_history_file
        gc.collect()

        df_hist = df_hist[df_hist["date"].map(lambda x: len(x)) > 5]
        df_hist["election_name"] = (df_hist["date"] + "_" +
                                    df_hist["election_type"])
        valid_elections, counts = np.unique(df_hist["election_name"],
                                            return_counts=True)
        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][:-4], "%m/%d/%Y"),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        df_hist["array_position"] = df_hist["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, date_order
        gc.collect()

        logging.info("FLORIDA: history apply")
        voter_groups = df_hist.groupby("VoterID")
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["vote_type"].apply(list)
        del voter_groups, df_hist
        gc.collect()

        logging.info("FLORIDA: loading main voter file")
        df_voters = self.read_csv_count_error_lines(concat_voter_file,
                                                    header=None,
                                                    sep="\t",
                                                    error_bad_lines=False)
        del concat_voter_file
        gc.collect()

        try:
            df_voters.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for Flordia")
            raise MissingNumColumnsError(
                "{} state is missing voters columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(df_voters.columns),
            )
        df_voters = df_voters.set_index(self.config["voter_id"])

        df_voters["all_history"] = all_history
        df_voters["vote_type"] = vote_type
        del all_history, vote_type
        gc.collect()

        df_voters = self.config.coerce_strings(df_voters)
        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "Precinct",
                "Precinct_Split",
                "Daytime_Phone_Number",
                "Daytime_Area_Code",
                "Daytime_Phone_Extension",
                "Daytime_Area_Code",
                "Daytime_Phone_Extension",
                "Mailing_Zipcode",
                "Residence_Zipcode",
                "Mailing_Address_Line_1",
                "Mailing_Address_Line_2",
                "Mailing_Address_Line_3",
                "Residence_Address_Line_1",
                "Residence_Address_Line_2",
            ],
        )

        self.meta = {
            "message": "florida_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
        }

        csv_obj = df_voters.to_csv(encoding="utf-8")
        del df_voters
        gc.collect()

        logging.info("FLORIDA: writing out")
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
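The election ordering step above, run on toy data: names are sorted newest-first by the date that precedes the 4-character type suffix, and the counts array is reordered to match:

import numpy as np
from datetime import datetime

valid_elections = np.array(["01/05/2016_GEN", "11/03/2020_GEN", "06/12/2018_PRI"])
counts = np.array([10, 30, 20])

date_order = [
    idx for idx, election in sorted(
        enumerate(valid_elections),
        key=lambda x: datetime.strptime(x[1][:-4], "%m/%d/%Y"),
        reverse=True,
    )
]
print(valid_elections[date_order].tolist())
# ['11/03/2020_GEN', '06/12/2018_PRI', '01/05/2016_GEN']
print(counts[date_order].tolist())  # [30, 20, 10]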
Example #6
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        # config = Config('michigan')
        new_files = self.unpack_files(file_obj=self.main_file)
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(new_files))
        voter_file = ([
            n for n in new_files if "entire_state_v" in n["name"]
            or "EntireStateVoters" in n["name"]
        ] + [None])[0]
        hist_file = ([
            n for n in new_files if "entire_state_h" in n["name"]
            or "EntireStateVoterHistory" in n["name"]
        ] + [None])[0]
        elec_codes = ([n for n in new_files if "electionscd" in n["name"]] +
                      [None])[0]

        logging.info("Loading voter file: " + voter_file["name"])
        if voter_file["name"][-3:] == "lst":
            vcolspecs = [
                [0, 35],
                [35, 55],
                [55, 75],
                [75, 78],
                [78, 82],
                [82, 83],
                [83, 91],
                [91, 92],
                [92, 99],
                [99, 103],
                [103, 105],
                [105, 135],
                [135, 141],
                [141, 143],
                [143, 156],
                [156, 191],
                [191, 193],
                [193, 198],
                [198, 248],
                [248, 298],
                [298, 348],
                [348, 398],
                [398, 448],
                [448, 461],
                [461, 463],
                [463, 468],
                [468, 474],
                [474, 479],
                [479, 484],
                [484, 489],
                [489, 494],
                [494, 499],
                [499, 504],
                [504, 510],
                [510, 516],
                [516, 517],
                [517, 519],
            ]
            vdf = pd.read_fwf(
                voter_file["obj"],
                colspecs=vcolspecs,
                names=self.config["fwf_voter_columns"],
                na_filter=False,
            )
        elif voter_file["name"][-3:] == "csv":
            vdf = self.read_csv_count_error_lines(
                voter_file["obj"],
                encoding="latin-1",
                na_filter=False,
                error_bad_lines=False,
            )
            # rename 'STATE' field to not conflict with our 'state' field
            vdf.rename(columns={"STATE": "STATE_ADDR"}, inplace=True)
        else:
            raise NotImplementedError("File format not implemented")
        del voter_file
        gc.collect()

        def column_is_empty(col):
            # a column counts as empty if every value is NaN or ""
            total_size = col.shape[0]
            if (col.isna().sum() == total_size) or (
                    (col == "").sum() == total_size):
                return True
            return False

        def fill_empty_columns(df):
            # Dummy values for newly added data fields
            if column_is_empty(df["STATUS_USER_CODE"]):
                df["STATUS_USER_CODE"] = "_"
            if column_is_empty(df["VOTER_ID"]):
                df["VOTER_ID"] = 0
            if column_is_empty(df["STATUS_DATE"]):
                df["STATUS_DATE"] = "1970-01-01 00:00:00"
            return df

        vdf = self.reconcile_columns(vdf, self.config["columns"])
        vdf = fill_empty_columns(vdf)
        vdf = vdf.reindex(columns=self.config["ordered_columns"])
        vdf[self.config["party_identifier"]] = "npa"

        logging.info("Loading history file: " + hist_file["name"])
        if hist_file["name"][-3:] == "lst":
            hcolspecs = [
                [0, 13],
                [13, 15],
                [15, 20],
                [20, 25],
                [25, 38],
                [38, 39],
            ]
            hdf = pd.read_fwf(
                hist_file["obj"],
                colspecs=hcolspecs,
                names=self.config["fwf_hist_columns"],
                na_filter=False,
            )
        elif hist_file["name"][-3:] == "csv":
            hdf = self.read_csv_count_error_lines(hist_file["obj"],
                                                  na_filter=False,
                                                  error_bad_lines=False)
            if ("IS_ABSENTEE_VOTER"
                    not in hdf.columns) and ("IS_PERMANENT_ABSENTEE_VOTER"
                                             in hdf.columns):
                hdf.rename(
                    columns={
                        "IS_PERMANENT_ABSENTEE_VOTER": "IS_ABSENTEE_VOTER"
                    },
                    inplace=True,
                )
        else:
            raise NotImplementedError("File format not implemented")
        del hist_file
        gc.collect()

        # If hdf has ELECTION_DATE (new style) instead of ELECTION_CODE,
        # then we don't need to do election code lookups
        elec_code_dict = dict()
        missing_history_dates = False
        if "ELECTION_DATE" in hdf.columns:
            try:
                hdf["ELECTION_NAME"] = pd.to_datetime(
                    hdf["ELECTION_DATE"]).map(lambda x: x.strftime("%Y-%m-%d"))
            except ValueError:
                missing_history_dates = True
                hdf["ELECTION_NAME"] = hdf["ELECTION_DATE"]
        else:
            if elec_codes:
                # If we have election codes in this file
                logging.info("Loading election codes file: " +
                             elec_codes["name"])
                if elec_codes["name"][-3:] == "lst":
                    ecolspecs = [[0, 13], [13, 21], [21, 46]]
                    edf = pd.read_fwf(
                        elec_codes["obj"],
                        colspecs=ecolspecs,
                        names=self.config["elec_code_columns"],
                        na_filter=False,
                    )
                    edf["Date"] = pd.to_datetime(edf["Date"], format="%m%d%Y")
                elif elec_codes["name"][-3:] == "csv":
                    # I'm not sure if this would actually ever happen
                    edf = self.read_csv_count_error_lines(
                        elec_codes["obj"],
                        names=self.config["elec_code_columns"],
                        na_filter=False,
                        error_bad_lines=False,
                    )
                else:
                    raise NotImplementedError("File format not implemented")

                # make a code dictionary that will be stored with meta data
                for idx, row in edf.iterrows():
                    d = row["Date"].strftime("%Y-%m-%d")
                    elec_code_dict[row["Election_Code"]] = {
                        "Date":
                        d,
                        "Slug":
                        d + "_" + str(row["Election_Code"]) + "_" +
                        row["Title"].replace(" ", "-").replace("_", "-"),
                    }
            else:
                # Get election codes from most recent meta data
                this_date = parser.parse(date_from_str(
                    self.raw_s3_file)).date()
                pre_date, post_date, pre_key, post_key = get_surrounding_dates(
                    this_date,
                    self.state,
                    self.s3_bucket,
                    testing=self.testing)
                if pre_key is not None:
                    nearest_meta = get_metadata_for_key(
                        pre_key, self.s3_bucket)
                    elec_code_dict = nearest_meta["elec_code_dict"]
                    if len(elec_code_dict) == 0:
                        raise MissingElectionCodesError(
                            "No election codes in nearby meta data.")
                else:
                    raise MissingElectionCodesError(
                        "No election code file or nearby meta data found.")

            # Election code lookup
            hdf["ELECTION_NAME"] = hdf["ELECTION_CODE"].map(
                lambda x: elec_code_dict[str(x)]["Slug"]
                if str(x) in elec_code_dict else str(x))

        # Create meta data
        counts = hdf["ELECTION_NAME"].value_counts()
        counts.sort_index(inplace=True)
        sorted_codes = counts.index.to_list()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        # Collect histories
        vdf.set_index(self.config["voter_id"], drop=False, inplace=True)
        hdf_id_groups = hdf.groupby(self.config["voter_id"])
        vdf["all_history"] = hdf_id_groups["ELECTION_NAME"].apply(list)
        vdf["votetype_history"] = hdf_id_groups["IS_ABSENTEE_VOTER"].apply(
            list)
        vdf["county_history"] = hdf_id_groups["COUNTY_CODE"].apply(list)
        vdf["jurisdiction_history"] = hdf_id_groups["JURISDICTION_CODE"].apply(
            list)
        vdf["schooldistrict_history"] = hdf_id_groups[
            "SCHOOL_DISTRICT_CODE"].apply(list)
        del hdf, hdf_id_groups
        gc.collect()

        def insert_code_bin(arr):
            if isinstance(arr, list):
                return [
                    sorted_codes_dict[k]["index"] for k in arr
                    if k in sorted_codes_dict
                ]
            else:
                return np.nan

        vdf["sparse_history"] = vdf["all_history"].map(insert_code_bin)

        if missing_history_dates:
            vdf["all_history"] = None
            vdf["sparse_history"] = None

        vdf = self.config.coerce_dates(vdf)
        vdf = self.config.coerce_numeric(
            vdf,
            extra_cols=[
                "PRECINCT",
                "WARD",
                "VILLAGE_PRECINCT",
                "SCHOOL_PRECINCT",
            ],
        )
        vdf = self.config.coerce_strings(vdf)

        self.meta = {
            "message": "michigan_{}".format(datetime.now().isoformat()),
            "array_encoding": sorted_codes_dict,
            "array_decoding": sorted_codes,
            "elec_code_dict": elec_code_dict,
        }

        csv_obj = vdf.to_csv(encoding="utf-8", index=False)
        del vdf
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
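The ([...] + [None])[0] idiom used above picks the first matching file dict and falls back to None instead of raising IndexError when nothing matches:

new_files = [{"name": "electionscd.lst"}, {"name": "entire_state_v.lst"}]

voter_file = ([n for n in new_files
               if "entire_state_v" in n["name"]] + [None])[0]
hist_file = ([n for n in new_files
              if "entire_state_h" in n["name"]] + [None])[0]

print(voter_file)  # {'name': 'entire_state_v.lst'}
print(hist_file)   # None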
Example #7
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        logging.info("Minnesota: loading voter file")
        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)

        if not self.ignore_checks:
            self.file_check(len(new_files))
        voter_reg_df = pd.DataFrame(columns=self.config["ordered_columns"])
        voter_hist_df = pd.DataFrame(columns=self.config["hist_columns"])
        for i in new_files:
            if "election" in i["name"].lower():
                voter_hist_df = pd.concat(
                    [
                        voter_hist_df,
                        self.read_csv_count_error_lines(i["obj"],
                                                        error_bad_lines=False),
                    ],
                    axis=0,
                )
            elif "voter" in i["name"].lower():
                voter_reg_df = pd.concat(
                    [
                        voter_reg_df,
                        self.read_csv_count_error_lines(i["obj"],
                                                        encoding="latin-1",
                                                        error_bad_lines=False),
                    ],
                    axis=0,
                )
        voter_reg_df[self.config["voter_status"]] = np.nan
        voter_reg_df[self.config["party_identifier"]] = np.nan

        # if the dataframes are assigned columns to begin with, there will
        # be NaNs due to concat if the columns are off
        self.column_check(list(voter_reg_df.columns))

        voter_reg_df["DOBYear"] = voter_reg_df["DOBYear"].astype(str).str[0:4]

        voter_hist_df["election_name"] = (voter_hist_df["ElectionDate"] + "_" +
                                          voter_hist_df["VotingMethod"])
        valid_elections, counts = np.unique(voter_hist_df["election_name"],
                                            return_counts=True)
        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][:-2], "%m/%d/%Y"),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        voter_hist_df["array_position"] = voter_hist_df["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        logging.info("Minnesota: history apply")
        voter_groups = voter_hist_df.groupby("VoterId")
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["VotingMethod"].apply(list)

        voter_reg_df = voter_reg_df.set_index(self.config["voter_id"])

        voter_reg_df["all_history"] = all_history
        voter_reg_df["vote_type"] = vote_type
        gc.collect()

        voter_reg_df = self.config.coerce_strings(voter_reg_df)
        voter_reg_df = self.config.coerce_dates(voter_reg_df)
        voter_reg_df = self.config.coerce_numeric(voter_reg_df)

        self.meta = {
            "message": "minnesota_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        gc.collect()
        logging.info("Minnesota: writing out")

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voter_reg_df.to_csv(encoding="utf-8")),
            s3_bucket=self.s3_bucket,
        )
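A toy illustration of the concat caveat noted in the comment above: when an incoming file's columns don't line up with the pre-assigned columns, pd.concat silently fills NaNs rather than failing, which is what column_check is there to catch:

import pandas as pd

base = pd.DataFrame(columns=["VoterId", "FirstName"])
incoming = pd.DataFrame({"VoterId": [1], "FristName": ["Ana"]})  # typo column

merged = pd.concat([base, incoming], axis=0)
print(merged.columns.tolist())  # ['VoterId', 'FirstName', 'FristName']
print(merged["FirstName"].isna().all())  # True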
Example #8
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        def file_is_active(filename):
            for word in ["Canceled", "Suspense", "Inactive"]:
                if word in filename:
                    return False
            return True

        def add_files_to_main_df(main_df, file_list):
            alias_dict = self.config["column_aliases"]
            for f in file_list:
                if f["name"].split(".")[-1] == "csv":
                    new_df = self.read_csv_count_error_lines(
                        f["obj"], error_bad_lines=False)
                else:
                    new_df = pd.read_excel(f["obj"])

                for c in new_df.columns:
                    # files vary in their use of spaces in headers, and
                    # some headers appear under different aliases
                    if c.replace(" ", "") in alias_dict.keys():
                        new_df.rename(
                            columns={c: alias_dict[c.replace(" ", "")]},
                            inplace=True,
                        )
                    else:
                        new_df.rename(columns={c: c.replace(" ", "")},
                                      inplace=True)
                new_df.rename(columns={"YearofBirth": "DOB"}, inplace=True)
                main_df = pd.concat([main_df, new_df], sort=False)
            return main_df

        def insert_code_bin(arr):
            return [sorted_codes_dict[k]["index"] for k in arr]

        new_files = self.unpack_files(file_obj=self.main_file,
                                      compression="unzip")

        active_files = [f for f in new_files if file_is_active(f["name"])]
        other_files = [f for f in new_files if not file_is_active(f["name"])]

        main_df = pd.DataFrame()
        main_df = add_files_to_main_df(main_df, active_files)
        main_df = add_files_to_main_df(main_df, other_files)
        main_df.reset_index(drop=True, inplace=True)

        main_df = self.config.coerce_dates(main_df)
        main_df = self.config.coerce_strings(main_df)
        main_df = self.config.coerce_numeric(
            main_df,
            extra_cols=[
                "HouseNumber",
                "UnitNumber",
                "ResidenceZip",
                "MailingZip",
                "Phone",
                "PrecinctPart",
                "VRAZVoterID",
            ],
        )
        voter_columns = [c for c in main_df.columns if not c[0].isdigit()]
        history_columns = [c for c in main_df.columns if c[0].isdigit()]

        self.column_check(voter_columns)
        to_normalize = history_columns + [
            self.config["party_identifier"],
            self.config["voter_status"],
        ]
        for c in to_normalize:
            s = main_df[c].astype(str).str.strip().str.lower()
            s = s.str.encode("utf-8", errors="ignore").str.decode("utf-8")
            main_df.loc[(~main_df[c].isna()), c] = s.loc[(~main_df[c].isna())]
        for c in history_columns:
            main_df[c] = main_df[c].str.replace(" - ", "_")

        main_df[self.config["party_identifier"]] = main_df[
            self.config["party_identifier"]].map(
                lambda x: self.config["party_aliases"][x]
                if x in self.config["party_aliases"] else x)

        # handle history:
        sorted_codes = history_columns[::-1]
        hist_df = main_df[sorted_codes].copy()
        voter_df = main_df[voter_columns].copy()
        counts = (~hist_df.isna()).sum()
        sorted_codes_dict = {
            k: {
                "index": int(i),
                "count": int(counts[i]),
                "date": date_from_str(k),
            }
            for i, k in enumerate(sorted_codes)
        }

        hist_df.loc[:, "vote_codes"] = pd.Series(hist_df.values.tolist())
        hist_df.loc[:, "vote_codes"] = hist_df.loc[:, "vote_codes"].map(
            lambda x: [c for c in x if not pd.isna(c)])
        voter_df.loc[:, "votetype_history"] = hist_df.loc[:, "vote_codes"].map(
            lambda x: [c.split("_")[0] for c in x])
        voter_df.loc[:, "party_history"] = hist_df.loc[:, "vote_codes"].map(
            lambda x: [
                c.split("_")[1] if len(c.split("_")) > 1 else self.config[
                    "no_party_affiliation"] for c in x
            ])

        hist_df.drop(columns=["vote_codes"], inplace=True)
        for c in hist_df.columns:
            hist_df.loc[:, c] = hist_df.loc[:, c].map(
                lambda x: c if not pd.isna(x) else np.nan)
        voter_df.loc[:, "all_history"] = pd.Series(hist_df.values.tolist())
        voter_df.loc[:, "all_history"] = voter_df.loc[:, "all_history"].map(
            lambda x: [c for c in x if not pd.isna(c)])
        voter_df.loc[:, "sparse_history"] = voter_df.loc[:, "all_history"].map(
            insert_code_bin)

        expected_cols = (self.config["ordered_columns"] +
                         self.config["ordered_generated_columns"])
        voter_df = self.reconcile_columns(voter_df, expected_cols)
        voter_df = voter_df[expected_cols]

        self.meta = {
            "message": "arizona2_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(voter_df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
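A toy version of the wide-to-list history step above: per-election columns collapse into one list per voter, with NaNs dropped:

import numpy as np
import pandas as pd

hist_df = pd.DataFrame({
    "2018_general": ["early", np.nan],
    "2020_general": ["polls", "absentee"],
})

vote_codes = pd.Series(hist_df.values.tolist())
vote_codes = vote_codes.map(lambda x: [c for c in x if not pd.isna(c)])
print(vote_codes.tolist())  # [['early', 'polls'], ['absentee']]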
Example #9
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file, compression="unzip")

        if not self.ignore_checks:
            self.file_check(len(new_files))
        voter_file = (
            new_files[0] if "ElgbVtr" in new_files[0]["name"] else new_files[1]
        )
        hist_file = (
            new_files[0] if "VtHst" in new_files[0]["name"] else new_files[1]
        )

        df_hist = self.read_csv_count_error_lines(
            hist_file["obj"], header=None, error_bad_lines=False
        )
        df_hist.columns = self.config["hist_columns"]
        df_voters = self.read_csv_count_error_lines(
            voter_file["obj"], header=None, error_bad_lines=False
        )
        del self.main_file, self.temp_files, voter_file, hist_file, new_files
        gc.collect()

        try:
            df_voters.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info("Incorrect number of columns found for Nevada")
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(df_voters.columns),
            )

        sorted_codes = df_hist.date.unique().tolist()
        sorted_codes.sort(key=lambda x: datetime.strptime(x, "%m/%d/%Y"))
        counts = df_hist.date.value_counts()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts.loc[k]),
                "date": date_from_str(k),
            }
            for i, k in enumerate(sorted_codes)
        }

        def insert_code_bin(arr):
            if isinstance(arr, list):
                return [sorted_codes_dict[k]["index"] for k in arr]
            else:
                return np.nan

        df_voters = df_voters.set_index("VoterID", drop=False)
        voter_id_groups = df_hist.groupby("VoterID")
        df_voters["all_history"] = voter_id_groups["date"].apply(list)
        df_voters["votetype_history"] = voter_id_groups["vote_code"].apply(
            list
        )
        del df_hist, voter_id_groups
        gc.collect()

        df_voters["sparse_history"] = df_voters["all_history"].map(
            insert_code_bin
        )

        # create compound string for unique voter ID from county ID
        df_voters["County_Voter_ID"] = (
            df_voters["County"].str.replace(" ", "").str.lower()
            + "_"
            + df_voters["County_Voter_ID"].astype(int).astype(str)
        )
        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "Zip",
                "Phone",
                "Congressional_District",
                "Senate_District",
                "Assembly_District",
                "Education_District",
                "Regent_District",
                "Registered_Precinct",
            ],
        )
        df_voters = self.config.coerce_strings(df_voters)

        # standardize district data - over time these have varied from:
        #   "1" vs. "district 1" vs "cd1"/"sd1"/"ad1"
        digits = re.compile("\d+")
        def get_district_number_str(x):
            try:
                s = digits.search(x)
            except TypeError:
                return None
            if s is not None:
                return s.group()
            else:
                return None

        df_voters["Congressional_District"] = (
            df_voters["Congressional_District"].map(ensure_int_string)
        )
        df_voters["Senate_District"] = (
            df_voters["Senate_District"].map(ensure_int_string)
        )
        df_voters["Assembly_District"] = (
            df_voters["Assembly_District"].map(ensure_int_string)
        )
        df_voters["Congressional_District"] = (
            df_voters["Congressional_District"].map(get_district_number_str)
        )
        df_voters["Senate_District"] = (
            df_voters["Senate_District"].map(get_district_number_str)
        )
        df_voters["Assembly_District"] = (
            df_voters["Assembly_District"].map(get_district_number_str)
        )

        self.meta = {
            "message": "nevada_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        csv_obj = df_voters.to_csv(encoding="utf-8", index=False)
        del df_voters
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
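The district normalizer above, exercised on the three formats mentioned in the comment:

import re

digits = re.compile(r"\d+")

def get_district_number_str(x):
    try:
        s = digits.search(x)
    except TypeError:  # None / non-string values
        return None
    return s.group() if s is not None else None

for v in ["1", "district 1", "cd1", None]:
    print(get_district_number_str(v))  # 1, 1, 1, None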
Example #10
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(file_obj=self.main_file)

        if not self.ignore_checks:
            self.file_check(len(new_files))

        for i in new_files:
            logging.info("Loading file {}".format(i))
            if "_22" in i["name"]:
                df = self.read_csv_count_error_lines(
                    i["obj"],
                    encoding="latin-1",
                    compression="gzip",
                    error_bad_lines=False,
                )
            elif ".txt" in i["name"]:
                temp_df = self.read_csv_count_error_lines(
                    i["obj"],
                    encoding="latin-1",
                    compression="gzip",
                    error_bad_lines=False,
                )
                df = pd.concat([df, temp_df], axis=0)

        # create history meta data
        voting_history_cols = list(
            filter(
                lambda x: any(
                    [pre in x for pre in ("GENERAL-", "SPECIAL-", "PRIMARY-")]
                ),
                df.columns.values,
            )
        )
        self.column_check(list(set(df.columns) - set(voting_history_cols)))
        total_records = df.shape[0]
        sorted_codes = voting_history_cols
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(total_records - df[k].isna().sum()),
                "date": date_from_str(k),
            }
            for i, k in enumerate(voting_history_cols)
        }

        # ensure district and other numeric fields are e.g. "1" not "1.0"
        df["CONGRESSIONAL_DISTRICT"] = (
            df["CONGRESSIONAL_DISTRICT"].map(ensure_int_string)
        )
        df["STATE_REPRESENTATIVE_DISTRICT"] = (
            df["STATE_REPRESENTATIVE_DISTRICT"].map(ensure_int_string)
        )
        df["STATE_SENATE_DISTRICT"] = (
            df["STATE_SENATE_DISTRICT"].map(ensure_int_string)
        )
        df["COURT_OF_APPEALS"] = (
            df["COURT_OF_APPEALS"].map(ensure_int_string)
        )
        df["STATE_BOARD_OF_EDUCATION"] = (
            df["STATE_BOARD_OF_EDUCATION"].map(ensure_int_string)
        )
        df["RESIDENTIAL_ZIP"] = (
            df["RESIDENTIAL_ZIP"].map(ensure_int_string)
        )
        df["RESIDENTIAL_ZIP_PLUS4"] = (
            df["RESIDENTIAL_ZIP_PLUS4"].map(ensure_int_string)
        )
        df["MAILING_ZIP"] = (
            df["MAILING_ZIP"].map(ensure_int_string)
        )
        df["MAILING_ZIP_PLUS4"] = (
            df["MAILING_ZIP_PLUS4"].map(ensure_int_string)
        )

        self.meta = {
            "message": "ohio_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
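ensure_int_string is defined elsewhere in the package; a plausible stand-in (an assumption, not the real implementation) that shows the intent of the block above:

import numpy as np

def ensure_int_string(x):
    # hypothetical stand-in: render 1.0 as "1" so districts and ZIPs
    # don't carry a trailing ".0"; leave unparseable values untouched
    try:
        return str(int(float(x)))
    except (TypeError, ValueError):
        return x

print(ensure_int_string(1.0))     # 1
print(ensure_int_string("14"))    # 14
print(ensure_int_string(np.nan))  # nan (left as-is)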
Example #11
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(compression="unzip",
                                      file_obj=self.main_file)
        df_voter = pd.DataFrame(columns=self.config.raw_file_columns())
        df_hist = pd.DataFrame(columns=self.config["hist_columns"])
        df_master_voter = pd.DataFrame(
            columns=self.config["master_voter_columns"])
        master_vf_version = True

        def master_to_reg_df(df):
            df.columns = self.config["master_voter_columns"]
            df["STATUS"] = df["VOTER_STATUS"]
            df["PRECINCT"] = df["PRECINCT_CODE"]
            df["VOTER_NAME"] = (df["LAST_NAME"] + ", " + df["FIRST_NAME"] +
                                " " + df["MIDDLE_NAME"])
            df = pd.concat(
                [df,
                 pd.DataFrame(columns=self.config["blacklist_columns"])])
            df = df[self.config.processed_file_columns()]
            return df

        for i in new_files:
            if "Registered_Voters_List" in i["name"]:
                master_vf_version = False

        for i in new_files:
            if "Public" not in i["name"]:

                if ("Registered_Voters_List" in i["name"]
                        and not master_vf_version):
                    logging.info("reading in {}".format(i["name"]))
                    # Colorado sends us a couple of different encodings;
                    # the format detected as ascii will error out if not
                    # read in as latin-1. The format typically detected as
                    # utf-8-sig needs index_col explicitly set to False,
                    # or pandas will read the voterid column in as the
                    # index and the history won't apply.
                    encoding_result = chardet.detect(i["obj"].read(10000))
                    if encoding_result["encoding"] == "ascii":
                        encoding = "latin-1"
                        index_col = None
                    else:
                        encoding = encoding_result["encoding"]
                        index_col = False
                    i["obj"].seek(0)
                    df_voter = pd.concat(
                        [
                            df_voter,
                            self.read_csv_count_error_lines(
                                i["obj"],
                                encoding=encoding,
                                error_bad_lines=False,
                                index_col=index_col,
                            ),
                        ],
                        axis=0,
                    )

                elif ("Voting_History"
                      in i["name"]) or ("Coordinated_Voter_Details"
                                        in i["name"]):
                    if "Voter_Details" not in i["name"]:
                        logging.info("reading in {}".format(i["name"]))
                        new_df = self.read_csv_count_error_lines(
                            i["obj"],
                            compression="gzip",
                            error_bad_lines=False)
                        df_hist = pd.concat([df_hist, new_df], axis=0)

                    if "Voter_Details" in i["name"] and master_vf_version:
                        logging.info("reading in {}".format(i["name"]))
                        new_df = self.read_csv_count_error_lines(
                            i["obj"],
                            compression="gzip",
                            error_bad_lines=False)
                        if len(new_df.columns) < len(
                                self.config["master_voter_columns"]):
                            new_df.insert(10, "PHONE_NUM", np.nan)
                        try:
                            new_df.columns = self.config[
                                "master_voter_columns"]
                        except ValueError:
                            logging.info(
                                "Incorrect number of columns found for Colorado for file: {}"
                                .format(i["name"]))
                            raise MissingNumColumnsError(
                                "{} state is missing columns".format(
                                    self.state),
                                self.state,
                                len(self.config["master_voter_columns"]),
                                len(new_df.columns),
                            )
                        df_master_voter = pd.concat([df_master_voter, new_df],
                                                    axis=0)

        if df_voter.empty:
            df_voter = master_to_reg_df(df_master_voter)
        if df_hist.empty:
            raise ValueError("must supply a file containing voter history")
        df_hist["VOTING_METHOD"] = df_hist["VOTING_METHOD"].replace(np.nan, "")
        df_hist["ELECTION_DATE"] = pd.to_datetime(df_hist["ELECTION_DATE"],
                                                  format="%m/%d/%Y",
                                                  errors="coerce")
        df_hist.dropna(subset=["ELECTION_DATE"], inplace=True)
        df_hist["election_name"] = (df_hist["ELECTION_DATE"].astype(str) +
                                    "_" + df_hist["VOTING_METHOD"])

        valid_elections, counts = np.unique(df_hist["election_name"],
                                            return_counts=True)

        date_order = [
            idx for idx, election in sorted(
                enumerate(valid_elections),
                key=lambda x: datetime.strptime(x[1][0:10], "%Y-%m-%d"),
                reverse=True,
            )
        ]
        valid_elections = valid_elections[date_order]
        counts = counts[date_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }

        df_hist["array_position"] = df_hist["election_name"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        logging.info("Colorado: history apply")
        voter_groups = df_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["VOTING_METHOD"].apply(list)

        df_voter.dropna(subset=[self.config["voter_id"]], inplace=True)
        df_voter = df_voter.set_index(self.config["voter_id"])
        df_voter.sort_index(inplace=True)

        df_voter["all_history"] = all_history
        df_voter["vote_type"] = vote_type
        gc.collect()

        # at some point mailing address field names changed
        for num in ["1", "2", "3"]:
            if f"MAIL_ADDR{num}" in df_voter.columns:
                # if both are present, combine them
                if f"MAILING_ADDRESS_{num}" in df_voter.columns:
                    df_voter[f"MAILING_ADDRESS_{num}"] = np.where(
                        df_voter[f"MAILING_ADDRESS_{num}"].isnull(),
                        df_voter[f"MAIL_ADDR{num}"],
                        df_voter[f"MAILING_ADDRESS_{num}"],
                    )
                else:
                    df_voter[f"MAILING_ADDRESS_{num}"] = df_voter[
                        f"MAIL_ADDR{num}"]
                df_voter.drop(columns=[f"MAIL_ADDR{num}"], inplace=True)

        df_voter = self.config.coerce_strings(df_voter)
        df_voter = self.config.coerce_dates(df_voter)
        df_voter = self.config.coerce_numeric(
            df_voter,
            extra_cols=[
                "HOUSE_NUM",
                "UNIT_NUM",
                "RESIDENTIAL_ZIP_CODE",
                "RESIDENTIAL_ZIP_PLUS",
                "MAILING_ZIP_CODE",
                "MAILING_ZIP_PLUS",
                "PRECINCT_NAME",
                "PRECINCT",
                "MAILING_ADDRESS_3",
                "PHONE_NUM",
            ],
        )

        self.meta = {
            "message": "Colorado_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        gc.collect()
        logging.info("Colorado: writing out")
        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voter.to_csv(encoding="utf-8")),
            s3_bucket=self.s3_bucket,
        )
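The encoding sniff above in isolation: read a sample, detect, rewind, then parse with the chosen encoding (chardet is the third-party detector the example relies on):

import io

import chardet
import pandas as pd

obj = io.BytesIO("VOTER_ID,NAME\n1,Ana\n".encode("utf-8-sig"))

encoding_result = chardet.detect(obj.read(10000))
if encoding_result["encoding"] == "ascii":
    encoding, index_col = "latin-1", None
else:
    encoding, index_col = encoding_result["encoding"], False
obj.seek(0)

df = pd.read_csv(obj, encoding=encoding, index_col=index_col)
print(df.columns.tolist())  # ['VOTER_ID', 'NAME']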
Example #12
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        def is_first_file(fname):
            if "CD1" in fname:
                if ("Part1" in fname) or ("Part 1" in fname):
                    return True
            return False

        new_files = self.unpack_files(
            file_obj=self.main_file, compression="unzip"
        )
        logging.info("IOWA: reading in voter file")

        first_file = [f for f in new_files if is_first_file(f["name"])][0]
        remaining_files = [
            f for f in new_files if not is_first_file(f["name"])
        ]
        if not self.ignore_checks:
            # add 1 for the first file
            valid_files = len(remaining_files) + 1
            self.file_check(valid_files)

        buffer_cols = [
            "buffer0",
            "buffer1",
            "buffer2",
            "buffer3",
            "buffer4",
            "buffer5",
            "buffer6",
            "buffer7",
            "buffer8",
            "buffer9",
        ]

        # Reads the headers in on the first file given
        headers = pd.read_csv(first_file["obj"], nrows=1).columns

        # Gather the columns for renaming in order to fit the original schema in the database and then rename
        # so that the columns in the header will fit what is expected
        column_rename_dict = self.config["rename_columns"]
        normalized_headers = [
            x if x not in column_rename_dict else column_rename_dict[x]
            for x in headers
        ]
        normalized_headers = [x.replace(" ", "_") for x in normalized_headers]

        columns_to_check = [
            x
            for x in normalized_headers
            if x not in self.config["election_columns"]
        ]
        self.column_check(columns_to_check)

        # Add the buffer columns back in for lines that contain extra commas
        headers_with_buffers = normalized_headers + buffer_cols

        # Begin reading the file with the correct headers
        df_voters = self.read_csv_count_error_lines(
            first_file["obj"],
            skiprows=1,
            header=None,
            names=headers_with_buffers,
            error_bad_lines=False,
        )

        for i in remaining_files:
            skiprows = 1 if "Part1" in i["name"] else 0
            new_df = self.read_csv_count_error_lines(
                i["obj"],
                header=None,
                skiprows=skiprows,
                names=headers_with_buffers,
                error_bad_lines=False,
            )
            df_voters = pd.concat([df_voters, new_df], axis=0)

        key_delim = "_"
        df_voters["all_history"] = ""
        df_voters = df_voters[df_voters.COUNTY != "COUNTY"]

        # instead of iterating over all of the columns for each row, we should
        # handle all this beforehand.
        # also we should not compute the unique values until after, not before
        df_voters.drop(columns=buffer_cols, inplace=True)

        for c in self.config["election_dates"]:
            null_rows = df_voters[c].isnull()
            df_voters.loc[null_rows, c] = ""

            # each key contains info from the columns
            prefix = c.split("_")[0] + key_delim

            # and the corresponding votervotemethod column
            vote_type_col = c.replace("ELECTION_DATE", "VOTERVOTEMETHOD")
            null_rows = df_voters[vote_type_col].isnull()
            df_voters.loc[null_rows, vote_type_col] = ""
            # add election type and date
            df_voters[c] = prefix + df_voters[c].str.strip()
            # add voting method
            df_voters[c] += key_delim + df_voters[vote_type_col].str.strip()

            # the code below will format each key as
            # <election_type>_<date>_<voting_method>_<political_party>_
            # <political_org>
            if "PRIMARY" in prefix:

                # so far so good but we need more columns in the event of a
                # primary
                org_col = c.replace(
                    "PRIMARY_ELECTION_DATE", "POLITICAL_ORGANIZATION"
                )
                party_col = c.replace(
                    "PRIMARY_ELECTION_DATE", "POLITICAL_PARTY"
                )
                df_voters[org_col].loc[df_voters[org_col].isnull()] = ""
                df_voters[party_col].loc[df_voters[party_col].isnull()] = ""
                party_info = (
                    df_voters[party_col].str.strip()
                    + key_delim
                    + df_voters[org_col].str.replace(" ", "")
                )
                df_voters[c] += key_delim + party_info
            else:
                # add 'blank' values for the primary slots
                df_voters[c] += key_delim + key_delim

            df_voters[c] = df_voters[c].str.replace(prefix + key_delim * 3, "")
            df_voters[c] = df_voters[c].str.replace('"', "")
            df_voters[c] = df_voters[c].str.replace("'", "")
            df_voters.all_history += " " + df_voters[c]

        # make into an array (null values are '' so they are ignored)
        df_voters.all_history = df_voters.all_history.str.split()
        elections, counts = np.unique(
            df_voters[self.config["election_dates"]], return_counts=True
        )
        # we want reverse order (lower indices are higher frequency)
        count_order = counts.argsort()[::-1]
        elections = elections[count_order]
        counts = counts[count_order]

        # create meta
        sorted_codes_dict = {
            j: {"index": i, "count": int(counts[i]), "date": date_from_str(j)}
            for i, j in enumerate(elections)
        }

        default_item = {"index": len(elections)}

        def ins_code_bin(a):
            return [sorted_codes_dict.get(k, default_item)["index"] for k in a]

        # In an instance like this, where we've created our own systematized
        # labels for each election, I think it makes sense to keep them
        # in addition to the sparse history
        df_voters["sparse_history"] = df_voters.all_history.apply(ins_code_bin)

        self.meta = {
            "message": "iowa_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(elections.tolist()),
        }
        for c in df_voters.columns:
            df_voters[c].loc[df_voters[c].isnull()] = ""

        for c in df_voters.columns:
            df_voters[c] = (
                df_voters[c]
                .astype(str)
                .str.encode("utf-8", errors="ignore")
                .str.decode("utf-8")
            )

        df_voters = self.config.coerce_dates(df_voters)
        df_voters = self.config.coerce_numeric(
            df_voters,
            extra_cols=[
                "COMMUNITY_COLLEGE",
                "COMMUNITY_COLLEGE_DIRECTOR",
                "LOSST_CONTIGUOUS_CITIES",
                "PRECINCT",
                "SANITARY",
                "SCHOOL_DIRECTOR",
                "UNIT_NUM",
            ],
        )
        # force reg num to be integer
        df_voters["REGN_NUM"] = pd.to_numeric(
            df_voters["REGN_NUM"], errors="coerce"
        ).fillna(0)
        df_voters["REGN_NUM"] = df_voters["REGN_NUM"].astype(int)

        # Drop the election columns because they are no longer needed
        df_voters.drop(columns=self.config["election_columns"], inplace=True)

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(df_voters.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
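The .get(k, default_item) lookup above maps any key missing from the metadata to a shared out-of-range index instead of raising KeyError (the key strings here are hypothetical but follow the composite-label format described in the comments):

sorted_codes_dict = {"GEN_2020-11-03_IP__": {"index": 0}}
default_item = {"index": 1}  # len(elections)

def ins_code_bin(a):
    return [sorted_codes_dict.get(k, default_item)["index"] for k in a]

print(ins_code_bin(["GEN_2020-11-03_IP__", "unknown_key"]))  # [0, 1]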
Example #13
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(self.main_file)
        precincts_file = ([
            x for x in new_files if 'precincts' in x["name"].lower()
        ] + [None])[0]
        if precincts_file is None:
            raise ValueError("Missing Precincts File")
        voter_files = list(
            filter(lambda v: re.search('cty[0-9]+_vr.csv', v["name"].lower()),
                   new_files))
        self.file_check(len(voter_files) + 1)
        hist_files = list(
            filter(lambda v: re.search('cty[0-9]+_vh.csv', v["name"].lower()),
                   new_files))
        vdf = pd.DataFrame()
        hdf = pd.DataFrame()
        dtypes = self.config['dtypes']
        cty_map = dict([(value, key)
                        for key, value in self.config['county_codes'].items()])

        # Returns the string county name for the county code contained in
        # the first two characters of the precinct string
        def county_map(pct):
            def mapping(prec):
                county = cty_map[prec[:2]]
                return county

            return pd.Series(map(mapping, pct.tolist()))

        for file in voter_files:
            if "vr.csv" in file["name"].lower():
                temp_vdf = pd.read_csv(file["obj"],
                                       encoding='latin',
                                       dtype=dtypes)
                vdf = pd.concat([vdf, temp_vdf], ignore_index=True)
        vdf.drop_duplicates(inplace=True)

        # Read and merge the precincts file to the main df
        precinct_dtypes = {
            'PrecinctCode': 'string',
            'CongressionalDistrict': 'int64',
            'StateSenateDistrict': 'int64',
            'StateHouseDistrict': 'int64',
            'CountyCommissioner': 'int64',
            'PollSite': 'string'
        }
        precincts = pd.read_csv(precincts_file["obj"],
                                encoding='latin',
                                dtype=precinct_dtypes)
        precincts.rename(columns={"PrecinctCode": "Precinct"}, inplace=True)
        if precincts.empty:
            raise ValueError("Missing Precicnts file")
        vdf = vdf.merge(precincts, how='left', on='Precinct')

        # Add the county column
        vdf['County'] = county_map(vdf['Precinct'])

        # At one point OK added some columns, this adds them to older files for backwards compatibility
        self.reconcile_columns(vdf, self.config["columns"])
        for file in hist_files:
            temp_hdf = pd.read_csv(file["obj"], dtype={'VoterID': 'string'})
            hdf = pd.concat(
                [hdf, temp_hdf],
                ignore_index=True,
            )

        valid_elections, counts = np.unique(hdf["ElectionDate"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]
        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }
        hdf["array_position"] = hdf["ElectionDate"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))

        # The hist columns in the vdf are unnecessary because we get a
        # separate hist file that is more complete.
        hist_columns = [
            col for col in vdf.columns
            if "voterhist" in col.lower() or "histmethod" in col.lower()
        ]
        vdf = self.config.coerce_numeric(vdf)
        vdf = self.config.coerce_strings(vdf)
        vdf = self.config.coerce_dates(vdf)
        vdf.drop(columns=hist_columns, inplace=True)
        vdf.set_index(self.config["voter_id"], drop=False, inplace=True)
        voter_groups = hdf.groupby(self.config["voter_id"])
        vdf["all_history"] = voter_groups["ElectionDate"].apply(list)
        vdf["sparse_history"] = voter_groups["array_position"].apply(list)
        vdf["votetype_history"] = voter_groups["VotingMethod"].apply(list)

        self.meta = {
            "message": "oklahoma_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(vdf.to_csv(encoding="utf-8", index=False)),
            s3_bucket=self.s3_bucket,
        )
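The precinct-to-county mapping above on toy data (the county codes here are hypothetical): the first two characters of each precinct code index into the inverted county_codes lookup:

import pandas as pd

cty_map = {"01": "Adair", "72": "Tulsa"}  # hypothetical codes

def county_map(pct):
    def mapping(prec):
        return cty_map[prec[:2]]

    return pd.Series(map(mapping, pct.tolist()))

print(county_map(pd.Series(["720012", "010001"])).tolist())
# ['Tulsa', 'Adair']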
Example #14
    def execute(self):
        if self.raw_s3_file is not None:
            self.main_file = self.s3_download()

        new_files = self.unpack_files(
            file_obj=self.main_file)  # array of dicts
        del self.main_file, self.temp_files
        gc.collect()

        if not self.ignore_checks:
            self.file_check(len(new_files))

        for i in new_files:
            if ("ncvhis" in i["name"]) and (".txt" in i["name"]):
                vote_hist_file = i
            elif ("ncvoter" in i["name"]) and (".txt" in i["name"]):
                voter_file = i
        voter_df = self.read_csv_count_error_lines(
            voter_file["obj"],
            sep="\t",
            quotechar='"',
            encoding="latin-1",
            error_bad_lines=False,
        )
        del voter_file
        gc.collect()

        vote_hist = self.read_csv_count_error_lines(
            vote_hist_file["obj"],
            sep="\t",
            quotechar='"',
            error_bad_lines=False,
        )
        del vote_hist_file, new_files
        gc.collect()

        try:
            voter_df.columns = self.config["ordered_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the voter file in North Carolina"
            )
            raise MissingNumColumnsError(
                "{} state is missing columns".format(self.state),
                self.state,
                len(self.config["ordered_columns"]),
                len(voter_df.columns),
            )
        try:
            vote_hist.columns = self.config["hist_columns"]
        except ValueError:
            logging.info(
                "Incorrect number of columns found for the history file in North Carolina"
            )
            raise

        valid_elections, counts = np.unique(vote_hist["election_desc"],
                                            return_counts=True)
        count_order = counts.argsort()[::-1]
        valid_elections = valid_elections[count_order]
        counts = counts[count_order]

        sorted_codes = valid_elections.tolist()
        sorted_codes_dict = {
            k: {
                "index": i,
                "count": int(counts[i]),
                "date": date_from_str(k)
            }
            for i, k in enumerate(sorted_codes)
        }
        vote_hist["array_position"] = vote_hist["election_desc"].map(
            lambda x: int(sorted_codes_dict[x]["index"]))
        del valid_elections, counts, count_order
        gc.collect()

        voter_groups = vote_hist.groupby(self.config["voter_id"])
        all_history = voter_groups["array_position"].apply(list)
        vote_type = voter_groups["voting_method"].apply(list)

        voter_df = voter_df.set_index(self.config["voter_id"])

        voter_df["all_history"] = all_history
        voter_df["vote_type"] = vote_type
        del voter_groups, vote_hist, all_history, vote_type
        gc.collect()

        voter_df = self.config.coerce_strings(voter_df)
        voter_df = self.config.coerce_dates(voter_df)
        voter_df = self.config.coerce_numeric(
            voter_df,
            extra_cols=[
                "county_commiss_abbrv",
                "fire_dist_abbrv",
                "full_phone_number",
                "judic_dist_abbrv",
                "munic_dist_abbrv",
                "municipality_abbrv",
                "precinct_abbrv",
                "precinct_desc",
                "school_dist_abbrv",
                "super_court_abbrv",
                "township_abbrv",
                "township_desc",
                "vtd_abbrv",
                "vtd_desc",
                "ward_abbrv",
            ],
        )

        self.meta = {
            "message": "north_carolina_{}".format(datetime.now().isoformat()),
            "array_encoding": json.dumps(sorted_codes_dict),
            "array_decoding": json.dumps(sorted_codes),
        }
        self.is_compressed = False

        csv_obj = voter_df.to_csv(encoding="utf-8", index=True)
        del voter_df
        gc.collect()

        self.processed_file = FileItem(
            name="{}.processed".format(self.config["state"]),
            io_obj=StringIO(csv_obj),
            s3_bucket=self.s3_bucket,
        )
        del csv_obj
        gc.collect()
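A toy version of the history attachment above: the per-voter lists produced by groupby(...).apply(list) align onto the voter frame through the shared index, and voters with no history get NaN:

import pandas as pd

vote_hist = pd.DataFrame({
    "ncid": ["A1", "A1", "B2"],
    "array_position": [0, 2, 1],
})
voter_df = pd.DataFrame({"ncid": ["A1", "B2", "C3"]}).set_index("ncid")

voter_df["all_history"] = vote_hist.groupby("ncid")["array_position"].apply(list)
print(voter_df["all_history"].tolist())  # [[0, 2], [1], nan]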