def check_ru_file(juris_path: str, juris_true_name: str) -> Optional[dict]:
    """Check ReportingUnit.txt for internal consistency: every parent of a
    composite name must have its own row, every name must start with the
    jurisdiction's true name, and names must be unique. Returns an error
    dictionary, or None if no problems are found."""
    err = None
    ru = get_element(juris_path, "ReportingUnit")

    # create set of all parents, all leading rus
    parents = set()
    leadings = set()
    for _, r in ru.iterrows():
        components = r["Name"].split(";")
        parents.update(
            {";".join(components[:j + 1])
             for j in range(len(components) - 1)})
        leadings.add(components[0])

    # identify and report parents that are missing from ReportingUnit.txt
    names = set(ru["Name"].unique())
    missing = [p for p in parents if p not in names]
    missing.sort(reverse=True)
    if missing:
        m_str = "\n".join(missing)
        err = ui.add_new_error(
            err,
            "jurisdiction",
            Path(juris_path).name,
            f"Some parent reporting units are missing from ReportingUnit.txt:\n{m_str}",
        )

    # check that all reporting units start with true name
    bad = [j for j in leadings if j != juris_true_name]
    if bad:
        bad.sort(reverse=True)
        bad_str = "\n".join(bad)
        err = ui.add_new_error(
            err,
            "jurisdiction",
            Path(juris_path).name,
            f"Every ReportingUnit should start with the jurisdiction name. These do not:\n{bad_str}",
        )

    # check that there are no duplicate Names
    ru_freq = ru.groupby(["Name"]).count()
    duped = ru_freq[ru_freq["ReportingUnitType"] > 1]
    if not duped.empty:
        dupe_str = "\n".join(list(duped.index.unique()))
        err = ui.add_new_error(
            err,
            "jurisdiction",
            Path(juris_path).name,
            f"\nReportingUnit Names must be unique. These are listed on more than one row:\n{dupe_str}",
        )

    return err
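
# Example for check_ru_file (illustrative sketch; the path is hypothetical).
# ReportingUnit names are semicolon-delimited composites, so a row such as
# "North Carolina;Wake County;Precinct 01" requires that "North Carolina" and
# "North Carolina;Wake County" also appear as rows:
#
#     ru_err = check_ru_file(
#         "/path/to/content/jurisdictions/North-Carolina", "North Carolina")
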
def check_dictionary(dictionary_path: str) -> Optional[dict]:
    """Check dictionary.txt for null entries, duplicate
    cdf_element/raw_identifier_value pairs, and candidate names that collide
    after regularization. Returns an error dictionary, or None."""
    err = None
    dictionary_dir = Path(dictionary_path).parent.name

    # dedupe the dictionary
    clean_and_dedupe(dictionary_path, clean_candidates=True)
    # check that no entry is null
    df = pd.read_csv(dictionary_path,
                     **constants.standard_juris_csv_reading_kwargs)
    null_mask = df.isnull().any(axis=1)
    if null_mask.any():
        # drop null rows and report error
        err = ui.add_new_error(
            err,
            "jurisdiction",
            dictionary_dir,
            f"dictionary.txt has some null entries:\n{df[null_mask]}",
        )
        df = df[~null_mask]

    # check that cdf_element-raw_identifier_value pairs are unique
    two_column_df = df[["cdf_element", "raw_identifier_value"]]
    dupes_df, _ = ui.find_dupes(two_column_df)
    if not dupes_df.empty:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            dictionary_dir,
            f"dictionary.txt has more than one entry for each of these:\n {dupes_df}",
        )
    # check that there are no candidate dupes after regularization
    cands = two_column_df[two_column_df.cdf_element == "Candidate"].copy()
    cands["regular"] = m.regularize_candidate_names(cands.raw_identifier_value)
    dupe_reg = list()
    for reg in cands.regular.unique():
        all_match = cands[cands.regular == reg].copy()
        if all_match.shape[0] > 1:
            dupe_reg.append(
                f"{reg} is regular version of: {list(all_match.raw_identifier_value.unique())}"
            )
    if dupe_reg:
        dupe_str = "\n".join(dupe_reg)
        err = ui.add_new_error(
            err,
            "jurisdiction",
            dictionary_dir,
            f"Some raw candidate names match after regularization, "
            f"so are effectively dupes and should be deduped.:\n{dupe_str}",
        )
    return err
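
# Example for check_dictionary (illustrative sketch; the path is
# hypothetical). dictionary.txt is tab-separated with columns cdf_element,
# cdf_internal_name and raw_identifier_value:
#
#     dict_err = check_dictionary(
#         "/path/to/content/jurisdictions/North-Carolina/dictionary.txt")
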
def add_candidates(
    juris_sys_name: str,
    repo_content_root: str,
    candidate_list: List[str],
    normal: Dict[str, str],
) -> Optional[Dict[str, Any]]:
    """Append the candidates in <candidate_list> to the jurisdiction's
    Candidate.txt and dictionary.txt, using <normal> to map each raw
    candidate string to its normalized BallotName."""
    err = None
    juris_path = os.path.join(repo_content_root, "jurisdictions",
                              juris_sys_name)
    try:
        old_df = juris.get_element(juris_path, "Candidate")
        old_dictionary = juris.get_element(juris_path, "dictionary")
        new_df = pd.DataFrame([[normal[x], x] for x in candidate_list],
                              columns=["BallotName", "raw"])
        new_dict = new_df.rename(columns={
            "BallotName": "cdf_internal_name",
            "raw": "raw_identifier_value"
        })
        new_dict["cdf_element"] = "Candidate"
        juris.write_element(juris_path, "Candidate",
                            pd.concat([old_df, new_df[["BallotName"]]]))
        juris.write_element(juris_path, "dictionary",
                            pd.concat([old_dictionary, new_dict]))
    except Exception as exc:
        err = ui.add_new_error(err, "jurisdiction", juris_sys_name,
                               f"Error adding candidates: {exc}")
    return err
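
# Example for add_candidates (illustrative sketch; names and paths are
# hypothetical). <normal> maps each raw candidate string to its normalized
# BallotName:
#
#     cand_err = add_candidates(
#         "North-Carolina",
#         "/path/to/content",
#         ["SMITH, JANE"],
#         {"SMITH, JANE": "Jane Smith"},
#     )
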
def write_element(juris_path: str,
                  element: str,
                  df: pd.DataFrame,
                  file_name=None) -> Optional[dict]:
    """<juris_path> is the path to the target directory. Overwrites
    <element>.txt (or <file_name>, if given) in that directory with the
    deduplicated contents of <df>."""
    err = None
    # set name of target file
    if not file_name:
        file_name = f"{element}.txt"
    # dedupe the input df
    _, deduped = ui.find_dupes(df)

    if element == "dictionary":
        # remove empty lines
        deduped = remove_empty_lines(deduped, element)
    try:
        # write info to file (note: this overwrites existing info in file!)
        deduped.drop_duplicates().fillna("").to_csv(
            os.path.join(juris_path, file_name),
            index=False,
            sep="\t",
            encoding=constants.default_encoding,
        )
    except Exception as e:
        err = ui.add_new_error(
            err,
            "system",
            "REMOVEpreparation.write_element",
            f"Unexpected exception writing to file: {e}",
        )
    return err
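
# Example for write_element (illustrative sketch; the path and the Party.txt
# column layout are assumptions):
#
#     party_df = pd.DataFrame(
#         {"Name": ["Democratic Party", "Republican Party"]})
#     write_err = write_element(
#         "/path/to/content/jurisdictions/North-Carolina", "Party", party_df)
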
def load_to_db(session, element: str,
               element_file: str) -> Optional[Dict[str, Any]]:
    """Read the tab-separated <element_file> and insert its rows into the
    <element> table of the database attached to <session>."""
    err = None
    try:
        e_df = pd.read_csv(element_file, sep="\t")
        err = db.insert_to_cdf_db(session.bind, e_df, element, "database",
                                  session.bind.url.database)

    except Exception as exc:
        err = ui.add_new_error(
            err,
            "file",
            element_file,
            f"Error adding {element}s to database {session.bind.url.database}: {exc}",
        )
    return err
def add_dictionary_entries(
    juris_sys_name: str,
    repo_content_root: str,
    element: str,
    p_map: Dict[str, str],
) -> Optional[dict]:
    """Append one dictionary.txt row per entry of <p_map>, which maps each
    internal name to its raw identifier, for the given <element>."""
    err = None
    juris_path = os.path.join(repo_content_root, "jurisdictions",
                              juris_sys_name)
    try:
        old_dictionary = juris.get_element(juris_path, "dictionary")
        new_dict = pd.DataFrame(
            [[element, internal, raw] for internal, raw in p_map.items()],
            columns=[
                "cdf_element", "cdf_internal_name", "raw_identifier_value"
            ],
        )
        juris.write_element(
            juris_path,
            "dictionary",
            pd.concat([old_dictionary, new_dict
                       ]).sort_values(by=["cdf_element", "cdf_internal_name"]),
        )
    except Exception as exc:
        err = ui.add_new_error(err, "jurisdiction", juris_sys_name,
                               f"Error adding {element}: {exc}")
    return err
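
# Example for add_dictionary_entries (illustrative sketch; names and paths
# are hypothetical). <p_map> maps internal names to raw identifiers as they
# appear in results files:
#
#     dict_err = add_dictionary_entries(
#         "North-Carolina",
#         "/path/to/content",
#         "Party",
#         {"Democratic Party": "DEM", "Republican Party": "REP"},
#     )
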
def df_from_tree(
    tree: lxml_et.ElementTree,
    main_path: str,
    main_attrib: Optional[str],
    xml_path_info: Dict[str, Dict[str, Optional[str]]],
    file_name: str,
    ns: Optional[str],
    lookup_id: Optional[str] = None,
) -> (pd.DataFrame, Optional[dict]):
    """Reads all counts (or lookup_ids, if given), along with info from munge string paths
    ((tag, attr) for each element), into a dataframe.
    If main_attrib is None, reads from text value of element; otherwise from attribute."""
    # create parent lookup
    parent = {c: p for p in tree.iter() for c in p}
    if ns:
        ns = f"{{{ns}}}"
    else:
        ns = ""
    with_ns = [f"{ns}{s}" for s in main_path.split("/")]
    head = with_ns[0]
    tail = "/".join(with_ns[1:])
    root = tree.getroot()
    if root.tag == head:
        err = None
    else:
        err = ui.add_new_error(
            None,
            "file",
            file_name,
            f"Root element of file is not {head}, as expected per munger",
        )
        return pd.DataFrame(), err
    df_rows = list()
    for driver in root.findall(tail):
        if lookup_id:
            if main_attrib:
                row = {lookup_id: driver.attrib[main_attrib]}
            else:
                row = {lookup_id: driver.text}
        else:
            if main_attrib:
                row = {"Count": int(driver.attrib[main_attrib])}
            else:
                row = {"Count": int(driver.text)}

        ancestor = driver
        while ancestor is not None:
            for field in xml_path_info.keys():
                if xml_path_info[field]["local_root_tag"] == ancestor.tag:
                    if xml_path_info[field]["attrib"]:
                        try:
                            row[field] = ancestor.attrib[xml_path_info[field]
                                                         ["attrib"]]
                        except KeyError:
                            pass
                    else:
                        row[field] = ancestor.find(
                            xml_path_info[field]["tail"]).text
            if ancestor in parent.keys():
                ancestor = parent[ancestor]
            else:
                ancestor = None
        df_rows.append(row)
    df = pd.DataFrame(df_rows)
    return df, err
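
# Example for df_from_tree (illustrative sketch; tags, attributes and paths
# are hypothetical). Each field of <xml_path_info> names the tag of the
# ancestor element holding its value ("local_root_tag"), plus either an
# "attrib" name or, when "attrib" is None, a relative "tail" path to a
# text-bearing descendant:
#
#     xml_path_info = {
#         "CandidateContest": {
#             "local_root_tag": "Contest", "attrib": "name", "tail": None},
#         "Candidate": {
#             "local_root_tag": "Choice", "attrib": None, "tail": "ChoiceName"},
#     }
#     df, err = df_from_tree(tree, "Results/Contest/Choice/VoteType", "votes",
#                            xml_path_info, "results.xml", None)
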
def nist_v2_xml_export_tree(
    session: Session,
    election: str,
    jurisdiction: str,
    rollup_subdivision_type: Optional[str] = None,
    issuer: str = constants.default_issuer,
    issuer_abbreviation: str = constants.default_issuer_abbreviation,
    status: str = constants.default_status,
    vendor_application_id: str = constants.default_vendor_application_id,
) -> (ET.ElementTree, Dict[str, Any]):
    """Creates a tree in the NIST common data format (V2) containing the results
    from the given election and jurisdiction. Note that all available results will
    be exported. I.e., if database has precinct-level results, the tree will
    contain precinct-level results.
    Major subdivision for rollup is <rollup_subdivision_type> ;
    """
    err = None
    # set up
    election_id = db.name_to_id(session, "Election", election)
    jurisdiction_id = db.name_to_id(session, "ReportingUnit", jurisdiction)
    if not election_id or not jurisdiction_id:
        err = ui.add_new_error(
            err,
            "database",
            session.bind.url.database,
            f"One or more of election {election} or jurisdiction {jurisdiction} not found in database",
        )
        tree = ET.ElementTree()
        return tree, err

    # include jurisdiction id in gp unit ids
    gpu_idxs = {jurisdiction_id}

    # get vote count data (if rollup_subdivision_type is None, no rollup will happen)
    results_df = db.read_vote_count_nist(
        session,
        election_id,
        jurisdiction_id,
        rollup_ru_type=rollup_subdivision_type)

    # collect ids for gp units that have vote counts, gp units that are election districts
    gpu_idxs.update(results_df.ReportingUnit_Id.unique())
    gpu_idxs.update(results_df.ElectionDistrict_Id.unique())

    # ElectionReport (root)
    attr = {
        "xmlns:xsi": "http://www.w3.org/2001/XMLSchema-instance",
        "xsi:schemaLocation": constants.nist_schema_location,
        "xmlns": constants.nist_namespace,
    }
    root = ET.Element("ElectionReport", attr)

    # add election sub-element of ElectionReport
    e_elt = ET.SubElement(root, "Election")

    # other sub-elements of ElectionReport
    ET.SubElement(
        root, "Format").text = "summary-contest"  # NB NIST restricts choices
    ET.SubElement(root, "GeneratedDate").text = datetime.now(
        tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")

    # get name, ru-type and composing info for all gpus
    rus = pd.read_sql("ReportingUnit", session.bind, index_col="Id")
    cruj = pd.read_sql("ComposingReportingUnitJoin",
                       session.bind,
                       index_col="Id")

    # add each gpu
    for idx in gpu_idxs:
        name = rus.loc[idx]["Name"]

        children = [
            f"oid{x}" for x in cruj[cruj["ParentReportingUnit_Id"] == idx]
            ["ChildReportingUnit_Id"].unique() if x in gpu_idxs and x != idx
        ]
        attr = {
            "ObjectId": f"oid{idx}",
            "xsi:type": "ReportingUnit",
        }
        gpu_elt = ET.SubElement(root, "GpUnit", attr)
        if children:
            children_elt = ET.SubElement(gpu_elt, "ComposingGpUnitIds")
            children_elt.text = " ".join(children)
        gpu_name = ET.SubElement(gpu_elt, "Name")
        ET.SubElement(gpu_name, "Text", {"Language": "en"}).text = name
        ru_type = rus.loc[idx]["ReportingUnitType"]
        if ru_type in constants.nist_standard["ReportingUnitType"]:
            ET.SubElement(gpu_elt, "Type").text = ru_type
        else:
            ET.SubElement(gpu_elt, "Type").text = "other"
            ET.SubElement(gpu_elt, "OtherType").text = ru_type

    # other sub-elements of ElectionReport
    ET.SubElement(root, "Issuer").text = issuer
    ET.SubElement(root, "IssuerAbbreviation").text = issuer_abbreviation

    # add each party
    party_df = results_df[["Party_Id", "PartyName"]].drop_duplicates()
    for i, p in party_df.iterrows():
        p_elt = ET.SubElement(
            root,
            "Party",
            {
                "ObjectId": f'oid{p["Party_Id"]}',
            },
        )
        p_name_elt = ET.SubElement(p_elt, "Name")
        ET.SubElement(p_name_elt, "Text", {
            "Language": "en"
        }).text = p["PartyName"]

    # still more sub-elements of ElectionReport
    ET.SubElement(root, "SequenceStart").text = "1"  # TODO placeholder
    ET.SubElement(root, "SequenceEnd").text = "1"  # TODO placeholder
    ET.SubElement(root, "Status").text = status
    ET.SubElement(root, "VendorApplicationId").text = vendor_application_id

    # add each candidate (as sub-element of Election)
    candidate_df = results_df[["Candidate_Id", "BallotName",
                               "Party_Id"]].drop_duplicates()
    for i, can in candidate_df.iterrows():
        can_elt = ET.SubElement(e_elt, "Candidate",
                                {"ObjectId": f'oid{can["Candidate_Id"]}'})
        bn_elt = ET.SubElement(can_elt, "BallotName")
        ET.SubElement(bn_elt, "Text", {
            "Language": "en"
        }).text = can["BallotName"]
        party_id_elt = ET.SubElement(can_elt, "PartyId")
        party_id_elt.text = f'oid{can["Party_Id"]}'

    # add each contest (as sub-element of Election)
    contest_df = results_df[[
        "Contest_Id", "ContestName", "ContestType", "ElectionDistrict_Id"
    ]].drop_duplicates()
    for i, con in contest_df.iterrows():
        # create element for the contest
        attr = {
            "ObjectId": f'oid{con["Contest_Id"]}',
            "xsi:type": f'{con["ContestType"]}Contest',
        }
        con_elt = ET.SubElement(e_elt, "Contest", attr)

        # create ballot selection sub-elements
        # TODO (remove assumption that it's a CandidateContest)
        selection_idxs = results_df[
            results_df.Contest_Id ==
            con["Contest_Id"]]["Selection_Id"].unique()
        for s_idx in selection_idxs:
            attr = {
                "ObjectId": f"oid{s_idx}",
                "xsi:type": "CandidateSelection",
            }
            cs_elt = ET.SubElement(con_elt, "ContestSelection", attr)
            vc_df = results_df[(results_df.Contest_Id == con["Contest_Id"])
                               & (results_df.Selection_Id == s_idx)][[
                                   "ReportingUnit_Id",
                                   "Candidate_Id",
                                   "CountItemType",
                                   "Count",
                               ]].drop_duplicates()
            for idx, vc in vc_df.iterrows():
                vote_counts_elt = ET.SubElement(cs_elt, "VoteCounts")
                # create GpUnitId sub-element
                ET.SubElement(vote_counts_elt,
                              "GpUnitId").text = f'oid{vc["ReportingUnit_Id"]}'
                # create Type sub-elements (for CountItemType)
                if vc["CountItemType"] in constants.nist_standard[
                        "CountItemType"]:
                    ET.SubElement(vote_counts_elt,
                                  "Type").text = vc["CountItemType"]
                else:
                    ET.SubElement(vote_counts_elt, "Type").text = "other"
                    ET.SubElement(vote_counts_elt,
                                  "OtherType").text = vc["CountItemType"]
                # create Count sub-element
                ET.SubElement(vote_counts_elt, "Count").text = str(vc["Count"])

            candidate_ids = " ".join(
                [f"oid{x}" for x in vc_df.Candidate_Id.unique()])
            ET.SubElement(cs_elt, "CandidateIds").text = candidate_ids

        # create ElectionDistrictId sub-element
        ET.SubElement(
            con_elt,
            "ElectionDistrictId").text = f'oid{con["ElectionDistrict_Id"]}'

        # create Name sub-element
        ET.SubElement(con_elt, "Name").text = con["ContestName"]

        # create VotesAllowed sub-element
        ET.SubElement(con_elt, "VotesAllowed").text = "1"
        # TODO tech debt allow arbitrary "votes allowed

    # election scope (geographic unit for whole election)
    ET.SubElement(e_elt, "ElectionScopeId").text = f"oid{jurisdiction_id}"

    # add properties of particular election
    # NB we assume only one election!
    election_df = pd.read_sql_table("Election", session.bind, index_col="Id")
    election_name_elt = ET.SubElement(e_elt, "Name")
    ET.SubElement(election_name_elt, "Text", {
        "Language": "en"
    }).text = election_df.loc[election_id]["Name"]
    ET.SubElement(e_elt, "StartDate").text = "1900-01-01"  # placeholder
    ET.SubElement(e_elt, "EndDate").text = "1900-01-01"  # placeholder

    # election type
    e_type = election_df.loc[election_id]["ElectionType"]
    if e_type in constants.nist_standard["ElectionType"]:
        ET.SubElement(e_elt, "Type").text = e_type
    else:
        ET.SubElement(e_elt, "Type").text = "other"
        ET.SubElement(e_elt, "OtherType").text = e_type

    tree = ET.ElementTree(root)
    return tree, err
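
# Example for nist_v2_xml_export_tree (illustrative sketch; the election and
# jurisdiction names are hypothetical):
#
#     tree, err = nist_v2_xml_export_tree(
#         session, "2020 General", "North Carolina",
#         rollup_subdivision_type="county")
#     if not ui.fatal_error(err):
#         tree.write("nc_2020_general.xml",
#                    encoding="utf-8",
#                    xml_declaration=True)
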
def add_candidate_contests(
    juris_path: str,
    df: pd.DataFrame,
    file_path: str,
) -> Optional[dict]:
    """
    Inputs:
        juris_path: str, path to directory containing info for jurisdiction
        df: pd.DataFrame, dataframe with info for candidate contests
            (ContestName,NumberElected,OfficeName,PrimaryParty,ElectionDistrict, ReportingUnitType)
        file_path: str, for error reporting, the path of the file from which the dataframe was taken

    Adds any contests in <df> to the CandidateContest file in <juris_path>, along with dependent info

    Returns:
        Optional[dict], error dictionary
    """
    err = None
    necessary_columns = {
        "ContestName",
        "NumberElected",
        "OfficeName",
        "PrimaryParty",
        "ElectionDistrict",
        "ReportingUnitType",
    }
    if necessary_columns.issubset(set(df.columns)):
        # read files (or return errors)
        df_dict = dict()
        path_dict = dict()
        for element in ["ReportingUnit", "Office", "CandidateContest"]:
            path_dict[element] = os.path.join(juris_path, f"{element}.txt")
            try:
                df_dict[element] = pd.read_csv(path_dict[element], sep="\t")
            except FileNotFoundError:
                err = ui.add_new_error(err, "jurisdiction", juris_path,
                                       f"{element}.txt file not found")
            except Exception as e:
                err = ui.add_new_error(err, "jurisdiction", juris_path,
                                       f"Error reading {element}.txt: {e}")
        if ui.fatal_error(err):
            return err

        # add to ReportingUnit if necessary
        mask = ~df.ElectionDistrict.isin(
            df_dict["ReportingUnit"].Name.unique())
        if mask.any():
            new = pd.concat([
                df[mask][["ElectionDistrict", "ReportingUnitType"
                          ]].rename(columns={"ElectionDistrict": "Name"}),
                df_dict["ReportingUnit"],
            ])
            new.to_csv(path_dict["ReportingUnit"], sep="\t", index=False)

        # add to Office if necessary
        mask = ~df.OfficeName.isin(df_dict["Office"].Name.unique())
        if mask.any():
            new = pd.concat([
                df[mask][["OfficeName", "ElectionDistrict"
                          ]].rename(columns={"OfficeName": "Name"}),
                df_dict["Office"],
            ])
            new.to_csv(path_dict["Office"], sep="\t", index=False)

        # add to CandidateContest if necessary
        mask = ~df.ContestName.isin(df_dict["CandidateContest"].Name.unique())
        if mask.any():
            new = pd.concat([
                df[mask][[
                    "ContestName", "NumberElected", "OfficeName",
                    "PrimaryParty"
                ]].rename(columns={
                    "OfficeName": "Office",
                    "ContestName": "Name"
                }),
                df_dict["CandidateContest"],
            ])
            new.to_csv(path_dict["CandidateContest"], sep="\t", index=False)
    else:
        err = ui.add_new_error(
            err,
            "file",
            file_path,
            f"Missing columns: {[col for col in necessary_columns if col not in df.columns]}",
        )
    return err
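
# Example for add_candidate_contests (illustrative sketch; the contest data
# and paths are hypothetical). The dataframe needs exactly the six columns
# checked above, one row per contest:
#
#     contests = pd.DataFrame(
#         [["US House NC District 1", 1, "US House NC District 1", "",
#           "North Carolina;US House District 1", "congressional"]],
#         columns=["ContestName", "NumberElected", "OfficeName",
#                  "PrimaryParty", "ElectionDistrict", "ReportingUnitType"])
#     cc_err = add_candidate_contests(
#         "/path/to/content/jurisdictions/North-Carolina", contests,
#         "source_file.txt")
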
def load_or_update_contests(
    engine,
    path_to_jurisdiction_dir,
    juris_true_name,
    contest_type: str,
    err: Optional[dict],
) -> Optional[dict]:
    """Read <contest_type>Contest.txt ("Candidate" or "BallotMeasure") from
    the jurisdiction directory and insert its contests into the Contest and
    <contest_type>Contest tables, consolidating any errors into <err>."""
    # read <contest_type>Contests from jurisdiction folder
    element_fpath = os.path.join(path_to_jurisdiction_dir,
                                 f"{contest_type}Contest.txt")
    if not os.path.exists(element_fpath):
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris_true_name,
            f"file not found: {contest_type}Contest.txt",
        )
        return err
    df = pd.read_csv(element_fpath,
                     **constants.standard_juris_csv_reading_kwargs).fillna(
                         "none or unknown")

    # add contest_type column
    df = m.add_constant_column(df, "contest_type", contest_type)

    # add 'none or unknown' record
    df = add_none_or_unknown(df, contest_type=contest_type)

    # dedupe df
    dupes, df = ui.find_dupes(df)

    # insert into Contest table
    # Structure of CandidateContest vs Contest table means there is nothing to update in the CandidateContest table.
    # TODO check handling of BallotMeasure contests -- do they need to be updated?
    new_err = db.insert_to_cdf_db(
        engine,
        df[["Name", "contest_type"]],
        "Contest",
        "jurisdiction",
        juris_true_name,
        on_conflict="NOTHING",
    )
    if new_err:
        err = ui.consolidate_errors([err, new_err])
        if ui.fatal_error(new_err):
            return err

    # append Contest_Id
    col_map = {"Name": "Name", "contest_type": "contest_type"}
    df = db.append_id_to_dframe(engine, df, "Contest", col_map=col_map)

    if contest_type == "BallotMeasure":
        # append ElectionDistrict_Id, Election_Id
        for fk, ref in [
            ("ElectionDistrict", "ReportingUnit"),
            ("Election", "Election"),
        ]:
            col_map = {fk: "Name"}
            df = (db.append_id_to_dframe(
                engine, df, ref,
                col_map=col_map).rename(columns={
                    f"{ref}_Id": f"{fk}_Id"
                }).drop(fk, axis=1))

    else:
        # append Office_Id, PrimaryParty_Id
        for fk, ref in [("Office", "Office"), ("PrimaryParty", "Party")]:
            col_map = {fk: "Name"}
            df = db.append_id_to_dframe(
                engine, df, ref,
                col_map=col_map).rename(columns={f"{ref}_Id": f"{fk}_Id"})

    # create entries in <contest_type>Contest table
    # commit info in df to <contest_type>Contest table to db
    try:
        new_err = db.insert_to_cdf_db(
            engine,
            df.rename(columns={"Contest_Id": "Id"}),
            f"{contest_type}Contest",
            "jurisdiction",
            juris_true_name,
            on_conflict="NOTHING",
        )
        if new_err:
            err = ui.consolidate_errors([err, new_err])
    except Exception as ie:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris_true_name,
            f"Contests not loaded to database (sql error {ie}). ",
        )
    return err
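
# Example for load_or_update_contests (illustrative sketch; the engine and
# path are hypothetical):
#
#     err = load_or_update_contests(
#         session.bind,
#         "/path/to/content/jurisdictions/North-Carolina",
#         "North Carolina",
#         "Candidate",
#         None,
#     )
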
def load_juris_dframe_into_cdf(
    session,
    element,
    all_juris_path,
    juris_true_name: str,
    juris_system_name: str,
    err: Optional[dict],
    on_conflict: str = "NOTHING",
) -> Optional[dict]:
    """TODO"""

    # define paths
    project_root = Path(__file__).parents[1].absolute()
    cdf_schema_def_dir = os.path.join(
        project_root,
        "CDF_schema_def_info",
    )
    element_file = os.path.join(all_juris_path, juris_system_name,
                                f"{element}.txt")
    fk_file = os.path.join(cdf_schema_def_dir, "elements", element,
                           "foreign_keys.txt")

    # fail if <element>.txt does not exist
    if not os.path.exists(element_file):
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris_system_name,
            f"File {element}.txt not found",
        )
        return err

    clean_and_dedupe(element_file, clean_candidates=True)

    # read info from <element>.txt, filling null fields with 'none or unknown'
    df = pd.read_csv(element_file,
                     **constants.standard_juris_csv_reading_kwargs).fillna(
                         "none or unknown")
    # TODO check that df has the right format

    # add 'none or unknown' record
    df = add_none_or_unknown(df)

    # dedupe df
    dupes, df = ui.find_dupes(df)
    if not dupes.empty:
        err = ui.add_new_error(
            err,
            "warn-jurisdiction",
            juris_system_name,
            f"\nDuplicates were found in {element}.txt",
        )

    # get Ids for any foreign key (or similar) in the table, e.g., Party_Id, etc.
    if os.path.isfile(fk_file):
        foreign_keys = pd.read_csv(fk_file, sep="\t", index_col="fieldname")

        for fn in foreign_keys.index:
            ref = foreign_keys.loc[
                fn,
                "refers_to"]  # NB: juris elements have no multiple referents (as joins may)
            col_map = {fn[:-3]: db.get_name_field(ref)}
            df = db.append_id_to_dframe(
                session.bind, df, ref,
                col_map=col_map).rename(columns={f"{ref}_Id": fn})

    # commit info in df to corresponding cdf table to db
    new_err = db.insert_to_cdf_db(
        session.bind,
        df,
        element,
        "jurisdiction",
        juris_true_name,
        on_conflict=on_conflict,
    )
    if new_err:
        err = ui.consolidate_errors([err, new_err])
    return err
def check_dependencies(juris_dir, element,
                       repository_content_root) -> (set, Optional[dict]):
    """Looks in <juris_dir> to check that every value of each dependent column
    in <element>.txt is listed in the corresponding jurisdiction file.
    Note: <juris_dir> is assumed to exist."""
    err = None
    changed_elements = set()
    juris_name = Path(juris_dir).name
    d = juris_dependency_dictionary()
    f_path = os.path.join(juris_dir, f"{element}.txt")
    try:
        element_df = pd.read_csv(f_path,
                                 **constants.standard_juris_csv_reading_kwargs)
    except FileNotFoundError:
        err = ui.add_new_error(
            err,
            "system",
            f"{Path(__file__).absolute().parents[0].name}.{inspect.currentframe().f_code.co_name}",
            f"file doesn't exist: {f_path}",
        )
        return changed_elements, err

    # Find all dependent columns
    dependent = [c for c in element_df.columns if c in d.keys()]
    for c in dependent:
        target = d[c]
        # values of the dependent column, with nulls read as ""
        ed = element_df.fillna("").loc[:, c].unique()

        # look for required other element in the jurisdiction's directory;
        # if not there, use the global 000_for_all_jurisdictions file
        if os.path.isfile(os.path.join(juris_dir, f"{target}.txt")):
            target_path = os.path.join(juris_dir, f"{target}.txt")
        else:
            target_path = os.path.join(
                repository_content_root,
                "jurisdictions",
                "000_for_all_jurisdictions",
                f"{target}.txt",
            )
            if not os.path.isfile(target_path):
                err = ui.add_new_error(
                    err,
                    "jurisdiction",
                    "all jurisdictions",
                    f"{target}.txt file missing from both {juris_dir} and "
                    f"{os.path.join(repository_content_root, 'electiondata', '000_for_all_jurisdictions')}",
                )
                return changed_elements, err
        # create list of target names, removing any nulls
        ru = list(
            pd.read_csv(
                target_path,
                **constants.standard_juris_csv_reading_kwargs,
            ).fillna("").loc[:, db.get_name_field(target)])
        try:
            ru.remove(np.nan)
        except ValueError:
            pass

        missing = [x for x in ed if x not in ru]
        # if the only missing value is the null/blank string
        if missing == [""]:
            # exclude PrimaryParty, which isn't required to be not-null
            if c != "PrimaryParty":
                err = ui.add_new_error(err, "jurisdiction", juris_name,
                                       f"Some {c} are null.")
        elif missing:
            changed_elements.add(element)
            changed_elements.add(target)
            m_str = "\n".join(missing)
            err = ui.add_new_error(
                err,
                "jurisdiction",
                juris_name,
                f"Every {c} in {element}.txt must be in {target}.txt. Offenders are:\n{m_str}",
            )

    return changed_elements, err
def recast_options(options: Dict[str, str], types: Dict[str, str],
                   munger_name: str) -> (dict, Optional[Dict[str, Any]]):
    """Convert a dictionary <options> of string parameter values to typed objects,
    where type is determined by <types>"""
    err: Optional[Dict[str, Any]] = None
    keys = {k for k in options.keys() if k in types.keys()}
    for k in keys:
        if options[k]:
            if types[k] in ["int", "integer"]:
                try:
                    options[k] = int(options[k])
                except Exception:
                    bad_value = options[k]
                    options[k] = None
                    err = ui.add_new_error(
                        err,
                        "warn-munger",
                        munger_name,
                        f"{k} should be integer but isn't: {bad_value}",
                    )
            elif types[k] == "list-of-integers":
                try:
                    options[k] = [int(s) for s in options[k].split(",")]
                except Exception:
                    bad_value = options[k]
                    options[k] = list()
                    err = ui.add_new_error(
                        err,
                        "warn-munger",
                        munger_name,
                        f"{k} should be list of integers but isn't: {bad_value}",
                    )
            elif types[k] == "str":
                if options[k] == "":
                    # null string is read as None
                    options[k] = None
            elif types[k] == "list-of-strings":
                if options[k] != "":
                    try:
                        options[k] = [s for s in options[k].split(",")]
                    except Exception:
                        options[k] = list()
                        err = ui.add_new_error(
                            err,
                            "warn-munger",
                            munger_name,
                            f"{k} should be list of strings but isn't: {options[k]}",
                        )
                # if the string is empty, assign an empty list
                else:
                    options[k] = list()

            elif types[k] == "string-with-opt-list" and k == "count_location":
                if not options["count_location"]:
                    # if we're munging a lookup file
                    pass
                elif options["file_type"] in ["xml", "json-nested"]:
                    # nothing needs to be broken out for these file types
                    pass
                elif options[k].split(":")[0] == "by_name":
                    options["count_fields_by_name"] = [
                        s for s in options[k][8:].split(",")
                    ]
                    options[k] = "by_name"
                elif options[k].split(":")[0] == "by_number":
                    options["count_column_numbers"] = [
                        int(s) for s in options[k][10:].split(",")
                    ]
                    options[k] = "by_number"
    return options, err
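
# Example for recast_options (illustrative sketch; the parameter names are
# hypothetical). String values read from a munger file are recast per their
# declared types; "by_number:3,4" is broken out into a separate option:
#
#     options = {"header_row_count": "1", "file_type": "flat_text",
#                "count_location": "by_number:3,4"}
#     types = {"header_row_count": "int",
#              "count_location": "string-with-opt-list"}
#     options, err = recast_options(options, types, "my_munger")
#     # now options["header_row_count"] == 1,
#     # options["count_location"] == "by_number", and
#     # options["count_column_numbers"] == [3, 4]
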
def ensure_juris_files(repository_content_root,
                       juris_path: str,
                       ignore_empty: bool = False) -> Optional[dict]:
    """Check that the jurisdiction files are complete and consistent with one another.
    Check for extraneous files in Jurisdiction directory.
    Assumes Jurisdiction directory exists. Assumes dictionary.txt is in the template file"""

    # package possible errors from this function into a dictionary and return them
    err = None
    juris_name = Path(juris_path).name
    juris_true_name = juris_name.replace("-", " ")

    templates_dir = os.path.join(repository_content_root,
                                 "jurisdictions/000_jurisdiction_templates")
    # notify user of any extraneous files
    extraneous = [
        f for f in os.listdir(juris_path)
        if f not in os.listdir(templates_dir) and f[0] != "."
    ]
    if extraneous:
        err = ui.add_new_error(
            err,
            "jurisdiction",
            juris_name,
            f"extraneous_files_in_juris_directory {extraneous}",
        )

    template_list = [x[:-4] for x in os.listdir(templates_dir)]

    # reorder template_list, so that first things are created first
    ordered_list = [
        "dictionary", "ReportingUnit", "Office", "CandidateContest"
    ]
    template_list = ordered_list + [
        x for x in template_list if x not in ordered_list
    ]

    # ensure all necessary files exist
    for juris_file in template_list:
        cf_path = os.path.join(juris_path, f"{juris_file}.txt")
        created = False
        # if file does not already exist in jurisdiction directory, create from template and invite user to fill
        template_path = os.path.join(templates_dir, f"{juris_file}.txt")
        try:
            if os.path.isfile(template_path):
                temp = pd.read_csv(
                    template_path,
                    **constants.standard_juris_csv_reading_kwargs)
            else:
                err = ui.add_new_error(
                    err,
                    "system",
                    f"{Path(__file__).absolute().parents[0].name}.{inspect.currentframe().f_code.co_name}",
                    f"Template file {template_path} does not exist",
                )
                temp = pd.DataFrame()  # for syntax checker
        except pd.errors.EmptyDataError:
            if not ignore_empty:
                err = ui.add_new_error(
                    err,
                    "system",
                    f"{Path(__file__).absolute().parents[0].name}.{inspect.currentframe().f_code.co_name}",
                    f"Template file {template_path} has no contents",
                )
            temp = pd.DataFrame()

        # if file does not exist
        if not os.path.isfile(cf_path):
            # create the file
            temp.to_csv(cf_path,
                        sep="\t",
                        index=False,
                        encoding=constants.default_encoding)
            created = True

        # if file exists, check format against template
        if not created:
            try:
                cf_df = pd.read_csv(
                    os.path.join(juris_path, f"{juris_file}.txt"),
                    **constants.standard_juris_csv_reading_kwargs,
                )
            except pd.errors.ParserError as pe:
                err = ui.add_new_error(
                    err,
                    "jurisdiction",
                    juris_name,
                    f"Error reading file {juris_file}.txt: {pe}",
                )
                return err

            if set(cf_df.columns) != set(temp.columns):
                cols = "\t".join(temp.columns.to_list())
                err = ui.add_new_error(
                    err,
                    "jurisdiction",
                    juris_name,
                    f"Columns of {juris_file}.txt need to be (tab-separated):\n {cols}\n",
                )

            if juris_file == "dictionary":
                new_err = check_dictionary(cf_path)
                err = ui.consolidate_errors([err, new_err])

            else:
                # dedupe the file
                clean_and_dedupe(cf_path, clean_candidates=True)

                # TODO check for lines that are too long

                # check for problematic null entries
                null_columns = check_nulls(
                    juris_file,
                    cf_path,
                    os.path.join(repository_content_root, "electiondata"),
                )
                if null_columns:
                    err = ui.add_new_error(
                        err,
                        "jurisdiction",
                        juris_name,
                        f"Null entries in {juris_file} in columns {null_columns}",
                    )

                # check uniqueness of name field
                ambiguous_names = find_ambiguous_names(juris_file, cf_path)
                if ambiguous_names:
                    readable_list = "\n".join(ambiguous_names)
                    err = ui.add_new_error(
                        err,
                        "jurisdiction",
                        juris_name,
                        f"Some names are ambiguous, appearing in more than one row in {juris_file}.txt:"
                        f"\n{readable_list}",
                    )

    # check dependencies
    for juris_file in [x for x in template_list if x != "dictionary"]:
        _, new_err = check_dependencies(juris_path, juris_file,
                                        repository_content_root)
        if new_err:
            err = ui.consolidate_errors([err, new_err])

    # check ReportingUnit.txt for internal consistency
    new_err = check_ru_file(juris_path, juris_true_name)
    if new_err:
        err = ui.consolidate_errors([err, new_err])
    return err
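
# Example for ensure_juris_files (illustrative sketch; paths are
# hypothetical):
#
#     err = ensure_juris_files(
#         "/path/to/content",
#         "/path/to/content/jurisdictions/North-Carolina")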