def pick_record_from_file_system(storage_dir, table, known_info_d=None):
    """Look for a record of <table> stored in the file system.

    Reads <storage_dir>/<table>.txt (tab-separated), filters rows by
    <known_info_d>, and asks the user to pick one via pick_one().

    Returns (idx, file_style_record): a file-style record dict (with enums
    as plaintext), or (None, None) if no file exists or nothing was picked.
    """
    # initialize to keep syntax-checker happy; assigned before use below
    filtered_file = None
    if not known_info_d:
        known_info_d = {}
    name_field = dbr.get_name_field(table)

    # identify/create the directory for storing individual records in file system
    if not os.path.isdir(storage_dir):
        os.makedirs(storage_dir)
    # read any info from <table>'s file within that directory
    storage_file = os.path.join(storage_dir, f'{table}.txt')
    if os.path.isfile(storage_file):
        from_file = pd.read_csv(storage_file, sep='\t')
        if not from_file.empty:
            # filter via known_info_d, ignoring keys that are not columns of
            # the file (consistent with pick_record_from_db; previously a
            # stray key raised KeyError)
            d = {
                k: v
                for k, v in known_info_d.items() if k in from_file.columns
            }
            filtered_file = from_file.loc[(
                from_file[list(d)] == pd.Series(d)).all(axis=1)]
        else:
            filtered_file = from_file
        print(f'Pick a record from {table} list in file system:')
        idx, file_style_record = pick_one(filtered_file, name_field)
    else:
        idx, file_style_record = None, None
    if idx is not None:
        file_style_record = dict(filtered_file.loc[idx])
    else:
        file_style_record = None
    return idx, file_style_record
# Beispiel #2 (snippet-extraction artifact; original separator text: "Beispiel #2" / "0")
def get_ids_for_foreign_keys(session, df1, element, foreign_key, refs,
                             load_refs, error):
    """Append to a copy of <df1> the database Id values for <foreign_key>.

    <foreign_key> is a column name ending in '_Id' (e.g., 'Party_Id');
    <refs> lists the db tables the key may refer to. For each such table,
    (Id, name) pairs are pulled from the db and left-merged into the
    dataframe on the element's name (or, for ExternalIdentifier, on
    cdf_element and internal_name).

    If some non-null rows fail to match: raises ForeignKeyException when
    <load_refs> is True, otherwise records a message in <error>.
    Returns the merged dataframe."""
    df = df1.copy()
    # append the Id corresponding to <foreign_key> from the db
    foreign_elt = foreign_key[:-3]  # strip the trailing '_Id'
    interim = f'{foreign_elt}_Name'

    target_list = []
    for r in refs:
        ref_name_field = db_routines.get_name_field(r)

        r_target = pd.read_sql_table(r, session.bind)[['Id', ref_name_field]]
        r_target.rename(columns={
            'Id': foreign_key,
            ref_name_field: interim
        },
                        inplace=True)
        if element == 'ExternalIdentifier':
            # add column for cdf_table of referent
            r_target.loc[:, 'cdf_element'] = r

        target_list.append(r_target)

    target = pd.concat(target_list)

    if element == 'ExternalIdentifier':
        # join on cdf_element name as well
        df = df.merge(target,
                      how='left',
                      left_on=['cdf_element', 'internal_name'],
                      right_on=['cdf_element', interim])
        # rename 'Foreign_Id' to 'Foreign' for consistency in definition of missing
        # TODO why is ExternalIdentifier special in this regard?
        #  Is it that ExternalIdentifier doesn't have a name field?
        df.rename(columns={foreign_key: foreign_elt}, inplace=True)
    else:
        df = df.merge(target,
                      how='left',
                      left_on=foreign_elt,
                      right_on=interim)

    # rows where the foreign element was supplied but no db match was found
    missing = df[(df[foreign_elt].notnull()) & (df[interim].isnull())]
    if missing.empty:
        # fixed: DataFrame.drop returns a new frame; the result was
        # previously discarded, so the interim column was never removed
        df = df.drop([interim], axis=1)
    else:
        if load_refs:
            # referenced elements must be loaded before this element
            raise ForeignKeyException(
                f'For some {element} records, {foreign_elt} was not found')
        else:
            if element not in error:
                error[element] = {}
            error[element]["foreign_key"] = \
                f"For some {element} records, {foreign_elt} was not found"
    return df
# Beispiel #3 (snippet-extraction artifact; original separator text: "Beispiel #3" / "0")
def check_dependencies(juris_dir, element):
    """Looks in <juris_dir> to check that every dependent column in <element>.txt
    is listed in the corresponding jurisdiction file. Note: <juris_dir> assumed to exist.

    Returns (changed_elements, unmatched_error): the set of element names
    involved in unmatched dependencies, and a list of error messages.
    """
    d = juris_dependency_dictionary()
    f_path = os.path.join(juris_dir, f'{element}.txt')
    assert os.path.isdir(juris_dir)
    element_df = pd.read_csv(f_path,
                             sep='\t',
                             index_col=None,
                             encoding='iso-8859-1')
    unmatched_error = []

    # Find all dependent columns
    dependent = [c for c in element_df if c in d.keys()]
    changed_elements = set()
    report = [f'In {element}.txt:']
    for c in dependent:
        target = d[c]
        # unique values of the dependent column; reuse the already-loaded
        # frame instead of re-reading <element>.txt for every column
        ed = element_df.fillna('').loc[:, c].unique()

        # create list of elements, removing any nulls
        ru = list(
            pd.read_csv(os.path.join(juris_dir, f'{target}.txt'),
                        sep='\t',
                        encoding='iso-8859-1').fillna(
                            '').loc[:, db_routines.get_name_field(target)])
        try:
            ru.remove(np.nan)
        except ValueError:
            pass

        missing = [x for x in ed if x not in ru]
        if len(missing) == 0:
            report.append(f'Every {c} in {element}.txt is a {target}.')
        elif missing == ['']:  # the only missing value is null or blank
            # TODO some dependencies are ok with null (eg. PrimaryParty) and some are not
            report.append(
                f'Some {c} are null, and every non-null {c} is a {target}.')
        else:
            changed_elements.add(element)
            changed_elements.add(target)
            unmatched_error.append(
                f'Every {c} must be a {target}. This is not optional!!')

    # if dependent:
    #     print('\n\t'.join(report))

    return changed_elements, unmatched_error
def raw_elements_to_cdf(session,
                        project_root,
                        juris,
                        mu,
                        raw,
                        count_cols,
                        ids=None):
    """Load data from <raw> into the database.

    Args:
        session: sqlalchemy session bound to the target database
        project_root: project root path, passed to ui.pick_or_create_record
        juris: jurisdiction, used when mapping raw values to internal db ids
        mu: munger; mu.cdf_elements drives which elements are processed
        raw: dataframe of raw results
        count_cols: columns of <raw> containing vote counts
        ids: optional pair; if falsy, the user is prompted to pick/create a
            record for each element whose source is 'other'.
            NOTE(review): assumes ids = (datafile_id, election_id) — confirm
            against callers.

    Returns None. Side effects: inserts rows into VoteCount and
    ElectionContestSelectionVoteCountJoin, and temporarily adds/drops
    columns on the VoteCount table.

    Note that columns to be munged (e.g. County_xxx) have
    mu.field_rename_suffix (e.g., _xxx) added already."""
    working = raw.copy()

    # enter elements from sources outside raw data, including creating id column(s)
    # TODO what if contest_type (BallotMeasure or Candidate) has source 'other'?
    if not ids:
        for t, r in mu.cdf_elements[mu.cdf_elements.source ==
                                    'other'].iterrows():
            # add column for element id
            # TODO allow record to be passed as a parameter
            idx, db_record, enum_d, fk_d = ui.pick_or_create_record(
                session, project_root, t)
            working = add_constant_column(working, f'{t}_Id', idx)
    else:
        working = add_constant_column(working, 'Election_Id', ids[1])
        working = add_constant_column(working, '_datafile_Id', ids[0])

    working = munge_and_melt(mu, working, count_cols)

    # append ids for BallotMeasureContests and CandidateContests
    working = add_constant_column(working, 'contest_type', 'unknown')
    for c_type in ['BallotMeasure', 'Candidate']:
        df_contest = pd.read_sql_table(f'{c_type}Contest', session.bind)
        working = replace_raw_with_internal_ids(
            working,
            juris,
            df_contest,
            f'{c_type}Contest',
            dbr.get_name_field(f'{c_type}Contest'),
            mu.path_to_munger_dir,
            drop_unmatched=False)

        # set contest_type where id was found
        working.loc[working[f'{c_type}Contest_Id'].notnull(),
                    'contest_type'] = c_type

        # drop column with munged name
        working.drop(f'{c_type}Contest', axis=1, inplace=True)

    # drop rows with unmatched contests
    to_be_dropped = working[working['contest_type'] == 'unknown']
    working_temp = working[working['contest_type'] != 'unknown']
    if working_temp.empty:
        # nothing matched at all: abort rather than load an empty result set
        raise MungeError(
            'No contests in database matched. No results will be loaded to database.'
        )
    elif not to_be_dropped.empty:
        print(f'Warning: Results for {to_be_dropped.shape[0]} rows '
              f'with unmatched contests will not be loaded to database.')
    working = working_temp

    # get ids for remaining info sourced from rows and columns
    # (elements whose names end in 'Contest' or 'Selection' are handled separately)
    element_list = [
        t for t in mu.cdf_elements[mu.cdf_elements.source != 'other'].index
        if (t[-7:] != 'Contest' and t[-9:] != 'Selection')
    ]
    for t in element_list:
        # capture id from db in new column and erase any now-redundant cols
        df = pd.read_sql_table(t, session.bind)
        name_field = dbr.get_name_field(t)
        # set drop_unmatched = True for fields necessary to BallotMeasure rows,
        #  drop_unmatched = False otherwise to prevent losing BallotMeasureContests for BM-inessential fields
        if t == 'ReportingUnit' or t == 'CountItemType':
            drop = True
        else:
            drop = False
        working = replace_raw_with_internal_ids(working,
                                                juris,
                                                df,
                                                t,
                                                name_field,
                                                mu.path_to_munger_dir,
                                                drop_unmatched=drop)
        working.drop(t, axis=1, inplace=True)
        # working = add_non_id_cols_from_id(working,df,t)

    # append BallotMeasureSelection_Id, drop BallotMeasureSelection
    df_selection = pd.read_sql_table(f'BallotMeasureSelection', session.bind)
    working = replace_raw_with_internal_ids(
        working,
        juris,
        df_selection,
        'BallotMeasureSelection',
        dbr.get_name_field('BallotMeasureSelection'),
        mu.path_to_munger_dir,
        drop_unmatched=False,
        mode=mu.cdf_elements.loc['BallotMeasureSelection', 'source'])
    # drop records with a BMC_Id but no BMS_Id (i.e., keep if BMC_Id is null or BMS_Id is not null)
    working = working[(working['BallotMeasureContest_Id'].isnull()) |
                      (working['BallotMeasureSelection_Id']).notnull()]

    working.drop('BallotMeasureSelection', axis=1, inplace=True)

    # append CandidateSelection_Id
    #  First must load CandidateSelection table (not directly munged, not exactly a join either)
    #  Note left join, as not every record in working has a Candidate_Id
    # TODO maybe introduce Selection and Contest tables, have C an BM types refer to them?
    c_df = pd.read_sql_table('Candidate', session.bind)
    c_df.rename(columns={'Id': 'Candidate_Id'}, inplace=True)
    cs_df, err = dbr.dframe_to_sql(c_df,
                                   session,
                                   'CandidateSelection',
                                   return_records='original')
    # add CandidateSelection_Id column, merging on Candidate_Id

    working = working.merge(cs_df[['Candidate_Id', 'Id']],
                            how='left',
                            left_on='Candidate_Id',
                            right_on='Candidate_Id')
    working.rename(columns={'Id': 'CandidateSelection_Id'}, inplace=True)
    # drop records with a CC_Id but no CS_Id (i.e., keep if CC_Id is null or CS_Id is not null)
    working = working[(working['CandidateContest_Id'].isnull()) |
                      (working['CandidateSelection_Id']).notnull()]

    # TODO: warn user if contest is munged but candidates are not
    # TODO warn user if BallotMeasureSelections not recognized in dictionary.txt
    for j in [
            'BallotMeasureContestSelectionJoin',
            'CandidateContestSelectionJoin', 'ElectionContestJoin'
    ]:
        working = append_join_id(project_root, session, working, j)

    # Fill VoteCount and ElectionContestSelectionVoteCountJoin
    #  To get 'VoteCount_Id' attached to the correct row, temporarily add columns to VoteCount
    #  add ElectionContestSelectionVoteCountJoin columns to VoteCount

    # Define ContestSelectionJoin_Id field needed in ElectionContestSelectionVoteCountJoin
    ref_d = {
        'ContestSelectionJoin_Id': [
            'BallotMeasureContestSelectionJoin_Id',
            'CandidateContestSelectionJoin_Id'
        ]
    }
    working = append_multi_foreign_key(working, ref_d)

    # add extra columns to VoteCount table temporarily to allow proper join
    extra_cols = [
        'ElectionContestJoin_Id', 'ContestSelectionJoin_Id', '_datafile_Id'
    ]
    dbr.add_integer_cols(session, 'VoteCount', extra_cols)

    # upload to VoteCount table, pull  Ids
    working_fat, err = dbr.dframe_to_sql(working,
                                         session,
                                         'VoteCount',
                                         raw_to_votecount=True)
    working_fat.rename(columns={'Id': 'VoteCount_Id'}, inplace=True)
    session.commit()

    # TODO check that all candidates in munged contests (including write ins!) are munged
    # upload to ElectionContestSelectionVoteCountJoin
    data, err = dbr.dframe_to_sql(working_fat, session,
                                  'ElectionContestSelectionVoteCountJoin')

    # drop extra columns
    dbr.drop_cols(session, 'VoteCount', extra_cols)

    return
def get_record_info_from_user(sess, element, known_info_d=None, mode='database'):
    """Collect new record info from user, with chance to confirm.
    For each enumeration, translate the user's plaintext input into id/othertext.

    Return the corresponding record (id/othertext only) and an enumeration-value
    dictionary. Depending on <mode> ('database', 'filesystem' or 'database_and_filesystem'),
    returns enum plaintext, or enum id/othertext pairs, or both.
    """
    # avoid mutable default argument (was known_info_d={})
    if known_info_d is None:
        known_info_d = {}

    # read existing info from db
    all_from_db = pd.read_sql_table(element, sess.bind, index_col='Id')
    # initialize <show_user_cols>
    db_cols = list(all_from_db.columns)  # note: does not include 'Id'
    show_user_cols = db_cols.copy()

    # initialize the dictionary of user-entered values
    # (was `enum_val = fk_val = new = {}`, which aliased all three names to
    # one dict; enum_val/fk_val are computed fresh at the end anyway)
    new = {}
    enum_list = dbr.get_enumerations(sess, element)
    fk_df = dbr.get_foreign_key_df(sess, element)

    # get enumeration tables from db
    e_df = {}
    for e in enum_list:
        e_df[e] = pd.read_sql_table(e, sess.bind, index_col='Id')

    # add cols to all_from_db for showing user and update show_user_cols
    for e in enum_list:
        all_from_db = mr.enum_col_from_id_othertext(all_from_db,
                                                    e,
                                                    e_df[e],
                                                    drop_old=False)
        show_user_cols.append(e)
        show_user_cols.remove(f'{e}_Id')
        show_user_cols.remove(f'Other{e}')
    for i, r in fk_df.iterrows():
        # exclude foreign ids pointing to enumerations
        if i[:-3] not in enum_list:
            all_from_db = dbr.add_foreign_key_name_col(
                sess,
                all_from_db,
                r['foreign_column_name'],
                r['foreign_table_name'],
                drop_old=False)
            show_user_cols.append(i[:-3])
            show_user_cols.remove(i)

    # collect and confirm info from user
    unconfirmed = True
    while unconfirmed:
        # solicit info from user and store values for db insertion
        new = {}
        print(f'Enter info for new {element} record.')
        for c in db_cols:
            # define new[c] if value is known
            if c in known_info_d.keys():
                new[c] = known_info_d[c]

            # if c is an enumeration Id
            if c[-3:] == '_Id' and c[:-3] in enum_list:
                c_plain = c[:-3]
                # if plaintext of enumeration is known
                if c_plain in new.keys():
                    # fixed: look up the enum table by its name (c_plain);
                    # e_df is keyed by enum name, not by the '_Id' column
                    new[c], new[
                        f'Other{c_plain}'] = mr.enum_value_to_id_othertext(
                            e_df[c_plain], new[c_plain])
                # if id/othertext of enumeration is known
                # (fixed: keys were f'{c}_Id'/f'Other{c}', i.e. 'X_Id_Id'/
                # 'OtherX_Id', and the plaintext result was stored under the
                # id column instead of under c_plain)
                elif (c in known_info_d.keys()
                      and f'Other{c_plain}' in known_info_d.keys()):
                    new[f'Other{c_plain}'] = known_info_d[f'Other{c_plain}']
                    new[c_plain] = mr.enum_value_from_id_othertext(
                        e_df[c_plain], new[c], new[f'Other{c_plain}'])
                # otherwise ask the user to pick the enum value
                else:
                    new[c], new[f'Other{c_plain}'], new[c_plain] = pick_enum(
                        sess, c_plain)
            # if c is an Other<enumeration>, new value was defined in loop through enum_list
            elif c[:5] == 'Other' and c[5:] in enum_list:
                pass
            # if c is a foreign key (and not an enumeration)
            elif c in fk_df.index:
                # if foreign key id is known
                c_plain = c[:-3]
                if c in new.keys():
                    new[c_plain] = dbr.name_from_id(
                        sess, fk_df.loc[c, 'foreign_table_name'], new[c])
                # if foreign key plaintext is known
                elif c_plain in new.keys():
                    new[c] = dbr.name_to_id(sess,
                                            fk_df.loc[c, 'foreign_table_name'],
                                            new[c_plain])
                # otherwise ask the user to pick the referenced record
                else:
                    print(
                        f'Specify the {fk_df.loc[c,"foreign_table_name"]} for this {element}'
                    )
                    idx, db_record = pick_record_from_db(
                        sess,
                        fk_df.loc[c, 'foreign_table_name'],
                        required=True)
                    new[c_plain] = db_record[dbr.get_name_field(
                        fk_df.loc[c, 'foreign_table_name'])]
                    # TODO pull from DB info about whether the foreign key is required
                    new[c] = dbr.name_to_id(sess,
                                            fk_df.loc[c, 'foreign_table_name'],
                                            new[c_plain])
            else:
                # plain column: only prompt if not already supplied in
                # known_info_d (was prompted unconditionally, discarding
                # the known value)
                if c not in new.keys():
                    new[c] = enter_and_check_datatype(
                        f'Enter the {c}', get_datatype(all_from_db, c))

        # present to user for confirmation
        entry = '\n\t'.join([f'{k}:\t{new[k]}' for k in show_user_cols])
        confirm = input(f'Confirm entry:\n\t{entry}\nIs this correct (y/n)?\n')
        if confirm == 'y':
            unconfirmed = False

    # get db_record, enum_val, fk_val
    db_record = {k: new[k] for k in db_cols}
    enum_val = {e: new[e] for e in enum_list}
    fk_val = {k[:-3]: new[k[:-3]] for k in fk_df.index}
    show_user = {k: new[k] for k in show_user_cols}

    if mode == 'database':
        return db_record, enum_val, fk_val
    elif mode == 'filesystem':
        return show_user, enum_val, fk_val
    elif mode == 'database_and_filesystem':
        return {**db_record, **show_user}, enum_val, fk_val
    else:
        print(f'Mode {mode} not recognized.')
        return None, None, None
def pick_record_from_db(sess,
                        element,
                        known_info_d=None,
                        required=False,
                        db_idx=None):
    """Get id and info from database, if it exists.
    If <db_idx> is passed, return that index and a dictionary with the rest of the record.
    If <required> is True and nothing is picked, offer enum filters and retry."""
    if not known_info_d:
        known_info_d = {}
    else:
        # work on a copy so the enum-filter recursion below does not
        # mutate the caller's dictionary
        known_info_d = dict(known_info_d)

    element_df = pd.read_sql_table(element, sess.bind, index_col='Id')
    if element_df.empty:
        return None, None
    elif db_idx is not None:
        # fixed: a plain truthiness test (`elif db_idx:`) silently ignored
        # a legitimate index value of 0
        return db_idx, element_df.loc[db_idx].to_dict()

    # add columns for plaintext of any enumerations
    # FIXME also add columns for foreign key plaintext
    enums = dbr.read_enums_from_db_table(sess, element)
    element_enhanced_df = element_df.copy()
    for e in enums:
        e_df = pd.read_sql_table(e, sess.bind, index_col='Id')
        element_enhanced_df = mr.enum_col_from_id_othertext(
            element_enhanced_df, e, e_df, drop_old=False)

    # filter by known_info_d, ignoring keys that are not columns
    d = {
        k: v
        for k, v in known_info_d.items() if k in element_enhanced_df.columns
    }
    filtered = element_enhanced_df.loc[(
        element_enhanced_df[list(d)] == pd.Series(d)).all(axis=1)]
    # TODO if filtered is empty, offer all
    if filtered.empty:
        print(
            'Nothing meets the filter criteria. Unfiltered options will be offered.'
        )
        filtered = element_enhanced_df

    print(f'Pick the {element} record from the database:')
    name_field = db_routines.get_name_field(element)
    element_idx, values = pick_one(filtered, name_field, element)
    if element_idx in element_df.index:
        d = dict(element_df.loc[element_idx])
    else:
        d = None
    if required and element_idx is None:
        # offer to filter by available enumerations not already constrained
        enum_list = [
            x for x in dbr.get_enumerations(sess, element)
            if x not in known_info_d
        ]
        if len(enum_list) == 0:
            print('No more filters available. You must choose from this list')
            element_idx, d = pick_record_from_db(sess,
                                                 element,
                                                 known_info_d=known_info_d)
        else:
            while element_idx is None and len(enum_list) > 0:
                e = enum_list[0]
                e_filter = input(f'Filter by {e} (y/n)?\n')
                if e_filter == 'y':
                    known_info_d[f'{e}_Id'], known_info_d[
                        f'Other{e}'], known_info_d[e] = pick_enum(sess, e)
                    element_idx, d = pick_record_from_db(
                        sess,
                        element,
                        known_info_d=known_info_d,
                        required=True)
                enum_list.remove(e)

    return element_idx, d