コード例 #1
def cleanTas(csvPath):
    """Read a CSV into a dataframe, clean it with `LoaderUtils.cleanData`
    and return the result.

    The file at `csvPath` is read with every column typed as a string. After
    cleaning, `account_num` is converted with `pd.to_numeric`, and every null
    cell of the returned dataframe is replaced by None.
    """
    data = pd.read_csv(csvPath, dtype=str)
    # NOTE(review): this call looks truncated. It starts directly with the
    # column-rename dict (the cleanData call in loadCgac passes `data, model`
    # first), and neither dict nor the call itself is closed before the next
    # statement. Confirm against the upstream file before relying on it.
    data = LoaderUtils.cleanData(
        # presumably incoming CSV column -> output column; the exact mapping
        # semantics are defined by cleanData -- TODO confirm
        {"a": "availability_type_code",
         "acct_num": "account_num",
         "aid": "agency_identifier",
         "ata": "allocation_transfer_agency",
         "bpoa": "beginning_period_of_availability",
         "epoa": "ending_period_of_availability",
         "main": "main_account_code",
         "sub": "sub_account_code",
        # per-column cleaning options, keyed by output column name
        {"allocation_transfer_agency": {"pad_to_length": 3, "keep_null": True},
         "agency_identifier": {"pad_to_length": 3},
         # Account for " " cells
         "availability_type_code": {"pad_to_length": 0, "keep_null": True},
         "beginning_period_of_availability": {"pad_to_length": 0,
                                              "keep_null": True},
         "ending_period_of_availability": {"pad_to_length": 0,
                                           "keep_null": True},
         "main_account_code": {"pad_to_length": 4},
         "sub_account_code": {"pad_to_length": 3},
    # the frame was read as all-strings, so account_num is converted explicitly
    data["account_num"] = pd.to_numeric(data['account_num'])
    # swap every null cell for None before handing the frame back
    return data.where(pd.notnull(data), None)
コード例 #2
def loadCgac(filename):
    """Load CGAC (high-level agency names) lookup table.

    Reads ``filename`` as an all-string CSV, cleans it with
    ``LoaderUtils.cleanData`` (renaming ``cgac`` -> ``cgac_code`` and
    ``agency`` -> ``agency_name``, with a ``pad_to_length`` of 3 requested
    for ``cgac_code``), keeps one row per ``cgac_code``, then replaces the
    contents of the CGAC table with the result.

    Args:
        filename: path (or any other ``pd.read_csv`` source) of the CGAC CSV.

    NOTE(review): nothing in this function commits the session; presumably
    the caller or the app-context teardown does -- confirm.
    """
    model = CGAC

    with createApp().app_context():
        sess = GlobalDB.db().session

        # read CGAC values from csv
        data = pd.read_csv(filename, dtype=str)
        # clean data
        data = LoaderUtils.cleanData(
            data,
            model,
            {"cgac": "cgac_code",
             "agency": "agency_name"},
            {"cgac_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['cgac_code'], inplace=True)

        # for CGAC, delete and replace values. The delete is issued only
        # after the CSV has been read and cleaned, so an unreadable file
        # never touches the existing rows.
        sess.query(model).delete()

        # insert to db
        table_name = model.__table__.name
        num = LoaderUtils.insertDataframe(data, table_name, sess.connection())

    logger.info('{} records inserted to {}'.format(num, table_name))
コード例 #3
def loadObjectClass(filename):
    """Load object class lookup table.

    Reads ``filename`` as an all-string CSV, cleans it with
    ``LoaderUtils.cleanData`` (renaming ``max_oc_code`` ->
    ``object_class_code`` and ``max_object_class_name`` ->
    ``object_class_name``), keeps one row per ``object_class_code``, then
    replaces the contents of the object class table with the result.

    Args:
        filename: path (or any other ``pd.read_csv`` source) of the CSV.

    NOTE(review): nothing in this function commits the session; presumably
    the caller or the app-context teardown does -- confirm.
    """
    model = ObjectClass

    with createApp().app_context():
        sess = GlobalDB.db().session

        data = pd.read_csv(filename, dtype=str)
        data = LoaderUtils.cleanData(
            data,
            model,
            {"max_oc_code": "object_class_code",
             "max_object_class_name": "object_class_name"},
            # no per-column options for this table.
            # NOTE(review): an empty options dict is assumed to be accepted
            # by cleanData -- confirm against its signature.
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)

        # for object class, delete and replace values. The delete is issued
        # only after the CSV has been read and cleaned, so an unreadable file
        # never touches the existing rows.
        sess.query(model).delete()

        # insert to db
        table_name = model.__table__.name
        num = LoaderUtils.insertDataframe(data, table_name, sess.connection())

    logger.info('{} records inserted to {}'.format(num, table_name))
コード例 #4
def loadProgramActivity(filename):
    """Load program activity lookup table.

    Reads ``filename`` as an all-string CSV, cleans it with
    ``LoaderUtils.cleanData`` (column renames and ``pad_to_length`` options
    listed below), removes fully duplicated rows and inserts the result into
    the program activity table.

    Args:
        filename: path (or any other ``pd.read_csv`` source) of the CSV.

    NOTE(review): nothing in this function commits the session; presumably
    the caller or the app-context teardown does -- confirm.
    """
    model = ProgramActivity

    with createApp().app_context():
        sess = GlobalDB.db().session

        # for program activity, delete and replace values??
        # NOTE(review): existing rows are NOT deleted here; whether this
        # loader should clear the table first is still an open question.

        data = pd.read_csv(filename, dtype=str)
        data = LoaderUtils.cleanData(
            data,
            model,
            {
                "year": "budget_year",
                "agency_id": "agency_id",
                "alloc_id": "allocation_transfer_id",
                "account": "account_number",
                "pa_code": "program_activity_code",
                "pa_name": "program_activity_name",
            },
            {
                "program_activity_code": {"pad_to_length": 4},
                "agency_id": {"pad_to_length": 3},
                "allocation_transfer_id": {"pad_to_length": 3,
                                           "keep_null": True},
                "account_number": {"pad_to_length": 4},
            }
        )
        # because we're only loading a subset of program activity info,
        # there will be duplicate records in the dataframe. this is ok,
        # but they need to be de-duped before the db load.
        data.drop_duplicates(inplace=True)

        # insert to db
        table_name = model.__table__.name
        num = LoaderUtils.insertDataframe(data, table_name, sess.connection())

    logger.info('{} records inserted to {}'.format(num, table_name))
コード例 #5
def load_sf133(filename, fiscal_year, fiscal_period, force_load=False):
    """Load SF 133 (budget execution report) lookup table.

    If records for ``fiscal_year``/``fiscal_period`` already exist, they are
    deleted first when ``force_load`` is true; otherwise the file is skipped
    and the function returns without loading anything.

    The CSV is cleaned with ``LoaderUtils.cleanData``, the duplicated report
    lines are removed, an internal ``tas`` column is added, and a
    pivot-then-melt pass fills in zero-amount rows for report lines that are
    missing for a given TAS/fiscal year/period. Zero-amount rows are kept
    only for the lines used by the validations. The result is inserted into
    the SF133 table.

    Args:
        filename: path (or any other ``pd.read_csv`` source) of the SF 133 CSV.
        fiscal_year: fiscal year the file covers; matched against
            ``SF133.fiscal_year``.
        fiscal_period: fiscal period the file covers; matched against
            ``SF133.period``.
        force_load: when true, delete this period's existing records and
            reload; when false, skip the file if records already exist.

    NOTE(review): nothing in this function commits the session; presumably
    the caller or the app-context teardown does -- confirm.
    """

    with createApp().app_context():
        sess = GlobalDB.db().session

        existing_records = sess.query(SF133).filter(
            SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
        if force_load:
            # force a reload of this period's current data
            logger.info('Force SF 133 load: deleting existing records for {} {}'.format(
                fiscal_year, fiscal_period))
            delete_count = existing_records.delete()
            logger.info('{} records deleted'.format(delete_count))
        else:
            # run the COUNT query once and reuse the result
            existing_count = existing_records.count()
            if existing_count:
                # if there's existing data & we're not forcing a load, skip:
                # returning here is what actually prevents a second load
                logger.info('SF133 {} {} already in database ({} records). Skipping file.'.format(
                    fiscal_year, fiscal_period, existing_count))
                return

        data = pd.read_csv(filename, dtype=str)
        # NOTE(review): no entry of the field map below yields the 'amount'
        # column that the options dict and the pivot rely on; confirm how the
        # incoming amount column is named/mapped.
        data = LoaderUtils.cleanData(
            data,
            SF133,
            {"ata": "allocation_transfer_agency",
             "aid": "agency_identifier",
             "availability_type_code": "availability_type_code",
             "bpoa": "beginning_period_of_availa",
             "epoa": "ending_period_of_availabil",
             "main_account": "main_account_code",
             "sub_account": "sub_account_code",
             "fiscal_year": "fiscal_year",
             "period": "period",
             "line_num": "line"},
            {"allocation_transfer_agency": {"pad_to_length": 3},
             "agency_identifier": {"pad_to_length": 3},
             "main_account_code": {"pad_to_length": 4},
             "sub_account_code": {"pad_to_length": 3},
             # next 3 lines handle the TAS fields that shouldn't
             # be padded but should still be empty spaces rather
             # than NULLs. this ensures that the downstream pivot & melt
             # (which insert the missing 0-value SF-133 lines)
             # will work as expected (values used in the pivot
             # index cannot be NULL).
             # the "pad_to_length: 0" works around the fact
             # that sometimes the incoming data for these columns
             # is a single space and sometimes it is blank/NULL.
             "beginning_period_of_availa": {"pad_to_length": 0},
             "ending_period_of_availabil": {"pad_to_length": 0},
             "availability_type_code": {"pad_to_length": 0},
             "amount": {"strip_commas": True}}
        )

        # todo: find out how to handle dup rows (e.g., same tas/period/line number)
        # line numbers 2002 and 2102 are the only duped SF 133 report line numbers,
        # and they are not used by the validation rules, so for now
        # just remove them before loading our SF-133 table
        dupe_line_numbers = ['2002', '2102']
        data = data[~data.line.isin(dupe_line_numbers)]

        # add concatenated TAS field for internal use (i.e., joining to staging tables)
        data['tas'] = data.apply(format_internal_tas, axis=1)

        # incoming .csv does not always include rows for zero-value SF-133 lines
        # so we add those here because they're needed for the SF-133 validations.
        # 1. "pivot" the sf-133 dataset to explode it horizontally, creating one
        # row for each tas/fiscal year/period, with columns for each SF-133 line.
        # the "fill_value=0" parameter puts a 0 into any SF-133 line number cell
        # with a missing value for a specific tas/fiscal year/period.
        # 2. Once the zeroes are filled in, "melt" the pivoted data back to its normal
        # format of one row per tas/fiscal year/period/line.
        # NOTE: fields used for the pivot in step #1 (i.e., items in pivot_idx) cannot
        # have NULL values, else they will be silently dropped by pandas :(
        pivot_idx = ['created_at', 'updated_at', 'agency_identifier', 'allocation_transfer_agency',
                     'availability_type_code', 'beginning_period_of_availa', 'ending_period_of_availabil',
                     'main_account_code', 'sub_account_code', 'tas', 'fiscal_year', 'period']
        data.amount = data.amount.astype(float)
        data = pd.pivot_table(data, values='amount', index=pivot_idx, columns=['line'], fill_value=0).reset_index()
        data = pd.melt(data, id_vars=pivot_idx, value_name='amount')

        # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones
        # we don't actually use in the validations. Arguably, it would be better just to include
        # everything, but that drastically increases the number of records we're inserting to the
        # sf_133 table. If we ever decide that we need *all* SF 133 lines that are zero value,
        # remove the next two statements.
        sf_133_validation_lines = [
            '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022',
            '1023', '1024', '1025', '1026', '1029', '1030', '1031', '1032',
            '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280',
            '1340', '1440', '1540', '1640', '1750', '1850', '1910', '2190',
            '2490', '2500', '3020', '4801', '4802', '4881', '4882', '4901',
            '4902', '4908', '4981', '4982'
        ]
        data = data[(data.line.isin(sf_133_validation_lines)) | (data.amount != 0)]

        # we didn't use the 'keep_null' option when padding allocation transfer agency,
        # because nulls in that column break the pivot (see above comments).
        # so, replace the ata '000' with an empty value before inserting to db
        data['allocation_transfer_agency'] = data['allocation_transfer_agency'].str.replace('000', '')
        # make a pass through the dataframe, changing any empty values to None, to ensure
        # that those are represented as NULL in the db.
        data = data.applymap(lambda x: str(x).strip() if len(str(x).strip()) else None)

        # insert to db
        table_name = SF133.__table__.name
        num = LoaderUtils.insertDataframe(data, table_name, sess.connection())

    logger.info('{} records inserted to {}'.format(num, table_name))