def parse_sam_file(file, sess):
    logger.info("starting file " + str(file.name))

    csv_file = os.path.splitext(os.path.basename(file.name))[0]+'.dat'
    zfile = zipfile.ZipFile(file.name)

    # can't use skipfooter: pandas' C engine doesn't support it, and the Python engine doesn't support dtype
    nrows = 0
    with zfile.open(csv_file) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer
    column_header_mapping = {
        "awardee_or_recipient_uniqu": 0,
        "sam_extract": 4,
        "expiration_date": 7,
        "activation_date": 9,
        "ultimate_parent_legal_enti": 10,
        "ultimate_parent_unique_ide": 48,
        "exec_comp_str": 89
    }
    column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))
    csv_data = pd.read_csv(zfile.open(csv_file), dtype=str, header=None, skiprows=1, nrows=nrows, sep='|',
                           usecols=column_header_mapping_ordered.values(), names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()

    # skipping when sam_extract == '4' as it's expired
    total_data = total_data[total_data.sam_extract != '4']

    # parse out executive compensation from the exec_comp_str column (column 90 in the file)
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = total_data["exec_comp_str"].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del total_data["exec_comp_str"]
    total_data = total_data.join(parsed_data)

    # split into 3 dataframes based on the sam_extract value ('1', '2', '3')
    delete_data = total_data[total_data.sam_extract == '1'].replace(np.nan, "", regex=True)
    add_data = total_data[total_data.sam_extract == '2'].replace(np.nan, "", regex=True)
    update_data = total_data[total_data.sam_extract == '3'].replace(np.nan, "", regex=True)
    for dataframe in [add_data, update_data, delete_data, total_data]:
        del dataframe["sam_extract"]

    table_name = ExecutiveCompensation.__table__.name
    insert_dataframe(add_data, table_name, sess.connection())
    for _, row in update_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            update(row, synchronize_session=False)
    for _, row in delete_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            delete(synchronize_session=False)
    sess.commit()
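# Illustrative sketch (not part of the original loader): the split above keys
# off sam_extract, where '1' = delete, '2' = add, '3' = update and '4' = expired.
# A minimal, self-contained version of that split, assuming only pandas/numpy
# and a hypothetical toy frame, might look like this.
import numpy as np
import pandas as pd


def split_by_sam_extract(frame):
    """Return (delete, add, update) frames, dropping expired ('4') rows."""
    frame = frame[frame.sam_extract != '4']
    delete_data = frame[frame.sam_extract == '1'].replace(np.nan, "", regex=True)
    add_data = frame[frame.sam_extract == '2'].replace(np.nan, "", regex=True)
    update_data = frame[frame.sam_extract == '3'].replace(np.nan, "", regex=True)
    return delete_data, add_data, update_data


# example usage (toy data, purely hypothetical):
# toy = pd.DataFrame({"awardee_or_recipient_uniqu": ["a", "b", "c", "d"],
#                     "sam_extract": ["1", "2", "3", "4"]})
# deletes, adds, updates = split_by_sam_extract(toy)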
def load_object_class(filename):
    """Load object class lookup table."""
    model = ObjectClass

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for object class, delete and replace values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data, model, {
                "max_oc_code": "object_class_code",
                "max_object_class_name": "object_class_name"
            }, {"object_class_code": {
                "pad_to_length": 3
            }})
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
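# Illustrative sketch: clean_data above renames columns and applies a
# "pad_to_length" rule so object_class_code is always three characters.
# clean_data itself is a broker helper defined elsewhere; the stand-in below
# only mirrors the padding step and is an assumption about its behaviour.
import pandas as pd


def pad_to_length(series, length):
    """Zero-pad string codes, e.g. '10' -> '010' for length=3."""
    return series.astype(str).str.zfill(length)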
def parse_county_file(county_file, sess):
    # read the data and clean up the column names
    data = pd.read_csv(county_file, dtype=str, sep="|")
    data = clean_data(
        data, {
            "COUNTY_NUMERIC": "county_number",
            "COUNTY_NAME": "county_name",
            "STATE_ALPHA": "state_code"
        })

    # remove all blank county_number rows; they're of no use in the county code table
    data = data[pd.notnull(data['county_number'])]

    # remove duplicates because we have no use for them (there may be none, this is a precaution)
    data = data[
        ~data.duplicated(subset=['county_number', 'state_code'], keep='first')]

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, CountyCode.__table__.name, sess.connection())
    logger.info('{} records inserted to county_code'.format(num))
    sess.commit()
def load_sf133(filename, fiscal_year, fiscal_period, force_sf133_load=False):
    """Load SF 133 (budget execution report) lookup table."""

    with create_app().app_context():
        sess = GlobalDB.db().session

        existing_records = sess.query(SF133).filter(
            SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
        if force_sf133_load:
            # force a reload of this period's current data
            logger.info(
                'Force SF 133 load: deleting existing records for %s %s',
                fiscal_year, fiscal_period)
            delete_count = existing_records.delete()
            logger.info('%s records deleted', delete_count)
        elif existing_records.count():
            # if there's existing data & we're not forcing a load, skip
            logger.info(
                'SF133 %s %s already in database (%s records). Skipping file.',
                fiscal_year, fiscal_period, existing_records.count())
            return

        data = clean_sf133_data(filename, SF133)

        # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones
        # we don't actually use in the validations. Arguably, it would be better just to include
        # everything, but that drastically increases the number of records we're inserting to the
        # sf_133 table. If we ever decide that we need *all* SF 133 lines that are zero value,
        # remove the next two lines.
        sf_133_validation_lines = [
            '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022',
            '1023', '1024', '1025', '1026', '1029', '1030', '1031', '1032',
            '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280',
            '1340', '1440', '1540', '1640', '1750', '1850', '1910', '2190',
            '2490', '2500', '3020', '4801', '4802', '4881', '4882', '4901',
            '4902', '4908', '4981', '4982'
        ]
        data = data[(data.line.isin(sf_133_validation_lines)) |
                    (data.amount != 0)]

        # we didn't use the 'keep_null' option when padding allocation transfer agency,
        # because nulls in that column break the pivot (see above comments).
        # so, replace the ata '000' with an empty value before inserting to db
        data['allocation_transfer_agency'] = data[
            'allocation_transfer_agency'].str.replace('000', '')
        # make a pass through the dataframe, changing any empty values to None, to ensure
        # that those are represented as NULL in the db.
        data = data.applymap(lambda x: str(x).strip()
                             if len(str(x).strip()) else None)

        # insert to db
        table_name = SF133.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        update_tas_id(int(fiscal_year), int(fiscal_period))
        sess.commit()

    logger.info('%s records inserted to %s', num, table_name)
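# Illustrative sketch: the two post-processing steps above keep a row when its
# line is in the validation set or its amount is non-zero, then turn empty
# strings into None so they load as NULL. A standalone version under those
# assumptions (column names mirror the SF 133 frame):
import pandas as pd


def trim_sf133(frame, validation_lines):
    frame = frame[(frame.line.isin(validation_lines)) | (frame.amount != 0)]
    return frame.applymap(
        lambda x: str(x).strip() if len(str(x).strip()) else None)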
def update_state_congr_table_census(census_file, sess):
    logger.info("Adding congressional districtions from census to the state_congressional table")

    data = pd.read_csv(census_file, dtype=str)
    model = StateCongressional

    data = clean_data(
        data,
        model,
        {"state_code": "state_code",
         "congressional_district_no": "congressional_district_no",
         "census_year": "census_year"},
        {'congressional_district_no': {"pad_to_length": 2}}
    )

    table_name = model.__table__.name
    insert_dataframe(data, table_name, sess.connection())
    sess.commit()
def parse_fabs_file(f, sess, fips_state_list, state_code_list, sub_tier_list,
                    county_code_list):
    logger.info("starting file " + str(f.name))

    csv_file = 'datafeeds\\' + os.path.splitext(os.path.basename(f.name))[0]
    zfile = zipfile.ZipFile(f.name)
    data = pd.read_csv(
        zfile.open(csv_file),
        dtype=str,
        usecols=[
            'cfda_program_num', 'sai_number', 'recipient_name',
            'recipient_city_code', 'recipient_city_name',
            'recipient_county_code', 'recipient_county_name', 'recipient_zip',
            'recipient_type', 'action_type', 'agency_code', 'federal_award_id',
            'federal_award_mod', 'fed_funding_amount',
            'non_fed_funding_amount', 'total_funding_amount',
            'obligation_action_date', 'starting_date', 'ending_date',
            'assistance_type', 'record_type', 'correction_late_ind',
            'fyq_correction', 'principal_place_code', 'principal_place_state',
            'principal_place_cc', 'principal_place_country_code',
            'principal_place_zip', 'principal_place_cd', 'cfda_program_title',
            'project_description', 'duns_no', 'receip_addr1', 'receip_addr2',
            'receip_addr3', 'face_loan_guran', 'orig_sub_guran',
            'recipient_cd', 'rec_flag', 'recipient_country_code', 'uri',
            'recipient_state_code', 'last_modified_date'
        ])

    clean_data = format_fabs_data(data, sess, fips_state_list, state_code_list,
                                  sub_tier_list, county_code_list)

    if clean_data is not None:
        logger.info("loading {} rows".format(len(clean_data.index)))

        insert_dataframe(clean_data,
                         PublishedAwardFinancialAssistance.__table__.name,
                         sess.connection())
        sess.commit()
def parse_fabs_file(f, sess):
    logger.info("starting file " + str(f.name))

    csv_file = 'datafeeds\\' + os.path.splitext(os.path.basename(f.name))[0]
    zfile = zipfile.ZipFile(f.name)
    data = pd.read_csv(zfile.open(csv_file), dtype=str, usecols=[
        'cfda_program_num', 'sai_number', 'recipient_name', 'recipient_city_code', 'recipient_city_name',
        'recipient_county_code', 'recipient_county_name', 'recipient_zip', 'recipient_type', 'action_type',
        'agency_code', 'federal_award_id', 'federal_award_mod', 'fed_funding_amount', 'non_fed_funding_amount',
        'total_funding_amount', 'obligation_action_date', 'starting_date', 'ending_date', 'assistance_type',
        'record_type', 'correction_late_ind', 'fyq_correction', 'principal_place_code', 'principal_place_state',
        'principal_place_cc', 'principal_place_country_code', 'principal_place_zip', 'principal_place_cd',
        'cfda_program_title', 'project_description', 'duns_no', 'receip_addr1', 'receip_addr2', 'receip_addr3',
        'face_loan_guran', 'orig_sub_guran', 'recipient_cd', 'rec_flag', 'recipient_country_code', 'uri',
        'recipient_state_code', 'last_modified_date'
    ])

    clean_data = format_fabs_data(data)

    if clean_data is not None:
        logger.info("loading {} rows".format(len(clean_data.index)))

        insert_dataframe(clean_data, PublishedAwardFinancialAssistance.__table__.name, sess.connection())
        sess.commit()
def parse_state_file(state_file, sess):
    # read the data; cleaning is included in case the format changes, though it does little right now
    data = pd.read_csv(state_file, dtype=str)
    data = clean_data(
        data,
        {"state_name": "state_name",
         "state_code": "state_code"})

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, States.__table__.name, sess.connection())
    logger.info('{} records inserted to states'.format(num))
    sess.commit()
def load_program_activity(filename):
    """Load program activity lookup table."""
    model = ProgramActivity

    with create_app().app_context():
        sess = GlobalDB.db().session

        # for program activity, delete and replace values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data, model, {
                "year": "budget_year",
                "agency_id": "agency_id",
                "alloc_id": "allocation_transfer_id",
                "account": "account_number",
                "pa_code": "program_activity_code",
                "pa_name": "program_activity_name"
            }, {
                "program_activity_code": {
                    "pad_to_length": 4
                },
                "agency_id": {
                    "pad_to_length": 3
                },
                "allocation_transfer_id": {
                    "pad_to_length": 3,
                    "keep_null": True
                },
                "account_number": {
                    "pad_to_length": 4
                }
            })
        # Lowercase Program Activity Name
        data['program_activity_name'] = data['program_activity_name'].apply(
            lambda x: x.lower())
        # because we're only loading a subset of program activity info,
        # there will be duplicate records in the dataframe. this is ok,
        # but they need to be de-duped before the db load.
        data.drop_duplicates(inplace=True)
        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def parse_state_file(state_file, sess):
    # read the data; cleaning is included in case the format changes, though it does little right now
    data = pd.read_csv(state_file, dtype=str)
    data = clean_data(
        data,
        {"state_name": "state_name",
         "state_code": "state_code",
         "fips_code": "fips_code"})

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, States.__table__.name, sess.connection())
    logger.info('{} records inserted to states'.format(num))
    sess.commit()
def parse_city_file(city_file, sess):
    # read the data and clean up the column names
    data = pd.read_csv(city_file, dtype=str, sep="|")
    data = clean_data(
        data, {
            "FEATURE_NAME": "feature_name",
            "FEATURE_CLASS": "feature_class",
            "CENSUS_CODE": "city_code",
            "STATE_ALPHA": "state_code",
            "COUNTY_NUMERIC": "county_number",
            "COUNTY_NAME": "county_name",
            "PRIMARY_LATITUDE": "latitude",
            "PRIMARY_LONGITUDE": "longitude"
        })

    # add a sort column based on feature_class and remove anything with a different feature class or empty city_code
    feature_class_ranking = {
        "Populated Place": 1,
        "Locale": 2,
        "Civil": 3,
        "Census": 4
    }
    data = data[pd.notnull(data['city_code'])]
    data['sorting_col'] = data['feature_class'].map(feature_class_ranking)
    data = data[pd.notnull(data['sorting_col'])]

    # sort by feature_class then remove any duplicates within state/city code combo (we keep the first occurrence
    # because we've sorted by priority so the one that would overwrite the others is on top already)
    data = data.sort_values(by=['sorting_col'])
    data = data[~data.
                duplicated(subset=['state_code', 'city_code'], keep='first')]
    data = data.drop('sorting_col', axis=1)

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # restore the original sort order by feature_name
    data = data.sort_values(by=['feature_name'])

    # insert data into table
    num = insert_dataframe(data, CityCode.__table__.name, sess.connection())
    logger.info('{} records inserted to city_code'.format(num))
    sess.commit()
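# Illustrative sketch: the ranking-based de-dup above keeps, for each
# state/city code pair, the row whose feature_class has the highest priority.
# A compact stand-alone version of the same logic, with hypothetical inputs:
import pandas as pd


def keep_highest_priority(frame, ranking):
    frame = frame[pd.notnull(frame['city_code'])].copy()
    frame['sorting_col'] = frame['feature_class'].map(ranking)
    frame = frame[pd.notnull(frame['sorting_col'])]
    frame = frame.sort_values(by=['sorting_col'])
    frame = frame[~frame.duplicated(subset=['state_code', 'city_code'], keep='first')]
    return frame.drop('sorting_col', axis=1)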
def parse_county_file(county_file, sess):
    # read the data and clean up the column names
    data = pd.read_csv(county_file, dtype=str, sep="|")
    data = clean_data(
        data,
        {"COUNTY_NUMERIC": "county_number",
         "COUNTY_NAME": "county_name",
         "STATE_ALPHA": "state_code"})

    # remove all blank county_number rows; they're of no use in the county code table
    data = data[pd.notnull(data['county_number'])]

    # remove duplicates because we have no use for them (there may be none, this is a precaution)
    data = data[~data.duplicated(subset=['county_number', 'state_code'], keep='first')]

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # insert data into table
    num = insert_dataframe(data, CountyCode.__table__.name, sess.connection())
    logger.info('{} records inserted to county_code'.format(num))
    sess.commit()
def parse_city_file(city_file, sess):
    # read the data and clean up the column names
    data = pd.read_csv(city_file, dtype=str, sep="|")
    data = clean_data(
        data,
        {"FEATURE_NAME": "feature_name",
         "FEATURE_CLASS": "feature_class",
         "CENSUS_CODE": "city_code",
         "STATE_ALPHA": "state_code",
         "COUNTY_NUMERIC": "county_number",
         "COUNTY_NAME": "county_name",
         "PRIMARY_LATITUDE": "latitude",
         "PRIMARY_LONGITUDE": "longitude"})

    # add a sort column based on feature_class and remove anything with a different feature class or empty city_code
    feature_class_ranking = {"Populated Place": 1, "Locale": 2, "Civil": 3, "Census": 4}
    data = data[pd.notnull(data['city_code'])]
    data['sorting_col'] = data['feature_class'].map(feature_class_ranking)
    data = data[pd.notnull(data['sorting_col'])]

    # sort by feature_class then remove any duplicates within state/city code combo (we keep the first occurrence
    # because we've sorted by priority so the one that would overwrite the others is on top already)
    data = data.sort_values(by=['sorting_col'])
    data = data[~data.duplicated(subset=['state_code', 'city_code'], keep='first')]
    data = data.drop('sorting_col', axis=1)

    # add created_at and updated_at columns
    now = datetime.utcnow()
    data = data.assign(created_at=now, updated_at=now)

    # restore the original sort order by feature_name
    data = data.sort_values(by=['feature_name'])

    # insert data into table
    num = insert_dataframe(data, CityCode.__table__.name, sess.connection())
    logger.info('{} records inserted to city_code'.format(num))
    sess.commit()
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args
            base_path: directory that contains the domain values files.
    """

    if CONFIG_BROKER["use_aws"]:
        s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
        s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
        filename = s3bucket.get_key("country_codes.csv").generate_url(expires_in=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for country codes, delete and replace values
        sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            CountryCode,
            {"country_code": "country_code", "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
        s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
        filename = s3bucket.get_key("object_class.csv").generate_url(
            expires_in=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data, ObjectClass, {
                "max_oc_code": "object_class_code",
                "max_object_class_name": "object_class_name"
            }, {"object_class_code": {
                "pad_to_length": 3
            }})
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False):
    parse_start_time = time.time()
    logger.info("Starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "registration_date": 6,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to count rows and, possibly, which DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = nrows//block_size
        # the last block's nrows excludes the footer (and the header too when it's also the first block)
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = (nrows % block_size)-skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch*block_size)
            nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size
            logger.info('Loading rows %s to %s', skiprows+1, nrows+skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys(), quoting=3)

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None))

                    if monthly:
                        logger.info("Adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        insert_dataframe(csv_data, DUNS.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("Attempting to bulk load add data")
                                insert_dataframe(add_data, DUNS.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("Bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, benchmarks=benchmarks)
                                logger.info("Loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, benchmarks=benchmarks)
                            logger.info("Loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks)
                    sess.commit()

            added_rows += nrows
            batch += 1
            logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time,
                                                                         added_rows))
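# Illustrative sketch: the batching arithmetic above is the trickiest part of
# this loader; the first block skips the header, the last block's nrows excludes
# the footer, and both adjustments apply when the file fits in a single block.
# Pulled out on its own (an illustration, not broker code), it reads:
def batch_bounds(total_rows, block_size=10000):
    """Yield (skiprows, nrows) pairs matching the loop in parse_sam_file."""
    batches = total_rows // block_size
    skiplastrows = 2 if batches == 0 else 1
    last_block_size = (total_rows % block_size) - skiplastrows
    batch = 0
    while batch <= batches:
        skiprows = 1 if batch == 0 else batch * block_size
        nrows = ((batch + 1) * block_size - skiprows) if batch < batches else last_block_size
        yield skiprows, nrows
        batch += 1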
def parse_sam_file(file, sess):
    logger.info("starting file " + str(file.name))

    csv_file = os.path.splitext(os.path.basename(file.name))[0] + '.dat'
    zfile = zipfile.ZipFile(file.name)

    # can't use skipfooter: pandas' C engine doesn't support it, and the Python engine doesn't support dtype
    nrows = 0
    with zfile.open(csv_file) as f:
        nrows = len(f.readlines()) - 2  # subtract the header and footer
    column_header_mapping = {
        "awardee_or_recipient_uniqu": 0,
        "sam_extract": 4,
        "expiration_date": 7,
        "activation_date": 9,
        "ultimate_parent_legal_enti": 10,
        "ultimate_parent_unique_ide": 48,
        "exec_comp_str": 89
    }
    column_header_mapping_ordered = OrderedDict(
        sorted(column_header_mapping.items(), key=lambda c: c[1]))
    csv_data = pd.read_csv(zfile.open(csv_file),
                           dtype=str,
                           header=None,
                           skiprows=1,
                           nrows=nrows,
                           sep='|',
                           usecols=column_header_mapping_ordered.values(),
                           names=column_header_mapping_ordered.keys())
    total_data = csv_data.copy()

    # skipping when sam_extract == '4' as it's expired
    total_data = total_data[total_data.sam_extract != '4']

    # parse out executive compensation from the exec_comp_str column (column 90 in the file)
    lambda_func = (lambda ecs: pd.Series(list(parse_exec_comp(ecs).values())))
    parsed_data = total_data["exec_comp_str"].apply(lambda_func)
    parsed_data.columns = list(parse_exec_comp().keys())
    del total_data["exec_comp_str"]
    total_data = total_data.join(parsed_data)

    # split into 3 dataframes based on the sam_extract value ('1', '2', '3')
    delete_data = total_data[total_data.sam_extract == '1'].replace(np.nan,
                                                                    "",
                                                                    regex=True)
    add_data = total_data[total_data.sam_extract == '2'].replace(np.nan,
                                                                 "",
                                                                 regex=True)
    update_data = total_data[total_data.sam_extract == '3'].replace(np.nan,
                                                                    "",
                                                                    regex=True)
    for dataframe in [add_data, update_data, delete_data, total_data]:
        del dataframe["sam_extract"]

    table_name = ExecutiveCompensation.__table__.name
    insert_dataframe(add_data, table_name, sess.connection())
    for _, row in update_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            update(row, synchronize_session=False)
    for _, row in delete_data.iterrows():
        sess.query(ExecutiveCompensation).filter_by(awardee_or_recipient_uniqu=row['awardee_or_recipient_uniqu']).\
            delete(synchronize_session=False)
    sess.commit()
def load_cfda_program(filename):
    """Load country code lookup table."""
    model = CFDAProgram

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for CFDA programs, delete and replace values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str, encoding='latin1')

        data = clean_data(
            data, model, {
                "program_title": "program_title",
                "program_number": "program_number",
                "popular_name_(020)": "popular_name",
                "federal_agency_(030)": "federal_agency",
                "authorization_(040)": "authorization",
                "objectives_(050)": "objectives",
                "types_of_assistance_(060)": "types_of_assistance",
                "uses_and_use_restrictions_(070)": "uses_and_use_restrictions",
                "applicant_eligibility_(081)": "applicant_eligibility",
                "beneficiary_eligibility_(082)": "beneficiary_eligibility",
                "credentials/documentation_(083)": "credentials_documentation",
                "preapplication_coordination_(091)":
                "preapplication_coordination",
                "application_procedures_(092)": "application_procedures",
                "award_procedure_(093)": "award_procedure",
                "deadlines_(094)": "deadlines",
                "range_of_approval/disapproval_time_(095)":
                "range_of_approval_disapproval_time",
                "appeals_(096)": "appeals",
                "renewals_(097)": "renewals",
                "formula_and_matching_requirements_(101)":
                "formula_and_matching_requirements",
                "length_and_time_phasing_of_assistance_(102)":
                "length_and_time_phasing_of_assistance",
                "reports_(111)": "reports",
                "audits_(112)": "audits",
                "records_(113)": "records",
                "account_identification_(121)": "account_identification",
                "obligations_(122)": "obligations",
                "range_and_average_of_financial_assistance_(123)":
                "range_and_average_of_financial_assistance",
                "program_accomplishments_(130)": "program_accomplishments",
                "regulations__guidelines__and_literature_(140)":
                "regulations_guidelines_and_literature",
                "regional_or_local_office_(151)": "regional_or_local_office",
                "headquarters_office_(152)": "headquarters_office",
                "website_address_(153)": "website_address",
                "related_programs_(160)": "related_programs",
                "examples_of_funded_projects_(170)":
                "examples_of_funded_projects",
                "criteria_for_selecting_proposals_(180)":
                "criteria_for_selecting_proposals",
                "url": "url",
                "recovery": "recovery",
                "omb_agency_code": "omb_agency_code",
                "omb_bureau_code": "omb_bureau_code",
                "published_date": "published_date",
                "archived_date": "archived_date"
            }, {})
        data["published_date"] = format_date(data["published_date"])
        data["archived_date"] = format_date(data["archived_date"])

        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
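# Illustrative sketch: format_date is a helper defined elsewhere in this
# module. A plausible stand-in under that assumption (not the broker
# implementation) parses the column with pandas and coerces anything
# unparseable to NaT:
import pandas as pd


def format_date_sketch(column):
    return pd.to_datetime(column, errors='coerce')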
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False):
    parse_start_time = time.time()
    logger.info("starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to count rows and, possibly, which DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = nrows//block_size
        # the last block's nrows excludes the footer (and the header too when it's also the first block)
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = (nrows % block_size)-skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch*block_size)
            nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size
            logger.info('loading rows %s to %s', skiprows+1, nrows+skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys())

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None))

                    if monthly:
                        logger.info("adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        insert_dataframe(csv_data, DUNS.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("attempting to bulk load add data")
                                insert_dataframe(add_data, DUNS.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, benchmarks=benchmarks)
                                logger.info("loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, benchmarks=benchmarks)
                            logger.info("loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks)
                    sess.commit()

            added_rows += nrows
            batch += 1
            logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time,
                                                                         added_rows))
def load_program_activity_data(base_path):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
    """
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()):
        return

    program_activity_file = get_program_activity_file(base_path)

    logger.info('Loading program activity: ' + PA_FILE_NAME)

    with create_app().app_context():
        sess = GlobalDB.db().session
        try:
            data = pd.read_csv(program_activity_file, dtype=str)
        except pd.io.common.EmptyDataError as e:
            log_blank_file()
            sys.exit(4)  # exit code chosen arbitrarily, to indicate distinct failure states

        headers = set([header.upper() for header in list(data)])

        if not VALID_HEADERS.issubset(headers):
            logger.error(
                "Missing required headers. Required headers include: %s" %
                str(VALID_HEADERS))
            sys.exit(4)

        try:
            dropped_count, data = clean_data(
                data, ProgramActivity, {
                    "fyq": "fiscal_year_quarter",
                    "agency_code": "agency_id",
                    "allocation_id": "allocation_transfer_id",
                    "account_code": "account_number",
                    "pa_code": "program_activity_code",
                    "pa_title": "program_activity_name"
                }, {
                    "program_activity_code": {
                        "pad_to_length": 4
                    },
                    "agency_id": {
                        "pad_to_length": 3
                    },
                    "allocation_transfer_id": {
                        "pad_to_length": 3,
                        "keep_null": True
                    },
                    "account_number": {
                        "pad_to_length": 4
                    }
                }, [
                    "agency_id", "program_activity_code", "account_number",
                    "program_activity_name"
                ], True)
        except FailureThresholdExceededException as e:
            if e.count == 0:
                log_blank_file()
                sys.exit(4)
            else:
                count_str = "Application tried to drop {} rows".format(e.count)
                logger.error(
                    "Loading of program activity file failed due to exceeded failure threshold. "
                    + count_str)
                sys.exit(5)

        sess.query(ProgramActivity).delete()

        # Lowercase Program Activity Name
        data['program_activity_name'] = data['program_activity_name'].apply(
            lambda x: lowercase_or_notify(x))

        # because we're only loading a subset of program activity info,
        # there will be duplicate records in the dataframe. this is ok,
        # but they need to be de-duped before the db load. We also need to log them.
        base_count = data.shape[0]
        data.drop_duplicates(inplace=True)
        logger.info("Dropped {} duplicate rows.".format(base_count -
                                                        data.shape[0]))

        # insert to db
        table_name = ProgramActivity.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    set_stored_pa_last_upload(last_upload)
    logger.info('{} records inserted to {}'.format(num, table_name))

    if dropped_count > 0:
        sys.exit(3)
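# Illustrative sketch: the header check above uppercases the incoming column
# names and requires VALID_HEADERS (defined elsewhere in the module) to be a
# subset. A standalone version, with a placeholder header set inferred from the
# clean_data mapping above:
REQUIRED_HEADERS = {"FYQ", "AGENCY_CODE", "ALLOCATION_ID",
                    "ACCOUNT_CODE", "PA_CODE", "PA_TITLE"}


def has_required_headers(frame, required=REQUIRED_HEADERS):
    headers = {header.upper() for header in list(frame)}
    return required.issubset(headers)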