def load_state_data(force_reload):
    """ Load data into the States table

        Args:
            force_reload: boolean to determine if the reload should happen even when no differences are found
    """
    start_time = datetime.now()
    state_file_url = '{}/state_list.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    with RetrieveFileFromUri(state_file_url, 'r').get_file_object() as state_file:
        new_data = parse_state_file(state_file)

    diff_found = check_dataframe_diff(new_data, States, ['states_id'], ['state_code'])

    if force_reload or diff_found:
        sess = GlobalDB.db().session
        logger.info('Differences found or reload forced, reloading states table.')
        # delete any data in the States table
        sess.query(States).delete()

        # insert data into table
        num = insert_dataframe(new_data, States.__table__.name, sess.connection())
        logger.info('{} records inserted to states'.format(num))
        sess.commit()
        update_external_data_load_date(start_time, datetime.now(), 'state_code')
    else:
        logger.info('No differences found, skipping states table reload.')
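# A minimal invocation sketch for the loader above. The CLI wiring is not shown in this listing, so the entry
# point here is an assumption; the app-context pattern mirrors the other loaders below.
if __name__ == '__main__':
    configure_logging()
    with create_app().app_context():
        load_state_data(force_reload=False)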
def load_agency_data(base_path, force_reload=False):
    """ Load agency data into the database

        Args:
            base_path: directory that contains the agency files
            force_reload: whether to reload the agency data regardless of whether differences are found
    """
    start_time = datetime.now()
    agency_codes_url = '{}/agency_codes.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
    logger.info('Loading agency codes file from {}'.format(agency_codes_url))

    # Get data from public S3 bucket
    r = requests.get(agency_codes_url, allow_redirects=True)
    agency_codes_file = os.path.join(base_path, 'agency_codes.csv')
    open(agency_codes_file, 'wb').write(r.content)

    # Parse data
    with create_app().app_context():
        logger.info('Loading CGAC')
        new_cgac = load_cgac(agency_codes_file, force_reload=force_reload)
        logger.info('Loading FREC')
        new_frec = load_frec(agency_codes_file, force_reload=force_reload)
        logger.info('Loading Sub Tier Agencies')
        new_sub_tier = load_sub_tier_agencies(agency_codes_file, force_reload=force_reload)

        # If we've updated the data at all, update the external data load date
        if new_cgac or new_frec or new_sub_tier:
            update_external_data_load_date(start_time, datetime.now(), 'agency')

    # Delete file once we're done
    os.remove(agency_codes_file)
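# The download step above writes the response body without a context manager and without checking the HTTP
# status. A hedged standalone sketch of the same step; download_reference_csv, the URL, and the target path are
# illustrative names, not part of the broker code.
import os

import requests


def download_reference_csv(url, target_dir, filename):
    """ Download a public reference CSV into target_dir and return its local path. """
    local_path = os.path.join(target_dir, filename)
    response = requests.get(url, allow_redirects=True, timeout=60)
    response.raise_for_status()
    with open(local_path, 'wb') as out_file:
        out_file.write(response.content)
    return local_path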
Example #3
def test_set_get_pa_last_upload_existing(monkeypatch, database):
    """ Test the last upload date/time retrieval """

    monkeypatch.setattr(load_program_activity, 'CONFIG_BROKER',
                        {'use_aws': False})
    add_relevant_data_types(database.session)

    # test epoch timing
    stored_date = load_program_activity.get_stored_pa_last_upload()
    expected_date = datetime.datetime(1970, 1, 1, 0, 0, 0)
    assert stored_date == expected_date

    update_external_data_load_date(datetime.datetime(2017, 12, 31, 0, 0, 0),
                                   datetime.datetime(2017, 12, 31, 0, 0, 0),
                                   'program_activity_upload')

    stored_date = load_program_activity.get_stored_pa_last_upload()
    expected_date = datetime.datetime(2017, 12, 31, 0, 0, 0)
    assert stored_date == expected_date

    # Repeat this: the first time there was no stored object, so now test against one that already exists.
    update_external_data_load_date(datetime.datetime(2016, 12, 31, 0, 0, 0),
                                   datetime.datetime(2016, 12, 31, 0, 0, 0),
                                   'program_activity_upload')

    stored_date = load_program_activity.get_stored_pa_last_upload()
    expected_date = datetime.datetime(2016, 12, 31, 0, 0, 0)
    assert stored_date == expected_date

    remove_metrics_file()
Example #4
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_object_class.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }

    filename = os.path.join(base_path, 'object_class.csv')
    try:
        # Update file from public S3 bucket
        object_class_url = '{}/object_class.csv'.format(CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(object_class_url, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    except Exception:
        # Swallow download errors and fall back to any existing local copy of the file
        pass

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        metrics_json['records_deleted'] = sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            ObjectClass,
            {"max_oc_code": "object_class_code", "max_object_class_name": "object_class_name"},
            {"object_class_code": {"pad_to_length": 3}}
        )
        metrics_json['records_received'] = len(data.index)
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json['records_received'] - len(data.index)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
    metrics_json['records_inserted'] = num

    update_external_data_load_date(now, datetime.datetime.now(), 'object_class')

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_object_class_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
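# A small standalone illustration of the de-dupe accounting above: records_received minus the post-drop row
# count gives duplicates_dropped. The sample codes are placeholders.
import pandas as pd

sample = pd.DataFrame({'object_class_code': ['010', '010', '020']})
records_received = len(sample.index)
sample.drop_duplicates(subset=['object_class_code'], inplace=True)
print(records_received - len(sample.index))  # -> 1 duplicate dropped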
Example #5
def load_all_sf133(sf133_path=None, force_sf133_load=False, aws_prefix='sf_133', fix_links=True,
                   update_tas_fields=True):
    """ Load any SF-133 files that are not yet in the database and fix any broken links

        Args:
            sf133_path: path to the SF133 files
            force_sf133_load: boolean to indicate whether to force a reload of the data
            aws_prefix: prefix to filter which files to pull from AWS
            fix_links: fix any SF133 records not linked to TAS data
            update_tas_fields: rederive SF133 records if the associated TAS record has been updated
    """
    now = datetime.now()
    metrics_json = {
        'script_name': 'load_sf133.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_inserted': 0
    }

    with create_app().app_context():
        sess = GlobalDB.db().session

        # get a list of SF 133 files to load
        sf133_list = get_sf133_list(sf133_path, aws_prefix=aws_prefix)
        sf_re = re.compile(r'sf_133_(?P<year>\d{4})_(?P<period>\d{2})\.csv')
        for sf133 in sf133_list:
            # for each SF file, parse out fiscal year and period and call the SF 133 loader
            file_match = sf_re.match(sf133.file)
            if not file_match:
                logger.info('Skipping SF 133 file with invalid name: %s', sf133.full_file)
                continue
            logger.info('Starting %s...', sf133.full_file)
            load_sf133(sess, sf133.full_file, file_match.group('year'), file_match.group('period'),
                       force_sf133_load=force_sf133_load, metrics=metrics_json)
        if update_tas_fields:
            rederive_tas_fields(sess, metrics=metrics_json)
        if fix_links:
            fix_broken_links(sess, metrics=metrics_json)

    update_external_data_load_date(now, datetime.now(), 'gtas')

    metrics_json['duration'] = str(datetime.now() - now)

    with open('load_sf133_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info("Script complete")
Example #6
def load_defc(force_reload=False):
    """ Loads the DEFC data.

        Args:
            force_reload: boolean to determine if the reload should happen even when no differences are found
    """
    start_time = datetime.now()
    defc_file = os.path.join(CONFIG_BROKER['path'], 'dataactvalidator',
                             'config', 'def_codes.csv')

    try:
        # Update file from public S3 bucket
        def_codes_url = '{}/def_codes.csv'.format(
            CONFIG_BROKER['usas_public_reference_url'])
        r = requests.get(def_codes_url, allow_redirects=True)
        open(defc_file, 'wb').write(r.content)
    except Exception:
        # Swallow download errors and fall back to the copy bundled in the config directory
        pass

    logger.info('Loading defc data')
    with create_app().app_context():
        data = pd.read_csv(defc_file, dtype=str)

        # Remove all invalid DEFCs that have been left in the file so USAS can continue to display them correctly
        data = data[data['Is Valid'] == 'true']

        data = clean_data(data, DEFC, {
            'defc': 'code',
            'group_name': 'group'
        }, {})

        diff_found = check_dataframe_diff(data, DEFC, ['defc_id'], ['code'])

        if force_reload or diff_found:
            sess = GlobalDB.db().session
            # delete any data in the DEFC table
            sess.query(DEFC).delete()

            # insert data into table
            num = insert_dataframe(data, DEFC.__table__.name,
                                   sess.connection())
            logger.info('{} records inserted to defc'.format(num))
            sess.commit()
            update_external_data_load_date(start_time, datetime.now(), 'defc')
        else:
            logger.info('No differences found, skipping defc table reload.')
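# Standalone sketch of the 'Is Valid' filter above: the CSV is read with dtype=str, so the flag is compared
# against the string 'true' rather than a boolean. The codes here are placeholders.
import pandas as pd

sample = pd.DataFrame({'defc': ['L', 'ZZZ'], 'Is Valid': ['true', 'false']})
print(sample[sample['Is Valid'] == 'true'])  # keeps only the 'L' row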
Example #7
def main():
    now = datetime.now()
    parser = argparse.ArgumentParser(
        description='Pull data from the Federal Hierarchy API.')
    parser.add_argument('-a',
                        '--all',
                        help='Clear out the database and get historical data',
                        action='store_true')
    parser.add_argument('-f',
                        '--filename',
                        help='Generate a local CSV file from the data.',
                        nargs=1,
                        type=str)
    parser.add_argument('-o',
                        '--export_office',
                        help='Export the current office table. '
                        'Please provide the file name/path.',
                        nargs=1,
                        type=str)
    parser.add_argument('-d',
                        '--pull_date',
                        help='Date from which to start the pull',
                        nargs=1,
                        type=str)
    parser.add_argument('-i',
                        '--ignore_db',
                        help='Do not update the DB tables',
                        action='store_true')
    args = parser.parse_args()

    if args.all and args.pull_date:
        logger.error('The -a and -d flags conflict, cannot use both at once.')
        sys.exit(1)

    metrics_json = {
        'script_name': 'load_federal_hierarchy.py',
        'start_time': str(now),
        'level_1_records': 0,
        'level_2_records': 0,
        'level_3_records': 0,
        'level_4_records': 0,
        'level_5_records': 0,
        'level_6_records': 0,
        'level_7_records': 0,
        'missing_cgacs': [],
        'missing_subtier_codes': []
    }

    # Handle the pull_date parameter
    updated_date_from = None
    if args.pull_date:
        try:
            updated_date_from = args.pull_date[0]
            datetime.strptime(updated_date_from, '%Y-%m-%d')
        except ValueError:
            logger.error('The date given to the -d flag was not parseable.')
            sys.exit(1)

    # Get or create the start date
    sess = GlobalDB.db().session
    if not args.all and not updated_date_from:
        last_pull_date = sess.query(ExternalDataLoadDate.last_load_date_start).\
            filter_by(external_data_type_id=EXTERNAL_DATA_TYPE_DICT['office']).one_or_none()
        if not last_pull_date:
            logger.error(
                'The -a or -d flag must be set when there is no latest run in the database.'
            )
            sys.exit(1)
        # We want to make the date one day earlier to account for any timing weirdness between the two systems
        updated_date_from = last_pull_date[0].date() - timedelta(days=1)

    # Handle the filename parameter
    filename = args.filename[0] if args.filename else None

    # Handle the export office parameter
    export_office = args.export_office[0] if args.export_office else None

    # Handle a complete data reload
    if args.all and not args.ignore_db:
        logger.info('Emptying out the Office table for a complete reload.')
        sess.execute('''TRUNCATE TABLE office RESTART IDENTITY''')

    try:
        pull_offices(sess, filename, not args.ignore_db, args.all,
                     updated_date_from, export_office, metrics_json)
    except Exception as e:
        logger.exception(e)
        sys.exit(1)

    # find if there were any new cgacs/subtiers added
    all_cgacs = [cgac.cgac_code for cgac in sess.query(CGAC.cgac_code)]
    all_frecs = [frec.frec_code for frec in sess.query(FREC.frec_code)]
    all_subtiers = [
        subtier.sub_tier_agency_code
        for subtier in sess.query(SubTierAgency.sub_tier_agency_code)
    ]
    metrics_json['missing_cgacs'] = list(
        set(metrics_json['missing_cgacs']) - set(all_cgacs + all_frecs))
    metrics_json['missing_subtier_codes'] = list(
        set(metrics_json['missing_subtier_codes']) - set(all_subtiers))

    update_external_data_load_date(now, datetime.now(), 'office')

    metrics_json['duration'] = str(datetime.now() - now)

    with open('load_federal_hierarchy_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info('Script complete')
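# Standalone sketch of the one-day lookback above: the stored start date is pulled back 24 hours to absorb any
# timing differences between the two systems before it is used as the pull date. The sample date is arbitrary.
from datetime import datetime, timedelta

last_load_start = datetime(2024, 5, 2, 13, 30)
updated_date_from = last_load_start.date() - timedelta(days=1)
print(updated_date_from)  # -> 2024-05-01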
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return
            headers = set([header.upper() for header in list(raw_data)])

            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id', 'allocation_id': 'allocation_transfer_id',
                     'account_code': 'account_number', 'pa_code': 'program_activity_code',
                     'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # Because we're only loading a subset of program activity info, there will be duplicate records in the
            # dataframe. This is ok, but they need to be de-duped before the db load. We also need to log them.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

            if export:
                export_public_pa(raw_data)

        end_time = datetime.datetime.now()
        update_external_data_load_date(now, end_time, 'program_activity')
        update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['records_inserted'] = num

        metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        exit_if_nonlocal(3)
        return
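# Standalone sketch of the header check above: column names are upper-cased before the subset test, so the
# CSV's casing does not matter. The required set below is a placeholder, not the loader's real VALID_HEADERS.
raw_headers = ['FYQ', 'Agency_Code', 'Account_Code', 'PA_Code', 'PA_Title']
headers = set(header.upper() for header in raw_headers)
required_headers = {'FYQ', 'AGENCY_CODE', 'PA_CODE'}  # placeholder only
print(required_headers.issubset(headers))  # -> True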
Example #9
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
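# Standalone sketch of the fix_program_number helper above: the value is rounded to three decimals with
# floor(x * multiplier + 0.5), then the string form is right-padded with zeros to six characters. The helper
# below works on a bare numeric value rather than a dataframe row, purely for illustration.
import math


def fix_program_number_value(value, decimals=3):
    multiplier = 10 ** decimals
    rounded = math.floor(value * multiplier + 0.5) / multiplier
    return str(rounded).ljust(6, '0')


print(fix_program_number_value(12.3))  # -> '12.300'
print(fix_program_number_value(93.0))  # -> '93.000'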
def load_tas(backfill_historic=False):
    """ Load TAS file into broker database.

        Args:
            backfill_historic: if set to true, this will only update certain columns if budget_function_code is null
    """
    # read TAS file to dataframe, to make sure all is well with the file before firing up a db transaction
    tas_files = []
    file_loaded = False
    now = datetime.now()
    metrics_json = {
        'script_name': 'load_tas.py',
        'start_time': str(now),
        'records_updated': 0,
        'records_added': 0,
        'records_provided': 0,
        'duplicates_dropped': 0
    }

    if CONFIG_BROKER["use_aws"]:
        # Store version dictionaries in the list rather than generating all the presigned links at once, to avoid
        # working with expired AWS links
        s3connection = boto3.client('s3',
                                    region_name=CONFIG_BROKER['aws_region'])
        # list_object_versions returns the versions in reverse chronological order

        if not backfill_historic:
            # get the latest tas_file
            tas_files = [
                s3connection.generate_presigned_url(
                    ClientMethod='get_object',
                    Params={
                        'Bucket': CONFIG_BROKER['sf_133_bucket'],
                        'Key': 'cars_tas.csv'
                    },
                    ExpiresIn=600)
            ]
        else:
            tas_files = s3connection.list_object_versions(
                Bucket=CONFIG_BROKER['sf_133_bucket'],
                Prefix='cars_tas.csv')['Versions']
            # keep only the latest file from each day (hence the reversed iteration) for performance and accuracy
            tas_files_grouped = {
                tas_file['LastModified'].date(): tas_file
                for tas_file in reversed(tas_files)
            }
            # sorting them back to chronological order
            tas_files = sorted(
                [tas_file for date, tas_file in tas_files_grouped.items()],
                key=lambda k: k['LastModified'])
    elif backfill_historic:
        logger.error(
            'Unable to obtain historical versions of cars_tas without AWS access.'
        )
        return
    else:
        tas_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator",
                                "config", "cars_tas.csv")
        tas_files.append(tas_file)

    sess = GlobalDB.db().session
    for tas_file in reversed(tas_files):
        update_missing = missing_records(sess) if backfill_historic else []
        if backfill_historic and not update_missing:
            # no more missing, we're done here
            break
        if CONFIG_BROKER["use_aws"] and backfill_historic:
            # generate url from the version dict
            logger.info('Working with remote cars_tas.csv (from {})'.format(
                tas_file['LastModified']))
            tas_file = s3connection.generate_presigned_url(
                ClientMethod='get_object',
                Params={
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': 'cars_tas.csv',
                    'VersionId': tas_file['VersionId']
                },
                ExpiresIn=600)
        elif CONFIG_BROKER["use_aws"]:
            logger.info('Working with latest remote cars_tas.csv')
        else:
            logger.info('Working with local cars_tas.csv')
        update_tas_lookups(sess,
                           tas_file,
                           update_missing=update_missing,
                           metrics=metrics_json)
        file_loaded = True

    metrics_json['duration'] = str(datetime.now() - now)
    if file_loaded:
        update_external_data_load_date(now, datetime.now(), 'tas')

    with open('load_tas_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
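# Standalone sketch of the per-day version grouping above: iterating the version list oldest-first and keying a
# dict by date lets newer entries for the same day overwrite older ones, leaving one (latest) version per day,
# which is then sorted back into chronological order. The VersionIds and dates are fake.
from datetime import datetime

versions = [  # as returned by S3: reverse chronological, newest first
    {'VersionId': 'v3', 'LastModified': datetime(2024, 5, 2, 18, 0)},
    {'VersionId': 'v2', 'LastModified': datetime(2024, 5, 2, 9, 0)},
    {'VersionId': 'v1', 'LastModified': datetime(2024, 5, 1, 9, 0)},
]
grouped = {v['LastModified'].date(): v for v in reversed(versions)}
latest_per_day = sorted(grouped.values(), key=lambda k: k['LastModified'])
print([v['VersionId'] for v in latest_per_day])  # -> ['v1', 'v3']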
def load_country_codes(base_path, force_reload=False):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
            force_reload: boolean to determine if the reload should happen even when no differences are found
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_country_codes.py',
        'start_time': str(now),
        'records_deleted': 0,
        'records_provided': 0,
        'duplicates_dropped': 0,
        'records_inserted': 0
    }

    with create_app().app_context():
        sess = GlobalDB.db().session

        country_code_file = '{}/country_codes.csv'.format(
            CONFIG_BROKER['usas_public_reference_url'])

        logger.info(
            'Loading country codes file from {}'.format(country_code_file))

        # Get data from public S3 bucket
        r = requests.get(country_code_file, allow_redirects=True)
        filename = os.path.join(base_path, 'country_codes.csv')
        open(filename, 'wb').write(r.content)

        # Parse data
        data = pd.read_csv(filename, dtype=str)
        metrics_json['records_provided'] = len(data.index)
        data = clean_data(
            data, CountryCode, {
                'country_code': 'country_code',
                'country_name': 'country_name',
                '2_char_country_code': 'country_code_2_char',
                'territory_or_freely_associated_state': 'territory_free_state'
            }, {})
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        metrics_json['duplicates_dropped'] = metrics_json[
            'records_provided'] - len(data.index)

        # compare to existing content in table
        diff_found = check_dataframe_diff(data,
                                          CountryCode, ['country_code_id'],
                                          ['country_code'],
                                          lambda_funcs=[
                                              ('territory_free_state',
                                               convert_territory_bool_to_str)
                                          ])

        # insert to db if reload required
        if force_reload or diff_found:
            logger.info(
                'Differences found or reload forced, reloading country_code table.'
            )
            # if there's a difference, clear out the old data before adding the new stuff
            metrics_json['records_deleted'] = sess.query(CountryCode).delete()

            num = insert_dataframe(data, CountryCode.__table__.name,
                                   sess.connection())
            metrics_json['records_inserted'] = num
            sess.commit()

            # Updating data load dates if the load successfully added new country codes
            update_external_data_load_date(now, datetime.datetime.now(),
                                           'country_code')

            logger.info(
                '{} records inserted to country_code table'.format(num))
        else:
            logger.info(
                'No differences found, skipping country_code table reload.')

        # Delete file once we're done
        os.remove(filename)

    metrics_json['duration'] = str(datetime.datetime.now() - now)

    with open('load_country_codes_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
    logger.info('Script complete')
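# Minimal invocation sketch for the loader above; the entry-point wiring and the base_path value are
# assumptions, not taken from the broker scripts.
if __name__ == '__main__':
    configure_logging()
    load_country_codes('/tmp/broker_config', force_reload=False)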
            logger.info('Populating subaward table based off new data')
            new_procurements = (SERVICE_MODEL[PROCUREMENT].next_id(sess) > original_min_procurement_id)
            new_grants = (SERVICE_MODEL[GRANT].next_id(sess) > original_min_grant_id)
            proc_ids = list(set(updated_proc_internal_ids))
            grant_ids = list(set(updated_grant_internal_ids))

            if len(sys.argv) <= 1:
                if new_procurements:
                    sess.query(Subaward).filter(Subaward.internal_id.in_(proc_ids)).delete(synchronize_session=False)
                    populate_subaward_table(sess, PROCUREMENT, min_id=original_min_procurement_id)
                if new_grants:
                    sess.query(Subaward).filter(Subaward.internal_id.in_(grant_ids)).delete(synchronize_session=False)
                    populate_subaward_table(sess, GRANT, min_id=original_min_grant_id)
            elif args.procurement and new_procurements and args.ids:
                sess.query(Subaward).filter(Subaward.internal_id.in_(proc_ids)).delete(synchronize_session=False)
                populate_subaward_table(sess, PROCUREMENT, ids=args.ids)
            elif args.grants and new_grants and args.ids:
                sess.query(Subaward).filter(Subaward.internal_id.in_(grant_ids)).delete(synchronize_session=False)
                populate_subaward_table(sess, GRANT, ids=args.ids)

        # Deletes state mapping variable
        config_state_mappings()

        # Only change the last run date if we aren't doing specific ids to backfill
        if not args.ids:
            update_external_data_load_date(now, datetime.datetime.now(), 'subaward')
        metrics_json['duration'] = str(datetime.datetime.now() - now)

        with open('load_fsrs_metrics.json', 'w+') as metrics_file:
            json.dump(metrics_json, metrics_file)
Example #13
        'updates_received': 0,
        'deletes_received': 0,
        'added_uei': [],
        'updated_uei': [],
        'records_added': 0,
        'records_updated': 0,
        'parent_rows_updated': 0,
        'parent_update_date': None
    }

    with create_app().app_context():
        sess = GlobalDB.db().session
        if data_type in ('duns', 'both'):
            start_time = datetime.datetime.now()
            load_from_sam('DUNS', sess, historic, local, metrics=metrics, reload_date=reload_date)
            update_external_data_load_date(start_time, datetime.datetime.now(), 'recipient')
        if data_type in ('exec_comp', 'both'):
            start_time = datetime.datetime.now()
            load_from_sam('Executive Compensation', sess, historic, local, metrics=metrics, reload_date=reload_date)
            update_external_data_load_date(start_time, datetime.datetime.now(), 'executive_compensation')
        sess.close()

    metrics['records_added'] = len(set(metrics['added_uei']))
    metrics['records_updated'] = len(set(metrics['updated_uei']) - set(metrics['added_uei']))
    del metrics['added_uei']
    del metrics['updated_uei']

    logger.info('Added {} records and updated {} records'.format(metrics['records_added'], metrics['records_updated']))

    metrics['duration'] = str(datetime.datetime.now() - now)
    with open('load_duns_exec_comp_metrics.json', 'w+') as metrics_file:
        json.dump(metrics, metrics_file)
Example #14
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        start_time = datetime.now()
        sess = GlobalDB.db().session

        # Create temporary table to do work in so we don't disrupt the site for too long by altering the actual table
        sess.execute(
            'CREATE TABLE IF NOT EXISTS temp_zips (LIKE zips INCLUDING ALL);')
        sess.execute(
            'CREATE TABLE IF NOT EXISTS temp_zips_grouped (LIKE zips_grouped INCLUDING ALL);'
        )
        # Truncating in case we didn't clear out this table after a failure in the script
        sess.execute('TRUNCATE TABLE temp_zips;')
        sess.execute('TRUNCATE TABLE temp_zips_grouped;')
        # Reset the pk sequences; setval(seq, 1, false) marks them as not yet called,
        # so the next nextval() starts back at 1
        sess.execute('SELECT setval(\'zips_zips_id_seq\', 1, false);')
        sess.execute(
            'SELECT setval(\'zips_grouped_zips_grouped_id_seq\', 1, false);')
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3',
                                     region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(
                Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url(
                        'get_object', {
                            'Bucket': CONFIG_BROKER['sf_133_bucket'],
                            'Key': obj['Key']
                        },
                        ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path),
                                    sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url(
                'get_object', {
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': "ctystate.txt"
                },
                ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator",
                                     "config", CONFIG_BROKER["zip_folder"])
            # create the file list, ignoring hidden files (e.g. .DS_Store on Mac)
            file_list = [
                f for f in os.listdir(base_path) if not re.match(r'^\.', f)
            ]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"],
                                          "dataactvalidator", "config",
                                          "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        group_zips(sess)
        hot_swap_zip_tables(sess)
        update_external_data_load_date(start_time, datetime.now(), 'zip_code')

        update_state_congr_table_current(sess)
        if CONFIG_BROKER["use_aws"]:
            census_file = s3_client.generate_presigned_url(
                'get_object', {
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': "census_congressional_districts.csv"
                },
                ExpiresIn=600)
        else:
            census_file = os.path.join(base_path,
                                       "census_congressional_districts.csv")
        update_state_congr_table_census(census_file, sess)
        if CONFIG_BROKER['use_aws']:
            export_state_congr_table(sess)
        update_external_data_load_date(start_time, datetime.now(),
                                       'congressional_district')

        logger.info("Zipcode script complete")