def load_location_data():
    """ Load city, county, state, and zip city domain data into the database. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        city_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                    'Key': "NationalFedCodes.txt"}, ExpiresIn=600)
        county_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                      'Key': "GOVT_UNITS.txt"}, ExpiresIn=600)
        state_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                     'Key': "state_list.txt"}, ExpiresIn=600)
        citystate_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                         'Key': "ctystate.txt"}, ExpiresIn=600)
        zip_city_file = urllib.request.urlopen(citystate_file)
    else:
        city_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "NationalFedCodes.txt")
        county_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "GOVT_UNITS.txt")
        state_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "state_list.txt")
        citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
        zip_city_file = open(citystate_file)

    with create_app().app_context():
        logger.info('Loading city data')
        load_city_data(city_file)
        logger.info('Loading county data')
        load_county_data(county_file)
        logger.info('Loading state data')
        load_state_data(state_file)
        logger.info('Loading zip city data')
        load_zip_city_data(zip_city_file)
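# The use_aws branch above (presigned S3 URL vs. local config path) repeats in
# nearly every loader in this collection. As an illustration only, that choice
# could be factored into one small helper; the name and layout below are
# hypothetical, not part of the broker code base, and assume the same imports
# as the surrounding examples (os, boto3, CONFIG_BROKER).
def broker_config_file(key):
    """ Return a readable location for a config file: presigned S3 URL or local path. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        return s3_client.generate_presigned_url(
            'get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'], 'Key': key}, ExpiresIn=600)
    return os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", key)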
Code example #2
def load_cgac(file_name):
    """Load CGAC (high-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {cgac.cgac_code: cgac for cgac in sess.query(CGAC)}

        # read CGAC values from csv
        data = pd.read_csv(file_name, dtype=str)
        # clean data
        data = clean_data(
            data,
            CGAC,
            {"cgac_agency_code": "cgac_code", "agency_name": "agency_name",
             "agency_abbreviation": "agency_abbreviation"},
            {"cgac_code": {"pad_to_length": 3}}
        )
        # de-dupe
        data.drop_duplicates(subset=['cgac_code'], inplace=True)

        delete_missing_cgacs(models, data)
        update_cgacs(models, data)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s CGAC records inserted', len(models))
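# A minimal, hypothetical invocation of load_cgac against the local copy of the
# agency list; code example #31 below drives this loader the same way from
# load_agency_data. Assumes os and CONFIG_BROKER are imported as in the
# surrounding examples.
agency_list_csv = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "agency_list.csv")
load_cgac(agency_list_csv)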
Code example #3
def setup_emails():
    """Create email templates from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session

        # insert email template types
        type_list = [
            ('review_submission', '')
        ]
        for t in type_list:
            email_id = sess.query(
                EmailTemplateType.email_template_type_id).filter(
                EmailTemplateType.name == t[0]).one_or_none()
            if not email_id:
                email_type = EmailTemplateType(name=t[0], description=t[1])
                sess.add(email_type)

        sess.commit()

        # insert email templates

        # Submission Review
        template = ("[REV_USER_NAME] has shared a DATA Act broker submission with you from [REV_AGENCY]. Click "
                    "<a href='[REV_URL]'>here</a> to review their submission. For questions or comments, please visit "
                    "the Service Desk at https://servicedesk.usaspending.gov/ or e-mail [email protected].")
        load_email_template(sess, "DATA Act Broker - Submission Ready for Review", template, "review_submission")
Code example #4
def load_frec(file_name):
    """Load FREC (high-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {frec.frec_code: frec for frec in sess.query(FREC)}

        # read FREC values from csv
        data = pd.read_csv(file_name, dtype=str)

        # clean data
        data = clean_data(
            data,
            FREC,
            {"frec": "frec_code", "cgac_agency_code": "cgac_code", "frec_entity_description": "agency_name",
             "agency_abbreviation": "agency_abbreviation"},
            {"frec": {"keep_null": False}, "cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4}}
        )
        # de-dupe
        data.drop_duplicates(subset=['frec_code'], inplace=True)
        # create foreign key dicts
        cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for
                     cgac in sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}

        # insert to db
        delete_missing_frecs(models, data)
        update_frecs(models, data, cgac_dict)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s FREC records inserted', len(models))
Code example #5
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            for key in s3bucket.list(prefix=zip_folder):
                if key.name != zip_folder:
                    zip_4_file_path = key.generate_url(expires_in=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3bucket.get_key("ctystate.txt").generate_url(expires_in=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"])
            # build the file list, ignoring hidden files (e.g. dotfiles on macOS)
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        logger.info("Zipcode script complete")
Code example #6
def load_county_data(county_file):
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete any data in the CountyCode table
        sess.query(CountyCode).delete()

        # parse the new county code data
        parse_county_file(county_file, sess)
Code example #7
def load_state_data(state_file):
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete any data in the States table
        sess.query(States).delete()

        # parse the new state data
        parse_state_file(state_file, sess)
Code example #8
def load_zip_city_data(zip_city_file):
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete any data in the ZipCity table
        sess.query(ZipCity).delete()

        # parse the new zip city data
        parse_zip_city_file(zip_city_file, sess)
def reset_alembic(alembic_version):
    """ Reset the alembic_version table to the given revision. """
    with create_app().app_context():
        db = GlobalDB.db()

        engine = db.engine
        sess = db.session
        metadata = MetaData(bind=engine)
        alembic_table = Table('alembic_version', metadata, autoload=True)
        u = update(alembic_table)
        u = u.values({"version_num": alembic_version})
        sess.execute(u)
        sess.commit()
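# Hypothetical usage: point the alembic_version bookkeeping table at a specific
# revision (the hash below is a placeholder, not a real migration ID). Note that
# this only rewrites version_num; it does not run any migrations.
reset_alembic('0123456789ab')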
Code example #10
def load_offices():
    """ Load FPDS Contracting Office file into broker database. """
    # read office file to dataframe, to make sure all is well with the file before firing up a db transaction
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        load_office = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                      'Key': "FPDSNG_Contracting_Offices.csv"},
                                                       ExpiresIn=600)
    else:
        load_office = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config",
                                   "FPDSNG_Contracting_Offices.csv")

    with create_app().app_context():
        update_offices(load_office)
Code example #11
def create_admin():
    """Create initial admin user."""
    logger.info('Creating admin user')
    admin_email = CONFIG_BROKER['admin_email']
    admin_pass = CONFIG_BROKER['admin_password']
    with create_app().app_context():
        sess = GlobalDB.db().session
        user = sess.query(User).filter(User.email == admin_email).one_or_none()
        if not user:
            # once the rest of the setup scripts are updated to use
            # GlobalDB instead of databaseSession, move the app_context
            # creation up to initialize()
            user = create_user_with_password(admin_email, admin_pass, Bcrypt(), website_admin=True)
    return user
Code example #12
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Create temporary table to do work in so we don't disrupt the site for too long by altering the actual table
        sess.execute('CREATE TABLE IF NOT EXISTS temp_zips (LIKE zips INCLUDING ALL);')
        # Truncating in case we didn't clear out this table after a failure in the script
        sess.execute('TRUNCATE TABLE temp_zips;')
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url('get_object',
                                                                       {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                        'Key': obj['Key']}, ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                             'Key': "ctystate.txt"}, ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)

            census_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                          'Key': "census_congressional_districts.csv"},
                                                           ExpiresIn=600)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", CONFIG_BROKER["zip_folder"])
            # build the file list, ignoring hidden files (e.g. dotfiles on macOS)
            file_list = [f for f in os.listdir(base_path) if not re.match(r'^\.', f)]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"], "dataactvalidator", "config", "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

            census_file = os.path.join(base_path, "census_congressional_districts.csv")

        hot_swap_zip_tables(sess)
        update_state_congr_table_current(sess)
        update_state_congr_table_census(census_file, sess)

        logger.info("Zipcode script complete")
Code example #13
    def load_fields(file_type_name, schema_file_name):
        """Load specified schema from a .csv."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # get file type object for specified fileTypeName
            file_type = sess.query(FileType).filter(FileType.name == file_type_name).one()

            # delete existing schema from database
            SchemaLoader.remove_columns_by_file_type(sess, file_type)

            # get allowable datatypes
            type_query = sess.query(FieldType.name, FieldType.field_type_id).all()
            types = {data_type.name: data_type.field_type_id for data_type in type_query}

            # add schema to database
            # 'rU' is redundant in Python 3 and was removed in Python 3.11
            with open(schema_file_name) as csvfile:
                reader = csv.DictReader(csvfile)
                file_column_count = 0
                for record in reader:
                    record = FieldCleaner.clean_record(record)

                    fields = ["fieldname", "required", "data_type"]
                    if all(field in record for field in fields):
                        SchemaLoader.add_column_by_file_type(
                            sess,
                            types,
                            file_type,
                            FieldCleaner.clean_string(record["fieldname"]),
                            FieldCleaner.clean_string(record["fieldname_short"]),
                            record["required"],
                            record["data_type"],
                            record["padded_flag"],
                            record["field_length"])
                        file_column_count += 1
                    else:
                        raise ValueError('CSV File does not follow schema')

                sess.commit()
                logger.info({
                    'message': '{} {} schema records added to {}'.format(file_column_count, file_type_name,
                                                                         FileColumn.__tablename__),
                    'message_type': 'ValidatorInfo',
                    'file_type': file_type.letter_name
                })
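# The schema .csv is expected to carry at least the columns referenced above:
# fieldname, fieldname_short, required, data_type, padded_flag, field_length.
# A hypothetical, minimal file and call, assuming load_fields is exposed on the
# SchemaLoader class referenced in its body and that 'appropriations' is a valid
# file type name (it appears in FILE_TYPE_DICT elsewhere in these examples):
import csv

with open('appropriations_schema.csv', 'w', newline='') as out:
    writer = csv.DictWriter(out, fieldnames=['fieldname', 'fieldname_short', 'required',
                                             'data_type', 'padded_flag', 'field_length'])
    writer.writeheader()
    # placeholder row, not a real DATA Act schema entry
    writer.writerow({'fieldname': 'ExampleFieldName_CPE', 'fieldname_short': 'example_field_name_cpe',
                     'required': 'true', 'data_type': 'decimal', 'padded_flag': 'false', 'field_length': ''})
SchemaLoader.load_fields('appropriations', 'appropriations_schema.csv')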
Code example #14
    def load_labels(cls, filename):
        """Load non-SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(ValidationLabel).delete()

            filename = os.path.join(cls.validation_labels_path, filename)

            # open csv
            # 'rU' is redundant in Python 3 and was removed in Python 3.11
            with open(filename) as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    validation_label = ValidationLabel(label=row['label'], error_message=row['error_message'],
                                                       column_name=row['column_name'], label_type=row['label_type'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["file_type"], row["rule_label"]))

                    validation_label.file_id = file_id

                    sess.merge(validation_label)
            sess.commit()
Code example #15
def load_offices(load_office=None):
    """Load TAS file into broker database. """
    # read office file to dataframe, to make sure all is well
    # with the file before firing up a db transaction
    if not load_office:
        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(
                CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            load_office = s3bucket.get_key(
                "FPDSNG_Contracting_Offices.csv").generate_url(expires_in=600)
        else:
            load_office = os.path.join(CONFIG_BROKER["path"],
                                       "dataactvalidator", "config",
                                       "FPDSNG_Contracting_Offices.csv")

    with create_app().app_context():
        update_offices(load_office)
Code example #16
def load_submission_window_schedule():
    """ Loads the submission window schedule data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        sub_schedule_file = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "submission_window_schedule.csv"
            },
            ExpiresIn=600)
    else:
        sub_schedule_file = os.path.join(CONFIG_BROKER['path'],
                                         'dataactvalidator', 'config',
                                         'submission_window_schedule.csv')

    logger.info('Loading submission window schedule data')
    with create_app().app_context():
        data = pd.read_csv(sub_schedule_file, dtype=str)

        data = clean_data(
            data, SubmissionWindowSchedule, {
                'year': 'year',
                'period': 'period',
                'period_start': 'period_start',
                'publish_deadline': 'publish_deadline',
                'certification_deadline': 'certification_deadline'
            }, {})

        # Add a day to the deadlines because the dates in the file are supposed to be inclusive
        data['publish_deadline'] = data.apply(
            lambda x: add_day(x, 'publish_deadline'), axis=1)
        data['certification_deadline'] = data.apply(
            lambda x: add_day(x, 'certification_deadline'), axis=1)

        sess = GlobalDB.db().session
        # delete any data in the SubmissionWindowSchedule table
        sess.query(SubmissionWindowSchedule).delete()

        # insert data into table
        num = insert_dataframe(data, SubmissionWindowSchedule.__table__.name,
                               sess.connection())
        logger.info(
            '{} records inserted to submission_window_schedule'.format(num))
        sess.commit()
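# add_day is not shown in this collection. Given the comment above (the dates in
# the file are inclusive), a minimal sketch of what it presumably does is to push
# the named deadline column forward by one day; the real helper may differ.
import pandas as pd

def add_day(row, col):
    # parse the deadline string from the CSV row and shift it one day forward
    return pd.to_datetime(row[col]) + pd.Timedelta(days=1)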
Code example #17
    def load_sql(cls, filename):
        """ Load SQL-based validation rules to db. """
        with create_app().app_context():
            sess = GlobalDB.db().session
            filename = os.path.join(cls.sql_rules_path, filename)

            # Initial load
            sql_data = pd.read_csv(filename, dtype=str, usecols=cls.headers)
            sql_data = clean_data(
                sql_data,
                RuleSql,
                {'rule_label': 'rule_label', 'rule_error_message': 'rule_error_message', 'query_name': 'query_name',
                 'expected_value': 'expected_value', 'category': 'category', 'file_type': 'file_type',
                 'target_file': 'target_file', 'rule_cross_file_flag': 'rule_cross_file_flag',
                 'severity_name': 'severity_name'},
                {}
            )

            # Processing certain values
            sql_data['rule_sql'] = sql_data['query_name'].apply(lambda name: cls.read_sql_str(name))
            sql_data['file_id'] = sql_data['file_type'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
            if sql_data['file_id'].isnull().values.any():
                raise Exception('Invalid file_type value found in sqlLoader. Must be one of the following: {}'
                                .format(', '.join(list(FILE_TYPE_DICT.keys()))))
            sql_data['target_file_id'] = sql_data['target_file'].apply(lambda type: FILE_TYPE_DICT.get(type, None))
            sql_data['rule_cross_file_flag'] = sql_data['rule_cross_file_flag'].apply(lambda flag:
                                                                                      flag in ('true', 't', 'y', 'yes'))
            sql_data['rule_severity_id'] = sql_data['severity_name'].apply(lambda severity_name:
                                                                           RULE_SEVERITY_DICT.get(severity_name, None))
            if sql_data['rule_severity_id'].isnull().values.any():
                raise Exception('Invalid severity_name value found in sqlLoader. Must be one of the following: {}'
                                .format(', '.join(list(RULE_SEVERITY_DICT.keys()))))
            sql_data.drop(['file_type', 'severity_name', 'target_file'], axis=1, inplace=True)

            # Final check if we need to actually reload
            if check_dataframe_diff(sql_data, RuleSql, del_cols=['rule_sql_id', 'created_at', 'updated_at'],
                                    sort_cols=['rule_label', 'file_id', 'target_file_id']):
                # Delete and reload all records currently in table
                logger.info('Detected changes in {}, deleting RuleSQL and reloading'.format(cls.sql_rules_path))
                sess.query(RuleSql).delete()
                insert_dataframe(sql_data, RuleSql.__table__.name, sess.connection())
                sess.commit()
            else:
                logger.info('No changes detected since last load. Skipping.')
Code example #18
def load_frec(file_name):
    """Load FREC (high-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {frec.frec_code: frec for frec in sess.query(FREC)}

        # read FREC values from csv
        data = pd.read_csv(file_name, dtype=str)

        # clean data
        data = clean_data(
            data, FREC, {
                "frec": "frec_code",
                "cgac_agency_code": "cgac_code",
                "frec_entity_description": "agency_name",
                "agency_abbreviation": "agency_abbreviation"
            }, {
                "frec": {
                    "keep_null": False
                },
                "cgac_code": {
                    "pad_to_length": 3
                },
                "frec_code": {
                    "pad_to_length": 4
                }
            })
        # de-dupe
        data.drop_duplicates(subset=['frec_code'], inplace=True)
        # create foreign key dicts
        cgac_dict = {
            str(cgac.cgac_code): cgac.cgac_id
            for cgac in sess.query(CGAC).filter(
                CGAC.cgac_code.in_(data["cgac_code"])).all()
        }

        # insert to db
        delete_missing_frecs(models, data)
        update_frecs(models, data, cgac_dict)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s FREC records inserted', len(models))
Code example #19
    def setUpClass(cls):
        """ Set up class-wide resources (test data) """
        super(SettingsTests, cls).setUpClass()
        # TODO: refactor into a pytest fixture

        with create_app().app_context():
            # get the submission test user
            sess = GlobalDB.db().session
            cls.session = sess

            cgac = CGAC(cgac_code='097')
            rule = RuleSql(rule_sql_id=1, rule_sql='', rule_label='FABS1', rule_error_message='', query_name='',
                           file_id=1, rule_severity_id=2, rule_cross_file_flag=False)
            sess.add_all([cgac, rule])
            sess.commit()
            default_setting = RuleSetting(agency_code='097', rule_label=rule.rule_label, file_id=rule.file_id,
                                          target_file_id=rule.target_file_id, priority=1, impact_id=1)
            sess.add(default_setting)
            sess.commit()
Code example #20
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            s3connection = boto.s3.connect_to_region(
                CONFIG_BROKER['aws_region'])
            s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            for key in s3bucket.list(prefix=zip_folder):
                if key.name != zip_folder:
                    zip_4_file_path = key.generate_url(expires_in=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path),
                                    sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3bucket.get_key("ctystate.txt").generate_url(
                expires_in=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator",
                                     "config", CONFIG_BROKER["zip_folder"])
            # build the file list, ignoring hidden files (e.g. dotfiles on macOS)
            file_list = [
                f for f in os.listdir(base_path) if not re.match(r'^\.', f)
            ]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"],
                                          "dataactvalidator", "config",
                                          "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

        update_state_congr_table(sess)

        logger.info("Zipcode script complete")
    def setUpClass(cls):
        """ Set up class-wide resources (test data) """
        super(DashboardTests, cls).setUpClass()
        # TODO: refactor into a pytest fixture

        with create_app().app_context():
            # get the submission test user
            sess = GlobalDB.db().session
            cls.session = sess
            submission_user = sess.query(User).filter(User.email == cls.test_users['admin_user']).one()
            cls.submission_user_id = submission_user.user_id

            other_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
            cls.other_user_id = other_user.user_id

            no_submissions_user = sess.query(User).filter(User.email == cls.test_users['no_permissions_user']).one()
            cls.no_submissions_user_email = no_submissions_user.email
            cls.no_submissions_user_id = no_submissions_user.user_id

            cls.quarter_sub = insert_submission(cls.session, cls.submission_user_id, cgac_code='SYS',
                                                start_date='01/2017', end_date='03/2017', is_quarter=True)
Code example #22
def load_country_codes(filename):
    """Load country code lookup table."""
    model = CountryCode

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for object class, delete and replace values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(data, model, {
            "country_code": "country_code",
            "country_name": "country_name"
        }, {})
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
Code example #23
    def setUpClass(cls):
        """Set up resources to be shared within a test class"""
        # TODO: refactor into pytest class fixtures and inject as necessary
        # update application's db config options so unittests
        # run against test databases
        configure_logging()
        suite = cls.__name__.lower()
        config = dataactcore.config.CONFIG_DB
        cls.num = randint(1, 9999)
        config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
        dataactcore.config.CONFIG_DB = config
        create_database(CONFIG_DB['db_name'])
        run_migrations()

        app = create_app()
        app.config['TESTING'] = True
        app.config['DEBUG'] = False
        cls.app = TestApp(app)

        # Allow us to augment default test failure msg w/ more detail
        cls.longMessage = True
        # Upload files to S3 (False = skip re-uploading on subsequent runs)
        cls.uploadFiles = True
        # Run tests for local broker or not
        cls.local = CONFIG_BROKER['local']
        # This needs to be set to the local directory for error reports if local is True
        cls.local_file_directory = CONFIG_SERVICES['error_report_path']

        # drop and re-create test job db/tables
        setup_job_tracker_db()
        # drop and re-create test error db/tables
        setup_error_db()
        # drop and re-create test validation db
        setup_validation_db()

        cls.userId = None
        # constants to use for default submission start and end dates
        cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
        cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
    def setUpClass(cls):
        """Set up resources to be shared within a test class"""
        # TODO: refactor into pytest class fixtures and inject as necessary
        # update application's db config options so unittests
        # run against test databases
        suite = cls.__name__.lower()
        config = dataactcore.config.CONFIG_DB
        cls.num = randint(1, 9999)
        config['db_name'] = 'unittest{}_{}_data_broker'.format(cls.num, suite)
        dataactcore.config.CONFIG_DB = config
        create_database(CONFIG_DB['db_name'])
        run_migrations()

        app = create_app()
        app.config['TESTING'] = True
        app.config['DEBUG'] = False
        cls.app = TestApp(app)

        # Allow us to augment default test failure msg w/ more detail
        cls.longMessage = True
        # Upload files to S3 (False = skip re-uploading on subsequent runs)
        cls.uploadFiles = True
        # Run tests for local broker or not
        cls.local = CONFIG_BROKER['local']
        # This needs to be set to the local directory for error reports if local is True
        cls.local_file_directory = CONFIG_SERVICES['error_report_path']

        # drop and re-create test job db/tables
        setup_job_tracker_db()
        # drop and re-create test error db/tables
        setup_error_db()
        # drop and re-create test validation db
        setup_validation_db()

        cls.userId = None
        # constants to use for default submission start and end dates
        cls.SUBMISSION_START_DEFAULT = datetime(2015, 10, 1)
        cls.SUBMISSION_END_DEFAULT = datetime(2015, 10, 31)
    def setUpClass(cls):
        """Set up class-wide resources (test data)"""
        super(GenerationTests, cls).setUpClass()
        # TODO: refactor into a pytest fixture

        with create_app().app_context():
            # get the submission test user
            sess = GlobalDB.db().session
            submission_user = sess.query(User).filter(User.email == cls.test_users['admin_user']).one()
            cls.submission_user_id = submission_user.user_id

            other_user = sess.query(User).filter(User.email == cls.test_users['agency_user']).one()
            cls.other_user_email = other_user.email
            cls.other_user_id = other_user.user_id

            # setup submission/jobs data for test_check_status
            cls.generation_submission_id = insert_submission(sess, cls.submission_user_id, cgac_code="SYS",
                                                             start_date="07/2015", end_date="09/2015", is_quarter=True)
            cls.setup_file_generation_submission(sess)

            cls.test_fabs_submission_id = insert_submission(sess, cls.submission_user_id, cgac_code="SYS",
                                                            start_date="10/2015", end_date="12/2015", is_quarter=False,
                                                            number_of_errors=0, is_fabs=True)
def load_quarterly_threshold():
    """ Loads the quarterly revalidation threshold data. """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        threshold_file = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "quarterly_submission_starts.csv"
            },
            ExpiresIn=600)
    else:
        threshold_file = os.path.join(CONFIG_BROKER["path"],
                                      "dataactvalidator", "config",
                                      "quarterly_submission_starts.csv")

    logger.info('Loading quarterly revalidation threshold data')
    with create_app().app_context():
        data = pd.read_csv(threshold_file, dtype=str)

        data = clean_data(data, QuarterlyRevalidationThreshold, {
            "year": "year",
            "quarter": "quarter",
            "window_start": "window_start"
        }, {})

        sess = GlobalDB.db().session
        # delete any data in the QuarterlyRevalidationThreshold table
        sess.query(QuarterlyRevalidationThreshold).delete()

        # insert data into table
        num = insert_dataframe(data,
                               QuarterlyRevalidationThreshold.__table__.name,
                               sess.connection())
        logger.info(
            '{} records inserted to quarterly_revalidation_threshold'.format(
                num))
        sess.commit()
Code example #28
def load_sub_tier_agencies(file_name):
    """Load Sub Tier Agency (sub_tier-level agency names) lookup table."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        models = {sub_tier_agency.sub_tier_agency_code: sub_tier_agency for
                  sub_tier_agency in sess.query(SubTierAgency)}

        # read Sub Tier Agency values from csv
        data = pd.read_csv(file_name, dtype=str)

        condition = data["FPDS DEPARTMENT ID"] == data["SUBTIER CODE"]
        data.loc[condition, "PRIORITY"] = 1
        data.loc[~condition, "PRIORITY"] = 2

        # clean data
        data = clean_data(
            data,
            SubTierAgency,
            {"cgac_agency_code": "cgac_code", "subtier_code": "sub_tier_agency_code", "priority": "priority",
             "frec": "frec_code", "subtier_name": "sub_tier_agency_name", "is_frec": "is_frec"},
            {"cgac_code": {"pad_to_length": 3}, "frec_code": {"pad_to_length": 4},
             "sub_tier_agency_code": {"pad_to_length": 4}}
        )
        # de-dupe
        data.drop_duplicates(subset=['sub_tier_agency_code'], inplace=True)
        # create foreign key dicts
        cgac_dict = {str(cgac.cgac_code): cgac.cgac_id for
                     cgac in sess.query(CGAC).filter(CGAC.cgac_code.in_(data["cgac_code"])).all()}
        frec_dict = {str(frec.frec_code): frec.frec_id for
                     frec in sess.query(FREC).filter(FREC.frec_code.in_(data["frec_code"])).all()}

        delete_missing_sub_tier_agencies(models, data)
        update_sub_tier_agencies(models, data, cgac_dict, frec_dict)
        sess.add_all(models.values())
        sess.commit()

        logger.info('%s Sub Tier Agency records inserted', len(models))
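# The PRIORITY assignment above marks a subtier row as priority 1 when it is the
# department-level record (FPDS DEPARTMENT ID equals SUBTIER CODE) and priority 2
# otherwise. A toy illustration with made-up codes:
import pandas as pd

toy = pd.DataFrame({"FPDS DEPARTMENT ID": ["1200", "1200"],
                    "SUBTIER CODE": ["1200", "1205"]})
condition = toy["FPDS DEPARTMENT ID"] == toy["SUBTIER CODE"]
toy.loc[condition, "PRIORITY"] = 1
toy.loc[~condition, "PRIORITY"] = 2
# toy["PRIORITY"] is now [1.0, 2.0]: the department-level row outranks the subtier row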
Code example #29
def load_object_class(base_path):
    """ This function loads Object classes into the database

        Args:
            base_path: directory that contains the domain values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3connection = boto.s3.connect_to_region(CONFIG_BROKER['aws_region'])
        s3bucket = s3connection.lookup(CONFIG_BROKER['sf_133_bucket'])
        filename = s3bucket.get_key("object_class.csv").generate_url(
            expires_in=600)
    else:
        filename = os.path.join(base_path, "object_class.csv")

    # Load object class lookup table
    logger.info('Loading Object Class File: object_class.csv')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(ObjectClass).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data, ObjectClass, {
                "max_oc_code": "object_class_code",
                "max_object_class_name": "object_class_name"
            }, {"object_class_code": {
                "pad_to_length": 3
            }})
        # de-dupe
        data.drop_duplicates(subset=['object_class_code'], inplace=True)
        # insert to db
        table_name = ObjectClass.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
Code example #30
def load_country_codes(base_path):
    """ Load Country Codes into the database.

        Args:
            base_path: directory that contains the domain values files.
    """

    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                   'Key': "country_codes.csv"}, ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "country_codes.csv")

    logger.info('Loading country codes file: country_codes.csv')

    with create_app().app_context():
        sess = GlobalDB.db().session
        # for object class, delete and replace values
        sess.query(CountryCode).delete()

        data = pd.read_csv(filename, dtype=str)
        data = clean_data(
            data,
            CountryCode,
            {"country_code": "country_code", "country_name": "country_name"},
            {}
        )
        # de-dupe
        data.drop_duplicates(subset=['country_code'], inplace=True)
        # insert to db
        table_name = CountryCode.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
Code example #31
def load_agency_data(base_path):
    """ Load agency data into the database

        Args:
            base_path: directory that contains the agency files
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        agency_list_file = s3_client.generate_presigned_url('get_object', {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                           'Key': "agency_list.csv"}, ExpiresIn=600)
        cascading_agency_list_file = s3_client.generate_presigned_url('get_object',
                                                                      {'Bucket': CONFIG_BROKER['sf_133_bucket'],
                                                                       'Key': "agency_codes_list.csv"}, ExpiresIn=600)
    else:
        agency_list_file = os.path.join(base_path, "agency_list.csv")
        cascading_agency_list_file = os.path.join(base_path, "agency_codes_list.csv")

    with create_app().app_context():
        logger.info('Loading CGAC')
        load_cgac(agency_list_file)
        logger.info('Loading FREC')
        load_frec(cascading_agency_list_file)
        logger.info('Loading Sub Tier Agencies')
        load_sub_tier_agencies(cascading_agency_list_file)
Code example #32
    def load_sql(cls, filename):
        """Load SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(RuleSql).delete()

            filename = os.path.join(cls.sql_rules_path, filename)

            # open csv
            # 'rU' is redundant in Python 3 and was removed in Python 3.11
            with open(filename) as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join([
                        "Found unexpected fields: ",
                        str(list(unknown_fields))
                    ]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join([
                        "Missing required fields: ",
                        str(list(missing_fields))
                    ]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    sql = cls.read_sql_str(row['query_name'])

                    rule_sql = RuleSql(
                        rule_sql=sql,
                        rule_label=row['rule_label'],
                        rule_error_message=row['rule_error_message'],
                        query_name=row['query_name'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception(
                            "{}: file type={}, rule label={}. Rule not loaded."
                            .format(e, row["file_type"], row["rule_label"]))
                    try:
                        if row["target_file"].strip() == "":
                            # No target file provided
                            target_file_id = None
                        else:
                            target_file_id = FILE_TYPE_DICT[row["target_file"]]
                    except Exception as e:
                        raise Exception(
                            "{}: file type={}, rule label={}. Rule not loaded."
                            .format(e, row["target_file"], row["rule_label"]))

                    # set cross file flag
                    flag = FieldCleaner.clean_string(
                        row["rule_cross_file_flag"])
                    if flag in ('true', 't', 'y', 'yes'):
                        cross_file_flag = True
                    else:
                        cross_file_flag = False

                    rule_sql.rule_severity_id = RULE_SEVERITY_DICT[
                        row['severity_name']]
                    rule_sql.file_id = file_id
                    rule_sql.target_file_id = target_file_id
                    rule_sql.rule_cross_file_flag = cross_file_flag

                    sess.merge(rule_sql)
            sess.commit()
        'award_procurement': {
            'staging_table': AwardProcurement,
            'certified_table': CertifiedAwardProcurement,
            'staging_id': 'award_procurement_id',
            'certified_id': 'certified_award_procurement_id',
            'file_type_id': FILE_TYPE_DICT['award_procurement']
        },
        'award_financial_assistance': {
            'staging_table': AwardFinancialAssistance,
            'certified_table': CertifiedAwardFinancialAssistance,
            'staging_id': 'award_financial_assistance_id',
            'certified_id': 'certified_award_financial_assistance_id',
            'file_type_id': FILE_TYPE_DICT['award']
        }
    }

    for award_type, award_dict in aw_data_map.items():
        copy_certified_submission_award_data(award_dict['staging_table'],
                                             award_dict['certified_table'],
                                             award_dict['staging_id'])
        load_updated_award_data(
            award_dict['staging_table'], award_dict['certified_table'],
            award_dict['file_type_id'],
            shared_internal_cols + [award_dict['certified_id']])


if __name__ == '__main__':
    configure_logging()
    with create_app().app_context():
        main()
Code example #34
def uncache_file_requests():
    logger.info('Un-caching file generation requests')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(FileRequest).update({"is_cached_file": False}, synchronize_session=False)
        sess.commit()
Code example #35
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False, table=DUNS, year=None):
    """ Takes in a SAM file and adds the DUNS data to the database

        Args:
            file_path: the path to the SAM file
            sess: the database connection
            monthly: whether it's a monthly file
            benchmarks: whether to log times
            table: the table to work from (could be DUNS/HistoricParentDuns)
            year: the year associated with the data (primarily for HistoricParentDuns loads)
    """
    parse_start_time = time.time()
    logger.info("Starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "registration_date": 6,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10,
            "dba_name": 11,
            "address_line_1": 14,
            "address_line_2": 15,
            "city": 16,
            "state": 17,
            "zip": 18,
            "zip4": 19,
            "country_code": 20,
            "congressional_district": 21,
            "entity_structure": 27,
            "business_types_raw": 31,
            "ultimate_parent_legal_enti": 186,
            "ultimate_parent_unique_ide": 187
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = (nrows-1)//block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = ((nrows % block_size) or block_size)-skiplastrows
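        # Worked example of the batch arithmetic, assuming the .dat carries one
        # header line and one trailer line (hence the extra row dropped from the
        # last batch, or two rows when a single batch is both first and last):
        # a file of 25,001 lines (header + 24,999 records + trailer) gives
        # batches = 2; batch 0 reads lines 2-10,000, batch 1 lines 10,001-20,000,
        # and the final batch the remaining 5,000 records, leaving the trailer out.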
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch*block_size)
            nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size
            logger.info('Loading rows %s to %s', skiprows+1, nrows+skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys(), quoting=3)

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # convert business types string to array
                    bt_func = (lambda bt_raw: pd.Series([[str(code) for code in str(bt_raw).split('~')
                                                          if isinstance(bt_raw, str)]]))
                    csv_data = csv_data.assign(business_types_codes=csv_data["business_types_raw"].apply(bt_func))
                    del csv_data["business_types_raw"]
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None), table=table)

                    if monthly:
                        logger.info("Adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        if year:
                            csv_data['year'] = year
                        insert_dataframe(csv_data, table.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("Attempting to bulk load add data")
                                insert_dataframe(add_data, table.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("Bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, sess, benchmarks=benchmarks)
                                logger.info("Loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks,
                                                 table=table)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, sess,
                                                                           benchmarks=benchmarks)
                            logger.info("Loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks,
                                             table=table)
                    sess.commit()

            added_rows += nrows
            batch += 1
            logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time,
                                                                         added_rows))
Code example #36
    def setUpClass(cls):
        """Set up class-wide resources (test data)"""
        super(ListSubmissionTests, cls).setUpClass()
        # TODO: refactor into a pytest fixture

        with create_app().app_context():
            # get an admin and non-admin user
            sess = GlobalDB.db().session
            cls.session = sess
            admin_user = sess.query(User).filter(
                User.email == cls.test_users['admin_user']).one()
            cls.admin_user_id = admin_user.user_id

            other_user = sess.query(User).filter(
                User.email == cls.test_users['agency_user']).one()
            cls.other_user_id = other_user.user_id

            # set up submissions for dabs
            cls.non_admin_dabs_sub_id = insert_submission(
                sess,
                cls.other_user_id,
                cgac_code="SYS",
                start_date="10/2015",
                end_date="12/2015",
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/01/2010')

            cls.admin_dabs_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code="000",
                start_date="10/2015",
                end_date="12/2015",
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/01/2012')

            cls.certified_dabs_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code="SYS",
                start_date="10/2015",
                end_date="12/2015",
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['published'])

            # Add a couple jobs for dabs files
            insert_job(sess,
                       FILE_TYPE_DICT['appropriations'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.non_admin_dabs_sub_id,
                       filename='/path/to/test/file_1.csv',
                       file_size=123,
                       num_rows=3)
            insert_job(sess,
                       FILE_TYPE_DICT['award'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.non_admin_dabs_sub_id,
                       filename='/path/to/test/file_2.csv',
                       file_size=123,
                       num_rows=3)

            insert_job(sess,
                       FILE_TYPE_DICT['award'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.certified_dabs_sub_id,
                       filename='/path/to/test/file_part_2.csv',
                       file_size=123,
                       num_rows=3)

            # set up submissions for fabs
            cls.non_admin_fabs_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code="SYS",
                start_date="10/2015",
                end_date="12/2015",
                is_fabs=True,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'])

            cls.admin_fabs_sub_id = insert_submission(
                sess,
                cls.other_user_id,
                cgac_code="000",
                start_date="10/2015",
                end_date="12/2015",
                is_fabs=True,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'])

            cls.published_fabs_sub_id = insert_submission(
                sess,
                cls.other_user_id,
                cgac_code="000",
                start_date="10/2015",
                end_date="12/2015",
                is_fabs=True,
                publish_status_id=PUBLISH_STATUS_DICT['published'])

            # Add a job for a FABS submission
            insert_job(sess,
                       FILE_TYPE_DICT['fabs'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.admin_fabs_sub_id,
                       filename=str(cls.admin_fabs_sub_id) + '/test_file.csv',
                       file_size=123,
                       num_rows=3)
コード例 #37
0
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False):
    parse_start_time = time.time()
    logger.info("starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = nrows//block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = (nrows % block_size)-skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch*block_size)
            nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size
            logger.info('loading rows %s to %s', skiprows+1, nrows+skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys())

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None))

                    if monthly:
                        logger.info("adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        insert_dataframe(csv_data, DUNS.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("attempting to bulk load add data")
                                insert_dataframe(add_data, DUNS.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, benchmarks=benchmarks)
                                logger.info("loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, benchmarks=benchmarks)
                            logger.info("loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks)
                    sess.commit()

            added_rows += nrows
            batch += 1
            logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time,
                                                                         added_rows))
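
# A standalone sketch of the batch arithmetic above (illustration only, not part of the
# loader): with block_size = 10000 and a 25,003-line .dat file, the loop skips the header
# on the first pass and never reads the final physical line, which suggests the extracts
# end with a trailer record (an assumption; this excerpt does not show the file layout).
nrows = 25003                       # total physical lines in the hypothetical .dat file
block_size = 10000
batches = nrows // block_size       # 2 full blocks plus a short final batch
skiplastrows = 2 if batches == 0 else 1
last_block_size = (nrows % block_size) - skiplastrows

batch, added_rows = 0, 0
while batch <= batches:
    skiprows = 1 if batch == 0 else batch * block_size
    chunk = (((batch + 1) * block_size) - skiprows) if batch < batches else last_block_size
    print('batch {}: skiprows={}, nrows={}'.format(batch, skiprows, chunk))
    # batch 0: skiprows=1,     nrows=9999
    # batch 1: skiprows=10000, nrows=10000
    # batch 2: skiprows=20000, nrows=5002
    added_rows += chunk
    batch += 1
print(added_rows)                   # 25001 data rows read in total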
コード例 #38
0
def read_zips():
    with create_app().app_context():
        sess = GlobalDB.db().session

        # delete old values in case something changed and one is now invalid
        sess.query(Zips).delete(synchronize_session=False)
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3',
                                     region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(
                Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url(
                        'get_object', {
                            'Bucket': CONFIG_BROKER['sf_133_bucket'],
                            'Key': obj['Key']
                        },
                        ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path),
                                    sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url(
                'get_object', {
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': "ctystate.txt"
                },
                ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)

            census_file = s3_client.generate_presigned_url(
                'get_object', {
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': "census_congressional_districts.csv"
                },
                ExpiresIn=600)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator",
                                     "config", CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [
                f for f in os.listdir(base_path) if not re.match(r'^\.', f)
            ]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"],
                                          "dataactvalidator", "config",
                                          "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

            census_file = os.path.join(base_path,
                                       "census_congressional_districts.csv")

        update_state_congr_table_current(sess)
        update_state_congr_table_census(census_file, sess)

        logger.info("Zipcode script complete")
コード例 #39
0
def load_sf133(filename,
               fiscal_year,
               fiscal_period,
               force_sf133_load=False,
               metrics=None):
    """ Load SF 133 (budget execution report) lookup table.

        Args:
            filename: name/path of the file to read in
            fiscal_year: fiscal year of the file being loaded
            fiscal_period: fiscal period of the file being loaded
            force_sf133_load: boolean to indicate whether to force a reload of the data
            metrics: an object containing information for the metrics file
    """
    if not metrics:
        metrics = {}
    with create_app().app_context():
        sess = GlobalDB.db().session

        existing_records = sess.query(SF133).filter(
            SF133.fiscal_year == fiscal_year, SF133.period == fiscal_period)
        if force_sf133_load:
            # force a reload of this period's current data
            logger.info(
                'Force SF 133 load: deleting existing records for %s %s',
                fiscal_year, fiscal_period)
            delete_count = existing_records.delete()
            logger.info('%s records deleted', delete_count)
            metrics['records_deleted'] += delete_count
        elif existing_records.count():
            # if there's existing data & we're not forcing a load, skip
            logger.info(
                'SF133 %s %s already in database (%s records). Skipping file.',
                fiscal_year, fiscal_period, existing_records.count())
            return

        data = clean_sf133_data(filename, SF133)

        # Now that we've added zero lines for EVERY tas and SF 133 line number, get rid of the ones we don't actually
        # use in the validations. Arguably, it would be better just to include everything, but that drastically
        # increases the number of records we're inserting to the sf_133 table. If we ever decide that we need *all*
        # SF 133 lines that are zero value, remove the next two lines.
        sf_133_validation_lines = [
            '1000', '1010', '1011', '1012', '1013', '1020', '1021', '1022',
            '1023', '1024', '1025', '1026', '1029', '1030', '1031', '1032',
            '1033', '1040', '1041', '1042', '1160', '1180', '1260', '1280',
            '1340', '1440', '1540', '1640', '1750', '1850', '1910', '2190',
            '2490', '2500', '3020', '4801', '4802', '4881', '4882', '4901',
            '4902', '4908', '4981', '4982'
        ]
        data = data[(data.line.isin(sf_133_validation_lines)) |
                    (data.amount != 0)]

        # we didn't use the 'keep_null' option when padding allocation transfer agency, because nulls in that column
        # break the pivot (see above comments). So, replace the ATA '000' with an empty value before inserting to db
        data['allocation_transfer_agency'] = data[
            'allocation_transfer_agency'].str.replace('000', '')
        # make a pass through the dataframe, changing any empty values to None, to ensure that those are represented as
        # NULL in the db.
        data = data.applymap(lambda x: str(x).strip()
                             if len(str(x).strip()) else None)

        # Keeping display_tas out here as it depends on empty allocation_transfer_agency being None and not 000
        data['display_tas'] = data.apply(
            lambda row: concat_display_tas_dict(row), axis=1)

        # insert to db
        table_name = SF133.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        metrics['records_inserted'] += num
        update_tas_id(int(fiscal_year), int(fiscal_period))
        sess.commit()

    logger.info('%s records inserted to %s', num, table_name)
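
# A minimal illustration (toy values, abbreviated line list) of the zero-line filter used
# above: zero-amount rows survive only when their SF 133 line number is on the validation
# list, while non-zero rows are always kept.
import pandas as pd

sf_133_validation_lines = ['1000', '2190']   # abbreviated for the example
data = pd.DataFrame({'line': ['1000', '1035', '2190', '1035'],
                     'amount': [0, 0, 5, 7]})
data = data[(data.line.isin(sf_133_validation_lines)) | (data.amount != 0)]
print(data)   # the ('1035', 0) row is dropped; the other three rows remain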
コード例 #40
0
    def setUpClass(cls):
        """Set up class-wide resources (test data)"""
        super(ListLatestPublishedFileTests, cls).setUpClass()
        # TODO: refactor into a pytest fixture

        with create_app().app_context():
            # get the submission test user
            sess = GlobalDB.db().session
            cls.session = sess

            other_user = sess.query(User).filter(
                User.email == cls.test_users['agency_user']).one()
            cls.other_user_email = other_user.email
            cls.other_user_id = other_user.user_id
            cls.submission_user_id = other_user.user_id

            # ======= Reference ======
            cgac = CGAC(cgac_id=11, cgac_code='111', agency_name='CGAC 1')
            frec = FREC(frec_id=12,
                        cgac_id=11,
                        frec_code='2222',
                        agency_name='FREC 2')
            cgac2 = CGAC(cgac_id=13, cgac_code='333', agency_name='CGAC 3')
            sess.add_all([cgac, frec, cgac2])
            sess.commit()

            year = 2020
            period = 6
            diff_year = 2021
            diff_period = 7

            # ======= DABS =======
            cls.dabs_sub_unpub = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac2.cgac_code,
                reporting_fiscal_year=1999,
                reporting_fisacal_period=2,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                is_fabs=False)
            cls.dabs_sub_pub_twice = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=year,
                reporting_fisacal_period=period,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=False)
            cls.setup_published_submission(sess,
                                           cls.dabs_sub_pub_twice,
                                           date='01/01/2020',
                                           is_fabs=False)
            cls.setup_published_submission(sess,
                                           cls.dabs_sub_pub_twice,
                                           date='01/02/2020',
                                           is_fabs=False)

            cls.dabs_sub_pub_diff_agency = insert_submission(
                sess,
                cls.submission_user_id,
                frec_code=frec.frec_code,
                reporting_fiscal_year=year,
                reporting_fisacal_period=period,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=False)
            cls.setup_published_submission(sess,
                                           cls.dabs_sub_pub_diff_agency,
                                           is_fabs=False)

            cls.dabs_sub_pub_diff_year = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=diff_year,
                reporting_fisacal_period=period,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=False)
            cls.setup_published_submission(sess,
                                           cls.dabs_sub_pub_diff_year,
                                           is_fabs=False)

            cls.dabs_sub_pub_diff_period = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=year,
                reporting_fisacal_period=diff_period,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=False)
            cls.setup_published_submission(sess,
                                           cls.dabs_sub_pub_diff_period,
                                           is_fabs=False)

            # ======= FABS =======
            cls.fabs_sub_unpub = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code='333',
                reporting_fiscal_year=None,
                reporting_fisacal_period=None,
                publish_status_id=1,
                is_fabs=True)

            cls.fabs_sub_pub = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=None,
                reporting_fisacal_period=None,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=True)
            cls.setup_published_submission(sess,
                                           cls.fabs_sub_pub,
                                           date='10/01/2000',
                                           is_fabs=True)
            cls.fabs_sub_pub_2 = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=None,
                reporting_fisacal_period=None,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=True)
            cls.setup_published_submission(sess,
                                           cls.fabs_sub_pub_2,
                                           date='10/02/2000',
                                           is_fabs=True)

            cls.fabs_sub_pub_diff_agency = insert_submission(
                sess,
                cls.submission_user_id,
                frec_code=frec.frec_code,
                reporting_fiscal_year=None,
                reporting_fisacal_period=None,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=True)
            cls.setup_published_submission(sess,
                                           cls.fabs_sub_pub_diff_agency,
                                           date='10/01/2000',
                                           is_fabs=True)

            cls.fabs_sub_pub_diff_year = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=None,
                reporting_fisacal_period=None,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=True)
            cls.setup_published_submission(sess,
                                           cls.fabs_sub_pub_diff_year,
                                           date='10/01/2001',
                                           is_fabs=True)

            cls.fabs_sub_pub_diff_period = insert_submission(
                sess,
                cls.submission_user_id,
                cgac_code=cgac.cgac_code,
                reporting_fiscal_year=None,
                reporting_fisacal_period=None,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                is_fabs=True)
            cls.setup_published_submission(sess,
                                           cls.fabs_sub_pub_diff_period,
                                           date='01/01/2001',
                                           is_fabs=True)
コード例 #41
0
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
            load_local: boolean indicating whether to load from a local file or not
            local_file_name: the name of the file if loading locally
    """
    local_now = datetime.now()
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    model = CFDAProgram

    metrics_json = {
        'script_name': 'load_cfda_data.py',
        'start_time': str(local_now),
        'new_records': 0
    }

    def fix_program_number(row, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(row['program_number'] * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        table_name = model.__table__.name
        # Check if there is new data to load
        new_data = check_dataframe_diff(import_data, model, ['cfda_program_id'], ['program_number'],
                                        lambda_funcs=[('program_number', fix_program_number)])
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()

            # If we've updated the data at all, update the external data load date
            update_external_data_load_date(local_now, datetime.now(), 'cfda')
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['new_records'] = num
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)

    metrics_json['duration'] = str(datetime.now() - local_now)

    with open('load_cfda_data_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)
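
# The fix_program_number helper above exists to undo float noise picked up when program
# numbers round-trip through the database. A quick standalone check of the same rounding
# and zero-padding logic, applied to a bare float instead of a dataframe row:
import math

def fix_program_number_demo(value, decimals=3):
    multiplier = 10 ** decimals
    value = math.floor(value * multiplier + 0.5) / multiplier
    return str(value).ljust(6, '0')

print(fix_program_number_demo(10.000999999999999))   # '10.001'
print(fix_program_number_demo(93.6))                 # '93.600'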
コード例 #42
0
    def setUpClass(cls):
        """Set up class-wide resources (test data)"""
        super(ListSubmissionTests, cls).setUpClass()
        # TODO: refactor into a pytest fixture

        with create_app().app_context():
            # get an admin and non-admin user
            sess = GlobalDB.db().session
            cls.session = sess
            admin_user = sess.query(User).filter(
                User.email == cls.test_users['admin_user']).one()
            cls.admin_user_id = admin_user.user_id

            other_user = sess.query(User).filter(
                User.email == cls.test_users['agency_user']).one()
            cls.other_user_id = other_user.user_id

            # set up submissions for dabs
            cls.non_admin_dabs_sub_id = insert_submission(
                sess,
                cls.other_user_id,
                cgac_code='SYS',
                start_date='10/2015',
                end_date='12/2015',
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/01/2010')

            cls.admin_dabs_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code='000',
                start_date='10/2015',
                end_date='12/2015',
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/01/2012')

            cls.test_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code='SYS',
                start_date='10/2015',
                end_date='12/2015',
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/02/2012',
                test_submission=True)

            # This is the min date, but the date everything should be using is the one in the job (MAX_UPDATED_AT)
            cls.certified_dabs_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code='SYS',
                start_date='10/2015',
                end_date='12/2015',
                is_quarter=True,
                is_fabs=False,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                updated_at='01/01/2000')

            # Add a couple jobs for dabs files, make sure the updated at is the same as or earlier than the one on
            # the submission itself
            insert_job(sess,
                       FILE_TYPE_DICT['appropriations'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.non_admin_dabs_sub_id,
                       filename='/path/to/test/file_1.csv',
                       file_size=123,
                       num_rows=3,
                       updated_at='01/01/2009')
            insert_job(sess,
                       FILE_TYPE_DICT['award'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.non_admin_dabs_sub_id,
                       filename='/path/to/test/file_2.csv',
                       file_size=123,
                       num_rows=3,
                       updated_at='01/01/2009')

            # Min updated at date
            insert_job(sess,
                       FILE_TYPE_DICT['award'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.certified_dabs_sub_id,
                       filename='/path/to/test/file_part_2.csv',
                       file_size=123,
                       num_rows=3,
                       updated_at=cls.MAX_UPDATED_AT)

            # set up submissions for fabs
            cls.non_admin_fabs_sub_id = insert_submission(
                sess,
                cls.admin_user_id,
                cgac_code='SYS',
                start_date='10/2015',
                end_date='12/2015',
                is_fabs=True,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/01/2016')

            # This is the min date, but the date everything should be using is the one in the job (MAX_UPDATED_AT)
            cls.admin_fabs_sub_id = insert_submission(
                sess,
                cls.other_user_id,
                cgac_code='000',
                start_date='10/2015',
                end_date='12/2015',
                is_fabs=True,
                publish_status_id=PUBLISH_STATUS_DICT['unpublished'],
                updated_at='01/01/2000')

            cls.published_fabs_sub_id = insert_submission(
                sess,
                cls.other_user_id,
                cgac_code='000',
                start_date='10/2015',
                end_date='12/2015',
                is_fabs=True,
                publish_status_id=PUBLISH_STATUS_DICT['published'],
                updated_at='01/02/2000')

            # Add a job for a FABS submission
            insert_job(sess,
                       FILE_TYPE_DICT['fabs'],
                       FILE_STATUS_DICT['complete'],
                       JOB_TYPE_DICT['file_upload'],
                       cls.admin_fabs_sub_id,
                       filename=str(cls.admin_fabs_sub_id) + '/test_file.csv',
                       file_size=123,
                       num_rows=3,
                       updated_at=cls.MAX_UPDATED_AT)
コード例 #43
0
def parse_sam_file(file_path, sess, monthly=False, benchmarks=False):
    parse_start_time = time.time()
    logger.info("Starting file " + str(file_path))

    dat_file_name = os.path.splitext(os.path.basename(file_path))[0]+'.dat'
    sam_file_type = "MONTHLY" if monthly else "DAILY"
    dat_file_date = re.findall(".*{}_(.*).dat".format(sam_file_type), dat_file_name)[0]

    with create_app().app_context():

        column_header_mapping = {
            "awardee_or_recipient_uniqu": 0,
            "sam_extract_code": 4,
            "registration_date": 6,
            "expiration_date": 7,
            "last_sam_mod_date": 8,
            "activation_date": 9,
            "legal_business_name": 10
        }
        column_header_mapping_ordered = OrderedDict(sorted(column_header_mapping.items(), key=lambda c: c[1]))

        # Initial sweep of the file to see rows and possibly what DUNS we're updating
        if benchmarks:
            initial_sweep = time.time()
        nrows = 0
        with zipfile.ZipFile(file_path) as zip_file:
            with zip_file.open(dat_file_name) as dat_file:
                nrows = len(dat_file.readlines())
        if benchmarks:
            logger.info("Initial sweep took {} seconds".format(time.time() - initial_sweep))

        block_size = 10000
        batches = nrows//block_size
        # skip the first line again if the last batch is also the first batch
        skiplastrows = 2 if batches == 0 else 1
        last_block_size = (nrows % block_size)-skiplastrows
        batch = 0
        added_rows = 0
        while batch <= batches:
            skiprows = 1 if batch == 0 else (batch*block_size)
            nrows = (((batch+1)*block_size)-skiprows) if (batch < batches) else last_block_size
            logger.info('Loading rows %s to %s', skiprows+1, nrows+skiprows)

            with zipfile.ZipFile(file_path) as zip_file:
                with zip_file.open(dat_file_name) as dat_file:
                    csv_data = pd.read_csv(dat_file, dtype=str, header=None, skiprows=skiprows, nrows=nrows, sep='|',
                                           usecols=column_header_mapping_ordered.values(),
                                           names=column_header_mapping_ordered.keys(), quoting=3)

                    # add deactivation_date column for delete records
                    lambda_func = (lambda sam_extract: pd.Series([dat_file_date if sam_extract == "1" else np.nan]))
                    csv_data = csv_data.assign(deactivation_date=pd.Series([np.nan], name='deactivation_date')
                                               if monthly else csv_data["sam_extract_code"].apply(lambda_func))
                    # removing rows where DUNS number isn't even provided
                    csv_data = csv_data.where(csv_data["awardee_or_recipient_uniqu"].notnull())
                    # cleaning and replacing NaN/NaT with None's
                    csv_data = clean_sam_data(csv_data.where(pd.notnull(csv_data), None))

                    if monthly:
                        logger.info("Adding all monthly data with bulk load")
                        if benchmarks:
                            bulk_month_load = time.time()
                        del csv_data["sam_extract_code"]
                        insert_dataframe(csv_data, DUNS.__table__.name, sess.connection())
                        if benchmarks:
                            logger.info("Bulk month load took {} seconds".format(time.time()-bulk_month_load))
                    else:
                        add_data = csv_data[csv_data.sam_extract_code == '2']
                        update_delete_data = csv_data[(csv_data.sam_extract_code == '3') |
                                                      (csv_data.sam_extract_code == '1')]
                        for dataframe in [add_data, update_delete_data]:
                            del dataframe["sam_extract_code"]

                        if not add_data.empty:
                            try:
                                logger.info("Attempting to bulk load add data")
                                insert_dataframe(add_data, DUNS.__table__.name, sess.connection())
                            except IntegrityError:
                                logger.info("Bulk loading add data failed, loading add data by row")
                                sess.rollback()
                                models, activated_models = get_relevant_models(add_data, benchmarks=benchmarks)
                                logger.info("Loading add data ({} rows)".format(len(add_data.index)))
                                load_duns_by_row(add_data, sess, models, activated_models, benchmarks=benchmarks)
                        if not update_delete_data.empty:
                            models, activated_models = get_relevant_models(update_delete_data, benchmarks=benchmarks)
                            logger.info("Loading update_delete data ({} rows)".format(len(update_delete_data.index)))
                            load_duns_by_row(update_delete_data, sess, models, activated_models, benchmarks=benchmarks)
                    sess.commit()

            added_rows += nrows
            batch += 1
            logger.info('%s DUNS records inserted', added_rows)
        if benchmarks:
            logger.info("Parsing {} took {} seconds with {} rows".format(dat_file_name, time.time()-parse_start_time,
                                                                         added_rows))
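
# The sam_extract_code column drives the whole daily branch above: '2' rows are inserted in
# bulk, '1' and '3' rows go through the row-by-row loader, and '1' rows (deactivations) are
# stamped with the date parsed from the file name. A simplified pandas sketch of that split,
# using np.where instead of the Series-returning lambda in the loader:
import numpy as np
import pandas as pd

dat_file_date = '20180401'   # illustrative date, as parsed from the .dat file name above
csv_data = pd.DataFrame({'awardee_or_recipient_uniqu': ['000000001', '000000002', '000000003'],
                         'sam_extract_code': ['2', '3', '1']})

csv_data['deactivation_date'] = np.where(csv_data['sam_extract_code'] == '1', dat_file_date, None)

add_data = csv_data[csv_data.sam_extract_code == '2']                       # bulk insert
update_delete_data = csv_data[csv_data.sam_extract_code.isin(['1', '3'])]   # update / deactivate
print(len(add_data), len(update_delete_data))   # 1 2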
コード例 #44
0
    def load_sql(cls, filename):
        """Load SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(RuleSql).delete()

            filename = os.path.join(cls.sql_rules_path, filename)

            # open csv
            with open(filename, 'r') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    sql = cls.read_sql_str(row['query_name'])

                    rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                       rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["file_type"], row["rule_label"]))
                    try:
                        if row["target_file"].strip() == "":
                            # No target file provided
                            target_file_id = None
                        else:
                            target_file_id = FILE_TYPE_DICT[row["target_file"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["target_file"], row["rule_label"]))

                    # set cross file flag
                    flag = FieldCleaner.clean_string(row["rule_cross_file_flag"])
                    if flag in ('true', 't', 'y', 'yes'):
                        cross_file_flag = True
                    else:
                        cross_file_flag = False

                    rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                    rule_sql.file_id = file_id
                    rule_sql.target_file_id = target_file_id
                    rule_sql.rule_cross_file_flag = cross_file_flag

                    sess.merge(rule_sql)
            sess.commit()
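
# For reference, the shape of one rules-CSV row as csv.DictReader hands it to the loop above.
# The field names are the ones the loader reads; every value below is illustrative, and the
# severity/file-type strings are assumptions about the lookup dictionaries' keys.
row = {
    'query_name': 'a1_example_rule',                  # hypothetical SQL file under sql_rules_path
    'rule_label': 'A1',
    'rule_error_message': 'Example error message shown to agencies.',
    'file_type': 'appropriations',                    # must be a key of FILE_TYPE_DICT
    'target_file': '',                                # blank means target_file_id stays None
    'rule_cross_file_flag': 'false',                  # only true/t/y/yes become True
    'severity_name': 'fatal',                         # assumed key of RULE_SEVERITY_DICT
}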
コード例 #45
0
def load_program_activity_data(base_path):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
    """
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()):
        return

    program_activity_file = get_program_activity_file(base_path)

    logger.info('Loading program activity: ' + PA_FILE_NAME)

    with create_app().app_context():
        sess = GlobalDB.db().session
        try:
            data = pd.read_csv(program_activity_file, dtype=str)
        except pd.io.common.EmptyDataError as e:
            log_blank_file()
            exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
            return
        headers = set([header.upper() for header in list(data)])

        if not VALID_HEADERS.issubset(headers):
            logger.error("Missing required headers. Required headers include: %s" % str(VALID_HEADERS))
            exit_if_nonlocal(4)
            return

        try:
            dropped_count, data = clean_data(
                data,
                ProgramActivity,
                {"fyq": "fiscal_year_quarter", "agency_code": "agency_id", "allocation_id": "allocation_transfer_id",
                 "account_code": "account_number", "pa_code": "program_activity_code",
                 "pa_title": "program_activity_name"},
                {"program_activity_code": {"pad_to_length": 4}, "agency_id": {"pad_to_length": 3},
                 "allocation_transfer_id": {"pad_to_length": 3, "keep_null": True},
                 "account_number": {"pad_to_length": 4}},
                ["agency_id", "program_activity_code", "account_number", "program_activity_name"],
                True
            )
        except FailureThresholdExceededException as e:
            if e.count == 0:
                log_blank_file()
                exit_if_nonlocal(4)
                return
            else:
                count_str = "Application tried to drop {} rows".format(e.count)
                logger.error("Loading of program activity file failed due to exceeded failure threshold. " + count_str)
                exit_if_nonlocal(5)
                return

        sess.query(ProgramActivity).delete()

        # Lowercase Program Activity Name
        data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))

        # because we're only loading a subset of program activity info,
        # there will be duplicate records in the dataframe. This is ok,
        # but they need to be de-duped before the db load. We also need to log them.
        base_count = data.shape[0]
        data.drop_duplicates(inplace=True)
        logger.info("Dropped {} duplicate rows.".format(base_count - data.shape[0]))

        # insert to db
        table_name = ProgramActivity.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    set_stored_pa_last_upload(last_upload)
    logger.info('{} records inserted to {}'.format(num, table_name))

    if dropped_count > 0:
        exit_if_nonlocal(3)
        return
コード例 #46
0
def uncache_all_files():
    logger.info('Un-caching all generated files')
    with create_app().app_context():
        sess = GlobalDB.db().session
        sess.query(FileGeneration).update({"is_cached_file": False}, synchronize_session=False)
        sess.commit()
コード例 #47
0
def load_program_activity_data(base_path, force_reload=False, export=False):
    """ Load program activity lookup table.

        Args:
            base_path: directory of domain config files
            force_reload: whether or not to force a reload
            export: whether or not to export a public copy of the file
    """
    now = datetime.datetime.now()
    metrics_json = {
        'script_name': 'load_program_activity.py',
        'start_time': str(now),
        'records_received': 0,
        'duplicates_dropped': 0,
        'invalid_records_dropped': 0,
        'records_deleted': 0,
        'records_inserted': 0
    }
    dropped_count = 0

    logger.info('Checking PA upload dates to see if we can skip.')
    last_upload = get_date_of_current_pa_upload(base_path)
    if not (last_upload > get_stored_pa_last_upload()) and not force_reload:
        logger.info('Skipping load as it\'s already been done')
    else:
        logger.info('Getting the program activity file')
        program_activity_file = get_program_activity_file(base_path)

        logger.info('Loading program activity: {}'.format(PA_FILE_NAME))

        with create_app().app_context():
            sess = GlobalDB.db().session
            try:
                raw_data = pd.read_csv(program_activity_file, dtype=str)
            except pd.io.common.EmptyDataError:
                log_blank_file()
                exit_if_nonlocal(4)  # exit code chosen arbitrarily, to indicate distinct failure states
                return
            headers = set([header.upper() for header in list(raw_data)])

            if not VALID_HEADERS.issubset(headers):
                logger.error('Missing required headers. Required headers include: %s' % str(VALID_HEADERS))
                exit_if_nonlocal(4)
                return

            try:
                dropped_count, data = clean_data(
                    raw_data,
                    ProgramActivity,
                    {'fyq': 'fiscal_year_period', 'agency_code': 'agency_id', 'allocation_id': 'allocation_transfer_id',
                     'account_code': 'account_number', 'pa_code': 'program_activity_code',
                     'pa_title': 'program_activity_name'},
                    {'program_activity_code': {'pad_to_length': 4}, 'agency_id': {'pad_to_length': 3},
                     'allocation_transfer_id': {'pad_to_length': 3, 'keep_null': True},
                     'account_number': {'pad_to_length': 4}},
                    ['agency_id', 'program_activity_code', 'account_number', 'program_activity_name'],
                    True
                )
            except FailureThresholdExceededException as e:
                if e.count == 0:
                    log_blank_file()
                    exit_if_nonlocal(4)
                    return
                else:
                    logger.error('Loading of program activity file failed due to exceeded failure threshold. '
                                 'Application tried to drop {} rows'.format(e.count))
                    exit_if_nonlocal(5)
                    return

            metrics_json['records_deleted'] = sess.query(ProgramActivity).delete()
            metrics_json['invalid_records_dropped'] = dropped_count

            # Lowercase Program Activity Name
            data['program_activity_name'] = data['program_activity_name'].apply(lambda x: lowercase_or_notify(x))
            # Convert FYQ to FYP
            data['fiscal_year_period'] = data['fiscal_year_period'].apply(lambda x: convert_fyq_to_fyp(x))

            # because we're only loading a subset of program activity info, there will be duplicate records in the
            # dataframe. This is ok, but they need to be de-duped before the db load. We also need to log them.
            base_count = len(data.index)
            metrics_json['records_received'] = base_count
            data.drop_duplicates(inplace=True)

            dupe_count = base_count - len(data.index)
            logger.info('Dropped {} duplicate rows.'.format(dupe_count))
            metrics_json['duplicates_dropped'] = dupe_count

            # insert to db
            table_name = ProgramActivity.__table__.name
            num = insert_dataframe(data, table_name, sess.connection())
            sess.commit()

            if export:
                export_public_pa(raw_data)

        end_time = datetime.datetime.now()
        update_external_data_load_date(now, end_time, 'program_activity')
        update_external_data_load_date(last_upload, end_time, 'program_activity_upload')
        logger.info('{} records inserted to {}'.format(num, table_name))
        metrics_json['records_inserted'] = num

        metrics_json['duration'] = str(end_time - now)

    with open('load_program_activity_metrics.json', 'w+') as metrics_file:
        json.dump(metrics_json, metrics_file)

    if dropped_count > 0:
        exit_if_nonlocal(3)
        return
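
# convert_fyq_to_fyp is imported from elsewhere in the codebase; only its call appears above.
# As a rough stand-in (an assumption, not the broker's actual implementation), the conversion
# can be read as "quarter N ends at period N * 3":
import re

def convert_fyq_to_fyp_sketch(fyq):
    match = re.match(r'^FY(\d{2,4})Q([1-4])$', str(fyq).upper())
    if not match:
        return fyq                                   # already a period (or unparseable): leave it
    year, quarter = match.groups()
    return 'FY{}P{:02d}'.format(year, int(quarter) * 3)

print(convert_fyq_to_fyp_sketch('FY2020Q2'))   # FY2020P06
print(convert_fyq_to_fyp_sketch('FY2020P06'))  # unchanged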
コード例 #48
0
def setup_validation_db():
    """Create validation tables from model metadata and do initial inserts."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
コード例 #49
0
def setup_submission_type_db():
    """Create job tracker tables from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
コード例 #50
0
import argparse
import logging

from dataactcore.interfaces.db import GlobalDB
from dataactcore.logging import configure_logging
from dataactcore.utils.parentDuns import sam_config_is_valid, get_duns_batches, update_missing_parent_names
from dataactvalidator.health_check import create_app

logger = logging.getLogger(__name__)


if __name__ == '__main__':
    configure_logging()

    with create_app().app_context():
        parser = argparse.ArgumentParser(description='Update parent duns columns in DUNS table')
        parser.add_argument('-b', '--batch_start', help='Batch to start with (type int)', type=int, default=0)
        parser.add_argument('-e', '--batch_end', help='Batch to end with (type int)', type=int)
        parser.add_argument('-n', '--parent_name', help='Derives parent name at the end', action='store_true')

        args = parser.parse_args()

        # Parse argument to do load on certain update date
        # Possible option if want to do make sure items load
        sess = GlobalDB.db().session

        if args.parent_name:
            # Derive missing parent names when a parent DUNS number is provided
            update_missing_parent_names(sess)

        else:
コード例 #51
0
def load_cfda_program(base_path):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
    """
    if CONFIG_BROKER["use_aws"]:
        s3_client = boto3.client('s3', region_name=CONFIG_BROKER['aws_region'])
        filename = s3_client.generate_presigned_url(
            'get_object', {
                'Bucket': CONFIG_BROKER['sf_133_bucket'],
                'Key': "cfda_program.csv"
            },
            ExpiresIn=600)
    else:
        filename = os.path.join(base_path, "cfda_program.csv")

    logger.info('Loading CFDA program file: ' + "cfda_program.csv")
    """Load country code lookup table."""
    model = CFDAProgram

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session
        # for object class, delete and replace values
        sess.query(model).delete()

        data = pd.read_csv(filename, dtype=str, encoding='latin1')

        data = clean_data(
            data, model, {
                "program_title": "program_title",
                "program_number": "program_number",
                "popular_name_(020)": "popular_name",
                "federal_agency_(030)": "federal_agency",
                "authorization_(040)": "authorization",
                "objectives_(050)": "objectives",
                "types_of_assistance_(060)": "types_of_assistance",
                "uses_and_use_restrictions_(070)": "uses_and_use_restrictions",
                "applicant_eligibility_(081)": "applicant_eligibility",
                "beneficiary_eligibility_(082)": "beneficiary_eligibility",
                "credentials/documentation_(083)": "credentials_documentation",
                "preapplication_coordination_(091)":
                "preapplication_coordination",
                "application_procedures_(092)": "application_procedures",
                "award_procedure_(093)": "award_procedure",
                "deadlines_(094)": "deadlines",
                "range_of_approval/disapproval_time_(095)":
                "range_of_approval_disapproval_time",
                "appeals_(096)": "appeals",
                "renewals_(097)": "renewals",
                "formula_and_matching_requirements_(101)":
                "formula_and_matching_requirements",
                "length_and_time_phasing_of_assistance_(102)":
                "length_and_time_phasing_of_assistance",
                "reports_(111)": "reports",
                "audits_(112)": "audits",
                "records_(113)": "records",
                "account_identification_(121)": "account_identification",
                "obligations_(122)": "obligations",
                "range_and_average_of_financial_assistance_(123)":
                "range_and_average_of_financial_assistance",
                "program_accomplishments_(130)": "program_accomplishments",
                "regulations__guidelines__and_literature_(140)":
                "regulations_guidelines_and_literature",
                "regional_or__local_office_(151)": "regional_or_local_office",
                "headquarters_office_(152)": "headquarters_office",
                "website_address_(153)": "website_address",
                "related_programs_(160)": "related_programs",
                "examples_of_funded_projects_(170)":
                "examples_of_funded_projects",
                "criteria_for_selecting_proposals_(180)":
                "criteria_for_selecting_proposals",
                "url": "url",
                "recovery": "recovery",
                "omb_agency_code": "omb_agency_code",
                "omb_bureau_code": "omb_bureau_code",
                "published_date": "published_date",
                "archived_date": "archived_date"
            }, {})
        data["published_date"] = format_date(data["published_date"])
        data["archived_date"] = format_date(data["archived_date"])

        # insert to db
        table_name = model.__table__.name
        num = insert_dataframe(data, table_name, sess.connection())
        sess.commit()

    logger.info('{} records inserted to {}'.format(num, table_name))
コード例 #52
0
def read_zips():
    """ Update zip codes in the zips table. """
    with create_app().app_context():
        sess = GlobalDB.db().session

        # Create temporary table to do work in so we don't disrupt the site for too long by altering the actual table
        sess.execute(
            'CREATE TABLE IF NOT EXISTS temp_zips (LIKE zips INCLUDING ALL);')
        # Truncating in case we didn't clear out this table after a failure in the script
        sess.execute('TRUNCATE TABLE temp_zips;')
        sess.commit()

        if CONFIG_BROKER["use_aws"]:
            zip_folder = CONFIG_BROKER["zip_folder"] + "/"
            s3_client = boto3.client('s3',
                                     region_name=CONFIG_BROKER['aws_region'])
            response = s3_client.list_objects_v2(
                Bucket=CONFIG_BROKER['sf_133_bucket'], Prefix=zip_folder)
            for obj in response.get('Contents', []):
                if obj['Key'] != zip_folder:
                    zip_4_file_path = s3_client.generate_presigned_url(
                        'get_object', {
                            'Bucket': CONFIG_BROKER['sf_133_bucket'],
                            'Key': obj['Key']
                        },
                        ExpiresIn=600)
                    parse_zip4_file(urllib.request.urlopen(zip_4_file_path),
                                    sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = s3_client.generate_presigned_url(
                'get_object', {
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': "ctystate.txt"
                },
                ExpiresIn=600)
            parse_citystate_file(urllib.request.urlopen(citystate_file), sess)

            census_file = s3_client.generate_presigned_url(
                'get_object', {
                    'Bucket': CONFIG_BROKER['sf_133_bucket'],
                    'Key': "census_congressional_districts.csv"
                },
                ExpiresIn=600)
        else:
            base_path = os.path.join(CONFIG_BROKER["path"], "dataactvalidator",
                                     "config", CONFIG_BROKER["zip_folder"])
            # creating the list while ignoring hidden files on mac
            file_list = [
                f for f in os.listdir(base_path) if not re.match(r'^\.', f)
            ]
            for file in file_list:
                parse_zip4_file(open(os.path.join(base_path, file)), sess)

            # parse remaining 5 digit zips that weren't in the first file
            citystate_file = os.path.join(CONFIG_BROKER["path"],
                                          "dataactvalidator", "config",
                                          "ctystate.txt")
            parse_citystate_file(open(citystate_file), sess)

            census_file = os.path.join(base_path,
                                       "census_congressional_districts.csv")

        hot_swap_zip_tables(sess)
        update_state_congr_table_current(sess)
        update_state_congr_table_census(census_file, sess)

        logger.info("Zipcode script complete")
コード例 #53
0
def load_cfda_program(base_path, load_local=False, local_file_name="cfda_program.csv"):
    """ Load cfda program.

        Args:
            base_path: directory that contains the cfda values files.
    """
    if not load_local:
        logger.info("Fetching CFDA file from {}".format(S3_CFDA_FILE))
        tmp_name = str(time.time()).replace(".", "") + "_cfda_program.csv"
        filename = os.path.join(base_path, tmp_name)
        r = requests.get(S3_CFDA_FILE, allow_redirects=True)
        open(filename, 'wb').write(r.content)
    else:
        filename = os.path.join(base_path, local_file_name)
    logger.info('Loading CFDA program file: ' + filename)
    """Load country code lookup table."""
    model = CFDAProgram

    def fix_program_number(n, decimals=3):
        multiplier = 10 ** decimals
        value = math.floor(n * multiplier + 0.5) / multiplier
        return str(value).ljust(6, '0')

    with create_app().app_context():
        configure_logging()
        sess = GlobalDB.db().session

        now = datetime.utcnow()
        import_data = pd.read_csv(filename, dtype=str, encoding='cp1252', na_filter=False)
        import_data = clean_data(
            import_data,
            model,
            DATA_CLEANING_MAP,
            {}
        )
        import_data["published_date"] = format_date(import_data["published_date"])
        import_data["archived_date"] = format_date(import_data["archived_date"])
        import_dataframe = import_data.copy(deep=True)
        # To do the comparison, first we need to mock the pk column that postgres creates. We'll set it universally to 1
        import_dataframe = import_dataframe.assign(cfda_program_id=1, created_at=now, updated_at=now)

        table_name = model.__table__.name
        current_data = pd.read_sql_table(table_name, sess.connection(), coerce_float=False)
        # Now we need to overwrite the db's audit dates in the created dataframe, and
        # also set all the pks to 1, so they match
        current_data = current_data.assign(cfda_program_id=1, created_at=now, updated_at=now)
        # pandas comparison requires everything to be in the same order
        current_data.sort_values('program_number', inplace=True)
        import_dataframe.sort_values('program_number', inplace=True)

        # columns too
        cols = import_dataframe.columns.tolist()
        cols.sort()
        import_dataframe = import_dataframe[cols]

        cols = current_data.columns.tolist()
        cols.sort()
        current_data = current_data[cols]

        # need to reset the indexes now that we've done all this sorting, so that they match
        import_dataframe.reset_index(drop=True, inplace=True)
        current_data.reset_index(drop=True, inplace=True)
        # My favorite part: When pandas pulls the data out of postgres, the program_number column
        # is a Decimal. However, in adding it to the dataframe, this column loses precision.
        # So for example, a program number of 10.001 imports into the dataframe as 10.000999999999999.
        # It also needs to be cast to a string, and padded with the right number of zeroes, as needed.
        current_data['program_number'] = current_data['program_number'].apply(lambda x: fix_program_number(x))
        # Finally, you can execute this and get True back if the data truly has not changed from the last
        # time the CSV was loaded.
        new_data = not import_dataframe.equals(current_data)
        if new_data:
            # insert to db
            sess.query(model).delete()
            num = insert_dataframe(import_data, table_name, sess.connection())
            sess.commit()
    if not load_local:
        os.remove(filename)
    if new_data:
        logger.info('{} records inserted to {}'.format(num, table_name))
    else:
        logger.info("Skipped cfda load, no new data.")
        sys.exit(3)
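
# The comparison above boils down to normalizing both frames (fake pk, pinned audit dates,
# identical row and column order) and calling DataFrame.equals. A toy illustration of why
# the sorting and reset_index steps matter:
import pandas as pd

now = pd.Timestamp('2020-01-01')
imported = pd.DataFrame({'program_number': ['10.001', '93.600'], 'program_title': ['A', 'B']})
current = pd.DataFrame({'program_number': ['93.600', '10.001'], 'program_title': ['B', 'A']})

for frame in (imported, current):
    frame['cfda_program_id'] = 1        # fake the serial pk so it cannot differ
    frame['created_at'] = now           # pin the audit columns to the same timestamp
    frame['updated_at'] = now
    frame.sort_values('program_number', inplace=True)
    frame.reset_index(drop=True, inplace=True)

print(imported[sorted(imported.columns)].equals(current[sorted(current.columns)]))  # True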
コード例 #54
0
def setup_job_tracker_db():
    """Create job tracker tables from model metadata."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()
コード例 #55
0
def setup_validation_db():
    """Create validation tables from model metadata and do initial inserts."""
    with create_app().app_context():
        sess = GlobalDB.db().session
        insert_codes(sess)
        sess.commit()