def normalize_headers(header_row, long_headers, long_to_short_dict):
    """ Clean the headers (remove extra spaces and lowercase) and convert them to short headers if we're given long
        headers

        Args:
            header_row: an array of the file headers given
            long_headers: boolean indicating if we're using the long versions of the headers (True for long)
            long_to_short_dict: a dictionary containing a mapping from long headers to short ones for this file type

        Yields:
            A string containing the cleaned header name (converted to short version if long versions were provided and
            there is a mapping for that header).
    """
    for header in header_row:
        header = FieldCleaner.clean_string(header)
        # Replace headers that don't match the DB but are allowed by the broker with their DB matches
        if header == 'deobligationsrecoveriesrefundsofprioryearbyprogramobjectclass_cpe':
            # the correctly spelled header does NOT match the db; this misspelling ('...refundsdof...') DOES
            header = 'deobligationsrecoveriesrefundsdofprioryearbyprogramobjectclass_cpe'
        elif header == 'facevalueloanguarantee':
            header = 'facevalueofdirectloanorloanguarantee'
        elif header == 'budgetauthorityavailableamounttotal_cpe':
            header = 'totalbudgetaryresources_cpe'
        elif header == 'correctionlatedeleteindicator':
            header = 'correctiondeleteindicator'
        elif header == 'place_of_performance_zip4':
            header = 'place_of_performance_zip4a'

        # yield the short header when applicable, otherwise yield the cleaned header, whatever it is
        if long_headers and header in long_to_short_dict:
            yield FieldCleaner.clean_string(long_to_short_dict[header])
        else:
            yield header
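
A minimal usage sketch, not part of the broker codebase: the FieldCleaner stand-in below just strips and lowercases, and the long-to-short mapping is a single made-up entry, which is enough to watch the generator rename one header and pass an unknown one through.

# Hypothetical stand-in: the real FieldCleaner lives in the broker codebase and
# does more than strip/lowercase.
class FieldCleaner:
    @staticmethod
    def clean_string(value):
        return value.strip().lower()

long_to_short = {'facevalueofdirectloanorloanguarantee': 'face_value_loan_guarantee'}
cleaned = normalize_headers([' FaceValueLoanGuarantee ', 'unknown_header'],
                            long_headers=True, long_to_short_dict=long_to_short)
print(list(cleaned))  # ['face_value_loan_guarantee', 'unknown_header']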
Example #4
    def load_fields(file_type_name, schema_file_name):
        """Load specified schema from a .csv."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # get file type object for specified fileTypeName
            file_type = sess.query(FileType).filter(FileType.name == file_type_name).one()

            # delete existing schema from database
            SchemaLoader.remove_columns_by_file_type(sess, file_type)

            # get allowable datatypes
            type_query = sess.query(FieldType.name, FieldType.field_type_id).all()
            types = {data_type.name: data_type.field_type_id for data_type in type_query}

            # add schema to database ('rU' is deprecated; plain 'r' gives universal newlines in Python 3)
            with open(schema_file_name, 'r') as csvfile:
                reader = csv.DictReader(csvfile)
                file_column_count = 0
                for record in reader:
                    record = FieldCleaner.clean_record(record)

                    fields = ["fieldname", "required", "data_type"]
                    if all(field in record for field in fields):
                        SchemaLoader.add_column_by_file_type(
                            sess,
                            types,
                            file_type,
                            FieldCleaner.clean_string(record["fieldname"]),
                            FieldCleaner.clean_string(record["fieldname_short"]),
                            record["required"],
                            record["data_type"],
                            record["padded_flag"],
                            record["field_length"])
                        file_column_count += 1
                    else:
                        raise ValueError('CSV File does not follow schema')

                sess.commit()
                logger.info({
                    'message': '{} {} schema records added to {}'.format(file_column_count, file_type_name,
                                                                         FileColumn.__tablename__),
                    'message_type': 'ValidatorInfo',
                    'file_type': file_type.letter_name
                })
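
For reference, a hedged sketch of a schema .csv that load_fields would accept. The column set is inferred from the record keys the loader reads; the sample row values are illustrative only.

# Hypothetical schema file; column names inferred from the keys read above.
sample = (
    "fieldname,fieldname_short,required,data_type,padded_flag,field_length\n"
    "AllocationTransferAgencyIdentifier,allocation_transfer_agency,true,string,true,3\n"
)
with open('sample_schema.csv', 'w') as f:
    f.write(sample)
# SchemaLoader.load_fields('appropriations', 'sample_schema.csv')  # needs a live broker DB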
    def count_and_set_headers(self, csv_schema, header_row):
        """Track how many times we've seen a field we were expecting and set self.expected_headers and
        self.flex_headers"""
        self.expected_headers = []
        self.flex_headers = []

        # Track how many times we've seen a field we were expecting. Keyed by the shorter, machine-readable column names
        expected_fields = {}

        for schema in csv_schema:
            expected_fields[FieldCleaner.clean_string(schema.name_short)] = 0

        for header_value in header_row:
            if header_value not in expected_fields:
                # Add flex headers to flex list
                if str(header_value).startswith("flex_"):
                    self.flex_headers.append(header_value)
                else:
                    self.flex_headers.append(None)
                # Allow unexpected headers, just mark the header as None so we skip it when reading
                self.expected_headers.append(None)
            else:
                self.flex_headers.append(None)
                self.expected_headers.append(header_value)
                expected_fields[header_value] += 1
        return expected_fields
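
Because the method only reads csv_schema and writes two lists onto self, it can be exercised standalone. A sketch with a hypothetical two-column schema, treating the method as a plain function and reusing the clean_string stand-in from the first sketch:

# Column and the SimpleNamespace reader are hypothetical stand-ins.
from collections import namedtuple
from types import SimpleNamespace

Column = namedtuple('Column', 'name_short')
reader = SimpleNamespace()  # stands in for the csv-reader instance (self)
counts = count_and_set_headers(reader, [Column('award_id'), Column('amount')],
                               ['award_id', 'flex_note', 'mystery'])
print(counts)                   # {'award_id': 1, 'amount': 0}
print(reader.expected_headers)  # ['award_id', None, None]
print(reader.flex_headers)      # [None, 'flex_note', None]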
def use_long_headers(header_row, long_to_short_dict):
    """Check whether the header row contains the long or the short column names"""
    col_matches = 0
    for value in header_row:
        if FieldCleaner.clean_string(value) in long_to_short_dict:
            col_matches += 1
    # if most of the column headers are in the long format, we'll treat the file as having long headers
    return col_matches > .5 * len(header_row)
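
Note the strict > comparison: an exact 50/50 split counts as short headers. A sketch with a hypothetical mapping, again leaning on the clean_string stand-in above:

# Hypothetical long-to-short mapping for illustration.
mapping = {'totalbudgetaryresources_cpe': 'tbr', 'correctiondeleteindicator': 'cdi'}
print(use_long_headers(['TotalBudgetaryResources_CPE', 'cdi'], mapping))
# False: 1 of 2 matches is not a strict majority
print(use_long_headers(['totalbudgetaryresources_cpe', 'correctiondeleteindicator', 'x'], mapping))
# True: 2 of 3 matches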
Example #10
    def load_labels(cls, filename):
        """Load non-SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(ValidationLabel).delete()

            filename = os.path.join(cls.validation_labels_path, filename)

            # open csv ('rU' is deprecated; plain 'r' gives universal newlines in Python 3)
            with open(filename, 'r') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    validation_label = ValidationLabel(label=row['label'], error_message=row['error_message'],
                                                       column_name=row['column_name'], label_type=row['label_type'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["file_type"], row["rule_label"]))

                    validation_label.file_id = file_id

                    sess.merge(validation_label)
            sess.commit()
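
The header validation is plain set arithmetic. Shown standalone, with the expected header set assumed from the row keys load_labels reads (the real cls.headers may differ):

# Assumed expected headers; inferred from the row keys used above.
expected = {'label', 'error_message', 'column_name', 'label_type', 'file_type'}
got = ['label', 'error_message', 'column_name', 'label_type', 'filetype']
print(set(got) - expected)  # {'filetype'} -> would raise KeyError (unexpected field)
print(expected - set(got))  # {'file_type'} -> would raise ValueError (missing field)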
Example #12
    def load_sql(cls, filename):
        """Load SQL-based validation rules to db."""
        with create_app().app_context():
            sess = GlobalDB.db().session

            # Delete all records currently in table
            sess.query(RuleSql).delete()

            filename = os.path.join(cls.sql_rules_path, filename)

            # open csv ('rU' is deprecated; plain 'r' gives universal newlines in Python 3)
            with open(filename, 'r') as csvfile:
                # read header
                header = csvfile.readline()
                # split header into field names
                raw_field_names = header.split(',')
                field_names = []
                # clean field names
                for field in raw_field_names:
                    field_names.append(FieldCleaner.clean_string(field))

                unknown_fields = set(field_names) - set(cls.headers)
                if len(unknown_fields) != 0:
                    raise KeyError("".join(["Found unexpected fields: ", str(list(unknown_fields))]))

                missing_fields = set(cls.headers) - set(field_names)
                if len(missing_fields) != 0:
                    raise ValueError("".join(["Missing required fields: ", str(list(missing_fields))]))

                reader = csv.DictReader(csvfile, fieldnames=field_names)
                for row in reader:
                    sql = cls.read_sql_str(row['query_name'])

                    rule_sql = RuleSql(rule_sql=sql, rule_label=row['rule_label'],
                                       rule_error_message=row['rule_error_message'], query_name=row['query_name'])

                    # look up file type id
                    try:
                        file_id = FILE_TYPE_DICT[row["file_type"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["file_type"], row["rule_label"]))
                    try:
                        if row["target_file"].strip() == "":
                            # No target file provided
                            target_file_id = None
                        else:
                            target_file_id = FILE_TYPE_DICT[row["target_file"]]
                    except Exception as e:
                        raise Exception("{}: file type={}, rule label={}. Rule not loaded.".format(
                            e, row["target_file"], row["rule_label"]))

                    # set cross file flag
                    flag = FieldCleaner.clean_string(row["rule_cross_file_flag"])
                    cross_file_flag = flag in ('true', 't', 'y', 'yes')

                    rule_sql.rule_severity_id = RULE_SEVERITY_DICT[row['severity_name']]
                    rule_sql.file_id = file_id
                    rule_sql.target_file_id = target_file_id
                    rule_sql.rule_cross_file_flag = cross_file_flag

                    sess.merge(rule_sql)
            sess.commit()
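
The cross-file flag accepts four truthy spellings after cleaning; anything else, including blanks, falls through to False. The parse in isolation, assuming clean_string strips and lowercases:

# Standalone sketch of the flag parse.
def parse_cross_file_flag(raw):
    return raw.strip().lower() in ('true', 't', 'y', 'yes')

print(parse_cross_file_flag(' Yes '))  # True
print(parse_cross_file_flag('0'))      # False
print(parse_cross_file_flag(''))       # False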