Example #1
def test_header_case_with_meta(uppercase_meta: bool, uppercase_data: bool,
                               headers_ignore_case: bool):
    """
    Tests that the correct result is returned when headers_ignore_case is used and
    either the data or the metadata has capitalised column names. The expected
    result is equivalent to:
        (uppercase_data == uppercase_meta) OR headers_ignore_case
    i.e. both are the same case, or headers_ignore_case is True.
    """

    test_folder = "tests/data/headers/"

    # get the meta and set the correct case for the col names
    metadata = Metadata.from_json(
        os.path.join(test_folder, "meta_data/table1.json"))
    if uppercase_meta:
        for c in metadata.columns:
            c["name"] = c["name"].upper()

    # pick the data file whose column names have the correct case
    if uppercase_data:
        full_file_path = os.path.join(test_folder, "table1_uppercase.csv")
    else:
        full_file_path = os.path.join(test_folder, "table1.csv")

    # get the expected result
    expected_result = (uppercase_data == uppercase_meta) or headers_ignore_case

    # get the validator and validate
    table_params = {"headers-ignore-case": headers_ignore_case}
    pv = PandasValidator(full_file_path, table_params, metadata)
    pv.read_data_and_validate()

    # assert the result is as expected
    assert expected_result == pv.response.result["valid"]
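
The parametrization that drives this test is not shown in the snippet; a minimal sketch of what it could look like, assuming all eight flag combinations are exercised (the decorator below is hypothetical):

import itertools

import pytest


# every combination of the three boolean flags; the test derives its own
# expected result, so only the flags need to be parametrized
@pytest.mark.parametrize(
    "uppercase_meta, uppercase_data, headers_ignore_case",
    list(itertools.product([True, False], repeat=3)),
)
def test_header_case_with_meta(uppercase_meta, uppercase_data, headers_ignore_case):
    ...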
Example #2
    def generate_to_meta(self,
                         arrow_schema: pa.Schema,
                         meta_init_dict: dict = None) -> Metadata:
        """Generates our metadata instance from an arrow schema

        Args:
            arrow_schema (pa.Schema): pa.Schema from an arrow table
            meta_init_dict (dict, optional): dict used to instantiate the
                Metadata object; any "columns" key in it is overwritten

        Returns:
            Metadata: An agnostic metadata instance
        """
        if not meta_init_dict:
            meta_init_dict = {}

        if "columns" in meta_init_dict:
            warnings.warn(
                "columns key found in meta_init_dict will be overwritten")

        meta_init_dict["columns"] = []
        meta_init_dict["_converted_from"] = "arrow_schema"

        for field in arrow_schema:
            meta_init_dict["columns"].append({
                "name": field.name,
                "type": self.reverse_convert_col_type(field.type),
            })

        m = Metadata.from_dict(meta_init_dict)
        return m
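
A minimal usage sketch, assuming the surrounding class is mojap-metadata's ArrowConverter (the import path and table name are illustrative):

import pyarrow as pa
from mojap_metadata.converters.arrow_converter import ArrowConverter

# build a small arrow schema and convert it to an agnostic Metadata instance
arrow_schema = pa.schema([("id", pa.int64()), ("name", pa.string())])
meta = ArrowConverter().generate_to_meta(arrow_schema, meta_init_dict={"name": "my_table"})
# meta.columns now holds one {"name": ..., "type": ...} dict per field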
Example #3
def validate_data(config: dict) -> ValidatorResult:

    validator_engine = config.get("validator-engine", "pandas")
    validator_params = config.get("validator-engine-params", {})

    all_table_responses = []
    validator = None

    for table_name, table_params in config["tables"].items():

        table_params["lint-response"] = []

        if table_params["matched_files"]:
            log.info(f"Linting {table_name}")

            meta_file_path = table_params.get("metadata",
                                              f"meta_data/{table_name}.json")

            meta_obj = Metadata.from_json(meta_file_path)
            meta_obj.set_col_type_category_from_types()
            metadata = meta_obj.to_dict()

            for i, matched_file in enumerate(table_params["matched_files"]):

                log.info(f"{matched_file} ...file {i+1} "
                         f"of {len(table_params['matched_files'])}")
                validator = get_validator[validator_engine](matched_file,
                                                            table_params,
                                                            metadata,
                                                            **validator_params)

                validator.read_data_and_validate()
                validator.write_validation_errors_to_log()

                table_response = {
                    "valid": validator.valid,
                    "response": validator.get_response_dict(),
                    "original-path": matched_file,
                    "table-name": table_name,
                }

                if table_response["valid"]:
                    log.info("...file passed.")
                else:
                    log.info("...file failed.")

                all_table_responses.append(table_response)

        else:
            log.info(f"SKIPPING {table_name}. No files found.")

    if all_table_responses:
        save_completion_status(config, all_table_responses)

    # return the response of the last validator that ran (None if nothing matched)
    return validator.response if validator else None
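
For reference, a hypothetical config dict containing every key the function above reads (all values are illustrative):

config = {
    "validator-engine": "pandas",      # optional, defaults to "pandas"
    "validator-engine-params": {},     # optional kwargs passed to the validator
    "tables": {
        "table1": {
            # optional; defaults to meta_data/<table_name>.json
            "metadata": "meta_data/table1.json",
            # files to validate; an empty list skips the table
            "matched_files": ["data/table1/part-0.csv"],
        },
    },
}

result = validate_data(config)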
Example #4
def _get_arrow_schema(schema: Union[pa.Schema, Metadata, dict]) -> pa.Schema:
    ac = ArrowConverter()
    if isinstance(schema, Metadata):
        # convert our Metadata into an arrow schema
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, dict):
        schema = Metadata.from_dict(schema)
        schema = ac.generate_from_meta(schema)
    elif isinstance(schema, pa.Schema):
        pass
    else:
        raise TypeError(f"schema type not allowed: {type(schema)}")

    return schema
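
A short sketch of the three accepted input types (the dict contents are illustrative):

import pyarrow as pa

# an arrow schema is passed through unchanged
s1 = _get_arrow_schema(pa.schema([("id", pa.int64())]))

# a Metadata instance or a plain dict is converted via the ArrowConverter
meta_dict = {"name": "my_table", "columns": [{"name": "id", "type": "int64"}]}
s2 = _get_arrow_schema(meta_dict)
s3 = _get_arrow_schema(Metadata.from_dict(meta_dict))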
Example #5
    def generate_to_meta(self, database: str, table: str) -> Metadata:
        # get the table information
        glue_client = boto3.client("glue")
        resp = glue_client.get_table(DatabaseName=database, Name=table)

        # pull out just the columns
        columns = resp["Table"]["StorageDescriptor"]["Columns"]

        # convert the columns
        mojap_meta_cols = self.convert_columns(columns)

        # check for partitions
        partitions = resp["Table"].get("PartitionKeys")
        if partitions:
            # convert
            part_cols_full = self.convert_columns(partitions)
            # extend the mojap_meta_cols with the partition cols
            mojap_meta_cols.extend(part_cols_full)
            part_cols = [p["name"] for p in part_cols_full]

            # make a metadata object
            meta = Metadata(name=table,
                            columns=mojap_meta_cols,
                            partitions=part_cols)
        else:
            meta = Metadata(name=table, columns=mojap_meta_cols)

        # get the file format if possible
        try:
            ff = resp["Table"]["StorageDescriptor"]["Parameters"].get(
                "classification")
        except KeyError:
            warnings.warn("unable to parse file format, please manually set")
            ff = None

        if ff:
            meta.file_format = ff.lower()

        return meta
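
A usage sketch, assuming the surrounding class is a Glue converter from mojap-metadata (the class name is an assumption) and that AWS credentials with Glue read access are configured:

converter = GlueConverter()  # assumed surrounding class

# pull the catalog entry for my_database.my_table and convert it
meta = converter.generate_to_meta("my_database", "my_table")
print(meta.name, meta.partitions, meta.file_format)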
Example #6
    def generate_to_meta(
        self,
        table_meta: TableMeta,
        data_format_mapper: Callable = None,
        col_type_mapper: Callable = None,
    ) -> Metadata:
        """Takes a TableMeta object and converts it to our Metadata object

        Args:
            table_meta (TableMeta): TableMeta object from etl-manager
            data_format_mapper (Callable, optional): If not set, the function
                just sets the file_format parameter to the str in the
                original data_format of the TableMeta. If you want to use
                your own mapper, set a function object on this param e.g.
                data_format_mapper = my_lookup_dict.get
            col_type_mapper (Callable, optional): If not set, the col type
                conversion from TableMeta -> Metadata is done using the
                converter's reverse_convert_col_type method. If you need a
                custom conversion, set a function on this parameter to use it
                instead of reverse_convert_col_type. This callable should
                expect the TableMeta col type str and return the Metadata col
                type str name.

        Returns:
            Metadata: An agnostic Metadata instance
        """

        table_meta_dict = deepcopy(table_meta.to_dict())

        renamed_params = {"data_format": "file_format"}
        for old_name, new_name in renamed_params.items():
            table_meta_dict[new_name] = table_meta_dict.pop(old_name)

        if data_format_mapper:
            table_meta_dict["file_format"] = data_format_mapper(
                table_meta_dict["file_format"])

        # remove etl_manager schema
        del table_meta_dict["$schema"]

        # convert columns
        etl_cols = table_meta_dict.pop("columns")
        for c in etl_cols:
            if col_type_mapper is None:
                c["type"] = self.reverse_convert_col_type(c["type"])
            else:
                c["type"] = col_type_mapper(c["type"])

        table_meta_dict["columns"] = etl_cols

        table_meta_dict["_converted_from"] = "etl_manager"
        return Metadata.from_dict(table_meta_dict)
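
A usage sketch following the docstring's suggestion of a lookup-dict mapper; the converter class name and dict contents are illustrative:

# illustrative mapping from etl-manager data formats to Metadata file formats
format_lookup = {"parquet": "parquet", "csv_quoted_nodate": "csv"}

converter = EtlManagerConverter()  # assumed surrounding class
meta = converter.generate_to_meta(table_meta, data_format_mapper=format_lookup.get)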
Example #7
    def generate_from_meta(
        self,
        metadata: Union[Metadata, str, dict],
        database_name: str = None,
        table_location: str = None,
        run_msck_repair: bool = False,
    ):
        """
        Creates a glue table from metadata
        arguments:
            - metadata: Metadata object, string path, or dictionary metadata.
            - database_name (optional): name of the glue database the table is to be
            created in. Can also be a property of the metadata.
            - table_location (optional): the s3 location of the table. Can also be a
            property of the metadata.
            - run_msck_repair (optional): run msck repair table on the created table;
            should be set to True for tables with partitions.
        Warns:
            - UserWarning if run_msck_repair is False, the metadata has partitions,
            and options.ignore_warnings is set to False
        """

        # infer the Metadata object first so its attributes can be used below
        metadata = Metadata.from_infer(metadata)

        # set database_name to metadata.database_name if not given
        database_name = database_name if database_name else metadata.database_name
        # do the same with table_location
        table_location = table_location if table_location else metadata.table_location

        glue_client = boto3.client(
            "glue",
            region_name=os.getenv("AWS_REGION",
                                  os.getenv("AWS_DEFAULT_REGION",
                                            "eu-west-1")),
        )
        boto_dict = self.gc.generate_from_meta(metadata,
                                               database_name=database_name,
                                               table_location=table_location)
        delete_table_if_exists(database=database_name, table=metadata.name)
        glue_client.create_table(**boto_dict)

        if (not run_msck_repair and metadata.partitions
                and not self.options.ignore_warnings):
            w = (
                "metadata has partitions and run_msck_repair is set to False. "
                "To suppress these warnings set this converter's "
                "options.ignore_warnings = True")
            warnings.warn(w)
        elif run_msck_repair:
            pydb.read_sql_query(
                f"msck repair table {database_name}.{metadata.name}")
Example #8
def test_headers(file_name, expected_result):
    """
    Tests files against the _read_data_and_validate function.
    runs each file and corresponding meta (table1 or table2).
    Against the additional table config params:
    - expected-headers is False
    - expected-headers is True and ignore-case is False
    - expected-headers is True and ignore-case is True
    In that order.
    Args:
        file_name ([str]): The filename in the dir tests/data/headers/
        expected_results ([Tuple(bool)]): expected results for the 3
        different config params listed above
    """
    test_folder = "tests/data/headers/"
    full_file_path = os.path.join(test_folder, file_name)

    table_name = file_name.split(".")[0].split("_")[0]
    metadata = Metadata.from_json(
        os.path.join(test_folder, f"meta_data/{table_name}.json"))

    table_params = [
        {"expect-header": False},
        {"expect-header": True, "headers-ignore-case": False},
        {"expect-header": True, "headers-ignore-case": True},
    ]

    all_tests = []
    for table_param in table_params:
        validator = PandasValidator(full_file_path, table_param, metadata)
        validator.read_data_and_validate()
        table_response = validator.response
        all_tests.append(table_response.result["valid"])

    assert expected_result == all_tests
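
The parametrization for this test is not included in the snippet; a hypothetical version (file names follow the naming used above, expected values are purely illustrative) could look like:

import pytest


@pytest.mark.parametrize(
    "file_name, expected_result",
    [
        ("table1.csv", [True, True, True]),             # illustrative values
        ("table1_uppercase.csv", [True, False, True]),  # illustrative values
    ],
)
def test_headers(file_name, expected_result):
    ...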