# Imports for these tests; the mojap_metadata module paths are a best guess
# and may need adjusting to the actual package layout.
import pyarrow as pa

from mojap_metadata import Metadata
from mojap_metadata.converters import BaseConverterOptions
from mojap_metadata.converters.arrow_converter import ArrowConverter


def test_generate_from_meta():
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    schema_str1 = ("my_int: int64 not null\nmy_double: double\n"
                   "my_date: date64[ms]\nmy_decimal: decimal(10, 2)")
    schema_str2 = schema_str1 + "\nmy_timestamp: timestamp[s]"
    assert schema1.to_string() == schema_str1
    assert schema2.to_string() == schema_str2
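

# A minimal sketch of what the two assertions above encode, assuming the
# converter's second positional argument toggles whether partition columns
# are included in the generated schema (its name is not shown in this test).
# The helper name expected_schema is made up for illustration.
def expected_schema(include_partitions: bool) -> pa.Schema:
    fields = [
        pa.field("my_int", pa.int64(), nullable=False),
        pa.field("my_double", pa.float64()),
        pa.field("my_date", pa.date64()),
        pa.field("my_decimal", pa.decimal128(10, 2)),
    ]
    if include_partitions:
        fields.append(pa.field("my_timestamp", pa.timestamp("s")))
    return pa.schema(fields)
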
def test_generate_from_meta_fields():
    # Variant of the test above (renamed to avoid shadowing): same metadata,
    # but the assertions compare field names and types directly instead of
    # the schema's string form.
    md = Metadata.from_dict({
        "name": "test_table",
        "file_format": "test-format",
        "columns": [
            {
                "name": "my_int",
                "type": "int64",
                "description": "This is an integer",
                "nullable": False,
            },
            {
                "name": "my_double",
                "type": "float64",
                "nullable": True
            },
            {
                "name": "my_date",
                "type": "date64"
            },
            {
                "name": "my_decimal",
                "type": "decimal128(10,2)"
            },
            {
                "name": "my_timestamp",
                "type": "timestamp(s)",
                "description": "Partition column",
            },
        ],
        "partitions": ["my_timestamp"],
    })

    ac = ArrowConverter()
    assert isinstance(ac.options, BaseConverterOptions)

    schema1 = ac.generate_from_meta(md)
    schema2 = ac.generate_from_meta(md, False)

    assert isinstance(schema1, pa.Schema)
    assert isinstance(schema2, pa.Schema)

    expected_names = ["my_int", "my_double", "my_date", "my_decimal"]
    expected_types = [
        pa.int64(),
        pa.float64(),
        pa.date64(),
        pa.decimal128(10, 2)
    ]
    assert schema1.names == expected_names

    checks1 = [a.equals(e) for a, e in zip(schema1.types, expected_types)]
    assert all(checks1)

    # Do schema2 assertions
    expected_names.append("my_timestamp")
    expected_types.append(pa.timestamp("s"))

    assert schema2.names == expected_names

    checks2 = [a.equals(e) for a, e in zip(schema2.types, expected_types)]
    assert all(checks2)

    # Also check specific type properties
    assert schema2.field("my_decimal").type.precision == 10
    assert schema2.field("my_decimal").type.scale == 2
    assert schema2.field("my_timestamp").type.unit == "s"

# Imports for this function; the arrow_pd_parser module paths are a best
# guess and may need adjusting to the actual package layout.
import pyarrow as pa
import pyarrow.parquet as pq
from pyarrow import csv, fs, json

from mojap_metadata import Metadata
from mojap_metadata.converters.arrow_converter import ArrowConverter
from arrow_pd_parser.parse import (
    pa_read_csv_to_pandas,
    pa_read_json_to_pandas,
)
from arrow_pd_parser.pa_pd import arrow_to_pandas


def _parse_data_to_pandas(filepath: str, table_params: dict, metadata: dict):
    """
    Reads the data from the given filepath and returns a dataframe.
    """

    # Column names from the metadata, excluding any partition columns
    meta_col_names = [
        c["name"] for c in metadata["columns"]
        if c["name"] not in metadata.get("partitions", [])
    ]

    # For string-based file formats (csv/json), make the arrow readers read
    # date/timestamp columns in as strings. Validators will still treat these
    # columns as dates, but will run the validation against string columns,
    # expecting values to match a timestamp format.
    if "json" in metadata["file_format"] or "csv" in metadata["file_format"]:
        md_obj = Metadata.from_dict(metadata)
        cols = md_obj.columns

        cols_to_force_str_read_in = []
        for c in cols:
            if c["type"].startswith("time") or c["type"].startswith("date"):
                c["type"] = "string"
                c["type_category"] = "string"
                cols_to_force_str_read_in.append(c["name"])

        md_obj.columns = cols
        ac = ArrowConverter()
        arrow_schema = ac.generate_from_meta(md_obj)

        # Schema containing just the coerced (now string) columns, used to
        # override the readers' type inference for those columns
        ts_as_str_schema = pa.schema([])
        for cname in cols_to_force_str_read_in:
            ts_as_str_schema = ts_as_str_schema.append(
                arrow_schema.field(cname))

    # Choose the filesystem to read from based on the filepath
    if filepath.startswith("s3://"):
        reader_fs = fs.S3FileSystem(region="eu-west-1")
        fp_for_file_reader = filepath.replace("s3://", "", 1)

    else:
        reader_fs = fs.LocalFileSystem()
        fp_for_file_reader = filepath

    with reader_fs.open_input_stream(fp_for_file_reader) as f:
        if "csv" in metadata["file_format"]:

            # Set newlines_in_values=True for a safer CSV load (handles
            # fields containing embedded newlines)
            if table_params.get("expect-header", True):
                po = csv.ParseOptions(newlines_in_values=True)
            else:
                po = csv.ParseOptions(newlines_in_values=True,
                                      column_names=meta_col_names)

            if ts_as_str_schema:
                co = csv.ConvertOptions(column_types=ts_as_str_schema)
            else:
                co = None

            df = pa_read_csv_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
                convert_options=co,
            )
            # dates/datetimes == string

        elif "json" in metadata["file_format"]:

            po = json.ParseOptions(
                newlines_in_values=True,
                explicit_schema=ts_as_str_schema if ts_as_str_schema else None,
            )

            df = pa_read_json_to_pandas(
                input_file=f,
                schema=arrow_schema,
                expect_full_schema=False,
                parse_options=po,
            )
            # dates/datetimes == string

        elif "parquet" in metadata["file_format"]:
            df = arrow_to_pandas(pq.read_table(f))
            # dates/datetimes == datetime / date

        else:
            raise ValueError(
                f"Unknown file_format in metadata: {metadata['file_format']}.")

    if table_params.get("row-limit"):
        df = df.sample(table_params.get("row-limit"))

    if table_params.get("headers-ignore-case"):
        df_cols = [c.lower() for c in df.columns]
        df.columns = df_cols

    if table_params.get("only-test-cols-in-metadata", False):
        keep_cols = [c for c in df.columns if c in meta_col_names]
        df = df[keep_cols]

    return df
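

# A hypothetical invocation showing the expected argument shapes; the file
# path, table params, and metadata below are made up for illustration.
example_metadata = {
    "name": "my_table",
    "file_format": "csv",
    "columns": [
        {"name": "id", "type": "int64"},
        {"name": "created_at", "type": "timestamp(s)"},
    ],
}
example_params = {"expect-header": True, "headers-ignore-case": True}
df = _parse_data_to_pandas("data/my_table.csv", example_params, example_metadata)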