コード例 #1
0
def test_auto_mapper_datetime_regex_replace_format(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "1/13/1995"),
            (2, "1/3/1995"),
            (3, "11/3/1995"),
        ],
        ["member_id", "opening_date"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")
    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).columns(formatted_date=A.datetime(
                            value=A.regex_replace(A.column("opening_date"),
                                                  pattern=r"\b(\d)(?=/)",
                                                  replacement="0$1"),
                            formats=["M/dd/yyyy"],
                        ).to_date_format("yyyy-M-dd"))
    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    result_df: DataFrame = mapper.transform(df=source_df)

    assert (result_df.where("member_id == 1").select(
        "formatted_date").collect()[0][0] == "1995-1-13")
    assert (result_df.where("member_id == 2").select(
        "formatted_date").collect()[0][0] == "1995-1-03")
    assert (result_df.where("member_id == 3").select(
        "formatted_date").collect()[0][0] == "1995-11-03")
コード例 #2
0
def test_auto_mapper_date_format(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01 12:30"),
            (2, "Vidal", "Michael", "1970-02-02 06:30"),
        ],
        ["member_id", "last_name", "first_name", "opening_time"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    source_df = source_df.withColumn(
        "opening_time", to_timestamp("opening_time",
                                     format="yyyy-MM-dd hh:mm"))

    assert dict(source_df.dtypes)["opening_time"] == "timestamp"

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        keys=["member_id"]).columns(openingTime=A.datetime(
            A.column("opening_time")).to_date_format("hh:mm:ss"))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["openingTime"],
        date_format(coalesce(to_timestamp(col("b.opening_time"))),
                    "hh:mm:ss").alias("openingTime"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("openingTime").collect()
            [0][0] == "12:30:00")
    assert (result_df.where("member_id == 2").select("openingTime").collect()
            [0][0] == "06:30:00")

    # check type
    assert dict(result_df.dtypes)["openingTime"] == "string"