Example #1
def test_to_parquet(session, bucket, redshift_parameters):
    n: int = 1_000_000
    df = pd.DataFrame({
        "id": list(range(n)),
        "name": ["foo" if i % 2 == 0 else "boo" for i in range(n)]
    })
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/test_to_parquet/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
    )
    path = f"s3://{bucket}/test_to_parquet2/"
    paths = Redshift.to_parquet(
        sql="SELECT * FROM public.test",
        path=path,
        iam_role=redshift_parameters.get("RedshiftRole"),
        connection=con,
        partition_cols=["name"])
    assert len(paths) == 4
Example #2
def test_connection_timeout(redshift_parameters):
    with pytest.raises(pg8000.core.InterfaceError):
        Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=12345,
            user="******",
            password=redshift_parameters.get("RedshiftPassword"),
        )
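
Every snippet in this listing pulls its cluster settings from a `redshift_parameters` fixture that is not shown. A minimal sketch of what such a fixture could look like, assuming the test stack publishes CloudFormation outputs named RedshiftAddress, RedshiftPort, RedshiftPassword, and RedshiftRole (the stack name and output keys are assumptions, not taken from the snippets):

import boto3
import pytest


@pytest.fixture(scope="module")
def redshift_parameters():
    # Hypothetical fixture: flatten the CloudFormation outputs of the test stack
    # into a plain dict such as {"RedshiftAddress": "...", "RedshiftPort": "5439", ...}.
    # The stack name below is a placeholder.
    outputs = boto3.client("cloudformation").describe_stacks(
        StackName="aws-data-wrangler-test")["Stacks"][0]["Outputs"]
    return {o["OutputKey"]: o["OutputValue"] for o in outputs}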
Example #3
def test_to_redshift_pandas_cast(session, bucket, redshift_parameters):
    df = pd.DataFrame({
        "id": [1, 2, 3],
        "name": ["name1", "name2", "name3"],
        "foo": [None, None, None],
        "boo": [date(2020, 1, 1), None, None],
        "bar": [datetime(2021, 1, 1), None, None]
    })
    schema = {"id": "BIGINT", "name": "VARCHAR", "foo": "REAL", "boo": "DATE", "bar": "TIMESTAMP"}
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(dataframe=df,
                               path=path,
                               schema="public",
                               table="test",
                               connection=con,
                               iam_role=redshift_parameters.get("RedshiftRole"),
                               mode="overwrite",
                               preserve_index=False,
                               cast_columns=schema)
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    print(rows)
    assert len(df.index) == len(rows)
    assert len(list(df.columns)) == len(list(rows[0]))
Example #4
def test_to_redshift_pandas_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle,
                                       distkey, sortstyle, sortkey, exc):
    dataframe = pd.read_csv(f"data_samples/{sample_name}.csv")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    with pytest.raises(exc):
        assert session.pandas.to_redshift(
            dataframe=dataframe,
            path=path,
            schema="public",
            table="test",
            connection=con,
            iam_role=redshift_parameters.get("RedshiftRole"),
            diststyle=diststyle,
            distkey=distkey,
            sortstyle=sortstyle,
            sortkey=sortkey,
            mode=mode,
            preserve_index=False,
        )
    con.close()
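
The exception tests here and the mode/factor tests in Examples #9, #10, #14, and #16 receive their arguments from pytest parametrize decorators that were stripped from these snippets. A hypothetical decorator for the pandas exceptions test above (the concrete values and the exception class are placeholders; the real suite presumably passes awswrangler-specific exception types):

import pytest

# Hypothetical parametrization (not part of the original snippet): each tuple feeds one
# invalid diststyle/distkey/sortstyle/sortkey combination plus the exception expected
# from to_redshift. ValueError below is only a stand-in for the real exception class.
@pytest.mark.parametrize(
    "sample_name,mode,factor,diststyle,distkey,sortstyle,sortkey,exc",
    [
        ("micro", "overwrite", 1, "FOO", None, None, None, ValueError),        # invalid diststyle
        ("micro", "overwrite", 1, "KEY", "wrong_col", None, None, ValueError),  # invalid distkey
    ],
)
def test_to_redshift_pandas_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor,
                                       diststyle, distkey, sortstyle, sortkey, exc):
    ...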
Example #5
def test_stress_to_redshift_spark_big(session, bucket, redshift_parameters):
    print("Creating DataFrame...")
    dataframe = session.spark_session.createDataFrame(
        pd.DataFrame({
            "A": list(range(10_000)),
            "B": list(range(10_000))
        }))
    dataframe.cache()
    for i in range(10):
        print(f"Run number: {i}")
        con = Redshift.generate_connection(
            database="test",
            host=redshift_parameters.get("RedshiftAddress"),
            port=redshift_parameters.get("RedshiftPort"),
            user="******",
            password=redshift_parameters.get("Password"),
        )
        session.spark.to_redshift(
            dataframe=dataframe,
            path=f"s3://{bucket}/redshift-load-{i}/",
            connection=con,
            schema="public",
            table="test",
            iam_role=redshift_parameters.get("RedshiftRole"),
            mode="overwrite",
            min_num_partitions=16,
        )
        con.close()
        dataframe.unpersist()
Example #6
def test_to_redshift_spark_bool(session, bucket, redshift_parameters):
    dataframe = session.spark_session.createDataFrame(
        pd.DataFrame({
            "A": [1, 2, 3],
            "B": [True, False, True]
        }))
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    session.spark.to_redshift(
        dataframe=dataframe,
        path=f"s3://{bucket}/redshift-load-bool/",
        connection="aws-data-wrangler-redshift",
        schema="public",
        table="test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        min_num_partitions=1,
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert dataframe.count() == len(rows)
    assert len(list(dataframe.columns)) == len(list(rows[0]))
    assert type(rows[0][0]) == int
    assert type(rows[0][1]) == bool
Example #7
def test_to_redshift_spark_big(session, bucket, redshift_parameters):
    dataframe = session.spark_session.createDataFrame(
        pd.DataFrame({
            "A": list(range(100_000)),
            "B": list(range(100_000)),
            "C": list(range(100_000))
        }))
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    session.spark.to_redshift(
        dataframe=dataframe,
        path=f"s3://{bucket}/redshift-load/",
        connection=con,
        schema="public",
        table="test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        min_num_partitions=10,
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert dataframe.count() == len(rows)
    assert len(list(dataframe.columns)) == len(list(rows[0]))
Example #8
def test_to_redshift_spark_exceptions(session, bucket, redshift_parameters, sample_name, mode, factor, diststyle,
                                      distkey, sortstyle, sortkey, exc):
    path = f"data_samples/{sample_name}.csv"
    dataframe = session.spark.read_csv(path=path)
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    with pytest.raises(exc):
        assert session.spark.to_redshift(
            dataframe=dataframe,
            path=f"s3://{bucket}/redshift-load/",
            connection=con,
            schema="public",
            table="test",
            iam_role=redshift_parameters.get("RedshiftRole"),
            diststyle=diststyle,
            distkey=distkey,
            sortstyle=sortstyle,
            sortkey=sortkey,
            mode=mode,
            min_num_partitions=2,
        )
    con.close()
Example #9
def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name,
                            mode, factor):
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    dataframe = pandas.read_csv(f"data_samples/{sample_name}.csv")
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(
        dataframe=dataframe,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode=mode,
        preserve_index=False,
    )
    cursor = con.cursor()
    cursor.execute("SELECT COUNT(*) as counter from public.test")
    counter = cursor.fetchall()[0][0]
    cursor.close()
    con.close()
    assert len(dataframe.index) * factor == counter
Example #10
def test_to_redshift_spark(session, bucket, redshift_parameters, sample_name,
                           mode, factor):
    path = f"data_samples/{sample_name}.csv"
    dataframe = session.spark.read_csv(path=path)
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    session.spark.to_redshift(
        dataframe=dataframe,
        path=f"s3://{bucket}/redshift-load/",
        connection=con,
        schema="public",
        table="test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode=mode,
        min_num_partitions=2,
    )
    cursor = con.cursor()
    cursor.execute("SELECT COUNT(*) as counter from public.test")
    counter = cursor.fetchall()[0][0]
    cursor.close()
    con.close()
    assert dataframe.count() * factor == counter
Example #11
def test_read_sql_redshift_pandas2(session, bucket, redshift_parameters):
    n: int = 1_000_000
    df = pd.DataFrame({
        "id": list(range(n)),
        "val": ["foo" if i % 2 == 0 else "boo" for i in range(n)]
    })
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    path = f"s3://{bucket}/test_read_sql_redshift_pandas2/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
    )
    path2 = f"s3://{bucket}/test_read_sql_redshift_pandas22/"
    df2 = session.pandas.read_sql_redshift(
        sql="select * from public.test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        connection=con,
        temp_s3_path=path2)
    wr.s3.delete_objects(path=f"s3://{bucket}/")
    assert len(df.index) == len(df2.index)
    assert len(df.columns) + 1 == len(df2.columns)
Example #12
def test_connection_with_different_port_types(redshift_parameters):
    conn = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=str(redshift_parameters.get("RedshiftPort")),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    conn.close()
    conn = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=float(redshift_parameters.get("RedshiftPort")),
        user="******",
        password=redshift_parameters.get("RedshiftPassword"),
    )
    conn.close()
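
This test only passes if generate_connection coerces whatever port representation it receives into the integer that pg8000 expects. A hedged sketch of that kind of normalization (this is not the library's actual code, just the behavior the test implies):

def normalize_port(port):
    # Accept an int, float, or numeric string and return a plain int.
    return int(float(port))


assert normalize_port("5439") == 5439
assert normalize_port(5439.0) == 5439
assert normalize_port(5439) == 5439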
Example #13
def test_to_redshift_spark_decimal(session, bucket, redshift_parameters):
    df = session.spark_session.createDataFrame(
        pd.DataFrame(
            {
                "id": [1, 2, 3],
                "decimal_2": [
                    Decimal((0, (1, 9, 9), -2)), None,
                    Decimal((0, (1, 9, 0), -2))
                ],
                "decimal_5": [
                    Decimal((0, (1, 9, 9, 9, 9, 9), -5)), None,
                    Decimal((0, (1, 9, 0, 0, 0, 0), -5))
                ]
            }),
        schema="id INTEGER, decimal_2 DECIMAL(3,2), decimal_5 DECIMAL(6,5)")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    path = f"s3://{bucket}/redshift-load2/"
    session.spark.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test2",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test2")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert df.count() == len(rows)
    assert len(list(df.columns)) == len(list(rows[0]))
    for row in rows:
        if row[0] == 1:
            assert row[1] == Decimal((0, (1, 9, 9), -2))
            assert row[2] == Decimal((0, (1, 9, 9, 9, 9, 9), -5))
        elif row[0] == 2:
            assert row[1] is None
            assert row[2] is None
        elif row[0] == 3:
            assert row[1] == Decimal((0, (1, 9, 0), -2))
            assert row[2] == Decimal((0, (1, 9, 0, 0, 0, 0), -5))
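
The tuple form of Decimal used here and in Example #15 is the standard-library constructor `Decimal((sign, digits, exponent))`: sign 0 means positive, digits is the coefficient, and the exponent shifts the decimal point. For reference:

from decimal import Decimal

# Decimal((0, (1, 9, 9), -2)) is 1.99 -> fits DECIMAL(3,2)
assert Decimal((0, (1, 9, 9), -2)) == Decimal("1.99")
# Decimal((0, (1, 9, 9, 9, 9, 9), -5)) is 1.99999 -> fits DECIMAL(6,5)
assert Decimal((0, (1, 9, 9, 9, 9, 9), -5)) == Decimal("1.99999")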
Example #14
def test_to_redshift_spark(session, bucket, redshift_parameters, sample_name,
                           mode, factor, diststyle, distkey, sortstyle,
                           sortkey):
    path = f"data_samples/{sample_name}.csv"
    if sample_name == "micro":
        schema = "id SMALLINT, name STRING, value FLOAT, date DATE"
        timestamp_format = "yyyy-MM-dd"
    elif sample_name == "small":
        schema = "id BIGINT, name STRING, date DATE"
        timestamp_format = "dd-MM-yy"
    elif sample_name == "nano":
        schema = "id INTEGER, name STRING, value DOUBLE, date DATE, time TIMESTAMP"
        timestamp_format = "yyyy-MM-dd"
    dataframe = session.spark.read_csv(path=path,
                                       schema=schema,
                                       timestampFormat=timestamp_format,
                                       dateFormat=timestamp_format,
                                       header=True)
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    session.spark.to_redshift(
        dataframe=dataframe,
        path=f"s3://{bucket}/redshift-load/",
        connection=con,
        schema="public",
        table="test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        diststyle=diststyle,
        distkey=distkey,
        sortstyle=sortstyle,
        sortkey=sortkey,
        mode=mode,
        min_num_partitions=2,
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert (dataframe.count() * factor) == len(rows)
    assert len(list(dataframe.columns)) == len(list(rows[0]))
Example #15
def test_to_redshift_pandas_decimal(session, bucket, redshift_parameters):
    df = pd.DataFrame({
        "id": [1, 2, 3],
        "decimal_2":
        [Decimal((0, (1, 9, 9), -2)), None,
         Decimal((0, (1, 9, 0), -2))],
        "decimal_5": [
            Decimal((0, (1, 9, 9, 9, 9, 9), -5)), None,
            Decimal((0, (1, 9, 0, 0, 0, 0), -5))
        ],
    })
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("DatabasesPassword"),
    )
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=False,
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert len(df.index) == len(rows)
    assert len(list(df.columns)) == len(list(rows[0]))
    for row in rows:
        if row[0] == 1:
            assert row[1] == Decimal((0, (1, 9, 9), -2))
            assert row[2] == Decimal((0, (1, 9, 9, 9, 9, 9), -5))
        elif row[0] == 2:
            assert row[1] is None
            assert row[2] is None
        elif row[0] == 3:
            assert row[1] == Decimal((0, (1, 9, 0), -2))
            assert row[2] == Decimal((0, (1, 9, 0, 0, 0, 0), -5))
Example #16
def test_to_redshift_pandas(session, bucket, redshift_parameters, sample_name,
                            mode, factor, diststyle, distkey, sortstyle,
                            sortkey):

    if sample_name == "micro":
        dates = ["date"]
    if sample_name == "small":
        dates = ["date"]
    if sample_name == "nano":
        dates = ["date", "time"]
    dataframe = pd.read_csv(f"data_samples/{sample_name}.csv",
                            parse_dates=dates,
                            infer_datetime_format=True)
    dataframe["date"] = dataframe["date"].dt.date
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    path = f"s3://{bucket}/redshift-load/"
    session.pandas.to_redshift(
        dataframe=dataframe,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        diststyle=diststyle,
        distkey=distkey,
        sortstyle=sortstyle,
        sortkey=sortkey,
        mode=mode,
        preserve_index=True,
    )
    cursor = con.cursor()
    cursor.execute("SELECT * from public.test")
    rows = cursor.fetchall()
    cursor.close()
    con.close()
    assert len(dataframe.index) * factor == len(rows)
    assert len(list(dataframe.columns)) + 1 == len(list(rows[0]))
Example #17
def test_read_sql_redshift_pandas(session, bucket, redshift_parameters,
                                  sample_name):
    if sample_name == "micro":
        dates = ["date"]
    elif sample_name == "small":
        dates = ["date"]
    else:
        dates = ["date", "time"]
    df = pd.read_csv(f"data_samples/{sample_name}.csv",
                     parse_dates=dates,
                     infer_datetime_format=True)
    df["date"] = df["date"].dt.date
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )
    path = f"s3://{bucket}/test_read_sql_redshift_pandas/"
    session.pandas.to_redshift(
        dataframe=df,
        path=path,
        schema="public",
        table="test",
        connection=con,
        iam_role=redshift_parameters.get("RedshiftRole"),
        mode="overwrite",
        preserve_index=True,
    )
    path2 = f"s3://{bucket}/test_read_sql_redshift_pandas2/"
    df2 = session.pandas.read_sql_redshift(
        sql="select * from public.test",
        iam_role=redshift_parameters.get("RedshiftRole"),
        connection=con,
        temp_s3_path=path2)
    assert len(df.index) == len(df2.index)
    assert len(df.columns) + 1 == len(df2.columns)
Example #18
def test_to_redshift_pandas_upsert(session, bucket, redshift_parameters):
    wr.s3.delete_objects(path=f"s3://{bucket}/")
    con = Redshift.generate_connection(
        database="test",
        host=redshift_parameters.get("RedshiftAddress"),
        port=redshift_parameters.get("RedshiftPort"),
        user="******",
        password=redshift_parameters.get("Password"),
    )

    df = pd.DataFrame({
        "id": list(range(1_000)),
        "val": ["foo" if i % 2 == 0 else "boo" for i in range(1_000)]
    })

    df3 = pd.DataFrame({
        "id": list(range(1_000, 1_500)),
        "val": ["foo" if i % 2 == 0 else "boo" for i in range(500)]
    })
Example #19
from awswrangler import Session, Redshift
import awswrangler as wr

con = Redshift.generate_connection(
    database="dev",
    host='redshift-cluster-1.cajhj66uu5bu.ap-northeast-1.redshift.amazonaws.com',
    port='5439',
    user="******",
    password='******')

df = wr.pandas.read_sql_redshift(sql="SELECT * FROM test",
                                 iam_role="AWSServiceRoleForRedshift",
                                 connection=con)

#temp_s3_path="s3://809392324773-titanic/titanic")

print(df)
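
The commented-out temp_s3_path argument above mirrors the read_sql_redshift calls in Examples #11 and #17, which stage the query result in S3 before loading it into pandas. If a staging path is required, the same call would look like this (the bucket below is only a placeholder):

# Hedged variant of the call above, staging the unload through a temporary S3 path
# as the earlier examples do; replace the bucket with one you own.
df = wr.pandas.read_sql_redshift(sql="SELECT * FROM test",
                                 iam_role="AWSServiceRoleForRedshift",
                                 connection=con,
                                 temp_s3_path="s3://my-temp-bucket/read-sql-redshift/")
print(df)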