Exemple #1
0
def validate_pandas(conn_cnx, sql, cases, col_count, method='one', data_type='float', epsilon=None, scale=0, timezone=None):
    """Fetches ``sql`` as pandas dataframe(s) and checks the result against ``cases``.

    ``SQL_ENABLE_ARROW`` is executed on the cursor before ``sql``. The total number of fetched rows must
    equal ``len(cases)``. When ``method`` is 'one', the dataframe shape must be ``(len(cases), col_count)``
    and every cell of row ``i`` is compared with ``cases[i]``; batches are only counted, not compared.

    Args:
        conn_cnx: Callable returning a context manager that yields the connection object.
        sql: SQL command for execution.
        cases: Expected values, one per row; the string "NULL" stands for a null cell.
        col_count: Number of columns in dataframe.
        method: If method is 'batch', we fetch dataframes in batch. If method is 'one', we fetch a single dataframe
            containing all data (Default value = 'one').
        data_type: Defines how to compare values: 'float', 'decimal', 'date', 'time' or a name starting with
            'timestamp'; any other value compares the raw case (Default value = 'float').
        epsilon: For comparing double values; when given, values match if their absolute difference is below it.
            Not used for 'time' and 'timestamp*' (Default value = None).
        scale: For comparing time values with scale (Default value = 0).
        timezone: For comparing timestamp ltz (Default value = None).

    Raises:
        AssertionError: If ``col_count`` is 0, the row count or shape is unexpected, or a cell does not match.
    """
    row_count = len(cases)
    assert col_count != 0, '# of columns should be larger than 0'
    with conn_cnx() as cnx_table:
        # fetch dataframe with new arrow support
        cursor_table = cnx_table.cursor()
        try:
            cursor_table.execute(SQL_ENABLE_ARROW)
            cursor_table.execute(sql)

            # build dataframe
            total_rows, total_batches = 0, 0
            start_time = time.time()
            if method == 'one':
                df_new = cursor_table.fetch_pandas_all()
                total_rows = df_new.shape[0]
            else:
                for df_new in cursor_table.fetch_pandas_batches():
                    total_rows += df_new.shape[0]
                    total_batches += 1
            end_time = time.time()
            print('new way (fetching {}) took {}s'.format(method, end_time - start_time))
            if method == 'batch':
                print('new way has # of batches : {}'.format(total_batches))
        finally:
            # release the cursor even when executing or fetching fails
            cursor_table.close()
        assert total_rows == row_count, 'there should be {} rows, but {} rows'.format(row_count, total_rows)

        # verify the correctness
        # only do it when fetch one dataframe
        if method != 'one':
            return

        assert (row_count, col_count) == df_new.shape, 'the shape of old dataframe is {}, ' \
                                                       'the shape of new dataframe is {}, ' \
                                                       'shapes are not equal'.format((row_count, col_count),
                                                                                     df_new.shape)

        for i, case in enumerate(cases):
            for j in range(col_count):
                c_new = df_new.iat[i, j]
                if case == "NULL":
                    assert c_new is None or pd.isnull(c_new), '{} row, {} column: original value is NULL, ' \
                                                              'new value is {}, values are not equal'.format(
                        i, j, c_new)
                    continue

                if data_type == 'time':
                    # compare the textual form, truncated to the requested fractional digits
                    time_str_len = 8 if scale == 0 else 9 + scale
                    c_case = case.strip()[:time_str_len]
                    c_new = str(c_new).strip()[:time_str_len]
                    assert c_case == c_new, '{} row, {} column: original value is {}, ' \
                                            'new value is {}, ' \
                                            'values are not equal'.format(i, j, case, c_new)
                    # move on to the next column (epsilon does not apply to time values)
                    continue

                if data_type.startswith('timestamp'):
                    time_str_len = 19 if scale == 0 else 20 + scale
                    if timezone:
                        c_case = pd.Timestamp(case[:time_str_len], tz=timezone)
                        if data_type == 'timestamptz':
                            # expected value is turned into a naive UTC timestamp
                            c_case = c_case.tz_convert('UTC')
                            c_case = c_case.tz_localize(None)
                    else:
                        c_case = pd.Timestamp(case[:time_str_len])
                    assert c_case == c_new, '{} row, {} column: original value is {}, new value is {}, ' \
                                            'values are not equal'.format(i, j, case, c_new)
                    # move on to the next column (epsilon does not apply to timestamps)
                    continue

                if data_type == 'float':
                    c_case = float(case)
                elif data_type == 'decimal':
                    c_case = Decimal(case)
                elif data_type == 'date':
                    c_case = datetime.strptime(case, '%Y-%m-%d').date()
                else:
                    c_case = case

                if epsilon is None:
                    assert c_case == c_new, '{} row, {} column: original value is {}, new value is {}, ' \
                                            'values are not equal'.format(i, j, case, c_new)
                else:
                    assert abs(c_case - c_new) < epsilon, '{} row, {} column: original value is {}, ' \
                                                          'new value is {}, epsilon is {}, ' \
                                                          'values are not equal'.format(i, j, case, c_new, epsilon)
def validate_pandas(
    cnx_table,
    sql,
    cases,
    col_count,
    method="one",
    data_type="float",
    epsilon=None,
    scale=0,
    timezone=None,
):
    """Fetches ``sql`` as pandas dataframe(s) and checks the result against ``cases``.

    ``SQL_ENABLE_ARROW`` is executed on the cursor before ``sql``. The total number of fetched rows must
    equal ``len(cases)``. When ``method`` is 'one', the dataframe shape must be ``(len(cases), col_count)``
    and every cell of row ``i`` is compared with ``cases[i]``; batches are only counted, not compared.

    Args:
        cnx_table: Connection object.
        sql: SQL command for execution.
        cases: Expected values, one per row; the string "NULL" stands for a null cell.
        col_count: Number of columns in dataframe.
        method: If method is 'batch', we fetch dataframes in batch. If method is 'one', we fetch a single dataframe
            containing all data (Default value = 'one').
        data_type: Defines how to compare values: 'float', 'decimal', 'date', 'time' or a name starting with
            'timestamp'; any other value compares the raw case (Default value = 'float').
        epsilon: For comparing double values; when given, values match if their absolute difference is below it.
            Not used for 'time' and 'timestamp*' (Default value = None).
        scale: For comparing time values with scale (Default value = 0).
        timezone: For comparing timestamp ltz (Default value = None).

    Raises:
        AssertionError: If ``col_count`` is 0, the row count or shape is unexpected, or a cell does not match.
    """

    row_count = len(cases)
    assert col_count != 0, "# of columns should be larger than 0"

    cursor_table = cnx_table.cursor()
    try:
        cursor_table.execute(SQL_ENABLE_ARROW)
        cursor_table.execute(sql)

        # build dataframe
        total_rows, total_batches = 0, 0
        start_time = time.time()

        if method == "one":
            df_new = cursor_table.fetch_pandas_all()
            total_rows = df_new.shape[0]
        else:
            for df_new in cursor_table.fetch_pandas_batches():
                total_rows += df_new.shape[0]
                total_batches += 1
        end_time = time.time()

        print(f"new way (fetching {method}) took {end_time - start_time}s")
        if method == "batch":
            print(f"new way has # of batches : {total_batches}")
    finally:
        # release the cursor even when executing or fetching fails
        cursor_table.close()
    assert (
        total_rows == row_count
    ), f"there should be {row_count} rows, but {total_rows} rows"

    # verify the correctness
    # only do it when fetch one dataframe
    if method != "one":
        return

    assert (row_count, col_count) == df_new.shape, (
        f"the shape of old dataframe is {(row_count, col_count)}, "
        f"the shape of new dataframe is {df_new.shape}, "
        "shapes are not equal"
    )

    for i, case in enumerate(cases):
        for j in range(col_count):
            c_new = df_new.iat[i, j]
            if case == "NULL":
                assert c_new is None or pandas.isnull(c_new), (
                    f"{i} row, {j} column: original value is NULL, "
                    f"new value is {c_new}, values are not equal"
                )
                continue

            if data_type == "time":
                # compare the textual form, truncated to the requested fractional digits
                time_str_len = 8 if scale == 0 else 9 + scale
                c_case = case.strip()[:time_str_len]
                c_new = str(c_new).strip()[:time_str_len]
                assert c_new == c_case, (
                    f"{i} row, {j} column: original value is {case}, "
                    f"new value is {c_new}, "
                    "values are not equal"
                )
                # move on to the next column (epsilon does not apply to time values)
                continue

            if data_type.startswith("timestamp"):
                time_str_len = 19 if scale == 0 else 20 + scale
                if timezone:
                    c_case = pandas.Timestamp(case[:time_str_len], tz=timezone)
                    if data_type == "timestamptz":
                        c_case = c_case.tz_convert("UTC")
                else:
                    c_case = pandas.Timestamp(case[:time_str_len])
                assert c_case == c_new, (
                    f"{i} row, {j} column: original value is {case}, new value is {c_new}, "
                    "values are not equal"
                )
                # move on to the next column (epsilon does not apply to timestamps)
                continue

            if data_type == "float":
                c_case = float(case)
            elif data_type == "decimal":
                c_case = Decimal(case)
            elif data_type == "date":
                c_case = datetime.strptime(case, "%Y-%m-%d").date()
            else:
                c_case = case

            if epsilon is None:
                assert c_case == c_new, (
                    f"{i} row, {j} column: original value is {case}, new value is {c_new}, "
                    "values are not equal"
                )
            else:
                assert abs(c_case - c_new) < epsilon, (
                    f"{i} row, {j} column: original value is {case}, "
                    f"new value is {c_new}, epsilon is {epsilon}, "
                    "values are not equal"
                )