Code example #1
0
def spark_session():
    """Build a local SparkSession for testing, or return None when pyspark is unavailable."""
    pyspark = import_or_none('pyspark.sql')

    if not pyspark:
        return None

    # Fluent builder chain wrapped in parentheses instead of backslash continuations.
    builder = (
        pyspark.SparkSession.builder
        .master("local[*]")
        .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=True")
        .config("spark.sql.shuffle.partitions", "2")
    )
    return builder.getOrCreate()
Code example #2
0
def spark_session():
    """Return a locally-configured SparkSession, or None when pyspark is not installed."""
    pyspark = import_or_none("pyspark.sql")

    if not pyspark:
        return None

    # Configuration applied in insertion order (dicts preserve order in 3.7+),
    # matching the original builder.config(...) chain.
    options = {
        "spark.driver.extraJavaOptions": "-Dio.netty.tryReflectionSetAccessible=True",
        "spark.sql.shuffle.partitions": "2",
        "spark.driver.bindAddress": "127.0.0.1",
    }
    builder = pyspark.SparkSession.builder.master("local[*]")
    for key, value in options.items():
        builder = builder.config(key, value)
    return builder.getOrCreate()
Code example #3
0
def test_import_or_none():
    """A missing module resolves to None; an installed one resolves to the module itself."""
    assert import_or_none("nonexistent") is None
    assert import_or_none("pandas") == pd
Code example #4
0
    _infer_datetime_format,
    _is_latlong_nan,
    _is_nan,
    _is_s3,
    _is_url,
    _is_valid_latlong_series,
    _is_valid_latlong_value,
    _parse_logical_type,
    _reformat_to_latlong,
    camel_to_snake,
    get_valid_mi_types,
    import_or_none,
    import_or_raise,
)

dd = import_or_none("dask.dataframe")
ps = import_or_none("pyspark.pandas")


def test_camel_to_snake():
    """camel_to_snake converts CamelCase names to snake_case."""
    cases = [
        ("PostalCode", "postal_code"),
        ("SubRegionCode", "sub_region_code"),
        ("NaturalLanguage", "natural_language"),
        ("Categorical", "categorical"),
    ]
    for camel, expected in cases:
        assert camel_to_snake(camel) == expected


Code example #5
0
from woodwork.utils import import_or_none

# Dtypes that tests must skip when data is backed by databricks.koalas;
# get_koalas_dtypes below filters dtype lists against this set.
UNSUPPORTED_KOALAS_DTYPES = [
    'int32',
    'intp',
    'uint8',
    'uint16',
    'uint32',
    'uint64',
    'uintp',
    'float_',
    'object',
    'category',
]

# The koalas module when installed, otherwise None; used to gate koalas-only code paths.
ks = import_or_none('databricks.koalas')


def get_koalas_dtypes(dtypes):
    """Return only the dtypes from *dtypes* that koalas-backed tests can exercise."""
    supported = []
    for dtype in dtypes:
        if dtype not in UNSUPPORTED_KOALAS_DTYPES:
            supported.append(dtype)
    return supported


def test_integer_inference(integers):
    dtypes = ['int8', 'int16', 'int32', 'int64', 'intp', 'int', 'Int64']
    if ks and isinstance(integers[0], ks.Series):
        dtypes = get_koalas_dtypes(dtypes)

    for series in integers:
        for dtype in dtypes:
Code example #6
0
File: table_utils.py  Project: VibhuJawa/woodwork
import pandas as pd

from woodwork.utils import import_or_none

dd = import_or_none('dask.dataframe')
ks = import_or_none('databricks.koalas')


def validate_subset_schema(subset_schema, schema):
    """Assert subset_schema is a valid subset of schema.

    Both must share a name, and every column of subset_schema must exist in
    schema with an identical logical type and identical semantic tags.
    """
    assert subset_schema.name == schema.name
    for col_name, subset_col in subset_schema.columns.items():
        assert col_name in schema.columns
        full_col = schema.columns[col_name]
        assert subset_col.logical_type == full_col.logical_type
        assert subset_col.semantic_tags == full_col.semantic_tags


def mi_between_cols(col1, col2, df):
    """Return the mutual information recorded between col1 and col2 in *df*.

    The pair may be stored in either order across the 'column_1'/'column_2'
    columns; the reversed order is tried when the forward lookup is empty.
    """
    forward = (df['column_1'] == col1) & (df['column_2'] == col2)
    matches = df.loc[forward, 'mutual_info']

    if matches.empty:
        reverse = (df['column_1'] == col2) & (df['column_2'] == col1)
        matches = df.loc[reverse, 'mutual_info']

    return matches.iloc[0]


def to_pandas(df, index=None, sort_index=False):
    """Testing util to convert dataframes to pandas. If a pandas dataframe is passed in, just returns the dataframe.
Code example #7
0
File: test_indexers.py  Project: VibhuJawa/woodwork
import pytest

from woodwork.exceptions import WoodworkNotInitError
from woodwork.indexers import _iLocIndexer, _locIndexer
from woodwork.logical_types import (
    Categorical,
    Datetime,
    Double,
    EmailAddress,
    Integer,
    PhoneNumber
)
from woodwork.tests.testing_utils import to_pandas
from woodwork.utils import import_or_none

dd = import_or_none('dask.dataframe')


def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
    """Constructing an _iLocIndexer around Dask objects must raise TypeError."""
    cases = (
        (sample_df_dask, "iloc is not supported for Dask DataFrames"),
        (sample_series_dask, "iloc is not supported for Dask Series"),
    )
    for dask_input, expected_message in cases:
        with pytest.raises(TypeError, match=expected_message):
            _iLocIndexer(dask_input)


def test_iLocIndexer_class(sample_df):
    if dd and isinstance(sample_df, dd.DataFrame):
        pytest.xfail('iloc is not supported with Dask inputs')
    sample_df.ww.init()
    ind = _iLocIndexer(sample_df)
Code example #8
0
File: test_utils.py  Project: kaidisn/woodwork
def test_import_or_none():
    """import_or_none yields the module for installed packages and None for missing ones."""
    assert import_or_none('nonexistent') is None
    assert import_or_none('pandas') == pd
Code example #9
0
from woodwork.utils import import_or_none

# Dtypes that tests must skip when data is backed by pyspark.pandas;
# get_spark_dtypes below filters dtype lists against this set.
UNSUPPORTED_SPARK_DTYPES = [
    "int32",
    "intp",
    "uint8",
    "uint16",
    "uint32",
    "uint64",
    "uintp",
    "float_",
    "object",
    "category",
]

# The pyspark.pandas module when installed, otherwise None.
ps = import_or_none("pyspark.pandas")


def get_spark_dtypes(dtypes):
    """Return only the dtypes from *dtypes* that pyspark-backed tests can exercise."""
    supported = []
    for dtype in dtypes:
        if dtype in UNSUPPORTED_SPARK_DTYPES:
            continue
        supported.append(dtype)
    return supported


def test_integer_inference(integers):
    """Every integer dtype should be inferred as the Integer logical type."""
    dtypes = ["int8", "int16", "int32", "int64", "intp", "int", "Int64"]
    # Spark-backed series cannot represent some dtypes; narrow the list first.
    if _is_spark_series(integers[0]):
        dtypes = get_spark_dtypes(dtypes)

    for series in integers:
        for dtype in dtypes:
            converted = series.astype(dtype)
            assert isinstance(ww.type_system.infer_logical_type(converted), Integer)
Code example #10
0
from woodwork.accessor_utils import _is_dask_dataframe, _is_dask_series
from woodwork.exceptions import WoodworkNotInitError
from woodwork.indexers import _iLocIndexer, _locIndexer
from woodwork.logical_types import (
    Categorical,
    Datetime,
    Double,
    EmailAddress,
    Integer,
    PhoneNumber,
)
from woodwork.tests.testing_utils import to_pandas
from woodwork.utils import import_or_none

dd = import_or_none("dask.dataframe")


def test_iLocIndexer_class_error(sample_df_dask, sample_series_dask):
    """_iLocIndexer must reject Dask-backed inputs with a TypeError."""
    cases = [
        (sample_df_dask, "iloc is not supported for Dask DataFrames"),
        (sample_series_dask, "iloc is not supported for Dask Series"),
    ]
    for dask_input, expected_message in cases:
        with pytest.raises(TypeError, match=expected_message):
            _iLocIndexer(dask_input)


def test_iLocIndexer_class(sample_df):
    if _is_dask_dataframe(sample_df):
        pytest.xfail("iloc is not supported with Dask inputs")