Example 1
def read_parquet(path, columns=None):
    """Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> ks.read_parquet('data.parquet', columns=['name', 'gender'])  # doctest: +SKIP
    """
    if columns is not None:
        columns = list(columns)
    if columns is None or len(columns) > 0:
        sdf = default_session().read.parquet(path)
        if columns is not None:
            fields = [field.name for field in sdf.schema]
            cols = [col for col in columns if col in fields]
            if len(cols) > 0:
                sdf = sdf.select(cols)
            else:
                sdf = default_session().createDataFrame([],
                                                        schema=StructType())
    else:
        sdf = default_session().createDataFrame([], schema=StructType())
    return DataFrame(sdf)
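A minimal usage sketch of the helper above (the parquet path and column names are hypothetical): only the requested columns that exist in the file's schema are selected, and an empty column list skips the read entirely and yields an empty DataFrame.

import databricks.koalas as ks

# Keep only the listed columns; names not present in the parquet schema are ignored.
kdf = ks.read_parquet("/tmp/users.parquet", columns=["name", "gender"])

# An empty column list short-circuits: no file is read and an empty DataFrame is returned.
empty = ks.read_parquet("/tmp/users.parquet", columns=[])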
Example 2
    def attach_distributed_sequence_column(sdf, column_name):
        """
        This method attaches a Spark column that has a sequence in a distributed manner.
        This is equivalent to the column assigned when the default index type is 'distributed-sequence'.

        >>> sdf = ks.DataFrame(['a', 'b', 'c']).to_spark()
        >>> sdf = InternalFrame.attach_distributed_sequence_column(sdf, column_name="sequence")
        >>> sdf.show()  # doctest: +NORMALIZE_WHITESPACE
        +--------+---+
        |sequence|  0|
        +--------+---+
        |       0|  a|
        |       1|  b|
        |       2|  c|
        +--------+---+
        """
        if len(sdf.columns) > 0:
            try:
                jdf = sdf._jdf.toDF()

                sql_ctx = sdf.sql_ctx
                encoders = sql_ctx._jvm.org.apache.spark.sql.Encoders
                encoder = encoders.tuple(jdf.exprEnc(), encoders.scalaLong())

                jrdd = jdf.localCheckpoint(False).rdd().zipWithIndex()

                df = spark.DataFrame(
                    sql_ctx.sparkSession._jsparkSession.createDataset(
                        jrdd, encoder).toDF(), sql_ctx)
                columns = df.columns
                return df.selectExpr(
                    "`{}` as `{}`".format(columns[1], column_name),
                    "`{}`.*".format(columns[0]))
            except py4j.protocol.Py4JError:
                if is_testing():
                    raise
                return InternalFrame._attach_distributed_sequence_column(
                    sdf, column_name)
        else:
            cnt = sdf.count()
            if cnt > 0:
                return default_session().range(cnt).toDF(column_name)
            else:
                return default_session().createDataFrame(
                    [],
                    schema=StructType().add(column_name,
                                            data_type=LongType(),
                                            nullable=False))
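The JVM bridge in the try block zips each row with a distributed, consecutive index via `zipWithIndex`. The same idea can be sketched with plain PySpark (a simplified illustration, not the Koalas fallback itself; the column name is invented):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sdf = spark.createDataFrame([("a",), ("b",), ("c",)], ["value"])

# zipWithIndex pairs every row with a 0-based, globally consecutive index
# (computed per partition plus a partition offset, which costs one extra Spark job).
indexed = sdf.rdd.zipWithIndex().map(lambda pair: (pair[1],) + tuple(pair[0]))
spark.createDataFrame(indexed, ["sequence"] + sdf.columns).show()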
Example 3
def read_table(name: str) -> DataFrame:
    """
    Read a Spark table and return a DataFrame.

    Parameters
    ----------
    name : string
        Table name in Spark.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.to_table
    read_delta
    read_parquet
    read_spark_io

    Examples
    --------
    >>> ks.range(1).to_table('%s.my_table' % db)
    >>> ks.read_table('%s.my_table' % db)
       id
    0   0
    """
    sdf = default_session().read.table(name)
    return DataFrame(sdf)
Example 4
def get_option(key: str, default: Union[Any, _NoValueType] = _NoValue) -> Any:
    """
    Retrieves the value of the specified option.

    Parameters
    ----------
    key : str
        The key which should match a single option.
    default : object
        The default value if the option is not set yet. The value should be JSON serializable.

    Returns
    -------
    result : the value of the option

    Raises
    ------
    OptionError : if no such option exists and the default is not provided
    """
    _check_option(key)
    if default is _NoValue:
        default = _options_dict[key].default
    _options_dict[key].validate(default)

    return json.loads(default_session().conf.get(_key_format(key),
                                                 default=json.dumps(default)))
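Spark's conf only stores strings, which is why the value is round-tripped through JSON. A standalone sketch of that mechanism, using a "koalas." key prefix purely for illustration (the real prefixing is done by `_key_format`):

import json

_conf = {}  # stand-in for SparkSession.conf, which can only hold strings

def set_opt(key, value):
    _conf["koalas." + key] = json.dumps(value)   # 1000 -> "1000", True -> "true"

def get_opt(key, default=None):
    return json.loads(_conf.get("koalas." + key, json.dumps(default)))

set_opt("compute.max_rows", 1000)
assert get_opt("compute.max_rows") == 1000            # the int comes back, not the string "1000"
assert get_opt("display.max_rows", default=25) == 25  # unset keys fall back to the default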
Example 5
 def test_value_counts(self):
     if LooseVersion(pyspark.__version__) < LooseVersion("2.4") and \
             default_session().conf.get("spark.sql.execution.arrow.enabled") == "true":
         default_session().conf.set("spark.sql.execution.arrow.enabled",
                                    "false")
         try:
             self._test_value_counts()
         finally:
             default_session().conf.set("spark.sql.execution.arrow.enabled",
                                        "true")
         self.assertRaises(
             RuntimeError,
             lambda: ks.MultiIndex.from_tuples([('x', 'a'),
                                                ('x', 'b')]).value_counts())
     else:
         self._test_value_counts()
Example 6
def reset_option(key: str) -> None:
    """
    Reset one option to its default value.

    Pass "all" as argument to reset all options.

    Parameters
    ----------
    key : str
        If specified, only this option will be reset.

    Returns
    -------
    None
    """
    _check_option(key)
    default_session().conf.unset(_key_format(key))
Example 7
def set_option(key: str, value: Any) -> None:
    """
    Sets the value of the specified option.

    Parameters
    ----------
    key : str
        The key which should match a single option.
    value : object
        New value of option. The value should be JSON serializable.

    Returns
    -------
    None
    """
    _check_option(key, value)
    default_session().conf.set(_key_format(key), json.dumps(value))
Example 8
def set_option(key: str, value: str) -> None:
    """
    Sets the value of the specified option.

    Parameters
    ----------
    key : str
        The key which should match a single option.
    value : object
        New value of option.

    Returns
    -------
    None
    """
    _check_option_key(key)
    default_session().conf.set(_key_format(key), value)
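The two `set_option` variants differ only in how the value is stored: Example 8 writes it verbatim (the conf coerces it to a string), while Example 7 JSON-encodes it so `get_option` can recover the original Python type. A tiny sketch of the difference:

import json

value = True

raw = str(value)             # Example 8 style: a later conf.get() returns the string "True"
encoded = json.dumps(value)  # Example 7 style: stored as "true"

assert json.loads(encoded) is True   # the boolean survives the round trip
assert raw != value                  # the raw string no longer compares equal to the boolean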
Example 9
 def value_counts(self, normalize=False, sort=True, ascending=False, bins=None, dropna=True):
     if LooseVersion(pyspark.__version__) < LooseVersion("2.4") and \
             default_session().conf.get("spark.sql.execution.arrow.enabled") == "true" and \
             isinstance(self, MultiIndex):
         raise RuntimeError("if you're using pyspark < 2.4, set conf "
                            "'spark.sql.execution.arrow.enabled' to 'false' "
                            "for using this function with MultiIndex")
     return super(MultiIndex, self).value_counts(
         normalize=normalize, sort=sort, ascending=ascending, bins=bins, dropna=dropna)
Example 10
    def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame':
        """ Create an immutable DataFrame from pandas DataFrame.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        columns = pdf.columns
        data_columns = [name_like_string(col) for col in columns]
        if isinstance(columns, pd.MultiIndex):
            column_index = columns.tolist()
        else:
            column_index = None
        column_index_names = columns.names

        index = pdf.index

        index_map = []  # type: List[IndexMap]
        if isinstance(index, pd.MultiIndex):
            if index.names is None:
                index_map = [(SPARK_INDEX_NAME_FORMAT(i), None)
                             for i in range(len(index.levels))]
            else:
                index_map = [
                    (SPARK_INDEX_NAME_FORMAT(i)
                     if name is None else name_like_string(name),
                     name if name is None or isinstance(name, tuple) else
                     (name, )) for i, name in enumerate(index.names)
                ]
        else:
            name = index.name
            index_map = [(name_like_string(name)
                          if name is not None else SPARK_INDEX_NAME_FORMAT(0),
                          name if name is None or isinstance(name, tuple) else
                          (name, ))]

        index_columns = [index_column for index_column, _ in index_map]

        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns
        schema = StructType([
            StructField(name_like_string(name),
                        infer_pd_series_spark_type(col),
                        nullable=bool(col.isnull().any()))
            for name, col in reset_index.iteritems()
        ])
        for name, col in reset_index.iteritems():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})
        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return _InternalFrame(
            sdf=sdf,
            index_map=index_map,
            column_index=column_index,
            column_scols=[scol_for(sdf, col) for col in data_columns],
            column_index_names=column_index_names)
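The `col.replace({np.nan: None})` pass exists because a float `NaN` inside a non-float column can break Spark's schema inference or end up as a bogus value; converting it to `None` gives a proper SQL null. A small standalone illustration (column name invented):

import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
pdf = pd.DataFrame({"name": ["a", np.nan]})   # object column containing a float NaN

# Map NaN to None so Spark stores a null instead of a float inside a string column.
cleaned = pdf.assign(name=pdf["name"].replace({np.nan: None}))
spark.createDataFrame(cleaned).show()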
Example 11
def read_parquet(path, columns=None) -> DataFrame:
    """Load a parquet object from the file path, returning a DataFrame.

    Parameters
    ----------
    path : string
        File path
    columns : list, default=None
        If not None, only these columns will be read from the file.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.to_parquet
    DataFrame.read_table
    DataFrame.read_delta
    DataFrame.read_spark_io

    Examples
    --------
    >>> ks.range(1).to_parquet('%s/read_spark_io/data.parquet' % path)
    >>> ks.read_parquet('%s/read_spark_io/data.parquet' % path, columns=['id'])
       id
    0   0
    """
    if columns is not None:
        columns = list(columns)
    if columns is None or len(columns) > 0:
        sdf = default_session().read.parquet(path)
        if columns is not None:
            fields = [field.name for field in sdf.schema]
            cols = [col for col in columns if col in fields]
            if len(cols) > 0:
                sdf = sdf.select(cols)
            else:
                sdf = default_session().createDataFrame([],
                                                        schema=StructType())
    else:
        sdf = default_session().createDataFrame([], schema=StructType())
    return DataFrame(sdf)
Example 12
def range(start: int,
          end: Optional[int] = None,
          step: int = 1,
          num_partitions: Optional[int] = None) -> DataFrame:
    """
    Create a DataFrame with some range of numbers.

    The resulting DataFrame has a single int64 column named `id`, containing elements in a range
    from ``start`` to ``end`` (exclusive) with step value ``step``. If only the first parameter
    (i.e. start) is specified, we treat it as the end value with the start value being 0.

    This is similar to the range function in SparkSession and is used primarily for testing.

    Parameters
    ----------
    start : int
        the start value (inclusive)
    end : int, optional
        the end value (exclusive)
    step : int, optional, default 1
        the incremental step
    num_partitions : int, optional
        the number of partitions of the DataFrame

    Returns
    -------
    DataFrame

    Examples
    --------
    When the first parameter is specified, we generate a range of values up to that number.

    >>> ks.range(5)
       id
    0   0
    1   1
    2   2
    3   3
    4   4

    When start, end, and step are specified:

    >>> ks.range(start=100, end=200, step=20)
        id
    0  100
    1  120
    2  140
    3  160
    4  180
    """
    sdf = default_session().range(start=start,
                                  end=end,
                                  step=step,
                                  numPartitions=num_partitions)
    return DataFrame(sdf)
Example 13
def read_parquet(path, columns=None):
    """Load a parquet object from the file path, returning a DataFrame.

    :param path: File path
    :param columns: If not None, only these columns will be read from the file.
    :return: :class:`DataFrame`
    """
    if columns is not None:
        columns = list(columns)
    if columns is None or len(columns) > 0:
        sdf = default_session().read.parquet(path)
        if columns is not None:
            fields = [field.name for field in sdf.schema]
            cols = [col for col in columns if col in fields]
            if len(cols) > 0:
                sdf = sdf.select(cols)
            else:
                sdf = default_session().createDataFrame([], schema=StructType())
    else:
        sdf = default_session().createDataFrame([], schema=StructType())
    return DataFrame(sdf)
Example 14
    def from_pandas(pdf: pd.DataFrame) -> "InternalFrame":
        """ Create an immutable DataFrame from pandas DataFrame.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        columns = pdf.columns
        data_columns = [name_like_string(col) for col in columns]
        if isinstance(columns, pd.MultiIndex):
            column_labels = columns.tolist()
        else:
            column_labels = [(col, ) for col in columns]
        column_label_names = [
            name if name is None or isinstance(name, tuple) else (name, )
            for name in columns.names
        ]

        index_names = [
            name if name is None or isinstance(name, tuple) else (name, )
            for name in pdf.index.names
        ]
        index_columns = [
            SPARK_INDEX_NAME_FORMAT(i) for i in range(len(index_names))
        ]

        pdf = pdf.copy()
        pdf.index.names = index_columns
        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns
        schema = StructType([
            StructField(
                name,
                infer_pd_series_spark_type(col),
                nullable=bool(col.isnull().any()),
            ) for name, col in reset_index.iteritems()
        ])
        for name, col in reset_index.iteritems():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})
        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return InternalFrame(
            spark_frame=sdf,
            index_spark_columns=[scol_for(sdf, col) for col in index_columns],
            index_names=index_names,
            column_labels=column_labels,
            data_spark_columns=[scol_for(sdf, col) for col in data_columns],
            column_label_names=column_label_names,
        )
Example 15
 def _init_from_pandas(self, pdf, *args):
     metadata = Metadata.from_pandas(pdf)
     reset_index = pdf.reset_index()
     reset_index.columns = metadata.all_fields
     schema = StructType([StructField(name, infer_pd_series_spark_type(col),
                                      nullable=bool(col.isnull().any()))
                          for name, col in reset_index.iteritems()])
     for name, col in reset_index.iteritems():
         dt = col.dtype
         if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
             continue
         reset_index[name] = col.replace({np.nan: None})
     self._init_from_spark(default_session().createDataFrame(reset_index, schema=schema),
                           metadata)
Example 16
def sql(query: str) -> DataFrame:
    """
    Execute a SQL query and return the result as a Koalas DataFrame.

    Parameters
    ----------
    query : str
        the SQL query

    Examples
    --------
    >>> ks.sql("select * from range(10) where id > 7")
       id
    0   8
    1   9
    """
    return DataFrame(default_session().sql(query))
Example 17
def read_spark_io(path: Optional[str] = None,
                  format: Optional[str] = None,
                  schema: Union[str, 'StructType'] = None,
                  **options) -> DataFrame:
    """Load a DataFrame from a Spark data source.

    Parameters
    ----------
    path : string, optional
        Path to the data source.
    format : string, optional
        Specifies the output data source format. Some common ones are:

        - 'delta'
        - 'parquet'
        - 'orc'
        - 'json'
        - 'csv'
    schema : string or StructType, optional
        Input schema. If none, Spark tries to infer the schema automatically.
        The schema can either be a Spark StructType, or a DDL-formatted string like
        `col0 INT, col1 DOUBLE`.
    options : dict
        All other options passed directly into Spark's data source.

    See Also
    --------
    DataFrame.to_spark_io
    DataFrame.read_table
    DataFrame.read_delta
    DataFrame.read_parquet

    Examples
    --------
    >>> ks.range(1).to_spark_io('%s/read_spark_io/data.parquet' % path)
    >>> ks.read_spark_io(
    ...     '%s/read_spark_io/data.parquet' % path, format='parquet', schema='id long')
       id
    0   0
    """
    sdf = default_session().read.load(path=path,
                                      format=format,
                                      schema=schema,
                                      **options)
    return DataFrame(sdf)
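A hedged usage sketch for the function above (the path, format, and schema below are illustrative); as the docstring notes, the schema may be given as a DDL-formatted string instead of a StructType.

import databricks.koalas as ks

kdf = ks.read_spark_io("/tmp/events.json", format="json",
                       schema="id LONG, name STRING")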
Example 18
    def from_pandas(pdf: pd.DataFrame) -> '_InternalFrame':
        """ Create an immutable DataFrame from pandas DataFrame.

        :param pdf: :class:`pd.DataFrame`
        :return: the created immutable DataFrame
        """
        data_columns = [str(col) for col in pdf.columns]

        index = pdf.index

        index_map = []  # type: List[IndexMap]
        if isinstance(index, pd.MultiIndex):
            if index.names is None:
                index_map = [('__index_level_{}__'.format(i), None)
                             for i in range(len(index.levels))]
            else:
                index_map = [
                    ('__index_level_{}__'.format(i) if name is None else name,
                     name) for i, name in enumerate(index.names)
                ]
        else:
            index_map = [
                (index.name if index.name is not None else '__index_level_0__',
                 index.name)
            ]

        index_columns = [index_column for index_column, _ in index_map]

        reset_index = pdf.reset_index()
        reset_index.columns = index_columns + data_columns
        schema = StructType([
            StructField(name,
                        infer_pd_series_spark_type(col),
                        nullable=bool(col.isnull().any()))
            for name, col in reset_index.iteritems()
        ])
        for name, col in reset_index.iteritems():
            dt = col.dtype
            if is_datetime64_dtype(dt) or is_datetime64tz_dtype(dt):
                continue
            reset_index[name] = col.replace({np.nan: None})
        sdf = default_session().createDataFrame(reset_index, schema=schema)
        return _InternalFrame(sdf=sdf,
                              index_map=index_map,
                              data_columns=data_columns)
Example 19
def get_option(key: str, default: Union[str, _NoValueType] = _NoValue) -> str:
    """
    Retrieves the value of the specified option.

    Parameters
    ----------
    key : str
        The key which should match a single option.

    default : str
        The default value if the option is not set yet.

    Returns
    -------
    result : the value of the option

    Raises
    ------
    OptionError : if no such option exists and the default is not provided
    """
    _check_option_key(key)
    if default is _NoValue:
        default = _registered_options[key]
    return default_session().conf.get(_key_format(key), default=default)
Example 20
 def _model_udf(self):
     spark = default_session()
     return pyfunc.spark_udf(spark,
                             model_uri=self._model_uri,
                             result_type=self._return_type)
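`pyfunc.spark_udf` is MLflow's helper that wraps a logged model as a Spark UDF. A hedged usage sketch (the model URI and column names are placeholders):

import mlflow.pyfunc
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# "runs:/<run_id>/model" is a placeholder URI; result_type should match the model's output.
predict = mlflow.pyfunc.spark_udf(spark, model_uri="runs:/<run_id>/model",
                                  result_type="double")

df = spark.createDataFrame([(1.0, 2.0)], ["x1", "x2"])
df.withColumn("prediction", predict("x1", "x2")).show()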
Example 21
def read_csv(path,
             header='infer',
             names=None,
             usecols=None,
             mangle_dupe_cols=True,
             parse_dates=False,
             comment=None):
    """Read CSV (comma-separated) file into DataFrame.

    Parameters
    ----------
    path : str
        The path string storing the CSV file to be read.
    header : int, list of int, default 'infer'
        Row number(s) to use as the column names, and the start of the data.
        Default behavior is to infer the column names: if no names are passed
        the behavior is identical to `header=0` and column names are inferred from
        the first line of the file; if column names are passed explicitly then
        the behavior is identical to `header=None`. Explicitly pass `header=0` to be
        able to replace existing names.
    names : array-like, optional
        List of column names to use. If file contains no header row, then you should
        explicitly pass `header=None`. Duplicates in this list will cause an error to be issued.
    usecols : list-like or callable, optional
        Return a subset of the columns. If list-like, all elements must either be
        positional (i.e. integer indices into the document columns) or strings that
        correspond to column names provided either by the user in names or inferred
        from the document header row(s).
        If callable, the callable function will be evaluated against the column names,
        returning names where the callable function evaluates to `True`.
    mangle_dupe_cols : bool, default True
        Duplicate columns will be specified as 'X0', 'X1', ... 'XN', rather
        than 'X' ... 'X'. Passing in False will cause data to be overwritten if
        there are duplicate names in the columns.
        Currently only `True` is allowed.
    parse_dates : boolean or list of ints or names or list of lists or dict, default `False`.
        Currently only `False` is allowed.
    comment : str, optional
        A single character indicating that the line should not be parsed;
        lines starting with this character are skipped.

    Returns
    -------
    DataFrame

    See Also
    --------
    DataFrame.to_csv : Write DataFrame to a comma-separated values (csv) file.

    Examples
    --------
    >>> ks.read_csv('data.csv')  # doctest: +SKIP
    """
    if mangle_dupe_cols is not True:
        raise ValueError("mangle_dupe_cols can only be `True`: %s" %
                         mangle_dupe_cols)
    if parse_dates is not False:
        raise ValueError("parse_dates can only be `False`: %s" % parse_dates)

    if usecols is not None and not callable(usecols):
        usecols = list(usecols)
    if usecols is None or callable(usecols) or len(usecols) > 0:
        reader = default_session().read.option("inferSchema", "true")

        if header == 'infer':
            header = 0 if names is None else None
        if header == 0:
            reader.option("header", True)
        elif header is None:
            reader.option("header", False)
        else:
            raise ValueError("Unknown header argument {}".format(header))

        if comment is not None:
            if not isinstance(comment, str) or len(comment) != 1:
                raise ValueError("Only length-1 comment characters supported")
            reader.option("comment", comment)

        sdf = reader.csv(path)

        if header is None:
            sdf = sdf.selectExpr(*[
                "`%s` as `%s`" % (field.name, i)
                for i, field in enumerate(sdf.schema)
            ])
        if names is not None:
            names = list(names)
            if len(set(names)) != len(names):
                raise ValueError('Found non-unique column index')
            if len(names) != len(sdf.schema):
                raise ValueError(
                    'Names do not match the number of columns: %d' %
                    len(names))
            sdf = sdf.selectExpr(*[
                "`%s` as `%s`" % (field.name, name)
                for field, name in zip(sdf.schema, names)
            ])

        if usecols is not None:
            if callable(usecols):
                cols = [
                    field.name for field in sdf.schema if usecols(field.name)
                ]
                missing = []
            elif all(isinstance(col, int) for col in usecols):
                cols = [
                    field.name for i, field in enumerate(sdf.schema)
                    if i in usecols
                ]
                missing = [
                    col for col in usecols if col >= len(sdf.schema)
                    or sdf.schema[col].name not in cols
                ]
            elif all(isinstance(col, str) for col in usecols):
                cols = [
                    field.name for field in sdf.schema if field.name in usecols
                ]
                missing = [col for col in usecols if col not in cols]
            else:
                raise ValueError(
                    "'usecols' must either be list-like of all strings, "
                    "all unicode, all integers or a callable.")
            if len(missing) > 0:
                raise ValueError(
                    'Usecols do not match columns, columns expected but not '
                    'found: %s' % missing)

            if len(cols) > 0:
                sdf = sdf.select(cols)
            else:
                sdf = default_session().createDataFrame([],
                                                        schema=StructType())
    else:
        sdf = default_session().createDataFrame([], schema=StructType())
    return DataFrame(sdf)
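A usage sketch for the reader above (file paths and column names are hypothetical), exercising the `names` and `usecols` branches:

import databricks.koalas as ks

# The header row supplies the column names; only the listed columns are kept.
kdf = ks.read_csv("/tmp/people.csv", usecols=["name", "age"])

# No header row: pass names explicitly; usecols may also be positional indices or a callable.
kdf2 = ks.read_csv("/tmp/people_no_header.csv", header=None,
                   names=["name", "age", "city"],
                   usecols=lambda col: col != "city")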
Example 22
File: utils.py Project: wwwK/koalas
 def setUpClass(cls):
     cls.spark = default_session()
     cls.spark.conf.set(SPARK_CONF_ARROW_ENABLED, True)
Example 23
 def setUpClass(cls):
     cls.spark = default_session()
     cls.spark.conf.set("spark.sql.execution.arrow.enabled", True)
Example 24
# Read the Docs builds multiple times. To speed up, we don't delete the generated rst
# files to reuse in Read the Docs build
if "READTHEDOCS" not in os.environ:
    # Remove previously generated rst files. Ignore errors just in case it stops
    # generating whole docs.
    shutil.rmtree("%s/reference/api" %
                  os.path.dirname(os.path.abspath(__file__)),
                  ignore_errors=True)
    try:
        os.mkdir("%s/reference/api" %
                 os.path.dirname(os.path.abspath(__file__)))
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

# Lower the number of partitions to speed up documentation build
utils.default_session({"spark.sql.shuffle.partitions": "4"})


def gendoc():
    """Get releases from Github and generate reStructuredText files for release notes."""
    source_dir = os.path.dirname(os.path.abspath(__file__))
    whatsnew_dir = "%s/whatsnew" % source_dir

    # Read the Docs builds multiple times. To speed up, we don't delete the generated rst
    # files to reuse in Read the Docs build
    if "READTHEDOCS" in os.environ and os.path.isdir(whatsnew_dir):
        return

    dev_dir = "%s/../../dev" % os.path.dirname(os.path.abspath(__file__))
    spec = importlib.util.spec_from_file_location("gendoc",
                                                  "%s/gendoc.py" % dev_dir)
Example 25
 def _init_from_pandas(self, pdf, *args):
     metadata = Metadata.from_pandas(pdf)
     reset_index = pdf.reset_index()
     reset_index.columns = metadata.all_fields
     self._init_from_spark(default_session().createDataFrame(reset_index),
                           metadata)
Example 26
import logging
from distutils.version import LooseVersion

import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
from pyspark import __version__

from databricks import koalas
from databricks.koalas import utils

# Initialize Spark session that should be used in doctests or unittests.
# Delta requires Spark 2.4.2+. See
# https://github.com/delta-io/delta#compatibility-with-apache-spark-versions.
if LooseVersion(__version__) >= LooseVersion("3.0.0"):
    session = utils.default_session(
        {"spark.jars.packages": "io.delta:delta-core_2.12:0.1.0"})
elif LooseVersion(__version__) >= LooseVersion("2.4.2"):
    session = utils.default_session(
        {"spark.jars.packages": "io.delta:delta-core_2.11:0.1.0"})
else:
    session = utils.default_session()


@pytest.fixture(autouse=True)
def add_ks(doctest_namespace):
    doctest_namespace['ks'] = koalas


@pytest.fixture(autouse=True)
def add_pd(doctest_namespace):
    if os.getenv("PANDAS_VERSION", None) is not None:
Example 27
import pandas as pd
import pyarrow as pa
import matplotlib.pyplot as plt
from pyspark import __version__

from databricks import koalas as ks
from databricks.koalas import utils

shared_conf = {"spark.sql.shuffle.partitions": "4"}
# Initialize Spark session that should be used in doctests or unittests.
# Delta requires Spark 2.4.2+. See
# https://github.com/delta-io/delta#compatibility-with-apache-spark-versions.
if LooseVersion(__version__) >= LooseVersion("3.0.0"):
    shared_conf["spark.jars.packages"] = "io.delta:delta-core_2.12:0.7.0"
    session = utils.default_session(shared_conf)
elif LooseVersion(__version__) >= LooseVersion("2.4.2"):
    shared_conf["spark.jars.packages"] = "io.delta:delta-core_2.11:0.6.1"
    session = utils.default_session(shared_conf)
else:
    session = utils.default_session(shared_conf)

if os.getenv("DEFAULT_INDEX_TYPE", "") != "":
    ks.options.compute.default_index_type = os.getenv("DEFAULT_INDEX_TYPE")


@pytest.fixture(scope="session", autouse=True)
def session_termination():
    yield
     # Share one session across all the tests. Repeatedly starting and stopping sessions and
     # contexts seems to cause a memory leak in PySpark for an unknown reason.
Example 28
def sql(query: str, globals=None, locals=None, **kwargs) -> DataFrame:
    """
    Execute a SQL query and return the result as a Koalas DataFrame.

    This function also supports embedding Python variables (locals, globals, and parameters)
    in the SQL statement by wrapping them in curly braces. See examples section for details.

    In addition to the locals, globals and parameters, the function will also attempt
    to determine if the program currently runs in an IPython (or Jupyter) environment
    and to import the variables from this environment. The variables have the same
    precedence as globals.

    The following variable types are supported:

    - string
    - int
    - float
    - list, tuple, range of above types
    - Koalas DataFrame
    - Koalas Series
    - pandas DataFrame

    Parameters
    ----------
    query : str
        the SQL query
    globals : dict, optional
        the dictionary of global variables, if explicitly set by the user
    locals : dict, optional
        the dictionary of local variables, if explicitly set by the user
    kwargs
        other variables that the user may want to set manually that can be referenced in the query

    Returns
    -------
    Koalas DataFrame

    Examples
    --------

    Calling a built-in SQL function.

    >>> ks.sql("select * from range(10) where id > 7")
       id
    0   8
    1   9

    A query can also reference a local variable or parameter by wrapping them in curly braces:

    >>> bound1 = 7
    >>> ks.sql("select * from range(10) where id > {bound1} and id < {bound2}", bound2=9)
       id
    0   8

    You can also wrap a DataFrame with curly braces to query it directly. Note that when you do
    that, the indexes, if any, automatically become top level columns.

    >>> mydf = ks.range(10)
    >>> x = range(4)
    >>> ks.sql("SELECT * from {mydf} WHERE id IN {x}")
       id
    0   0
    1   1
    2   2
    3   3

    Queries can also be arbitrarily nested in functions:

    >>> def statement():
    ...     mydf2 = ks.DataFrame({"x": range(2)})
    ...     return ks.sql("SELECT * from {mydf2}")
    >>> statement()
       x
    0  0
    1  1

    Mixing Koalas and pandas DataFrames in a join operation. Note that the index is dropped.

    >>> ks.sql('''
    ...   SELECT m1.a, m2.b
    ...   FROM {table1} m1 INNER JOIN {table2} m2
    ...   ON m1.key = m2.key
    ...   ORDER BY m1.a, m2.b''',
    ...   table1=ks.DataFrame({"a": [1,2], "key": ["a", "b"]}),
    ...   table2=pd.DataFrame({"b": [3,4,5], "key": ["a", "b", "b"]}))
       a  b
    0  1  3
    1  2  4
    2  2  5

    Also, it is possible to query using Series.

    >>> myser = ks.Series({'a': [1.0, 2.0, 3.0], 'b': [15.0, 30.0, 45.0]})
    >>> ks.sql("SELECT * from {myser}")
                        0
    0     [1.0, 2.0, 3.0]
    1  [15.0, 30.0, 45.0]
    """
    if globals is None:
        globals = _get_ipython_scope()
    _globals = builtin_globals() if globals is None else dict(globals)
    _locals = builtin_locals() if locals is None else dict(locals)
    # The default choice is the globals
    _dict = dict(_globals)
    # The vars:
    _scope = _get_local_scope()
    _dict.update(_scope)
    # Then the locals
    _dict.update(_locals)
    # Highest order of precedence is the locals
    _dict.update(kwargs)
    return SQLProcessor(_dict, query, default_session()).execute()
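The `{name}` placeholders are resolved by `SQLProcessor`; conceptually, each referenced DataFrame can be registered as a temporary view and the placeholder replaced by the view name before handing the query to Spark. A simplified standalone sketch of that idea (not the Koalas implementation, and limited to Spark DataFrames):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

def simple_sql(query, **tables):
    """A tiny stand-in for the {placeholder} mechanism, handling DataFrames only."""
    for name, df in tables.items():
        view = "tmp_" + name
        df.createOrReplaceTempView(view)             # expose the DataFrame to Spark SQL
        query = query.replace("{%s}" % name, view)   # swap the placeholder for the view name
    return spark.sql(query)

mydf = spark.range(10)
simple_sql("SELECT * FROM {mydf} WHERE id > 7", mydf=mydf).show()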