Example 1
    def test_get_index_map(self):
        psdf = ps.DataFrame({
            "year": [2015, 2016],
            "month": [2, 3],
            "day": [4, 5]
        })
        sdf = psdf.to_spark()
        # With no index column requested, both the Spark columns and the
        # index labels come back as None.
        self.assertEqual(_get_index_map(sdf), (None, None))

        def check(actual, expected):
            # Compare the (Spark columns, index labels) pair returned by
            # _get_index_map against the expected column names and labels.
            actual_scols, actual_labels = actual
            expected_column_names, expected_labels = expected
            self.assertEqual(len(actual_scols), len(expected_column_names))
            for actual_scol, expected_column_name in zip(
                    actual_scols, expected_column_names):
                expected_scol = sdf[expected_column_name]
                self.assertTrue(spark_column_equals(actual_scol,
                                                    expected_scol))
            self.assertEqual(actual_labels, expected_labels)

        check(_get_index_map(sdf, "year"), (["year"], [("year", )]))
        check(_get_index_map(sdf, ["year", "month"]),
              (["year", "month"], [("year", ), ("month", )]))

        # Requesting a column that does not exist raises KeyError.
        self.assertRaises(KeyError,
                          lambda: _get_index_map(sdf, ["year", "hour"]))
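
The contract those assertions pin down can be summarized in a short sketch. This is not PySpark's implementation (the real _get_index_map is a private helper, and the signature below is assumed); it only mirrors the behavior the test exercises:

def _get_index_map_sketch(sdf, index_col=None):
    # No index requested: both Spark columns and labels are None.
    if index_col is None:
        return None, None
    if isinstance(index_col, str):
        index_col = [index_col]
    # Unknown column names raise KeyError, as the final assertion expects.
    missing = [col for col in index_col if col not in sdf.columns]
    if missing:
        raise KeyError(missing)
    # One Spark column per name, and one single-element label tuple each.
    return [sdf[col] for col in index_col], [(col,) for col in index_col]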
Example 2
    def execute(self, index_col: Optional[Union[str, List[str]]]) -> DataFrame:
        """
        Returns a DataFrame that is the result of executing the SQL statement
        with the underlying SQL engine.

        >>> from pyspark.pandas import sql_processor
        >>> # we call 'sql_processor' directly in doctests, so decrease the capture scope by one level.
        >>> sql_processor._CAPTURE_SCOPES = 2
        >>> sql = sql_processor.sql
        >>> str0 = 'abc'
        >>> sql("select {str0}")
           abc
        0  abc

        >>> str1 = 'abc"abc'
        >>> str2 = "abc'abc"
        >>> sql("select {str0}, {str1}, {str2}")
           abc  abc"abc  abc'abc
        0  abc  abc"abc  abc'abc

        >>> strs = ['a', 'b']
        >>> sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2")
           cond1  cond2
        0   True  False
        """
        # Split the statement into (literal_text, field_name, format_spec,
        # conversion) blocks, exactly as str.format parsing would.
        blocks = _string.formatter_parser(self._statement)
        # TODO: use a string builder
        res = ""
        try:
            for (pre, inner, _, _) in blocks:
                # Literal text passes through unchanged; each {field} is
                # replaced by its converted value.
                var_next = "" if inner is None else self._convert(inner)
                res = res + pre + var_next
            self._normalized_statement = res

            sdf = self._session.sql(self._normalized_statement)
        finally:
            # Drop any temporary views registered while converting fields.
            for v in self._temp_views:
                self._session.catalog.dropTempView(v)

        index_spark_columns, index_names = _get_index_map(sdf, index_col)

        return DataFrame(
            InternalFrame(spark_frame=sdf,
                          index_spark_columns=index_spark_columns,
                          index_names=index_names))
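
_string.formatter_parser is the CPython helper behind str.format parsing; string.Formatter().parse exposes the same tuples, which makes it easy to see what the loop above consumes:

from string import Formatter

blocks = list(Formatter().parse("select {str0}, {str1}"))
# Each block is a (literal_text, field_name, format_spec, conversion) tuple:
# [('select ', 'str0', '', None), (', ', 'str1', '', None)]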
Example 3
def sql(
    query: str,
    index_col: Optional[Union[str, List[str]]] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Execute a SQL query and return the result as a pandas-on-Spark DataFrame.

    This function acts as a standard Python string formatter that understands
    the following variable types:

        * pandas-on-Spark DataFrame
        * pandas-on-Spark Series
        * pandas DataFrame
        * pandas Series
        * string

    Parameters
    ----------
    query : str
        the SQL query
    index_col : str or list of str, optional
        Column names to be used in Spark to represent pandas-on-Spark's index. The index name
        in pandas-on-Spark is ignored. By default, the index is always lost.

        .. note:: If you want to preserve the index, explicitly use :func:`DataFrame.reset_index`,
            and pass it to the SQL statement with the `index_col` parameter.

            For example,

            >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B":[4, 5, 6]}, index=['a', 'b', 'c'])
            >>> new_psdf = psdf.reset_index()
            >>> ps.sql("SELECT * FROM {new_psdf}", index_col="index", new_psdf=new_psdf)
            ... # doctest: +NORMALIZE_WHITESPACE
                   A  B
            index
            a      1  4
            b      2  5
            c      3  6

            For MultiIndex,

            >>> psdf = ps.DataFrame(
            ...     {"A": [1, 2, 3], "B": [4, 5, 6]},
            ...     index=pd.MultiIndex.from_tuples(
            ...         [("a", "b"), ("c", "d"), ("e", "f")], names=["index1", "index2"]
            ...     ),
            ... )
            >>> new_psdf = psdf.reset_index()
            >>> ps.sql(
            ...     "SELECT * FROM {new_psdf}", index_col=["index1", "index2"], new_psdf=new_psdf)
            ... # doctest: +NORMALIZE_WHITESPACE
                           A  B
            index1 index2
            a      b       1  4
            c      d       2  5
            e      f       3  6

            Also note that the index name(s) should match the existing name(s).
    kwargs
        Other variables that the user wants to set, which can be referenced in the query.

    Returns
    -------
    pandas-on-Spark DataFrame

    Examples
    --------

    Calling a built-in SQL function.

    >>> ps.sql("SELECT * FROM range(10) where id > 7")
       id
    0   8
    1   9

    >>> ps.sql("SELECT * FROM range(10) WHERE id > {bound1} AND id < {bound2}", bound1=7, bound2=9)
       id
    0   8

    >>> mydf = ps.range(10)
    >>> x = tuple(range(4))
    >>> ps.sql("SELECT {ser} FROM {mydf} WHERE id IN {x}", ser=mydf.id, mydf=mydf, x=x)
       id
    0   0
    1   1
    2   2
    3   3

    Mixing pandas-on-Spark and pandas DataFrames in a join operation. Note that the index is
    dropped.

    >>> ps.sql('''
    ...   SELECT m1.a, m2.b
    ...   FROM {table1} m1 INNER JOIN {table2} m2
    ...   ON m1.key = m2.key
    ...   ORDER BY m1.a, m2.b''',
    ...   table1=ps.DataFrame({"a": [1,2], "key": ["a", "b"]}),
    ...   table2=pd.DataFrame({"b": [3,4,5], "key": ["a", "b", "b"]}))
       a  b
    0  1  3
    1  2  4
    2  2  5

    Also, it is possible to query using Series.

    >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B":[4, 5, 6]}, index=['a', 'b', 'c'])
    >>> ps.sql("SELECT {mydf.A} FROM {mydf}", mydf=psdf)
       A
    0  1
    1  2
    2  3
    """
    if os.environ.get("PYSPARK_PANDAS_SQL_LEGACY") == "1":
        from pyspark.pandas import sql_processor

        warnings.warn(
            "Deprecated in 3.3.0, and the legacy behavior "
            "will be removed in the future releases.",
            FutureWarning,
        )
        return sql_processor.sql(query, index_col=index_col, **kwargs)

    session = default_session()
    formatter = PandasSQLStringFormatter(session)
    try:
        sdf = session.sql(formatter.format(query, **kwargs))
    finally:
        formatter.clear()

    index_spark_columns, index_names = _get_index_map(sdf, index_col)

    return DataFrame(
        InternalFrame(spark_frame=sdf,
                      index_spark_columns=index_spark_columns,
                      index_names=index_names))
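
For illustration, here is a minimal sketch of the formatter-based substitution that the non-legacy path relies on. It is not PySpark's PandasSQLStringFormatter, and ViewRegisteringFormatter is a hypothetical name: the idea is that DataFrame-typed keyword arguments are registered as temporary views, the generated view names are spliced into the query, and clear() drops the views afterwards.

from string import Formatter

class ViewRegisteringFormatter(Formatter):
    def __init__(self, session):
        super().__init__()
        self._session = session
        self._temp_views = []

    def get_field(self, field_name, args, kwargs):
        obj, first = super().get_field(field_name, args, kwargs)
        if hasattr(obj, "to_spark"):  # assume a pandas-on-Spark DataFrame
            name = "_ps_sql_temp_view_%d" % len(self._temp_views)
            obj.to_spark().createOrReplaceTempView(name)
            self._temp_views.append(name)
            return name, first
        return obj, first

    def clear(self):
        # Drop every temporary view registered during format().
        for name in self._temp_views:
            self._session.catalog.dropTempView(name)
        self._temp_views = []

Used like the body of sql() above: call formatter.format(query, **kwargs) inside a try block and formatter.clear() in the finally block, so the views are dropped even if the query fails.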