def test_get_index_map(self):
    psdf = ps.DataFrame({"year": [2015, 2016], "month": [2, 3], "day": [4, 5]})
    sdf = psdf.to_spark()
    self.assertEqual(_get_index_map(sdf), (None, None))

    def check(actual, expected):
        actual_scols, actual_labels = actual
        expected_column_names, expected_labels = expected
        self.assertEqual(len(actual_scols), len(expected_column_names))
        for actual_scol, expected_column_name in zip(actual_scols, expected_column_names):
            expected_scol = sdf[expected_column_name]
            self.assertTrue(spark_column_equals(actual_scol, expected_scol))
        self.assertEqual(actual_labels, expected_labels)

    check(_get_index_map(sdf, "year"), (["year"], [("year",)]))
    check(
        _get_index_map(sdf, ["year", "month"]),
        (["year", "month"], [("year",), ("month",)]),
    )

    self.assertRaises(KeyError, lambda: _get_index_map(sdf, ["year", "hour"]))
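# A minimal sketch of what `_get_index_map` is expected to do, inferred only
# from the assertions in the test above; the real helper in pyspark.pandas
# may differ in details. The name `_get_index_map_sketch` is illustrative.
from typing import List, Optional, Tuple, Union

from pyspark.sql import Column
from pyspark.sql import DataFrame as SparkDataFrame


def _get_index_map_sketch(
    sdf: SparkDataFrame, index_col: Optional[Union[str, List[str]]] = None
) -> Tuple[Optional[List[Column]], Optional[List[Tuple[str]]]]:
    if index_col is None:
        # No index columns requested: the result keeps the default index.
        return None, None
    if isinstance(index_col, str):
        index_col = [index_col]
    for col in index_col:
        if col not in sdf.columns:
            # Matches the KeyError asserted for the unknown column "hour".
            raise KeyError(col)
    # One Spark column plus one single-element label tuple per index column.
    return [sdf[col] for col in index_col], [(col,) for col in index_col]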
def execute(self, index_col: Optional[Union[str, List[str]]]) -> DataFrame:
    """
    Returns a DataFrame for which the SQL statement has been executed by
    the underlying SQL engine.

    >>> from pyspark.pandas import sql_processor
    >>> # we will call 'sql_processor' directly in doctests so decrease one level.
    >>> sql_processor._CAPTURE_SCOPES = 2
    >>> sql = sql_processor.sql

    >>> str0 = 'abc'
    >>> sql("select {str0}")
       abc
    0  abc

    >>> str1 = 'abc"abc'
    >>> str2 = "abc'abc"
    >>> sql("select {str0}, {str1}, {str2}")
       abc  abc"abc  abc'abc
    0  abc  abc"abc  abc'abc

    >>> strs = ['a', 'b']
    >>> sql("select 'a' in {strs} as cond1, 'c' in {strs} as cond2")
       cond1  cond2
    0   True  False
    """
    blocks = _string.formatter_parser(self._statement)
    # TODO: use a string builder
    res = ""
    try:
        for (pre, inner, _, _) in blocks:
            var_next = "" if inner is None else self._convert(inner)
            res = res + pre + var_next
        self._normalized_statement = res

        sdf = self._session.sql(self._normalized_statement)
    finally:
        for v in self._temp_views:
            self._session.catalog.dropTempView(v)

    index_spark_columns, index_names = _get_index_map(sdf, index_col)

    return DataFrame(
        InternalFrame(
            spark_frame=sdf,
            index_spark_columns=index_spark_columns,
            index_names=index_names,
        )
    )
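# To make the parsing loop in `execute` concrete: `_string.formatter_parser`
# (the CPython internal behind `string.Formatter`) splits the statement into
# 4-tuples of (literal_text, field_name, format_spec, conversion). The
# statement below is illustrative only.
import _string

for block in _string.formatter_parser("select {str0}, {strs}"):
    print(block)
# ('select ', 'str0', '', None)
# (', ', 'strs', '', None)
#
# `execute` concatenates each literal with `self._convert(field_name)`, which
# resolves the captured variable (the temp views dropped in `finally` are
# presumably registered there for frame-typed values), so cleanup happens
# even if the query fails.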
def sql(
    query: str,
    index_col: Optional[Union[str, List[str]]] = None,
    **kwargs: Any,
) -> DataFrame:
    """
    Execute a SQL query and return the result as a pandas-on-Spark DataFrame.

    This function acts as a standard Python string formatter that understands
    the following variable types:

        * pandas-on-Spark DataFrame
        * pandas-on-Spark Series
        * pandas DataFrame
        * pandas Series
        * string

    Parameters
    ----------
    query : str
        the SQL query
    index_col : str or list of str, optional
        Column names to be used in Spark to represent pandas-on-Spark's index. The index name
        in pandas-on-Spark is ignored. By default, the index is always lost.

        .. note:: If you want to preserve the index, explicitly use :func:`DataFrame.reset_index`,
            and pass it to the sql statement with `index_col` parameter.

            For example,

            >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=['a', 'b', 'c'])
            >>> new_psdf = psdf.reset_index()
            >>> ps.sql("SELECT * FROM {new_psdf}", index_col="index", new_psdf=new_psdf)
            ... # doctest: +NORMALIZE_WHITESPACE
                   A  B
            index
            a      1  4
            b      2  5
            c      3  6

            For MultiIndex,

            >>> psdf = ps.DataFrame(
            ...     {"A": [1, 2, 3], "B": [4, 5, 6]},
            ...     index=pd.MultiIndex.from_tuples(
            ...         [("a", "b"), ("c", "d"), ("e", "f")], names=["index1", "index2"]
            ...     ),
            ... )
            >>> new_psdf = psdf.reset_index()
            >>> ps.sql(
            ...     "SELECT * FROM {new_psdf}", index_col=["index1", "index2"], new_psdf=new_psdf)
            ... # doctest: +NORMALIZE_WHITESPACE
                           A  B
            index1 index2
            a      b       1  4
            c      d       2  5
            e      f       3  6

            Also note that the index name(s) should be matched to the existing name.
    kwargs
        other variables that the user wants to set that can be referenced in the query

    Returns
    -------
    pandas-on-Spark DataFrame

    Examples
    --------

    Calling a built-in SQL function.

    >>> ps.sql("SELECT * FROM range(10) where id > 7")
       id
    0   8
    1   9

    >>> ps.sql("SELECT * FROM range(10) WHERE id > {bound1} AND id < {bound2}", bound1=7, bound2=9)
       id
    0   8

    >>> mydf = ps.range(10)
    >>> x = tuple(range(4))
    >>> ps.sql("SELECT {ser} FROM {mydf} WHERE id IN {x}", ser=mydf.id, mydf=mydf, x=x)
       id
    0   0
    1   1
    2   2
    3   3

    Mixing pandas-on-Spark and pandas DataFrames in a join operation. Note that the index is
    dropped.

    >>> ps.sql('''
    ...     SELECT m1.a, m2.b
    ...     FROM {table1} m1 INNER JOIN {table2} m2
    ...     ON m1.key = m2.key
    ...     ORDER BY m1.a, m2.b''',
    ...     table1=ps.DataFrame({"a": [1, 2], "key": ["a", "b"]}),
    ...     table2=pd.DataFrame({"b": [3, 4, 5], "key": ["a", "b", "b"]}))
       a  b
    0  1  3
    1  2  4
    2  2  5

    Also, it is possible to query using Series.

    >>> psdf = ps.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=['a', 'b', 'c'])
    >>> ps.sql("SELECT {mydf.A} FROM {mydf}", mydf=psdf)
       A
    0  1
    1  2
    2  3
    """
    if os.environ.get("PYSPARK_PANDAS_SQL_LEGACY") == "1":
        from pyspark.pandas import sql_processor

        warnings.warn(
            "Deprecated in 3.3.0, and the legacy behavior "
            "will be removed in the future releases.",
            FutureWarning,
        )
        return sql_processor.sql(query, index_col=index_col, **kwargs)

    session = default_session()
    formatter = PandasSQLStringFormatter(session)
    try:
        sdf = session.sql(formatter.format(query, **kwargs))
    finally:
        formatter.clear()

    index_spark_columns, index_names = _get_index_map(sdf, index_col)

    return DataFrame(
        InternalFrame(
            spark_frame=sdf,
            index_spark_columns=index_spark_columns,
            index_names=index_names,
        )
    )
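# A sketch of the formatter pattern `sql` relies on, assuming
# `PandasSQLStringFormatter` works like a `string.Formatter` that swaps
# frame-typed arguments for temporary views. The class below is a
# hypothetical stand-in; the real one in pyspark.pandas.sql_formatter also
# handles Series, attribute access like {mydf.A}, and plain strings.
import string
from typing import Any, List

import pandas as pd
import pyspark.pandas as ps
from pyspark.sql import SparkSession


class SQLViewFormatter(string.Formatter):
    """Illustrative stand-in for PandasSQLStringFormatter (hypothetical)."""

    def __init__(self, session: SparkSession) -> None:
        self._session = session
        self._temp_views: List[str] = []

    def format_field(self, value: Any, format_spec: str) -> str:
        if isinstance(value, (ps.DataFrame, pd.DataFrame)):
            sdf = (
                value.to_spark()
                if isinstance(value, ps.DataFrame)
                else self._session.createDataFrame(value)
            )
            # Splice a temp-view name into the SQL text instead of the frame.
            name = "_ps_sql_view_{}".format(len(self._temp_views))
            sdf.createOrReplaceTempView(name)
            self._temp_views.append(name)
            return name
        return super().format_field(value, format_spec)

    def clear(self) -> None:
        # Called from the `finally` block in `sql`, so the temporary views
        # never outlive the query, even when execution fails.
        for name in self._temp_views:
            self._session.catalog.dropTempView(name)
        self._temp_views = []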