def approxCountDistinct(col, rsd=None):
    """Returns a new :class:`Column` for approximate distinct count of ``col``.

    >>> df.agg(approxCountDistinct(df.age).alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approxCountDistinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approxCountDistinct(_to_java_column(col), rsd)
    return Column(jc)
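# Hedged usage note: the optional ``rsd`` argument (the relative standard
# deviation / estimation error allowed for the approximate count) is forwarded
# as-is to the JVM function, exercising the second branch above. A sketch,
# assuming the same two-row ``df`` fixture the doctests use and an illustrative
# tolerance value; for such a tiny fixture the estimate matches the exact count:
#
#   >>> df.agg(approxCountDistinct(df.age, 0.05).alias('c')).collect()
#   [Row(c=2)]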
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jcols = ListConverter().convert([_to_java_column(c) for c in cols],
                                    sc._gateway._gateway_client)
    jc = sc._jvm.functions.countDistinct(_to_java_column(col),
                                         sc._jvm.PythonUtils.toSeq(jcols))
    return Column(jc)
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
    return Column(jc)
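# Context sketch (hedged): this variant delegates the Python-list-to-JVM-Seq
# conversion that the ListConverter version above spells out inline to a small
# ``_to_seq`` helper. A minimal helper along these lines would behave
# equivalently; the exact signature in the real module may differ.
def _to_seq(sc, cols, converter=None):
    """Convert a list of Columns (or column names) into a JVM Seq of Column."""
    if converter:
        cols = [converter(c) for c in cols]
    return sc._jvm.PythonUtils.toSeq(cols)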
def explode(col):
    """Returns a new row for each element in the given array or map.

    >>> from pyspark.sql import Row
    >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1, 2, 3], mapfield={"a": "b"})])
    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]

    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
    +---+-----+
    |key|value|
    +---+-----+
    |  a|    b|
    +---+-----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.explode(_to_java_column(col))
    return Column(jc)
def __call__(self, *cols):
    # Method of the UDF wrapper class: convert the given columns to a JVM Seq,
    # apply the Java-side UDF held in ``self._judf``, and wrap the result in a
    # Python Column.
    sc = SparkContext._active_spark_context
    jcols = ListConverter().convert([_to_java_column(c) for c in cols],
                                    sc._gateway._gateway_client)
    jc = self._judf.apply(sc._jvm.PythonUtils.toSeq(jcols))
    return Column(jc)
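# Hedged usage sketch: once a Python function has been wrapped as a UDF, the
# ``__call__`` above lets it be applied to columns like any built-in function.
# The name ``slen`` and the ``df`` fixture are illustrative, mirroring the
# module's doctest style:
#
#   >>> from pyspark.sql.types import IntegerType
#   >>> slen = udf(lambda s: len(s), IntegerType())
#   >>> df.select(slen(df.name).alias('slen')).collect()
#   [Row(slen=5), Row(slen=3)]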
def _(col):
    # Generated single-column wrapper: look up the JVM function by ``name``
    # (bound in the enclosing scope) and return its result as a Column.
    sc = SparkContext._active_spark_context
    jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
    return Column(jc)
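# Context sketch (hedged): a wrapper like ``_`` above is typically returned by
# a small factory that binds ``name``, which is then called in a loop so the
# module can expose many single-column functions without repeating
# boilerplate. The factory and mapping below are illustrative assumptions,
# not part of this section.
def _create_function(name, doc=""):
    """Create a wrapper that forwards a single column to ``functions.<name>``."""
    def _(col):
        sc = SparkContext._active_spark_context
        jc = getattr(sc._jvm.functions, name)(_to_java_column(col))
        return Column(jc)
    _.__name__ = name
    _.__doc__ = doc
    return _


_functions = {
    'sqrt': 'Computes the square root of the specified float value.',
    'abs': 'Computes the absolute value.',
}

for _name, _doc in _functions.items():
    globals()[_name] = _create_function(_name, _doc)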