def coalesce(*cols):
    """Build a :class:`Column` yielding, per row, the first non-null value among ``cols``.

    Every entry of ``cols`` is converted with ``_to_java_column`` and the
    resulting sequence is handed to the JVM-side ``functions.coalesce``.

    >>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
    >>> cDf.show()
    +----+----+
    |   a|   b|
    +----+----+
    |null|null|
    |   1|null|
    |null|   2|
    +----+----+

    >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
    +-------------+
    |Coalesce(a,b)|
    +-------------+
    |         null|
    |            1|
    |            2|
    +-------------+

    >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
    +----+----+---------------+
    |   a|   b|Coalesce(a,0.0)|
    +----+----+---------------+
    |null|null|            0.0|
    |   1|null|            1.0|
    |null|   2|            0.0|
    +----+----+---------------+
    """
    ctx = SparkContext._active_spark_context
    # Resolve the JVM function before converting arguments so that a missing
    # active context is reported first.
    jvm_coalesce = ctx._jvm.functions.coalesce
    java_args = _to_seq(ctx, cols, _to_java_column)
    return Column(jvm_coalesce(java_args))
def countDistinct(col, *cols):
    """Build a :class:`Column` holding the distinct count of ``col`` or ``cols``.

    ``col`` is converted on its own with ``_to_java_column``; any further
    ``cols`` are converted the same way and packed into a sequence before
    both are passed to the JVM-side ``functions.countDistinct``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    ctx = SparkContext._active_spark_context
    jvm_count_distinct = ctx._jvm.functions.countDistinct
    # Convert the mandatory column first, then the optional extras.
    leading = _to_java_column(col)
    trailing = _to_seq(ctx, cols, _to_java_column)
    return Column(jvm_count_distinct(leading, trailing))
def struct(*cols):
    """Build a new struct :class:`Column` out of the given columns.

    :param cols: list of column names (string) or list of :class:`Column`
        expressions that are named or aliased. The columns may be given as
        separate positional arguments or as one ``list``/``set`` argument.

    >>> df.select(struct('age', 'name').alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    >>> df.select(struct([df.age, df.name]).alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    """
    ctx = SparkContext._active_spark_context
    # A lone list/set argument is taken to be the collection of columns itself.
    passed_collection = len(cols) == 1 and isinstance(cols[0], (list, set))
    members = cols[0] if passed_collection else cols
    jvm_struct = ctx._jvm.functions.struct
    return Column(jvm_struct(_to_seq(ctx, members, _to_java_column)))
def array(*cols):
    """Build a new array :class:`Column` out of the given columns.

    :param cols: list of column names (string) or list of :class:`Column`
        expressions that have the same data type. The columns may be given
        as separate positional arguments or as one ``list``/``set`` argument.

    >>> df.select(array('age', 'age').alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    >>> df.select(array([df.age, df.age]).alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    """
    ctx = SparkContext._active_spark_context
    # A lone list/set argument is taken to be the collection of columns itself.
    passed_collection = len(cols) == 1 and isinstance(cols[0], (list, set))
    elements = cols[0] if passed_collection else cols
    jvm_array = ctx._jvm.functions.array
    return Column(jvm_array(_to_seq(ctx, elements, _to_java_column)))
def __call__(self, *cols):
    """Apply ``self._judf`` to ``cols`` and wrap the result in a :class:`Column`.

    Each entry of ``cols`` is converted with ``_to_java_column`` and the
    resulting sequence is passed to ``self._judf.apply``.
    NOTE(review): ``_judf`` is presumably the JVM-side handle of this
    user-defined function -- confirm against the enclosing class.
    """
    ctx = SparkContext._active_spark_context
    apply_judf = self._judf.apply
    java_args = _to_seq(ctx, cols, _to_java_column)
    return Column(apply_judf(java_args))