Example #1
def coalesce(*cols):
    """Returns the first column that is not null.

    >>> cDf = sqlContext.createDataFrame([(None, None), (1, None), (None, 2)], ("a", "b"))
    >>> cDf.show()
    +----+----+
    |   a|   b|
    +----+----+
    |null|null|
    |   1|null|
    |null|   2|
    +----+----+

    >>> cDf.select(coalesce(cDf["a"], cDf["b"])).show()
    +-------------+
    |coalesce(a,b)|
    +-------------+
    |         null|
    |            1|
    |            2|
    +-------------+

    >>> cDf.select('*', coalesce(cDf["a"], lit(0.0))).show()
    +----+----+---------------+
    |   a|   b|coalesce(a,0.0)|
    +----+----+---------------+
    |null|null|            0.0|
    |   1|null|            1.0|
    |null|   2|            0.0|
    +----+----+---------------+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.coalesce(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #2
    def metrics(*metrics):
        """
        Given a list of metrics, provides a builder that in turn computes metrics from a column.

        See the documentation of [[Summarizer]] for an example.

        The following metrics are accepted (case sensitive):
         - mean: a vector that contains the coefficient-wise mean.
         - variance: a vector that contains the coefficient-wise variance.
         - count: the count of all vectors seen.
         - numNonzeros: a vector with the number of non-zeros for each coefficient.
         - max: the maximum for each coefficient.
         - min: the minimum for each coefficient.
         - normL2: the Euclidean norm for each coefficient.
         - normL1: the L1 norm of each coefficient (sum of the absolute values).

        :param metrics:
         the metrics to compute.
        :return:
         an object of :py:class:`pyspark.ml.stat.SummaryBuilder`

        Note: Currently, the performance of this interface is about 2x~3x slower than using the RDD
        interface.
        """
        sc = SparkContext._active_spark_context
        js = JavaWrapper._new_java_obj("org.apache.spark.ml.stat.Summarizer.metrics",
                                       _to_seq(sc, metrics))
        return SummaryBuilder(js)
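
For context, a minimal usage sketch of the builder returned above, using the public pyspark.ml API (the active `spark` session and the DataFrame are illustrative assumptions):

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.stat import Summarizer

    # Build a summarizer for two metrics and apply it to a vector column.
    df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),),
                                (Vectors.dense([3.0, 4.0]),)], ["features"])
    summarizer = Summarizer.metrics("mean", "count")
    df.select(summarizer.summary(df.features)).show(truncate=False)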
Example #3
    def agg(self, *exprs):
        """Compute aggregates and returns the result as a :class:`DataFrame`.

        The available aggregate functions are `avg`, `max`, `min`, `sum`, `count`.

        If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
        is the column to perform aggregation on, and the value is the aggregate function.

        Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.

        :param exprs: a dict mapping from column name (string) to aggregate functions (string),
            or a list of :class:`Column`.

        >>> gdf = df.groupBy(df.name)
        >>> sorted(gdf.agg({"*": "count"}).collect())
        [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]

        >>> from pyspark.sql import functions as F
        >>> sorted(gdf.agg(F.min(df.age)).collect())
        [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]
        """
        assert exprs, "exprs should not be empty"
        if len(exprs) == 1 and isinstance(exprs[0], dict):
            jdf = self._jgd.agg(exprs[0])
        else:
            # Columns
            assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
            jdf = self._jgd.agg(exprs[0]._jc,
                                _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
        return DataFrame(jdf, self.sql_ctx)
Example #4
    def bucketBy(self, numBuckets, col, *cols):
        """Buckets the output by the given columns.If specified,
        the output is laid out on the file system similar to Hive's bucketing scheme.

        :param numBuckets: the number of buckets to save
        :param col: a name of a column, or a list of names.
        :param cols: additional names (optional). If `col` is a list it should be empty.

        .. note:: Applicable for file-based data sources in combination with
                  :py:meth:`DataFrameWriter.saveAsTable`.

        >>> (df.write.format('parquet')  # doctest: +SKIP
        ...     .bucketBy(100, 'year', 'month')
        ...     .mode("overwrite")
        ...     .saveAsTable('bucketed_table'))
        """
        if not isinstance(numBuckets, int):
            raise TypeError("numBuckets should be an int, got {0}.".format(type(numBuckets)))

        if isinstance(col, (list, tuple)):
            if cols:
                raise ValueError("col is a {0} but cols are not empty".format(type(col)))

            col, cols = col[0], col[1:]

        if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
            raise TypeError("all names should be `str`")

        self._jwrite = self._jwrite.bucketBy(numBuckets, col, _to_seq(self._spark._sc, cols))
        return self
Example #5
    def parquet(self, *paths):
        """Loads a Parquet file, returning the result as a :class:`DataFrame`.

        >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, paths)))
Example #6
    def parquet(self, *paths):
        """Loads a Parquet file, returning the result as a :class:`DataFrame`.

        >>> df = sqlContext.read.parquet('python/test_support/sql/parquet_partitioned')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        return self._df(
            self._jreader.parquet(_to_seq(self._sqlContext._sc, paths)))
Example #7
def serialize_cnn_features_udf(sc, arr):
    """
        Serialize CNN features
    :param sc: SparkContext
    :param arr: CNN features DataFrame
    :return: DataFrame
    """
    _serialize_array = sc._jvm.vista.udf.VistaUDFs.serializeCNNFeaturesArrUDF()
    return Column(_serialize_array.apply(_to_seq(sc, [arr], _to_java_column)))
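
A hypothetical call site for the wrapper above; `features_df` and the column name `features` are illustrative assumptions, not part of the original code:

    # Hypothetical usage: serialize the CNN feature column of an assumed DataFrame.
    features_ser_df = features_df.withColumn(
        "features_ser",
        serialize_cnn_features_udf(spark.sparkContext, features_df["features"]))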
Example #8
def do_add_java(col):

    # This works
    spark.sparkContext._jvm.com.example.sparkoperations.SparkUDF.hello("Juha")

    # This does not work
    _add_one = spark.sparkContext._jvm.com.example.sparkoperations.SparkUDF(
    ).call
    return Column(_add_one(_to_seq(spark.sparkContext, [col],
                                   _to_java_column)))
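
A possible fix, sketched under the assumption that the Scala side is changed to expose the function as a UserDefinedFunction via a (hypothetical) factory method such as `addOneUDF()`, mirroring the `GeneralUDFs` pattern in later examples:

    def do_add_java_fixed(col):
        # Sketch only: addOneUDF() is an assumed Scala-side factory returning a
        # UserDefinedFunction; it is not part of the snippet above.
        sc = spark.sparkContext
        _add_one = sc._jvm.com.example.sparkoperations.SparkUDF.addOneUDF()
        return Column(_add_one.apply(_to_seq(sc, [col], _to_java_column)))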
Example #9
    def generate_uuid(self):
        """ Generate V4 UUID.

        Returns:
            Spark Column (StringType): containing v4 UUIDs.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _generate_uuid = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.generateUUID_UDF()
        return Column(_generate_uuid.apply(_to_seq(sc, [], _to_java_column)))
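
A hypothetical call site; `helper` stands for an instance of the class defining `generate_uuid`, and the column name is illustrative:

    # Hypothetical usage: add a v4 UUID column to an existing DataFrame.
    df_with_ids = df.withColumn("row_uuid", helper.generate_uuid())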
Example #10
def image_to_byte_arr_udf(sc, image_buffer):
    """
        Transforms the JPEG encoded images to raw format returns a DataFrame of byte[]
    :param sc: SparkContext
    :param image_buffer: Images in JPEG format
    :return: DataFrame
    """
    _image_to_byte_arr = sc._jvm.vista.udf.VistaUDFs.imageToByteArrayUDF()
    return Column(
        _image_to_byte_arr.apply(_to_seq(sc, [image_buffer], _to_java_column)))
Example #11
def concat(*cols):
    """
    Concatenates multiple input string columns together into a single string column.

    >>> df = sqlContext.createDataFrame([('abcd','123')], ['s', 'd'])
    >>> df.select(concat(df.s, df.d).alias('s')).collect()
    [Row(s=u'abcd123')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.concat(_to_seq(sc, cols, _to_java_column)))
Example #12
def at_least_n_distinct(col, limit):
    """Count distinct that works with windows

    The standard distinct count in Spark SQL can't be applied in
    a window. This implementation allows that to work.
    """
    sc = SparkContext._active_spark_context
    j_cols = _to_seq(sc, [_to_java_column(col), _to_java_column(F.lit(limit))])
    jc = sc._jvm.org.wikimedia.search.mjolnir.AtLeastNDistinct().apply(j_cols)
    return Column(jc)
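
A sketch of using the wrapper inside a window, which is exactly what a plain distinct count cannot do; the column names `query` and `page` are assumptions:

    from pyspark.sql import Window
    from pyspark.sql import functions as F

    # Flag rows whose partition contains at least two distinct pages.
    w = Window.partitionBy("query")
    df2 = df.withColumn("has_two_pages",
                        at_least_n_distinct(F.col("page"), 2).over(w))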
Example #13
def hlike(col, regexps):
    """Hyperscan regex like. Returns true if col matches one of regexps
    :param col: Column
    :param regexps: list of patterns to match
    :return: boolean column with match result
    """
    sc = SparkContext._active_spark_context
    patterns = sc._jvm.functions.array(_to_seq(sc, [
        _create_column_from_literal(x) for x in regexps
    ]))
    return Column(sc._jvm.ru.napalabs.spark.hscan.functions.hlike(_to_java_column(col), patterns))
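
An illustrative filter using the wrapper; the column name and patterns are examples only:

    # Keep rows whose 'url' column matches any of the hyperscan patterns.
    matched = df.filter(hlike(df["url"], [r"^https?://", r"\.pdf$"]))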
Example #14
    def select_freqitems(self, cols, support=None):
        if isinstance(cols, tuple):
            cols = list(cols)
        if not isinstance(cols, list):
            raise ValueError(
                "cols must be a list or tuple of column names as strings.")
        if not support:
            support = 0.01
        return DataFrame(
            self._jdf.stat().freqItems(_to_seq(self._sc, cols), support),
            self.sql_ctx)
Example #15
    def partitionBy(self, *cols):
        """
        Partitions the output by the given columns on the file system.
        If specified, the output is laid out on the file system similar
        to Hive's partitioning scheme.

        :param cols: name of columns
        """
        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
            cols = cols[0]
        self._jwrite = self._jwrite.partitionBy(_to_seq(self._sqlContext._sc, cols))
        return self
Example #16
def slice_layers_udf(sc, image_features, cum_sums):
    """
        Slice bulk inference cnn features into multiple layers
    :param sc: SparkContext
    :param image_features: Bulk inference CNN image features
    :param cum_sums: List containing cumulative sizes of the CNN layer feature sizes
    :return: DataFrame
    """
    _slice_layers = sc._jvm.vista.udf.VistaUDFs.sliceLayersUDF()
    return Column(
        _slice_layers.apply(
            _to_seq(sc, [image_features, cum_sums], _to_java_column)))
Example #17
def _unresolved_named_lambda_variable(*name_parts: Any) -> Column:
    """
    Create `o.a.s.sql.expressions.UnresolvedNamedLambdaVariable`,
    convert it to o.a.s.sql.Column and wrap in Python `Column`.

    :param name_parts: str
    """
    sc = SparkContext._active_spark_context
    name_parts_seq = _to_seq(sc, name_parts)
    expressions = sc._jvm.org.apache.spark.sql.catalyst.expressions
    return Column(
        sc._jvm.Column(
            expressions.UnresolvedNamedLambdaVariable(name_parts_seq)))
Example #18
    def parquet(self, *path):
        """Loads a Parquet file, returning the result as a :class:`DataFrame`.

        >>> import tempfile, shutil
        >>> parquetFile = tempfile.mkdtemp()
        >>> shutil.rmtree(parquetFile)
        >>> df.saveAsParquetFile(parquetFile)
        >>> df2 = sqlContext.read.parquet(parquetFile)
        >>> sorted(df.collect()) == sorted(df2.collect())
        True
        """
        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
Example #19
    def parquet(self, *path):
        """Loads a Parquet file, returning the result as a :class:`DataFrame`.

        >>> import tempfile, shutil
        >>> parquetFile = tempfile.mkdtemp()
        >>> shutil.rmtree(parquetFile)
        >>> df.saveAsParquetFile(parquetFile)
        >>> df2 = sqlContext.read.parquet(parquetFile)
        >>> sorted(df.collect()) == sorted(df2.collect())
        True
        """
        return self._df(self._jreader.parquet(_to_seq(self._sqlContext._sc, path)))
Example #20
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #21
    def orc(self, path):
        """Loads ORC files, returning the result as a :class:`DataFrame`.

        .. note:: Currently ORC support is only available together with Hive support.

        >>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
        >>> df.dtypes
        [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
        """
        if isinstance(path, basestring):
            path = [path]
        return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
Example #22
    def orc(self, path):
        """Loads ORC files, returning the result as a :class:`DataFrame`.

        .. note:: Currently ORC support is only available together with Hive support.

        >>> df = spark.read.orc('python/test_support/sql/orc_partitioned')
        >>> df.dtypes
        [('a', 'bigint'), ('b', 'int'), ('c', 'int')]
        """
        if isinstance(path, basestring):
            path = [path]
        return self._df(self._jreader.orc(_to_seq(self._spark._sc, path)))
Example #23
def countDistinct(col, *cols):
    """Returns a new :class:`Column` for distinct count of ``col`` or ``cols``.

    >>> df.agg(countDistinct(df.age, df.name).alias('c')).collect()
    [Row(c=2)]

    >>> df.agg(countDistinct("age", "name").alias('c')).collect()
    [Row(c=2)]
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.countDistinct(_to_java_column(col), _to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #24
    def agg(self, *exprs):
        """Compute aggregates and returns the result as a :class:`DataFrame`.

        The available aggregate functions can be:

        1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`

        2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`

           .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
               a full shuffle is required. Also, all the data of a group will be loaded into
               memory, so the user should be aware of the potential OOM risk if data is skewed
               and certain groups are too large to fit in memory.

           .. seealso:: :func:`pyspark.sql.functions.pandas_udf`

        If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
        is the column to perform aggregation on, and the value is the aggregate function.

        Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.

        .. note:: Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed
            in a single call to this function.

        :param exprs: a dict mapping from column name (string) to aggregate functions (string),
            or a list of :class:`Column`.

        >>> gdf = df.groupBy(df.name)
        >>> sorted(gdf.agg({"*": "count"}).collect())
        [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]

        >>> from pyspark.sql import functions as F
        >>> sorted(gdf.agg(F.min(df.age)).collect())
        [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]

        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
        >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
        ... def min_udf(v):
        ...     return v.min()
        >>> sorted(gdf.agg(min_udf(df.age)).collect())  # doctest: +SKIP
        [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)]
        """
        assert exprs, "exprs should not be empty"
        if len(exprs) == 1 and isinstance(exprs[0], dict):
            jdf = self._jgd.agg(exprs[0])
        else:
            # Columns
            assert all(isinstance(c, Column)
                       for c in exprs), "all exprs should be Column"
            jdf = self._jgd.agg(
                exprs[0]._jc,
                _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
        return DataFrame(jdf, self.sql_ctx)
Example #25
    def empty_string_to_null(self, target_col):
        """ Convert empty strings to nulls.

        Args:
            target_col (Spark Column): target column to convert.

        Returns:
            Spark Column (StringType): target column with empty values converted to nulls.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _empty_string_to_null = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.emptyStringToNull_UDF()
        return Column(_empty_string_to_null.apply(_to_seq(sc, [target_col], _to_java_column)))
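
The same call pattern applies to the other GeneralUDFs/DateTimeUDFs wrappers in the following examples; a hypothetical call site, where `cleaner` is an instance of the class defining these methods:

    # Hypothetical usage: normalize empty strings in the 'city' column to nulls.
    df2 = df.withColumn("city", cleaner.empty_string_to_null(df["city"]))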
Example #26
    def string_is_number(self, target_col):
        """ Return boolean if string can be converted to a number.

        Args:
            target_col (Spark Column): containing string to check for convertability to number.

        Returns:
            Spark Column (BooleanType): whether the string can be converted to a number.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _string_is_number = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringIsNumber_UDF()
        return Column(_string_is_number.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #27
    def clean_string(self, target_col):
        """ Remove Java ISO control characters from, and trim, string.

        Args:
            target_col (Spark Column): target column to be cleaned.

        Returns:
            Spark Column (StringType): cleaned version of input column.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _clean_string = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.cleanString_UDF()
        return Column(_clean_string.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #28
    def map_booleans_ynu(self, target_col):
        """ Map boolean values to `Y`, `N`, `Unknown`

        Args:
            target_col (Spark Column): target column containing boolean values to map.

        Returns:
            Spark Column (StringType): mapped values (`Y`, `N`, `Unknown`)
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _map_booleans_ynu = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.mapBooleansYNU_UDF()
        return Column(_map_booleans_ynu.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #29
def format_string(format, *cols):
    """
    Formats the arguments in printf-style and returns the result as a string column.

    :param format: string that can contain embedded format tags
    :param cols: list of column names (string) or :class:`Column` expressions to be used in formatting

    >>> df = sqlContext.createDataFrame([(5, "hello")], ['a', 'b'])
    >>> df.select(format_string('%d %s', df.a, df.b).alias('v')).collect()
    [Row(v=u'5 hello')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.format_string(format, _to_seq(sc, cols, _to_java_column)))
Example #30
    def string_to_double_cfd(self, target_col):
        """ Convert string to doubles where commas represents decimal places (`cfd`).

        Args:
            target_col (Spark Column): containing double values in string format.

        Returns:
            Spark Column (DoubleType): containing double values converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _string_to_double = sc._jvm.com.civicboost.spark.etl.utilities.GeneralUDFs.stringToDoubleCommaForDecimal_UDF()
        return Column(_string_to_double.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #31
    def parquet(self, *paths):
        """Loads Parquet files, returning the result as a :class:`DataFrame`.

        You can set the following Parquet-specific option(s) for reading Parquet files:
            * ``mergeSchema``: sets whether we should merge schemas collected from all \
                Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
                The default value is specified in ``spark.sql.parquet.mergeSchema``.

        >>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
Example #32
    def parquet(self, *paths):
        """Loads a Parquet file, returning the result as a :class:`DataFrame`.

        You can set the following Parquet-specific option(s) for reading Parquet files:
            * ``mergeSchema``: sets whether we should merge schemas collected from all \
                Parquet part-files. This will override ``spark.sql.parquet.mergeSchema``. \
                The default value is specified in ``spark.sql.parquet.mergeSchema``.

        >>> df = spark.read.parquet('python/test_support/sql/parquet_partitioned')
        >>> df.dtypes
        [('name', 'string'), ('year', 'int'), ('month', 'int'), ('day', 'int')]
        """
        return self._df(self._jreader.parquet(_to_seq(self._spark._sc, paths)))
Example #33
    def partitionBy(self, *cols):
        """
        Partitions the output by the given columns on the file system.
        If specified, the output is laid out on the file system similar
        to Hive's partitioning scheme.

        :param cols: name of columns
        """
        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
            cols = cols[0]
        self._jwrite = self._jwrite.partitionBy(
            _to_seq(self._sqlContext._sc, cols))
        return self
Example #34
    def normalize_timestamp_dm(self, target_col):
        """ Convert string to timestamp where DAY is BEFORE MONTH.

        Args:
            target_col (Spark Column): containing strings representing timestamps.

        Returns:
            Spark Column (TimestampType): containing timestamps converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _normalize_timestamp_dm = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeTimestampDM_UDF()
        return Column(_normalize_timestamp_dm.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #35
def least(*cols):
    """
    Returns the least value of the list of column names, skipping null values.
    This function takes at least 2 parameters. It will return null iff all parameters are null.

    >>> df = sqlContext.createDataFrame([(1, 4, 3)], ['a', 'b', 'c'])
    >>> df.select(least(df.a, df.b, df.c).alias("least")).collect()
    [Row(least=1)]
    """
    if len(cols) < 2:
        raise ValueError("least should take at least two columns")
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.least(_to_seq(sc, cols, _to_java_column)))
Example #36
    def normalize_date_md(self, target_col):
        """ Convert string to date where MONTH is BEFORE DAY.

        Args:
            target_col (Spark Column): containing strings representing dates.

        Returns:
            Spark Column (DateType): containing dates converted from strings.
        """
        sc = self.spark.sparkContext
        # noinspection PyUnresolvedReferences, PyProtectedMember
        _normalize_date_md = sc._jvm.com.civicboost.spark.etl.utilities.DateTimeUDFs.normalizeDateMD_UDF()
        return Column(_normalize_date_md.apply(_to_seq(sc, [target_col], _to_java_column)))
Example #37
    def agg(self, *exprs):
        """Compute aggregates and returns the result as a :class:`DataFrame`.

        The available aggregate functions can be:

        1. built-in aggregation functions, such as `avg`, `max`, `min`, `sum`, `count`

        2. group aggregate pandas UDFs, created with :func:`pyspark.sql.functions.pandas_udf`

           .. note:: There is no partial aggregation with group aggregate UDFs, i.e.,
               a full shuffle is required. Also, all the data of a group will be loaded into
               memory, so the user should be aware of the potential OOM risk if data is skewed
               and certain groups are too large to fit in memory.

           .. seealso:: :func:`pyspark.sql.functions.pandas_udf`

        If ``exprs`` is a single :class:`dict` mapping from string to string, then the key
        is the column to perform aggregation on, and the value is the aggregate function.

        Alternatively, ``exprs`` can also be a list of aggregate :class:`Column` expressions.

        .. note:: Built-in aggregation functions and group aggregate pandas UDFs cannot be mixed
            in a single call to this function.

        :param exprs: a dict mapping from column name (string) to aggregate functions (string),
            or a list of :class:`Column`.

        >>> gdf = df.groupBy(df.name)
        >>> sorted(gdf.agg({"*": "count"}).collect())
        [Row(name=u'Alice', count(1)=1), Row(name=u'Bob', count(1)=1)]

        >>> from pyspark.sql import functions as F
        >>> sorted(gdf.agg(F.min(df.age)).collect())
        [Row(name=u'Alice', min(age)=2), Row(name=u'Bob', min(age)=5)]

        >>> from pyspark.sql.functions import pandas_udf, PandasUDFType
        >>> @pandas_udf('int', PandasUDFType.GROUPED_AGG)  # doctest: +SKIP
        ... def min_udf(v):
        ...     return v.min()
        >>> sorted(gdf.agg(min_udf(df.age)).collect())  # doctest: +SKIP
        [Row(name=u'Alice', min_udf(age)=2), Row(name=u'Bob', min_udf(age)=5)]
        """
        assert exprs, "exprs should not be empty"
        if len(exprs) == 1 and isinstance(exprs[0], dict):
            jdf = self._jgd.agg(exprs[0])
        else:
            # Columns
            assert all(isinstance(c, Column) for c in exprs), "all exprs should be Column"
            jdf = self._jgd.agg(exprs[0]._jc,
                                _to_seq(self.sql_ctx._sc, [c._jc for c in exprs[1:]]))
        return DataFrame(jdf, self.sql_ctx)
Example #38
    def _gk(col):
        UntypedGKAggregator_instance = (
            sc._jvm.com.github.nlzimmerman.UntypedGKAggregator(q, e)
        )
        instance_applier = UntypedGKAggregator_instance.apply
        return Column(
            instance_applier(
                _to_seq(
                    sc,
                    [col],
                    _to_java_column
                )
            )
        )
Example #39
    def partitionBy(self, *cols):
        """Partitions the output by the given columns on the file system.

        If specified, the output is laid out on the file system similar
        to Hive's partitioning scheme.

        :param cols: name of columns

        >>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
            cols = cols[0]
        self._jwrite = self._jwrite.partitionBy(_to_seq(self._sqlContext._sc, cols))
        return self
Example #40
def _sqlFunc(func, *params):
    """
    SQL aggregate function called by name (via ``callUDF``) to run over a groupBy.

    Parameters:
        func (str): SQL function name
        params: column names (str) or :class:`Column` objects passed to the function

    Returns:
        Spark Column
    """
    sc = spark.sparkContext
    return Column(
        sc._jvm.org.apache.spark.sql.functions.callUDF(
            func, _to_seq(sc, params, _to_java_column)))
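
Note that `callUDF` resolves the name against functions registered with Spark; a hedged usage sketch, where the function name "my_agg" and the columns are illustrative:

    # Assumes a UDF/UDAF named "my_agg" has been registered beforehand.
    result = df.groupBy("k").agg(_sqlFunc("my_agg", "v"))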
Example #41
    def partitionBy(self, *cols):
        """Partitions the output by the given columns on the file system.

        If specified, the output is laid out on the file system similar
        to Hive's partitioning scheme.

        :param cols: name of columns

        >>> df.write.partitionBy('year', 'month').parquet(os.path.join(tempfile.mkdtemp(), 'data'))
        """
        if len(cols) == 1 and isinstance(cols[0], (list, tuple)):
            cols = cols[0]
        self._jwrite = self._jwrite.partitionBy(_to_seq(self._spark._sc, cols))
        return self
Example #42
    def pivot(self, pivot_col, *values):
        """Pivots a column of the current DataFrame and preform the specified aggregation.

        :param pivot_col: Column to pivot
        :param values: Optional list of values of pivotColumn that will be translated to columns in
            the output data frame. If values are not provided the method will do an immediate call
            to .distinct() on the pivot column.
        >>> df4.groupBy("year").pivot("course", "dotNET", "Java").sum("earnings").collect()
        [Row(year=2012, dotNET=15000, Java=20000), Row(year=2013, dotNET=48000, Java=30000)]
        >>> df4.groupBy("year").pivot("course").sum("earnings").collect()
        [Row(year=2012, Java=20000, dotNET=15000), Row(year=2013, Java=30000, dotNET=48000)]
        """
        jgd = self._jdf.pivot(_to_java_column(pivot_col),
                              _to_seq(self.sql_ctx._sc, values, _create_column_from_literal))
        return GroupedData(jgd, self.sql_ctx)
Example #43
def struct(*cols):
    """Creates a new struct column.

    :param cols: list of column names (string) or list of :class:`Column` expressions

    >>> df.select(struct('age', 'name').alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    >>> df.select(struct([df.age, df.name]).alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #44
def struct(*cols):
    """Creates a new struct column.

    :param cols: list of column names (string) or list of :class:`Column` expressions

    >>> df.select(struct('age', 'name').alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    >>> df.select(struct([df.age, df.name]).alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #45
def array(*cols):
    """Creates a new array column.

    :param cols: list of column names (string) or list of :class:`Column` expressions that have
        the same data type.

    >>> df.select(array('age', 'age').alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    >>> df.select(array([df.age, df.age]).alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #46
    def freqItems(self, cols, support=None):
        """
        Finding frequent items for columns, possibly with false positives. Using the
        frequent element count algorithm described in
        "http://dx.doi.org/10.1145/762471.762473, proposed by Karp, Schenker, and Papadimitriou".
        :func:`DataFrame.freqItems` and :func:`DataFrameStatFunctions.freqItems` are aliases.

        :param cols: Names of the columns to calculate frequent items for as a list or tuple of
            strings.
        :param support: The frequency with which to consider an item 'frequent'. Default is 1%.
            The support must be greater than 1e-4.
        """
        if isinstance(cols, tuple):
            cols = list(cols)
        if not isinstance(cols, list):
            raise ValueError("cols must be a list or tuple of column names as strings.")
        if not support:
            support = 0.01
        return DataFrame(self._jdf.stat().freqItems(_to_seq(self._sc, cols), support), self.sql_ctx)
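
An illustrative call, assuming `df` has columns `a` and `b`:

    # Returns a one-row DataFrame with columns 'a_freqItems' and 'b_freqItems'.
    freq = df.freqItems(["a", "b"], support=0.25)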
Example #47
    def randomSplit(self, weights, seed=None):
        """Randomly splits this :class:`DataFrame` with the provided weights.

        :param weights: list of doubles as weights with which to split the DataFrame. Weights will
            be normalized if they don't sum up to 1.0.
        :param seed: The seed for sampling.

        >>> splits = df4.randomSplit([1.0, 2.0], 24)
        >>> splits[0].count()
        1

        >>> splits[1].count()
        3
        """
        for w in weights:
            if w < 0.0:
                raise ValueError("Weights must be positive. Found weight value: %s" % w)
        seed = seed if seed is not None else random.randint(0, sys.maxsize)
        rdd_array = self._jdf.randomSplit(_to_seq(self.sql_ctx._sc, weights), long(seed))
        return [DataFrame(rdd, self.sql_ctx) for rdd in rdd_array]
Example #48
    def sortBy(self, col, *cols):
        """Sorts the output in each bucket by the given columns on the file system.

        :param col: a name of a column, or a list of names.
        :param cols: additional names (optional). If `col` is a list it should be empty.

        >>> (df.write.format('parquet')  # doctest: +SKIP
        ...     .bucketBy(100, 'year', 'month')
        ...     .sortBy('day')
        ...     .mode("overwrite")
        ...     .saveAsTable('sorted_bucketed_table'))
        """
        if isinstance(col, (list, tuple)):
            if cols:
                raise ValueError("col is a {0} but cols are not empty".format(type(col)))

            col, cols = col[0], col[1:]

        if not all(isinstance(c, basestring) for c in cols) or not(isinstance(col, basestring)):
            raise TypeError("all names should be `str`")

        self._jwrite = self._jwrite.sortBy(col, _to_seq(self._spark._sc, cols))
        return self
Example #49
    def _api(self, *cols):
        name = f.__name__
        jdf = getattr(self._jgd, name)(_to_seq(self.sql_ctx._sc, cols))
        return DataFrame(jdf, self.sql_ctx)
Example #50
def _to_java_cols(cols):
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], list):
        cols = cols[0]
    return _to_seq(sc, cols, _to_java_column)
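
A sketch of how a helper like this is typically consumed by a thin wrapper around a JVM function; the wrapper name is illustrative, while `functions.struct` is the standard Spark SQL function:

    # Hypothetical wrapper: build a struct column from names, a list, or Columns.
    def my_struct(*cols):
        sc = SparkContext._active_spark_context
        return Column(sc._jvm.functions.struct(_to_java_cols(cols)))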
Example #51
    def __call__(self, *cols):
        sc = SparkContext._active_spark_context
        jc = self._judf.apply(_to_seq(sc, cols, _to_java_column))
        return Column(jc)
Example #52
    def _jseq(self, cols, converter=None):
        """Return a JVM Seq of Columns from a list of Column or names"""
        return _to_seq(self.sql_ctx._sc, cols, converter)