from typing import Union

from pyspark import SparkContext
from pyspark.sql.column import (
    Column,
    _create_column_from_literal,
    _to_java_column,
    _to_seq,
)

# NOTE: `_call_udf` is assumed to be a helper defined elsewhere in this
# module (it is used, but not defined, in the functions below).


def percentile_approx(col, percentage, accuracy=10000):
    """
    Returns the approximate percentile value of numeric column col at the
    given percentage. The value of percentage must be between 0.0 and 1.0.

    The accuracy parameter (default: 10000) is a positive numeric literal
    which controls approximation accuracy at the cost of memory. A higher
    value of accuracy yields better accuracy; 1.0/accuracy is the relative
    error of the approximation.

    When percentage is an array, each value of the percentage array must be
    between 0.0 and 1.0. In this case, returns the approximate percentile
    array of column col at the given percentage array.

    Ported from Spark 3.1.
    """
    sc = SparkContext._active_spark_context
    if isinstance(percentage, (list, tuple)):
        # A local list
        percentage = sc._jvm.functions.array(
            _to_seq(sc, [_create_column_from_literal(x) for x in percentage]))
    elif isinstance(percentage, Column):
        # Already a Column
        percentage = _to_java_column(percentage)
    else:
        # Probably scalar
        percentage = _create_column_from_literal(percentage)
    accuracy = (_to_java_column(accuracy) if isinstance(accuracy, Column)
                else _create_column_from_literal(accuracy))
    return _call_udf(sc, "percentile_approx", _to_java_column(col), percentage, accuracy)
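# Usage sketch for percentile_approx (assumes an active SparkSession bound to
# `spark`; the data and column names are illustrative only):
#
#   df = spark.createDataFrame([(1.0,), (2.0,), (3.0,), (4.0,)], ["value"])
#   df.select(percentile_approx("value", 0.5).alias("median")).show()
#   df.select(
#       percentile_approx("value", [0.25, 0.5, 0.75], 1000).alias("quartiles")
#   ).show()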
def repeat(col: Column, n: Union[int, Column]) -> Column:
    """
    Repeats a string column n times, and returns it as a new string column.
    """
    sc = SparkContext._active_spark_context
    n = _to_java_column(n) if isinstance(n, Column) else _create_column_from_literal(n)
    return _call_udf(sc, "repeat", _to_java_column(col), n)
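# Usage sketch for repeat (assumes `spark` is in scope; data is illustrative):
#
#   df = spark.createDataFrame([("ab",)], ["s"])
#   df.select(repeat(df.s, 3).alias("repeated")).show()  # -> "ababab"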
def date_part(field: Union[str, Column], source: Column) -> Column:
    """
    Extracts a part of the date/timestamp or interval source.
    """
    sc = SparkContext._active_spark_context
    field = (_to_java_column(field) if isinstance(field, Column)
             else _create_column_from_literal(field))
    return _call_udf(sc, "date_part", field, _to_java_column(source))
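# Usage sketch for date_part (assumes `spark` is in scope; field names follow
# Spark's EXTRACT syntax, e.g. "YEAR", "MONTH", "DAY"):
#
#   from pyspark.sql.functions import to_timestamp
#   df = spark.createDataFrame([("2021-06-15 12:34:56",)], ["ts"])
#   df.select(date_part("YEAR", to_timestamp(df.ts)).alias("year")).show()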
def hlike(col, regexps):
    """Hyperscan regex like. Returns true if col matches any of regexps.

    :param col: Column
    :param regexps: list of patterns to match
    :return: boolean column with match result
    """
    sc = SparkContext._active_spark_context
    patterns = sc._jvm.functions.array(_to_seq(sc, [
        _create_column_from_literal(x) for x in regexps
    ]))
    return Column(sc._jvm.ru.napalabs.spark.hscan.functions.hlike(
        _to_java_column(col), patterns))
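# Usage sketch for hlike (assumes the ru.napalabs.spark.hscan JAR is on the
# classpath and `spark` is in scope; patterns and data are illustrative):
#
#   df = spark.createDataFrame([("error: disk full",), ("all good",)], ["msg"])
#   df.select(hlike(df.msg, ["error.*", "fatal.*"]).alias("matched")).show()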
def _(col, other):
    # Wraps a JVM binary expression class, whose fully qualified name is
    # captured as `name` from the enclosing scope, as a Column operator.
    jcol = col._jc
    sc = SparkContext._active_spark_context
    loader = sc._jvm.Thread.currentThread().getContextClassLoader()
    wclass = loader.loadClass(name)
    # Build the two constructor arguments: the column's underlying
    # expression and `other` lifted to a literal.
    expr_class = sc._jvm.java.lang.Object
    expr_array = sc._gateway.new_array(expr_class, 2)
    expr_array[0] = jcol.expr()
    expr_array[1] = _create_column_from_literal(other)
    # Instantiate the expression reflectively and wrap it in a Column.
    w = wclass.getConstructors()[0].newInstance(expr_array)
    wcol = sc._jvm.org.apache.spark.sql.Column(w)
    return Column(wcol)
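# Sketch of how this closure is typically produced: a factory captures the
# fully qualified JVM expression class name in `name` and returns `_` as a
# binary operator. The factory name `_bin_expr` and the example class below
# are hypothetical:
#
#   def _bin_expr(name):
#       def _(col, other):
#           ...  # body as above
#       return _
#
#   starts_with = _bin_expr("org.apache.spark.sql.catalyst.expressions.StartsWith")
#   df.filter(starts_with(df.s, "ab"))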