Example #1
def _sqlFunc(func, *params):
    """
    SQL aggregate function called by name, e.g. to run over a groupBy

    Parameters:
        func (str): SQL function name
        params: column names or Column expressions passed to the function

    Returns:
        Spark Column
    """

    sc = spark.sparkContext
    return Column(
        sc._jvm.org.apache.spark.sql.functions.callUDF(
            func, _to_seq(sc, params, _to_java_column)))
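A minimal usage sketch (not from the original source), assuming an active `spark` session, the imports used above, and a DataFrame `df` with hypothetical columns `dept` and `salary`:

# Call the built-in "avg" aggregate by name through callUDF.
result = df.groupBy("dept").agg(_sqlFunc("avg", df.salary).alias("avg_salary"))
result.show()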
Example #2
    def withColumnRenamed(self, existing, new):
        """Returns a new :class:`DataFrame` by renaming an existing column.

        :param existing: string, name of the existing column to rename.
        :param new: string, new name of the column.

        >>> df.withColumnRenamed('age', 'age2').collect()
        [Row(age2=2, name=u'Alice'), Row(age2=5, name=u'Bob')]
        """
        cols = [
            Column(_to_java_column(c)).alias(new) if c == existing else c
            for c in self.columns
        ]
        return self.select(*cols)
Example #3
def lead(col, count=1, default=None):
    """
    Window function: returns the value that is `count` rows after the current row, and
    `default` if there are fewer than `count` rows after the current row. For example,
    a `count` of one returns the next row at any given point in the window partition.

    This is equivalent to the LEAD function in SQL.

    :param col: name of column or expression
    :param count: number of rows to look ahead
    :param default: default value
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.lead(_to_java_column(col), count, default))
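A usage sketch, assuming a DataFrame `df` with hypothetical columns `user_id` and `event_time`:

from pyspark.sql import Window

# Look one row ahead within each user's partition, ordered by event time.
w = Window.partitionBy("user_id").orderBy("event_time")
df = df.withColumn("next_event_time", lead("event_time", 1).over(w))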
Example #4
def vector_to_array(col: Column, dtype: str = "float64") -> Column:
    """
    Converts a column of MLlib sparse/dense vectors into a column of dense arrays.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    col : :py:class:`pyspark.sql.Column` or str
        Input column
    dtype : str, optional
        The data type of the output array. Valid values: "float64" or "float32".

    Returns
    -------
    :py:class:`pyspark.sql.Column`
        The converted column of dense arrays.

    Examples
    --------
    >>> from pyspark.ml.linalg import Vectors
    >>> from pyspark.ml.functions import vector_to_array
    >>> from pyspark.mllib.linalg import Vectors as OldVectors
    >>> df = spark.createDataFrame([
    ...     (Vectors.dense(1.0, 2.0, 3.0), OldVectors.dense(10.0, 20.0, 30.0)),
    ...     (Vectors.sparse(3, [(0, 2.0), (2, 3.0)]),
    ...      OldVectors.sparse(3, [(0, 20.0), (2, 30.0)]))],
    ...     ["vec", "oldVec"])
    >>> df1 = df.select(vector_to_array("vec").alias("vec"),
    ...                 vector_to_array("oldVec").alias("oldVec"))
    >>> df1.collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    >>> df2 = df.select(vector_to_array("vec", "float32").alias("vec"),
    ...                 vector_to_array("oldVec", "float32").alias("oldVec"))
    >>> df2.collect()
    [Row(vec=[1.0, 2.0, 3.0], oldVec=[10.0, 20.0, 30.0]),
     Row(vec=[2.0, 0.0, 3.0], oldVec=[20.0, 0.0, 30.0])]
    >>> df1.schema.fields
    [StructField(vec,ArrayType(DoubleType,false),false),
    StructField(oldVec,ArrayType(DoubleType,false),false)]
    >>> df2.schema.fields
    [StructField(vec,ArrayType(FloatType,false),false),
    StructField(oldVec,ArrayType(FloatType,false),false)]
    """
    sc = SparkContext._active_spark_context
    assert sc is not None and sc._jvm is not None
    return Column(
        sc._jvm.org.apache.spark.ml.functions.vector_to_array(_to_java_column(col), dtype)
    )
Example #5
def sha2(col, numBits):
    """Returns the hex string result of SHA-2 family of hash functions (SHA-224, SHA-256, SHA-384,
    and SHA-512). The numBits indicates the desired bit length of the result, which must have a
    value of 224, 256, 384, 512, or 0 (which is equivalent to 256).

    >>> digests = df.select(sha2(df.name, 256).alias('s')).collect()
    >>> digests[0]
    Row(s=u'3bc51062973c458d5a6f2d8d64a023246354ad7e064b1e4e009ec8a0699a3043')
    >>> digests[1]
    Row(s=u'cd9fb1e148ccd8442e5aa74904cc73bf6fb54d1d54d333bd596aa9bb4bb4e961')
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.sha2(_to_java_column(col), numBits)
    return Column(jc)
Example #6
def rf_agg_approx_quantiles(tile_col, probabilities, relative_error=0.00001):
    """
    Calculates the approximate quantiles of a tile column of a DataFrame.

    :param tile_col: column to extract cells from.
    :param probabilities: a list of quantile probabilities. Each number must belong to [0, 1].
            For example 0 is the minimum, 0.5 is the median, 1 is the maximum.
    :param relative_error: The relative target precision to achieve (greater than or equal to 0). Default is 0.00001
    :return: An array of values approximately at the specified `probabilities`
    """

    _jfn = RFContext.active().lookup('rf_agg_approx_quantiles')
    _tile_col = _to_java_column(tile_col)
    return Column(_jfn(_tile_col, probabilities, relative_error))
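A hedged usage sketch, assuming a RasterFrames DataFrame `rf` with a tile column named `tile` (the column name is an assumption):

# Approximate median and 95th percentile of all cell values in the tile column.
quantiles = rf.agg(rf_agg_approx_quantiles('tile', [0.5, 0.95]).alias('q')).first()['q']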
Example #7
def trunc(date, format):
    """
    Returns date truncated to the unit specified by the format.

    :param date: a column of dates, timestamps, or date strings
    :param format: 'year', 'YYYY', 'yy' or 'month', 'mon', 'mm'

    >>> df = sqlContext.createDataFrame([('1997-02-28',)], ['d'])
    >>> df.select(trunc(df.d, 'year').alias('year')).collect()
    [Row(year=datetime.date(1997, 1, 1))]
    >>> df.select(trunc(df.d, 'mon').alias('month')).collect()
    [Row(month=datetime.date(1997, 2, 1))]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.trunc(_to_java_column(date), format))
Example #8
def sort_array(col, asc=True):
    """
    Collection function: sorts the input array for the given column in ascending or
    descending order.

    :param col: name of column or expression
    :param asc: whether to sort in ascending order; defaults to True

    >>> df = sqlContext.createDataFrame([([2, 1, 3],),([1],),([],)], ['data'])
    >>> df.select(sort_array(df.data).alias('r')).collect()
    [Row(r=[1, 2, 3]), Row(r=[1]), Row(r=[])]
    >>> df.select(sort_array(df.data, asc=False).alias('r')).collect()
    [Row(r=[3, 2, 1]), Row(r=[1]), Row(r=[])]
     """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.sort_array(_to_java_column(col), asc))
Example #9
def array_contains(col, value):
    """
    Collection function: returns True if the array contains the given value. The collection
    elements and value must be of the same type.

    :param col: name of column containing array
    :param value: value to check for in array

    >>> df = sqlContext.createDataFrame([(["a", "b", "c"],), ([],)], ['data'])
    >>> df.select(array_contains(df.data, "a")).collect()
    [Row(array_contains(data,a)=True), Row(array_contains(data,a)=False)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.array_contains(_to_java_column(col), value))
Example #10
def instr(str, substr):
    """
    Locate the position of the first occurrence of substr column in the given string.
    Returns null if either of the arguments is null.

    NOTE: The position is 1-based, not 0-based. Returns 0 if substr
    could not be found in str.

    >>> df = sqlContext.createDataFrame([('abcd',)], ['s',])
    >>> df.select(instr(df.s, 'b').alias('s')).collect()
    [Row(s=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.instr(_to_java_column(str), substr))
Example #11
    def _gk(col):
        # `sc`, `q`, and `e` are captured from the enclosing scope: the active
        # SparkContext, the quantiles to compute, and the error bound.
        UntypedGKAggregator_instance = (
            sc._jvm.com.github.nlzimmerman.UntypedGKAggregator(q, e)
        )
        instance_applier = UntypedGKAggregator_instance.apply
        return Column(
            instance_applier(
                _to_seq(
                    sc,
                    [col],
                    _to_java_column
                )
            )
        )
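`_gk` is a closure: `sc`, `q`, and `e` must be bound in an enclosing scope, and the GK aggregator JAR must be on the classpath. A hedged usage sketch with a hypothetical DataFrame `df` and columns `key` and `value`:

# Approximate quantiles of `value` per key via the JVM-side GK aggregator.
df.groupBy("key").agg(_gk(df.value).alias("quantiles")).show()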
Example #12
    def summary(self, featuresCol, weightCol=None):
        """
        Returns an aggregate object that contains the summary of the column with the requested
        metrics.

        :param featuresCol:
         a column that contains a features Vector object.
        :param weightCol:
         a column that contains weight value. Default weight is 1.0.
        :return:
         an aggregate column that contains the statistics. The exact content of this
         structure is determined during the creation of the builder.
        """
        featuresCol, weightCol = Summarizer._check_param(featuresCol, weightCol)
        return Column(self._java_obj.summary(featuresCol._jc, weightCol._jc))
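This method is normally reached through a `SummaryBuilder` created by `Summarizer.metrics`. A brief usage sketch, assuming a DataFrame `df` with a `features` Vector column and a `weight` column (names are assumptions):

from pyspark.ml.stat import Summarizer

# Request specific metrics up front; the returned struct column contains only these.
summarizer = Summarizer.metrics("mean", "count")
df.select(summarizer.summary(df.features, df.weight).alias("stats")).show(truncate=False)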
Example #13
def rf_mask_by_values(
        data_tile: Column_type, mask_tile: Column_type,
        mask_values: Union[List[Union[int, float]], Column_type]) -> Column:
    """Generate a tile with the values from `data_tile`, but replace a cell with NODATA
       wherever the corresponding cell of `mask_tile` has a value in the `mask_values` list.
    """
    from pyspark.sql.functions import array as sql_array, lit
    if isinstance(mask_values, list):
        mask_values = sql_array([lit(v) for v in mask_values])

    jfcn = RFContext.active().lookup('rf_mask_by_values')
    col_args = [
        _to_java_column(c) for c in [data_tile, mask_tile, mask_values]
    ]
    return Column(jfcn(*col_args))
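A hedged usage sketch, assuming a RasterFrames DataFrame `rf` with a data tile column `tile`, a quality tile column `qa`, and hypothetical class values 3 and 4 to be masked:

# Replace cells with NODATA wherever the corresponding QA cell is 3 or 4.
masked = rf.withColumn('tile_masked', rf_mask_by_values('tile', 'qa', [3, 4]))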
Example #14
def smvFirst(c, nonNull = False):
    """Variation of Spark "first" which also returns null values

        Since Spark "first" will return the first non-null value, we have to
        create our version smvFirst which to retune the real first value, even
        if it's null. Alternatively can return the first non-null value.

        Args:
            c (Column: column to extract first value from
            nonNull (bool): If false, return first value even if null. If true, return first non-null value. Defaults to false.

        Returns:
            (object): first value
    """
    return Column(SmvApp.getInstance()._jvm.org.tresamigos.smv.smvfuncs.smvFirst(c._jc, nonNull))
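A hedged usage sketch, assuming an SMV-enabled session and a DataFrame `df` with hypothetical columns `k` and `v`:

# First value per group (keeping nulls) and first non-null value per group.
df.groupBy("k").agg(
    smvFirst(df.v).alias("first_v"),
    smvFirst(df.v, nonNull=True).alias("first_nonnull_v")).show()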
Example #15
    def __getitem__(self, item):
        """Returns the column as a :class:`Column`.

        >>> df.select(df['age']).collect()
        [Row(age=2), Row(age=5)]
        >>> df[ ["name", "age"]].collect()
        [Row(name=u'Alice', age=2), Row(name=u'Bob', age=5)]
        >>> df[ df.age > 3 ].collect()
        [Row(age=5, name=u'Bob')]
        >>> df[df[0] > 3].collect()
        [Row(age=5, name=u'Bob')]
        """
        if isinstance(item, basestring):
            jc = self._jdf.apply(item)
            return Column(jc)
        elif isinstance(item, Column):
            return self.filter(item)
        elif isinstance(item, (list, tuple)):
            return self.select(*item)
        elif isinstance(item, int):
            jc = self._jdf.apply(self.columns[item])
            return Column(jc)
        else:
            raise TypeError("unexpected item type: %s" % type(item))
Example #16
    def _(col, other):
        # `name` (a fully qualified JVM expression class) and `_col` (a helper that
        # turns a string into a Column) are captured from the enclosing scope.
        if isinstance(other, str):
            other = _col(other)
        jc = other._jc if isinstance(other, Column) else other
        jcol = col._jc
        sc = SparkContext._active_spark_context
        # Load the expression class by name and reflectively construct it with the
        # two child expressions, then wrap the result back into a Python Column.
        loader = sc._jvm.Thread.currentThread().getContextClassLoader()
        wclass = loader.loadClass(name)
        expr_class = sc._jvm.java.lang.Object
        expr_array = sc._gateway.new_array(expr_class, 2)
        expr_array[0] = jcol.expr()
        expr_array[1] = jc.expr()
        w = wclass.getConstructors()[0].newInstance(expr_array)
        wcol = sc._jvm.org.apache.spark.sql.Column(w)
        return Column(wcol)
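`_` is a closure, so it is typically produced by a small factory. A hedged sketch of what that factory could look like; the wrapper name and the expression class below are hypothetical:

def jvm_binary_op(name):
    # `name` is a fully qualified JVM expression class exposing a constructor that
    # takes (left, right) child expressions. A `_col` helper that turns a string
    # into a Column would also live in this scope; it is omitted in this sketch.
    def _(col, other):
        ...  # body as in the example above
    return _

my_op = jvm_binary_op("com.example.spark.MyBinaryExpression")  # hypothetical class
df = df.withColumn("result", my_op(df.a, df.b))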
Example #17
def struct(*cols):
    """Creates a new struct column.

    :param cols: list of column names (string) or list of :class:`Column` expressions

    >>> df.select(struct('age', 'name').alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    >>> df.select(struct([df.age, df.name]).alias("struct")).collect()
    [Row(struct=Row(age=2, name=u'Alice')), Row(struct=Row(age=5, name=u'Bob'))]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.struct(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #18
def substring_index(str, delim, count):
    """
    Returns the substring from string str before count occurrences of the delimiter delim.
    If count is positive, everything to the left of the final delimiter (counting from the left) is
    returned. If count is negative, everything to the right of the final delimiter (counting from the
    right) is returned. substring_index performs a case-sensitive match when searching for delim.

    >>> df = sqlContext.createDataFrame([('a.b.c.d',)], ['s'])
    >>> df.select(substring_index(df.s, '.', 2).alias('s')).collect()
    [Row(s=u'a.b')]
    >>> df.select(substring_index(df.s, '.', -3).alias('s')).collect()
    [Row(s=u'b.c.d')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.substring_index(_to_java_column(str), delim, count))
Example #19
def array(*cols):
    """Creates a new array column.

    :param cols: list of column names (string) or list of :class:`Column` expressions that have
        the same data type.

    >>> df.select(array('age', 'age').alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    >>> df.select(array([df.age, df.age]).alias("arr")).collect()
    [Row(arr=[2, 2]), Row(arr=[5, 5])]
    """
    sc = SparkContext._active_spark_context
    if len(cols) == 1 and isinstance(cols[0], (list, set)):
        cols = cols[0]
    jc = sc._jvm.functions.array(_to_seq(sc, cols, _to_java_column))
    return Column(jc)
Example #20
def to_avro(data: "ColumnOrName", jsonFormatSchema: str = "") -> Column:
    """
    Converts a column into binary of avro format.

    .. versionadded:: 3.0.0

    Parameters
    ----------
    data : :class:`~pyspark.sql.Column` or str
        the data column.
    jsonFormatSchema : str, optional
        user-specified output avro schema in JSON string format.

    Notes
    -----
    Avro is a built-in but external data source module since Spark 2.4. Please deploy the
    application as per the deployment section of the "Apache Avro Data Source Guide".

    Examples
    --------
    >>> from pyspark.sql import Row
    >>> from pyspark.sql.avro.functions import to_avro
    >>> data = ['SPADES']
    >>> df = spark.createDataFrame(data, "string")
    >>> df.select(to_avro(df.value).alias("suite")).collect()
    [Row(suite=bytearray(b'\\x00\\x0cSPADES'))]

    >>> jsonFormatSchema = '''["null", {"type": "enum", "name": "value",
    ...     "symbols": ["SPADES", "HEARTS", "DIAMONDS", "CLUBS"]}]'''
    >>> df.select(to_avro(df.value, jsonFormatSchema).alias("suite")).collect()
    [Row(suite=bytearray(b'\\x02\\x00'))]
    """

    sc = SparkContext._active_spark_context
    assert sc is not None and sc._jvm is not None
    try:
        if jsonFormatSchema == "":
            jc = sc._jvm.org.apache.spark.sql.avro.functions.to_avro(
                _to_java_column(data))
        else:
            jc = sc._jvm.org.apache.spark.sql.avro.functions.to_avro(
                _to_java_column(data), jsonFormatSchema)
    except TypeError as e:
        if str(e) == "'JavaPackage' object is not callable":
            _print_missing_jar("Avro", "avro", "avro", sc.version)
        raise
    return Column(jc)
Example #21
def normalize_variant(contigName: Union[Column, str],
                      start: Union[Column, str], end: Union[Column, str],
                      refAllele: Union[Column, str],
                      altAlleles: Union[Column, str],
                      refGenomePathString: str) -> Column:
    """
    Normalizes the variant with a behavior similar to vt normalize or bcftools norm.
    Creates a StructType column including the normalized ``start``, ``end``, ``referenceAllele`` and
    ``alternateAlleles`` fields (whether they are changed or unchanged as the result of
    normalization) as well as a StructType field called ``normalizationStatus`` that
    contains the following fields:

       ``changed``: A boolean field indicating whether the variant data was changed as a result of normalization

       ``errorMessage``: An error message in case the attempt at normalizing the row hit an error. In this case, the ``changed`` field will be set to ``false``. If no errors occur, this field will be ``null``.

    In case of an error, the ``start``, ``end``, ``referenceAllele`` and ``alternateAlleles`` fields in the generated struct will be ``null``.

    Added in version 0.3.0.

    Examples:
        >>> df = spark.read.format('vcf').load('test-data/variantsplitternormalizer-test/test_left_align_hg38_altered.vcf')
        >>> ref_genome = 'test-data/variantsplitternormalizer-test/Homo_sapiens_assembly38.20.21_altered.fasta'
        >>> df.select('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles').head()
        Row(contigName='chr20', start=400, end=401, referenceAllele='G', alternateAlleles=['GATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGGTTTGAG'])
        >>> normalized_df = df.select('contigName', glow.expand_struct(glow.normalize_variant('contigName', 'start', 'end', 'referenceAllele', 'alternateAlleles', ref_genome)))
        >>> normalized_df.head()
        Row(contigName='chr20', start=268, end=269, referenceAllele='A', alternateAlleles=['ATTTGAGATCTTCCCTCTTTTCTAATATAAACACATAAAGCTCTGTTTCCTTCTAGGTAACTGG'], normalizationStatus=Row(changed=True, errorMessage=None))

    Args:
        contigName : The current contig name
        start : The current start
        end : The current end
        refAllele : The current reference allele
        altAlleles : The current array of alternate alleles
        refGenomePathString : A path to the reference genome ``.fasta`` file. The ``.fasta`` file must be accompanied with a ``.fai`` index file in the same folder.

    Returns:
        A struct as explained above
    """
    assert check_argument_types()
    output = Column(sc()._jvm.io.projectglow.functions.normalize_variant(
        _to_java_column(contigName), _to_java_column(start),
        _to_java_column(end), _to_java_column(refAllele),
        _to_java_column(altAlleles), refGenomePathString))
    assert check_return_type(output)
    return output
Example #22
def date_format(dateCol, format):
    """
    Converts a date/timestamp/string to a value of string in the format specified by the date
    format given by the second argument.

    A pattern could be for instance `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of the Java class `java.text.SimpleDateFormat` can be used.

    NOTE: Whenever possible, use specialized functions like `year`, which benefit from a
    specialized implementation.

    >>> df = sqlContext.createDataFrame([('2015-04-08',)], ['a'])
    >>> df.select(date_format('a', 'MM/dd/yyy').alias('date')).collect()
    [Row(date=u'04/08/2015')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_format(_to_java_column(dateCol), format))
Example #23
def log(arg1, arg2=None):
    """Returns the first argument-based logarithm of the second argument.

    If there is only one argument, then this takes the natural logarithm of the argument.

    >>> df.select(log(10.0, df.age).alias('ten')).map(lambda l: str(l.ten)[:7]).collect()
    ['0.30102', '0.69897']

    >>> df.select(log(df.age).alias('e')).map(lambda l: str(l.e)[:7]).collect()
    ['0.69314', '1.60943']
    """
    sc = SparkContext._active_spark_context
    if arg2 is None:
        jc = sc._jvm.functions.log(_to_java_column(arg1))
    else:
        jc = sc._jvm.functions.log(arg1, _to_java_column(arg2))
    return Column(jc)
Example #24
def locate(substr, str, pos=0):
    """
    Locate the position of the first occurrence of substr in a string column, after position pos.

    NOTE: The position is 1-based, not 0-based. Returns 0 if substr
    could not be found in str.

    :param substr: a string
    :param str: a Column of StringType
    :param pos: start position (zero based)

    >>> df = sqlContext.createDataFrame([('abcd',)], ['s',])
    >>> df.select(locate('b', df.s, 1).alias('s')).collect()
    [Row(s=2)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.locate(substr, _to_java_column(str), pos))
Example #25
    def smvPlusMonths(self, delta):
        """Add N months to `Timestamp` column

            Args:
                delta (integer): the number of months to add

            Note:
                The calculation will do its best to only change the month field retaining the same day of month. However, in certain circumstances, it may be necessary to alter smaller fields. For example, 2007-03-31 plus one month cannot result in 2007-04-31, so the day of month is adjusted to 2007-04-30.

            Example:
                >>> df.select(col("dob").smvPlusMonths(3))

            Returns:
                (Timestamp): the incremented Timestamp, or null if input is null
        """
        jc = self._jColumnHelper.smvPlusMonths(delta)
        return Column(jc)
Example #26
def smvHashKey(head, *others):
    """Create MD5 on concatenated columns.
    Return "Prefix" + MD5 Hex string(size 32 string) as the unique key

    MD5's collisions rate on real data records could be ignored based on the following discussion.

    https://marc-stevens.nl/research/md5-1block-collision/
    The shortest messages have the same MD5 are 512-bit (64-byte) messages as below

    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa200a8284bf36e8e4b55b35f427593d849676da0d1555d8360fb5f07fea2
    and the (different by two bits)
    4dc968ff0ee35c209572d4777b721587d36fa7b21bdc56b74a3dc0783e7b9518afbfa202a8284bf36e8e4b55b35f427593d849676da0d1d55d8360fb5f07fea2
    both have MD5 hash
    008ee33a9d58b51cfeb425b0959121c9

    There are other those pairs, but all carefully constructed.
    Theoretically the random collisions will happen on data size approaching 2^64 (since MD5 has
    128-bit), which is much larger than the number of records we deal with (a billion is about 2^30)
    There for using MD5 to hash primary key columns is good enough for creating an unique key

    This function can take 2 forms:
    - smvHashKey(prefix, col1, col2, ...)
    - smvHashKey(col1, col2, ...)

    Args:
     prefix (String): return string's prefix
     col. (Column): columns to be part of hash

    Return:
     (col): a StringType column as Prefix + MD5 Hex string
    """

    if (isinstance(head, basestring)):
        pre = head
        cols = list(others)
    elif (isinstance(head, Column)):
        pre = ""
        cols = [head] + list(others)
    else:
        raise RuntimeError(
            "first parameter must be either a String or a Column")
    app = SmvApp.getInstance()
    return Column(
        app._jvm.org.tresamigos.smv.python.SmvPythonHelper.smvHashKey(
            pre, smv_copy_array(app.sc, *cols)))
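A hedged usage sketch, assuming an SMV-enabled session and a DataFrame `df` with hypothetical columns `first_name` and `dob`:

# Prefixed form: "cust_" + MD5 hex of the concatenated key columns.
df = df.withColumn("pk", smvHashKey("cust_", df.first_name, df.dob))
# Un-prefixed form: just the MD5 hex string.
df = df.withColumn("pk2", smvHashKey(df.first_name, df.dob))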
Example #27
def monotonicallyIncreasingId():
    """A column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonicallyIncreasingId().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.monotonicallyIncreasingId())
Example #28
def explode(col):
    """Returns a new row for each element in the given array or map.

    >>> from pyspark.sql import Row
    >>> eDF = sqlContext.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
    >>> eDF.select(explode(eDF.intlist).alias("anInt")).collect()
    [Row(anInt=1), Row(anInt=2), Row(anInt=3)]

    >>> eDF.select(explode(eDF.mapfield).alias("key", "value")).show()
    +---+-----+
    |key|value|
    +---+-----+
    |  a|    b|
    +---+-----+
    """
    sc = SparkContext._active_spark_context
    jc = sc._jvm.functions.explode(_to_java_column(col))
    return Column(jc)
Example #29
def merge_features_udf(sc, layer, x, y, z, image_features,
                       structured_features):
    """
        Merge structured and cnn features into one array
    :param sc:
    :param layer:
    :param x:
    :param y:
    :param z:
    :param image_features:
    :param structured_features:
    :return:
    """
    _merge_features = sc._jvm.vista.udf.VistaUDFs.mergeFeaturesUDF()
    return Column(
        _merge_features.apply(
            _to_seq(sc, [layer, image_features, structured_features, x, y, z],
                    _to_java_column)))
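A hedged usage sketch, assuming an active SparkContext `sc` and a DataFrame `df` that already carries the listed columns (column names are assumptions):

df = df.withColumn(
    "features",
    merge_features_udf(sc, df.layer, df.x, df.y, df.z,
                       df.image_features, df.structured_features))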
Example #30
def when(condition, value):
    """Evaluates a list of conditions and returns one of multiple possible result expressions.
    If :func:`Column.otherwise` is not invoked, None is returned for unmatched conditions.

    :param condition: a boolean :class:`Column` expression.
    :param value: a literal value, or a :class:`Column` expression.

    >>> df.select(when(df['age'] == 2, 3).otherwise(4).alias("age")).collect()
    [Row(age=3), Row(age=4)]

    >>> df.select(when(df.age == 2, df.age + 1).alias("age")).collect()
    [Row(age=3), Row(age=None)]
    """
    sc = SparkContext._active_spark_context
    if not isinstance(condition, Column):
        raise TypeError("condition should be a Column")
    v = value._jc if isinstance(value, Column) else value
    jc = sc._jvm.functions.when(condition._jc, v)
    return Column(jc)