Esempio n. 1
0
def merge_schemas(left_schema, right_schema, how, on=None):
    """
    Compute the schema of the rows produced by a join.

    The resulting schema lists the join-key fields first, followed by the
    non-key fields of the left schema and then the non-key fields of the
    right schema.

    :param left_schema: schema of the left side of the join.
    :param right_schema: schema of the right side of the join.
    :param how: join type constant, it decides which side provides the key
        fields of the result.
    :param on: join columns, forwarded to ``get_on_fields``. ``None`` is
        replaced by an empty list.
    :return: a ``StructType`` built from the merged field list.
    :raises IllegalArgumentException: if ``how`` is not a supported join type.
    """
    join_columns = [] if on is None else on

    left_keys, right_keys = get_on_fields(
        left_schema, right_schema, join_columns
    )

    # Non-key fields are collected before `how` is validated, so a malformed
    # schema is reported ahead of an invalid join type.
    remaining_left = [f for f in left_schema.fields if f not in left_keys]
    remaining_right = [f for f in right_schema.fields if f not in right_keys]

    left_keyed_joins = (
        INNER_JOIN, CROSS_JOIN, LEFT_JOIN, LEFT_ANTI_JOIN, LEFT_SEMI_JOIN
    )
    if how in left_keyed_joins:
        key_fields = left_keys
    elif how == RIGHT_JOIN:
        key_fields = right_keys
    elif how == FULL_JOIN:
        # Key fields of a full join are rebuilt from the left ones with
        # nullable forced to True.
        key_fields = [
            StructField(key.name, key.dataType, nullable=True)
            for key in left_keys
        ]
    else:
        message = "Invalid how argument in join: {0}".format(how)
        raise IllegalArgumentException(message)

    return StructType(fields=key_fields + remaining_left + remaining_right)
Esempio n. 2
0
def merge_rows_joined_on_values(left, right, left_schema, right_schema, how,
                                on):
    """
    Merge a pair of rows matched by a join on values into one output row.

    The output lists the join-key values first, then the non-key values of
    the left row and, for join types that expose the right side, the non-key
    values of the right row.

    :param left: row of the left side, or ``None`` when there is no match.
    :param right: row of the right side, or ``None`` when there is no match.
    :param left_schema: schema describing ``left``.
    :param right_schema: schema describing ``right``.
    :param how: join type constant.
    :param on: join columns; each one is used to index the row providing
        the key values.
    :return: the result of ``row_from_keyed_values`` on the merged
        (name, value) pairs.
    :raises IllegalArgumentException: if ``how`` is not a supported join type.
    """
    names_of_left = left_schema.names
    names_of_right = right_schema.names

    left_key_fields, right_key_fields = get_on_fields(
        left_schema, right_schema, on
    )

    # Key values come from the left row whenever it exists, otherwise from
    # the right one.
    key_source = right if left is None else left
    key_items = [(column, key_source[column]) for column in on]

    # A missing side is replaced by a row full of None for the join types
    # that keep unmatched rows of the opposite side.
    if left is None and how in (FULL_JOIN, RIGHT_JOIN):
        left = create_row(names_of_left, [None for _ in names_of_left])
    if right is None and how in (LEFT_JOIN, FULL_JOIN):
        right = create_row(names_of_right, [None for _ in names_of_right])

    def non_key_items(schema, row, key_fields):
        # Returns a generator expression on purpose: zip() is evaluated
        # right away while the pairs themselves are produced lazily.
        return ((field.name, value)
                for field, value in zip(schema.fields, row)
                if field not in key_fields)

    left_items = non_key_items(left_schema, left, left_key_fields)

    if how in (INNER_JOIN, CROSS_JOIN, LEFT_JOIN, FULL_JOIN, RIGHT_JOIN):
        right_items = non_key_items(right_schema, right, right_key_fields)
    elif how in (LEFT_SEMI_JOIN, LEFT_ANTI_JOIN):
        # Semi and anti joins contribute no value from the right side.
        right_items = ()
    else:
        message = "Argument 'how' cannot be '{0}'".format(how)
        raise IllegalArgumentException(message)

    merged_items = itertools.chain(key_items, left_items, right_items)
    return row_from_keyed_values(merged_items)
Esempio n. 3
0
    def otherwise(self, value):
        """
        Set the fallback result of a ``when()`` expression.

        The fallback is used for the rows matching none of the conditions.
        If :func:`Column.otherwise` is not invoked, None is returned for
        unmatched conditions.

        See :func:`pyspark.sql.functions.when` for example usage.

        :param value: a literal value, or a :class:`Column` expression.
        :return: a new :class:`Column` wrapping the updated expression.
        :raises IllegalArgumentException: if this column does not wrap a
            ``CaseWhen`` expression.

        >>> from pysparkling.sql import functions as F
        >>> from pysparkling import Context, Row
        >>> from pysparkling.sql.session import SparkSession
        >>> spark = SparkSession(Context())
        >>> df = spark.createDataFrame(
        ...   [Row(age=2, name='Alice'), Row(age=5, name='Bob')]
        ... )
        >>> df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()
        +-----+-------------------------------------+
        | name|CASE WHEN (age > 3) THEN 1 ELSE 0 END|
        +-----+-------------------------------------+
        |Alice|                                    0|
        |  Bob|                                    1|
        +-----+-------------------------------------+

        """
        if isinstance(self.expr, CaseWhen):
            fallback = parse(value)
            return Column(self.expr.set_otherwise(fallback))

        raise IllegalArgumentException(
            'otherwise() can only be applied on a Column previously generated by when()'
        )
Esempio n. 4
0
 def mode(self, saveMode):
     """
     Apply a save mode to the underlying writer (``self._jwrite``).

     :param saveMode: one of the values of ``WRITE_MODES``, or ``None`` to
         leave the writer untouched.
     :return: this object, so that calls can be chained.
     :raises IllegalArgumentException: if ``saveMode`` is neither ``None``
         nor one of ``WRITE_MODES``.
     """
     if saveMode is None:
         return self
     if saveMode not in WRITE_MODES:
         # The join separator only provides the quotes *between* two modes;
         # the outer pair comes from the template so that every accepted
         # mode is quoted: 'a', 'b', 'c'.
         raise IllegalArgumentException(
             "Unknown save mode: {0}. Accepted save modes are '{1}'.".format(
                 saveMode, "', '".join(WRITE_MODES)))
     self._jwrite = self._jwrite.mode(saveMode)
     return self
Esempio n. 5
0
    def join_on_values(self, other, on, how):
        """
        Join this object with ``other`` by comparing the values of ``on``.

        Rows of both sides are keyed, joined with the RDD operation matching
        ``how`` and each joined pair is merged into a single row with
        ``merge_rows_joined_on_values``.

        :param other: right side of the join, it must expose ``rdd()`` and
            ``bound_schema`` like this object.
        :param on: columns whose values form the join key.
        :param how: join type constant.
        :return: the RDD of merged rows.
        :raises IllegalArgumentException: if ``how`` is not a supported join
            type.
        """
        if how == CROSS_JOIN:

            def add_key(row):
                # Every row gets the same key, so all pairs are matched
                return True, row
        else:

            def add_key(row):
                # When joining on value, no check on schema (and lack of duplicated col) is done
                return tuple(row[on_column] for on_column in on), row

        left_keyed = self.rdd().map(add_key)
        right_keyed = other.rdd().map(add_key)

        if how == LEFT_JOIN:
            joined = left_keyed.leftOuterJoin(right_keyed)
        elif how == RIGHT_JOIN:
            joined = left_keyed.rightOuterJoin(right_keyed)
        elif how == FULL_JOIN:
            joined = left_keyed.fullOuterJoin(right_keyed)
        elif how in (INNER_JOIN, CROSS_JOIN):
            joined = left_keyed.join(right_keyed)
        elif how == LEFT_ANTI_JOIN:
            joined = left_keyed._leftAntiJoin(right_keyed)
        elif how == LEFT_SEMI_JOIN:
            joined = left_keyed._leftSemiJoin(right_keyed)
        else:
            message = "Invalid how argument in join: {0}".format(how)
            raise IllegalArgumentException(message)

        left_schema = self.bound_schema
        right_schema = other.bound_schema

        def format_output(entry):
            # The key is only needed for the join itself, drop it here
            _key, (left_row, right_row) = entry
            return merge_rows_joined_on_values(
                left_row, right_row, left_schema, right_schema, how, on
            )

        return joined.map(format_output)
Esempio n. 6
0
 def schema(self, schema):
     """
     Reject the schema argument.

     This implementation never accepts a schema and raises unconditionally:
     by default OptionUtils subclasses do not support schema.

     :param schema: ignored.
     :raises IllegalArgumentException: always.
     """
     message = "schema is not a valid argument for {0}".format(self.__class__)
     raise IllegalArgumentException(message)