Example #1
def track_cols(self, df, add=None, remove=None):
    """
    This track the columns that are updated by an Spark Operation. The function must be used at the end of the fucnion
    after all the transformations
    :param self:
    :param df: dataframe with the old column data.
    :param add: Columns to be added.
    :param remove: Columns to be removed.
    :return:
    """
    # Capture the original column names the first time the function is called
    if df._original_cols is None:
        df._original_cols = df.cols.names()

    # Add and/or remove tracked columns, de-duplicating through set()
    if add is not None:
        df._track_cols = (list(set(df._track_cols + val_to_list(add))))
    if remove is not None:
        df._track_cols = (list(
            set([
                item for item in df._track_cols
                if item not in val_to_list(remove)
            ])))

    self._track_cols = df._track_cols

    return self
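A minimal usage sketch (not from the library): it assumes Optimus binds track_cols to the dataframe object, that val_to_list turns a single string into a one-element list, and that the column names below are hypothetical.

# Hypothetical sketch: record the columns an operation added and removed.
df_out = df.cols.rename("birth_date", "dob")   # some Spark operation
df_out = df_out.track_cols(df_out, add="dob", remove="birth_date")
# df_out._track_cols now holds the tracked names with "dob" added and
# "birth_date" removed; duplicates are collapsed because the function
# goes through set().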
Example #2
    def append(rows):
        """
        Append a row at the end of a dataframe
        :param rows: List of values or tuples to be appended
        :return: Spark DataFrame
        """
        df = self

        if is_list_of_tuples(rows):
            columns = [str(i) for i in range(df.cols.count())]
            if not is_list_of_tuples(rows):
                rows = [tuple(rows)]
            new_row = op.Create.df(columns, rows)
            df_result = df.union(new_row)

        elif is_list_of_dataframes(rows) or is_dataframe(rows):
            row = val_to_list(rows)
            row.insert(0, df)
            df_result = append_df(row, like="rows")
        else:
            RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

        df_result = df_result.preserve_meta(self, Actions.NEST.value,
                                            df.cols.names())

        return df_result
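A usage sketch under the assumption that the function is exposed through the rows accessor (df.rows.append), the way Optimus groups row operations, and that op is an Optimus instance; the data below is made up.

# Hypothetical data, created with the same op.Create.df(columns, rows) call used above.
df = op.Create.df(["name", "age"], [("Alice", 34), ("Bob", 29)])
# Append a row given as a list of tuples
df = df.rows.append([("Carol", 41)])
# Append another dataframe (or a list of dataframes)
other = op.Create.df(["name", "age"], [("Dave", 52)])
df = df.rows.append(other)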
Example #3
def melt(self,
         id_vars,
         value_vars,
         var_name="variable",
         value_name="value",
         data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark Dataframe
    :param id_vars: column with unique values
    :param value_vars: Column names that are going to be converted to columns values
    :param var_name: Column name for vars
    :param value_name: Column name for values
    :param data_type: All columns must have the same type. It will transform all columns to this data type.
    :return:
    """

    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [
        F.struct(F.lit(c).alias(var_name),
                 F.col(c).alias(value_name)) for c in value_vars
    ]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [
        F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]
    ]

    return df.select(*cols)
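A short sketch of the wide-to-long conversion, assuming melt is reachable on the dataframe; the column names are hypothetical.

# Wide input:  id | height | weight   ->   Long output:  id | measure | reading
long_df = df.melt(
    id_vars="id",                     # kept on every output row
    value_vars=["height", "weight"],  # unpivoted into (variable, value) pairs
    var_name="measure",
    value_name="reading",
    data_type="str")                  # every column is cast to this type first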
Example #4
def columns_meta(self, value):
    """
    Shortcut to add transformations to a dataframe
    :param self:
    :param value:
    :return:
    """
    df = self
    value = val_to_list(value)
    for v in value:
        df = df.update_meta("transformations.columns", v, list)
    return df
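A sketch, assuming update_meta appends each value as a list entry under the given dotted key; the column names are hypothetical.

# Record that a transformation touched two columns; both names are appended
# to the "transformations.columns" list in the dataframe metadata.
df = df.columns_meta(["price", "discount"])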
Example #5
def action_meta(self, key, value):
    """
    Shortcut to add transformations to a dataframe
    :param self:
    :param key:
    :param value:
    :return:
    """
    df = self
    value = val_to_list(value)
    for v in value:
        df = df.update_meta("transformations.actions." + key, v, list)
    return df
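action_meta works the same way as columns_meta but nests the values under a per-action key, such as the Actions.DROP_ROW.value key used in Example #7; a hypothetical call:

# Record that a drop-row action affected the "age" column; the value is stored
# under "transformations.actions.<key>" in the dataframe metadata.
df = df.action_meta(Actions.DROP_ROW.value, "age")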
Example #6
    def is_in(columns, values):
        """
        Filter rows which columns that match a specific value
        :return: Spark DataFrame
        """

        # Ensure that we have a list
        values = val_to_list(values)

        # Create column/value expression
        column_expr = [(F.col(columns) == v) for v in values]

        # Combine the expressions with a logical OR
        expr = reduce(lambda a, b: a | b, column_expr)

        return self.rows.select(expr)
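A usage sketch, assuming the method is exposed through the rows accessor as in the other examples; the column name and values are hypothetical.

# Keep only the rows whose "color" column is either "red" or "blue".
df_filtered = df.rows.is_in("color", ["red", "blue"])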
Example #7
    def is_in(input_cols, values):
        """
        Filter rows which columns that match a specific value
        :return: Spark DataFrame
        """
        df = self

        # Ensure that we have a list
        values = val_to_list(values)

        # Create column/value expression
        column_expr = [(F.col(input_cols) == v) for v in values]

        # Combine the expressions with a logical OR
        expr = reduce(lambda a, b: a | b, column_expr)
        df = df.rows.select(expr)
        df = df.preserve_meta(self, Actions.DROP_ROW.value, df.cols.names())
        return df
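The OR chain built with reduce is equivalent to PySpark's built-in isin; a self-contained sketch of the same technique in plain PySpark (illustrative only, not part of the library).

from functools import reduce
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([("red",), ("green",), ("blue",)], ["color"])
values = ["red", "blue"]

# Same construction as is_in: one equality test per value, OR-ed together
expr = reduce(lambda a, b: a | b, [(F.col("color") == v) for v in values])
df.filter(expr).show()                          # rows with red or blue
df.filter(F.col("color").isin(values)).show()   # equivalent built-in form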