def track_cols(self, df, add=None, remove=None):
    """
    Track the columns that are updated by a Spark operation.

    Must be called at the end of a function, after all the transformations
    have been applied.

    :param self:
    :param df: Dataframe with the old column data.
    :param add: Columns to be added.
    :param remove: Columns to be removed.
    :return: self, with ``_track_cols`` synced from ``df``.
    """
    # Capture the original column set once, the first time tracking starts.
    if df._original_cols is None:
        df._original_cols = df.cols.names()

    if add is not None:
        # Merge the new columns into the tracked set, deduplicating.
        df._track_cols = list(set(df._track_cols + val_to_list(add)))

    if remove is not None:
        # Hoist val_to_list() out of the loop and use a set for O(1) lookups
        # (the original re-built the removal list for every tracked column).
        to_remove = set(val_to_list(remove))
        df._track_cols = list({item for item in df._track_cols if item not in to_remove})

    self._track_cols = df._track_cols
    return self
def append(rows):
    """
    Append rows at the end of a dataframe.

    :param rows: List of tuples (raw row values), a dataframe, or a list of
        dataframes to be appended.
    :return: Spark DataFrame
    """
    df = self
    if is_list_of_tuples(rows):
        # Build a throwaway dataframe with positional string column names
        # ("0", "1", ...) and union it with the original.
        # (Removed dead code: the original re-checked
        # `if not is_list_of_tuples(rows)` inside this branch, which can
        # never be true here.)
        columns = [str(i) for i in range(df.cols.count())]
        new_row = op.Create.df(columns, rows)
        df_result = df.union(new_row)
    elif is_list_of_dataframes(rows) or is_dataframe(rows):
        row = val_to_list(rows)
        row.insert(0, df)
        df_result = append_df(row, like="rows")
    else:
        RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

    # NOTE(review): Actions.NEST looks odd as the action tag for an append
    # operation — confirm the intended Actions member.
    df_result = df_result.preserve_meta(self, Actions.NEST.value, df.cols.names())
    return df_result
def melt(self, id_vars, value_vars, var_name="variable", value_name="value", data_type="str"):
    """
    Convert a DataFrame from wide to long format.

    :param self: Spark Dataframe
    :param id_vars: Column(s) with unique values, kept as identifiers.
    :param value_vars: Column names whose names/values become rows.
    :param var_name: Name of the output column holding the variable names.
    :param value_name: Name of the output column holding the values.
    :param data_type: All columns must share one type; every column is cast
        to this data type first.
    :return: Spark DataFrame in long format.
    """
    id_vars = val_to_list(id_vars)

    # Homogenize types so the exploded value column is well defined.
    result = self.cols.cast(id_vars + value_vars, data_type)

    # One struct per measured column: (column name, column value).
    pairs = [
        F.struct(F.lit(name).alias(var_name), F.col(name).alias(value_name))
        for name in value_vars
    ]

    # Explode the array of structs so each measured column yields one row.
    result = result.withColumn("vars_and_vals", F.explode(F.array(*pairs)))

    selection = id_vars + [
        F.col("vars_and_vals")[field].alias(field)
        for field in (var_name, value_name)
    ]
    return result.select(*selection)
def columns_meta(self, value):
    """
    Shortcut to register column transformations on a dataframe.

    :param self:
    :param value: Column name or list of column names.
    :return: Dataframe with updated meta.
    """
    result = self
    # Accept a scalar or a list; record each column under the same meta key.
    for col_name in val_to_list(value):
        result = result.update_meta("transformations.columns", col_name, list)
    return result
def action_meta(self, key, value):
    """
    Shortcut to register action transformations on a dataframe.

    :param self:
    :param key: Action name, appended to the meta path.
    :param value: Value or list of values to record under that action.
    :return: Dataframe with updated meta.
    """
    result = self
    # Accept a scalar or a list; record each entry under the action's key.
    for item in val_to_list(value):
        result = result.update_meta("transformations.actions." + key, item, list)
    return result
def is_in(columns, values):
    """
    Filter rows whose column matches any of the given values.

    :return: Spark DataFrame
    """
    # Normalize to a list so a single scalar value works too.
    candidates = val_to_list(values)

    # OR together one equality predicate per candidate value.
    predicate = reduce(
        lambda acc, cond: acc | cond,
        [F.col(columns) == v for v in candidates],
    )
    return self.rows.select(predicate)
def is_in(input_cols, values):
    """
    Filter rows whose column matches any of the given values, preserving
    drop-row metadata on the result.

    :return: Spark DataFrame
    """
    # Normalize to a list so a single scalar value works too.
    candidates = val_to_list(values)

    # OR together one equality predicate per candidate value.
    predicate = reduce(
        lambda acc, cond: acc | cond,
        [F.col(input_cols) == v for v in candidates],
    )

    filtered = self.rows.select(predicate)
    filtered = filtered.preserve_meta(self, Actions.DROP_ROW.value, filtered.cols.names())
    return filtered