Example 1
def is_column_a(df, column=None, dtypes="str"):
    """
    Check if a column matches a list of data types
    :param df: Spark or Dask dataframe
    :param column: column to be checked
    :param dtypes: data types to check against
    :return: True or False, or None if the dataframe type is not supported
    """
    column = val_to_list(column)

    if len(column) > 1:
        RaiseIt.length_error(column, 1)
    data_type = tuple(val_to_list(parse_dtypes(df, dtypes)))
    column = one_list_to_val(column)

    # Get the schema data type of the column
    # print("df",type(df),df)
    v = df.cols.schema_dtype(column)

    if is_spark_dataframe(df.data):
        result = isinstance(v, data_type)
    elif is_dask_dataframe(df):
        result = v in data_type
    else:
        result = None
    return result
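
A minimal usage sketch (hypothetical column and dtype names; it assumes an Optimus-style dataframe wrapper df exposing df.cols.schema_dtype and df.data, as the function above expects):

# Hypothetical check: is the "price" column an integer column?
if is_column_a(df, "price", "int"):
    print("price is an integer column")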
Example 2
def append(dfs, like="columns"):
    """
    Concatenate multiple DataFrames column-wise or row-wise
    :param dfs: list of DataFrames
    :param like: concatenate as "columns" or "rows"
    :return: the concatenated DataFrame
    """

    # FIX: because monotonically_increasing_id can produce a different
    # sequence for each dataframe, the result could be wrong.

    if like == "columns":
        temp_dfs = []
        col_temp_name = "id_" + random_int()

        dfs = val_to_list(dfs)
        for df in dfs:
            from pyspark.sql import functions as F
            temp_dfs.append(
                df.withColumn(col_temp_name, F.monotonically_increasing_id()))

        def _append(df1, df2):
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        from pyspark.sql import DataFrame
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
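
A hedged usage sketch, assuming a running SparkSession named spark and the helpers used by append (val_to_list, random_int, RaiseIt) in scope:

df_a = spark.createDataFrame([(1,), (2,)], ["x"])
df_b = spark.createDataFrame([("a",), ("b",)], ["y"])

df_cols = append([df_a, df_b], like="columns")  # columns x and y, matched by a generated id
df_rows = append([df_a, df_a], like="rows")     # the rows of df_a twice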
Example 3
def percentile_(col_name, df, values, relative_error):
    """
    Return the requested percentiles of a dataframe column
    :param col_name: '*', a list of column names, or a single column name
    :param df: dataframe to be processed
    :param values: list of percentiles to be calculated
    :param relative_error: if set to zero, the exact percentiles are computed, which can be very expensive. Values from 0 to 1 are accepted
    :return: a map expression of percentile -> value, or None if the column is not numeric
    """

    # Default percentiles; the values are stringified to build the SQL expression below

    if values is None:
        values = [0.05, 0.25, 0.5, 0.75, 0.95]

    values = val_to_list(values)
    values = list(map(str, values))

    if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
        # Get percentiles

        p = F.expr("percentile_approx(`{COLUMN}`, array({VALUES}), {ERROR})".format(COLUMN=col_name,
                                                                                    VALUES=" , ".join(values),
                                                                                    ERROR=relative_error))

        # Zip each percentile label with its computed value into a map expression
        expr = [[F.lit(v), p.getItem(i)] for i, v in enumerate(values)]
        expr = F.create_map(*list(itertools.chain(*expr)))

    else:
        expr = None
    # print(expr)
    return expr
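
A hedged sketch of how the returned expression might be consumed: it is a Spark Column built from aggregate functions, so it can be passed to an aggregation. df.data is assumed to be the underlying Spark dataframe, as in is_column_a above:

expr = percentile_("price", df, values=[0.25, 0.5, 0.75], relative_error=0)
if expr is not None:
    # One row containing a map of percentile -> value
    df.data.agg(expr.alias("percentiles")).show()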
Example 4
def columns(meta, value) -> dict:
    """
    Shortcut to cache the columns in a dataframe
    :param meta: meta data to be modified
    :param value: column name or list of column names to be cached
    :return: dict (Meta)
    """
    value = val_to_list(value)
    for v in value:
        meta = Meta.update(meta, "transformations.columns", v, list)
    return meta
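
A hypothetical usage sketch, assuming Meta.update stores values under dotted keys in a plain dict, as the code above suggests:

meta = {}
meta = columns(meta, ["price", "year"])
# meta now records "price" and "year" under "transformations.columns"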
Example 5
    def show(self, table_names="*", limit=None):
        """
        Print the tables in the database
        :param table_names: "*" for all tables, or a table name / list of table names
        :param limit: maximum number of rows to retrieve per table
        """
        db = self.db

        if table_names == "*":
            table_names = db.tables()
        else:
            table_names = val_to_list(table_names)

        print("Total Tables:" + str(len(table_names)))

        for table_name in table_names:
            db.table_to_df(table_name, "*", limit) \
                .table(title=table_name)
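
A hypothetical usage sketch on a database connection object (here called conn) that exposes this method:

conn.show()                    # print every table in the database
conn.show("users", limit=100)  # print only the "users" table, capped at 100 rows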
Example 6
def filter_col_name_by_dtypes(df, data_type):
    """
    Return the column names filtered by data type
    :param df: dataframe whose columns are going to be filtered
    :param data_type: data type(s) used to filter the columns
    :type data_type: str or list
    :return: list of column names matching the given data type(s)
    """
    data_type = parse_dtypes(df, data_type)
    data_type = tuple(val_to_list(data_type))
    # Filter columns by data type
    result = []
    for col_name in df.cols.names():
        if str(df.cols.schema_dtype(col_name)) in data_type:
            result.append(col_name)
    return result
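
A hedged usage sketch, again assuming an Optimus-style dataframe df and dtype names accepted by parse_dtypes:

numeric_cols = filter_col_name_by_dtypes(df, ["int", "float"])
string_cols = filter_col_name_by_dtypes(df, "str")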
Example 7
    def table_to_df(self, table_name, schema="public", columns="*", limit=None, partition_column=None,
                    num_partition=NUM_PARTITIONS):
        """
        Return a Spark dataframe built from a specific table
        :param partition_column: column used to partition the read
        :param table_name: name of the table to read
        :param schema: database schema (defaults to self.schema)
        :param columns: "*" or a list of column names to retrieve
        :param limit: how many rows will be retrieved ("all" also prints the total row count)
        :param num_partition: number of partitions used to read the table
        """

        if schema is None:
            schema = self.schema

        # db_table = table_name

        query = self.driver_context.count_query(db_table=table_name)
        if limit == "all":
            count = self.execute(query, "all").first()[0]

            # Count the rows to warn the user how long bringing in the whole table may take
            print(str(int(count)) + " rows")

        if columns == "*":
            columns_sql = "*"
        else:
            columns = val_to_list(columns)
            columns_sql = ",".join(columns)

        # min, max = self.execute(query, limit)

        query = "SELECT " + columns_sql + " FROM " + table_name

        logger.print(query)
        df = self.execute(query, limit, partition_column=partition_column, table_name=table_name,
                          num_partitions=num_partition)

        # Bring the data to the local machine; otherwise every action we call would be
        # retrieved again from the remote server
        df = df.run()
        return df
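
A hypothetical call on the same connection object (conn), selecting two columns and using the "all" limit that triggers the row count:

# Prints the row count and the generated SELECT query, then returns the dataframe
df = conn.table_to_df("sales", columns=["id", "amount"], limit="all")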
Example 8
def validate_columns_names(df, col_names: [str, list], index=0):
    """
    Check if a string or list of strings are valid Spark columns
    :param df: dataframe to be analyzed
    :param col_names: column names to be checked
    :param index: when tuples are passed, the position of the column name inside each tuple
    :return: True if all the columns exist
    """

    columns = val_to_list(col_names)

    if is_list_of_tuples(columns):
        columns = [c[index] for c in columns]

    # Remove duplicates in the list

    if is_list_of_str(columns):
        columns = OrderedSet(columns)

    check_for_missing_columns(df, columns)

    return True
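
A hedged sketch, assuming a dataframe df that actually contains the columns "year" and "price":

validate_columns_names(df, "year")                       # -> True
validate_columns_names(df, [("year", 1), ("price", 2)])  # tuples: element 0 is the column name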
Example 9
def name_col(col_names: str, append: str = None) -> str:
    """
    Use this function whenever you need to name an output column. It ensures we follow a standard naming convention
    :param col_names: column name or list of column names
    :param append: string to be appended
    :return: output column name
    """
    separator = "_"

    col_names = val_to_list(col_names)
    if len(col_names) > 1 and append is None:
        output_col = ('_'.join(str(elem) for elem in col_names))
    elif len(col_names) > 1:
        output_col = ('_'.join(str(elem)
                               for elem in col_names))[:10] + separator
    else:
        output_col = one_list_to_val(col_names)

    if append is not None:
        append = separator + str(one_list_to_val(append))
    else:
        append = ""
    return output_col + append
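
A quick sketch of the naming convention this produces, assuming val_to_list and one_list_to_val behave as in the snippets above:

name_col("price", "max")     # -> "price_max"
name_col(["year", "price"])  # -> "year_price"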
Example 10
def equal_function(f1, f2):
    """
    Check whether f1 matches any of the functions in f2, comparing by __name__
    :param f1: function to look for
    :param f2: function or list of functions to compare against
    :return: True if a function with the same name is found, False otherwise
    """
    f2 = val_to_list(f2)
    for func in f2:
        if f1.__name__ == func.__name__:
            return True
    return False
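
A small sketch (the functions below are hypothetical): equal_function matches by __name__, so it returns True only when a function in f2 shares f1's name:

def to_upper(value): return value.upper()
def to_lower(value): return value.lower()

equal_function(to_upper, [to_upper, to_lower])  # -> True
equal_function(to_upper, to_lower)              # -> False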
Example 11
def get_output_cols(input_cols,
                    output_cols,
                    merge=False,
                    auto_increment=False):
    """
    Construct the output column names from the input column names.
    If the function receives a list of input columns and one output column name, it appends
    that name to each input column (see the merge behaviour below).

    If output_cols is None:
    input_cols = ['col1'], output_cols = None -> output_cols = ['col1']

    If merge is True:

    For one column:
    input_cols = ['col1'], output_cols = 'new'
    Result:
    output_cols = ['col1_new']

    For multiple columns:
    input_cols = ['col1', 'col2', ..., 'colN'], output_cols = 'new'
    Result:
    output_cols = ['col1_new', 'col2_new', ..., 'colN_new']

    If merge is False:
    For one column:
    input_cols = ['col1'], output_cols = 'new'
    Result:
    output_cols = ['new']

    :param input_cols: list of input column names
    :param output_cols: output column name(s)
    :param merge: append the output column name to each input column name
    :param auto_increment: append an incrementing numeric suffix to the input column names
    :return: list of output column names
    """

    output_cols = val_to_list(output_cols)

    # if is_list(input_cols) and is_list(output_cols):
    #     if len(input_cols) != len(output_cols):
    #         RaiseIt.length_error(input_cols, output_cols)

    if output_cols is None:
        output_cols = val_to_list(input_cols)
    else:
        output_cols = val_to_list(output_cols)

        # if auto_increment is not None:

    if auto_increment is True:
        # input_cols = input_cols * auto_increment
        # output_cols = val_to_list(output_cols)
        # print("LEAN @", len(input_cols)/auto_increment)
        # r = int(len(input_cols) / auto_increment)
        r = list(range(auto_increment)) * 2
        output_cols = [
            col_name + "_" + str(i) for i, col_name in zip(r, input_cols)
        ]
    elif merge is True:
        output_cols = val_to_list(output_cols)
        output_cols = list(
            [name_col(input_col, output_cols) for input_col in input_cols])

    return output_cols
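
A sketch of the behaviours described in the docstring, with hypothetical column names and assuming name_col and val_to_list as defined above:

get_output_cols(["col1"], "new")                      # -> ["new"]
get_output_cols(["col1", "col2"], "new", merge=True)  # -> ["col1_new", "col2_new"]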
Example 12
def prepare_columns(df,
                    input_cols: [str, list],
                    output_cols: [str, list] = None,
                    is_regex=None,
                    filter_by_column_dtypes=None,
                    accepts_missing_cols=False,
                    invert: bool = False,
                    default=None,
                    columns=None,
                    auto_increment=False,
                    args=None):
    """
    One input column -> same output column. lower(), upper()
    One input column -> one output column. copy()
    One input column -> multiple output columns. unnest()
    Multiple input columns -> one output column. nest()
    Multiple input columns -> multiple output columns. lower(), upper()

    For multiple outputs we can pass a string, a list of columns, or simply enumerate the output columns.
    See get_output_cols() for more info

    Returns an iterator over the input and output columns
    :param df: dataframe against which to check that the input columns are valid
    :param input_cols: input column names
    :param output_cols: output column names
    :param is_regex: the input columns are given as a regex
    :param filter_by_column_dtypes: filter the column selection by data type
    :param accepts_missing_cols: do not check that the input columns exist
    :param invert: invert the selection
    :param default: default column name if output_cols is not provided
    :param columns: in case you already have an input/output mapping defined. {incol1: outcol1, incol2: outcol2}
    :param auto_increment: number of times to repeat the input columns, numbering the outputs
    :param args: extra argument(s) to pair with each input/output column
    :return: an iterator of (input_col, output_col) or (input_col, output_col, args) tuples
    """
    if columns:
        result = zip(*columns)
    else:
        input_cols = parse_columns(df, input_cols, is_regex,
                                   filter_by_column_dtypes,
                                   accepts_missing_cols, invert)
        merge = False
        if output_cols is None and default is not None:
            output_cols = default
            merge = True

        elif auto_increment is not False:
            input_cols = input_cols * auto_increment

        if output_cols is not None and (len(input_cols) != len(
                val_to_list(output_cols))):
            merge = True

        output_cols = get_output_cols(input_cols,
                                      output_cols,
                                      merge=merge,
                                      auto_increment=auto_increment)

        if args is None:
            result = zip(input_cols, output_cols)
        else:
            args = val_to_list(args)
            if len(args) == 1:
                args = args * len(input_cols)

            result = zip(input_cols, output_cols, args)
    return result
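
A hedged usage sketch: iterating the input/output pairs, assuming an Optimus-style dataframe df with columns "year" and "price":

for input_col, output_col in prepare_columns(df, ["year", "price"], default="cast"):
    print(input_col, output_col)  # -> "year year_cast", then "price price_cast"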
Example 13
def parse_columns(df,
                  cols_args,
                  is_regex=None,
                  filter_by_column_dtypes=None,
                  accepts_missing_cols=False,
                  invert=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.
    Accepts '*' as a parameter, in which case it returns a list of all the columns in the dataframe.
    Also accepts a regex.
    If a list of tuples is passed, the first element of each tuple is taken as the column name and the
    remaining elements as params. These params can be used to create custom transformation functions.
    You can find an example in cols().cast()
    :param df: dataframe in which the columns are going to be checked
    :param cols_args: accepts '*' as a param to return all the columns in the dataframe
    :param is_regex: regex used to match column names, if the columns are given as a regex
    :param filter_by_column_dtypes: data type(s) by which the column list is going to be filtered
    :param accepts_missing_cols: if True, do not check that the columns exist in the dataframe
    :param invert: invert the final selection. For example if you want to select the non-integer columns

    :return: a list of column names as strings
    """

    # if the columns value is '*', get all of the dataframe's columns

    df_columns = df.cols._names()
    if is_regex is not None:
        r = re.compile(is_regex)
        cols = list(filter(r.match, df_columns))

    elif cols_args == "*" or cols_args is None:
        cols = df_columns
    elif is_int(cols_args):
        cols = val_to_list(df_columns[cols_args])
    elif is_list_of_int(cols_args):
        cols = list(df_columns[i] for i in cols_args)

    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        # In case we have a list of tuples, the first element of each tuple is taken as the column name
        # and the rest as params. We can use the params in a custom function as follows:
        # def func(attrs):  # attrs receives (1, 2) and (3, 4)
        #     return attrs[0] + 1
        # df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)

        # Make sure we have a list of tuples

        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df_columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_list(filter_by_column_dtypes):
        filter_by_column_dtypes = [
            item for sublist in filter_by_column_dtypes for item in sublist
        ]

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df,
                                                     filter_by_column_dtypes)

        # Intersect the columns filtered by data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns do not match the filtered data type
        columns_residual = list(
            OrderedSet(cols) - OrderedSet(columns_filtered))

    else:
        final_columns = cols

    cols_params = []
    if invert:
        final_columns = list(
            OrderedSet(df_columns) - OrderedSet(final_columns))

    cols_params = final_columns

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) were not processed because they are not of type",
                     ",".join(filter_by_column_dtypes))

    # if because of filtering we got 0 columns return None
    if len(cols_params) == 0:
        cols_params = None
        logger.print("Outputting 0 columns after filtering. Is this expected?")

    return cols_params
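
A hedged sketch of the accepted selectors, assuming an Optimus-style dataframe df:

parse_columns(df, "*")                                 # every column name
parse_columns(df, ["year", "price"])                   # validate and return these names
parse_columns(df, 0)                                   # select a column by index
parse_columns(df, "*", filter_by_column_dtypes="int")  # only the integer columns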