Beispiel #1
0
    def sort(col_sort):
        """
        Sort rows taking into account multiple columns.

        :param col_sort: either a list of column names (every column is then sorted in
        descending order) or a list of (col_name, order) tuples where order is "asc" or "desc"
        :type col_sort: list of str/int or list of tuples
        :return: the sorted dataframe
        """
        df = self

        # A bare list of column names means "desc" for every column. To control the order of
        # each column the caller passes (col_name, "asc"/"desc") tuples instead.
        if is_list_of_str_or_int(col_sort):
            col_sort = [(col_name, "desc") for col_name in col_sort]

        func = []
        for cs in col_sort:
            col_name = one_list_to_val(cs[0])
            order = cs[1]

            if order == "asc":
                sort_func = F.asc
            elif order == "desc":
                sort_func = F.desc
            else:
                # Report the offending order value. The sort function is not bound on this
                # path, so referencing it here would fail with UnboundLocalError.
                RaiseIt.value_error(order, ["asc", "desc"])

            func.append(sort_func(col_name))
            df = df.preserve_meta(self, Actions.SORT_ROW.value, col_name)

        df = df.sort(*func)
        return df
Beispiel #2
0
    def value_error(var=None, data_values=None):
        """
        Raise a ValueError exception listing the values accepted by a variable.

        :param var: the variable holding the rejected value; its name is looked up with
        get_var_name and its value is shown in the message
        :type var:

        :param data_values: values accepted by the variable
        :type data_values: str/list
        :return: never returns
        :raises ValueError: always
        """
        from optimus.helpers.debug import get_var_name

        if not isinstance(data_values, list):
            data_values = [data_values]

        # Two options read naturally as "'a' or 'b'"; any other count is comma separated.
        # With zero or one option the separator is never used, so the fallback keeps
        # `divisor` bound for every list length.
        if len(data_values) == 2:
            divisor = " or "
        else:
            divisor = ", "

        # str() lets non-string accepted values (numbers, None, ...) be rendered instead of
        # failing with a TypeError while the message is being built.
        accepted = divisor.join("'" + str(x) + "'" for x in data_values)

        raise ValueError("'{var_name}' must be {type}, received '{var_type}'"
                         .format(var_name=get_var_name(var),
                                 type=accepted,
                                 var_type=one_list_to_val(var)))
Beispiel #3
0
def plot_hist(column_data=None, output=None, sub_title="", path=None):
    """
    Plot a histogram
    obj = {"col_name":[{'lower': -87.36666870117188, 'upper': -70.51333465576172, 'count': 0},
    {'lower': -70.51333465576172, 'upper': -53.66000061035157, 'count': 22094},
    {'lower': -53.66000061035157, 'upper': -36.80666656494141, 'count': 2},
    ...
    ]}
    :param column_data: column data in json format; every bucket provides 'lower', 'upper' and 'count'
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image and plot output the
    image to the notebook
    :param sub_title: plot subtitle
    :param path: file path used when output is "image"
    :return: the base64 encoded image of the first column when output is "base64", otherwise None
    """

    for col_name, data in column_data.items():
        # Bin edges: the lower bound of every bucket plus the upper bound of the last one
        bins = [d['lower'] for d in data]
        bins.append(data[-1]["upper"])

        # Transform hist Optimus format to matplot lib format
        hist = [d["count"] for d in data if d is not None]

        array_bins = array(bins)
        center = (array_bins[:-1] + array_bins[1:]) / 2
        # Bar width is taken from the first bucket only
        width = 0.9 * (array_bins[1] - array_bins[0])

        hist = one_list_to_val(hist)

        # Plot
        fig = plt.figure(figsize=(12, 5))
        plt.bar(center, hist, width=width)
        plt.title("Histogram '" + col_name + "' " + sub_title)

        # Compare by value: identity checks against string literals depend on interning
        # and can silently skip the matching branch.
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            # Save image
            output_image(plt, path)
            # Print in jupyter notebook
            print_html("<img src='" + path + "'>")
        elif output == "plot":
            plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)
Beispiel #4
0
    def __init__(self, df, col_name):
        """
        Keep the dataframe and the column this object operates on.

        :param df: Spark Dataframe
        :param col_name: column name, resolved against df with parse_columns
        :raises TypeError: when df does not pass the is_dataframe check
        """
        if is_dataframe(df):
            self.df = df
            resolved = parse_columns(df, col_name)
            self.col_name = one_list_to_val(resolved)
        else:
            raise TypeError("Spark Dataframe expected")
Beispiel #5
0
def name_col(col_names: str, append: str) -> str:
    """
    Build the standard name for an output column so that naming stays uniform.

    Several column names are joined with "_", cut to 10 characters and marked with "***";
    a single column name is used as is. The upper-cased suffix is then appended after "_".
    :param col_names: Column name (or names)
    :param append: string to be appended, upper-cased in the result
    :return: the output column name
    """
    names = val_to_list(col_names)

    if len(names) <= 1:
        prefix = one_list_to_val(names)
    else:
        joined = '_'.join(map(str, names))
        prefix = joined[:10] + "***"

    return prefix + "_" + append.upper()
Beispiel #6
0
def filter_list(val, index=0):
    """
    Pick one position out of every element of a list and collapse the result
    with one_list_to_val.
    [] to None
    ['test'] to test

    :param val: sequence of indexable elements
    :param index: position taken from every element
    :return: None for an empty input, otherwise the collapsed selection
    """
    # Nothing to pick from
    if len(val) == 0:
        return None

    picked = [column[index] for column in val]
    return one_list_to_val(picked)
Beispiel #7
0
def absolute_path(files, format="posix"):
    """
    Prefix the given file paths with the project base folder (ROOT_DIR).
    :param files: path files
    :param format: posix or uri
    :return: the converted paths, collapsed with one_list_to_val
    """
    files = val_to_list(files)

    # Reject unknown formats before any path is built
    if format not in ("uri", "posix"):
        RaiseIt.value_error(format, ["posix", "uri"])

    def render(file):
        full_path = Path(ROOT_DIR + file)
        if format == "uri":
            return full_path.as_uri()
        return full_path.as_posix()

    return one_list_to_val([render(file) for file in files])
Beispiel #8
0
    def __init__(self, df, col_name, threshold):
        """
        Validate and store the dataframe, the threshold and the column to work on.

        :param df: Spark Dataframe
        :param col_name: column name, resolved against df with parse_columns
        :param threshold: numeric threshold
        :raises TypeError: when df is not a dataframe or threshold is not numeric
        """
        df_is_valid = is_dataframe(df)
        if not df_is_valid:
            raise TypeError("Spark Dataframe expected")
        self.df = df

        threshold_is_valid = is_numeric(threshold)
        if not threshold_is_valid:
            raise TypeError("Numeric expected")
        self.threshold = threshold

        resolved = parse_columns(df, col_name)
        self.col_name = one_list_to_val(resolved)
Beispiel #9
0
def is_column_a(df, column, dtypes):
    """
    Tell whether a single column of the dataframe has one of the given data types
    :param df: dataframe
    :param column: column to be compared with; only one column is accepted
    :param dtypes: types to be checked
    :return: True when the column's Spark data type is an instance of any of the parsed types
    """
    column = val_to_list(column)

    # Only one column can be checked at a time
    if len(column) > 1:
        RaiseIt.length_error(column, 1)

    parsed_dtypes = parse_spark_dtypes(dtypes)
    accepted_types = tuple(val_to_list(parsed_dtypes))
    column = one_list_to_val(column)

    # Look the column up in the schema and test its type
    field = df.schema[column]
    return isinstance(field.dataType, accepted_types)
Beispiel #10
0
    def __init__(self, df, col_name, threshold):
        """
        Validate the inputs, compute the z score dataframe and initialise the parent class with it.

        :param df: Spark Dataframe
        :param col_name: column name, resolved against df with parse_columns
        :param threshold: numeric threshold
        :raises TypeError: when df is not a dataframe or threshold is not numeric
        """
        df_is_valid = is_dataframe(df)
        if not df_is_valid:
            raise TypeError("Spark Dataframe expected")

        threshold_is_valid = is_numeric(threshold)
        if not threshold_is_valid:
            raise TypeError("Numeric expected")

        self.df = df
        self.threshold = threshold

        resolved = parse_columns(df, col_name)
        self.col_name = one_list_to_val(resolved)

        # Name of the column that holds the score, then the scored dataframe itself
        self.tmp_col = name_col(col_name, "z_score")
        self.df_score = self.z_score()

        super().__init__(self.df_score, col_name, "z_score")
Beispiel #11
0
    def __init__(self, df, col_name, threshold, relative_error=RELATIVE_ERROR):
        """
        Validate and store the dataframe, the numeric settings and the column to work on.

        :param df: Spark Dataframe
        :param col_name: column name, resolved against df with parse_columns
        :param threshold: numeric threshold
        :param relative_error: numeric relative error, RELATIVE_ERROR by default
        :raises TypeError: when df is not a dataframe or a numeric argument is not numeric
        """
        df_is_valid = is_dataframe(df)
        if not df_is_valid:
            raise TypeError("Spark Dataframe expected")

        threshold_is_valid = is_numeric(threshold)
        if not threshold_is_valid:
            raise TypeError("Numeric expected")

        relative_error_is_valid = is_numeric(relative_error)
        if not relative_error_is_valid:
            raise TypeError("Numeric expected")

        self.df = df
        self.threshold = threshold
        self.relative_error = relative_error

        resolved = parse_columns(df, col_name)
        self.col_name = one_list_to_val(resolved)
Beispiel #12
0
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    Apply Infer.func to a column through a Pandas UDF.
    Depending on get_type the UDF is declared as returning a string or a boolean.
    :param col_name: Column to be process
    :param data_type: The data_type to be compared with; parsed with parse_python_dtypes when given
    :param get_type: Value to be returned as string or boolean
    :return: the Pandas UDF applied to the column
    """

    parsed_type = data_type
    if parsed_type is not None:
        parsed_type = parse_python_dtypes(parsed_type)

    def pandas_udf_func(series):
        # Infer.func receives every value followed by the two extra arguments
        extra_args = (parsed_type, get_type)
        return series.apply(Infer.func, args=extra_args)

    # Declared return type of the UDF
    return_data_type = "string" if get_type is True else "boolean"

    single_col = one_list_to_val(col_name)
    udf = F.pandas_udf(pandas_udf_func, return_data_type)
    return udf(single_col)
Beispiel #13
0
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    A Pandas UDF function that returns bool if the value match with the data_type param passed to the function.
    Also can return the data type
    :param col_name: Column to be process
    :param data_type: The data_type to be compared with; parsed with parse_python_dtypes when given
    :param get_type: Value to be returned as string or boolean
    :return: the Pandas UDF applied to the column; it yields the inferred type name when
    get_type is True, otherwise True or False
    """
    from ast import literal_eval

    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    def pandas_udf_func(v):
        def str_to_boolean(value):
            """
            Check if a str can be converted to boolean
            :param value:
            :return: True for "true"/"false" in any letter case, otherwise False
            """
            return value.lower() in ("true", "false")

        def str_to_date(value):
            """
            Check if a str can be parsed as a date by dateutil
            :param value:
            :return: True when parsing succeeds, otherwise False
            """
            try:
                dateutil.parser.parse(value)
            except (ValueError, OverflowError):
                return False
            return True

        def str_to_array(value):
            """
            Check if value can be parsed to a tuple or and array.
            Because Spark can handle tuples we will try to transform tuples to arrays
            :param value:
            :return: True when the value evaluates to a list or tuple literal, otherwise False
            """
            # Non ascii characters are dropped before the value is evaluated
            cleaned = value.encode('ascii', 'ignore').decode("utf-8")
            try:
                parsed = literal_eval(cleaned)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                # literal_eval can fail with any of these on malformed input; a value that
                # cannot be evaluated is simply not an array and must not abort the UDF.
                return False
            return isinstance(parsed, (list, tuple))

        def func(value):
            """
            Check if a value can be casted to a specific
            :param value: value to be checked
            :return: the inferred type name when get_type is not False, otherwise whether
            the inferred type equals data_type
            """
            # bool is tested first so that booleans are not reported as numbers
            if isinstance(value, bool):
                _data_type = "bool"
            elif fastnumbers.isint(value):  # Check if value is integer
                _data_type = "int"
            elif fastnumbers.isfloat(value):
                _data_type = "float"
            # if string we try to parse it to bool, date or array
            elif isinstance(value, str):
                if str_to_boolean(value):
                    _data_type = "bool"
                elif str_to_date(value):
                    _data_type = "date"
                elif str_to_array(value):
                    _data_type = "array"
                else:
                    _data_type = "string"
            else:
                _data_type = "null"

            if get_type is False:
                return bool(_data_type == data_type)
            return _data_type

        return v.apply(func)

    if get_type is True:
        return_data_type = "string"
    else:
        return_data_type = "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)