def sort(col_sort):
    """
    Sort rows taking in account multiple columns
    :param col_sort: column and sort type combination (col_name, "asc")
    :type col_sort: list of tuples
    """
    # If a list of column names is given, order by desc. To specify the order of
    # every column, use a list of tuples (col_name, "asc"/"desc")
    df = self

    if is_list_of_str_or_int(col_sort):
        # Bare column names default to descending order
        col_sort = [(col_name, "desc") for col_name in col_sort]

    func = []
    for cs in col_sort:
        col_name = one_list_to_val(cs[0])
        order = cs[1]

        if order == "asc":
            sort_func = F.asc
        elif order == "desc":
            sort_func = F.desc
        else:
            # BUG FIX: previously this passed the unbound 'sort_func' variable,
            # raising UnboundLocalError instead of the intended ValueError
            RaiseIt.value_error(order, ["asc", "desc"])

        func.append(sort_func(col_name))
        df = df.preserve_meta(self, Actions.SORT_ROW.value, col_name)

    df = df.sort(*func)
    return df
def value_error(var=None, data_values=None):
    """
    Raise a ValueError exception describing the accepted values
    :param var: the offending variable/value
    :type var: any
    :param data_values: values accepted by the variable
    :type data_values: str/list
    :return:
    """
    from optimus.helpers.debug import get_var_name

    if not isinstance(data_values, list):
        data_values = [data_values]

    # Pick the connector used to join the accepted values in the message.
    # BUG FIX: 'divisor' was previously unbound when data_values was an
    # empty list, raising UnboundLocalError instead of the ValueError.
    if len(data_values) == 2:
        divisor = " or "
    elif len(data_values) > 2:
        divisor = ", "
    else:
        divisor = ""

    raise ValueError("'{var_name}' must be {type}, received '{var_type}'"
                     .format(var_name=get_var_name(var),
                             # str() guards against non-string accepted values
                             type=divisor.join("'" + str(x) + "'" for x in data_values),
                             var_type=one_list_to_val(var)))
def plot_hist(column_data=None, output=None, sub_title="", path=None):
    """
    Plot a histogram
    obj = {"col_name":[{'lower': -87.36666870117188, 'upper': -70.51333465576172, 'value': 0},
    {'lower': -70.51333465576172, 'upper': -53.66000061035157, 'value': 22094},
    {'lower': -53.66000061035157, 'upper': -36.80666656494141, 'value': 2},
    ...
    ]}
    :param column_data: column data in json format
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image
        and plot output the image to the notebook
    :param sub_title: plot subtitle
    :param path:
    :return: plot, image or base64
    """
    for col_name, data in column_data.items():
        # Collect the bin edges: every lower bound plus the final upper bound
        bins = []
        for d in data:
            bins.append(d['lower'])

        last = data[len(data) - 1]["upper"]
        bins.append(last)

        # Transform hist Optimus format to matplotlib format
        hist = []
        for d in data:
            if d is not None:
                hist.append(d["count"])

        array_bins = array(bins)
        center = (array_bins[:-1] + array_bins[1:]) / 2
        # assumes uniform bin widths — width is derived from the first pair only
        width = 0.9 * (array_bins[1] - array_bins[0])

        hist = one_list_to_val(hist)

        # Plot
        fig = plt.figure(figsize=(12, 5))
        plt.bar(center, hist, width=width)
        plt.title("Histogram '" + col_name + "' " + sub_title)

        # BUG FIX: string comparison must use '==', not 'is'. 'is' tests object
        # identity and only worked by accident of CPython string interning.
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            # Save image
            output_image(plt, path)
            print_html("<img src='" + path + "'>")
            # Print in jupyter notebook
        elif output == "plot":
            plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)
def __init__(self, df, col_name):
    """
    Validate the dataframe and resolve the target column.

    :param df: Spark Dataframe
    :param col_name: column name
    """
    # Fail fast on anything that is not a Spark dataframe
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    resolved = parse_columns(df, col_name)
    self.df = df
    self.col_name = one_list_to_val(resolved)
def name_col(col_names: str, append: str) -> str:
    """
    Build a standardized output column name. Use this whenever an output
    column must be named so naming stays consistent across the project.

    :param col_names: Column name
    :param append: string to be appended
    :return: the generated column name
    """
    names = val_to_list(col_names)

    if len(names) > 1:
        # Several source columns: join them, truncate to 10 chars, mark with "***"
        joined = "_".join(str(elem) for elem in names)
        base = joined[:10] + "***"
    else:
        base = one_list_to_val(names)

    return base + "_" + append.upper()
def filter_list(val, index=0):
    """
    Convert a list to None, int, str or a list filtering a specific index
    [] to None
    ['test'] to test

    :param val:
    :param index:
    :return:
    """
    # Empty input collapses to None
    if not val:
        return None

    picked = [row[index] for row in val]
    return one_list_to_val(picked)
def absolute_path(files, format="posix"):
    """
    Use the project base folder to construct an absolute path.

    :param files: path files
    :param format: posix or uri
    :return: a single path or a list of paths
    """
    paths = val_to_list(files)

    if format == "posix":
        converted = [Path(ROOT_DIR + f).as_posix() for f in paths]
    elif format == "uri":
        converted = [Path(ROOT_DIR + f).as_uri() for f in paths]
    else:
        # Raises ValueError listing the accepted formats
        RaiseIt.value_error(format, ["posix", "uri"])

    return one_list_to_val(converted)
def __init__(self, df, col_name, threshold):
    """
    Validate the inputs and resolve the target column.

    :param df: Spark Dataframe
    :param col_name:
    :param threshold: numeric cutoff value
    """
    # Validate both arguments before storing any state
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")
    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.col_name = one_list_to_val(parse_columns(df, col_name))
def is_column_a(df, column, dtypes):
    """
    Check if a single column matches any of the given data types.

    :param df: dataframe
    :param column: column to be compared with
    :param dtypes: types to be checked
    :return: True when the column's Spark type is one of the accepted types
    """
    cols = val_to_list(column)

    # Exactly one column is supported
    if len(cols) > 1:
        RaiseIt.length_error(cols, 1)

    accepted = tuple(val_to_list(parse_spark_dtypes(dtypes)))
    col = one_list_to_val(cols)

    # Compare the actual Spark data type against every accepted type
    return isinstance(df.schema[col].dataType, accepted)
def __init__(self, df, col_name, threshold):
    """
    :param df: Spark Dataframe
    :param col_name: column to analyze
    :param threshold: z-score cutoff value
    """
    # Validate arguments up front
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")
    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.col_name = one_list_to_val(parse_columns(df, col_name))
    # Temporary column that will hold the computed z-score
    self.tmp_col = name_col(col_name, "z_score")
    self.df_score = self.z_score()
    super().__init__(self.df_score, col_name, "z_score")
def __init__(self, df, col_name, threshold, relative_error=RELATIVE_ERROR):
    """
    :param df: Spark Dataframe
    :param col_name: column to analyze
    :param threshold: numeric cutoff value
    """
    # All three numeric/dataframe arguments are validated before any state is set
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")
    if not is_numeric(threshold):
        raise TypeError("Numeric expected")
    if not is_numeric(relative_error):
        raise TypeError("Numeric expected")

    self.df = df
    self.threshold = threshold
    self.relative_error = relative_error
    self.col_name = one_list_to_val(parse_columns(df, col_name))
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    Build a Pandas UDF that flags whether each value matches ``data_type``,
    or returns the inferred type name when ``get_type`` is True.

    :param col_name: Column to be process
    :param data_type: The data_type to be compared with
    :param get_type: Value to be returned as string or boolean
    :return: True or False
    """
    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    def pandas_udf_func(v):
        # Delegate per-value inference/matching to the shared Infer helper
        return v.apply(Infer.func, args=(data_type, get_type))

    # Asking for the type name yields strings, otherwise booleans
    return_data_type = "string" if get_type is True else "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    A Pandas UDF function that returns bool if the value match with the
    data_type param passed to the function. Also can return the data type.

    :param col_name: Column to be process
    :param data_type: The data_type to be compared with
    :param get_type: Value to be returned as string or boolean
    :return: True or False
    """
    from ast import literal_eval

    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    def pandas_udf_func(v):
        def _looks_boolean(text):
            # "true"/"false" in any case can be cast to boolean
            if text.lower() in ("true", "false"):
                return True

        def _looks_date(text):
            try:
                dateutil.parser.parse(text)
                return True
            except (ValueError, OverflowError):
                pass

        def _looks_array(text):
            # Tuples count as arrays because Spark can handle tuples as arrays
            try:
                cleaned = (text.encode('ascii', 'ignore')).decode("utf-8")
                if isinstance(literal_eval(cleaned), (list, tuple)):
                    return True
            except (ValueError, SyntaxError):
                pass

        def _infer(value):
            """
            Infer the castable type of a single value.

            :param value: value to be checked
            :return: type name, or a bool match result when get_type is False
            """
            # Order matters: bool before int (bool is an int subclass),
            # numeric checks before string parsing
            if isinstance(value, bool):
                inferred = "bool"
            elif fastnumbers.isint(value):
                inferred = "int"
            elif fastnumbers.isfloat(value):
                inferred = "float"
            elif isinstance(value, str):
                # String: try boolean, date and array parses in turn
                if _looks_boolean(value):
                    inferred = "bool"
                elif _looks_date(value):
                    inferred = "date"
                elif _looks_array(value):
                    inferred = "array"
                else:
                    inferred = "string"
            else:
                inferred = "null"

            if get_type is False:
                return inferred == data_type
            return inferred

        return v.apply(_infer)

    # Asking for the type name yields strings, otherwise booleans
    return_data_type = "string" if get_type is True else "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)