def sort(col_sort):
    """
    Sort rows by one or more columns.

    :param col_sort: Either a list of column names, in which case every column is sorted in
        descending order, or a list of (col_name, order) tuples where order is "asc" or "desc".
    :type col_sort: list of str/int, or list of tuples
    :return: The result of self.sort() called with one ordering expression per column
    :raises ValueError: If an order is neither "asc" nor "desc"
    """
    # NOTE(review): `self` is not a parameter of this function; it is resolved from the
    # enclosing scope (presumably the dataframe being extended) -- confirm against the caller.

    # A plain list of column names carries no order information, so default each one to "desc".
    # To control the order of every column pass a list of tuples (col_name, "asc").
    if is_list_of_str_or_int(col_sort):
        col_sort = [(col_name, "desc") for col_name in col_sort]

    func = []
    for cs in col_sort:
        col_name = one_list_to_val(cs[0])
        order = cs[1]

        if order == "asc":
            sort_func = F.asc
        elif order == "desc":
            sort_func = F.desc
        else:
            # Without this branch an unknown order either raised UnboundLocalError (first column)
            # or silently reused the ordering of the previous column.
            raise ValueError(
                "Invalid sort order {!r} for column {!r}; expected 'asc' or 'desc'".format(order, col_name))

        func.append(sort_func(col_name))

    return self.sort(*func)
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    Build a Pandas UDF expression that infers the data type of every value in a column.

    Each value is classified as "bool", "int", "float", "date", "array", "string" or "null".
    Depending on get_type the UDF either returns that label or whether it equals data_type.

    :param col_name: Column to be processed
    :param data_type: The data type to compare against. It is passed through parse_python_dtypes()
        before the comparison. Only used when get_type is falsy
    :param get_type: If truthy return the inferred type as a string, otherwise return a boolean
        telling whether the inferred type matches data_type
    :return: The column expression produced by applying the Pandas UDF to col_name
    """
    # Imported locally (after the docstring, so the docstring is actually attached to the function).
    from ast import literal_eval

    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    # Normalize once so the value returned per row and the declared UDF return type can never
    # disagree (identity checks against True/False diverged for values such as 1 or None).
    return_inferred_type = bool(get_type)

    def pandas_udf_func(v):
        def str_to_boolean(value):
            """
            Check if a str can be converted to boolean
            :param value: string to be checked
            :return: True if the string is "true" or "false" (case insensitive)
            """
            return value.lower() in ("true", "false")

        def str_to_date(value):
            """
            Check if a str can be parsed as a date by dateutil
            :param value: string to be checked
            :return: True if dateutil could parse it
            """
            try:
                dateutil.parser.parse(value)
            except (ValueError, OverflowError):
                # dateutil raises OverflowError when the parsed numbers do not fit in a C integer;
                # such a value is simply not a date and must not abort the whole UDF.
                return False
            return True

        def str_to_array(value):
            """
            Check if value can be parsed to a tuple or an array.
            Because Spark can handle tuples we will try to transform tuples to arrays
            :param value: string to be checked
            :return: True if the string is the literal representation of a list or a tuple
            """
            try:
                # Non ascii characters are dropped before evaluating the literal.
                parsed = literal_eval((value.encode('ascii', 'ignore')).decode("utf-8"))
            except (ValueError, SyntaxError, TypeError, RecursionError):
                # literal_eval may also raise TypeError (e.g. unhashable dict key) or
                # RecursionError (deeply nested input) on malformed data.
                return False
            return isinstance(parsed, (list, tuple))

        def func(value):
            """
            Infer the data type of a single value
            :param value: value to be checked
            :return: The inferred type name, or a bool telling if it matches data_type
            """
            # bool is tested first, before the isint()/isfloat() helpers get a chance to claim it.
            if isinstance(value, bool):
                _data_type = "bool"
            elif isint(value):  # Check if value is integer
                _data_type = "int"
            elif isfloat(value):
                _data_type = "float"
            # if string we try to parse it to bool, date or array
            elif isinstance(value, str):
                if str_to_boolean(value):
                    _data_type = "bool"
                elif str_to_date(value):
                    _data_type = "date"
                elif str_to_array(value):
                    _data_type = "array"
                else:
                    _data_type = "string"
            else:
                _data_type = "null"

            if return_inferred_type:
                return _data_type
            return bool(_data_type == data_type)

        return v.apply(func)

    return_data_type = "string" if return_inferred_type else "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)
def filter_row_by_data_type(col_name, data_type=None, get_type=False):
    """
    Build a Pandas UDF expression that infers the data type of every value in a column.

    Each value is classified as "bool", "int", "float", "date", "string" or "null".
    Depending on get_type the UDF either returns that label or whether it equals data_type.

    :param col_name: Column to be processed
    :param data_type: The data type to compare against. It is passed through parse_python_dtypes()
        before the comparison. Only used when get_type is falsy
    :param get_type: If truthy return the inferred type as a string, otherwise return a boolean
        telling whether the inferred type matches data_type
    :return: The column expression produced by applying the Pandas UDF to col_name
    """
    # NOTE(review): an earlier definition with the same name in this file additionally detects
    # "array" values. If both live in the same scope this one shadows it -- confirm which is intended.

    if data_type is not None:
        data_type = parse_python_dtypes(data_type)

    # Normalize once so the value returned per row and the declared UDF return type can never
    # disagree (identity checks against True/False diverged for values such as 1 or None).
    return_inferred_type = bool(get_type)

    def pandas_udf_func(v):
        def str_to_boolean(value):
            """
            Check if a str can be converted to boolean
            :param value: string to be checked
            :return: True if the string is "true" or "false" (case insensitive)
            """
            return value.lower() in ("true", "false")

        def str_to_date(value):
            """
            Check if a str can be parsed as a date by dateutil
            :param value: string to be checked
            :return: True if dateutil could parse it
            """
            try:
                dateutil.parser.parse(value)
            except (ValueError, OverflowError):
                # dateutil raises OverflowError when the parsed numbers do not fit in a C integer;
                # such a value is simply not a date and must not abort the whole UDF.
                return False
            return True

        def func(value):
            """
            Infer the data type of a single value
            :param value: value to be checked
            :return: The inferred type name, or a bool telling if it matches data_type
            """
            # bool is tested first, before the isint()/isfloat() helpers get a chance to claim it.
            if isinstance(value, bool):
                _data_type = "bool"
            elif isint(value):  # Check if value is integer
                _data_type = "int"
            elif isfloat(value):
                _data_type = "float"
            # if string we try to parse it to bool or date
            elif isinstance(value, str):
                if str_to_boolean(value):
                    _data_type = "bool"
                elif str_to_date(value):
                    _data_type = "date"
                else:
                    _data_type = "string"
            else:
                _data_type = "null"

            if return_inferred_type:
                return _data_type
            return bool(_data_type == data_type)

        return v.apply(func)

    return_data_type = "string" if return_inferred_type else "boolean"

    col_name = one_list_to_val(col_name)
    return F.pandas_udf(pandas_udf_func, return_data_type)(col_name)