def apply_by_dtypes(columns, func, func_return_type, args=None, func_type=None, data_type=None):
    """
    Apply a function using a pandas udf, or a plain udf if Apache Arrow is not available
    :param columns: Columns in which the function is going to be applied
    :param func: Function to be applied to the columns
    :param func_return_type: Return type of the function
    :param args: Arguments to be passed to the function
    :param func_type: pandas_udf or udf. If None, try to use a pandas udf (PyArrow needed)
    :param data_type: Only apply the function to cells matching this data type
    :return: Spark DataFrame
    """
    columns = parse_columns(self, columns)

    df = self
    for c in columns:
        # Apply the function only where the cell's inferred data type matches data_type,
        # chaining over df so that every column's transformation is kept
        df = df.cols.apply(c, func, func_return_type, args=args, func_type=func_type,
                           when=fbdt(c, data_type))
    return df
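
# Usage sketch for apply_by_dtypes. Assumptions: `df` is an Optimus-wrapped Spark
# DataFrame with a "num" column (as in the notebook cells further below), and this
# function is exposed on the columns accessor as df.cols.apply_by_dtypes.
# Adds 10 only to the cells of "num" whose value parses as an integer; every other
# cell is left untouched.

def add_attr(val, attr):
    return val + attr

df.cols.apply_by_dtypes("num", add_attr, "int", args=10, data_type="integer").table()
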
def _count_data_types(col_name):
    """
    Determine whether the values stored in a column are float, int, string, bool, date or null.
    :param col_name: Column to be analyzed
    :return: dict with the most frequent data type, its category and the detailed counts
    """
    temp = col_name + "_type"

    # Count values by inferred data type
    types = df.withColumn(temp, fbdt(col_name, get_type=True)).groupBy(temp).count().collect()
    count_by_data_type = {}
    for row in types:
        count_by_data_type[row[0]] = row[1]

    # Fill missing data types with 0
    count_by_data_type = fill_missing_var_types(count_by_data_type)

    # Subtract empty strings from the total string count
    count_empty_strings = df.where(F.col(col_name) == '').count()
    count_by_data_type['string'] = count_by_data_type['string'] - count_empty_strings

    data_types_count = {
        "string": count_by_data_type['string'],
        "bool": count_by_data_type['bool'],
        "int": count_by_data_type['int'],
        "float": count_by_data_type['float'],
        "date": count_by_data_type['date']
    }

    null_missed_count = {
        "null": count_by_data_type['null'],
        "missing": count_empty_strings,
    }

    # Get the data type with the greatest count
    greatest_data_type_count = max(data_types_count, key=data_types_count.get)

    # Map the winning data type to a broader category
    if greatest_data_type_count == "string":
        cat = "categorical"
    elif greatest_data_type_count in ("int", "float"):
        cat = "numeric"
    elif greatest_data_type_count == "date":
        cat = "date"
    elif greatest_data_type_count == "bool":
        cat = "bool"
    else:
        cat = "null"

    col = {
        'dtype': greatest_data_type_count,
        'type': cat,
        'details': {**data_types_count, **null_missed_count}
    }

    return col
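
# Illustrative sketch (hedged): for a hypothetical string column holding the cells
# "a", "1" and "b", two cells infer as string and one as int, so string wins and the
# column is categorized as categorical. The helper would return a dict shaped like:

expected_profile = {
    'dtype': 'string',
    'type': 'categorical',
    'details': {'string': 2, 'bool': 0, 'int': 1, 'float': 0, 'date': 0,
                'null': 0, 'missing': 0}
}
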
def drop_by_dtypes(col_name, data_type=None):
    """
    Drop rows by cell data type
    :param col_name: Column in which the filter is going to be applied
    :param data_type: Filter by string, integer, float or boolean
    :return: Spark DataFrame
    """
    validate_columns_names(self, col_name)
    return self.rows.drop(fbdt(col_name, data_type))
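
# Usage sketch for drop_by_dtypes. Assumptions: `df` has a "filter" column mixing
# strings and integer-like values (as in the notebook cells below), and the function
# is exposed on the rows accessor as df.rows.drop_by_dtypes.
# Removes every row whose "filter" cell parses as an integer:

df.rows.drop_by_dtypes("filter", "integer").table()
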
def select_by_dtypes(col_name, data_type=None):
    """
    Filter rows by the data type detected in each cell of a column.
    For example, given a column with
    | a |
    | 1 |
    | b |
    filtering by data_type = "integer" returns
    | 1 |
    :param col_name: Column to be filtered
    :param data_type: Data type used to filter values
    :return: Spark DataFrame
    """
    col_name = parse_columns(self, col_name)
    return self.where(fbdt(col_name, data_type))
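
# Usage sketch for select_by_dtypes, the inverse of drop_by_dtypes above. Same
# assumptions about `df`; keeps only the rows whose "filter" cell parses as an integer:

df.rows.select_by_dtypes("filter", "integer").table()
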
def _count_data_types(col_name):
    """
    Determine whether the values stored in a column are float, int, string, bool, date,
    array or null.
    :param col_name: Column to be analyzed
    :return: dict with the most frequent data type, its category and the detailed counts
    """
    logger.print("Processing column '" + col_name + "'...")

    temp = col_name + "_type"
    col_data_type = df.cols.dtypes(col_name)

    count_by_data_type = {}
    count_empty_strings = 0

    # Only run row-level inference for string columns. This is an optimization:
    # if the column's Spark data type is already integer, boolean, etc., the schema
    # is authoritative and no per-cell analysis is needed.
    if infer is True and col_data_type == "string":
        types = (df.h_repartition(col_name=col_name)
                 .withColumn(temp, fbdt(col_name, get_type=True))
                 .groupBy(temp).count()
                 .to_json())
        for row in types:
            count_by_data_type[row[temp]] = row["count"]

        count_empty_strings = df.where(F.col(col_name) == '').count()
    else:
        # Fast path: derive the counts from the schema data type and the null count
        nulls = df.cols.count_na(col_name)
        count_by_data_type[col_data_type] = int(df.count()) - nulls
        count_by_data_type["null"] = nulls

    # Fill missing data types with 0
    count_by_data_type = fill_missing_var_types(count_by_data_type)

    data_types_count = {
        "string": count_by_data_type['string'],
        "bool": count_by_data_type['bool'],
        "int": count_by_data_type['int'],
        "float": count_by_data_type['float'],
        "double": count_by_data_type['double'],
        "date": count_by_data_type['date'],
        "array": count_by_data_type['array']
    }

    null_missed_count = {
        "null": count_by_data_type['null'],
        "missing": count_empty_strings,
    }

    # Get the data type with the greatest count
    greatest_data_type_count = max(data_types_count, key=data_types_count.get)

    # Map the winning data type to a broader category
    if greatest_data_type_count == "string":
        cat = "categorical"
    elif greatest_data_type_count in ("int", "float", "double"):
        cat = "numeric"
    elif greatest_data_type_count == "date":
        cat = "date"
    elif greatest_data_type_count == "bool":
        cat = "bool"
    elif greatest_data_type_count == "array":
        cat = "array"
    else:
        cat = "null"

    col = {
        'dtype': greatest_data_type_count,
        'type': cat,
        'details': {**data_types_count, **null_missed_count}
    }

    return col
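
# Illustrative sketch of the fast path above (hedged; numbers are hypothetical).
# For a column whose Spark dtype is already "int", no per-cell inference runs: with
# 100 rows and 3 nulls, the counts come purely from the schema and the null count,
# and fill_missing_var_types pads the remaining types with 0.

fast_path_counts = {"int": 97, "null": 3}   # before fill_missing_var_types
fast_path_profile = {
    'dtype': 'int',
    'type': 'numeric',
    'details': {'string': 0, 'bool': 0, 'int': 97, 'float': 0, 'double': 0,
                'date': 0, 'array': 0, 'null': 3, 'missing': 0}
}
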
# +
def func(val, attr):
    return val + attr

df.cols.apply(["num", "new_col_1"], func, "int", 10).table()
# -

# ### Select rows where the column "filter" holds an "integer"

# +
from optimus.functions import filter_row_by_data_type as fbdt

df.rows.select(fbdt("filter", "integer")).table()
# -

# ### Create an abstract udf to filter rows where the value of column "num" > 1

# +
from optimus.functions import abstract_udf as audf

def func(val, attr):
    return val > 1

df.rows.select(audf("num", func, "boolean")).table()
# -