def nest(input_cols, output_col, shape="string", separator=""):
    """
    Concat multiple columns to one with the format specified.

    ``self`` is not a parameter; it is read from the enclosing scope.

    :param input_cols: columns to be nested. When the list contains Spark Column objects,
        the remaining plain values are wrapped with ``F.lit``; otherwise the value is
        resolved through ``parse_columns``
    :param output_col: final column with the nested content
    :param shape: final data type, 'array', 'string' or 'vector'
    :param separator: char to be used as separator at the concat time (only used by 'string')
    :return: Spark DataFrame
    """
    df = self

    if has_(input_cols, F.Column):
        # Transform non Column data to lit
        columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
    else:
        columns = parse_columns(self, input_cols)

    # Strings are compared with ==. "is" checks object identity, which only works for
    # interned literals and raises a SyntaxWarning on Python 3.8+.
    if shape == "vector":
        # The assembler is only fed the numeric columns
        columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        vector_assembler = VectorAssembler(inputCols=columns, outputCol=output_col)
        df = vector_assembler.transform(df)
    elif shape == "array":
        df = apply_expr(output_col, F.array(*columns))
    elif shape == "string":
        df = apply_expr(output_col, F.concat_ws(separator, *columns))
    else:
        RaiseIt.value_error(shape, ["vector", "array", "string"])

    return df
def _mad(self, action):
    """
    Select or drop the rows lying outside ``median +/- threshold * mad`` for each column.

    Reads ``self.df``, ``self.columns`` and ``self.threshold``. The bounds of every column
    are computed on the dataframe as already filtered by the previous columns.

    :param action: "select" passes the out-of-bounds condition to ``df.rows.select``,
        "drop" passes it to ``df.rows.drop``. Any other value leaves the dataframe unchanged.
    :type action: str
    :return: the resulting dataframe
    :raises TypeError: if ``self.df`` is not a Spark dataframe or the threshold is not an integer
    """
    df = self.df
    columns = self.columns
    threshold = self.threshold

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_int(threshold):
        raise TypeError("Integer expected")

    columns = parse_columns(df, columns)
    for c in columns:
        mad_value = df.cols.mad(c, more=True)
        lower_bound = mad_value["median"] - threshold * mad_value["mad"]
        upper_bound = mad_value["median"] + threshold * mad_value["mad"]
        out_of_bounds = (F.col(c) > upper_bound) | (F.col(c) < lower_bound)

        # Compare strings with ==; "is" tests identity and is unreliable for str values
        if action == "select":
            df = df.rows.select(out_of_bounds)
        elif action == "drop":
            df = df.rows.drop(out_of_bounds)

    return df
def decision_tree(df, columns, input_col, **kargs):
    """
    Runs a decision tree classifier for input DataFrame.

    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict. It must be one of the selected columns.
    :param kargs: keyword arguments forwarded to ``DecisionTreeClassifier``.
    :return: tuple ``(df_model, dt_model)``: the transformed DataFrame and the fitted model.
    :raises TypeError: if ``df`` is not a Spark dataframe or ``input_col`` is not a string.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    columns = parse_columns(df, columns)

    # Explicit check instead of assert: asserts are stripped when Python runs with -O
    if not isinstance(input_col, str):
        raise TypeError("Error, input column must be a string")

    data = df.select(columns)
    # Every selected column except the target is used as a feature
    feats = data.columns
    feats.remove(input_col)

    df = string_to_index(df, input_cols=input_col)
    df = vector_assembler(df, input_cols=feats)

    model = DecisionTreeClassifier(**kargs)

    # The indexed target column is renamed to "label" before fitting
    df = df.cols.rename([(input_col + "_index", "label")])

    dt_model = model.fit(df)
    df_model = dt_model.transform(df)
    return df_model, dt_model
def hist(columns, min_value, max_value, buckets=10):
    """
    Build the histogram of a column as a list of bucket dictionaries.

    ``self`` is read from the enclosing scope.

    :param columns: Columns to be processed
    :param min_value: Min value used to calculate the buckets
    :param max_value: Max value used to calculate the buckets
    :param buckets: Number of buckets
    :return: list of ``{"lower", "upper", "count"}`` dictionaries
    """
    columns = parse_columns(self, columns)

    # NOTE(review): the histogram is rebuilt on every iteration, so only the one of the
    # last processed column is returned.
    for name in columns:
        bucket_col = name + "_buckets"

        # Bucket limits
        splits = create_buckets(min_value, max_value, buckets)

        # Assign every row to its bucket and count the rows per bucket
        bucketed = bucketizer(self, name, splits=splits)
        grouped = bucketed.groupBy(bucket_col).agg(F.count(bucket_col).alias("count"))
        counts = grouped.cols.rename(bucket_col, "value").sort(F.asc("value")).to_json()

        # Counts and splits are paired by position
        histogram = [
            {"lower": split["lower"], "upper": split["upper"], "count": row["count"]}
            for row, split in zip(counts, splits)
        ]

    return histogram
def apply_by_dtypes(columns, func, func_return_type, args=None, func_type=None, data_type=None):
    """
    Apply a function using pandas udf or udf if apache arrow is not available.

    The function is applied through ``cols.apply`` with ``fbdt(column, data_type)`` as the
    ``when`` expression. ``self`` is read from the enclosing scope.

    :param columns: Columns in which the function is going to be applied
    :param func: Functions to be applied to a columns
    :param func_return_type: function return type, forwarded to ``cols.apply``
    :param args: Arguments forwarded to ``cols.apply``
    :param func_type: pandas_udf or udf. If none try to use pandas udf (Pyarrow needed)
    :param data_type: data type forwarded to ``fbdt`` to build the ``when`` expression
    :return: the dataframe with the function applied to every requested column
    """
    columns = parse_columns(self, columns)

    # Accumulate on df: applying every column to self would discard the previous
    # iterations, and df would be undefined when there are no columns.
    df = self
    for c in columns:
        df = df.cols.apply(c, func, func_return_type, args=args, func_type=func_type,
                           when=fbdt(c, data_type))
    return df
def table_html(self, limit=100, columns=None):
    """
    Render the dataframe columns, data types and values as an HTML table.

    :param self: dataframe to be rendered
    :param limit: how many rows will be printed
    :param columns: Columns to be printed
    :return: the HTML produced by the ``table.html`` template
    """
    columns = parse_columns(self, columns)
    data = self.select(columns).limit(limit).to_json()

    # The template lives in the "templates" folder next to this module's parent folder
    base_dir = os.path.dirname(os.path.abspath(__file__))
    loader = jinja2.FileSystemLoader(searchpath=base_dir + "//../templates")
    environment = jinja2.Environment(loader=loader, autoescape=True)
    template = environment.get_template("table.html")

    # Keep the (name, dtype) pairs of the requested columns only
    dtypes = [pair for pair in self.dtypes if pair[0] in columns]

    # Never report more rows than the dataframe has
    total_rows = self.count()
    limit = min(limit, total_rows)

    return template.render(cols=dtypes, data=data, limit=limit, total_rows=total_rows,
                           total_cols=self.cols.count())
def nest(input_cols, output_col, shape=None, separator=" "):
    """
    Concat multiple columns to one with the format specified.

    ``self`` is read from the enclosing scope.

    :param input_cols: columns to be nested
    :param output_col: final column with the nested content
    :param shape: final data type, 'array', 'string' or 'vector'. Any other value
        (including the default None) is reported through ``RaiseIfNot.value_error``
    :param separator: char to be used as separator at the concat time (only used by 'string')
    :return: Spark DataFrame
    """
    columns = parse_columns(self, input_cols)
    df = self

    # Strings are compared with ==. "is" checks object identity, which is not
    # guaranteed to hold for equal strings.
    if shape == "vector":
        # Use the parsed column list so the assembler receives the same columns as the
        # other shapes do, instead of the raw input_cols value
        vector_assembler = VectorAssembler(inputCols=columns, outputCol=output_col)
        df = vector_assembler.transform(self)
    elif shape == "array":
        df = apply_expr(output_col, F.array(*columns))
    elif shape == "string":
        df = apply_expr(output_col, F.concat_ws(separator, *columns))
    else:
        RaiseIfNot.value_error(shape, ["vector", "array", "string"])

    return df
def percentile(columns, values=None, error=1):
    """
    Compute the requested percentiles per column and log the elapsed time.

    ``self`` is read from the enclosing scope.

    :param columns: '*', list of columns names or a single column name.
    :param values: list of percentiles to be calculated. Defaults to
        [0.05, 0.25, 0.5, 0.75, 0.95]
    :param error: third argument handed to ``approxQuantile``
    :return: percentiles per columns
    """
    started_at = timeit.default_timer()

    values = [0.05, 0.25, 0.5, 0.75, 0.95] if values is None else values
    columns = parse_columns(self, columns)

    per_column = []
    for name in columns:
        # Rows without a value are discarded and the column is cast to double first
        without_na = self.rows.drop_na(name)
        as_double = without_na.cols.cast(name, "double")
        quantiles = as_double.approxQuantile(name, values, error)
        per_column.append(dict(zip(values, quantiles)))

    result = dict(zip(columns, per_column))

    logging.info("percentile")
    logging.info(timeit.default_timer() - started_at)
    return format_dict(result)
def _iqr(self, action):
    """
    Select or drop outliers using the interquartile range.

    For every column the bounds are ``q1 - 1.5 * iqr`` and ``q3 + 1.5 * iqr``. They are
    computed on the dataframe as already filtered by the previous columns.

    :param action: "drop" passes the out-of-bounds condition to ``df.rows.drop``,
        "select" passes it to ``df.rows.select``. Any other value leaves the dataframe unchanged.
    :return: the resulting dataframe
    :raises TypeError: if ``self.df`` is not a Spark dataframe
    """
    df = self.df
    columns = self.columns

    if not is_dataframe(self.df):
        raise TypeError("Spark Dataframe expected")

    columns = parse_columns(self.df, columns)

    for col_name in columns:
        iqr = df.cols.iqr(col_name, more=True)
        lower_bound = iqr["q1"] - (iqr["iqr"] * 1.5)
        upper_bound = iqr["q3"] + (iqr["iqr"] * 1.5)
        out_of_bounds = (F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound)

        # Compare strings with ==; "is" tests identity and is unreliable for str values
        if action == "drop":
            df = df.rows.drop(out_of_bounds)
        elif action == "select":
            df = df.rows.select(out_of_bounds)

    return df
def z_score(df, columns, threshold=None):
    """
    Drop outliers using the z score.

    :param df: Spark dataframe to be filtered
    :param columns: columns to be processed
    :param threshold: integer; the condition ``z score > threshold`` is handed to ``rows.drop``
    :return: the filtered dataframe
    :raises TypeError: if ``df`` is not a Spark dataframe or ``threshold`` is not an integer
    """
    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")
    if not is_int(threshold):
        raise TypeError("Integer expected")

    for name in parse_columns(df, columns):
        # The z score column is always named "z_col_" plus the column name
        score_col = "z_col_" + name

        scored = df.cols.z_score(name)
        filtered = scored.rows.drop(F.col(score_col) > threshold)
        # The helper column is removed again
        df = filtered.cols.drop(score_col)

    return df
def random_forest(df, columns, input_col, **kargs):
    """
    Fit a random forest classifier on the input DataFrame and run the prediction.

    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict. It must be one of the selected columns.
    :param kargs: keyword arguments forwarded to ``RandomForestClassifier``.
    :return: tuple ``(df_model, rf_model)``: the transformed DataFrame and the fitted model.
    """
    selected = parse_columns(df, columns)

    # Every selected column except the target is a feature
    feats = df.select(selected).columns
    feats.remove(input_col)

    indexed = string_to_index(df, input_cols=input_col)
    assembled = vector_assembler(indexed, input_cols=feats)

    classifier = RandomForestClassifier(**kargs)

    # The indexed target column is renamed to "label" before fitting
    train = assembled.cols.rename([(input_col + "_index", "label")])

    rf_model = classifier.fit(train)
    return rf_model.transform(train), rf_model
def _z_score(self, action):
    """
    Select or drop outliers using the z score.

    Reads ``self.df``, ``self.columns`` and ``self.threshold``.

    :param action: "drop" passes the condition ``z score > threshold`` to ``rows.drop``,
        "select" passes it to ``rows.select``. Any other value leaves the dataframe unchanged.
    :return: the resulting dataframe, without the helper z score column
    :raises TypeError: if ``self.df`` is not a Spark dataframe or the threshold is not numeric
    """
    df = self.df
    columns = self.columns
    threshold = self.threshold

    if not is_dataframe(df):
        raise TypeError("Spark Dataframe expected")

    if not is_numeric(threshold):
        raise TypeError("Numeric expected")

    columns = parse_columns(df, columns)

    for col_name in columns:
        # Name of the helper column holding the z score of col_name
        z_col_name = _z_score_col_name(col_name)

        # Compare strings with ==; "is" tests identity and is unreliable for str values.
        # Both branches pass z_col_name explicitly so the column that is filtered and
        # dropped afterwards is the one that was just created.
        if action == "drop":
            df = df.cols.z_score(col_name, z_col_name) \
                .rows.drop(F.col(z_col_name) > threshold) \
                .cols.drop(z_col_name)
        elif action == "select":
            df = df.cols.z_score(col_name, z_col_name) \
                .rows.select(F.col(z_col_name) > threshold) \
                .cols.drop(z_col_name)

    return df
def parquet(path, mode="overwrite", num_partitions=1):
    """
    Save data frame to a parquet file.

    Column names are sanitized first and null-typed columns are cast to "str".
    ``self`` is read from the enclosing scope.

    :param path: path where the dataframe will be saved.
    :param mode: Specifies the behavior of the save operation when data already exists.
                "append": Append contents of this DataFrame to existing data.
                "overwrite" (default case): Overwrite existing data.
                "ignore": Silently ignore this operation if data already exists.
                "error": Throw an exception if data already exists.
    :param num_partitions: the number of partitions of the DataFrame
    :return: None
    :raises IOError: logged through ``logger.print`` and re-raised
    """
    # These characters are invalid as column names by parquet
    invalid_character = [" ", ",", ";", "{", "}", "(", ")", "\n", "\t", "="]

    def func(col_name):
        # Replace every invalid character with an underscore
        for i in invalid_character:
            col_name = col_name.replace(i, "_")
        return col_name

    df = self.cols.rename(func)

    # Look the null columns up on the renamed dataframe: the cast below runs on df, so the
    # names must be the sanitized ones and not the original names held by self.
    columns = parse_columns(df, "*", filter_by_column_dtypes=["null"])
    df = df.cols.cast(columns, "str")

    try:
        df.coalesce(num_partitions) \
            .write \
            .mode(mode) \
            .parquet(path)
    except IOError as e:
        logger.print(e)
        raise
def csv(path, header="true", mode="overwrite", sep=",", num_partitions=1):
    """
    Write the data frame to a CSV file.

    Columns of type date, array, vector, binary and null are cast to "str" before writing.
    ``self`` is read from the enclosing scope.

    :param path: path where the dataframe will be saved.
    :param header: value of the "header" write option, i.e. whether to include the header
    :param mode: Specifies the behavior of the save operation when data already exists.
                "append": Append contents of this DataFrame to existing data.
                "overwrite" (default case): Overwrite existing data.
                "ignore": Silently ignore this operation if data already exists.
                "error": Throw an exception if data already exists.
    :param sep: sets the single character as a separator for each field and value.
    :param num_partitions: the number of partitions of the DataFrame
    :return: None; the CSV is written to the specified path.
    :raises IOError: logged through ``logger.print`` and re-raised
    """
    stringified_dtypes = ["date", "array", "vector", "binary", "null"]

    try:
        to_cast = parse_columns(self, "*", filter_by_column_dtypes=stringified_dtypes)
        prepared = self.cols.cast(to_cast, "str").repartition(num_partitions)

        # Save to csv
        writer = prepared.write.options(header=header).mode(mode)
        writer.csv(path, sep=sep)
    except IOError as error:
        logger.print(error)
        raise
def fingerprint_cluster(df, columns):
    """
    Cluster the values of a dataframe column using the Fingerprint algorithm.

    :param df: dataframe to be processed
    :param columns: Columns to be processed
    :return: dataframe with the columns "cluster_size", "cluster", "count" and "recommended"
    """
    # NOTE(review): df is replaced by the cluster dataframe on every iteration, so a second
    # column would be looked up on the result of the first one; verify multi-column use.
    for name in parse_columns(df, columns):
        fingerprint_col = name + "_FINGERPRINT"

        # Work on the distinct values and their counts instead of the whole data set
        value_counts = df.groupBy(name).count().select('count', name)
        # repartition(1) is needed for optimization in a single machine
        value_counts = value_counts.repartition(1).cache()

        # Calculate the fingerprint
        with_fingerprint = fingerprint(value_counts, name)

        # Values sharing a fingerprint form a cluster
        clusters = with_fingerprint.groupby(fingerprint_col).agg(
            F.collect_set(name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(name).alias("recommended"),
            F.size(F.collect_set(name)).alias("cluster_size"),
        )
        df = clusters.select("cluster_size", "cluster", "count", "recommended")

    return df
def count_na(columns):
    """
    Count the NaN and null values of every column.

    ``self`` is read from the enclosing scope.

    :param columns: '*', list of columns names or a single column name.
    :return: the counts, formatted by ``format_dict``
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for name in columns:
        # Struct and boolean columns are cast to string first: isnan/isNull can not handle them
        if is_(df.cols.schema_dtypes(name), (StructType, BooleanType)):
            df = df.cols.cast(name, "string")

        is_missing = F.isnan(name) | F.col(name).isNull()
        expr.append(F.count(F.when(is_missing, name)).alias(name))

    rows = df.select(*expr).collect()
    return format_dict(collect_as_dict(rows))
def fingerprint(df, columns):
    """
    Create the fingerprint of the given columns.

    For every column a new column named ``<column>_FINGERPRINT`` is added. It holds the
    value after trim, lower, remove_special_chars and remove_accents, with its
    whitespace-separated tokens deduplicated, sorted and joined back together.

    :param df: dataframe to be processed
    :param columns: columns to be fingerprinted
    :return: the dataframe with the fingerprint columns added
    """

    def _split_sort_remove_join(value, args):
        """
        Helper function to split, remove duplicates, sort and join back together
        :param value: string to be processed
        :param args: unused; part of the signature required by ``cols.apply``
        :return: the unique tokens, sorted and concatenated without separator
        """
        # Split into whitespace-separated token
        split_key = value.split()

        # Sort and remove duplicated items. sorted() returns a new list, so the result
        # must be kept; otherwise the original tokens are joined untouched.
        split_key = sorted(set(split_key))

        # join the tokens back together
        return "".join(split_key)

    columns = parse_columns(df, columns)
    for col_name in columns:
        output_col = col_name + "_FINGERPRINT"
        df = (df.withColumn(output_col, F.col(col_name))
              .cols.trim(output_col)
              .cols.lower(output_col)
              .cols.remove_special_chars(output_col)
              .cols.remove_accents(output_col)
              .cols.apply(output_col, _split_sort_remove_join, "string")
              .repartition(1)
              .cache())
    return df
def n_gram_fingerprint_cluster(df, columns, n_size=2):
    """
    Cluster the values of a DataFrame column using the N-Gram Fingerprint algorithm.

    :param df: dataframe to be processed
    :param columns: columns to be processed
    :param n_size: value forwarded to ``n_gram_fingerprint``
    :return: dataframe with the columns "cluster_size", "cluster", "count" and "recommended"
    """
    # NOTE(review): df is replaced by the cluster dataframe on every iteration, so a second
    # column would be looked up on the result of the first one; verify multi-column use.
    for name in parse_columns(df, columns):
        fingerprint_col = name + "_ngram_fingerprint"

        # Group first so the fingerprint is not applied to the whole data set
        value_counts = df.select(name).groupBy(name).count().select('count', name)
        # repartition(1) is needed for optimization in a single machine
        value_counts = value_counts.repartition(1).cache()

        with_fingerprint = n_gram_fingerprint(value_counts, name, n_size)

        # Values sharing a fingerprint form a cluster
        clusters = with_fingerprint.groupby(fingerprint_col).agg(
            F.collect_set(name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(name).alias("recommended"),
            F.size(F.collect_set(name)).alias("cluster_size"),
        )
        df = clusters.select("cluster_size", "cluster", "count", "recommended")

    return df
def count_na(columns):
    """
    Count the NaN and null values of every column.

    ``self`` is read from the enclosing scope.

    :param columns: '*', list of columns names or a single column name.
    :return: the counts, formatted by ``format_dict``
    """
    columns = parse_columns(self, columns)
    df = self

    expr = []
    for name in columns:
        # Struct and boolean columns are cast to string first: isnan/isNull can not handle them
        if is_(df.cols.schema_dtype(name), (StructType, BooleanType)):
            df = df.cols.cast(name, "string")

        # The data type is read after the optional cast above
        dtype = df.cols.schema_dtype(name)
        is_null = F.col(name).isNull()

        # NOTE(review): the first check compares against the Python float/int types while
        # the other checks use Spark types; confirm what schema_dtype returns.
        if is_(dtype, (float, int)):
            counter = F.count(F.when(F.isnan(name) | is_null, name))
        elif is_(dtype, (NullType)):
            # NOTE(review): F.count counts the rows of the column here - confirm intent
            counter = F.count(name)
        else:
            counter = F.count(F.when(is_null, name))

        expr.append(counter.alias(name))

    return format_dict(df.select(*expr).to_json())
def years_between(columns, date_format):
    """
    Compute the years elapsed between a date column and the current date (e.g. an age
    from a birth date). The result is stored as float in ``<column>_years_between``.

    ``self`` is read from the enclosing scope.

    :param columns: Name of the column(s) holding the dates.
    :param date_format: String format date of the column provided.
    :return: the dataframe with the new columns added
    """
    # Check the columns against the dataFrame
    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NOT_ARRAY_TYPES)

    # Output format date
    format_dt = "yyyy-MM-dd"  # Some SimpleDateFormat string

    def _years_between(_new_col_name, attr):
        # attr holds [date format, source column name]
        _date_format, _col_name = attr[0], attr[1]

        as_timestamp = F.unix_timestamp(_col_name, _date_format).cast("timestamp")
        normalized = F.date_format(as_timestamp, format_dt)
        # Months are turned into years; the absolute value makes the order irrelevant
        years = F.abs(F.months_between(normalized, F.current_date()) / 12)
        return F.format_number(years, 4).alias(_new_col_name)

    df = self
    for name in columns:
        target = name + "_years_between"
        df = df.cols.apply_expr(target, _years_between, [date_format, name])
        df = df.cols.cast(target, "float")
    return df
def gbt(df, columns, input_col, **kargs):
    """
    Fit a gradient boosting tree classifier on the input DataFrame and run the prediction.

    :param df: Pyspark dataframe to analyze.
    :param columns: List of columns to select for prediction.
    :param input_col: Column to predict. It must be one of the selected columns.
    :param kargs: keyword arguments forwarded to ``GBTClassifier``.
    :return: tuple ``(df_model, gbt_model)``: the transformed DataFrame and the fitted model.
    :raises TypeError: if ``df`` is not a Spark dataframe or ``input_col`` is not a string.
    """
    if not is_dataframe(df):
        raise TypeError("Spark dataframe expected")

    selected = parse_columns(df, columns)

    if not is_str(input_col):
        raise TypeError("Error, input column must be a string")

    # Every selected column except the target is a feature
    feats = df.select(selected).columns
    feats.remove(input_col)

    indexed = string_to_index(df, input_cols=input_col)
    assembled = vector_assembler(indexed, input_cols=feats)

    classifier = GBTClassifier(**kargs)

    # The indexed target column is renamed to "label" before fitting
    train = assembled.cols.rename([(input_col + "_index", "label")])

    gbt_model = classifier.fit(train)
    return gbt_model.transform(train), gbt_model
def mode(columns):
    """
    Compute the mode of every column.

    ``self`` is read from the enclosing scope.

    :param columns: '*', list of columns names or a single column name.
    :return: list with one ``{column: modes}`` dictionary per column
    """
    columns = parse_columns(self, columns)

    mode_result = []
    for name in columns:
        counts = self.groupBy(name).count()

        # Keep the values whose count equals the highest count
        highest = counts.agg(F.max("count").alias("max_"))
        top_values = counts.join(highest, F.col("count") == F.col("max_"))

        # If none of the values is repeated there is no mode
        repeated = top_values.rows.select(top_values["count"] > 1)
        rows = repeated.cols.select(name).collect()

        mode_result.append({name: filter_list(rows)})

    return mode_result
def date_transform(columns, current_format, output_format):
    """
    Transform the date format of a column. The result is stored in
    ``<column>_data_transform``.

    ``self`` is read from the enclosing scope.

    :param columns: Columns to be transformed.
    :param current_format: current string date format of the columns specified. All the
        columns must share the same format; otherwise the transformation of the columns
        with a different format fails and returns null values.
    :param output_format: output date string format to be expected.
    :return: the dataframe with the new columns added
    """

    def _date_transform(_new_col_name, attr):
        # attr holds [source column name, current format, output format]
        _col_name, _current_format, _output_format = attr[0], attr[1], attr[2]

        as_timestamp = F.unix_timestamp(_col_name, _current_format).cast("timestamp")
        return F.date_format(as_timestamp, _output_format).alias(_new_col_name)

    # Check the columns against the dataFrame
    columns = parse_columns(self, columns)

    df = self
    for name in columns:
        target = name + "_data_transform"
        df = df.cols.apply_expr(target, _date_transform, [name, current_format, output_format])
    return df
def mad(columns, more=None):
    """
    Compute the Median Absolute Deviation of the numeric columns.

    ``self`` is read from the enclosing scope.

    :param columns: Column to be processed
    :param more: when truthy the median is returned next to the mad value
    :return: ``{column: {"mad": ...}}`` (plus "median" when ``more`` is set),
        formatted by ``format_dict``
    """
    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    result = {}
    for name in columns:
        median_value = self.cols.median(name)

        # mad = median(abs(data - median(data)))
        deviations = self.select(name).withColumn(name, F.abs(F.col(name) - median_value))
        mad_value = deviations.cols.median(name)

        entry = {"mad": mad_value}
        if more:
            entry["median"] = median_value
        result[name] = entry

    return format_dict(result)
def percentile(columns, values=None, error=1):
    """
    Compute the requested percentiles of the numeric columns.

    ``self`` is read from the enclosing scope.

    :param columns: '*', list of columns names or a single column name.
    :param values: list of percentiles to be calculated. Defaults to
        [0.05, 0.25, 0.5, 0.75, 0.95]
    :param error: third argument handed to ``approxQuantile``
    :return: percentiles per columns
    """
    values = [0.05, 0.25, 0.5, 0.75, 0.95] if values is None else values
    columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

    per_column = []
    for name in columns:
        # Rows without a value are discarded and the column is cast to double first
        without_na = self.rows.drop_na(name)
        as_double = without_na.cols.cast(name, "double")
        quantiles = as_double.approxQuantile(name, values, error)
        per_column.append(dict(zip(values, quantiles)))

    return format_dict(dict(zip(columns, per_column)))
def apply(columns, func, func_return_type, args=None, func_type=None, when=None, filter_col_by_dtypes=None,
          verbose=True):
    """
    Apply a function to columns using pandas udf, or udf if apache arrow is not available.

    ``self`` is read from the enclosing scope.

    :param columns: Columns in which the function is going to be applied
    :param func: Function to be applied to a column. The declaration must always have
        2 params: def func(value, args)
    :param func_return_type: function return type. This is required by UDF and Pandas UDF.
    :param args: Arguments to be passed to the function
    :param func_type: pandas_udf or udf. If none try to use pandas udf (Pyarrow needed)
    :param when: An expression to control when the function is going to be applied; where
        it does not hold the original value of the column is kept
    :param filter_col_by_dtypes: Only apply the filter to specific type of value, integer,
        float, string or bool
    :param verbose: Print additional information
    :return: DataFrame
    """
    columns = parse_columns(self, columns, filter_by_column_dtypes=filter_col_by_dtypes,
                            accepts_missing_cols=True)

    def build_query(col_name):
        query = audf(col_name, func, func_return_type, args, func_type, verbose=verbose)
        if when is None:
            return query
        # Only rows matching the expression are transformed; the rest keep their value
        return F.when(when, query).otherwise(F.col(col_name))

    df = self
    for col_name in columns:
        df = df.withColumn(col_name, build_query(col_name))
    return df
def apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True):
    """
    Apply an expression to columns.

    ``self`` is read from the enclosing scope.

    :param columns: Columns in which the function is going to be applied
    :param func: function to be applied
    :type func: A plain expression or a function returning an expression
    :param args: Argument passed to the function
    :param filter_col_by_dtypes: Only apply the filter to specific type of value, integer,
        float, string or bool
    :param verbose: Print additional information
    :return: Dataframe
    """

    def func_col_exp(col_name, attr):
        # Adapter so a plain expression can be used where a function is expected
        return func

    # A plain Column expression is wrapped; a function is used as it is
    _func = func_col_exp if is_(func, F.Column) else func

    columns = parse_columns(self, columns, filter_by_column_dtypes=filter_col_by_dtypes,
                            accepts_missing_cols=True)

    df = self
    for name in columns:
        expression = audf(name, _func, attrs=args, func_type="column_exp", verbose=verbose)
        df = df.withColumn(name, expression)
    return df
def _exprs(funcs, columns):
    """
    Helper function to apply multiple aggregation functions to multiple columns.

    Every aggregation is aliased as ``<function name>_<column name>``. ``self`` is read
    from the enclosing scope.

    :param funcs: Aggregation function(s) from Apache Spark
    :param columns: list or string of columns names
    :return: the aggregated values grouped per column
    """

    def parse_col_names_funcs_to_keys(data):
        """
        Helper function that returns a formatted json with function:value inside columns. Transform from
        {'max_antiguedad_anos': 15,
        'max_m2_superficie_construida': 1800000,
        'min_antiguedad_anos': 2,
        'min_m2_superficie_construida': 20}

        to

        {'m2_superficie_construida': {'min': 20, 'max': 1800000}, 'antiguedad_anos': {'min': 2, 'max': 15}}

        :param data: json data
        :return: json; anything that is not a dict is returned untouched
        """
        if not is_dict(data):
            return data

        function_names = ["min", "max", "stddev", "kurtosis", "mean", "skewness", "sum", "variance",
                          "approx_count_distinct", "na", "zeros", "percentile"]
        prefixes = [(name, name + "_") for name in function_names]

        grouped = {}
        for key, value in data.items():
            for name, prefix in prefixes:
                if key.startswith(prefix):
                    # What follows the prefix is the column name
                    grouped.setdefault(key[len(prefix):], {})[name] = value
        return grouped

    columns = parse_columns(self, columns)

    # Ensure that is a list
    funcs = val_to_list(funcs)

    # One Column expression per (column, function) pair
    exprs = [func(col_name).alias(func.__name__ + "_" + col_name)
             for col_name in columns
             for func in funcs]

    aggregated = format_dict(self.agg(*exprs).to_json())
    return parse_col_names_funcs_to_keys(aggregated)
def correlation(self, columns, method="pearson", strategy="mean", output="json"):
    """
    Calculate the correlation between columns. The columns are cast to float and the
    missing values are imputed before the correlation is computed.

    :param self: dataframe to be processed
    :param columns: Columns to be processed
    :param method: Method used to calculate the correlation
    :param strategy: Imputing strategy
    :param output: "array" or "json"
    :return: the correlation matrix when ``output`` is "array"; when it is "json" a list of
        ``{"between", "an", "value"}`` dictionaries sorted by value in descending order,
        without the pairs of a column with itself
    :raises ValueError: if ``output`` is neither "array" nor "json"
    """
    # Fail before running any Spark job instead of hitting an unbound variable at the end
    if output not in ("array", "json"):
        raise ValueError("output must be 'array' or 'json', got {!r}".format(output))

    columns = parse_columns(self, columns)

    # Cast the selected columns to float
    df = self
    for col_name in columns:
        df = df.cols.cast(col_name, "float")
        logging.info("Casting {col_name} to float...".format(col_name=col_name))

    # Impute missing values
    imputed_cols = [c + "_imputed" for c in columns]
    df = df.cols.impute(columns, imputed_cols, strategy)
    logging.info("Imputing {columns}, Using '{strategy}'...".format(columns=columns, strategy=strategy))

    # Create Vector necessary to calculate the correlation
    df = df.cols.nest(imputed_cols, "features", "vector")
    corr = Correlation.corr(df, "features", method).head()[0].toArray()

    # Strings are compared with == / !=; "is" tests identity and is unreliable for str values
    if output == "array":
        return corr

    # Parse result to json: one entry per ordered pair of columns
    col_pair = [{"between": col_name, "an": col_name_2}
                for col_name in columns
                for col_name_2 in columns]

    # flat array
    values = corr.flatten('F').tolist()

    result = []
    for n, v in zip(col_pair, values):
        # Remove correlation between the same column
        if n["between"] != n["an"]:
            n["value"] = v
            result.append(n)

    return sorted(result, key=lambda k: k['value'], reverse=True)
def variance(columns):
    """
    Compute the variance of the numeric columns.

    ``self`` is read from the enclosing scope.

    :param columns: '*', list of columns names or a single column name.
    :return: the result of ``_exprs`` for ``F.variance``
    """
    numeric_columns = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
    return _exprs(F.variance, numeric_columns)