def hist(self, col_name):
    """
    Build histograms for the values below the lower bound, above the upper bound
    and between the bounds, and return them as JSON.
    :param col_name: Column to be processed
    :return: JSON string with the three histograms
    """
    # buckets = 20
    # min_value, max_value = self.df.cols.range(col_name)
    # create_buckets(min_value, self.lower_bound, buckets)
    # create_buckets(self.lower_bound, self.upper_bound, buckets)
    # create_buckets(self.upper_bound, max_value, buckets)

    # Lower bound
    lower_bound_hist = self.df.rows.select(
        self.df[col_name] < self.lower_bound).cols.hist(col_name, 20)

    # Upper bound
    upper_bound_hist = self.df.rows.select(
        self.df[col_name] > self.upper_bound).cols.hist(col_name, 20)

    # Non outliers
    non_outlier_hist = self.df.rows.select(
        (F.col(col_name) >= self.lower_bound) &
        (F.col(col_name) <= self.upper_bound)).cols.hist(col_name, 20)

    result = {}
    if lower_bound_hist is not None:
        result.update(lower_bound_hist)
    if upper_bound_hist is not None:
        result.update(upper_bound_hist)
    if non_outlier_hist is not None:
        result.update(non_outlier_hist)

    return dump_json(result)
def info(self, output: str = "dict"):
    """
    Get the whiskers, the IQR and the outlier/non-outlier counts
    :param output: "dict" or "json"
    :return:
    """
    lower_bound = self.lower_bound
    upper_bound = self.upper_bound
    q1 = self.q1
    median = self.median
    q3 = self.q3
    iqr = self.iqr

    result = {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "lower_bound": lower_bound,
        "lower_bound_count": self.count_lower_bound(lower_bound),
        "upper_bound": upper_bound,
        "upper_bound_count": self.count_upper_bound(upper_bound),
        "q1": q1,
        "median": median,
        "q3": q3,
        "iqr": iqr
    }

    if output == "json":
        result = dump_json(result)
    return result
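# Illustrative sketch, not part of the class above: how the whiskers reported by
# info() are conventionally derived from the quartiles with Tukey's 1.5 * IQR rule.
# The function name and the plain-float inputs are assumptions for illustration only.
def tukey_whiskers(q1: float, q3: float, k: float = 1.5):
    """Return (lower_bound, upper_bound) using Tukey's rule."""
    iqr = q3 - q1
    return q1 - k * iqr, q3 + k * iqr

# Example: tukey_whiskers(25.0, 75.0) -> (-50.0, 150.0)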
def select_lower_bound(self):
    """
    Return a sample (up to 100 rows) of the values that fall below the lower bound, as JSON.
    :return: JSON string with the column titles and the sampled values
    """
    col_name = self.col_name
    sample = {
        "columns": [{"title": cols} for cols in val_to_list(self.col_name)],
        "value": self.df.rows.select(self.df[col_name] < self.lower_bound).limit(
            100).rows.to_list(col_name)
    }
    return dump_json(sample)
def base_clustering_function(df, input_cols, output, func=None, args=None):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be clustered
    :param output: "dict" or "json"
    :param func: Clustering function applied to the grouped dataframe
    :param args: Arguments passed to func
    :return:
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)
    result = {}
    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        count_col = name_col(input_col, COUNT_COL)

        # Instead of applying the fingerprint to the whole data set, group by the values first
        df = (df.groupBy(input_col).agg(
            F.count(input_col).alias(count_col)).select(
            count_col, input_col)).h_repartition(1)

        # Calculate the fingerprint
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        cluster_sum_col = name_col(input_col, CLUSTER_SUM_COL)
        cluster_count_col = name_col(input_col, CLUSTER_COUNT_COL)

        # Apply the clustering function
        df = func(df, *args)

        df = df.withColumn(cluster_col, F.create_map([input_col, count_col]))

        df = df.groupBy(output_col).agg(
            F.max(F.struct(F.col(count_col), F.col(input_col))).alias(recommended_col),
            F.collect_list(cluster_col).alias(cluster_col),
            F.count(output_col).alias(cluster_count_col),
            F.sum(count_col).alias(cluster_sum_col))

        df = df.select(recommended_col + "." + input_col, cluster_col,
                       cluster_count_col, cluster_sum_col)

        for row in df.collect():
            _row = list(row.asDict().values())
            # Flatten the list of dicts into a single dict
            flatted_dict = {
                k: v for element in _row[1] for k, v in element.items()
            }
            result[_row[0]] = {
                "similar": flatted_dict,
                "count": _row[2],
                "sum": _row[3]
            }

    if output == "json":
        result = dump_json(result)
    return result
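# Illustrative sketch, not the library's implementation: the key-collision
# "fingerprint" idea the clustering above relies on. A fingerprint lowercases the
# string, strips punctuation, splits it into tokens, de-duplicates and sorts them,
# so case differences and token reorderings collide on the same key.
import re
import unicodedata

def fingerprint_key(value: str) -> str:
    value = unicodedata.normalize("NFKD", value.strip().lower())
    value = re.sub(r"[^\w\s]", "", value)
    tokens = sorted(set(value.split()))
    return " ".join(tokens)

# Example: fingerprint_key("New York") == fingerprint_key("york, new  ") == "new york"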
def profile(self, columns="*", bins: int = MAX_BUCKETS, output: str = None, flush: bool = False, size=False):
    """
    Return the profile of the dataset as a dict
    :param columns: Columns to be profiled
    :param bins: Number of histogram buckets
    :param output: "dict" or "json"
    :param flush: Force the profile to be recalculated
    :param size: Get the dataframe size in memory. Use with caution, this could be slow for big data frames.
    :return:
    """
    df = self.root
    meta = self.meta
    profile = Meta.get(meta, "profile")

    if not profile:
        flush = True

    if columns or flush:
        columns = parse_columns(df, columns) if columns else []
        transformations = Meta.get(meta, "transformations")

        calculate = False
        if flush or len(transformations):
            calculate = True
        else:
            for col in columns:
                if col not in profile["columns"]:
                    calculate = True

        if calculate:
            df = df[columns].calculate_profile("*", bins, flush, size)
            profile = Meta.get(df.meta, "profile")
            self.meta = df.meta

        profile["columns"] = {
            key: profile["columns"][key] for key in columns
        }

    if output == "json":
        profile = dump_json(profile)

    return profile
def info(self, output: str = "dict"):
    """
    Get the whiskers, the IQR and the outlier/non-outlier counts
    :param output: "dict" or "json"
    :return:
    """
    upper_bound, lower_bound = dict_filter(self.whiskers(),
                                           ["upper_bound", "lower_bound"])

    result = {
        "count_outliers": self.count(),
        "count_non_outliers": self.non_outliers_count(),
        "lower_bound": lower_bound,
        "lower_bound_count": self.count_lower_bound(lower_bound),
        "upper_bound": upper_bound,
        "upper_bound_count": self.count_upper_bound(upper_bound)
    }

    if output == "json":
        result = dump_json(result)
    return result
def hist(self, col_name: str):
    """
    Build histograms for the values below the lower bound, above the upper bound
    and between the bounds, and return them as JSON.
    :param col_name: Column to be processed
    :return: JSON string with the three histograms
    """
    df = self.df

    # Lower bound
    lower_bound_hist = df.rows.select(df[col_name] < self.lower_bound).cols.hist(col_name, 20)

    # Upper bound
    upper_bound_hist = df.rows.select(df[col_name] > self.upper_bound).cols.hist(col_name, 20)

    # Non outliers
    non_outlier_hist = df.rows.select(
        (df[col_name] >= self.lower_bound) &
        (df[col_name] <= self.upper_bound)).cols.hist(col_name, 20)

    result = {}
    if lower_bound_hist is not None:
        result.update(lower_bound_hist)
    if upper_bound_hist is not None:
        result.update(upper_bound_hist)
    if non_outlier_hist is not None:
        result.update(non_outlier_hist)

    return dump_json(result)
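# Illustrative sketch, not the library's cols.hist and not its exact output shape:
# the equal-width binning that the three calls above request (20 buckets), written
# over a plain Python list of numbers. Assumes a non-empty numeric list.
def equal_width_hist(values, buckets=20):
    lo, hi = min(values), max(values)
    width = (hi - lo) / buckets or 1  # avoid zero width when all values are equal
    counts = [0] * buckets
    for v in values:
        i = min(int((v - lo) / width), buckets - 1)  # clamp the max value into the last bucket
        counts[i] += 1
    return [{"lower": lo + i * width, "upper": lo + (i + 1) * width, "count": c}
            for i, c in enumerate(counts)]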
def levenshtein_cluster(df, input_col, threshold: int = None, output: str = "dict"):
    """
    Cluster the values of a column by Levenshtein distance and return the clusters
    as a dict or as JSON
    :param df: Spark Dataframe
    :param input_col: Column to be processed
    :param threshold: Maximum Levenshtein distance for two values to be grouped together
    :param output: "dict" or "json"
    :return:
    """
    # Create the fingerprint
    df_fingerprint = keycollision.fingerprint(df, input_col)

    # Names
    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)
    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"
    count = "count"

    # Prepare the columns to calculate the cross join
    fingerprint_count = df_fingerprint.select(input_col, fingerprint_col).groupby(input_col) \
        .agg(F.first(input_col).alias(temp_col_1),
             F.first(fingerprint_col).alias(temp_col_2),
             F.count(input_col).alias(count)) \
        .select(temp_col_1, temp_col_2, count).collect()

    df = df_fingerprint.select(
        input_col,
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create all the combinations between the strings to calculate the Levenshtein distance
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Select only the strings with the shortest distance
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"
    temp_r = "TEMP_R"

    if threshold is None:
        where = ((F.col(distance_col) == 0) & (F.col(temp_col_1) != F.col(temp_col_2)))
    else:
        where = (F.col(distance_col) == 0) | (F.col(distance_col) > threshold)

    df_r = (df.rows.drop(where)
            .cols.replace(distance_col, 0, None, search_by="numeric")
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            # .cols.rename(distance_col, distance_r_col)
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) & (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2), F.count(temp_col_2)))

    # Map each fingerprint back to its original string and count
    kv_dict = {}
    for row in fingerprint_count:
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = {_row[0]: _row[2]}

    result = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        d = {}
        for i in _row[1]:
            key = list(kv_dict[i].keys())[0]
            value = list(kv_dict[i].values())[0]
            d[key] = value
        key = list(kv_dict[_row[0]].keys())[0]
        value = list(kv_dict[_row[0]].values())[0]
        d.update({key: value})
        result[key] = d

    # Calculate count and sum
    f = {}
    for k, v in result.items():
        _sum = 0
        for x, y in v.items():
            _sum = _sum + y
        f[k] = {"similar": v, "count": len(v), "sum": _sum}
    result = f

    if output == "json":
        result = dump_json(result)
    return result
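# Illustrative sketch: the edit distance that F.levenshtein computes above, written
# in plain Python for reference (dynamic programming over a rolling row). Not part
# of the library; the function name is chosen for illustration only.
def levenshtein(a: str, b: str) -> int:
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]

# Example: levenshtein("kitten", "sitting") == 3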
def dataset(self, df, columns="*", buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
            sample=10000, stats=True, format="dict", mismatch=None, advanced_stats=False):
    """
    Return the profiling data as a dict or in JSON format
    :param df: Dataframe to be processed
    :param columns: Columns to calculate the histogram for
    :param buckets: Number of buckets in the histogram
    :param infer:
    :param relative_error:
    :param approx_count:
    :param sample: Number of rows to retrieve with random sampling
    :param stats: Calculate stats; if not, only the data table is returned
    :param format: "dict" or "json"
    :param mismatch:
    :param advanced_stats:
    :return: dict or json
    """
    output_columns = self.output_columns
    cols_to_profile = self.cols_needs_profiling(df, columns)

    # Get the stats for all the columns
    if stats is True:
        # Are there columns to process?
        if cols_to_profile or not self.is_cached():
            rows_count = df.count()
            self.rows_count = rows_count
            self.cols_count = cols_count = len(df.columns)
            updated_columns = self.columns_stats(df, cols_to_profile, buckets, infer, relative_error,
                                                 approx_count, mismatch, advanced_stats)

            output_columns = update_dict(output_columns, updated_columns)

            assign(output_columns, "name", df.get_name(), dict)
            assign(output_columns, "file_name", df.get_meta("file_name"), dict)

            # Add the general data summary to the output
            data_set_info = {'cols_count': cols_count,
                             'rows_count': rows_count,
                             'size': humanize.naturalsize(df.size()),
                             'sample_size': sample}

            assign(output_columns, "summary", data_set_info, dict)

            # Nulls
            total_count_na = 0
            for k, v in output_columns["columns"].items():
                total_count_na = total_count_na + v["stats"]["count_na"]

            assign(output_columns, "summary.missing_count", total_count_na, dict)
            assign(output_columns, "summary.p_missing", round(total_count_na / self.rows_count * 100, 2))

        # TODO: drop, rename and move operations must affect the sample
        sample = {"columns": [{"title": cols} for cols in df.cols.names()],
                  "value": df.sample_n(sample).rows.to_list(columns)}

        assign(output_columns, "sample", sample, dict)
        actual_columns = output_columns["columns"]

        # Order columns
        output_columns["columns"] = dict(OrderedDict(
            {_cols_name: actual_columns[_cols_name] for _cols_name in df.cols.names()
             if _cols_name in list(actual_columns.keys())}))

        df = df.set_meta(value={})
        df = df.columns_meta(df.cols.names())

    # col_names = output_columns["columns"].keys()
    if format == "json":
        result = dump_json(output_columns)
    else:
        result = output_columns

    self.output_columns = output_columns
    df.set_meta("transformations.actions", {})

    return result