def to_file(self, path=None, output="html"): """ Save profiler data to a file in the specified format (html, json) :param output: html or json :param path: filename in which the data will be saved :return: """ if path is None: RaiseIt.value_error(path, "str") # We need to append a some extra html tags to display it correctly in the browser. if output is "html": if self.html is None: RaiseIt.not_ready_error( "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')" ) write_html(HEADER + self.html + FOOTER, path) elif output is "json": if self.json is None: RaiseIt.not_ready_error( "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')" ) write_json(self.json, path) else: RaiseIt.type_error(output, ["html", "json"])
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """
    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [Normalizer(inputCol=col_name, outputCol=name_col(col_name, "normalized"), p=p)
              for col_name in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
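# Usage sketch (illustrative; assumes `normalizer` is importable from this
# module and `df` is an Optimus-wrapped Spark DataFrame with a numeric column
# named "features", which is a hypothetical name):
df_norm = normalizer(df, input_cols="features", p=2.0)
df_norm.show()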
def append(rows):
    """
    Append a row at the end of a dataframe
    :param rows: List of values or tuples to be appended
    :return: Spark DataFrame
    """
    # `self` is the wrapped DataFrame, captured from the enclosing accessor
    df = self

    # A flat list of values is treated as a single row
    if is_list(rows) and not is_list_of_tuples(rows) and not is_list_of_dataframes(rows):
        rows = [tuple(rows)]

    if is_list_of_tuples(rows):
        columns = [str(i) for i in range(df.cols.count())]
        new_row = op.Create.df(columns, rows)
        df_result = df.union(new_row)

    elif is_list_of_dataframes(rows) or is_dataframe(rows):
        row = val_to_list(rows)
        row.insert(0, df)
        df_result = append_df(row, like="rows")

    else:
        RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

    df_result = df_result.preserve_meta(self, Actions.NEST.value, df.cols.names())

    return df_result
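# Usage sketch (illustrative; assumes this function is exposed as the
# `df.rows.append` accessor on an Optimus-wrapped Spark DataFrame):
df2 = df.rows.append([(1, "one"), (2, "two")])  # append two rows given as tuples
df3 = df.rows.append([3, "three"])              # a flat list is treated as one row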
def _data_loader(self, url, type_of):
    """
    Select the correct method to download the file depending on the format
    :param url: string url
    :param type_of: format data type
    :return:
    """
    if type_of == "csv":
        file_format = self.csv
    elif type_of == "json":
        file_format = self.json
    elif type_of == "parquet":
        file_format = self.parquet
    elif type_of == "avro":
        file_format = self.avro
    else:
        RaiseIt.type_error(type_of, ["csv", "json", "parquet", "avro"])

    # Derive a display name from the last path segment of the URL
    i = url.rfind('/')
    data_name = url[(i + 1):]

    data_def = {
        "displayName": data_name,
        "url": url
    }
    return Downloader(data_def).download(file_format, type_of)
def data_loader(self, url, type_of):
    """
    Load data from a URL
    :param url: url string
    :param type_of: format data type
    :return:
    """
    if type_of == "csv":
        data_loader = self.csv
    elif type_of == "json":
        data_loader = self.json
    elif type_of == "parquet":
        data_loader = self.parquet
    elif type_of == "avro":
        data_loader = self.avro
    else:
        RaiseIt.type_error(type_of, ["csv", "json", "parquet", "avro"])

    # Derive a display name from the last path segment of the URL
    i = url.rfind('/')
    data_name = url[(i + 1):]

    data_def = {"displayName": data_name, "url": url}
    return Downloader(data_def).download(data_loader, type_of)
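# Usage sketch (illustrative; assumes this method lives on an Optimus-style
# loader object, here called `load`, and the URL is hypothetical):
df = load.data_loader("https://example.com/data/users.json", type_of="json")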
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p: p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """
    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    df = df.cols.cast(input_cols, "vector")

    normal = [Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
              for column in list(set(input_cols))]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
def url(self, path=None, type_of="csv"): """ Entry point for loading data from a URL. Check that the url is well format :param path: string for URL to read :param type_of: type of the URL backend (can be csv or json) :return: pyspark dataframe from URL. """ if "https://" in str(path) or "http://" in str(path) or "file://" in str(path): return self._data_loader(str(path), type_of) else: RaiseIt.type_error(type_of, ["https://", "http://", "file://"])
def set_name(self, value=None):
    """
    Create a temp view for a dataframe; the name is also used in the JSON profiling output
    :param self:
    :param value: name for the temp view
    :return:
    """
    if not is_str(value):
        RaiseIt.type_error(value, ["string"])

    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])

    # Assign the name only after it has been validated
    self._name = value
    self.createOrReplaceTempView(value)
def table_name(self, name=None):
    """
    Create a temp view for a dataframe
    :param self:
    :param name: name for the temp view
    :return:
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
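# Usage sketch (illustrative; assumes `table_name` is patched onto a Spark
# DataFrame `df` and `spark` is the active SparkSession):
df = df.table_name("users_view")
spark.sql("SELECT * FROM users_view").show()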
def to_file(self, path=None, output=None):
    """
    Save profiler data to a file in the specified format (html, json)
    :param path: filename in which the data will be saved
    :param output: html or json
    :return:
    """
    if path is None:
        RaiseIt.value_error(path, ["Invalid file path"])

    # We need to wrap the report in some extra HTML tags so it displays correctly in the browser.
    if output == "html":
        assert self.html is not None, "Please run the profiler first"

        header = '''<!doctype html>
<html class="no-js" lang="">
<head>
  <meta charset="utf-8">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  <title></title>
  <meta name="description" content="">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <link rel="manifest" href="site.webmanifest">
  <link rel="apple-touch-icon" href="icon.png">
  <!-- Place favicon.ico in the root directory -->
  <link rel="stylesheet" href="css/normalize.css">
  <link rel="stylesheet" href="css/main.css">
</head>
<body>'''
        footer = '''</body></html>'''

        write_html(header + self.html + footer, path)

    elif output == "json":
        assert self.json is not None, "Please run the profiler first"
        write_json(self.json, path)

    else:
        RaiseIt.type_error(output, ["html", "json"])
def compare(df1, df2, method="json"): """ Compare 2 Spark dataframes :param df1: :param df2: :param method: json or a :return: """ if method is "json": diff = DeepDiff(df1.to_json(), df2.to_json(), ignore_order=False) print_json(diff) elif method is "collect": if df1.collect() == df2.collect(): print("Dataframes are equal") return True else: print("Dataframes not equal. Use 'json' param to check for diffrences") return False else: RaiseIt.type_error(method, ["json", "collect"])
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False, invert=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.
    Accepts '*' as a parameter, in which case a list of all columns in the dataframe is returned.
    Also accepts a regex.
    If a list of tuples is passed, the first element of each tuple is taken as the column name and the
    remaining elements as params. These params can be used to create custom transformation functions;
    you can find an example in cols().cast().
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts * as param to return all the string columns in the dataframe
    :param get_args: If True, also return the params extracted from the tuples
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: Data type(s) by which the resulting column list is filtered
    :param accepts_missing_cols: If True, do not check whether the columns exist in the dataframe
    :param invert: Invert the final selection. For example, use it if you want to select non-integer columns
    :return: A list of column string names
    """
    if not is_dataframe(df):
        RaiseIt.type_error(df, "Dataframe")

    attrs = None

    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    # If the columns value is '*', get all the dataframe's columns
    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column name
    # and the rest as params. The params can then be used in a custom function, e.g.:
    #   def func(col_name, attrs):
    #       return attrs[0] + 1
    #   df.cols().apply([('col_1', 1, 2), ('col_2', 3, 4)], func)
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Split every tuple into the column name and its params
        cols = [c[0] for c in cols_args]
        attrs = [c[1:] for c in cols_args]

    else:
        # If not a list, convert to list
        cols = val_to_list(cols_args)
        # Get the column name from an index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Flatten the data type filter if it was passed as a list of lists
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_list(filter_by_column_dtypes):
        filter_by_column_dtypes = [item for sublist in filter_by_column_dtypes for item in sublist]

    columns_residual = None

    # If necessary, filter the columns by data type
    if filter_by_column_dtypes:
        # Get the columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)

        # Intersect the columns filtered by data type with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the requested data types
        columns_residual = list(OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    if invert:
        final_columns = list(OrderedSet(df.cols.names()) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) could not be processed because they do not match",
                     ",".join(filter_by_column_dtypes))

    # If filtering left 0 columns, return None
    if len(cols_params) == 0:
        cols_params = None
        logger.print("Outputting 0 columns after filtering. Is this expected?")

    return cols_params
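# Usage sketch (illustrative; assumes `df` is a Spark DataFrame with columns
# "name", "age" and "billing_id", all hypothetical):
parse_columns(df, "*")                                    # every column
parse_columns(df, ["^bill.*"], is_regex=True)             # columns matching a regex
parse_columns(df, "*", filter_by_column_dtypes="string")  # only string columns
cols, attrs = parse_columns(df, [("age", 1, 2)], get_args=True)  # split names and params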