def is_column_a(df, column=None, dtypes="str"): """ Check if column match a list of data types :param df: spark or dask dataframe :param column: column to be compared with :param dtypes: types to be checked :return: """ column = val_to_list(column) if len(column) > 1: RaiseIt.length_error(column, 1) data_type = tuple(val_to_list(parse_dtypes(df, dtypes))) column = one_list_to_val(column) # Filter columns by data type # print("df",type(df),df) v = df.cols.schema_dtype(column) if is_spark_dataframe(df.data): result = isinstance(v, data_type) elif is_dask_dataframe(df): result = v in data_type else: result = None return result
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized
    :param p: p-norm used for normalization
    :return: Dataframe with normalized columns
    """
    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, ["str", "list"])

    if is_str(input_cols):
        input_cols = [input_cols]

    df = df.cols.cast(input_cols, "vector")

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
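# --- Usage sketch (illustrative, not part of the original source) ---
# Assumes `df` is an existing Optimus-wrapped Spark dataframe with a numeric column
# named "values" (the column name is a placeholder) so it can be cast to a vector.
df_norm = normalizer(df, input_cols="values", p=2.0)
df_norm.select("values_normalized").show()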
def abstract_udf(col, func, func_return_type=None, attrs=None, func_type=None, verbose=False):
    """
    Abstract User Defined Function. This is a helper to create a udf, a pandas_udf or a column expression
    :param col: Column to be created or transformed
    :param func: Function to be applied to the data
    :param attrs: Attributes to be passed to the function, if required
    :param func_return_type: Required by UDF and Pandas UDF
    :param func_type: pandas_udf or udf. pandas_udf is tried first if func_type is not defined
    :param verbose: print additional info
    :return: A function, UDF or Pandas UDF
    """
    # By default try to use a pandas UDF
    if func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    types = ["column_exp", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    if verbose is True:
        logging.info("Using '{func_type}' to process column '{column}' with function {func_name}"
                     .format(func_type=func_type, column=col, func_name=func.__name__))

    df_func = func_factory(func_type, func_return_type)
    return df_func(attrs, func)(col)
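# --- Usage sketch (illustrative, not part of the original source) ---
# Assumes the wrapped function receives the cell value plus the optional attrs, that
# func_return_type accepts a type name like "double", and that `df` already exists;
# all of these are assumptions about the surrounding Optimus helpers.
def _double(value, attrs):
    return value * 2

new_col = abstract_udf("price", _double, func_return_type="double", func_type="udf")
df = df.withColumn("price_double", new_col)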
def data_loader(self, url, type_of):
    """
    Load data from a url
    :param url: url string
    :param type_of: format data type
    :return:
    """
    data_loader = None
    if type_of == "csv":
        data_loader = self.csv
    elif type_of == "json":
        data_loader = self.json
    elif type_of == "parquet":
        data_loader = self.parquet
    elif type_of == "avro":
        data_loader = self.avro
    else:
        RaiseIt.type_error(type_of, ["csv", "json", "parquet", "avro"])

    i = url.rfind('/')
    data_name = url[(i + 1):]
    data_def = {"displayName": data_name, "url": url}

    return Downloader(data_def).download(data_loader, type_of)
def _data_loader(self, url, type_of):
    """
    Select the correct method to download the file depending on the format
    :param url: string url
    :param type_of: format data type
    :return:
    """
    file_format = None
    if type_of == "csv":
        file_format = self.csv
    elif type_of == "json":
        file_format = self.json
    elif type_of == "parquet":
        file_format = self.parquet
    elif type_of == "avro":
        file_format = self.avro
    else:
        RaiseIt.type_error(type_of, ["csv", "json", "parquet", "avro"])

    i = url.rfind('/')
    data_name = url[(i + 1):]
    data_def = {
        "displayName": data_name,
        "url": url
    }

    return Downloader(data_def).download(file_format, type_of)
def delete_check_point_folder(path, file_system):
    """
    Delete the temporal folder where temp files were stored.
    The path required is the same provided by the user in setCheckPointFolder().
    :param path: path where the info will be saved
    :param file_system: Describes if the file system is local or the hadoop file system
    :return:
    """
    if file_system == "hadoop":
        # Folder path:
        folder_path = path + "/" + "checkPointFolder"
        logger.print("Deleting checkpoint folder...")
        command = "hadoop fs -rm -r " + folder_path
        os.system(command)
        logger.print("$" + command)
        logger.print("Folder deleted.")
    elif file_system == "local":
        logger.print("Deleting checkpoint folder...")
        # Folder path:
        folder_path = path + "/" + "checkPointFolder"
        # Check if the temp folder exists:
        if os.path.isdir(folder_path):
            # Delete the folder if it exists:
            rmtree(folder_path)
            logger.print("Folder deleted.")
        else:
            logger.print("Folder does not exist.")
    else:
        RaiseIt.value_error(file_system, ["hadoop", "local"])
def get(driver_type) -> AbstractDriver:
    """
    Returns a driver implementation given a database name
    :param driver_type: name of the database
    :return: a database driver
    """
    if driver_type == DriverProperties.CASSANDRA.value["name"]:
        return CassandraDriver()
    elif driver_type == DriverProperties.MYSQL.value["name"]:
        return MySQLDriver()
    elif driver_type == DriverProperties.ORACLE.value["name"]:
        return OracleDriver()
    elif driver_type == DriverProperties.POSTGRESQL.value["name"]:
        return PostgreSQLDriver()
    elif driver_type == DriverProperties.PRESTO.value["name"]:
        return PrestoDriver()
    elif driver_type == DriverProperties.REDSHIFT.value["name"]:
        return RedshiftDriver()
    elif driver_type == DriverProperties.SQLITE.value["name"]:
        return SQLiteDriver()
    elif driver_type == DriverProperties.SQLSERVER.value["name"]:
        return SQLServerDriver()
    elif driver_type == DriverProperties.BIGQUERY.value["name"]:
        return BigQueryDriver()
    elif driver_type == DriverProperties.IMPALA.value["name"]:
        return ImpalaDriver()
    else:
        RaiseIt.value_error(
            driver_type,
            [database["name"] for database in DriverProperties.list()])
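# --- Usage sketch (illustrative, not part of the original source) ---
# Assumes DriverProperties.MYSQL.value["name"] resolves to the string "mysql";
# the exact name strings are an assumption about the DriverProperties enum.
driver = get("mysql")
print(type(driver).__name__)  # MySQLDriver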
def append(rows):
    """
    Append rows at the end of a dataframe
    :param rows: List of values or tuples to be appended
    :return: Spark DataFrame
    """
    df = self

    if is_list_of_tuples(rows):
        columns = [str(i) for i in range(df.cols.count())]
        if not is_list_of_tuples(rows):
            rows = [tuple(rows)]
        new_row = op.Create.df(columns, rows)
        df_result = df.union(new_row)

    elif is_list_of_dataframes(rows) or is_dataframe(rows):
        row = val_to_list(rows)
        row.insert(0, df)
        df_result = append_df(row, like="rows")
    else:
        RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

    df_result = df_result.preserve_meta(self, Actions.NEST.value, df.cols.names())

    return df_result
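# --- Usage sketch (illustrative, not part of the original source) ---
# Assumes append() is exposed as df.rows.append on an Optimus-wrapped Spark dataframe
# and that each tuple has the same number of fields as the dataframe has columns.
df = df.rows.append([("bumblebee", 2), ("optimus", 3)])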
def sort(col_sort):
    """
    Sort rows taking into account multiple columns
    :param col_sort: column and sort type combination (col_name, "asc")
    :type col_sort: list of tuples
    """
    # If a list of column names is given, order by desc. If you need to specify the order of every
    # column, use a list of tuples (col_name, "asc")
    df = self

    t = []
    if is_list_of_str_or_int(col_sort):
        for col_name in col_sort:
            t.append(tuple([col_name, "desc"]))
        col_sort = t

    func = []
    for cs in col_sort:
        col_name = one_list_to_val(cs[0])
        order = cs[1]

        if order == "asc":
            sort_func = F.asc
        elif order == "desc":
            sort_func = F.desc
        else:
            RaiseIt.value_error(order, ["asc", "desc"])

        func.append(sort_func(col_name))
        df = df.preserve_meta(self, Actions.SORT_ROW.value, col_name)

    df = df.sort(*func)
    return df
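# --- Usage sketch (illustrative, not part of the original source) ---
# Assumes sort() is exposed as df.rows.sort on an Optimus-wrapped Spark dataframe;
# the column names are placeholders.
df = df.rows.sort([("rank", "desc"), ("name", "asc")])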
def cast_factory(cls):
    # Parse to Vector
    if is_type(cls, Vectors):
        func_type = "udf"

        def cast_to_vectors(val, attr):
            return Vectors.dense(val)

        func_return_type = VectorUDT()
    # Parse standard data types
    elif get_spark_dtypes_object(cls):
        func_type = "column_exp"

        def cast_to_vectors(col_name, attr):
            return F.col(col_name).cast(get_spark_dtypes_object(cls))

        func_return_type = None
    # Add here any other parse you want
    else:
        RaiseIt.value_error(cls)

    return func_return_type, cast_to_vectors, func_type
def nest(input_cols, output_col, shape="string", separator=""): """ Concat multiple columns to one with the format specified :param input_cols: columns to be nested :param output_col: final column with the nested content :param separator: char to be used as separator at the concat time :param shape: final data type, 'array', 'string' or 'vector' :return: Spark DataFrame """ df = self if has_(input_cols, F.Column): # Transform non Column data to lit columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols] else: columns = parse_columns(self, input_cols) if shape is "vector": columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES) vector_assembler = VectorAssembler( inputCols=columns, outputCol=output_col) df = vector_assembler.transform(df) elif shape is "array": df = apply_expr(output_col, F.array(*columns)) elif shape is "string": df = apply_expr(output_col, F.concat_ws(separator, *columns)) else: RaiseIt.value_error(shape, ["vector", "array", "string"]) return df
def nest(input_cols, output_col, shape=None, separator=" "): """ Concat multiple columns to one with the format specified :param input_cols: columns to be nested :param output_col: final column with the nested content :param separator: char to be used as separator at the concat time :param shape: final data type, 'array', 'string' or 'vector' :return: Spark DataFrame """ columns = parse_columns(self, input_cols) df = self if shape is "vector": vector_assembler = VectorAssembler(inputCols=input_cols, outputCol=output_col) df = vector_assembler.transform(self) elif shape is "array": df = apply_expr(output_col, F.array(*columns)) elif shape is "string": df = apply_expr(output_col, F.concat_ws(separator, *columns)) else: RaiseIt.value_error(shape, ["vector", "array", "string"]) return df
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm.
    It takes parameter p, which specifies the p-norm used for normalization (p=2 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized
    :param p: p-norm used for normalization
    :return: Dataframe with normalized columns
    """
    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, ["str", "list"])

    if is_str(input_cols):
        input_cols = [input_cols]

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [
        Normalizer(inputCol=col_name, outputCol=name_col(col_name, "normalized"), p=p)
        for col_name in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)
    df = pipeline.fit(df).transform(df)

    return df
def append(dfs, like="columns"): """ Concat multiple dataFrames columns or rows wise :param dfs: List of DataFrames :param like: concat as columns or rows :return: """ # FIX: Because monotonically_increasing_id can create different # sequence for different dataframes the result could be wrong. if like == "columns": temp_dfs = [] col_temp_name = "id_" + random_int() dfs = val_to_list(dfs) for df in dfs: temp_dfs.append( df.withColumn(col_temp_name, F.monotonically_increasing_id())) def _append(df1, df2): return df1.join(df2, col_temp_name, "outer") df_result = reduce(_append, temp_dfs).drop(col_temp_name) elif like == "rows": df_result = reduce(DataFrame.union, dfs) else: RaiseIt.value_error(like, ["columns", "rows"]) return df_result
def concat(dfs, like="columns"): """ Concat multiple dataframes as columns or rows :param dfs: :param like: The way dataframes is going to be concat. like columns or rows :return: """ # Add increasing Ids, and they should be the same. if like == "columns": temp_dfs = [] col_temp_name = "id_" + random_int() for df in dfs: temp_dfs.append( df.withColumn(col_temp_name, F.monotonically_increasing_id())) def _append_df(df1, df2): return df1.join(df2, col_temp_name, "outer") df_result = reduce(_append_df, temp_dfs).drop(col_temp_name) elif like == "rows": df_result = reduce(DataFrame.union, dfs) else: RaiseIt.value_error(like, ["columns", "rows"]) return df_result
def check_for_missing_columns(df, col_names):
    """
    Check if the columns you want to select exist in the dataframe
    :param df: Dataframe to be checked
    :param col_names: column names to be checked
    :return:
    """
    missing_columns = list(OrderedSet(col_names) - OrderedSet(df.schema.names))

    if len(missing_columns) > 0:
        RaiseIt.value_error(missing_columns, df.columns)
    return False
def url(self, path=None, type_of="csv"): """ Entry point for loading data from a URL. Check that the url is well format :param path: string for URL to read :param type_of: type of the URL backend (can be csv or json) :return: pyspark dataframe from URL. """ if "https://" in str(path) or "http://" in str(path) or "file://" in str(path): return self._data_loader(str(path), type_of) else: RaiseIt.type_error(type_of, ["https://", "http://", "file://"])
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the number expected
    :param columns: Columns to be checked
    :param number: Number of columns expected
    :return:
    """
    if columns is None:
        RaiseIt.value_error(
            columns,
            ["str", "list"],
            extra_text="Maybe the columns selected do not match a specified datatype filter.")

    if isinstance(columns, zip):
        columns = list(columns)

    count = len(columns)

    if number == "*":
        if not count >= 1:
            RaiseIt.value_error(count, ["1 or greater"])
    elif number == ">1":
        if not count > 1:
            RaiseIt.value_error(count, ["more than 1"])
    elif count != number:
        RaiseIt.value_error(count, "{} columns, {} needed".format(count, number))
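# --- Usage sketch (illustrative, not part of the original source) ---
# Passes silently when the count matches and raises through RaiseIt otherwise.
check_column_numbers(["name", "age"], 2)    # ok
check_column_numbers(["name", "age"], "*")  # ok, at least one column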
def to_file(self, path=None, output="html"): """ Save profiler data to a file in the specified format (html, json) :param output: html or json :param path: filename in which the data will be saved :return: """ if path is None: RaiseIt.value_error(path, "str") # We need to append a some extra html tags to display it correctly in the browser. if output is "html": if self.html is None: RaiseIt.not_ready_error( "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')" ) write_html(HEADER + self.html + FOOTER, path) elif output is "json": if self.json is None: RaiseIt.not_ready_error( "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')" ) write_json(self.json, path) else: RaiseIt.type_error(output, ["html", "json"])
def sort(order="asc"): """ Sort dataframes columns asc or desc :param order: 'asc' or 'desc' accepted :return: Spark DataFrame """ if order == "asc": sorted_col_names = sorted(self.columns) elif order == "desc": sorted_col_names = sorted(self.columns, reverse=True) else: RaiseIt.value_error(order, ["asc", "desc"]) return self.select(sorted_col_names)
def table_name(self, name=None):
    """
    Create a temp view for a dataframe
    :param self:
    :param name:
    :return:
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the number expected
    :param columns: Columns to be checked
    :param number: Number of columns expected
    :return:
    """
    count = len(columns)

    if number == "*":
        if not count >= 1:
            RaiseIt.value_error(count, ["1 or greater"])
    elif count != number:
        RaiseIt.value_error(count, "Received {} columns, {} needed".format(count, number))
def set_name(self, value=None):
    """
    Create a temp view for a dataframe, also used in the json output profiling
    :param self:
    :param value:
    :return:
    """
    self._name = value

    if not is_str(value):
        RaiseIt.type_error(value, ["string"])

    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])

    self.createOrReplaceTempView(value)
def get_output_cols(input_cols, output_cols):
    # Construct input and output column names
    if is_list(input_cols) and is_list(output_cols):
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
    elif is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            output_cols = list([i + output_cols for i in input_cols])
        else:
            output_cols = val_to_list(output_cols)
    elif is_str(input_cols) and is_str(output_cols):
        output_cols = val_to_list(output_cols)
    elif output_cols is None:
        output_cols = input_cols

    return output_cols
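# --- Usage sketch (illustrative, not part of the original source) ---
# With several input columns and a single string, the string acts as a suffix.
get_output_cols(["a", "b"], "_new")   # ["a_new", "b_new"]
get_output_cols("a", "a_new")         # ["a_new"]
get_output_cols(["a", "b"], None)     # ["a", "b"]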
def to_file(self, path=None, output=None):
    """
    Save profiler data to a file in the specified format (html, json)
    :param output: html or json
    :param path: filename in which the data will be saved
    :return:
    """
    if path is None:
        RaiseIt.value_error(path, ["Invalid file path"])

    # We need to append some extra html tags to display it correctly in the browser.
    if output == "html":
        assert self.html is not None, "Please run the profiler first"

        header = '''<!doctype html>
<html class="no-js" lang="">
<head>
    <meta charset="utf-8">
    <meta http-equiv="x-ua-compatible" content="ie=edge">
    <title></title>
    <meta name="description" content="">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
    <link rel="manifest" href="site.webmanifest">
    <link rel="apple-touch-icon" href="icon.png">
    <!-- Place favicon.ico in the root directory -->
    <link rel="stylesheet" href="css/normalize.css">
    <link rel="stylesheet" href="css/main.css">
</head>
<body>'''
        footer = '''</body></html>'''

        write_html(header + self.html + footer, path)
    elif output == "json":
        assert self.json is not None, "Please run the profiler first"
        write_json(self.json, path)
    else:
        RaiseIt.type_error(output, ["html", "json"])
def absolute_path(files, format="posix"): """ User project base folder to construct and absolute path :param files: path files :param format: posix or uri :return: """ files = val_to_list(files) if format == "uri": result = [Path(ROOT_DIR + file).as_uri() for file in files] elif format == "posix": result = [Path(ROOT_DIR + file).as_posix() for file in files] else: RaiseIt.value_error(format, ["posix", "uri"]) result = one_list_to_val(result) return result
def is_column_a(df, column, dtypes):
    """
    Check if a column matches a list of data types
    :param df: dataframe
    :param column: column to be compared with
    :param dtypes: types to be checked
    :return:
    """
    column = val_to_list(column)

    if len(column) > 1:
        RaiseIt.length_error(column, 1)

    data_type = tuple(val_to_list(parse_spark_dtypes(dtypes)))
    column = one_list_to_val(column)

    # Filter columns by data type
    return isinstance(df.schema[column].dataType, data_type)
def _set_check_point_folder(path, file_system):
    """
    Receive a workspace path where a folder is created. This folder will store temporal
    dataframes when the user calls .checkPoint().
    :param path: Location of the dataset (string).
    :param file_system: Describes if the file system is local or the hadoop file system.
    """
    print_check_point_config(file_system)

    if file_system == "hadoop":
        folder_path = path + "/" + "checkPointFolder"
        Optimus.delete_check_point_folder(path=path, file_system=file_system)

        # Creating the folder:
        logger.print("Creating the hadoop folder...")
        command = "hadoop fs -mkdir " + folder_path
        logger.print("$" + command)
        os.system(command)
        logger.print("Hadoop folder created. \n")

        logger.print("Setting created folder as checkpoint folder...")
        Spark.instance.sc.setCheckpointDir(folder_path)
    elif file_system == "local":
        # Folder path:
        folder_path = path + "/" + "checkPointFolder"
        # Check if the temp folder exists:
        logger.print("Deleting previous folder if it exists...")
        if os.path.isdir(folder_path):
            # Delete the folder if it exists:
            rmtree(folder_path)

        logger.print("Creating the checkpoint directory...")
        # Create the new folder:
        os.mkdir(folder_path)

        Spark.instance.sc.setCheckpointDir(dirName="file:///" + folder_path)
    else:
        RaiseIt.value_error(file_system, ["hadoop", "local"])
def compare(df1, df2, method="json"): """ Compare 2 Spark dataframes :param df1: :param df2: :param method: json or a :return: """ if method is "json": diff = DeepDiff(df1.to_json(), df2.to_json(), ignore_order=False) print_json(diff) elif method is "collect": if df1.collect() == df2.collect(): print("Dataframes are equal") return True else: print("Dataframes not equal. Use 'json' param to check for diffrences") return False else: RaiseIt.type_error(method, ["json", "collect"])
def abstract_udf(col, func, func_return_type=None, args=None, func_type=None):
    """
    Abstract User Defined Function. This is a helper to create a udf, a pandas_udf or a column expression
    :param col: Column to be created or transformed
    :param func: Function to be applied to the data
    :param args: Attributes to be passed to the function, if required
    :param func_return_type: Required by UDF and Pandas UDF
    :param func_type: pandas_udf or udf. pandas_udf is tried first if func_type is not defined
    :return: A function, UDF or Pandas UDF
    """
    if func_return_type is None:
        func_type = "column_expr"
    # By default try to use a pandas UDF
    elif func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    types = ["column_expr", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # Handle whether the func param is a plain expression or a function returning an expression
    def func_col_exp(col_name, attr):
        return func

    if is_column(func):
        _func = func_col_exp
    else:
        _func = func

    logger.print(
        "Using '{func_type}' to process column '{column}' with function {func_name}"
        .format(func_type=func_type, column=col, func_name=_func.__name__))

    df_func = func_factory(func_type, func_return_type)
    if not is_tuple(args):
        args = (args,)

    return df_func(_func, args)(col)