def melt(self, id_vars, value_vars, var_name="variable", value_name="value", data_type="str"):
    """
    Convert DataFrame from wide to long format.

    :param self: Spark Dataframe
    :param id_vars: column with unique values
    :param value_vars: Column names that are going to be converted to columns values
    :param var_name: Column name for vars
    :param value_name: Column name for values
    :param data_type: All columns must have the same type. It will transform all columns to this data type.
    :return: Spark Dataframe in long format
    """
    df = self
    id_vars = val_to_list(id_vars)
    # Normalize value_vars too so a single column name is accepted
    # (previously only a list worked; a bare string broke the concatenation below)
    value_vars = val_to_list(value_vars)

    # Cast all columns to the same type so the single value column is homogeneous
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name)) for c in value_vars]

    # Add the struct array to the DataFrame and explode: one output row per melted column
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return df.select(*cols)
def _add_driver_class_path(self, driver_class_path):
    """
    Append one or more entries to the session's driver class path list.

    :param driver_class_path: A single path or a list of paths.
    """
    if self.driver_class_path is None:
        self.driver_class_path = []
    # val_to_list normalizes a single value into a list, so a plain string is
    # accepted too (the previous is_list guard silently ignored single strings,
    # unlike _add_spark_packages which handles both)
    if driver_class_path is not None:
        for d in val_to_list(driver_class_path):
            self.driver_class_path.append(d)
def table_to_df(self, table_name, columns="*", limit=None):
    """
    Return cols as Spark dataframe from a specific table.

    :type table_name: object
    :param columns: Column name(s) to fetch, or "*" for all columns.
    :param limit: how many rows will be retrieved
    :return: Spark dataframe already materialized on the local machine.
    """
    db_table = "public." + table_name

    # == instead of `is`: identity comparison of string literals is
    # implementation-dependent and unreliable
    if self._limit(limit) == "all":
        query = "SELECT COUNT(*) FROM " + db_table

        # We want to count the number of rows to warn the users how much it can take to bring the whole data
        count = self.execute(query, "all").to_json()[0]["count"]
        print(str(count) + " rows")

    if columns == "*":
        columns_sql = "*"
    else:
        columns = val_to_list(columns)
        columns_sql = ",".join(columns)

    query = "SELECT " + columns_sql + " FROM " + db_table
    logger.print(query)
    df = self.execute(query, limit)

    # Bring the data to local machine if not every time we call an action is going to be
    # retrieved from the remote server
    df = df.run()
    return df
def value_error(var=None, data_values=None):
    """
    Raise a ValueError exception describing the values accepted by a variable.

    :param var: Variable that received the invalid value.
    :type var:
    :param data_values: values accepted by the variable
    :type data_values: str/list
    :return: Never returns; always raises ValueError.
    """
    from optimus.helpers.functions import get_var_name

    data_values = val_to_list(data_values)

    # Default the separator to "" so an empty/one-element list cannot leave
    # `divisor` unbound (the original raised UnboundLocalError for 0 values,
    # hiding the real error)
    divisor = ""
    if len(data_values) == 2:
        divisor = " or "
    elif len(data_values) > 2:
        divisor = ", "

    raise ValueError(
        "'{var_name}' must be {type}, received '{var_type}'".format(
            var_name=get_var_name(var),
            # str() guards against non-string accepted values in the message
            type=divisor.join("'" + str(x) + "'" for x in data_values),
            var_type=var))
def _add_jars(self, jar):
    """
    Append one or more jar file paths to the session's jar list.

    :param jar: A single jar path or a list of paths.
    """
    if self.jars is None:
        self.jars = []
    # val_to_list normalizes a single value into a list, so a plain string is
    # accepted too (the previous is_list guard silently ignored single strings,
    # unlike _add_spark_packages which handles both)
    if jar is not None:
        for j in val_to_list(jar):
            self.jars.append(j)
def get_output_cols(input_cols, output_cols):
    """
    Construct the output column names that pair up with the given input columns.

    :param input_cols: Input column name or list of names.
    :param output_cols: Output name(s), a suffix string, or None for in-place.
    :return: Output column names as given, suffixed, or mirrored from the input.
    """
    if is_list(input_cols) and is_list(output_cols):
        # Paired lists must line up one to one
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
    elif is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            # A single string acts as a suffix appended to every input column
            output_cols = [name + output_cols for name in input_cols]
        else:
            output_cols = val_to_list(output_cols)
    elif is_str(input_cols) and is_str(output_cols):
        output_cols = val_to_list(output_cols)
    elif output_cols is None:
        # No output names requested: operate over the input columns themselves
        output_cols = input_cols

    return output_cols
def _add_spark_packages(self, packages):
    """
    Define the Spark packages that must be loaded at start time.

    :param packages: A single package coordinate or a list of them.
    :return:
    """
    # val_to_list accepts a scalar or a list; extend mirrors the append loop
    self.packages.extend(val_to_list(packages))
def show(self, table_names="*", limit="all"):
    """
    Print the requested database tables as formatted tables.

    :param table_names: "*" for every table, or a name / list of names.
    :param limit: Row limit passed through to table_to_df ("all" fetches everything).
    """
    db = self.db

    # == instead of `is`: identity comparison of string literals is
    # implementation-dependent and unreliable
    if table_names == "*":
        table_names = db.tables_names_to_json()
    else:
        table_names = val_to_list(table_names)

    print("Total Tables:" + str(len(table_names)))

    for table_name in table_names:
        db.table_to_df(table_name, "*", limit) \
            .table(title=table_name)
def is_column_a(df, column, dtypes):
    """
    Check if a dataframe column's data type matches any of the given types.

    :param df: Dataframe whose schema is inspected.
    :param column: Column name (a one-element list is unwrapped).
    :param dtypes: Data type name(s) to match against.
    :return: True when the column's Spark data type is one of ``dtypes``.
    """
    # isinstance needs a tuple of candidate types
    accepted = tuple(val_to_list(parse_spark_dtypes(dtypes)))
    col_name = one_list_to_val(column)

    # Filter columns by data type
    return isinstance(df.schema[col_name].dataType, accepted)
def get_spark_dtypes_object(value):
    """
    Get a pyspark data class from a string data type representation,
    for example 'StringType()' from 'string'.

    :param value: Short type name or list of names.
    :return: Matching Spark type object(s); unknown values pass through unchanged.
    """
    names = val_to_list(value)
    try:
        result = [SPARK_DTYPES_DICT_OBJECTS[SPARK_SHORT_DTYPES[name]] for name in names]
    except (KeyError, TypeError):
        # Unknown or non-string entries are returned untouched
        result = names
    return one_list_to_val(result)
def filter_col_name_by_dtypes(df, data_type):
    """
    Return column names filtered by the column data type.

    :param df: Dataframe which columns are going to be filtered
    :param data_type: Datatype used to filter the column.
    :type data_type: str or list
    :return: List of matching column names.
    """
    # isinstance requires a tuple of types
    wanted = tuple(val_to_list(parse_spark_dtypes(data_type)))

    # Walk the schema and keep the columns whose type matches
    matches = []
    for name in df.columns:
        if isinstance(df.schema[name].dataType, wanted):
            matches.append(name)
    return matches
def is_in(columns, values): """ Filter rows which columns that match a specific value :return: Spark DataFrame """ # Ensure that we have a list values = val_to_list(values) # Create column/value expression column_expr = [(F.col(columns) == v) for v in values] # Concat expression with and logical or expr = reduce(lambda a, b: a | b, column_expr) return self.rows.select(expr)
def parse_spark_dtypes(value):
    """
    Get a pyspark data type from a string data type representation,
    for example 'StringType' from 'string'.

    :param value: Short type name or list of names.
    :return: Matching Spark type class(es); unknown values pass through unchanged.
    """
    value = val_to_list(value)
    try:
        data_type = [SPARK_DTYPES_DICT[SPARK_SHORT_DTYPES[v]] for v in value]
    # Also catch TypeError (unhashable entries) for consistency with
    # get_spark_dtypes_object, which already passes such values through
    except (KeyError, TypeError):
        data_type = value

    data_type = one_list_to_val(data_type)
    return data_type
def validate_columns_names(df, col_names, index=0):
    """
    Check if a string or list of string are valid dataframe columns.

    :param df: Data frame to be analyzed
    :param col_names: columns names to be checked
    :param index: Tuple position that holds the column name when tuples are given.
    :return: True when every column exists (otherwise an error is raised downstream).
    """
    candidates = val_to_list(col_names)

    if is_list_of_tuples(candidates):
        # Tuples carry (name, *params); keep only the name at `index`
        candidates = [item[index] for item in candidates]

    if is_list_of_strings(candidates):
        # Remove duplicates while preserving order
        candidates = OrderedSet(candidates)

    check_for_missing_columns(df, candidates)
    return True
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, server=False, repositories=None, packages=None, jars=None,
             options=None, additional_options=None, queue_url=None, queue_exchange=None,
             queue_routing_key="optimus"):
    """
    Transform and roll out

    :param master: 'Master', 'local' or ip address to a cluster
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param additional_options:
    :param options: Configuration options that are passed to spark-submit.
        See `the list of possible options
        <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
        Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    """
    if session is None:
        # If a Spark session is not passed by argument create it
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}
        self.options = options

        if packages is None:
            packages = []
        else:
            packages = val_to_list(packages)
        self.packages = packages

        self.repositories = repositories

        # BUG FIX: the default used to be {} (a dict), but _add_jars() appends
        # to self.jars — dicts have no append(). Use a list and normalize a
        # single path the same way `packages` is handled.
        if jars is None:
            jars = []
        else:
            jars = val_to_list(jars)
        self.jars = jars

        self.additional_options = additional_options

        self.verbose(verbose)

        # Load Avro.
        # TODO:
        # if the Spark 2.4 version is going to be used this is not necessary.
        # Maybe we can check a priori which version of Spark is going to be used
        # self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        self._start_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)

    else:
        # If a session is passed by arguments just save the reference
        Spark.instance = Spark().load(session)

    # Initialize Spark
    logger.print("""
         ____        __  _
        / __ \____  / /_(_)___ ___  __  _______
       / / / / __ \/ __/ / __ `__ \/ / / / ___/
      / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
      \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
          /_/
    """)

    logger.print(STARTING_OPTIMUS)
    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s
    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read
    self.profiler = Profiler(queue_url=queue_url, queue_exchange=queue_exchange,
                             queue_routing_key=queue_routing_key)
    self.ml = ML()

    # self._load_css()

    # Set global output as html
    self.output("html")
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False):
    """
    Return a list of columns and check that columns exists in the dataframe.
    Accept '*' as parameter in which case return a list of all columns in the dataframe.
    Also accept a regex.
    If a list of tuples is given, the first element of each tuple is taken as the column
    name and the remaining elements as params. These params can be used to create custom
    transformation functions. You can find an example in cols().cast().

    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts * as param to return all the string columns in the dataframe
    :param get_args: If True, return a (columns, attrs) tuple instead of just columns
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: A data type for which a columns list is going be filtered
    :param accepts_missing_cols: if true do not check if the column exists in the dataframe
    :return: A list of column string names (or a tuple with attrs when get_args is True)
    """

    attrs = None

    if is_regex is True:
        # NOTE(review): assumes cols_args is a list whose first element holds
        # the regex pattern; a bare string would only use its first character —
        # verify against callers.
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    # if columns value is * (or None) get all the dataframe's columns
    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the
    # column name and the rest as params. We can use the params in a custom function, e.g.:
    #   def func(attrs):    # attrs receives (1, 2) and (3, 4)
    #       return attrs[0] + 1
    #   df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract the name (position 0) and the remaining params from each tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Resolve integer entries as positional column indexes
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)

    columns_residual = None

    # If necessary filter the columns by data type
    if is_list_of_strings(filter_by_column_dtypes):
        # Get columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe
        # with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns do NOT match the filtered data types; reported below
        columns_residual = list(OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    # Return cols or cols and params
    cols_params = []
    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    # Warn about columns skipped because their data type did not match the filter
    if columns_residual:
        print(",".join(escape_columns(columns_residual)), "column(s) was not processed because is/are not",
              ",".join(filter_by_column_dtypes))

    return cols_params