Example #1
def melt(self, id_vars, value_vars, var_name="variable", value_name="value", data_type="str"):
    """
    Convert DataFrame from wide to long format.
    :param self: Spark Dataframe
    :param id_vars: column(s) with unique values, used as identifiers
    :param value_vars: Column names that are going to be converted to column values
    :param var_name: Name of the new column that holds the variable names
    :param value_name: Name of the new column that holds the values
    :param data_type: All columns must have the same type; every column will be cast to this data type.
    :return:
    """

    df = self
    id_vars = val_to_list(id_vars)
    # Cast all columns to the same type
    df = df.cols.cast(id_vars + value_vars, data_type)

    vars_and_vals = [F.struct(F.lit(c).alias(var_name), F.col(c).alias(value_name)) for c in value_vars]

    # Add to the DataFrame and explode
    df = df.withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))

    cols = id_vars + [F.col("vars_and_vals")[x].alias(x) for x in [var_name, value_name]]

    return df.select(*cols)
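A minimal usage sketch of the same struct/explode pattern, written against plain PySpark so it runs without the Optimus helpers (the df.cols.cast step and val_to_list are omitted, and the column names are made up for illustration):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
wide_df = spark.createDataFrame([("A", 1, 2), ("B", 3, 4)], ["id", "x", "y"])

# Build one struct per value column, then explode the array of structs
vars_and_vals = [F.struct(F.lit(c).alias("variable"), F.col(c).alias("value")) for c in ["x", "y"]]
long_df = (wide_df
           .withColumn("vars_and_vals", F.explode(F.array(*vars_and_vals)))
           .select("id",
                   F.col("vars_and_vals")["variable"].alias("variable"),
                   F.col("vars_and_vals")["value"].alias("value")))
long_df.show()  # 4 rows: (A, x, 1), (A, y, 2), (B, x, 3), (B, y, 4)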
Example #2
    def _add_driver_class_path(self, driver_class_path):
        if self.driver_class_path is None:
            self.driver_class_path = []

        if is_list(driver_class_path):
            for d in val_to_list(driver_class_path):
                self.driver_class_path.append(d)
Example #3
    def table_to_df(self, table_name, columns="*", limit=None):
        """
        Return cols as Spark dataframe from a specific table
        :param table_name: name of the table to load
        :param columns: columns to retrieve; '*' for all of them
        :param limit: how many rows will be retrieved
        """

        db_table = "public." + table_name
        if self._limit(limit) == "all":
            query = "SELECT COUNT(*) FROM " + db_table
            # Count the rows first, to warn the user how long it may take to bring the whole dataset
            count = self.execute(query, "all").to_json()[0]["count"]

            print(str(count) + " rows")

        if columns == "*":
            columns_sql = "*"
        else:
            columns = val_to_list(columns)
            columns_sql = ",".join(columns)

        query = "SELECT " + columns_sql + " FROM " + db_table
        logger.print(query)
        df = self.execute(query, limit)

        # Bring the data to the local machine; otherwise, every time we call an action the data
        # would be retrieved from the remote server again
        df = df.run()
        return df
Example #4
    def value_error(var=None, data_values=None):
        """
        Raise a ValueError exception
        :param var: variable whose value was rejected

        :param data_values: values accepted by the variable
        :type data_values: str/list
        :return:
        """
        from optimus.helpers.functions import get_var_name
        data_values = val_to_list(data_values)

        if len(data_values) <= 1:
            divisor = ""
        elif len(data_values) == 2:
            divisor = " or "
        else:
            divisor = ", "

        raise ValueError(
            "'{var_name}' must be {type}, received '{var_type}'".format(
                var_name=get_var_name(var),
                type=divisor.join(map(lambda x: "'" + x + "'", data_values)),
                var_type=var))
Example #5
    def _add_jars(self, jar):
        if self.jars is None:
            self.jars = []

        if is_list(jar):
            for j in val_to_list(jar):
                self.jars.append(j)
Example #6
def get_output_cols(input_cols, output_cols):
    # Construct input and output columns names
    if is_list(input_cols) and is_list(output_cols):
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
    elif is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            output_cols = [i + output_cols for i in input_cols]
        else:
            output_cols = val_to_list(output_cols)
    elif is_str(input_cols) and is_str(output_cols):
        output_cols = val_to_list(output_cols)
    elif output_cols is None:
        output_cols = input_cols

    return output_cols
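A quick illustration of the naming behaviour, using hypothetical stand-ins for the Optimus helpers (is_list, is_str and val_to_list are inferred from how they are used above, not copied from the library):

def is_list(value):
    return isinstance(value, list)

def is_str(value):
    return isinstance(value, str)

def val_to_list(value):
    return value if isinstance(value, list) else [value]

# With those assumed semantics:
# get_output_cols(["a", "b"], "_copy")  -> ["a_copy", "b_copy"]  (suffix appended to each input)
# get_output_cols("a", "b")             -> ["b"]
# get_output_cols(["a", "b"], None)     -> ["a", "b"]            (outputs default to the inputs)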
Example #7
    def _add_spark_packages(self, packages):
        """
        Define the Spark packages that must be loaded at start time
        :param packages:
        :return:
        """

        for p in val_to_list(packages):
            self.packages.append(p)
Example #8
    def show(self, table_names="*", limit="all"):
        db = self.db

        if table_names == "*":
            table_names = db.tables_names_to_json()
        else:
            table_names = val_to_list(table_names)

        print("Total Tables:" + str(len(table_names)))

        for table_name in table_names:
            db.table_to_df(table_name, "*", limit) \
                .table(title=table_name)
Example #9
def is_column_a(df, column, dtypes):
    """
    Check if a column matches any of a list of data types
    :param df:
    :param column:
    :param dtypes:
    :return:
    """

    data_type = tuple(val_to_list(parse_spark_dtypes(dtypes)))

    column = one_list_to_val(column)

    # Check the column's data type against the accepted types
    return isinstance(df.schema[column].dataType, data_type)
Example #10
def get_spark_dtypes_object(value):
    """
    Get a PySpark data type object from a string representation, for example 'StringType()' from 'string'
    :param value:
    :return:
    """
    value = val_to_list(value)
    try:
        data_type = [SPARK_DTYPES_DICT_OBJECTS[SPARK_SHORT_DTYPES[v]] for v in value]

    except (KeyError, TypeError):
        data_type = value

    data_type = one_list_to_val(data_type)
    return data_type
Example #11
def filter_col_name_by_dtypes(df, data_type):
    """
    Return column names filtered by the column data type
    :param df: Dataframe whose columns are going to be filtered
    :param data_type: Data type(s) used to filter the columns.
    :type data_type: str or list
    :return:
    """
    data_type = parse_spark_dtypes(data_type)

    # isinstance requires a tuple
    data_type = tuple(val_to_list(data_type))

    # Filter columns by data type
    return [c for c in df.columns if isinstance(df.schema[c].dataType, data_type)]
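The same isinstance check can be written directly against PySpark types; a minimal sketch (the dtype-alias resolution done by parse_spark_dtypes is skipped and a StringType tuple is hard-coded):

from pyspark.sql import SparkSession, types as T

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("A", 1)], ["name", "age"])

# Equivalent of filter_col_name_by_dtypes(df, "string") once "string" is resolved to StringType
string_cols = [c for c in df.columns if isinstance(df.schema[c].dataType, (T.StringType,))]
# -> ["name"]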
Example #12
    def is_in(columns, values):
        """
        Filter rows whose column matches any of the given values
        :return: Spark DataFrame
        """

        # Ensure that we have a list
        values = val_to_list(values)

        # Create column/value expression
        column_expr = [(F.col(columns) == v) for v in values]

        # Combine the column/value expressions with a logical OR
        expr = reduce(lambda a, b: a | b, column_expr)

        # Note: `self` appears to come from the enclosing scope (this helper is presumably defined inside a method)
        return self.rows.select(expr)
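The reduce-with-| trick used above, shown as a standalone PySpark filter on a made-up single-column dataframe:

from functools import reduce
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").getOrCreate()
df = spark.createDataFrame([("A",), ("B",), ("C",)], ["letter"])

values = ["A", "C"]
expr = reduce(lambda a, b: a | b, [(F.col("letter") == v) for v in values])
df.filter(expr).show()  # keeps the "A" and "C" rows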
Example #13
def parse_spark_dtypes(value):
    """
    Get a PySpark data type from a string representation, for example 'StringType' from 'string'
    :param value:
    :return:
    """

    value = val_to_list(value)

    try:
        data_type = [SPARK_DTYPES_DICT[SPARK_SHORT_DTYPES[v]] for v in value]

    except KeyError:
        data_type = value

    data_type = one_list_to_val(data_type)
    return data_type
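To make the two-step lookup concrete, here is a hypothetical miniature version of the two dictionaries (the real SPARK_SHORT_DTYPES and SPARK_DTYPES_DICT in Optimus cover many more aliases; these entries are assumptions for illustration only):

from pyspark.sql import types as T

SPARK_SHORT_DTYPES = {"str": "string", "string": "string", "int": "int", "integer": "int"}
SPARK_DTYPES_DICT = {"string": T.StringType, "int": T.IntegerType}

value = ["str", "integer"]
# alias -> canonical short name -> PySpark type class
data_type = [SPARK_DTYPES_DICT[SPARK_SHORT_DTYPES[v]] for v in value]
# -> [StringType, IntegerType] (classes, not instances, ready for isinstance checks)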
Example #14
def validate_columns_names(df, col_names, index=0):
    """
    Check if a string or a list of strings are valid dataframe column names
    :param df: Data frame to be analyzed
    :param col_names: column names to be checked
    :param index: position of the column name inside each tuple, when a list of tuples is passed
    :return:
    """

    columns = val_to_list(col_names)

    if is_list_of_tuples(columns):
        columns = [c[index] for c in columns]

    # Remove duplicates in the list
    if is_list_of_strings(columns):
        columns = OrderedSet(columns)

    check_for_missing_columns(df, columns)

    return True
Example #15
    def __init__(self,
                 session=None,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=None,
                 options=None,
                 additional_options=None,
                 queue_url=None,
                 queue_exchange=None,
                 queue_routing_key="optimus"):
        """
        Transform and roll out
        :param master: 'local', 'local[*]' or the URL/IP address of a cluster master
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include to the session.
        :type jars: (list[str])

        """
        if session is None:
            # print("Creating Spark Session...")
            # If a Spark session is not passed as an argument, create it

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            if packages is None:
                packages = []
            else:
                packages = val_to_list(packages)

            self.packages = packages
            self.repositories = repositories

            if jars is None:
                jars = []

            self.jars = jars
            self.additional_options = additional_options

            self.verbose(verbose)

            # Load Avro.
            # TODO:
            #  If the Spark 2.4 version is going to be used, this is not necessary.
            #  Maybe we can check a priori which version of Spark is going to be used
            # self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

            self._start_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference

            Spark.instance = Spark().load(session)

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read
        self.profiler = Profiler(queue_url=queue_url,
                                 queue_exchange=queue_exchange,
                                 queue_routing_key=queue_routing_key)
        self.ml = ML()

        #
        self._load_css()

        # Set global output as html
        self.output("html")
Example #16
def parse_columns(df, cols_args, get_args=False, is_regex=None, filter_by_column_dtypes=None,
                  accepts_missing_cols=False):
    """
    Return a list of columns and check that columns exists in the dataframe
    Accept '*' as parameter in which case return a list of all columns in the dataframe.
    Also accept a regex.
    If a list of tuples is passed, the first element of each tuple is taken as the column name and the
    remaining elements as params. These params can be used to create custom transformation functions. You can find an example in cols().cast()
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts '*' as param to return all the columns in the dataframe
    :param get_args: if True, also return the params extracted from the tuples
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: Data type(s) used to filter the columns list
    :param accepts_missing_cols: if True, do not check whether the columns exist in the dataframe
    :return: A list of column string names
    """

    attrs = None

    # ensure that cols_args is a list
    # cols_args = val_to_list(cols_args)

    # If cols_args is a regex or '*', resolve it against the dataframe columns
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    elif cols_args == "*" or cols_args is None:
        cols = df.columns


    # Return filtered columns
    # columns_filtered = list(set(columns) - set(columns_filtered))

    # In case we have a list of tuples we use the first element of the tuple is taken as the column name
    # and the rest as params. We can use the param in a custom function as follow
    # def func(attrs): attrs return (1,2) and (3,4)
    #   return attrs[0] + 1
    # df.cols().apply([('col_1',1,2),('cols_2', 3 ,4)], func)

    # Verify if we have a list with tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)

    columns_residual = None

    # If necessary filter the columns by data type
    if is_list_of_strings(filter_by_column_dtypes):
        # Get columns for every data type
        columns_filtered = filter_col_name_by_dtypes(df, filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the filtered data types
        columns_residual = list(OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols
    # final_columns = escape_columns(final_columns)
    # Return cols, or cols and params
    cols_params = []

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        print(",".join(escape_columns(columns_residual)), "column(s) was not processed because is/are not",
              ",".join(filter_by_column_dtypes))

    return cols_params
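A standalone illustration of the tuple-splitting convention described in the docstring (plain Python, no dataframe needed):

cols_args = [("col_1", 1, 2), ("col_2", 3, 4)]

cols = [i[0:1][0] for i in cols_args]   # -> ["col_1", "col_2"]
attrs = [i[1:] for i in cols_args]      # -> [(1, 2), (3, 4)]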