Example 1
def is_column_a(df, column=None, dtypes="str"):
    """
    Check if a column matches a list of data types
    :param df: spark or dask dataframe
    :param column: column to be compared with
    :param dtypes: types to be checked
    :return:
    """
    column = val_to_list(column)

    if len(column) > 1:
        RaiseIt.length_error(column, 1)
    data_type = tuple(val_to_list(parse_dtypes(df, dtypes)))
    column = one_list_to_val(column)

    # Filter columns by data type
    v = df.cols.schema_dtype(column)

    if is_spark_dataframe(df.data):
        result = isinstance(v, data_type)
    elif is_dask_dataframe(df):
        result = v in data_type
    else:
        result = None
    return result
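A minimal usage sketch for the helper above. It assumes the function and an Optimus-style dataframe wrapper (one exposing .cols and .data, as the code expects) are already in scope; the column name and dtype below are illustrative.

# `df` stands for an Optimus-wrapped dataframe; "age" and "int" are example values.
if is_column_a(df, column="age", dtypes="int"):
    print("'age' is an integer column")
else:
    print("'age' is not an integer column")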
Example 2
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list:
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, [float, int]):
        RaiseIt.type_error(input_cols, [float, int])

    df = df.cols.cast(input_cols, "vector")

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
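A usage sketch, assuming the Optimus column extensions (df.cols.cast) are registered on the Spark dataframe; the toy data is illustrative.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (3.0,)], ["feature"])

# normalizer() relies on df.cols.cast(...), an Optimus extension, so a plain
# Spark dataframe needs that extension loaded first.
df_norm = normalizer(df, input_cols="feature", p=2.0)
df_norm.show()  # adds a "feature_normalized" column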
Example 3
def abstract_udf(col,
                 func,
                 func_return_type=None,
                 attrs=None,
                 func_type=None,
                 verbose=False):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to be created or transformed
    :param func: Function to be applied to the data
    :param attrs: Attributes to be passed to the function, if required
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function will try to use pandas_udf if func_type is not defined
    :param verbose: print additional info
    :return: A function, UDF or Pandas UDF
    """

    # By default, try to use a pandas UDF
    if func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    types = ["column_exp", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # if verbose is True:
    #    logging.info("Using '{func_type}' to process column '{column}' with function {func_name}"
    #                 .format(func_type=func_type, column=col, func_name=func.__name__))

    df_func = func_factory(func_type, func_return_type)
    return df_func(attrs, func)(col)
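A sketch of calling the helper above. The column name, the element-wise function and its (value, attr) signature, and the return type are illustrative assumptions; func_factory and is_pyarrow_installed are assumed to come from the same module.

from pyspark.sql.types import DoubleType

def _square(value, attr):
    # Example element-wise function; `attr` would carry the optional attributes.
    return value * value

# Returns a column expression / UDF over the "price" column (names are examples).
price_squared = abstract_udf(col="price", func=_square, func_return_type=DoubleType())
# df.withColumn("price_squared", price_squared)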
Example 4
    def data_loader(self, url, type_of):
        """
        Load data from a URL
        :param url: url string
        :param type_of: format data type
        :return:
        """

        data_loader = None
        if type_of == "csv":
            data_loader = self.csv
        elif type_of == "json":
            data_loader = self.json
        elif type_of == "parquet":
            data_loader = self.parquet
        elif type_of == "avro":
            data_loader = self.avro
        else:
            RaiseIt.type_error(type_of, [
                "csv",
                "json",
                "parquet",
                "avro",
            ])

        i = url.rfind('/')
        data_name = url[(i + 1):]
        data_def = {"displayName": data_name, "url": url}
        return Downloader(data_def).download(data_loader, type_of)
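A hypothetical call; `op` stands in for whatever object defines data_loader above (it must also provide the csv/json/parquet/avro readers the method dispatches to), and the URL is just an example.

url = "https://example.com/data/foo.csv"   # example URL
df = op.data_loader(url, type_of="csv")    # downloads and reads the file as CSV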
Example 5
    def _data_loader(self, url, type_of):
        """
        Select the correct method to download the file depending on the format
        :param url: string url
        :param type_of: format data type
        :return:
        """

        file_format = None
        if type_of == "csv":
            file_format = self.csv
        elif type_of == "json":
            file_format = self.json
        elif type_of == "parquet":
            file_format = self.parquet
        elif type_of == "avro":
            file_format = self.avro
        else:
            RaiseIt.type_error(type_of, ["csv", "json", "parquet", "avro"])

        i = url.rfind('/')
        data_name = url[(i + 1):]
        data_def = {
            "displayName": data_name,
            "url": url
        }
        return Downloader(data_def).download(file_format, type_of)
Example 6
    def delete_check_point_folder(path, file_system):
        """
        Function that deletes the temporary folder where temp files were stored.
        The path required is the same provided by user in setCheckPointFolder().

        :param path: path where the info will be saved
        :param file_system: Describes if file system is local or hadoop file system.
        :return:
        """

        if file_system == "hadoop":
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            logger.print("Deleting checkpoint folder...")
            command = "hadoop fs -rm -r " + folder_path
            os.system(command)
            logger.print("$" + command)
            logger.print("Folder deleted.")
        elif file_system == "local":
            logger.print("Deleting checkpoint folder...")
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            # Checking if the checkpoint folder exists:
            if os.path.isdir(folder_path):
                # Delete the folder if it exists:
                rmtree(folder_path)
                logger.print("Folder deleted.")
            else:
                logger.print("Folder does not exist.")
        else:
            RaiseIt.value_error(file_system, ["hadoop", "local"])
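A usage sketch. Example 28 below calls this helper as Optimus.delete_check_point_folder(...), so it appears to be a static method; the path here is illustrative.

# Remove a previously created local checkpoint folder under /tmp/my_workspace.
Optimus.delete_check_point_folder(path="/tmp/my_workspace", file_system="local")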
Example 7
    def get(driver_type) -> AbstractDriver:
        """
        Returns a driver implementation given a database name

        :param driver_type: name of the database
        :return: a database driver
        """
        if driver_type == DriverProperties.CASSANDRA.value["name"]:
            return CassandraDriver()
        elif driver_type == DriverProperties.MYSQL.value["name"]:
            return MySQLDriver()
        elif driver_type == DriverProperties.ORACLE.value["name"]:
            return OracleDriver()
        elif driver_type == DriverProperties.POSTGRESQL.value["name"]:
            return PostgreSQLDriver()
        elif driver_type == DriverProperties.PRESTO.value["name"]:
            return PrestoDriver()
        elif driver_type == DriverProperties.REDSHIFT.value["name"]:
            return RedshiftDriver()
        elif driver_type == DriverProperties.SQLITE.value["name"]:
            return SQLiteDriver()
        elif driver_type == DriverProperties.SQLSERVER.value["name"]:
            return SQLServerDriver()
        elif driver_type == DriverProperties.BIGQUERY.value["name"]:
            return BigQueryDriver()
        elif driver_type == DriverProperties.IMPALA.value["name"]:
            return ImpalaDriver()
        else:
            RaiseIt.value_error(
                driver_type,
                [database["name"] for database in DriverProperties.list()])
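A sketch of looking up a driver by name. The exact strings come from DriverProperties, so "mysql" below is an assumption about what DriverProperties.MYSQL.value["name"] holds, and DriverFactory is a placeholder name for the class that defines get().

driver = DriverFactory.get("mysql")    # returns MySQLDriver() if "mysql" matches the registered name
print(type(driver).__name__)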
Example 8
    def append(rows):
        """
        Append rows at the end of a dataframe
        :param rows: List of values or tuples to be appended
        :return: Spark DataFrame
        """
        df = self

        if is_list_of_tuples(rows):
            columns = [str(i) for i in range(df.cols.count())]
            new_row = op.Create.df(columns, rows)
            df_result = df.union(new_row)

        elif is_list_of_dataframes(rows) or is_dataframe(rows):
            row = val_to_list(rows)
            row.insert(0, df)
            df_result = append_df(row, like="rows")
        else:
            RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

        df_result = df_result.preserve_meta(self, Actions.NEST.value,
                                            df.cols.names())

        return df_result
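A usage sketch. In Optimus these closures are usually exposed under df.rows, but that attachment is not shown here, so df.rows.append is an assumption; the values are illustrative and must match the dataframe's column count.

# `df` and `other_df` stand for Spark dataframes with the Optimus extensions loaded.
df_with_row = df.rows.append([("Alice", 30)])   # append one row given as a tuple in a list
df_all = df.rows.append(other_df)               # or append the rows of another dataframe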
Example 9
    def sort(col_sort):
        """
        Sort rows taking into account multiple columns
        :param col_sort: column and sort type combination (col_name, "asc")
        :type col_sort: list of tuples
        """
        # If a list of column names is given, order by desc. If you need to specify the order of every
        # column, use a list of tuples (col_name, "asc")
        df = self

        t = []
        if is_list_of_str_or_int(col_sort):
            for col_name in col_sort:
                t.append(tuple([col_name, "desc"]))
            col_sort = t

        func = []
        for cs in col_sort:
            col_name = one_list_to_val(cs[0])
            order = cs[1]

            if order == "asc":
                sort_func = F.asc
            elif order == "desc":
                sort_func = F.desc
            else:
                RaiseIt.value_error(order, ["asc", "desc"])

            func.append(sort_func(col_name))
            df = df.preserve_meta(self, Actions.SORT_ROW.value, col_name)

        df = df.sort(*func)
        return df
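A sketch of the two accepted input shapes; the column names are examples and df.rows.sort as the attachment point is an assumption about how the closure is exposed.

# `df` stands for a Spark dataframe with the Optimus row extensions loaded.
df_sorted = df.rows.sort(["age", "salary"])                     # plain names: each sorted descending by default
df_sorted = df.rows.sort([("age", "asc"), ("salary", "desc")])  # explicit (column, order) tuples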
Example 10
        def cast_factory(cls):

            # Parse to Vector
            if is_type(cls, Vectors):
                func_type = "udf"

                def cast_to_vectors(val, attr):
                    return Vectors.dense(val)

                func_return_type = VectorUDT()
            # Parse standard data types
            elif get_spark_dtypes_object(cls):

                func_type = "column_exp"

                def cast_to_vectors(col_name, attr):
                    return F.col(col_name).cast(get_spark_dtypes_object(cls))

                func_return_type = None

            # Add here any other parse you want
            else:
                RaiseIt.value_error(cls)

            return func_return_type, cast_to_vectors, func_type
Example 11
    def nest(input_cols, output_col, shape="string", separator=""):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: char to be used as the separator when concatenating
        :param shape: final data type, 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """

        df = self

        if has_(input_cols, F.Column):
            # Transform non Column data to lit
            columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
        else:
            columns = parse_columns(self, input_cols)

        if shape == "vector":
            columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

            vector_assembler = VectorAssembler(
                inputCols=columns,
                outputCol=output_col)
            df = vector_assembler.transform(df)

        elif shape == "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape == "string":
            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
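A sketch of the supported shapes; the column names are examples and df.cols.nest as the attachment point is an assumption.

# `df` stands for a Spark dataframe with the Optimus column extensions loaded.
df2 = df.cols.nest(["first_name", "last_name"], output_col="full_name",
                   shape="string", separator=" ")     # concatenate strings with a space
df3 = df.cols.nest(["height", "weight"], output_col="features",
                   shape="vector")                    # pack numeric columns into one vector column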
Example 12
    def nest(input_cols, output_col, shape=None, separator=" "):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: char to be used as the separator when concatenating
        :param shape: final data type, 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """
        columns = parse_columns(self, input_cols)
        df = self

        if shape == "vector":
            vector_assembler = VectorAssembler(inputCols=input_cols,
                                               outputCol=output_col)
            df = vector_assembler.transform(self)

        elif shape == "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape == "string":

            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
Example 13
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list:
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, [float, int]):
        RaiseIt.type_error(input_cols, [float, int])

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [
        Normalizer(inputCol=col_name,
                   outputCol=name_col(col_name, "normalized"),
                   p=p) for col_name in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
Example 14
def append(dfs, like="columns"):
    """
    Concatenate multiple DataFrames column-wise or row-wise
    :param dfs: List of DataFrames
    :param like: concat as columns or rows
    :return:
    """

    # FIX: Because monotonically_increasing_id can create different
    # sequences for different dataframes, the result could be wrong.

    if like == "columns":
        temp_dfs = []
        col_temp_name = "id_" + random_int()

        dfs = val_to_list(dfs)
        for df in dfs:
            temp_dfs.append(
                df.withColumn(col_temp_name, F.monotonically_increasing_id()))

        def _append(df1, df2):
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
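A small end-to-end sketch for the module-level append above, using two toy Spark dataframes; val_to_list and random_int are assumed to come from the same helper module.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df_a = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "letter"])
df_b = spark.createDataFrame([(10,), (20,)], ["value"])

# Column-wise concat: rows are matched on a temporary monotonically increasing id,
# which, as the FIX note above warns, is not guaranteed to line up across dataframes.
wide = append([df_a, df_b], like="columns")

# Row-wise concat requires both dataframes to share the same schema.
tall = append([df_a, df_a], like="rows")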
Example 15
def concat(dfs, like="columns"):
    """
    Concatenate multiple dataframes as columns or rows
    :param dfs:
    :param like: The way the dataframes are going to be concatenated: as columns or as rows
    :return:
    """
    # Add increasing ids to each dataframe; they are assumed to be the same across dataframes.
    if like == "columns":
        temp_dfs = []
        col_temp_name = "id_" + random_int()
        for df in dfs:
            temp_dfs.append(
                df.withColumn(col_temp_name, F.monotonically_increasing_id()))

        def _append_df(df1, df2):
            return df1.join(df2, col_temp_name, "outer")

        df_result = reduce(_append_df, temp_dfs).drop(col_temp_name)

    elif like == "rows":
        df_result = reduce(DataFrame.union, dfs)
    else:
        RaiseIt.value_error(like, ["columns", "rows"])

    return df_result
Example 16
def check_for_missing_columns(df, col_names):
    """
    Check if the columns you want to select exist in the dataframe
    :param df: Dataframe to be checked
    :param col_names: column names to check
    :return:
    """
    missing_columns = list(OrderedSet(col_names) - OrderedSet(df.schema.names))

    if len(missing_columns) > 0:
        RaiseIt.value_error(missing_columns, df.columns)
    return False
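A quick sketch with a toy dataframe; the function returns False when nothing is missing and raises through RaiseIt otherwise (OrderedSet and RaiseIt come from the same helper module).

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1, "a")], ["id", "letter"])

check_for_missing_columns(df, ["id"])         # False: nothing is missing
# check_for_missing_columns(df, ["missing"])  # would raise via RaiseIt.value_error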
Example 17
    def url(self, path=None, type_of="csv"):
        """
        Entry point for loading data from a URL. Check that the URL is well formed
        :param path: string for URL to read
        :param type_of: type of the URL backend (can be csv or json)
        :return: pyspark dataframe from URL.
        """

        if "https://" in str(path) or "http://" in str(path) or "file://" in str(path):
            return self._data_loader(str(path), type_of)
        else:
            RaiseIt.value_error(path, ["https://", "http://", "file://"])
Example 18
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the expected number
    :param columns:
    :param number: Number of columns to check
    :return:
    """
    if columns is None:
        RaiseIt.value_error(
            columns, ["str", "list"],
            extra_text=
            "Maybe the columns selected do not match a specified datatype filter."
        )

    if isinstance(columns, zip):
        columns = list(columns)

    count = len(columns)

    if number == "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["1 or greater"])
    elif number == ">1":
        if not len(columns) > 1:
            RaiseIt.value_error(len(columns), ["more than 1"])
    elif len(columns) != number:
        RaiseIt.value_error(count,
                            "{} columns received, {} needed".format(count, number))
Example 19
    def to_file(self, path=None, output="html"):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, "str")

        # We need to append some extra HTML tags to display it correctly in the browser.
        if output == "html":
            if self.html is None:
                RaiseIt.not_ready_error(
                    "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
                )

            write_html(HEADER + self.html + FOOTER, path)
        elif output == "json":
            if self.json is None:
                RaiseIt.not_ready_error(
                    "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
                )

            write_json(self.json, path)
        else:
            RaiseIt.type_error(output, ["html", "json"])
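A usage sketch; `profiler` stands for the object exposing to_file, and running the profiler first mirrors the hint in the not_ready_error message above.

# profiler.run(df, "*")  # populates profiler.html / profiler.json (per the error hint above)
profiler.to_file("report.html", output="html")
profiler.to_file("report.json", output="json")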
Example 20
    def sort(order="asc"):
        """
        Sort the dataframe columns in ascending or descending order
        :param order: 'asc' or 'desc' accepted
        :return: Spark DataFrame
        """

        if order == "asc":
            sorted_col_names = sorted(self.columns)
        elif order == "desc":
            sorted_col_names = sorted(self.columns, reverse=True)
        else:
            RaiseIt.value_error(order, ["asc", "desc"])

        return self.select(sorted_col_names)
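A sketch, assuming this closure is exposed as df.cols.sort on an Optimus-extended Spark dataframe.

df_asc = df.cols.sort()          # columns reordered alphabetically
df_desc = df.cols.sort("desc")   # reverse alphabetical order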
Example 21
def table_name(self, name=None):
    """
    Create a temp view for a data frame
    :param self:
    :param name:
    :return:
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
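A sketch assuming table_name is attached to Spark dataframes (it calls self.createOrReplaceTempView), so the registered view can be queried with Spark SQL.

df.table_name("people")                               # "people" is an example view name
spark.sql("SELECT COUNT(*) AS n FROM people").show()  # `spark` is the active SparkSession (assumption)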
Example 22
def check_column_numbers(columns, number=0):
    """
    Check if the number of columns matches the expected number
    :param columns:
    :param number: Number of columns to check
    :return:
    """
    count = len(columns)

    if number == "*":
        if not len(columns) >= 1:
            RaiseIt.value_error(len(columns), ["1 or greater"])
    elif len(columns) != number:
        RaiseIt.value_error(count, "Received {} columns, {} needed".format(count, number))
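A quick sketch of the accepted forms of the number argument in this version (the fuller variant in Example 18 also accepts ">1"); the column names are illustrative.

cols = ["name", "age"]

check_column_numbers(cols, 2)      # passes: exactly two columns
check_column_numbers(cols, "*")    # passes: one or more columns
# check_column_numbers(cols, 3)    # would raise via RaiseIt.value_error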
Example 23
def set_name(self, value=None):
    """
    Create a temp view for a data frame also used in the json output profiling
    :param self:
    :param value:
    :return:
    """
    if not is_str(value):
        RaiseIt.type_error(value, ["string"])

    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])

    self._name = value
    self.createOrReplaceTempView(value)
Example 24
def get_output_cols(input_cols, output_cols):
    # Construct input and output column names
    if is_list(input_cols) and is_list(output_cols):
        if len(input_cols) != len(output_cols):
            RaiseIt.length_error(input_cols, output_cols)
    elif is_list(input_cols) and is_str(output_cols):
        if len(input_cols) > 1:
            output_cols = list([i + output_cols for i in input_cols])
        else:
            output_cols = val_to_list(output_cols)
    elif is_str(input_cols) and is_str(output_cols):
        output_cols = val_to_list(output_cols)
    elif output_cols is None:
        output_cols = input_cols

    return output_cols
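The pairing rules above are easiest to read off from concrete calls (all values are illustrative).

get_output_cols(["a", "b"], ["x", "y"])   # -> ["x", "y"]         (lengths must match)
get_output_cols(["a", "b"], "_new")       # -> ["a_new", "b_new"]  (suffix per input column)
get_output_cols("a", "b")                 # -> ["b"]
get_output_cols(["a", "b"], None)         # -> ["a", "b"]          (defaults to the inputs)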
Example 25
    def to_file(self, path=None, output=None):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, ["Invalid file path"])

        # We need to append some extra HTML tags to display it correctly in the browser.
        if output == "html":
            assert self.html is not None, "Please run the profiler first"

            header = '''<!doctype html>
<html class="no-js" lang="">

<head>
  <meta charset="utf-8">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  <title></title>
  <meta name="description" content="">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

  <link rel="manifest" href="site.webmanifest">
  <link rel="apple-touch-icon" href="icon.png">
  <!-- Place favicon.ico in the root directory -->

  <link rel="stylesheet" href="css/normalize.css">
  <link rel="stylesheet" href="css/main.css">
</head>

<body>'''

            footer = '''</body></html>'''

            write_html(header + self.html + footer, path)
        elif output == "json":
            assert self.json is not None, "Please run the profiler first"

            write_json(self.json, path)
        else:
            RaiseIt.type_error(output, ["html", "json"])
Example 26
def absolute_path(files, format="posix"):
    """
    Use the project base folder to construct an absolute path
    :param files: path files
    :param format: posix or uri
    :return:
    """
    files = val_to_list(files)
    if format == "uri":
        result = [Path(ROOT_DIR + file).as_uri() for file in files]
    elif format == "posix":
        result = [Path(ROOT_DIR + file).as_posix() for file in files]
    else:
        RaiseIt.value_error(format, ["posix", "uri"])

    result = one_list_to_val(result)
    return result
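A sketch of both formats; ROOT_DIR is the module-level project base folder the function prepends, and the file paths below are illustrative.

absolute_path("/tests/data.csv")                    # -> "<ROOT_DIR>/tests/data.csv" (posix)
absolute_path(["/a.csv", "/b.csv"], format="uri")   # -> ["file://.../a.csv", "file://.../b.csv"]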
Example 27
def is_column_a(df, column, dtypes):
    """
    Check if a column matches a list of data types
    :param df: dataframe
    :param column: column to be compared with
    :param dtypes: types to be checked
    :return:
    """
    column = val_to_list(column)

    if len(column) > 1:
        RaiseIt.length_error(column, 1)

    data_type = tuple(val_to_list(parse_spark_dtypes(dtypes)))
    column = one_list_to_val(column)

    # Filter columns by data type
    return isinstance(df.schema[column].dataType, data_type)
Example 28
    def _set_check_point_folder(path, file_system):
        """
        Function that receives a workspace path where a folder is created.
        This folder will store temporary dataframes when the user calls .checkPoint().

        :param path: Location of the dataset (string).
        :param file_system: Describes if file system is local or hadoop file system.

        """

        print_check_point_config(file_system)

        if file_system == "hadoop":
            folder_path = path + "/" + "checkPointFolder"
            Optimus.delete_check_point_folder(path=path,
                                              file_system=file_system)

            # Creating file:
            logger.print("Creating the hadoop folder...")
            command = "hadoop fs -mkdir " + folder_path
            logger.print("$" + command)
            os.system(command)
            logger.print("Hadoop folder created. \n")

            logger.print("Setting created folder as checkpoint folder...")
            Spark.instance.sc.setCheckpointDir(folder_path)
        elif file_system == "local":
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            # Checking if tempFolder exists:
            logger.print("Deleting previous folder if exists...")
            if os.path.isdir(folder_path):
                # Delete the folder if it exists:
                rmtree(folder_path)

            logger.print("Creating the checkpoint directory...")
            # Creates new folder:
            os.mkdir(folder_path)

            Spark.instance.sc.setCheckpointDir(dirName="file:///" +
                                               folder_path)
        else:
            RaiseIt.value_error(file_system, ["hadoop", "local"])
Example 29
    def compare(df1, df2, method="json"):
        """
        Compare 2 Spark dataframes
        :param df1:
        :param df2:
        :param method: json or collect
        :return:
        """
        if method == "json":
            diff = DeepDiff(df1.to_json(), df2.to_json(), ignore_order=False)
            print_json(diff)
        elif method == "collect":
            if df1.collect() == df2.collect():
                print("Dataframes are equal")
                return True
            else:
                print("Dataframes are not equal. Use the 'json' method to check for differences")
                return False

        else:
            RaiseIt.type_error(method, ["json", "collect"])
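A usage sketch; df1 and df2 stand for two Optimus-extended Spark dataframes (the 'json' path relies on a to_json() extension), and compare is assumed to be reachable as a static helper.

equal = compare(df1, df2, method="collect")   # True / False
if not equal:
    compare(df1, df2, method="json")          # prints a DeepDiff report of the differences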
Example 30
def abstract_udf(col, func, func_return_type=None, args=None, func_type=None):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to be created or transformed
    :param func: Function to be applied to the data
    :param args: Attributes to be passed to the function, if required
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function will try to use pandas_udf if func_type is not defined
    :return: A function, UDF or Pandas UDF
    """

    if func_return_type is None:
        func_type = "column_expr"
    # By default, try to use a pandas UDF
    elif func_type is None and is_pyarrow_installed() is True:
        func_type = "pandas_udf"

    types = ["column_expr", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # Handle whether the func param is a plain expression or a function returning an expression
    def func_col_exp(col_name, attr):
        return func

    if is_column(func):
        _func = func_col_exp
    else:
        _func = func
    logger.print(
        "Using '{func_type}' to process column '{column}' with function {func_name}"
        .format(func_type=func_type, column=col, func_name=_func.__name__))

    df_func = func_factory(func_type, func_return_type)
    if not is_tuple(args):
        args = (args, )

    return df_func(_func, args)(col)