Example no. 1
    def to_file(self, path=None, output="html"):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, "str")

        # We need to append some extra HTML tags so the report displays correctly in the browser.
        if output == "html":
            if self.html is None:
                RaiseIt.not_ready_error(
                    "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
                )

            write_html(HEADER + self.html + FOOTER, path)
        elif output == "json":
            if self.json is None:
                RaiseIt.not_ready_error(
                    "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"
                )

            write_json(self.json, path)
        else:

            RaiseIt.type_error(output, ["html", "json"])
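A minimal usage sketch for the exporter above, assuming an Optimus-style setup in which op.profiler.run(df, '*') (as suggested by the error message) populates the profiler before exporting; the input file and output paths are hypothetical:

from optimus import Optimus

op = Optimus()
df = op.load.csv("data/sample.csv")  # hypothetical input file

# The profiler must run before to_file() has anything to export
op.profiler.run(df, "*")

# Export the same profile in both supported formats
op.profiler.to_file(path="profile.html", output="html")
op.profiler.to_file(path="profile.json", output="json")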
Example no. 2
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list:
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, [float, int]):
        RaiseIt.type_error(input_cols, [float, int])

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [
        Normalizer(inputCol=col_name,
                   outputCol=name_col(col_name, "normalized"),
                   p=p) for col_name in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
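A usage sketch for the helper above; it assumes the dataframe was created through Optimus (so the df.cols accessor used inside the function works) and that normalizer is importable from wherever this snippet lives. The create.df call mirrors the op.Create.df(columns, rows) pattern used in the append example below, but the exact signature may differ across versions:

from optimus import Optimus

op = Optimus()
# Two numeric columns to normalize (names and values are illustrative)
df = op.create.df(["x", "y"], [(1.0, 0.5), (2.0, 1.5)])

# Normalize both columns with the default L2 norm (p=2.0)
df_norm = normalizer(df, input_cols=["x", "y"], p=2.0)
df_norm.show()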
Example no. 3
    def append(rows):
        """
        Append rows at the end of a dataframe
        :param rows: List of tuples or dataframes to be appended
        :return: Spark DataFrame
        """
        df = self

        if is_list_of_tuples(rows):
            # Build positional column names and append the new rows
            columns = [str(i) for i in range(df.cols.count())]
            new_row = op.Create.df(columns, rows)
            df_result = df.union(new_row)

        elif is_list_of_dataframes(rows) or is_dataframe(rows):
            row = val_to_list(rows)
            row.insert(0, df)
            df_result = append_df(row, like="rows")
        else:
            RaiseIt.type_error(rows, ["list of tuples", "list of dataframes"])

        df_result = df_result.preserve_meta(self, Actions.NEST.value,
                                            df.cols.names())

        return df_result
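A sketch of calling the row-append helper, assuming it is exposed as df.rows.append on an Optimus-wrapped dataframe (the usual accessor pattern for these helpers); column names and values are illustrative:

from optimus import Optimus

op = Optimus()
df = op.create.df(["name", "age"], [("Alice", 34), ("Bob", 28)])

# Append one new row, given as a list of tuples matching the column order
df2 = df.rows.append([("Carol", 41)])
df2.show()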
Example no. 4
    def _data_loader(self, url, type_of):
        """
        Select the correct method to download the file depending on the format
        :param url: string url
        :param type_of: format data type
        :return:
        """

        file_format = None
        if type_of == "csv":
            file_format = self.csv
        elif type_of == "json":
            file_format = self.json
        elif type_of == "parquet":
            file_format = self.parquet
        elif type_of == "avro":
            file_format = self.avro
        else:
            RaiseIt.type_error(type_of, ["csv", "json", "parquet", "avro"])

        i = url.rfind('/')
        data_name = url[(i + 1):]
        data_def = {
            "displayName": data_name,
            "url": url
        }
        return Downloader(data_def).download(file_format, type_of)
Example no. 5
    def data_loader(self, url, type_of):
        """
        Load data from a URL
        :param url: url string
        :param type_of: format data type
        :return:
        """

        data_loader = None
        if type_of == "csv":
            data_loader = self.csv
        elif type_of == "json":
            data_loader = self.json
        elif type_of == "parquet":
            data_loader = self.parquet
        elif type_of == "avro":
            data_loader = self.avro
        else:
            RaiseIt.type_error(type_of, [
                "csv",
                "json",
                "parquet",
                "avro",
            ])

        i = url.rfind('/')
        data_name = url[(i + 1):]
        data_def = {"displayName": data_name, "url": url}
        return Downloader(data_def).download(data_loader, type_of)
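A usage sketch for the loader above; the op.load attachment point and the URL are assumptions, and type_of dispatches to the matching reader exactly as in the if/elif chain:

from optimus import Optimus

op = Optimus()

# Hypothetical URL; any reachable parquet file would work
url = "https://example.com/datasets/events.parquet"

# Downloads the file, then loads it with the parquet reader
df = op.load.data_loader(url, type_of="parquet")
df.show()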
Example no. 6
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list:
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, [float, int]):
        RaiseIt.type_error(input_cols, [float, int])

    df = df.cols.cast(input_cols, "vector")

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
Example no. 7
    def url(self, path=None, type_of="csv"):
        """
        Entry point for loading data from a URL. Checks that the URL is well formed
        :param path: string for URL to read
        :param type_of: type of the URL backend (can be csv or json)
        :return: pyspark dataframe from URL.
        """

        if "https://" in str(path) or "http://" in str(path) or "file://" in str(path):
            return self._data_loader(str(path), type_of)
        else:
            RaiseIt.value_error(path, ["https://", "http://", "file://"])
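A sketch of the public URL entry point above, again assuming an op.load accessor; the scheme check accepts https://, http:// and file:// URLs, and the URL here is hypothetical:

from optimus import Optimus

op = Optimus()

# Loads a CSV over HTTPS; type_of selects the downstream reader
df = op.load.url("https://example.com/data/foo.csv", type_of="csv")
df.show()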
Example no. 8
def set_name(self, value=None):
    """
    Create a temp view for a dataframe; the name is also used in the JSON profiling output
    :param self:
    :param value: name of the temp view
    :return:
    """
    # Validate the name before storing it and creating the view
    if not is_str(value):
        RaiseIt.type_error(value, ["string"])

    if len(value) == 0:
        RaiseIt.value_error(value, ["> 0"])

    self._name = value

    self.createOrReplaceTempView(value)
Example no. 9
def table_name(self, name=None):
    """
    Create a temp view for a data frame
    :param self:
    :param name: name of the temp view
    :return:
    """
    if not is_str(name):
        RaiseIt.type_error(name, ["string"])

    if len(name) == 0:
        RaiseIt.value_error(name, ["> 0"])

    self.createOrReplaceTempView(name)
    return self
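A short sketch of how the temp-view helpers above are typically used: register the dataframe under a name, then query it with Spark SQL. It assumes df is an existing Spark dataframe with the table_name method attached and an 'age' column (illustrative):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Register the dataframe as a temp view named "people"
df = df.table_name("people")

# The view can now be queried through the active SparkSession
spark.sql("SELECT * FROM people WHERE age > 30").show()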
Example no. 10
    def to_file(self, path=None, output=None):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, ["Invalid file path"])

        # We need to append some extra HTML tags so the report displays correctly in the browser.
        if output == "html":
            # The profiler must be run before the report can be exported
            assert self.html is not None, "Please run the profiler first"

            header = '''<!doctype html>
<html class="no-js" lang="">

<head>
  <meta charset="utf-8">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  <title></title>
  <meta name="description" content="">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

  <link rel="manifest" href="site.webmanifest">
  <link rel="apple-touch-icon" href="icon.png">
  <!-- Place favicon.ico in the root directory -->

  <link rel="stylesheet" href="css/normalize.css">
  <link rel="stylesheet" href="css/main.css">
</head>

<body>'''

            footer = '''</body></html>'''

            write_html(header + self.html + footer, path)
        elif output == "json":
            assert self.json is not None, "Please run the profiler first"

            write_json(self.json, path)
        else:
            RaiseIt.type_error(output, ["html", "json"])
Example no. 11
    def compare(df1, df2, method="json"):
        """
        Compare 2 Spark dataframes
        :param df1:
        :param df2:
        :param method: 'json' or 'collect'
        :return:
        """
        if method == "json":
            diff = DeepDiff(df1.to_json(), df2.to_json(), ignore_order=False)
            print_json(diff)
        elif method == "collect":
            if df1.collect() == df2.collect():
                print("Dataframes are equal")
                return True
            else:
                print("Dataframes not equal. Use 'json' param to check for diffrences")
                return False

        else:
            RaiseIt.type_error(method, ["json", "collect"])
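A sketch of comparing two dataframes with the helper above; df1 and df2 are assumed to be existing Spark dataframes (built, for example, with the op.create.df pattern from the earlier examples), and compare is called here as a plain function although the indentation suggests it may be exposed as a method:

# Quick equality check by collecting both dataframes to the driver
are_equal = compare(df1, df2, method="collect")

# If they differ, print a structural diff of their JSON representations
if not are_equal:
    compare(df1, df2, method="json")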
Example no. 12
def parse_columns(df,
                  cols_args,
                  get_args=False,
                  is_regex=None,
                  filter_by_column_dtypes=None,
                  accepts_missing_cols=False,
                  invert=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.
    Accepts '*' as a parameter, in which case it returns a list of all columns in the dataframe.
    Also accepts a regex.
    If given a list of tuples, the first element of each tuple is taken as the column name and the remaining
    elements as params. These params can be used to create custom transformation functions. You can find an
    example in cols().cast()
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts '*' as a param to return all the string columns in the dataframe
    :param get_args: If True, also return the params extracted from a list of tuples
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: A data type (or list of data types) used to filter the columns list
    :param accepts_missing_cols: If True, do not check whether the columns exist in the dataframe
    :param invert: Invert the final selection. For example, if you want to select the non-integer columns

    :return: A list of column string names
    """

    if not is_dataframe(df):
        RaiseIt.type_error(df, "Dataframe")
    attrs = None

    # If cols_args is a regex, match it against the dataframe columns
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    # If the columns value is '*' (or None), get all the dataframe columns
    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column name
    # and the rest as params. We can use the params in a custom function as follows:
    # def func(attrs):  # attrs will receive (1, 2) and (3, 4)
    #     return attrs[0] + 1
    # df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)

    # Verify if we have a list with tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_list(filter_by_column_dtypes):
        filter_by_column_dtypes = [
            item for sublist in filter_by_column_dtypes for item in sublist
        ]

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type

        columns_filtered = filter_col_name_by_dtypes(df,
                                                     filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the requested data types
        columns_residual = list(
            OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    cols_params = []

    if invert:
        final_columns = list(
            OrderedSet(df.cols.names()) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) was not processed because is/are not",
                     ",".join(filter_by_column_dtypes))

    # If filtering left us with 0 columns, return None
    if len(cols_params) == 0:
        cols_params = None
        logger.print("Outputting 0 columns after filtering. Is this expected?")

    return cols_params
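A few illustrative calls to parse_columns, assuming an Optimus-wrapped dataframe df with a string column 'name' and an integer column 'age'; the expected results in the comments follow from the branches above, and the data-type string is illustrative:

# '*' (or None) returns every column
parse_columns(df, "*")                                   # -> ['name', 'age']

# A single column name is normalized to a list
parse_columns(df, "name")                                # -> ['name']

# Keep only the columns whose data type matches the filter
parse_columns(df, "*", filter_by_column_dtypes="int")    # -> ['age']

# A list of tuples: column names plus per-column params
cols, attrs = parse_columns(df, [("age", 1, 2)], get_args=True)
# cols  -> ['age']
# attrs -> [(1, 2)]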