Exemple #1
0
    def to_file(self, path=None, output="html"):
        """
        Save profiler data to a file in the specified format (html, json).

        Validation problems are reported through the RaiseIt helpers
        (presumably they raise; confirm against the RaiseIt implementation).

        :param path: filename in which the data will be saved
        :param output: "html" or "json"
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, "str")

        not_ready_message = "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"

        # Strings are compared with ==. `is` checks object identity, which only
        # matches when the interpreter happens to intern the caller's string.
        if output == "html":
            if self.html is None:
                RaiseIt.not_ready_error(not_ready_message)

            # We need to add some extra html tags to display it correctly in the browser.
            write_html(HEADER + self.html + FOOTER, path)
        elif output == "json":
            if self.json is None:
                RaiseIt.not_ready_error(not_ready_message)

            write_json(self.json, path)
        else:
            RaiseIt.type_error(output, ["html", "json"])
Exemple #2
0
    def run(self, df, columns, buckets=20):
        """
        Profile the requested columns, display the report as HTML and
        write the profiling data as json to self.path.

        :param df: Dataframe to be profiled
        :param columns: Columns to be profiled (resolved with parse_columns)
        :param buckets: Forwarded to Profiler.to_json
        :return:
        """

        columns = parse_columns(df, columns)
        output = Profiler.to_json(df, columns, buckets)

        # Jinja reads the templates from the folder next to this module
        module_dir = os.path.dirname(os.path.abspath(__file__))
        loader = jinja2.FileSystemLoader(searchpath=module_dir + "//templates")
        env = jinja2.Environment(loader=loader, autoescape=True)

        # The report is collected as fragments and joined once at the end.
        # First fragment: the profiler info header.
        header_template = env.get_template("general_info.html")
        fragments = [header_template.render(data=output)]

        column_template = env.get_template("one_column.html")

        # One fragment per column with its optional plots
        for col_name in columns:
            col_stats = output["columns"][col_name]

            hist_pic = (plot_hist({col_name: col_stats["hist"]},
                                  output="base64")
                        if "hist" in col_stats else None)
            freq_pic = (plot_freq({col_name: col_stats["frequency"]},
                                  output="base64")
                        if "frequency" in col_stats else None)

            fragments.append(
                column_template.render(data=col_stats,
                                       hist_pic=hist_pic,
                                       freq_pic=freq_pic))

        # Last fragment: a table preview of the dataframe
        fragments.append(df.table_html(10))
        html = "".join(fragments)

        # Display HTML
        display(HTML(html))

        # Save to file
        write_json(output, self.path)
Exemple #3
0
    def to_file(self, path=None, output=None):
        """
        Save profiler data to a file in the specified format (html, json)
        :param output: html or json
        :param path: filename in which the data will be saved
        :return:
        """

        if path is None:
            RaiseIt.value_error(path, ["Invalid file path"])

        # We need to append a some extra html tags to display it correctly in the browser.
        if output is "html":
            if self.html is None:
                assert self.html is not None, "Please run the profiler first"

            header = '''<!doctype html>
<html class="no-js" lang="">

<head>
  <meta charset="utf-8">
  <meta http-equiv="x-ua-compatible" content="ie=edge">
  <title></title>
  <meta name="description" content="">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

  <link rel="manifest" href="site.webmanifest">
  <link rel="apple-touch-icon" href="icon.png">
  <!-- Place favicon.ico in the root directory -->

  <link rel="stylesheet" href="css/normalize.css">
  <link rel="stylesheet" href="css/main.css">
</head>

<body>'''

            footer = '''</body></html>'''

            write_html(header + self.html + footer, path)
        elif output is "json":
            if self.json is None:
                assert self.json is not None, "Please run the profiler first"

            write_json(self.json, path)
        else:
            print("sdf")
            RaiseIt.type_error(output, ["html", "json"])
Exemple #4
0
    def json(df, columns, buckets=20, path=None):
        """
        Build the profiling data, write it to a json file and return it.

        :param df: Dataframe to be processed
        :param columns: Columns forwarded to Profiler.columns
        :param buckets: Buckets forwarded to Profiler.columns
        :param path: Path where the json is going to be saved. When None,
            "data.json" in the current working directory is used
        :return: The profiling data that was written to the file
        """
        dataset_info = Profiler.dataset_info(df)
        profile = Profiler.columns(df, columns, buckets)

        # The dataset level information is stored under the "summary" key
        profile["summary"] = dataset_info

        target = Path.cwd() / "data.json" if path is None else path
        write_json(profile, path=target)

        return profile
Exemple #5
0
    def run(self, df, columns, buckets=40, infer=False, relative_error=1):
        """
        Return dataframe statistical information in HTML Format

        The rendered report is printed with print_html and kept in self.html;
        the profiling data is kept in self.json, written to self.path and,
        when self.queue_url is set, also passed to self.to_queue.

        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :return:
        """

        columns = parse_columns(df, columns)
        output = Profiler.to_json(df, columns, buckets, infer, relative_error)

        # Load jinja
        path = os.path.dirname(os.path.abspath(__file__))
        template_loader = jinja2.FileSystemLoader(searchpath=path +
                                                  "//templates")
        template_env = jinja2.Environment(loader=template_loader,
                                          autoescape=True)

        # Render template
        # Create the profiler info header
        general_template = template_env.get_template("general_info.html")
        fragments = [general_template.render(data=output)]

        template = template_env.get_template("one_column.html")

        # Date histograms are split in these parts, plotted in this order
        date_parts = ("years", "months", "weekdays", "hours", "minutes")

        # Create every column stats
        for col_name in columns:
            # Must be a dict because it is expanded with ** below. Using None
            # here made render() fail with TypeError for columns without "hist".
            hist_pic = {}
            col = output["columns"][col_name]
            if "hist" in col:
                if col["column_dtype"] == "date":
                    for part in date_parts:
                        hist_pic["hist_" + part] = plot_hist(
                            {col_name: col["hist"][part]}, "base64", part)
                else:
                    hist = plot_hist({col_name: col["hist"]}, output="base64")
                    hist_pic = {"hist_pic": hist}

            if "frequency" in col:
                freq_pic = plot_freq({col_name: col["frequency"]},
                                     output="base64")
            else:
                freq_pic = None

            fragments.append(
                template.render(data=col, freq_pic=freq_pic, **hist_pic))

        fragments.append(df.table_html(10))
        html = "".join(fragments)

        # Display HTML
        print_html(html)

        # send to queue
        if self.queue_url is not None:
            self.to_queue(output)

        # JSON
        # Save in case we want to output to a json file
        self.json = output

        # Save file in json format
        write_json(output, self.path)

        # Save in case we want to output to a html file
        self.html = html
Exemple #6
0
    def run(self,
            df,
            columns="*",
            buckets=MAX_BUCKETS,
            infer=False,
            relative_error=RELATIVE_ERROR,
            approx_count=True,
            mismatch=None):
        """
        Profile a dataframe, print the report as HTML and store the results.

        The rendered report is kept in self.html, the profiling data in
        self.json, and the data is also written as json to self.path.

        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :param approx_count: Use approx_count_distinct or countDistinct
        :param mismatch: Forwarded unchanged to self.dataset
        :return: self
        """

        columns = parse_columns(df, columns)

        output = self.dataset(df, columns, buckets, infer, relative_error,
                              approx_count, format="dict", mismatch=mismatch)

        # Jinja environment reading the profiler output templates
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(
                searchpath=absolute_path("/profiler/templates/out")),
            autoescape=True)

        # The report is collected as fragments; the general info header is first
        header_template = env.get_template("general_info.html")
        fragments = [header_template.render(data=output)]

        column_template = env.get_template("one_column.html")

        # (template key, histogram part) pairs for date columns, in plot order
        date_histograms = (
            ("hist_years", "years"),
            ("hist_months", "months"),
            ("hist_weekdays", "weekdays"),
            ("hist_hours", "hours"),
            ("hist_minutes", "minutes"),
        )

        # One fragment per column
        for col_name in columns:
            col = output["columns"][col_name]
            hist_pic = None
            freq_pic = None

            if "hist" in col["stats"]:
                hist_dict = col["stats"]["hist"]
                dtype = col["column_dtype"]

                if dtype == "date":
                    hist_pic = {}
                    for key, part in date_histograms:
                        hist_pic[key] = plot_hist(
                            {col_name: hist_dict[part]}, "base64", part)
                elif dtype in ("int", "string", "decimal"):
                    hist_pic = {
                        "hist_numeric_string":
                            plot_hist({col_name: hist_dict}, output="base64")
                    }

            if "frequency" in col:
                freq_pic = plot_frequency({col_name: col["frequency"]},
                                          output="base64")

            fragments.append(
                column_template.render(data=col,
                                       freq_pic=freq_pic,
                                       hist_pic=hist_pic))

        # Keep the full report (columns plus a table preview) for html export
        self.html = "".join(fragments) + df.table_html(10)

        # Display HTML
        print_html(self.html)

        # Keep the raw data for json export and write it to disk as well
        self.json = output
        write_json(output, self.path)

        return self