コード例 #1
0
    def hist(columns=None, buckets=10):
        """
        Plot one histogram image per selected column.

        :param columns: Columns to plot; resolved through parse_columns and
            restricted to the dtypes listed in PYSPARK_NUMERIC_TYPES
        :param buckets: Number of buckets passed to self.cols.hist
        :return: None
        """
        selected = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

        # Every column gets its own plot, keyed by the column name.
        for name in selected:
            plot_hist({name: self.cols.hist(name, buckets)}, output="image")
コード例 #2
0
    def hist(columns=None, buckets=10):
        """
        Plot one histogram image per selected column.

        :param columns: Columns to plot; resolved through parse_columns
        :param buckets: Number of buckets passed to self.cols.hist
        :return: None
        """
        selected = parse_columns(self, columns)

        # Every column gets its own plot, keyed by the column name.
        for name in selected:
            plot_hist({name: self.cols.hist(name, buckets)}, output="image")
コード例 #3
0
ファイル: plots.py プロジェクト: niteshnicholas/Optimus
    def hist(columns=None, buckets=10, output=None, path=None):
        """
        Plot one histogram per selected column.

        :param columns: Columns to plot; resolved through parse_columns and
            restricted to the dtypes listed in PYSPARK_NUMERIC_TYPES
        :param buckets: Number of buckets passed to self.cols.hist
        :param output: Forwarded unchanged to plot_hist as ``output``
        :param path: Forwarded unchanged to plot_hist as ``path``
        :return: None
        """
        selected = parse_columns(self, columns, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)
        check_column_numbers(selected, "*")

        # Compute each column's histogram, then hand it to the plotting helper.
        for name in selected:
            histogram = self.cols.hist(name, buckets)
            plot_hist({name: histogram}, output=output, path=path)
コード例 #4
0
    def run(self, df, columns, buckets=20):
        """
        Build the profiling report of a dataframe, display it as HTML and
        write the collected statistics to self.path as JSON.

        :param df: Dataframe to profile
        :param columns: Columns to include; resolved through parse_columns
        :param buckets: Bucket count forwarded to Profiler.to_json
        :return: None
        """
        columns = parse_columns(df, columns)
        output = Profiler.to_json(df, columns, buckets)

        # Templates are loaded from the "templates" directory next to this module
        base_dir = os.path.dirname(os.path.abspath(__file__))
        env = jinja2.Environment(
            loader=jinja2.FileSystemLoader(searchpath=base_dir + "//templates"),
            autoescape=True)

        # The report opens with the general profiler header
        html = ""
        header_template = env.get_template("general_info.html")
        html += header_template.render(data=output)

        column_template = env.get_template("one_column.html")

        # One section per column; a plot is only produced when its data exists
        for col_name in columns:
            stats = output["columns"][col_name]

            hist_pic = (plot_hist({col_name: stats["hist"]}, output="base64")
                        if "hist" in stats else None)
            freq_pic = (plot_freq({col_name: stats["frequency"]}, output="base64")
                        if "frequency" in stats else None)

            html += column_template.render(data=stats,
                                           hist_pic=hist_pic,
                                           freq_pic=freq_pic)

        # Close the report with a 10-row table of the dataframe
        html += df.table_html(10)

        # Show the report
        display(HTML(html))

        # Persist the statistics
        write_json(output, self.path)
コード例 #5
0
ファイル: profiler.py プロジェクト: niteshnicholas/Optimus
    def run(self, df, columns, buckets=40, infer=False, relative_error=1):
        """
        Profile a dataframe, print the report as HTML and keep the results.

        The statistics are stored in self.json and written to self.path, the
        rendered report is stored in self.html and, when self.queue_url is
        set, the statistics are also sent through self.to_queue.

        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :return: None
        """

        columns = parse_columns(df, columns)
        output = Profiler.to_json(df, columns, buckets, infer, relative_error)

        # Load jinja
        path = os.path.dirname(os.path.abspath(__file__))
        template_loader = jinja2.FileSystemLoader(searchpath=path +
                                                  "//templates")
        template_env = jinja2.Environment(loader=template_loader,
                                          autoescape=True)

        # Render template
        # Create the profiler info header
        html = ""
        general_template = template_env.get_template("general_info.html")
        html = html + general_template.render(data=output)

        template = template_env.get_template("one_column.html")

        # Date histograms are split into these parts, one plot each
        date_parts = ("years", "months", "weekdays", "hours", "minutes")

        # Create every column stats
        for col_name in columns:
            col = output["columns"][col_name]

            # Must be a mapping: it is expanded with ** below. Starting from
            # None made render() raise TypeError for columns without "hist".
            hist_pic = {}
            if "hist" in col:
                if col["column_dtype"] == "date":
                    # Keys are hist_years, hist_months, ... as the template expects
                    hist_pic = {
                        "hist_" + part: plot_hist(
                            {col_name: col["hist"][part]}, "base64", part)
                        for part in date_parts
                    }
                else:
                    hist = plot_hist({col_name: col["hist"]}, output="base64")
                    hist_pic = {"hist_pic": hist}

            if "frequency" in col:
                freq_pic = plot_freq({col_name: col["frequency"]},
                                     output="base64")
            else:
                freq_pic = None

            html = html + template.render(
                data=col, freq_pic=freq_pic, **hist_pic)

        html = html + df.table_html(10)

        # Display HTML
        print_html(html)

        # Send to queue
        if self.queue_url is not None:
            self.to_queue(output)

        # JSON
        # Save in case we want to output to a json file
        self.json = output

        # Save file in json format
        write_json(output, self.path)

        # Save in case we want to output to a html file
        self.html = html