Esempio n. 1
0
def plot_boxplot(column_data=None, output=None, path=None):
    """
    Draw a box plot per column from precomputed box statistics.

    :param column_data: dict mapping a column name to the statistics that are handed to ``Axes.bxp``
    :param output: "base64" returns the result of ``output_base64`` for the first column processed;
    "image" saves the figure through ``output_image`` and prints an <img> tag pointing at ``path``;
    any other value only draws the plot
    :param path: destination of the image, only read when output is "image"
    :return: the value of ``output_base64(fig)`` when output is "base64", otherwise None
    """

    for col_name, stats in column_data.items():
        # Give the size to the figure that holds the box plot. Calling plt.figure() afterwards
        # would open a second, empty figure and leave this one at the default size.
        fig, axes = plt.subplots(1, 1, figsize=(12, 5))

        bp = axes.bxp(stats, patch_artist=True)
        axes.set_title(col_name)

        # Only boxes and whiskers are recolored; 'fliers', 'means', 'medians' and 'caps' are untouched
        for element in ('boxes', 'whiskers'):
            plt.setp(bp[element], color='#1f77b4')

        for patch in bp['boxes']:
            patch.set(facecolor='white')

        # Compare by value: identity checks against string literals are not reliable
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            output_image(fig, path)
            print_html("<img src='" + path + "'>")
Esempio n. 2
0
    def __init__(self, queue_name=None, key=None):
        """
        Set up the session name and the Fernet key, then print the Bumblebee links.

        :param queue_name: session name; when None the value comes from ``save_config_key`` with a
        freshly generated uuid4 as the candidate
        :param key: Fernet key; when None a new key is generated and passed through ``save_config_key``
        """
        if queue_name is not None:
            self.queue_name = queue_name
        else:
            candidate_name = str(uuid.uuid4())
            self.queue_name = save_config_key("bumblebee.ini", "DEFAULT",
                                              "QueueName", candidate_name)

        if key is not None:
            self.key = key
        else:
            # generate_key() gives bytes; decode so the value can be written to the config file
            fresh_key = Fernet.generate_key()
            self.key = save_config_key("bumblebee.ini", "DEFAULT", "Key",
                                       fresh_key.decode())

        link_values = {
            "FULL_DOMAIN": FULL_DOMAIN,
            "SESSION": self.queue_name,
            "KEY": self.key,
        }

        keys_link = "<a href ='{FULL_DOMAIN}'> here</a>".format(**link_values)
        direct_link = "<a target='_blank' href ='{FULL_DOMAIN}/?session={SESSION}&key={KEY}&view=0'>{FULL_DOMAIN}</a>".format(
            **link_values)

        message = ("Open Bumblebee: " + direct_link +
                   "<div>If you really care about privacy get your keys in bumblebee.ini and put them"
                   + keys_link + "</div>")
        print_html(message)

        self.token = None
        self.f = Fernet(self.key)
Esempio n. 3
0
def plot_scatterplot(column_data=None, output=None, path=None):
    """
    Scatter plot
    :param column_data: dict with "x" and "y" entries, each holding "data" (the points) and "name"
    (the axis label), plus an "s" entry forwarded to ``plt.scatter`` as the marker size
    :param output: "base64" returns the result of ``output_base64``; "image" saves the figure through
    ``output_image`` and prints an <img> tag pointing at ``path``; "plot" only adjusts the margins
    :param path: destination of the image, only read when output is "image"
    :return: the value of ``output_base64(fig)`` when output is "base64", otherwise None
    """

    fig = plt.figure(figsize=(12, 5))
    plt.scatter(column_data["x"]["data"],
                column_data["y"]["data"],
                s=column_data["s"],
                alpha=0.5)
    plt.xlabel(column_data["x"]["name"])
    plt.ylabel(column_data["y"]["name"])

    # Compare by value: identity checks against string literals are not reliable
    if output == "base64":
        return output_base64(fig)
    elif output == "image":
        output_image(fig, path)
        print_html("<img src='" + path + "'>")
    elif output == "plot":
        # Tweak spacing to prevent clipping of tick-labels
        plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)
Esempio n. 4
0
def plot_correlation(cols_data, output=None, path=None):
    """
    Plot a correlation matrix as an annotated heatmap.
    :param cols_data: dict with "data" (the matrix values) and "cols" (names used both as row index
    and as column labels)
    :param output: "base64" returns the result of ``output_base64``; "image" saves the figure to
    ``path`` and prints an <img> tag pointing at it
    :param path: destination of the image, only read when output is "image"
    :return: the value of ``output_base64(fig)`` when output is "base64", otherwise None
    """
    import pandas as pd
    df = pd.DataFrame(data=cols_data["data"],
                      columns=cols_data["cols"],
                      index=cols_data["cols"])

    # An all-False mask hides no cell. The builtin bool replaces the np.bool alias,
    # which no longer exists in current NumPy releases.
    sns_plot = sns.heatmap(df,
                           mask=np.zeros_like(cols_data["data"],
                                              dtype=bool),
                           cmap=sns.diverging_palette(220, 10, as_cmap=True),
                           annot=True)

    # Compare by value: identity checks against string literals are not reliable
    if output == "base64":
        fig = sns_plot.get_figure()
        return output_base64(fig)
    elif output == "image":
        # Save image
        fig = sns_plot.get_figure()
        fig.savefig(path)
        print_html("<img src='" + path + "'>")
Esempio n. 5
0
def table(self, limit=None, columns=None, title=None):
    """
    Show the dataframe as an HTML table inside IPython, or fall back to ``self.show()``.

    :param limit: forwarded to ``self.table_html``
    :param columns: forwarded to ``self.table_html``
    :param title: forwarded to ``self.table_html``
    :return: None
    """
    try:
        # __IPYTHON__ is not defined outside IPython; the NameError raised here selects the fallback.
        # The output mode is compared by value, not by identity.
        if __IPYTHON__ and DataFrame.output == "html":
            result = self.table_html(title=title, limit=limit, columns=columns)
            print_html(result)
        else:
            self.show()
    except NameError:

        self.show()
Esempio n. 6
0
def plot_hist(column_data=None, output=None, sub_title="", path=None):
    """
    Plot a histogram
    obj = {"col_name":[{'lower': -87.36666870117188, 'upper': -70.51333465576172, 'count': 0},
    {'lower': -70.51333465576172, 'upper': -53.66000061035157, 'count': 22094},
    {'lower': -53.66000061035157, 'upper': -36.80666656494141, 'count': 2},
    ...
    ]}
    :param column_data: column data in json format; every bucket is read through its 'lower', 'upper'
    and 'count' keys
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image and plot output the
    image to the notebook
    :param sub_title: plot subtitle
    :param path: destination of the image, only read when output is "image"
    :return: the value of ``output_base64(fig)`` for the first column when output is "base64",
    otherwise None
    """

    for col_name, data in column_data.items():
        # Bin edges: the lower bound of every bucket plus the upper bound of the last one
        bins = [d['lower'] for d in data]
        bins.append(data[-1]["upper"])

        # Transform hist Optimus format to matplot lib format
        hist = [d["count"] for d in data]

        array_bins = array(bins)
        center = (array_bins[:-1] + array_bins[1:]) / 2
        # Bar width is derived from the first bucket only
        width = 0.9 * (array_bins[1] - array_bins[0])

        # NOTE(review): one_list_to_val is defined elsewhere; presumably it unwraps a
        # single-element list - confirm against the helper
        hist = one_list_to_val(hist)

        # Plot
        fig = plt.figure(figsize=(12, 5))
        plt.bar(center, hist, width=width)
        plt.title("Histogram '" + col_name + "' " + sub_title)

        # Compare by value: identity checks against string literals are not reliable
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            # Save image
            output_image(plt, path)
            print_html("<img src='" + path + "'>")
        elif output == "plot":
            plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)
Esempio n. 7
0
    def to_image(self, output_path):
        """
        Render ``self.html`` to an image file with imgkit and print an <img> tag for it.

        :param self:
        :param output_path: path where the image will be saved
        :return: None
        """
        stylesheet = absolute_path("/css/styles.css")
        imgkit.from_string(self.html, output_path, css=stylesheet)

        img_tag = "<img src='" + output_path + "'>"
        print_html(img_tag)
Esempio n. 8
0
    def __init__(self, queue_name=None, username="******", password="******", key=None):
        """
        Set up the session name, the Fernet key and the MQTT client, then print the Bumblebee links.

        :param queue_name: session name; when None the value comes from ``save_config_key`` with a
        freshly generated uuid4 as the candidate
        :param username: user name set on the MQTT client
        :param password: password set on the MQTT client
        :param key: Fernet key; when None a new key is generated and passed through ``save_config_key``.
        The key is resolved independently of ``queue_name`` so that passing a queue name no longer
        leaves ``self.key`` undefined.
        """

        # If queue_name was not given try lo load from file if not generate one
        if queue_name is None:
            self.queue_name = save_config_key("bumblebee.ini", "DEFAULT", "QueueName", str(uuid.uuid4()))
        else:
            self.queue_name = queue_name

        if key is None:
            # key is generated as byte convert to base64 so we can saved it in the config file
            key = Fernet.generate_key()
            self.key = save_config_key("bumblebee.ini", "DEFAULT", "Key", key.decode())
        else:
            self.key = key

        keys_link = "<a href ='{FULL_DOMAIN}'> here</a>. ".format(FULL_DOMAIN=FULL_DOMAIN,
                                                                  SESSION=self.queue_name, KEY=self.key)

        direct_link = "<a target='_blank' href ='{FULL_DOMAIN}/?session={SESSION}&key={KEY}&view=0'>call bumblebee</a>".format(
            FULL_DOMAIN=FULL_DOMAIN, SESSION=self.queue_name, KEY=self.key)

        print_html(
            "Your connection keys are in bumblebee.ini. If you really care about privacy get your keys and put them" + keys_link +
            "If you are testing just " + direct_link

        )

        # Queue config
        client = mqtt.Client("MQTT")
        client.username_pw_set(username=username, password=password)

        # Callbacks keep client.connected_flag in sync with the connection state
        client.connected_flag = False

        def on_connect(client, userdata, flags, rc):
            # rc == 0 is treated as a successful connection
            if rc == 0:
                client.connected_flag = True
            else:
                print("Bad connection Returned code=", rc)

        def on_disconnect(client, userdata, rc):
            client.connected_flag = False

        def on_publish(client, userdata, result):
            print("Data sent \n")

        client.on_publish = on_publish
        client.on_disconnect = on_disconnect
        client.on_connect = on_connect

        self.queue = client
        self.token = None

        self.f = Fernet(self.key)
Esempio n. 9
0
def table_image(self, path, limit=10):
    """
    Render the HTML table of the dataframe to an image file and print an <img> tag for it.

    :param self:
    :param path: path where the image will be saved
    :param limit: forwarded to ``self.table_html``
    :return: None
    """
    stylesheet = absolute_path("/css/styles.css")
    table_markup = self.table_html(limit=limit, full=True)

    imgkit.from_string(table_markup, path, css=stylesheet)

    img_tag = "<img src='" + path + "'>"
    print_html(img_tag)
Esempio n. 10
0
def table(self, limit=None, columns=None, title=None, truncate=True):
    """
    Print the dataframe as an HTML table when running in a notebook with the "html" output mode;
    in every other case, or when a NameError is raised along the way, call ``self.show()``.

    :param limit: forwarded to ``self.table_html``
    :param columns: forwarded to ``self.table_html``
    :param title: forwarded to ``self.table_html``
    :param truncate: forwarded to ``self.table_html``
    :return: None
    """
    try:
        wants_html = isnotebook() and DataFrame.output == "html"
        if not wants_html:
            self.show()
        else:
            markup = self.table_html(title=title,
                                     limit=limit,
                                     columns=columns,
                                     truncate=truncate)
            print_html(markup)
    except NameError:
        self.show()
Esempio n. 11
0
 def _load_css():
     """
     Try to load the css for templates
     :return:
     """
     try:
         if __IPYTHON__:
             url = absolute_path("/css/styles.css")
             styles = open(url, "r", encoding="utf8").read()
             s = '<style>%s</style>' % styles
             print_html(s)
     except NameError:
         pass
Esempio n. 12
0
    def display(self,
                limit=10,
                columns=None,
                title=None,
                truncate=True,
                plain_text=False):
        """
        Print the dataframe: as HTML when running in a notebook and ``plain_text`` is falsy,
        otherwise as the text produced by ``self.ascii``.

        :param limit: forwarded to ``self.table`` / ``self.ascii``
        :param columns: forwarded to ``self.table`` / ``self.ascii``
        :param title: forwarded to ``self.table``
        :param truncate: forwarded to ``self.table``
        :param plain_text: force the text rendering even inside a notebook
        :return: None
        """
        # TODO: limit, columns, title, truncate
        if not is_notebook() or plain_text:
            print(self.ascii(limit, columns))
            return

        print_html(self.table(limit, columns, title, truncate))
Esempio n. 13
0
def plot_frequency(column_data=None, output=None, path=None):
    """
    Frequency plot
    :param column_data: column data in json format; every entry is read through its "value" and
    "count" keys
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image and plot output the
    image to the notebook
    :param path: destination of the image, only read when output is "image"
    :return: the value of ``output_base64(fig)`` for the first column when output is "base64",
    otherwise None
    """

    for col_name, data in column_data.items():

        # Transform Optimus' format to matplotlib's format
        x = [ellipsis(d["value"]) for d in data]
        h = [d["count"] for d in data]

        # Plot
        fig = plt.figure(figsize=(12, 5))

        # Bars are placed on integer positions so string labels can be used as ticks
        x_i = range(len(x))
        plt.bar(x_i, h)
        plt.xticks(x_i, x)

        plt.title("Frequency '" + col_name + "'")

        plt.xticks(rotation=45, ha="right")
        # Tweak spacing to prevent clipping of tick-labels. This runs for every output mode,
        # so the "plot" mode needs no extra adjustment.
        plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)

        # Compare by value: identity checks against string literals are not reliable
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            output_image(plt, path)
            print_html("<img src='" + path + "'>")
Esempio n. 14
0
    def run(self,
            df,
            columns="*",
            buckets=MAX_BUCKETS,
            infer=False,
            relative_error=RELATIVE_ERROR,
            approx_count=True,
            mismatch=None):
        """
        Profile a dataframe, render the result as HTML, display it and keep it on the instance.

        The HTML ends up in ``self.html`` and the raw profiling output in ``self.json``.
        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :param approx_count: Use approx_count_distinct or countDistinct
        :param mismatch: forwarded to ``self.dataset``
        :return: self
        """
        columns = parse_columns(df, columns)
        columns, output = self.dataset(df,
                                       columns,
                                       buckets,
                                       infer,
                                       relative_error,
                                       approx_count,
                                       format="dict",
                                       mismatch=mismatch)

        # Jinja environment rooted at the profiler output templates
        loader = jinja2.FileSystemLoader(
            searchpath=absolute_path("/profiler/templates/out"))
        env = jinja2.Environment(loader=loader, autoescape=True)

        # The general info header comes first, followed by one fragment per column
        header_template = env.get_template("general_info.html")
        fragments = [header_template.render(data=output)]

        column_template = env.get_template("one_column.html")

        date_parts = ("years", "months", "weekdays", "hours", "minutes")
        single_hist_dtypes = ("int", "string", "decimal")

        for col_name in columns:
            col = output["columns"][col_name]
            hist_pic = None
            freq_pic = None

            if "hist" in col["stats"]:
                hist_dict = col["stats"]["hist"]
                dtype = col["column_dtype"]

                if dtype == "date":
                    # One histogram per date component, keyed as "hist_<component>"
                    hist_pic = {
                        "hist_" + part: plot_hist({col_name: hist_dict[part]},
                                                  "base64", part)
                        for part in date_parts
                    }
                elif dtype in single_hist_dtypes:
                    hist_pic = {
                        "hist_numeric_string": plot_hist({col_name: hist_dict},
                                                         output="base64")
                    }

            if "frequency" in col:
                freq_pic = plot_frequency({col_name: col["frequency"]},
                                          output="base64")

            fragments.append(
                column_template.render(data=col,
                                       freq_pic=freq_pic,
                                       hist_pic=hist_pic))

        # Save in case we want to output to a html file
        self.html = "".join(fragments)

        # Display HTML
        print_html(self.html)

        # Save in case we want to output to a json file
        self.json = output

        return self
Esempio n. 15
0
    def run(self,
            df,
            columns,
            buckets=40,
            infer=False,
            relative_error=1,
            approx_count=True):
        """
        Profile a dataframe, render the result as HTML and display it.

        The profiling output is sent to the queue when ``self.queue_url`` is set, written to
        ``self.path`` through ``write_json`` and kept in ``self.json``; the HTML is kept in ``self.html``.
        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets calculated to print the histogram
        :param infer: infer data type
        :param relative_error: Relative Error for quantile discretizer calculation
        :param approx_count: Use approx_count_distinct or countDistinct
        :return: None
        """

        columns = parse_columns(df, columns)

        output = Profiler.to_json(df, columns, buckets, infer, relative_error,
                                  approx_count)

        # Load jinja from the "templates" folder that sits next to this module
        path = os.path.dirname(os.path.abspath(__file__))
        template_loader = jinja2.FileSystemLoader(
            searchpath=os.path.join(path, "templates"))
        template_env = jinja2.Environment(loader=template_loader,
                                          autoescape=True)

        # Render template
        # Create the profiler info header
        html = ""
        general_template = template_env.get_template("general_info.html")
        html = html + general_template.render(data=output)

        template = template_env.get_template("one_column.html")

        # Create every column stats
        for col_name in columns:
            # Must be a mapping: it is expanded with ** below even when the column has no histogram
            hist_pic = {}
            col = output["columns"][col_name]
            if "hist" in col:
                if col["column_dtype"] == "date":
                    hist_year = plot_hist({col_name: col["hist"]["years"]},
                                          "base64", "years")
                    hist_month = plot_hist({col_name: col["hist"]["months"]},
                                           "base64", "months")
                    hist_weekday = plot_hist(
                        {col_name: col["hist"]["weekdays"]}, "base64",
                        "weekdays")
                    hist_hour = plot_hist({col_name: col["hist"]["hours"]},
                                          "base64", "hours")
                    hist_minute = plot_hist({col_name: col["hist"]["minutes"]},
                                            "base64", "minutes")
                    hist_pic = {
                        "hist_years": hist_year,
                        "hist_months": hist_month,
                        "hist_weekdays": hist_weekday,
                        "hist_hours": hist_hour,
                        "hist_minutes": hist_minute
                    }
                else:
                    hist = plot_hist({col_name: col["hist"]}, output="base64")
                    hist_pic = {"hist_pic": hist}

            if "frequency" in col:
                freq_pic = plot_frequency({col_name: col["frequency"]},
                                          output="base64")
            else:
                freq_pic = None

            html = html + template.render(
                data=col, freq_pic=freq_pic, **hist_pic)

        html = html + df.table_html(10)

        # Display HTML
        print_html(html)

        # send to queue
        if self.queue_url is not None:
            self.to_queue(output)

        # JSON
        # Save in case we want to output to a json file
        self.json = output

        # Save file in json format
        write_json(output, self.path)

        # Save in case we want to output to a html file
        self.html = html