def plot_boxplot(column_data=None, output=None, path=None):
    """
    Box plot
    :param column_data: column data in json format. A dict whose keys are column names and whose values
    are the box statistics handed to ``axes.bxp``
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image and
    plot output the image to the notebook
    :param path: destination used when output is "image"; it is also embedded in the printed <img> tag
    :return: the value of output_base64(fig) for the first column when output is "base64", otherwise None
    """
    for col_name, stats in column_data.items():
        fig, axes = plt.subplots(1, 1)

        bp = axes.bxp(stats, patch_artist=True)
        axes.set_title(col_name)

        # NOTE(review): this opens a second, empty figure instead of resizing `fig`. It is kept so the
        # rendered output does not change; confirm whether figsize was meant for plt.subplots() above.
        plt.figure(figsize=(12, 5))

        # Other available groups: 'fliers', 'means', 'medians', 'caps'
        for element in ['boxes', 'whiskers']:
            plt.setp(bp[element], color='#1f77b4')

        for patch in bp['boxes']:
            patch.set(facecolor='white')

        # Strings must be compared by value: `is` checks object identity, which only matches when the
        # interpreter happens to share the literal.
        if output == "base64":
            # Returns on the first column, any remaining columns are not plotted
            return output_base64(fig)
        elif output == "image":
            output_image(fig, path)
            print_html("<img src='" + path + "'>")
def __init__(self, queue_name=None, key=None):
    """
    Resolve the session name and the encryption key, print the Bumblebee links and build the cipher.
    :param queue_name: session name. When None, the value returned by save_config_key() for "QueueName"
    is used, with a fresh uuid4 passed as the value to store
    :param key: Fernet key. When None a new key is generated and passed through save_config_key()
    """
    if queue_name is not None:
        self.queue_name = queue_name
    else:
        # No name given: go through the config file helper, offering a new uuid
        self.queue_name = save_config_key("bumblebee.ini", "DEFAULT", "QueueName", str(uuid.uuid4()))

    if key is not None:
        self.key = key
    else:
        # generate_key() returns bytes, decode them so the key can be stored in the config file
        fresh_key = Fernet.generate_key()
        self.key = save_config_key("bumblebee.ini", "DEFAULT", "Key", fresh_key.decode())

    # Both templates are formatted with the same set of fields
    link_fields = dict(FULL_DOMAIN=FULL_DOMAIN, SESSION=self.queue_name, KEY=self.key)
    keys_link = "<a href ='{FULL_DOMAIN}'> here</a>".format(**link_fields)
    direct_link = "<a target='_blank' href ='{FULL_DOMAIN}/?session={SESSION}&key={KEY}&view=0'>{FULL_DOMAIN}</a>".format(
        **link_fields)

    message = "".join([
        "Open Bumblebee: ",
        direct_link,
        "<div>If you really care about privacy get your keys in bumblebee.ini and put them",
        keys_link,
        "</div>",
    ])
    print_html(message)

    self.token = None
    self.f = Fernet(self.key)
def plot_scatterplot(column_data=None, output=None, path=None):
    """
    Scatter plot
    :param column_data: column data in json format. The code reads column_data["x"] and column_data["y"]
    (each with "data" and "name") and column_data["s"], which is passed as ``s`` to plt.scatter
    :param output: image, base64 or plot
    :param path: destination used when output is "image"; it is also embedded in the printed <img> tag
    :return: the value of output_base64(fig) when output is "base64", otherwise None
    """
    fig = plt.figure(figsize=(12, 5))

    x_axis = column_data["x"]
    y_axis = column_data["y"]
    plt.scatter(x_axis["data"], y_axis["data"], s=column_data["s"], alpha=0.5)
    plt.xlabel(x_axis["name"])
    plt.ylabel(y_axis["name"])

    # Compare by value: `is` tests identity and is not reliable for strings
    if output == "base64":
        return output_base64(fig)
    elif output == "image":
        output_image(fig, path)
        print_html("<img src='" + path + "'>")
    elif output == "plot":
        # Tweak spacing to prevent clipping of tick-labels
        plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)
def plot_correlation(cols_data, output=None, path=None):
    """
    Plot a correlation plot
    :param cols_data: dict with "data" (the matrix values) and "cols" (used both as column labels and
    as index labels of the heatmap)
    :param output: "base64" returns output_base64(fig); "image" saves the figure to path and prints an
    <img> tag. Any other value only draws the heatmap
    :param path: destination used when output is "image"
    :return: the value of output_base64(fig) when output is "base64", otherwise None
    """
    import pandas as pd

    labels = cols_data["cols"]
    df = pd.DataFrame(data=cols_data["data"], columns=labels, index=labels)

    # All-False mask, so no cell is hidden. The builtin bool replaces the np.bool alias, which was
    # removed from NumPy.
    mask = np.zeros_like(cols_data["data"], dtype=bool)
    sns_plot = sns.heatmap(df, mask=mask, cmap=sns.diverging_palette(220, 10, as_cmap=True), annot=True)

    # Compare by value: `is` tests identity and is not reliable for strings
    if output == "base64":
        return output_base64(sns_plot.get_figure())
    elif output == "image":
        # Save image
        fig = sns_plot.get_figure()
        fig.savefig(path)
        print_html("<img src='" + path + "'>")
def table(self, limit=None, columns=None, title=None):
    """
    Show the dataframe: as an HTML table when __IPYTHON__ is truthy and DataFrame.output is "html",
    through self.show() otherwise.
    :param self:
    :param limit: forwarded to table_html
    :param columns: forwarded to table_html
    :param title: forwarded to table_html
    :return:
    """
    try:
        # `==` instead of `is`: the setting must be compared by value, not by object identity
        if __IPYTHON__ and DataFrame.output == "html":
            result = self.table_html(title=title, limit=limit, columns=columns)
            print_html(result)
        else:
            self.show()
    except NameError:
        # Raised when __IPYTHON__ is not defined; fall back to the plain output
        self.show()
def plot_hist(column_data=None, output=None, sub_title="", path=None):
    """
    Plot a histogram
    obj = {"col_name":[{'lower': -87.36666870117188, 'upper': -70.51333465576172, 'count': 0},
    {'lower': -70.51333465576172, 'upper': -53.66000061035157, 'count': 22094},
    {'lower': -53.66000061035157, 'upper': -36.80666656494141, 'count': 2},
    ...
    ]}
    :param column_data: column data in json format
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image and
    plot output the image to the notebook
    :param sub_title: plot subtitle
    :param path: destination used when output is "image"; it is also embedded in the printed <img> tag
    :return: plot, image or base64. With "base64" the function returns after the first column
    """
    for col_name, data in column_data.items():
        # Bin edges: every lower bound plus the upper bound of the last bucket
        bins = [d["lower"] for d in data]
        bins.append(data[-1]["upper"])

        # Transform hist Optimus format to matplot lib format
        hist = [d["count"] for d in data]

        array_bins = array(bins)
        center = (array_bins[:-1] + array_bins[1:]) / 2
        # Every bar is drawn with 90% of the width of the first bin
        width = 0.9 * (array_bins[1] - array_bins[0])

        hist = one_list_to_val(hist)

        # Plot
        fig = plt.figure(figsize=(12, 5))
        plt.bar(center, hist, width=width)
        plt.title("Histogram '" + col_name + "' " + sub_title)

        # Compare by value: `is` tests identity and is not reliable for strings
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            # Save image
            output_image(plt, path)
            print_html("<img src='" + path + "'>")
        elif output == "plot":
            # Print in jupyter notebook
            plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)
def to_image(self, output_path):
    """
    Render the stored profiler HTML (self.html) to an image file and print an <img> tag for it
    :param self:
    :param output_path: path where the image will be saved
    :return:
    """
    stylesheet = absolute_path("/css/styles.css")
    imgkit.from_string(self.html, output_path, css=stylesheet)

    img_tag = "<img src='" + output_path + "'>"
    print_html(img_tag)
def __init__(self, queue_name=None, username="******", password="******"):
    """
    Resolve the session name and encryption key, print the connection hints and prepare the MQTT client.
    :param queue_name: session name. When None, the value returned by save_config_key() for "QueueName"
    is used, with a fresh uuid4 passed as the value to store
    :param username: user passed to the MQTT client
    :param password: password passed to the MQTT client
    """
    # If queue_name was not given try to load from file if not generate one
    if queue_name is None:
        self.queue_name = save_config_key("bumblebee.ini", "DEFAULT", "QueueName", str(uuid.uuid4()))
    else:
        self.queue_name = queue_name

    # The key has to be resolved whether or not a queue name was given: self.key is read a few lines
    # below and used to be undefined (AttributeError) whenever queue_name was passed in.
    # key is generated as bytes, decoded so it can be saved in the config file.
    # NOTE(review): presumably save_config_key returns an already stored value when one exists - confirm.
    key = Fernet.generate_key()
    self.key = save_config_key("bumblebee.ini", "DEFAULT", "Key", key.decode())

    keys_link = "<a href ='{FULL_DOMAIN}'> here</a>. ".format(FULL_DOMAIN=FULL_DOMAIN,
                                                              SESSION=self.queue_name, KEY=self.key)
    direct_link = "<a target='_blank' href ='{FULL_DOMAIN}/?session={SESSION}&key={KEY}&view=0'>call bumblebee</a>".format(
        FULL_DOMAIN=FULL_DOMAIN, SESSION=self.queue_name, KEY=self.key)

    print_html(
        "Your connection keys are in bumblebee.ini. If you really care about privacy get your keys and put them"
        + keys_link +
        "If you are testing just " + direct_link
    )

    # Queue config
    client = mqtt.Client("MQTT")
    client.username_pw_set(username=username, password=password)

    # Callbacks
    client.connected_flag = False

    def on_connect(client, userdata, flags, rc):
        # rc == 0 is treated as a successful connection
        if rc == 0:
            client.connected_flag = True  # set flag
        else:
            print("Bad connection Returned code=", rc)

    def on_disconnect(client, userdata, rc):
        client.connected_flag = False

    def on_publish(client, userdata, result):
        print("Data sent \n")

    client.on_publish = on_publish
    client.on_disconnect = on_disconnect
    client.on_connect = on_connect

    self.queue = client
    self.token = None
    self.f = Fernet(self.key)
def table_image(self, path, limit=10):
    """
    Render the HTML table of the dataframe to an image file and print an <img> tag for it
    :param self:
    :param path: path where the image is written
    :param limit: forwarded to table_html
    :return:
    """
    stylesheet = absolute_path("/css/styles.css")
    table_markup = self.table_html(limit=limit, full=True)
    imgkit.from_string(table_markup, path, css=stylesheet)

    img_tag = "<img src='" + path + "'>"
    print_html(img_tag)
def table(self, limit=None, columns=None, title=None, truncate=True):
    """
    Show the dataframe: as an HTML table when isnotebook() is truthy and DataFrame.output == "html",
    through self.show() otherwise.
    :param self:
    :param limit: forwarded to table_html
    :param columns: forwarded to table_html
    :param title: forwarded to table_html
    :param truncate: forwarded to table_html
    :return:
    """
    try:
        render_html = isnotebook() and DataFrame.output == "html"
        if not render_html:
            self.show()
        else:
            markup = self.table_html(title=title, limit=limit, columns=columns, truncate=truncate)
            print_html(markup)
    except NameError:
        # Any NameError raised above ends in the plain output
        self.show()
def _load_css():
    """
    Try to load the css for templates. The stylesheet is read from absolute_path("/css/styles.css")
    and printed inside a <style> tag, only when __IPYTHON__ is truthy.
    :return:
    """
    try:
        if __IPYTHON__:
            url = absolute_path("/css/styles.css")
            # Context manager so the file handle is closed right after reading
            with open(url, "r", encoding="utf8") as css_file:
                styles = css_file.read()
            s = '<style>%s</style>' % styles
            print_html(s)
    except NameError:
        # __IPYTHON__ is not defined (presumably not running under IPython): nothing to load
        pass
def display(self, limit=10, columns=None, title=None, truncate=True, plain_text=False):
    """
    Print the dataframe, as HTML when is_notebook() is truthy and plain_text is falsy, as ascii otherwise
    :param self:
    :param limit: forwarded to table() or ascii()
    :param columns: forwarded to table() or ascii()
    :param title: forwarded to table()
    :param truncate: forwarded to table()
    :param plain_text: force the ascii output
    :return:
    """
    # TODO: limit, columns, title, truncate
    wants_html = is_notebook() and not plain_text
    if not wants_html:
        print(self.ascii(limit, columns))
        return

    print_html(self.table(limit, columns, title, truncate))
def plot_frequency(column_data=None, output=None, path=None):
    """
    Frequency plot
    :param column_data: column data in json format. A dict whose keys are column names and whose values
    are lists of items with a "value" and a "count"
    :param output: image, base64 or plot. Image output a file, base64 output a base64 encoded image and
    plot output the image to the notebook
    :param path: destination used when output is "image"; it is also embedded in the printed <img> tag
    :return: the value of output_base64(fig) for the first column when output is "base64", otherwise None
    """
    for col_name, data in column_data.items():
        # Transform Optimus' format to matplotlib's format
        labels = [ellipsis(d["value"]) for d in data]
        heights = [d["count"] for d in data]

        # Plot
        fig = plt.figure(figsize=(12, 5))

        # Bars are placed on integer positions so string labels can be used on x
        positions = range(len(labels))
        plt.bar(positions, heights)
        plt.xticks(positions, labels)
        plt.title("Frequency '" + col_name + "'")

        plt.xticks(rotation=45, ha="right")
        # Tweak spacing to prevent clipping of tick-labels
        plt.subplots_adjust(left=0.05, right=0.99, top=0.9, bottom=0.3)

        # Compare by value: `is` tests identity and is not reliable for strings
        if output == "base64":
            return output_base64(fig)
        elif output == "image":
            output_image(plt, path)
            print_html("<img src='" + path + "'>")
        # output == "plot" needs nothing else: the spacing was already adjusted above
def run(self, df, columns="*", buckets=MAX_BUCKETS, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
        mismatch=None):
    """
    Return dataframe statistical information in HTML Format
    :param df: Dataframe to be analyzed
    :param columns: Columns to be analyzed
    :param buckets: Number of buckets calculated to print the histogram
    :param infer: infer data type
    :param relative_error: Relative Error for quantile discretizer calculation
    :param approx_count: Use approx_count_distinct or countDistinct
    :param mismatch: forwarded to self.dataset()
    :return: self, with the rendered report in self.html and the profiling dict in self.json
    """
    columns = parse_columns(df, columns)
    columns, output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict",
                                   mismatch=mismatch)

    # Load jinja
    templates_dir = absolute_path("/profiler/templates/out")
    template_env = jinja2.Environment(loader=jinja2.FileSystemLoader(searchpath=templates_dir), autoescape=True)

    # The report is collected piece by piece: the profiler info header first, then one piece per column
    fragments = [template_env.get_template("general_info.html").render(data=output)]
    column_template = template_env.get_template("one_column.html")

    date_parts = ("years", "months", "weekdays", "hours", "minutes")
    single_hist_dtypes = ("int", "string", "decimal")

    for col_name in columns:
        col = output["columns"][col_name]

        hist_pic = None
        if "hist" in col["stats"]:
            hist_dict = col["stats"]["hist"]
            dtype = col["column_dtype"]
            if dtype == "date":
                # One histogram per date component, stored under "hist_<component>"
                hist_pic = {
                    "hist_" + part: plot_hist({col_name: hist_dict[part]}, "base64", part)
                    for part in date_parts
                }
            elif dtype in single_hist_dtypes:
                hist_pic = {"hist_numeric_string": plot_hist({col_name: hist_dict}, output="base64")}

        freq_pic = plot_frequency({col_name: col["frequency"]}, output="base64") if "frequency" in col else None

        fragments.append(column_template.render(data=col, freq_pic=freq_pic, hist_pic=hist_pic))

    # Save in case we want to output to a html file
    self.html = "".join(fragments)

    # Display HTML
    print_html(self.html)

    # Save in case we want to output to a json file
    self.json = output

    return self
def run(self, df, columns, buckets=40, infer=False, relative_error=1, approx_count=True):
    """
    Return dataframe statistical information in HTML Format
    :param df: Dataframe to be analyzed
    :param columns: Columns to be analyzed
    :param buckets: Number of buckets calculated to print the histogram
    :param infer: infer data type
    :param relative_error: Relative Error for quantile discretizer calculation
    :param approx_count: Use approx_count_distinct or countDistinct
    :return:
    """
    columns = parse_columns(df, columns)
    output = Profiler.to_json(df, columns, buckets, infer, relative_error, approx_count)

    # Load jinja, the templates live in the "templates" folder next to this file
    path = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader(searchpath=os.path.join(path, "templates"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)

    # Render template
    # Create the profiler info header
    general_template = template_env.get_template("general_info.html")
    html = general_template.render(data=output)

    template = template_env.get_template("one_column.html")

    # Create every column stats
    for col_name in columns:
        # Must be a mapping: it is unpacked with ** into template.render() below. It used to start as
        # None, which raised TypeError for every column without "hist" data.
        hist_pic = {}
        col = output["columns"][col_name]

        if "hist" in col:
            if col["column_dtype"] == "date":
                hist_year = plot_hist({col_name: col["hist"]["years"]}, "base64", "years")
                hist_month = plot_hist({col_name: col["hist"]["months"]}, "base64", "months")
                hist_weekday = plot_hist({col_name: col["hist"]["weekdays"]}, "base64", "weekdays")
                hist_hour = plot_hist({col_name: col["hist"]["hours"]}, "base64", "hours")
                hist_minute = plot_hist({col_name: col["hist"]["minutes"]}, "base64", "minutes")
                hist_pic = {
                    "hist_years": hist_year,
                    "hist_months": hist_month,
                    "hist_weekdays": hist_weekday,
                    "hist_hours": hist_hour,
                    "hist_minutes": hist_minute,
                }
            else:
                hist = plot_hist({col_name: col["hist"]}, output="base64")
                hist_pic = {"hist_pic": hist}

        if "frequency" in col:
            freq_pic = plot_frequency({col_name: col["frequency"]}, output="base64")
        else:
            freq_pic = None

        html = html + template.render(data=col, freq_pic=freq_pic, **hist_pic)

    html = html + df.table_html(10)

    # Display HTML
    print_html(html)

    # send to queue
    if self.queue_url is not None:
        self.to_queue(output)

    # JSON
    # Save in case we want to output to a json file
    self.json = output

    # Save file in json format
    write_json(output, self.path)

    # Save in case we want to output to a html file
    self.html = html