def to_file(self, path=None, output="html"):
    """
    Save profiler data to a file in the specified format (html, json)
    :param path: filename in which the data will be saved. A None value is reported through
    RaiseIt.value_error
    :param output: "html" or "json". Any other value is reported through RaiseIt.type_error
    :return:
    """

    if path is None:
        RaiseIt.value_error(path, "str")

    not_ready_message = "You must first run the profiler, then it can be exported. Try op.profiler.run(df, '*')"

    # Compare by value (==). An identity check (is) against a string literal only works
    # by accident of string interning and fails for strings built at runtime.
    if output == "html":
        if self.html is None:
            RaiseIt.not_ready_error(not_ready_message)

        # Some extra html tags are needed to display the report correctly in the browser.
        write_html(HEADER + self.html + FOOTER, path)
    elif output == "json":
        if self.json is None:
            RaiseIt.not_ready_error(not_ready_message)

        write_json(self.json, path)
    else:
        RaiseIt.type_error(output, ["html", "json"])
def run(self, df, columns, buckets=20):
    """
    Profile the given columns, display the report as HTML and write the
    profiling data with write_json to self.path
    :param df: Dataframe to be profiled
    :param columns: Columns to be profiled (resolved with parse_columns)
    :param buckets: Forwarded to Profiler.to_json
    :return:
    """

    columns = parse_columns(df, columns)
    output = Profiler.to_json(df, columns, buckets)

    # Jinja environment reading from the templates folder next to this module
    module_dir = os.path.dirname(os.path.abspath(__file__))
    loader = jinja2.FileSystemLoader(searchpath=module_dir + "//templates")
    env = jinja2.Environment(loader=loader, autoescape=True)

    # The report starts with the general profiler info
    html = ""
    html += env.get_template("general_info.html").render(data=output)

    # One section per column, each with its optional plots
    column_template = env.get_template("one_column.html")
    for col_name in columns:
        col_stats = output["columns"][col_name]

        hist_pic = None
        if "hist" in col_stats:
            hist_pic = plot_hist({col_name: col_stats["hist"]}, output="base64")

        freq_pic = None
        if "frequency" in col_stats:
            freq_pic = plot_freq({col_name: col_stats["frequency"]}, output="base64")

        html += column_template.render(data=col_stats, hist_pic=hist_pic, freq_pic=freq_pic)

    # The report ends with the table produced by df.table_html(10)
    html += df.table_html(10)

    # Display HTML
    display(HTML(html))

    # Save to file
    write_json(output, self.path)
def to_file(self, path=None, output=None):
    """
    Save profiler data to a file in the specified format (html, json)
    :param path: filename in which the data will be saved. A None value is reported through
    RaiseIt.value_error
    :param output: "html" or "json". Any other value, including the default None, is reported
    through RaiseIt.type_error
    :return:
    :raises AssertionError: if the requested data (self.html or self.json) is still None
    """

    if path is None:
        RaiseIt.value_error(path, ["Invalid file path"])

    # Compare by value (==). An identity check (is) against a string literal only works
    # by accident of string interning and fails for strings built at runtime.
    if output == "html":
        # Raised explicitly instead of using an assert statement so the check is not
        # stripped when Python runs with -O. The exception type is kept for callers.
        if self.html is None:
            raise AssertionError("Please run the profiler first")

        # Some extra html tags are needed to display the report correctly in the browser.
        header = (
            '<!doctype html> '
            '<html class="no-js" lang=""> '
            '<head> '
            '<meta charset="utf-8"> '
            '<meta http-equiv="x-ua-compatible" content="ie=edge"> '
            '<title></title> '
            '<meta name="description" content=""> '
            '<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> '
            '<link rel="manifest" href="site.webmanifest"> '
            '<link rel="apple-touch-icon" href="icon.png"> '
            '<!-- Place favicon.ico in the root directory --> '
            '<link rel="stylesheet" href="css/normalize.css"> '
            '<link rel="stylesheet" href="css/main.css"> '
            '</head> '
            '<body>'
        )
        footer = '</body></html>'

        write_html(header + self.html + footer, path)
    elif output == "json":
        if self.json is None:
            raise AssertionError("Please run the profiler first")

        write_json(self.json, path)
    else:
        RaiseIt.type_error(output, ["html", "json"])
def json(df, columns, buckets=20, path=None):
    """
    Gather the profiling data, write it with write_json and return it
    :param df: Dataframe to be processed
    :param columns: column to calculate the histogram
    :param buckets: buckets on the histogram
    :param path: Path where the json is going to be saved. When None, data.json in the
    current working directory is used
    :return: the profiling data that was written
    """

    dataset_info = Profiler.dataset_info(df)
    profile = Profiler.columns(df, columns, buckets)

    # The dataset level info is stored next to the per column data
    profile["summary"] = dataset_info

    target = Path.cwd() / "data.json" if path is None else path
    write_json(profile, path=target)

    return profile
def run(self, df, columns, buckets=40, infer=False, relative_error=1):
    """
    Return dataframe statistical information in HTML Format
    :param df: Dataframe to be analyzed
    :param columns: Columns to be analyzed
    :param buckets: Number of buckets calculated to print the histogram
    :param infer: infer data type
    :param relative_error: Relative Error for quantile discretizer calculation
    :return:
    """

    columns = parse_columns(df, columns)
    output = Profiler.to_json(df, columns, buckets, infer, relative_error)

    # Load jinja
    path = os.path.dirname(os.path.abspath(__file__))
    template_loader = jinja2.FileSystemLoader(searchpath=os.path.join(path, "templates"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)

    # Render template
    # Create the profiler info header
    html = ""
    general_template = template_env.get_template("general_info.html")
    html = html + general_template.render(data=output)

    template = template_env.get_template("one_column.html")

    # Create every column stats
    for col_name in columns:
        # Must always be a dict: it is unpacked with ** in template.render() below, and
        # unpacking None raises TypeError for columns that have no "hist" entry.
        hist_pic = {}
        col = output["columns"][col_name]

        if "hist" in col:
            if col["column_dtype"] == "date":
                # Date columns carry one histogram per date component
                hist_year = plot_hist({col_name: col["hist"]["years"]}, "base64", "years")
                hist_month = plot_hist({col_name: col["hist"]["months"]}, "base64", "months")
                hist_weekday = plot_hist({col_name: col["hist"]["weekdays"]}, "base64", "weekdays")
                hist_hour = plot_hist({col_name: col["hist"]["hours"]}, "base64", "hours")
                hist_minute = plot_hist({col_name: col["hist"]["minutes"]}, "base64", "minutes")
                hist_pic = {
                    "hist_years": hist_year,
                    "hist_months": hist_month,
                    "hist_weekdays": hist_weekday,
                    "hist_hours": hist_hour,
                    "hist_minutes": hist_minute
                }
            else:
                hist = plot_hist({col_name: col["hist"]}, output="base64")
                hist_pic = {"hist_pic": hist}

        if "frequency" in col:
            freq_pic = plot_freq({col_name: col["frequency"]}, output="base64")
        else:
            freq_pic = None

        html = html + template.render(data=col, freq_pic=freq_pic, **hist_pic)

    html = html + df.table_html(10)

    # Display HTML
    print_html(html)

    # Send to queue
    if self.queue_url is not None:
        self.to_queue(output)

    # JSON
    # Save in case we want to output to a json file
    self.json = output

    # Save file in json format
    write_json(output, self.path)

    # Save in case we want to output to a html file
    self.html = html
def run(self, df, columns="*", buckets=MAX_BUCKETS, infer=False, relative_error=RELATIVE_ERROR, approx_count=True,
        mismatch=None):
    """
    Profile a dataframe, print the report as HTML and keep the results in the instance
    :param df: Dataframe to be analyzed
    :param columns: Columns to be analyzed
    :param buckets: Number of buckets calculated to print the histogram
    :param infer: infer data type
    :param relative_error: Relative Error for quantile discretizer calculation
    :param approx_count: Use approx_count_distinct or countDistinct
    :param mismatch: Forwarded as is to self.dataset
    :return: self, with the report stored in self.html and the profiling data in self.json
    """

    columns = parse_columns(df, columns)

    output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict",
                          mismatch=mismatch)

    # Jinja environment
    loader = jinja2.FileSystemLoader(searchpath=absolute_path("/profiler/templates/out"))
    env = jinja2.Environment(loader=loader, autoescape=True)

    # The report starts with the general profiler info
    html = ""
    html += env.get_template("general_info.html").render(data=output)

    column_template = env.get_template("one_column.html")

    # (template variable, key in the hist dict) for every histogram of a date column
    date_histograms = (
        ("hist_years", "years"),
        ("hist_months", "months"),
        ("hist_weekdays", "weekdays"),
        ("hist_hours", "hours"),
        ("hist_minutes", "minutes"),
    )
    single_histogram_dtypes = ("int", "string", "decimal")

    # One section per column
    for col_name in columns:
        col = output["columns"][col_name]
        hist_pic = None
        freq_pic = None

        if "hist" in col["stats"]:
            hist_dict = col["stats"]["hist"]
            dtype = col["column_dtype"]

            if dtype == "date":
                hist_pic = {
                    key: plot_hist({col_name: hist_dict[part]}, "base64", part)
                    for key, part in date_histograms
                }
            elif dtype in single_histogram_dtypes:
                hist_pic = {"hist_numeric_string": plot_hist({col_name: hist_dict}, output="base64")}

        if "frequency" in col:
            freq_pic = plot_frequency({col_name: col["frequency"]}, output="base64")

        html += column_template.render(data=col, freq_pic=freq_pic, hist_pic=hist_pic)

    # Kept in the instance in case we want to output to a html file
    self.html = html + df.table_html(10)

    # Display HTML
    print_html(self.html)

    # Kept in the instance in case we want to output to a json file
    self.json = output

    # Save file in json format
    write_json(output, self.path)

    return self