def to_image(self, output_path):
    """
    Save the profiler result as image
    :param self:
    :param output_path: path where the image will be saved
    :return:
    """
    css = absolute_path("/css/styles.css")
    imgkit.from_string(self.html, output_path, css=css)
    print_html("<img src='" + output_path + "'>")
def table_image(self, path, limit=10):
    """
    Render the table as an image and save it to disk
    :param self:
    :param limit: How many rows will be printed
    :param path: Path where the image will be saved
    :return:
    """
    css = absolute_path("/css/styles.css")
    imgkit.from_string(self.table_html(limit=limit, full=True), path, css=css)
    print_html("<img src='" + path + "'>")
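
# Usage sketch (illustrative, not part of the library): to_image writes the HTML stored on the
# profiler after run() to a PNG via imgkit, while table_image does the same for a dataframe table.
# The variable names and file names below are hypothetical.
#
#   op.profiler.to_image("profile.png")     # requires op.profiler.run(df) to have set self.html
#   df.table_image("table.png", limit=20)   # renders self.table_html(limit=20, full=True)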
def _load_css():
    """
    Try to load the css for templates
    :return:
    """
    try:
        if __IPYTHON__:
            url = absolute_path("/css/styles.css")
            styles = open(url, "r", encoding="utf8").read()
            s = '<style>%s</style>' % styles
            print_html(s)
    except NameError:
        pass
def run(self, df, columns="*", buckets=MAX_BUCKETS, infer=False, relative_error=RELATIVE_ERROR, approx_count=True, mismatch=None): """ Return dataframe statistical information in HTML Format :param df: Dataframe to be analyzed :param columns: Columns to be analyzed :param buckets: Number of buckets calculated to print the histogram :param infer: infer data type :param relative_error: Relative Error for quantile discretizer calculation :param approx_count: Use approx_count_distinct or countDistinct :param mismatch: :return: """ columns = parse_columns(df, columns) columns, output = self.dataset(df, columns, buckets, infer, relative_error, approx_count, format="dict", mismatch=mismatch) # Load jinja template_loader = jinja2.FileSystemLoader( searchpath=absolute_path("/profiler/templates/out")) template_env = jinja2.Environment(loader=template_loader, autoescape=True) # Render template # Create the profiler info header html = "" general_template = template_env.get_template("general_info.html") html = html + general_template.render(data=output) template = template_env.get_template("one_column.html") # Create every column stats for col_name in columns: hist_pic = None freq_pic = None col = output["columns"][col_name] if "hist" in col["stats"]: hist_dict = col["stats"]["hist"] if col["column_dtype"] == "date": hist_year = plot_hist({col_name: hist_dict["years"]}, "base64", "years") hist_month = plot_hist({col_name: hist_dict["months"]}, "base64", "months") hist_weekday = plot_hist({col_name: hist_dict["weekdays"]}, "base64", "weekdays") hist_hour = plot_hist({col_name: hist_dict["hours"]}, "base64", "hours") hist_minute = plot_hist({col_name: hist_dict["minutes"]}, "base64", "minutes") hist_pic = { "hist_years": hist_year, "hist_months": hist_month, "hist_weekdays": hist_weekday, "hist_hours": hist_hour, "hist_minutes": hist_minute } elif col["column_dtype"] == "int" or col[ "column_dtype"] == "string" or col[ "column_dtype"] == "decimal": hist = plot_hist({col_name: hist_dict}, output="base64") hist_pic = {"hist_numeric_string": hist} if "frequency" in col: freq_pic = plot_frequency({col_name: col["frequency"]}, output="base64") html = html + template.render( data=col, freq_pic=freq_pic, hist_pic=hist_pic) # Save in case we want to output to a html file # self.html = html + df.table_html(10) self.html = html # Display HTML print_html(self.html) # JSON # Save in case we want to output to a json file self.json = output return self
def __init__(self, session=None, master="local[*]", app_name="optimus", checkpoint=False, path=None,
             file_system="local", verbose=False, server=False, repositories=None, packages=None,
             jars=None, driver_class_path=None, options=None, additional_options=None, comm=None,
             load_avro=False, cache=True):
    """
    Transform and roll out
    :param master: 'Master', 'local' or ip address to a cluster
    :param app_name: Spark app name
    :param path: path to the checkpoint folder
    :param checkpoint: If True create a checkpoint folder
    :param file_system: 'local' or 'hadoop'
    :param additional_options:
    :param options: Configuration options that are passed to spark-submit.
                    See `the list of possible options
                    <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
                    Note that any options set already through PYSPARK_SUBMIT_ARGS will override these.
    :type options: (dict[str,str])
    :param repositories: List of additional maven repositories for package lookup.
    :type repositories: (list[str])
    :param packages: Spark packages that should be installed.
    :type packages: (list[str])
    :param jars: Full paths to jar files that we want to include to the session.
    :type jars: (list[str])
    """
    self.preserve = False

    Optimus.cache = cache

    if comm is True:
        Comm.instance = Comm()
    else:
        Comm.instance = comm

    if jars is None:
        jars = []
    if driver_class_path is None:
        driver_class_path = []

    if session is None:
        # Creating Spark Session
        # If a Spark session is not passed by argument create one
        self.master = master
        self.app_name = app_name

        if options is None:
            options = {}
        self.options = options

        # Initialize as lists
        self.packages = val_to_list(packages)
        self.repositories = val_to_list(repositories)
        self.jars = val_to_list(jars)
        self.driver_class_path = val_to_list(driver_class_path)
        self.additional_options = additional_options

        self.verbose(verbose)

        # Because avro depends on an external package you can decide if it should be loaded
        if load_avro == "2.4":
            self._add_spark_packages(["org.apache.spark:spark-avro_2.12:2.4.3"])
        elif load_avro == "2.3":
            self._add_spark_packages(["com.databricks:spark-avro_2.11:4.0.0"])

        jdbc_jars = ["/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar",
                     "/jars/RedshiftJDBC42-1.2.16.1027.jar",
                     "/jars/mysql-connector-java-8.0.16.jar",
                     "/jars/ojdbc8.jar",
                     "/jars/postgresql-42.2.5.jar",
                     "/jars/presto-jdbc-0.224.jar",
                     "/jars/spark-cassandra-connector_2.11-2.4.1.jar",
                     "/jars/sqlite-jdbc-3.27.2.1.jar",
                     "/jars/mssql-jdbc-7.4.1.jre8.jar"]

        self._add_jars(absolute_path(jdbc_jars, "uri"))
        self._add_driver_class_path(absolute_path(jdbc_jars, "posix"))

        self._create_session()

        if path is None:
            path = os.getcwd()

        if checkpoint is True:
            self._set_check_point_folder(path, file_system)
    else:
        # If a session is passed by arguments just save the reference
        # logger.print("Spark session")
        Spark.instance = Spark().load(session)

    # Initialize Spark
    logger.print("""
             ____        __  _
            / __ \____  / /_(_)___ ___  __  _______
           / / / / __ \/ __/ / __ `__ \/ / / / ___/
          / /_/ / /_/ / /_/ / / / / / / /_/ (__  )
          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/
              /_/
              """)

    logger.print(STARTING_OPTIMUS)

    # Pickling
    Spark.instance.sc.addPyFile(absolute_path("/infer.py"))

    if server:
        logger.print("Starting Optimus Server...")
        s = Server()
        s.start()
        self.server_instance = s

    logger.print(SUCCESS)

    self.create = Create()
    self.load = Load()
    self.read = self.spark.read

    # Create singleton profiler
    Profiler.instance = Profiler()
    self.profiler = Profiler.instance
    self.ml = ML()

    # Set global output as html
    self.output("html")
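
# Usage sketch (the file path below is hypothetical): the constructor builds or wraps a Spark
# session, wires up the loaders, the profiler singleton and the ML helpers, and switches the
# global output format to HTML.
#
#   op = Optimus(master="local[*]", app_name="optimus", verbose=True)
#   df = op.load.csv("data/example.csv")   # hypothetical file
#
# Passing an existing SparkSession instead reuses it rather than creating a new one:
#
#   op = Optimus(session=spark)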
def table_html(self, limit=10, columns=None, title=None, full=False, truncate=True):
    """
    Return an HTML table with the dataframe cols, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: How many rows will be printed
    :param title: Table title
    :param full: Include html header and footer
    :param truncate: Truncate the row information
    :return:
    """
    columns = parse_columns(self, columns)

    if limit is None:
        limit = 10

    if limit == "all":
        data = collect_as_dict(self.cols.select(columns))
    else:
        data = collect_as_dict(self.cols.select(columns).limit(limit))

    # Load the Jinja template
    template_loader = jinja2.FileSystemLoader(searchpath=absolute_path("/templates/out"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info that is needed
    dtypes = []
    for i, j in zip(self.dtypes, self.schema):
        if i[1].startswith("array<struct"):
            dtype = "array<struct>"
        elif i[1].startswith("struct"):
            dtype = "struct"
        else:
            dtype = i[1]
        dtypes.append((i[0], dtype, j.nullable))

    # Remove unselected columns
    final_columns = []
    for i in dtypes:
        for j in columns:
            if i[0] == j:
                final_columns.append(i)

    total_rows = self.rows.approx_count()

    if limit == "all":
        limit = total_rows
    elif total_rows < limit:
        limit = total_rows

    total_rows = humanize.intword(total_rows)
    total_cols = self.cols.count()
    total_partitions = self.partitions()

    output = template.render(cols=final_columns, data=data, limit=limit, total_rows=total_rows,
                             total_cols=total_cols, partitions=total_partitions, title=title,
                             truncate=truncate)

    if full is True:
        output = HEADER + output + FOOTER

    return output
def table_html(self, limit=10, columns=None, title=None, full=False, truncate=True, count=True):
    """
    Return an HTML table with the spark cols, data types and values
    :param columns: Columns to be printed
    :param limit: How many rows will be printed
    :param title: Table title
    :param full: Include html header and footer
    :param truncate: Truncate the row information
    :param count:
    :return:
    """
    columns = parse_columns(self, columns)

    if limit is None:
        limit = 10

    df = self

    if limit == "all":
        data = df.cols.select(columns).to_dict()
    else:
        data = df.cols.select(columns).rows.limit(limit).to_dict()

    # Load the Jinja template
    template_loader = jinja2.FileSystemLoader(searchpath=absolute_path("/templates/out"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info that is needed
    dtypes = [(k, v) for k, v in df.cols.dtypes().items()]

    # Remove unselected columns
    final_columns = []
    for i in dtypes:
        for j in columns:
            if i[0] == j:
                final_columns.append(i)

    # if count is True:
    # else:
    #     count = None
    total_rows = df.rows.approx_count()

    if limit == "all" or total_rows < limit:
        limit = total_rows

    total_rows = humanize.intword(total_rows)
    total_cols = df.cols.count()
    total_partitions = df.partitions()

    df_type = type(df)
    output = template.render(df_type=df_type, cols=final_columns, data=data, limit=limit,
                             total_rows=total_rows, total_cols=total_cols, partitions=total_partitions,
                             title=title, truncate=truncate)

    if full is True:
        output = HEADER + output + FOOTER

    return output
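
# Usage sketch (column names are illustrative): both table_html variants render the same Jinja
# "table.html" template; full=True wraps the fragment with HEADER/FOOTER so it can be written to a
# standalone file or passed to table_image/imgkit.
#
#   fragment = df.table_html(limit=5, columns=["name", "age"])        # embeddable snippet
#   page = df.table_html(limit="all", full=True, title="My table")    # standalone HTML page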