Example #1
    def to_image(self, output_path):
        """
        Save the profiler result as an image
        :param self:
        :param output_path: path where the image will be saved
        :return:
        """
        css = absolute_path("/css/styles.css")
        imgkit.from_string(self.html, output_path, css=css)

        print_html("<img src='" + output_path + "'>")
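
A minimal usage sketch for to_image, assuming the profiler's run() from Example #4 has already populated self.html and that imgkit can find a wkhtmltoimage binary; the variable and file names are illustrative:

# Hypothetical usage: profile a dataframe, then export the stored HTML as a PNG.
profiler = Profiler.instance
profiler.run(df, columns="*")
profiler.to_image(output_path="profile.png")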
Example #2
def table_image(self, path, limit=10):
    """

    :param self:
    :param limit:
    :param path:
    :return:
    """

    css = absolute_path("/css/styles.css")

    imgkit.from_string(self.table_html(limit=limit, full=True), path, css=css)
    print_html("<img src='" + path + "'>")
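
As in Example #1, a hedged usage sketch; the explicit self parameter suggests table_image is attached to the dataframe, so the call below assumes a monkey-patched dataframe method and an illustrative file name:

# Hypothetical usage: render the first 20 rows of the dataframe as an image.
df.table_image(path="table.png", limit=20)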
Example #3
def _load_css():
    """
    Try to load the css styles for the templates when running under IPython
    :return:
    """
    try:
        if __IPYTHON__:
            path = absolute_path("/css/styles.css")
            # Close the file handle once the styles have been read
            with open(path, "r", encoding="utf8") as f:
                styles = f.read()
            print_html('<style>%s</style>' % styles)
    except NameError:
        pass
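
Every example here calls print_html, whose implementation is not shown. A plausible sketch under the assumption that it wraps IPython's rich display, mirroring the __IPYTHON__/NameError pattern used in _load_css above (this body is an assumption, not the library's confirmed code):

def print_html(html):
    """Hypothetical sketch: render raw HTML in IPython/Jupyter, otherwise print it."""
    try:
        if __IPYTHON__:
            from IPython.display import HTML, display
            display(HTML(html))
    except NameError:
        # Not running inside IPython, fall back to plain text output
        print(html)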
Example #4
    def run(self,
            df,
            columns="*",
            buckets=MAX_BUCKETS,
            infer=False,
            relative_error=RELATIVE_ERROR,
            approx_count=True,
            mismatch=None):
        """
        Return dataframe statistical information in HTML format
        :param df: Dataframe to be analyzed
        :param columns: Columns to be analyzed
        :param buckets: Number of buckets used when calculating the histogram
        :param infer: Infer the column data types
        :param relative_error: Relative error for the quantile discretizer calculation
        :param approx_count: Use approx_count_distinct or countDistinct
        :param mismatch:
        :return:
        """

        columns = parse_columns(df, columns)
        columns, output = self.dataset(df,
                                       columns,
                                       buckets,
                                       infer,
                                       relative_error,
                                       approx_count,
                                       format="dict",
                                       mismatch=mismatch)

        # Load jinja
        template_loader = jinja2.FileSystemLoader(
            searchpath=absolute_path("/profiler/templates/out"))
        template_env = jinja2.Environment(loader=template_loader,
                                          autoescape=True)

        # Render template
        # Create the profiler info header
        html = ""
        general_template = template_env.get_template("general_info.html")
        html = html + general_template.render(data=output)

        template = template_env.get_template("one_column.html")
        # Create every column stats
        for col_name in columns:
            hist_pic = None
            freq_pic = None

            col = output["columns"][col_name]
            if "hist" in col["stats"]:
                hist_dict = col["stats"]["hist"]

                if col["column_dtype"] == "date":
                    hist_year = plot_hist({col_name: hist_dict["years"]},
                                          "base64", "years")
                    hist_month = plot_hist({col_name: hist_dict["months"]},
                                           "base64", "months")
                    hist_weekday = plot_hist({col_name: hist_dict["weekdays"]},
                                             "base64", "weekdays")
                    hist_hour = plot_hist({col_name: hist_dict["hours"]},
                                          "base64", "hours")
                    hist_minute = plot_hist({col_name: hist_dict["minutes"]},
                                            "base64", "minutes")
                    hist_pic = {
                        "hist_years": hist_year,
                        "hist_months": hist_month,
                        "hist_weekdays": hist_weekday,
                        "hist_hours": hist_hour,
                        "hist_minutes": hist_minute
                    }

                elif col["column_dtype"] in ("int", "string", "decimal"):
                    hist = plot_hist({col_name: hist_dict}, output="base64")
                    hist_pic = {"hist_numeric_string": hist}
            if "frequency" in col:
                freq_pic = plot_frequency({col_name: col["frequency"]},
                                          output="base64")

            html = html + template.render(
                data=col, freq_pic=freq_pic, hist_pic=hist_pic)

        # Save in case we want to output to a html file
        # self.html = html + df.table_html(10)
        self.html = html

        # Display HTML
        print_html(self.html)

        # JSON
        # Save in case we want to output to a json file
        self.json = output

        return self
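
A hedged end-to-end sketch of how run() might be invoked, assuming the Profiler singleton wired up in Example #5 and a dataframe loaded through Optimus; the CSV path and argument values are illustrative:

# Hypothetical usage: profile every column and reuse the HTML stored by run().
op = Optimus()
df = op.load.csv("data.csv")  # illustrative path
profiler = op.profiler.run(df, columns="*", buckets=20, infer=False)
profiler.to_image("profile.png")  # see Example #1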
Example #5
    def __init__(self,
                 session=None,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=None,
                 driver_class_path=None,
                 options=None,
                 additional_options=None,
                 comm=None,
                 load_avro=False,
                 cache=True):
        """
        Transform and roll out
        :param master: Spark master: 'local[*]', 'local' or the address of a cluster master
        :param app_name: Spark app name
        :param path: Path to the checkpoint folder
        :param checkpoint: If True, create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include in the session.
        :type jars: (list[str])

        """

        self.preserve = False

        Optimus.cache = cache

        if comm is True:
            Comm.instance = Comm()
        else:
            Comm.instance = comm

        if jars is None:
            jars = []

        if driver_class_path is None:
            driver_class_path = []

        if session is None:
            # Create the Spark session
            # If a Spark session is not passed as an argument, create one

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            # Initialize as lists
            self.packages = val_to_list(packages)
            self.repositories = val_to_list(repositories)
            self.jars = val_to_list(jars)
            self.driver_class_path = val_to_list(driver_class_path)

            self.additional_options = additional_options

            self.verbose(verbose)

            # Because Avro depends on an external package, you can decide whether it should be loaded
            if load_avro == "2.4":
                self._add_spark_packages(
                    ["org.apache.spark:spark-avro_2.12:2.4.3"])

            elif load_avro == "2.3":
                self._add_spark_packages(
                    ["com.databricks:spark-avro_2.11:4.0.0"])

            jdbc_jars = [
                "/jars/spark-redis-2.4.1-SNAPSHOT-jar-with-dependencies.jar",
                "/jars/RedshiftJDBC42-1.2.16.1027.jar",
                "/jars/mysql-connector-java-8.0.16.jar", "/jars/ojdbc8.jar",
                "/jars/postgresql-42.2.5.jar", "/jars/presto-jdbc-0.224.jar",
                "/jars/spark-cassandra-connector_2.11-2.4.1.jar",
                "/jars/sqlite-jdbc-3.27.2.1.jar",
                "/jars/mssql-jdbc-7.4.1.jre8.jar"
            ]

            self._add_jars(absolute_path(jdbc_jars, "uri"))
            self._add_driver_class_path(absolute_path(jdbc_jars, "posix"))

            self._create_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference
            # logger.print("Spark session")
            Spark.instance = Spark().load(session)

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        # Pickling
        Spark.instance.sc.addPyFile(absolute_path("/infer.py"))

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read

        # Create singleton profiler
        Profiler.instance = Profiler()
        self.profiler = Profiler.instance
        self.ml = ML()

        # Set global output as html
        self.output("html")
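
Two hedged construction sketches: letting Optimus create its own Spark session with the Avro package, and reusing a SparkSession that already exists (argument values are illustrative):

# Hypothetical usage: Optimus builds the Spark session itself.
op = Optimus(master="local[*]", app_name="optimus", verbose=True,
             load_avro="2.4")  # pulls in org.apache.spark:spark-avro_2.12:2.4.3

# Hypothetical usage: attach Optimus to an existing SparkSession instead.
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()
op = Optimus(session=spark)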
Example #6
def table_html(self,
               limit=10,
               columns=None,
               title=None,
               full=False,
               truncate=True):
    """
    Return an HTML table with the dataframe columns, data types and values
    :param self:
    :param columns: Columns to be printed
    :param limit: How many rows will be printed
    :param title: Table title
    :param full: Include html header and footer
    :param truncate: Truncate the row information

    :return:
    """

    columns = parse_columns(self, columns)

    if limit is None:
        limit = 10

    if limit == "all":
        data = collect_as_dict(self.cols.select(columns))
    else:
        data = collect_as_dict(self.cols.select(columns).limit(limit))

    # Load the Jinja template
    template_loader = jinja2.FileSystemLoader(
        searchpath=absolute_path("/templates/out"))
    template_env = jinja2.Environment(loader=template_loader, autoescape=True)
    template = template_env.get_template("table.html")

    # Keep only the column and data type info that is needed
    dtypes = []
    for i, j in zip(self.dtypes, self.schema):
        if i[1].startswith("array<struct"):
            dtype = "array<struct>"
        elif i[1].startswith("struct"):
            dtype = "struct"
        else:
            dtype = i[1]

        dtypes.append((i[0], dtype, j.nullable))

    # Keep only the selected columns
    final_columns = []
    for i in dtypes:
        for j in columns:
            if i[0] == j:
                final_columns.append(i)

    total_rows = self.rows.approx_count()

    if limit == "all":
        limit = total_rows
    elif total_rows < limit:
        limit = total_rows

    total_rows = humanize.intword(total_rows)

    total_cols = self.cols.count()
    total_partitions = self.partitions()

    output = template.render(cols=final_columns,
                             data=data,
                             limit=limit,
                             total_rows=total_rows,
                             total_cols=total_cols,
                             partitions=total_partitions,
                             title=title,
                             truncate=truncate)

    if full is True:
        output = HEADER + output + FOOTER
    return output
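
A short usage sketch, assuming table_html is attached to the dataframe (the explicit self parameter suggests monkey-patching) and that print_html renders inside the notebook:

# Hypothetical usage: display the first 20 rows as an inline HTML table.
print_html(df.table_html(limit=20, title="Sample"))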
Example #7
    def table_html(self,
                   limit=10,
                   columns=None,
                   title=None,
                   full=False,
                   truncate=True,
                   count=True):
        """
        Return an HTML table with the Spark dataframe columns, data types and values
        :param columns: Columns to be printed
        :param limit: How many rows will be printed
        :param title: Table title
        :param full: Include html header and footer
        :param truncate: Truncate the row information
        :param count:

        :return:
        """

        columns = parse_columns(self, columns)
        if limit is None:
            limit = 10

        df = self
        if limit == "all":
            data = df.cols.select(columns).to_dict()
        else:
            data = df.cols.select(columns).rows.limit(limit).to_dict()
        # Load the Jinja template
        template_loader = jinja2.FileSystemLoader(
            searchpath=absolute_path("/templates/out"))
        template_env = jinja2.Environment(loader=template_loader,
                                          autoescape=True)
        template = template_env.get_template("table.html")

        # Keep only the column and data type info that is needed
        dtypes = [(k, v) for k, v in df.cols.dtypes().items()]

        # Keep only the selected columns
        final_columns = []
        for i in dtypes:
            for j in columns:
                if i[0] == j:
                    final_columns.append(i)

        # if count is True:

        # else:
        #     count = None
        total_rows = df.rows.approx_count()
        if limit == "all" or total_rows < limit:
            limit = total_rows

        total_rows = humanize.intword(total_rows)
        total_cols = df.cols.count()
        total_partitions = df.partitions()

        df_type = type(df)
        output = template.render(df_type=df_type,
                                 cols=final_columns,
                                 data=data,
                                 limit=limit,
                                 total_rows=total_rows,
                                 total_cols=total_cols,
                                 partitions=total_partitions,
                                 title=title,
                                 truncate=truncate)

        if full is True:
            output = HEADER + output + FOOTER
        return output
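
With full=True the rendered table is wrapped in HEADER and FOOTER, so it can be written out as a standalone page; a hedged sketch with an illustrative file name:

# Hypothetical usage: save a complete, standalone HTML page to disk.
html = df.table_html(limit=50, full=True)
with open("table.html", "w", encoding="utf8") as f:
    f.write(html)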