Example #1
    def columns_stats(self,
                      df,
                      columns,
                      buckets=10,
                      infer=False,
                      relative_error=RELATIVE_ERROR,
                      approx_count=True,
                      mismatch=None,
                      advanced_stats=True):
        """
        Return statistical information about the specified columns in JSON format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of equal-width buckets used for histograms
        :param infer: Try to infer the column data type
        :param relative_error: Relative error used when percentiles are calculated.
        0 is more precise but slower, 1 is less precise but faster
        :param approx_count: Use approx_count_distinct or countDistinct. approx_count_distinct is faster
        :param mismatch:
        :return: JSON object
        """

        columns = parse_columns(df, columns)

        # Initialize Objects
        logger.print("Processing Stats For columns...")

        # Get columns data types. This is necessary to make the pertinent histogram calculations.
        count_by_data_type = df.cols.count_by_dtypes(columns,
                                                     infer=infer,
                                                     mismatch=mismatch)

        count_by_data_type_no_mismatch = copy.deepcopy(count_by_data_type)

        # Info from all the columns
        type_details = {}

        for col_name in columns:
            # Do not count mismatches
            if "mismatch" in count_by_data_type_no_mismatch[col_name]:
                count_by_data_type_no_mismatch[col_name].pop("mismatch")

            # Get the greatest count by column data type
            greatest_data_type_count = max(
                count_by_data_type_no_mismatch[col_name],
                key=count_by_data_type_no_mismatch[col_name].get)
            cat = PYTHON_TO_PROFILER.get(greatest_data_type_count)

            assign(type_details, col_name + ".dtype", greatest_data_type_count,
                   dict)
            assign(type_details, col_name + ".type", cat, dict)
            assign(type_details, col_name + ".stats",
                   count_by_data_type[col_name], dict)

        # Count the categorical, numerical, boolean and date columns
        count_types = {}
        for value in type_details.values():
            name = value["dtype"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        # List the data types this data set has
        dtypes = [key for key, value in count_types.items() if value > 0]

        columns_info = {}
        columns_info["count_types"] = fill_missing_col_types(count_types)
        columns_info["total_count_dtypes"] = len(dtypes)
        columns_info["dtypes_list"] = dtypes
        columns_info["columns"] = type_details

        # Aggregation
        stats = self.columns_agg(df, columns, buckets, relative_error,
                                 approx_count, advanced_stats)

        # Calculate Frequency
        logger.print("Processing Frequency ...")
        # print("COLUMNS",columns)
        df_freq = df.cols.select(columns,
                                 data_type=PYSPARK_NUMERIC_TYPES,
                                 invert=True)

        freq = None
        if df_freq is not None:
            freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)
            # print("FREQUENCY1", freq)
        for col_name in columns:
            col_info = {}
            assign(col_info, "stats", stats[col_name], dict)

            if freq is not None:
                if col_name in freq:
                    # print("ASSIGN")
                    assign(col_info, "frequency", freq[col_name])

            assign(col_info, "name", col_name)
            assign(col_info, "column_dtype",
                   columns_info["columns"][col_name]['dtype'])
            assign(col_info, "dtypes_stats",
                   columns_info["columns"][col_name]['stats'])
            assign(col_info, "column_type",
                   columns_info["columns"][col_name]['type'])
            assign(columns_info, "columns." + col_name, col_info, dict)

            assign(col_info, "id", df.cols.get_meta(col_name, "id"))

        return columns_info
Example #2
    def __init__(self,
                 session=None,
                 master="local[*]",
                 app_name="optimus",
                 checkpoint=False,
                 path=None,
                 file_system="local",
                 verbose=False,
                 server=False,
                 repositories=None,
                 packages=None,
                 jars=[],
                 driver_class_path=[],
                 options=None,
                 additional_options=None,
                 comm=None,
                 load_avro=False,
                 cache=True):
        """
        Transform and roll out
        :param master: 'local', 'local[*]' or the IP address/URL of a cluster master
        :param app_name: Spark app name
        :param path: path to the checkpoint folder
        :param checkpoint: If True create a checkpoint folder
        :param file_system: 'local' or 'hadoop'
        :param additional_options:


        :param options: Configuration options that are passed to spark-submit.
            See `the list of possible options
            <https://spark.apache.org/docs/2.4.1/configuration.html#available-properties>`_.
            Note that any options set already through PYSPARK_SUBMIT_ARGS will override
            these.
        :type options: (dict[str,str])
        :param repositories: List of additional maven repositories for package lookup.
        :type repositories: (list[str])

        :param packages: Spark packages that should be installed.
        :type packages: (list[str])

        :param jars: Full paths to jar files that we want to include to the session.
        :type jars: (list[str])

        """
        self.preserve = False

        Optimus.cache = cache

        if comm is True:
            Comm.instance = Comm()
        else:
            Comm.instance = comm

        if session is None:
            # Creating Spark Session
            # If a Spark session is not passed as an argument, create one

            self.master = master
            self.app_name = app_name

            if options is None:
                options = {}

            self.options = options

            # Initialize as lists
            self.packages = val_to_list(packages)
            self.repositories = val_to_list(repositories)
            self.jars = val_to_list(jars)
            self.driver_class_path = val_to_list(driver_class_path)

            self.additional_options = additional_options

            self.verbose(verbose)

            # Because Avro support depends on an external package, you can decide whether it should be loaded
            if load_avro == "2.4":
                self._add_spark_packages(
                    ["org.apache.spark:spark-avro_2.12:2.4.3"])

            elif load_avro == "2.3":
                self._add_spark_packages(
                    ["com.databricks:spark-avro_2.11:4.0.0"])

            jdbc_jars = [
                "/jars/RedshiftJDBC42-1.2.16.1027.jar",
                "/jars/mysql-connector-java-8.0.16.jar", "/jars/ojdbc8.jar",
                "/jars/postgresql-42.2.5.jar", "/jars/presto-jdbc-0.224.jar",
                "/jars/spark-cassandra-connector_2.11-2.4.1.jar",
                "/jars/sqlite-jdbc-3.27.2.1.jar",
                "/jars/mssql-jdbc-7.4.1.jre8.jar"
            ]

            self._add_jars(absolute_path(jdbc_jars, "uri"))
            self._add_driver_class_path(absolute_path(jdbc_jars, "posix"))

            self._create_session()

            if path is None:
                path = os.getcwd()

            if checkpoint is True:
                self._set_check_point_folder(path, file_system)

        else:
            # If a session is passed as an argument, just save the reference
            # logger.print("Spark session")
            Spark.instance = Spark().load(session)

        # Initialize Spark
        logger.print("""
                             ____        __  _                     
                            / __ \____  / /_(_)___ ___  __  _______
                           / / / / __ \/ __/ / __ `__ \/ / / / ___/
                          / /_/ / /_/ / /_/ / / / / / / /_/ (__  ) 
                          \____/ .___/\__/_/_/ /_/ /_/\__,_/____/  
                              /_/                                  
                              """)

        logger.print(STARTING_OPTIMUS)

        # Pickling
        # Spark.instance.sc.addPyFile(absolute_path("/helpers/pickle.py"))

        if server:
            logger.print("Starting Optimus Server...")
            s = Server()
            s.start()
            self.server_instance = s

        logger.print(SUCCESS)

        self.create = Create()
        self.load = Load()
        self.read = self.spark.read

        # Create singleton profiler
        Profiler.instance = Profiler()
        self.profiler = Profiler.instance
        self.ml = ML()

        # Set global output as html
        self.output("html")
Example #3
def print_check_point_config(filesystem):
    logger.print(
        "Setting checkpoint folder %s. If you are in a cluster initialize Optimus with master='your_ip' as param",
        filesystem)
Example #4
    def _set_check_point_folder(path, file_system):
        """
        Function that receives a workspace path where a folder is created.
        This folder will store temporary dataframes when the user calls .checkpoint().

        :param path: Location of the dataset (string).
        :param file_system: Specifies whether the file system is local or Hadoop (HDFS).

        """

        print_check_point_config(file_system)

        if file_system == "hadoop":
            folder_path = path + "/" + "checkPointFolder"
            Optimus.delete_check_point_folder(path=path,
                                              file_system=file_system)

            # Creating file:
            logger.print("Creating the hadoop folder...")
            command = "hadoop fs -mkdir " + folder_path
            logger.print("$" + command)
            os.system(command)
            logger.print("Hadoop folder created. \n")

            logger.print("Setting created folder as checkpoint folder...")
            Spark.instance.sc.setCheckpointDir(folder_path)
        elif file_system == "local":
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            # Check if the temp folder exists:
            logger.print("Deleting previous folder if exists...")
            if os.path.isdir(folder_path):
                # Delete the folder if it exists:
                rmtree(folder_path)

            logger.print("Creating the checkpoint directory...")
            # Creates new folder:
            os.mkdir(folder_path)

            Spark.instance.sc.setCheckpointDir(dirName="file:///" +
                                               folder_path)
        else:
            RaiseIt.value_error(file_system, ["hadoop", "local"])
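For context, this helper is only reached through the constructor in Example #2; a hedged sketch of how it gets triggered (the loader call and paths are assumptions):

# Hypothetical trigger: checkpoint=True makes the constructor call
# _set_check_point_folder(path, file_system).
op = Optimus(checkpoint=True, path="/tmp/optimus_workspace", file_system="local")

# Later, an expensive dataframe lineage can be cut with Spark's standard
# checkpoint(), which writes into the folder configured above.
df = op.load.csv("data/example.csv")   # assumed loader entry point
df = df.checkpoint()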
Example #5
    def delete_check_point_folder(path, file_system):
        """
        Function that deletes the temporary folder where temp files were stored.
        The path required is the same provided by user in setCheckPointFolder().

        :param path: path where the info will be saved
        :param file_system: Describes if file system is local or hadoop file system.
        :return:
        """

        if file_system == "hadoop":
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            logger.print("Deleting checkpoint folder...")
            command = "hadoop fs -rm -r " + folder_path
            os.system(command)
            logger.print("$" + command)
            logger.print("Folder deleted.")
        elif file_system == "local":
            logger.print("Deleting checkpoint folder...")
            # Folder path:
            folder_path = path + "/" + "checkPointFolder"
            # Check if the temp folder exists:
            if os.path.isdir(folder_path):
                # Delete the folder if it exists:
                rmtree(folder_path)
                logger.print("Folder deleted.")
            else:
                logger.print("Folder does not exist.")
        else:
            RaiseIt.value_error(file_system, ["hadoop", "local"])
Example #6
    def columns_agg(df,
                    columns,
                    buckets=10,
                    relative_error=RELATIVE_ERROR,
                    approx_count=True):
        columns = parse_columns(df, columns)
        n = BATCH_SIZE
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]
        # We have problems sending 100+ columns at the same time, so process them in batches

        result = {}
        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [count_uniques_agg]
            exprs = df.cols.create_exprs(cols, funcs, approx_count)

            # TODO: in basic calculations funcs = [F.min, F.max]
            funcs = [
                F.min, F.max, F.stddev, F.kurtosis, F.mean, F.skewness, F.sum,
                F.variance, zeros_agg
            ]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            # TODO: None in basic calculation
            funcs = [percentile_agg]
            exprs.extend(
                df.cols.create_exprs(cols, funcs, df,
                                     [0.05, 0.25, 0.5, 0.75, 0.95],
                                     relative_error))

            funcs = [count_na_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs, df))
            result.update(df.cols.exec_agg(exprs))

        exprs = []
        n = BATCH_SIZE
        result_hist = {}
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]
        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [hist_agg]
            # min_max = None

            for col_name in cols:
                # Only process histograms for numeric columns. For other data types use frequency
                if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                    min_max = {
                        "min": result[col_name]["min"],
                        "max": result[col_name]["max"]
                    }
                    buckets = result[col_name]["count_uniques"] - 1
                    if buckets > MAX_BUCKETS:
                        buckets = MAX_BUCKETS
                    elif buckets == 0:
                        buckets = 1
                    exprs.extend(
                        df.cols.create_exprs(col_name, funcs, df, buckets,
                                             min_max))

            agg_result = df.cols.exec_agg(exprs)
            if agg_result is not None:
                result_hist.update(agg_result)

        # Merge results
        for col_name in result:
            if col_name in result_hist:
                result[col_name].update(result_hist[col_name])
        return result
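The batching idiom used above (and again in Example #15) is plain Python; a standalone sketch of the same slicing, with a hypothetical BATCH_SIZE:

# Standalone illustration of the batch slicing used above.
BATCH_SIZE = 3  # hypothetical; the real value comes from the Optimus constants

columns = ["a", "b", "c", "d", "e", "f", "g"]
n = BATCH_SIZE
list_columns = [columns[i * n:(i + 1) * n]
                for i in range((len(columns) + n - 1) // n)]

print(list_columns)  # [['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]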
Example #7
    def create(self, obj, method, suffix=None, output="df", additional_method=None, *args, **kwargs):
        """
        This is a helper function that outputs Python tests for Spark Dataframes.
        :param obj: Object to be tested
        :param method: Method to be tested
        :param suffix: The test name will be created using the method param. suffix adds a string in case you want
        to customize the test name.
        :param output: can be 'df' or 'json'
        :param additional_method:
        :param args: Arguments to be used in the method
        :param kwargs: Keyword arguments to be used in the functions
        :return:
        """

        buffer = []

        def add_buffer(value):
            buffer.append("\t" + value)

        # Create name
        name = []

        if method is not None:
            name.append(method.replace(".", "_"))

        if additional_method is not None:
            name.append(additional_method)

        if suffix is not None:
            name.append(suffix)

        test_name = "_".join(name)

        func_test_name = "test_" + test_name + "()"
        filename = test_name + ".test"

        print("Creating {test} test function...".format(test=func_test_name))
        logger.print(func_test_name)

        add_buffer("@staticmethod\n")
        add_buffer("def " + func_test_name + ":\n")

        source = "source_df"
        if obj is None:
            # Use the main df
            df_func = self.df
        elif isinstance(obj, pyspark.sql.dataframe.DataFrame):
            source_df = "\tsource_df=op.create.df(" + obj.export() + ")\n"
            df_func = obj
            add_buffer(source_df)
        else:
            source = get_var_name(obj)
            df_func = obj

        # Process simple arguments
        _args = []
        for v in args:
            if is_str(v):
                _args.append("'" + v + "'")
            elif is_numeric(v):
                _args.append(str(v))

            elif is_list(v):
                if is_list_of_strings(v):
                    lst = ["'" + x + "'" for x in v]
                elif is_list_of_numeric(v):
                    lst = [str(x) for x in v]
                elif is_list_of_tuples(v):
                    lst = [str(x) for x in v]

                _args.append('[' + ','.join(lst) + ']')
            elif is_function(v):
                _args.append(v.__qualname__)

            else:
                _args.append(get_var_name(v))

            # else:
            #     import marshal
            #     code_string = marshal.dumps(v.__code__)
            #     add_buffer("\tfunction = '" + code_string + "'\n")
            # import marshal, types
            #
            # code = marshal.loads(code_string)
            # func = types.FunctionType(code, globals(), "some_func_name")

        _args = ','.join(_args)
        _kwargs = []

        # print(_args)
        # Process keywords arguments
        for k, v in kwargs.items():
            if is_str(v):
                v = "'" + v + "'"
            _kwargs.append(k + "=" + str(v))

        # Separator if we have positional and keyword arguments
        separator = ""
        if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
            separator = ","

        if method is None:
            add_buffer("\tactual_df = source_df\n")
        else:
            am = ""
            if additional_method:
                am = "." + additional_method + "()"

            add_buffer("\tactual_df =" + source + "." + method + "(" + _args + separator + ','.join(
                _kwargs) + ")" + am + "\n")

        # Apply function to the dataframe
        if method is None:
            df_result = self.op.create.df(*args, **kwargs)
        else:
            # Here we construct the method to be applied to the source object
            for f in method.split("."):
                df_func = getattr(df_func, f)

            df_result = df_func(*args, **kwargs)

        # Additional Methods
        if additional_method is not None:
            df_result = getattr(df_result, additional_method)()

        if output == "df":

            df_result.table()
            expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
        elif output == "json":
            print(df_result)

            if is_str(df_result):
                df_result = "'" + df_result + "'"
            else:
                df_result = str(df_result)
            add_buffer("\tactual_df =json_enconding(actual_df)\n")

            expected = "\texpected_value =json_enconding(" + df_result + ")\n"
        else:
            expected = "\t\n"

        add_buffer(expected)

        # Output
        if output == "df":
            add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
        elif output == "json":
            add_buffer("\tassert (expected_value == actual_df)\n")

        filename = self.path + "//" + filename
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Write the file and make sure it is closed
        with open(filename, 'w', encoding='utf-8') as test_file:
            for b in buffer:
                test_file.write(b)
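The buffer-and-write pattern above is easy to exercise on its own; a minimal, self-contained sketch (the output path and test body are made up):

# Minimal sketch of the buffer-and-write pattern used by the test generator above.
import os

buffer = []

def add_buffer(value):
    buffer.append("\t" + value)

add_buffer("@staticmethod\n")
add_buffer("def test_example():\n")
add_buffer("\tassert (1 == 1)\n")

filename = "outputs/test_example.test"        # hypothetical path
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, "w", encoding="utf-8") as test_file:
    for b in buffer:
        test_file.write(b)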
Example #8
    def create(self, df, func, suffix=None, output="df", *args, **kwargs):
        """
        This is a helper function that outputs Python tests for Spark Dataframes.
        :param df: Spark Dataframe
        :param suffix: The create method will try to create a test function with the given func param.
        If you want to test a function with different params you can use suffix.
        :param func: Spark dataframe function to be tested
        :param output: can be 'df' or 'json'
        :param args: Arguments to be used in the function
        :param kwargs: Keyword arguments to be used in the functions
        :return:
        """

        buffer = []

        def add_buffer(value):
            buffer.append("\t" + value)

        if suffix is None:
            suffix = ""
        else:
            suffix = "_" + suffix

        # Create the test function name. If func is None we just test the create.df function and do not transform the
        # data frame in any way
        if func is None:
            func_test_name = "test_" + "create_df" + suffix + "()"
            filename = "create_df" + suffix + ".test"

        else:
            func_test_name = "test_" + func.replace(".", "_") + suffix + "()"

            filename = func.replace(".", "_") + suffix + ".test"

        print("Creating {test} test function...".format(test=func_test_name))
        logger.print(func_test_name)

        add_buffer("@staticmethod\n")
        add_buffer("def " + func_test_name + ":\n")

        source = "source_df"
        if df is None:
            # Use the main df
            df_func = self.df
        elif isinstance(df, pyspark.sql.dataframe.DataFrame):
            source_df = "\tsource_df=op.create.df(" + df.export() + ")\n"
            df_func = df
            add_buffer(source_df)
        else:
            # TODO: op is not supposed to be hardcoded
            source = "op"
            df_func = df

        # Process simple arguments
        _args = []
        for v in args:
            if is_str(v):
                _args.append("'" + v + "'")
            elif is_numeric(v):
                _args.append(str(v))
            elif is_list(v):
                if is_list_of_strings(v):
                    lst = ["'" + x + "'" for x in v]
                elif is_list_of_numeric(v):
                    lst = [str(x) for x in v]
                elif is_list_of_tuples(v):
                    lst = [str(x) for x in v]

                _args.append('[' + ','.join(lst) + ']')
            elif is_function(v):
                _args.append(v.__qualname__)
            # else:
            #     import marshal
            #     code_string = marshal.dumps(v.__code__)
            #     add_buffer("\tfunction = '" + code_string + "'\n")
            # import marshal, types
            #
            # code = marshal.loads(code_string)
            # func = types.FunctionType(code, globals(), "some_func_name")

        _args = ','.join(_args)
        _kwargs = []

        # print(_args)
        # Process keywords arguments
        for k, v in kwargs.items():
            if is_str(v):
                v = "'" + v + "'"
            _kwargs.append(k + "=" + str(v))

        # Separator if we have positional and keyword arguments
        separator = ""
        if (not is_list_empty(args)) & (not is_list_empty(kwargs)):
            separator = ","

        if func is None:
            add_buffer("\tactual_df = source_df\n")
        else:
            add_buffer("\tactual_df =" + source + "." + func + "(" + _args + separator + ','.join(_kwargs) + ")\n")

        # Apply function to the dataframe
        if func is None:
            df_result = self.op.create.df(*args, **kwargs)
        else:
            # Here we construct the method to be applied to the source object
            for f in func.split("."):
                df_func = getattr(df_func, f)

            df_result = df_func(*args, **kwargs)

        if output == "df":
            df_result.table()
            expected = "\texpected_df = op.create.df(" + df_result.export() + ")\n"
        elif output == "json":
            print(df_result)
            if is_str(df_result):
                df_result = "'" + df_result + "'"
            else:
                df_result = str(df_result)
            add_buffer("\tactual_df =json_enconding(actual_df)\n")

            expected = "\texpected_value =json_enconding(" + df_result + ")\n"
        else:
            expected = "\t\n"

        add_buffer(expected)

        if output == "df":
            add_buffer("\tassert (expected_df.collect() == actual_df.collect())\n")
        elif output == "json":
            add_buffer("\tassert (expected_value == actual_df)\n")

        filename = self.path + "//" + filename
        if not os.path.exists(os.path.dirname(filename)):
            try:
                os.makedirs(os.path.dirname(filename))
            except OSError as exc:  # Guard against race condition
                if exc.errno != errno.EEXIST:
                    raise

        # Write the file and make sure it is closed
        with open(filename, 'w', encoding='utf-8') as test_file:
            for b in buffer:
                test_file.write(b)
Example #9
    def columns(df, columns, buckets=40, infer=False, relative_error=1):
        """
        Return statistical information about the specified columns in JSON format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of equal-width buckets used for histograms
        :param infer: Try to infer the column data type
        :param relative_error: Relative error used when percentiles are calculated. 0 is more exact but slower, 1 has more error but is faster
        :return: JSON object with the profiled column information
        """

        columns = parse_columns(df, columns)

        # Get just a sample to infer the column data type
        # sample_size_number = sample_size(rows_count, 95.0, 2.0)
        # fraction = sample_size_number / rows_count
        # sample = df.sample(False, fraction, seed=1)

        # Initialize Objects
        columns_info = {}
        columns_info['columns'] = {}

        rows_count = df.count()
        columns_info['rows_count'] = humanize.intword(rows_count)
        count_dtypes = Profiler.count_data_types(df, columns, infer)

        columns_info["count_types"] = count_dtypes["count_types"]

        columns_info['size'] = humanize.naturalsize(df.size())

        # Cast columns to the data type inferred by count_data_types()
        df = Profiler.cast_columns(df, columns, count_dtypes).cache()

        # Calculate stats
        stats = Profiler.general_stats(df, columns)

        for col_name in columns:
            col_info = {}
            logger.print("------------------------------")
            logger.print("Processing column '" + col_name + "'...")
            columns_info['columns'][col_name] = {}

            col_info["stats"] = stats[col_name]
            col_info.update(Profiler.frequency(df, col_name, buckets))
            col_info.update(Profiler.stats_by_column(col_name, stats, count_dtypes, rows_count))

            col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']
            col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']

            column_type = count_dtypes["columns"][col_name]['type']

            if column_type == "numeric":
                col_info["stats"].update(Profiler.extra_numeric_stats(df, col_name, stats, relative_error))
                col_info["hist"] = df.cols.hist(col_name, stats[col_name]["min"], stats[col_name]["max"], buckets)

            if column_type == "categorical" or column_type == "array":
                col_info["hist"] = Profiler.hist_string(df, col_name, buckets)

            if column_type == "date":
                col_info["hist"] = Profiler.hist_date(df, col_name)

            columns_info['columns'][col_name] = col_info

        return columns_info
Example #10
def optimus(engine=Engine.DASK.value, *args, **kwargs):
    """
    This is the entry point to initialize the selected engine.
    :param engine: A string identifying an engine (see :class:`Engine`).
    :param args:
    :param kwargs:
    :return:
    """
    logger.print("ENGINE", engine)

    # lemmatizer
    nltk.download('wordnet', quiet=True)

    # Stopwords
    nltk.download('stopwords', quiet=True)

    # Init engine
    if engine == Engine.PANDAS.value:
        from optimus.engines.pandas.engine import PandasEngine
        op = PandasEngine(*args, **kwargs)

    elif engine == Engine.VAEX.value:
        from optimus.engines.vaex.engine import VaexEngine
        op = VaexEngine(*args, **kwargs)

    elif engine == Engine.SPARK.value:
        from optimus.engines.spark.engine import SparkEngine
        op = SparkEngine(*args, **kwargs)

    elif engine == Engine.DASK.value:
        from optimus.engines.dask.engine import DaskEngine
        op = DaskEngine(*args, **kwargs)

    elif engine == Engine.IBIS.value:
        from optimus.engines.ibis.engine import IbisEngine
        op = IbisEngine(*args, **kwargs)

    elif engine == Engine.CUDF.value:
        from optimus.engines.cudf.engine import CUDFEngine
        op = CUDFEngine(*args, **kwargs)

    elif engine == Engine.DASK_CUDF.value:
        from optimus.engines.dask_cudf.engine import DaskCUDFEngine
        op = DaskCUDFEngine(*args, **kwargs)

    else:
        RaiseIt.value_error(engine, Engine.list())

    # Set cupy to use RMM
    def switch_to_rmm_allocator():
        import rmm
        import cupy
        cupy.cuda.set_allocator(rmm.rmm_cupy_allocator)
        return True

    if engine == Engine.CUDF.value:
        switch_to_rmm_allocator()

    if engine == Engine.DASK_CUDF.value:
        if op.client:
            op.client.run(switch_to_rmm_allocator)

    return op
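A hedged usage sketch of the factory defined above; the engine string assumes Engine.PANDAS.value == "pandas", and the loader call and .cols accessor are assumptions based on the rest of this codebase.

# Hypothetical usage of the optimus() factory shown above.
op = optimus(engine="pandas")            # assumes Engine.PANDAS.value == "pandas"
df = op.load.csv("data/example.csv")     # assumed loader entry point
print(df.cols.names())                   # column-name accessor, as used in parse_columns (Example #11)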
Example #11
def parse_columns(df,
                  cols_args,
                  get_args=False,
                  is_regex=None,
                  filter_by_column_dtypes=None,
                  accepts_missing_cols=False,
                  invert=False):
    """
    Return a list of columns and check that the columns exist in the dataframe.
    Accepts '*' as a parameter, in which case it returns a list of all columns in the dataframe.
    Also accepts a regex.
    If a list of tuples is passed, the first element of each tuple is the column name and the other elements are params.
    These params can be used to create custom transformation functions. You can find an example in cols().cast()
    :param df: Dataframe in which the columns are going to be checked
    :param cols_args: Accepts '*' as param to return all the string columns in the dataframe
    :param get_args:
    :param is_regex: Use True if cols_args is a regex
    :param filter_by_column_dtypes: A data type by which the column list is going to be filtered
    :param accepts_missing_cols: If True, do not check whether the columns exist in the dataframe
    :param invert: Invert the final selection. For example if you want to select the non-integer columns

    :return: A list of column string names
    """

    if not is_dataframe(df):
        RaiseIt.type_error(df, "Dataframe")
    attrs = None

    # If the columns value is '*', get all the dataframe's columns
    if is_regex is True:
        r = re.compile(cols_args[0])
        cols = list(filter(r.match, df.columns))

    elif cols_args == "*" or cols_args is None:
        cols = df.columns

    # In case we have a list of tuples, the first element of each tuple is taken as the column name
    # and the rest as params. We can use the params in a custom function as follows:
    # def func(attrs):  # attrs receives (1, 2) and (3, 4)
    #     return attrs[0] + 1
    # df.cols().apply([('col_1', 1, 2), ('cols_2', 3, 4)], func)

    # Verify if we have a list with tuples
    elif is_tuple(cols_args) or is_list_of_tuples(cols_args):
        cols_args = val_to_list(cols_args)
        # Extract a specific position in the tuple
        cols = [(i[0:1][0]) for i in cols_args]
        attrs = [(i[1:]) for i in cols_args]
    else:
        # if not a list convert to list
        cols = val_to_list(cols_args)
        # Get col name from index
        cols = [c if is_str(c) else df.columns[c] for c in cols]

    # Check for missing columns
    if accepts_missing_cols is False:
        check_for_missing_columns(df, cols)

    # Filter by column data type
    filter_by_column_dtypes = val_to_list(filter_by_column_dtypes)
    if is_list_of_list(filter_by_column_dtypes):
        filter_by_column_dtypes = [
            item for sublist in filter_by_column_dtypes for item in sublist
        ]

    columns_residual = None

    # If necessary filter the columns by data type
    if filter_by_column_dtypes:
        # Get columns for every data type

        columns_filtered = filter_col_name_by_dtypes(df,
                                                     filter_by_column_dtypes)

        # Intersect the columns filtered per data type from the whole dataframe with the columns passed to the function
        final_columns = list(OrderedSet(cols).intersection(columns_filtered))

        # These columns did not match the filtered data types
        columns_residual = list(
            OrderedSet(cols) - OrderedSet(columns_filtered))
    else:
        final_columns = cols

    cols_params = []
    if invert:
        final_columns = list(
            OrderedSet(df.cols.names()) - OrderedSet(final_columns))

    if get_args is True:
        cols_params = final_columns, attrs
    elif get_args is False:
        cols_params = final_columns
    else:
        RaiseIt.value_error(get_args, ["True", "False"])

    if columns_residual:
        logger.print("%s %s %s", ",".join(escape_columns(columns_residual)),
                     "column(s) was not processed because is/are not",
                     ",".join(filter_by_column_dtypes))

    # if because of filtering we got 0 columns return None
    if len(cols_params) == 0:
        cols_params = None
        logger.print("Outputting 0 columns after filtering. Is this expected?")

    return cols_params
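The list-of-tuples branch above splits each tuple into a column name and its params; a standalone sketch of that split (the column names and values are made up):

# Standalone illustration of the tuple-splitting branch above.
cols_args = [("col_1", 1, 2), ("col_2", 3, 4)]

cols = [i[0:1][0] for i in cols_args]    # ["col_1", "col_2"]  (column names)
attrs = [i[1:] for i in cols_args]       # [(1, 2), (3, 4)]    (params passed on)

print(cols, attrs)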
Example #12
        def _count_data_types(col_name):
            """
            Function to determine whether the record values are float, int or string.
            :param col_name:
            :return:
            """

            # If string, process the data to try to infer which data type is inside. This is a kind of optimization;
            # we do not need to analyze the data if the column data type is integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            # Parse dtype
            if col_data_type == "smallint" or col_data_type == "tinyint":
                col_data_type = "int"
            elif col_data_type == "float" or col_data_type == "double":
                col_data_type = "decimal"
            elif col_data_type.find("array") >= 0:
                col_data_type = "array"

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":
                logger.print("Processing column '" + col_name + "'...")
                types = collect_as_dict(df
                                        .h_repartition(col_name=col_name)
                                        .withColumn(temp, fbdt(col_name, get_type=True))
                                        .groupBy(temp).count()
                                        )

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                # if boolean not support count na
                if "count_na" in stats[col_name]:
                    nulls = stats[col_name]["count_na"]
                    count_by_data_type[col_data_type] = int(df_count) - nulls
                    count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Count of null values and empty strings
            null_missed_count = {"null": count_by_data_type['null'],
                                 "missing": count_empty_strings,
                                 }
            # Get the greatest count by column data type
            greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)

            if greatest_data_type_count == "string" or greatest_data_type_count == "boolean":
                cat = "categorical"
            elif greatest_data_type_count == "int" or greatest_data_type_count == "decimal":
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "array":
                cat = "array"
            elif greatest_data_type_count == "binary":
                cat = "binary"
            elif greatest_data_type_count == "null":
                cat = "null"
            else:
                cat = None

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**count_by_data_type, **null_missed_count}

            return col
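Picking the dominant data type above is the standard max-over-dict-values idiom; a standalone sketch with made-up counts:

# Standalone illustration of selecting the dominant dtype by count.
count_by_data_type = {"int": 120, "string": 7, "null": 3}  # made-up counts

greatest_data_type_count = max(count_by_data_type, key=count_by_data_type.get)
print(greatest_data_type_count)  # "int" -> the column would be treated as numeric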
Example #13
    def columns(self, df, columns, buckets=10, infer=False, relative_error=RELATIVE_ERROR, approx_count=True):
        """
        Return statistical information about the specified columns in JSON format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of equal-width buckets used for histograms
        :param infer: Try to infer the column data type
        :param relative_error: Relative error used when percentiles are calculated. 0 is more exact but slower, 1 has more error but is faster
        :param approx_count: Use approx_count_distinct or countDistinct. approx_count_distinct is faster
        :return: JSON object
        """

        columns = parse_columns(df, columns)

        self.rows_count = df.count()
        self.cols_count = len(df.columns)

        # Initialize Objects
        columns_info = {}
        columns_info['columns'] = {}
        columns_info['name'] = df._name

        columns_info['rows_count'] = humanize.intword(self.rows_count)
        logger.print("Processing General Stats...")
        stats = Profiler.general_stats(df, columns, buckets, relative_error, approx_count)
        count_dtypes = self._count_data_types(df, columns, infer, stats)

        columns_info["count_types"] = count_dtypes["count_types"]
        columns_info['size'] = humanize.naturalsize(df.size())

        # Cast columns to the data type inferred by count_data_types()
        # df = Profiler.cast_columns(df, columns, count_dtypes).cache()

        # Calculate stats
        logger.print("Processing Frequency ...")
        freq = df.cols.frequency(columns, buckets, True, self.rows_count)

        # Missing
        total_count_na = 0
        for col_name in columns:
            total_count_na = total_count_na + stats[col_name]["count_na"]
        columns_info["summary"] = {}
        columns_info["summary"]['missing_count'] = total_count_na
        columns_info["summary"]['p_missing'] = round(total_count_na / self.rows_count * 100, 2)

        # Calculate percentage
        for col_name in columns:
            col_info = {}

            col_info["stats"] = stats[col_name]

            if freq is not None:
                col_info["frequency"] = freq[col_name]

            col_info["stats"].update(self.extra_stats(df, col_name, stats))

            col_info['name'] = col_name
            col_info['column_dtype'] = count_dtypes["columns"][col_name]['dtype']
            col_info["dtypes_stats"] = count_dtypes["columns"][col_name]['details']
            col_info['column_type'] = count_dtypes["columns"][col_name]['type']

            columns_info['columns'][col_name] = {}
            columns_info['columns'][col_name] = col_info

        return columns_info
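A hedged end-to-end sketch of calling this profiler entry point; the loader call and the column name are assumptions, while the result keys come from the code above.

# Hypothetical usage of Profiler.columns() shown above.
op = Optimus(master="local[*]", app_name="optimus")    # constructor from Example #2
df = op.load.csv("data/example.csv")                   # assumed loader entry point

summary = op.profiler.columns(df, "*", buckets=20, infer=True)
print(summary["rows_count"], summary["size"])
print(summary["summary"]["p_missing"])                 # percentage of missing values
print(summary["columns"]["some_column"]["stats"])      # hypothetical column name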
Example #14
    def csv(path,
            sep=',',
            header=True,
            infer_schema=True,
            na_values=None,
            encoding="utf-8",
            n_rows=-1,
            cache=False,
            quoting=0,
            lineterminator=None,
            error_bad_lines=False,
            engine="c",
            keep_default_na=False,
            na_filter=False,
            null_value=None,
            storage_options=None,
            conn=None,
            n_partitions=1,
            *args,
            **kwargs):
        """
        Return a dataframe from a csv file. It is the same as the read.csv function with some predefined
        params

        :param path: path or location of the file.
        :param sep: usually the delimiter mark is ',' or ';'.
        :param header: tells the function whether the dataset has a header row. True by default.
        :param infer_schema: infers the input schema automatically from the data.
        It requires one extra pass over the data. True by default.
        :param null_value:
        :param encoding:

        :return dataFrame
        """
        path = unquote_path(path)

        if cache is False:
            prepare_path.cache_clear()

        if conn is not None:
            path = conn.path(path)
            storage_options = conn.storage_options

        remove_param = "chunk_size"
        if kwargs.get(remove_param):
            # This is handled this way to preserve compatibility with other dataframe technologies.
            logger.print(
                f"{remove_param} is not supported. Used to preserve compatibility with Optimus Pandas"
            )
            kwargs.pop(remove_param)

        try:
            # From the pandas docs, about na_filter:
            # Detect missing value markers (empty strings and the value of na_values). In data without any NAs,
            # passing na_filter=False can improve the performance of reading a large file.
            dfd = vaex.read_csv(path,
                                sep=sep,
                                header=0 if header else None,
                                encoding=encoding,
                                quoting=quoting,
                                lineterminator=lineterminator,
                                error_bad_lines=error_bad_lines,
                                keep_default_na=True,
                                na_values=None,
                                engine=engine,
                                na_filter=na_filter,
                                storage_options=storage_options,
                                *args,
                                **kwargs)

            if n_rows > -1:
                dfd = vaex.from_pandas(dfd.head(n=n_rows),
                                       npartitions=1).reset_index(drop=True)

            df = VaexDataFrame(dfd)
            df.meta = Meta.set(df.meta,
                               value={
                                   "file_name": path,
                                   "name": ntpath.basename(path)
                               })
        except IOError as error:
            logger.print(error)
            raise

        return df
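A hedged usage sketch for the Vaex-backed reader above; it assumes the method is exposed through the engine's load accessor (see Example #10), and the path, separator and engine string are illustrative.

# Hypothetical usage of the csv() reader above via the Vaex engine.
op = optimus(engine="vaex")        # assumes Engine.VAEX.value == "vaex"
df = op.load.csv("data/example.csv",
                 sep=";",
                 header=True,
                 n_rows=1_000,     # keep only the first 1000 rows
                 na_filter=True)
print(df.cols.names())             # unified .cols accessor, as used elsewhere in this codebase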
Example #15
    def columns_agg(self,
                    df,
                    columns,
                    buckets=10,
                    relative_error=RELATIVE_ERROR,
                    approx_count=True,
                    advanced_stats=True):
        columns = parse_columns(df, columns)
        n = BATCH_SIZE
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]
        # We have problems sending 100+ columns at the same time, so process them in batches

        result = {}

        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Stats {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            # Count uniques is necessary to calculate the histogram buckets
            funcs = [count_uniques_agg]
            exprs = df.cols.create_exprs(cols, funcs, approx_count)

            funcs = [F.min, F.max]
            exprs.extend(df.cols.create_exprs(cols, funcs))

            funcs = [count_na_agg]
            exprs.extend(df.cols.create_exprs(cols, funcs, df))

            if advanced_stats is True:
                funcs = [
                    F.stddev, F.kurtosis, F.mean, F.skewness, F.sum,
                    F.variance, zeros_agg
                ]
                exprs.extend(df.cols.create_exprs(cols, funcs))

                # TODO: None in basic calculation
                funcs = [percentile_agg]
                exprs.extend(
                    df.cols.create_exprs(cols, funcs, df,
                                         [0.05, 0.25, 0.5, 0.75, 0.95],
                                         relative_error))

            result.update(df.cols.exec_agg(exprs))

        n = BATCH_SIZE
        result_hist = {}
        list_columns = [
            columns[i * n:(i + 1) * n]
            for i in range((len(columns) + n - 1) // n)
        ]

        for i, cols in enumerate(list_columns):
            logger.print(
                "Batch Histogram {BATCH_NUMBER}. Processing columns{COLUMNS}".
                format(BATCH_NUMBER=i, COLUMNS=cols))

            funcs = [hist_agg]

            for col_name in cols:
                # Only process histograms for numeric columns. For other data types use frequency
                if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                    min_max = {
                        "min": result[col_name]["min"],
                        "max": result[col_name]["max"]
                    }
                    buckets = result[col_name]["count_uniques"] - 1
                    if buckets > MAX_BUCKETS:
                        buckets = MAX_BUCKETS
                    elif buckets == 0:
                        buckets = 1
                    exprs.extend(
                        df.cols.create_exprs(col_name, funcs, df, buckets,
                                             min_max))
            agg_result = df.cols.exec_agg(exprs)
            if agg_result is not None:
                result_hist.update(agg_result)

        # Merge results
        for col_name in result:
            if col_name in result_hist:
                result[col_name].update(result_hist[col_name])

        def extra_columns_stats(df, col_name, stats):
            """
            Specific Stats for numeric columns
            :param df:
            :param col_name:
            :param stats:
            :return:
            """

            col_info = {}

            max_value = stats[col_name]["max"]
            min_value = stats[col_name]["min"]

            if is_column_a(df, col_name, PYSPARK_NUMERIC_TYPES):
                stddev = stats[col_name]['stddev']
                mean = stats[col_name]['mean']

                quantile = stats[col_name]["percentile"]
                if max_value is not None and min_value is not None:
                    col_info['range'] = max_value - min_value
                else:
                    col_info['range'] = None

                col_info['median'] = quantile["0.5"]

                q1 = quantile["0.25"]
                q3 = quantile["0.75"]

                if q1 is not None and q3 is not None:
                    col_info['interquartile_range'] = q3 - q1
                else:
                    col_info['interquartile_range'] = None

                if mean != 0 and mean is not None:
                    col_info['coef_variation'] = round((stddev / mean), 5)
                else:
                    col_info['coef_variation'] = None

                mad = df.cols.mad(col_name)
                if mad is not None:
                    col_info['mad'] = round(df.cols.mad(col_name), 5)
                else:
                    col_info['mad'] = None

            if self.rows_count is None:
                self.rows_count = df.count()

            col_info['p_count_na'] = round(
                (stats[col_name]['count_na'] * 100) / self.rows_count, 2)
            col_info['p_count_uniques'] = round(
                (stats[col_name]['count_uniques'] * 100) / self.rows_count, 2)
            return col_info

        if advanced_stats is True:
            for col_name in columns:
                result.update(extra_columns_stats(df, col_name, result))

        return result
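The derived statistics computed above are simple arithmetic over the aggregation results; a standalone sketch with made-up aggregates:

# Standalone illustration of the derived stats above (values made up).
stats = {"min": 2.0, "max": 20.0, "stddev": 4.0, "mean": 10.0,
         "percentile": {"0.25": 7.0, "0.5": 9.5, "0.75": 13.0}}

value_range = stats["max"] - stats["min"]                                        # 18.0
median = stats["percentile"]["0.5"]                                              # 9.5
interquartile_range = stats["percentile"]["0.75"] - stats["percentile"]["0.25"]  # 6.0
coef_variation = round(stats["stddev"] / stats["mean"], 5)                       # 0.4

print(value_range, median, interquartile_range, coef_variation)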
Example #16
        def _count_data_types(col_name):
            """
            Function to determine whether the record values are float, int or string.
            :param col_name:
            :return:
            """
            logger.print("Processing column '" + col_name + "'...")
            # If string, process the data to try to infer which data type is inside. This is a kind of optimization;
            # we do not need to analyze the data if the column data type is integer, boolean, etc.

            temp = col_name + "_type"
            col_data_type = df.cols.dtypes(col_name)

            count_by_data_type = {}
            count_empty_strings = 0

            if infer is True and col_data_type == "string":

                types = (df
                         .h_repartition(col_name=col_name)
                         .withColumn(temp, fbdt(col_name, get_type=True))
                         .groupBy(temp).count()
                         .to_json())

                for row in types:
                    count_by_data_type[row[temp]] = row["count"]

                count_empty_strings = df.where(F.col(col_name) == '').count()

            else:
                nulls = df.cols.count_na(col_name)
                count_by_data_type[col_data_type] = int(df.count()) - nulls
                count_by_data_type["null"] = nulls

            count_by_data_type = fill_missing_var_types(count_by_data_type)

            # Counts per data type, plus null values and empty strings below
            data_types_count = {"string": count_by_data_type['string'],
                                "bool": count_by_data_type['bool'],
                                "int": count_by_data_type['int'],
                                "float": count_by_data_type['float'],
                                "double": count_by_data_type['double'],
                                "date": count_by_data_type['date'],
                                "array": count_by_data_type['array']
                                }

            null_missed_count = {"null": count_by_data_type['null'],
                                 "missing": count_empty_strings,
                                 }
            # Get the greatest count by column data type
            greatest_data_type_count = max(data_types_count, key=data_types_count.get)

            if greatest_data_type_count == "string":
                cat = "categorical"
            elif greatest_data_type_count == "int" or greatest_data_type_count == "float" or greatest_data_type_count == "double":
                cat = "numeric"
            elif greatest_data_type_count == "date":
                cat = "date"
            elif greatest_data_type_count == "bool":
                cat = "bool"
            elif greatest_data_type_count == "array":
                cat = "array"
            else:
                cat = "null"

            col = {}
            col['dtype'] = greatest_data_type_count
            col['type'] = cat
            col['details'] = {**data_types_count, **null_missed_count}

            return col
Example #17
    def columns_stats(self,
                      df,
                      columns,
                      buckets=10,
                      infer=False,
                      relative_error=RELATIVE_ERROR,
                      approx_count=True):
        """
        Return statistical information about the specified columns in JSON format
        :param df: Dataframe to be processed
        :param columns: Columns that you want to profile
        :param buckets: Number of equal-width buckets used for histograms
        :param infer: Try to infer the column data type
        :param relative_error: Relative error used when percentiles are calculated. 0 is more exact but slower, 1 has more error but is faster
        :param approx_count: Use approx_count_distinct or countDistinct. approx_count_distinct is faster
        :return: JSON object
        """

        columns = parse_columns(df, columns)

        # Initialize Objects
        logger.print("Processing Stats For columns...")

        # Get columns data types. This is necessary to make the pertinent histogram calculations.
        type_details = self._count_data_types(df, columns, infer)

        # Count the categorical, numerical, boolean and date columns
        count_types = {}
        for value in type_details.values():
            name = value["dtype"]
            if name in count_types:
                count_types[name] += 1
            else:
                count_types[name] = 1

        # List the data types this data set has

        total = 0
        dtypes = []
        for key, value in count_types.items():
            if value > 0:
                dtypes.append(key)
                total = total + 1

        count_types = fill_missing_col_types(count_types)

        columns_info = {}
        columns_info["count_types"] = count_types
        columns_info["total_count_dtypes"] = total
        columns_info["dtypes_list"] = dtypes
        columns_info["columns"] = type_details

        # Aggregation
        stats = Profiler.columns_agg(df, columns, buckets, relative_error,
                                     approx_count)

        # Calculate Frequency
        logger.print("Processing Frequency ...")
        df_freq = df.cols.select("*",
                                 data_type=PYSPARK_NUMERIC_TYPES,
                                 invert=True)
        freq = None
        if df_freq is not None:
            freq = df_freq.cols.frequency("*", buckets, True, self.rows_count)

        # Calculate percentage
        for col_name in columns:
            col_info = {}
            assign(col_info, "stats", stats[col_name], dict)

            if freq is not None:
                if col_name in freq:
                    assign(col_info, "frequency", freq[col_name])

            col_info["stats"].update(
                self.extra_columns_stats(df, col_name, stats))

            assign(col_info, "name", col_name)
            assign(col_info, "column_dtype",
                   columns_info["columns"][col_name]['dtype'])
            assign(col_info, "dtypes_stats",
                   columns_info["columns"][col_name]['stats'])
            assign(col_info, "column_type",
                   columns_info["columns"][col_name]['type'])
            assign(columns_info, "columns." + col_name, col_info, dict)

        return columns_info
Example #18
 def timed(*args, **kw):
     start_time = timeit.default_timer()
     f = method(*args, **kw)
     _time = round(timeit.default_timer() - start_time, 2)
     logger.print("{name}() executed in {time} sec".format(name=method.__name__, time=_time))
     return f
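The snippet above is only the inner wrapper of a timing decorator (it references an undefined method). A plausible self-contained completion, assuming it wraps arbitrary callables; the decorator name is hypothetical and print() stands in for the project's logger:

# Minimal sketch of the full decorator this wrapper likely belongs to.
import timeit
from functools import wraps

def timeit_decorator(method):          # hypothetical name for the outer decorator
    @wraps(method)
    def timed(*args, **kw):
        start_time = timeit.default_timer()
        f = method(*args, **kw)
        _time = round(timeit.default_timer() - start_time, 2)
        print("{name}() executed in {time} sec".format(name=method.__name__, time=_time))
        return f
    return timed

@timeit_decorator
def slow_sum(n):
    return sum(range(n))

slow_sum(10_000_000)   # prints something like "slow_sum() executed in 0.2 sec"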
Example #19
    def run(self, df, collection_name=None, func_request=None, func_response=None, return_type="json", calls=60,
            period=60, max_tries=8):
        """
        Read the url key from a mongo collection and make a request to a service
        :param df: Dataframe to be loaded into the enricher collection.
        :param collection_name: Custom collection to save the data.
        :param func_request: helper to create a custom request
        :param func_response: helper to create a custom response
        :param return_type:
        :param calls: how many calls can be made
        :param period: the period of time in which the calls can be made
        :param max_tries: how many retries should be done
        :return:
        """

        # Load the dataframe data in the enricher
        if is_(df, DataFrame):
            df = df.create_id(COL_ID)

        # Load the dataframe data in the enricher
        self.send(df)

        if collection_name is None:
            collection_name = self.collection_name
        collection = self.get_collection(collection_name)

        # Get data that is not yet enriched
        cursor = collection.find({COL_RESULTS: {"$exists": False}})

        total_docs = cursor.count(True)

        if func_request is None:
            func_request = requests.get
        collection = self.get_collection(collection_name)

        @on_exception(expo, RateLimitException, max_tries=max_tries)
        @limits(calls=calls, period=period)
        def _func_request(v):
            return func_request(v)

        if total_docs > 0:
            for c in tqdm_notebook(cursor, total=total_docs, desc='Processing...'):

                # Send request to the API
                response = _func_request(c)

                mongo_id = c["_id"]

                if response.status_code == 200:
                    if return_type == "json":
                        response = json.loads(response.text)
                    elif return_type == "text":
                        response = response.text

                    # Process the result with an external function
                    if is_function(func_response):
                        response = func_response(response)

                    # Update the mongo id with the result
                    collection.find_and_modify(query={"_id": mongo_id},
                                               update={"$set": {COL_RESULTS: response}},
                                               upsert=False, full_response=True)
                else:
                    # The response key will remain blank so we can filter it and retry in future requests
                    logger.print(response.status_code)

            # Append the data in enrichment to the dataframe

            logger.print("Appending collection info into the dataframe")
            # TODO: An elegant way to handle pickling?
            # take care with pickling
            host = self.host
            port = self.port
            db_name = self.db_name

            @pandas_udf('string', PandasUDFType.SCALAR)
            def func(value):
                # More about pickling
                from pymongo import MongoClient
                _client = MongoClient(host, port)
                _db = _client[db_name]
                _collection = _db[collection_name]

                def func_serie(serie):
                    _cursor = _collection.find_one({COL_ID: serie}, projection={"_id": 0, COL_RESULTS: 1})
                    return _cursor[COL_RESULTS]

                return value.apply(func_serie)

            df = df.withColumn(COL_RESULTS, func(df[COL_ID])).cols.drop(COL_ID).run()

            # If the process is finished, flush the Mongo collection
            self.flush()
            return df
        else:
            print("No records available to process")
Example #20
    def __init__(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or spark://master:7077 to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        self.master = master
        self.app_name = app_name

        logger.print(JUST_CHECKING)
        logger.print("-----")
        check_env_vars([
            "SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON",
            "PYSPARK_DRIVER_PYTHON", "PYSPARK_SUBMIT_ARGS", "JAVA_HOME"
        ])

        if is_pyarrow_installed() is True:
            logger.print("Pyarrow Installed")
        else:
            logger.print(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
            )
        logger.print("-----")
        logger.print(STARTING_SPARK)

        # print(os.environ['PYSPARK_SUBMIT_ARGS'])

        # Build the spark session
        self._spark = SparkSession.builder \
            .appName(app_name) \
            .master(master) \
            .config("spark.executor.heartbeatInterval", "110") \
            .config("spark.jars.packages", "ml.combust.mleap:mleap-spark_2.11:0.13.0") \
            .getOrCreate()

        # .option("driver", "org.postgresql.Driver")
        self._sc = self._spark.sparkContext
        logger.print("Spark Version:" + self._sc.version)