Example No. 1
    def __init__(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or 'spark://master:7077' to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        self.master = master
        self.app_name = app_name

        logging.info(JUST_CHECKING)
        logging.info("-----")
        check_env_vars([
            "SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON",
            "PYSPARK_DRIVER_PYTHON", "JAVA_HOME"
        ])

        if is_pyarrow_installed():
            logging.info("Pyarrow Installed")
        else:
            logging.info(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
            )
        logging.info("-----")
        logging.info(STARTING_SPARK)

        # Build the spark session
        self._spark = SparkSession.builder \
            .master(self.master) \
            .appName(self.app_name) \
            .getOrCreate()
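
A minimal usage sketch of the constructor above. The class name Optimus and the import path are assumptions (the snippet only shows __init__), and the environment variables it checks are assumed to be set already:

    # Hypothetical usage; 'Optimus' is the assumed name of the enclosing class
    from optimus import Optimus

    # Start a local session using all available cores
    op = Optimus(master="local[*]", app_name="optimus")

    # The wrapper keeps the SparkSession on self._spark
    df = op._spark.createDataFrame([(1, "a"), (2, "b")], ["id", "name"])
    df.show()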
Example No. 2
    def create(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or spark://master:7077 to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        logger.print(JUST_CHECKING)
        logger.print("-----")
        check_env_vars([
            "SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON",
            "PYSPARK_DRIVER_PYTHON", "PYSPARK_SUBMIT_ARGS", "JAVA_HOME"
        ])

        if is_pyarrow_installed():
            logger.print("Pyarrow Installed")
        else:
            logger.print(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
            )
        logger.print("-----")
        logger.print(STARTING_SPARK)

        # Build the spark session
        self._spark = SparkSession.builder \
            .appName(app_name) \
            .master(master) \
            .getOrCreate()

        self._sc = self._spark.sparkContext
        logger.print("Spark Version:" + self._sc.version)

        return self
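
Because create() returns self, the call can be chained onto construction. A sketch, assuming the method lives on a wrapper class named Optimus whose plain constructor does not itself start a session:

    # Hypothetical usage of the create() factory shown above
    op = Optimus().create(master="local[2]", app_name="my_app")

    # _sc is the SparkContext captured by create()
    print(op._sc.defaultParallelism)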
Example No. 3
def abstract_udf(col,
                 func,
                 func_return_type=None,
                 attrs=None,
                 func_type=None,
                 verbose=False):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to created or transformed
    :param func: Function to be applied to the data
    :param attrs: If required attributes to be passed to the function
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function is going to try to use pandas_udf if func_type is not defined
    :param verbose: print additional info
    :return: A function, UDF or Pandas UDF
    """

    # By default, try to use a pandas UDF if pyarrow is available
    if func_type is None and is_pyarrow_installed():
        func_type = "pandas_udf"

    types = ["column_exp", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    if verbose:
        logging.info("Using '{func_type}' to process column '{column}' with function {func_name}"
                     .format(func_type=func_type, column=col, func_name=func.__name__))

    df_func = func_factory(func_type, func_return_type)
    return df_func(attrs, func)(col)
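
A sketch of calling abstract_udf with a plain Python function. The exact signature that func must have is decided by func_factory, which is not shown; here it is assumed to receive the cell value plus the attrs passed through, and the DataFrame df and column names are also assumptions:

    from pyspark.sql.types import StringType

    def prefix_value(value, attr):
        # attr carries whatever was passed through 'attrs' (an assumption)
        return attr[0] + str(value)

    # Build the expression and attach it to a hypothetical DataFrame 'df'
    df = df.withColumn("id_tagged",
                       abstract_udf("id", prefix_value,
                                    func_return_type=StringType(),
                                    attrs=("row-",),
                                    func_type="udf"))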
Example No. 4
    def __init__(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or spark://master:7077 to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        self.master = master
        self.app_name = app_name

        logger.info(message=JUST_CHECKING)
        logger.info("-----")
        check_env_vars(["SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON", "PYSPARK_SUBMIT_ARGS",
                        "JAVA_HOME"])

        if is_pyarrow_installed():
            logger.info("Pyarrow Installed")
        else:
            logger.info(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'")
        logger.info("-----")
        logger.info(STARTING_SPARK)

        # Build the spark session
        self._spark = SparkSession.builder \
            .master(master) \
            .config("spark.executor.heartbeatInterval", "110") \
            .appName(app_name) \
            .getOrCreate()

        # .option("driver", "org.postgresql.Driver")
        self._sc = self._spark.sparkContext
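
This variant additionally sets spark.executor.heartbeatInterval through .config(). Because the builder ends in getOrCreate(), a second construction reuses the already-running session, so such config only takes effect the first time; a sketch, again assuming the enclosing class is named Optimus:

    # Hypothetical usage; 'Optimus' is the assumed name of the enclosing class
    op1 = Optimus(master="local[*]", app_name="optimus")
    op2 = Optimus(master="local[2]", app_name="other")

    # getOrCreate() returned the existing session for op2, so both
    # wrappers share the same SparkSession and executor config
    assert op1._spark is op2._spark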
Example No. 5
def abstract_udf(col, func, func_return_type=None, args=None, func_type=None):
    """
    Abstract User defined functions. This is a helper function to create udf, pandas udf or a Column Exp
    :param col: Column to created or transformed
    :param func: Function to be applied to the data
    :param args: If required attributes to be passed to the function
    :param func_return_type: Required by UDF and Pandas UDF.
    :param func_type: pandas_udf or udf. The function is going to try to use pandas_udf if func_type is not defined
    :return: A function, UDF or Pandas UDF
    """

    if func_return_type is None:
        func_type = "column_expr"
    # By default, try to use a pandas UDF if pyarrow is available
    elif func_type is None and is_pyarrow_installed():
        func_type = "pandas_udf"

    types = ["column_expr", "udf", "pandas_udf"]
    if func_type not in types:
        RaiseIt.value_error(func_type, types)

    # Handle the case where the func param is a plain Column expression
    # rather than a function returning an expression
    def func_col_exp(col_name, attr):
        return func

    if is_column(func):
        _func = func_col_exp
    else:
        _func = func
    logger.print(
        "Using '{func_type}' to process column '{column}' with function {func_name}"
        .format(func_type=func_type, column=col, func_name=_func.__name__))

    df_func = func_factory(func_type, func_return_type)
    if not is_tuple(args):
        args = (args, )

    # print("-----------------df_func(_func, args)(col)", df_func(_func, args)(col))
    return df_func(_func, args)(col)
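
This variant also accepts a ready-made Column expression as func: when no return type is given it falls back to the "column_expr" branch and func_col_exp simply returns the expression. A sketch of that path; the DataFrame df and the column names are assumptions:

    from pyspark.sql import functions as F

    # Passing a plain Column expression: func_return_type is None, so the
    # "column_expr" branch is taken and no UDF is registered
    expr = F.col("price") * 1.19
    df = df.withColumn("price_with_tax", abstract_udf("price", expr))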