Example #1
    def create(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or spark://master:7077 to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        logger.print(JUST_CHECKING)
        logger.print("-----")
        check_env_vars([
            "SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON",
            "PYSPARK_DRIVER_PYTHON", "PYSPARK_SUBMIT_ARGS", "JAVA_HOME"
        ])

        if is_pyarrow_installed():
            logger.print("Pyarrow Installed")
        else:
            logger.print(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
            )
        logger.print("-----")
        logger.print(STARTING_SPARK)

        # Build the spark session
        self._spark = SparkSession.builder \
            .appName(app_name) \
            .master(master) \
            .getOrCreate()

        self._sc = self._spark.sparkContext
        logger.print("Spark Version:" + self._sc.version)

        return self
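
A minimal usage sketch for the create() method above. The host class name (Session) and its zero-argument constructor are assumptions for illustration; since create() returns self, the built SparkSession is reachable through the instance's _spark attribute:

# Usage sketch; "Session" is a hypothetical stand-in for the class hosting create().
session = Session().create(master="local[2]", app_name="optimus-demo")
print(session._spark.version)  # SparkSession built inside create()
session._spark.stop()          # shut down the local Spark context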
Example #2
    def __init__(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or 'spark://master:7077' to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        self.master = master
        self.app_name = app_name

        logging.info(JUST_CHECKING)
        logging.info("-----")
        check_env_vars([
            "SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON",
            "PYSPARK_DRIVER_PYTHON", "JAVA_HOME"
        ])

        if is_pyarrow_installed():
            logging.info("Pyarrow Installed")
        else:
            logging.info(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
            )
        logging.info("-----")
        logging.info(STARTING_SPARK)

        # Build the spark session
        self._spark = (SparkSession.builder
                       .master(self.master)
                       .appName(self.app_name)
                       .getOrCreate())
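
All three examples call a check_env_vars helper whose body is not shown. A minimal sketch of what such a check might look like, assuming it only reports whether each variable is set (the messages and exact behavior of the real helper are assumptions):

import os

def check_env_vars(env_vars):
    # Sketch: report which of the given environment variables are set.
    # The real helper may log elsewhere or validate values; this is an assumption.
    for var in env_vars:
        value = os.environ.get(var)
        if value is None:
            print(var + " is not set")
        else:
            print(var + "=" + value)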
Example #3
    def __init__(self, master="local[*]", app_name="optimus"):
        """

        :param master: Sets the Spark master URL to connect to, such as 'local' to run locally, 'local[4]' to run
        locally with 4 cores, or spark://master:7077 to run on a Spark standalone cluster.
        :param app_name: Sets a name for the application, which will be shown in the Spark web UI
        """

        self.master = master
        self.app_name = app_name

        logger.info(JUST_CHECKING)
        logger.info("-----")
        check_env_vars(["SPARK_HOME", "HADOOP_HOME", "PYSPARK_PYTHON", "PYSPARK_DRIVER_PYTHON", "PYSPARK_SUBMIT_ARGS",
                        "JAVA_HOME"])

        if is_pyarrow_installed():
            logger.info("Pyarrow Installed")
        else:
            logger.info(
                "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'")
        logger.info("-----")
        logger.info(STARTING_SPARK)

        # Build the spark session
        self._spark = SparkSession.builder \
            .master(master) \
            .config("spark.executor.heartbeatInterval", "110") \
            .appName(app_name) \
            .getOrCreate()

        # .option("driver", "org.postgresql.Driver")
        self._sc = self._spark.sparkContext
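
The examples also branch on is_pyarrow_installed(), another helper that is not shown. A minimal sketch using the standard import machinery (the real helper may detect pyarrow differently; this is an assumption):

import importlib.util

def is_pyarrow_installed():
    # Sketch: True if the pyarrow package is importable in this environment.
    return importlib.util.find_spec("pyarrow") is not None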