def create(self, master="local[*]", app_name="optimus"):
    """Build (or reuse) a Spark session and attach it to this instance.

    Checks the environment first, then constructs the ``SparkSession`` and
    stores both the session and its ``SparkContext`` on the instance.

    :param master: Sets the Spark master URL to connect to, such as 'local'
        to run locally, 'local[4]' to run locally with 4 cores, or
        spark://master:7077 to run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown
        in the Spark web UI.
    :return: this instance, so the call can be chained.
    """
    logger.print(JUST_CHECKING)
    logger.print("-----")

    # Report on any environment variables Spark needs that are not set.
    required_env = [
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "PYSPARK_SUBMIT_ARGS",
        "JAVA_HOME",
    ]
    check_env_vars(required_env)

    # Pandas UDFs need pyarrow; warn (but continue) when it is missing.
    if is_pyarrow_installed() is True:
        logger.print("Pyarrow Installed")
    else:
        logger.print(
            "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
        )

    logger.print("-----")
    logger.print(STARTING_SPARK)

    # Build the spark session and keep a handle on its context.
    builder = SparkSession.builder.appName(app_name).master(master)
    self._spark = builder.getOrCreate()
    self._sc = self._spark.sparkContext

    logger.print("Spark Version:" + self._sc.version)

    return self
def __init__(self, master="local[*]", app_name="optimus"):
    """Check the environment and build a Spark session for this instance.

    :param master: Sets the Spark master URL to connect to, such as 'local'
        to run locally, 'local[4]' to run locally with 4 cores, or
        'spark://master:7077' to run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown
        in the Spark web UI.
    """
    self.master = master
    self.app_name = app_name

    logging.info(JUST_CHECKING)
    logging.info("-----")

    # Report on any environment variables Spark needs that are not set.
    check_env_vars([
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "JAVA_HOME",
    ])

    # Pandas UDFs need pyarrow; warn (but continue) when it is missing.
    if is_pyarrow_installed():
        logging.info("Pyarrow Installed")
    else:
        logging.info(
            "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
        )

    logging.info("-----")
    logging.info(STARTING_SPARK)

    # Build the spark session
    self._spark = (SparkSession.builder.master(self.master).appName(
        self.app_name).getOrCreate())
    # Keep a handle to the underlying SparkContext, matching the other
    # session builders in this file that expose self._sc.
    self._sc = self._spark.sparkContext
def __init__(self, master="local[*]", app_name="optimus"):
    """Check the environment and build a Spark session for this instance.

    :param master: Sets the Spark master URL to connect to, such as 'local'
        to run locally, 'local[4]' to run locally with 4 cores, or
        spark://master:7077 to run on a Spark standalone cluster.
    :param app_name: Sets a name for the application, which will be shown
        in the Spark web UI.
    """
    self.master = master
    self.app_name = app_name

    # Pass the message positionally: the stdlib Logger.info signature is
    # info(msg, *args, **kwargs), which rejects a `message=` keyword, and
    # every other call in this file already uses the positional form.
    logger.info(JUST_CHECKING)
    logger.info("-----")

    # Report on any environment variables Spark needs that are not set.
    check_env_vars([
        "SPARK_HOME",
        "HADOOP_HOME",
        "PYSPARK_PYTHON",
        "PYSPARK_DRIVER_PYTHON",
        "PYSPARK_SUBMIT_ARGS",
        "JAVA_HOME",
    ])

    # Pandas UDFs need pyarrow; warn (but continue) when it is missing.
    if is_pyarrow_installed():
        logger.info("Pyarrow Installed")
    else:
        logger.info(
            "Pyarrow not installed. Pandas UDF not available. Install using 'pip install pyarrow'"
        )

    logger.info("-----")
    logger.info(STARTING_SPARK)

    # Build the spark session with a raised executor heartbeat interval.
    self._spark = SparkSession.builder \
        .master(master) \
        .config("spark.executor.heartbeatInterval", "110") \
        .appName(app_name) \
        .getOrCreate()
    self._sc = self._spark.sparkContext