def set_info(self):
    """Collect Hadoop/Spark environment details into ``self.info``.

    The Hadoop version is first requested from the JVM of a Spark session;
    when that fails, ``get_hadoop_version_from_system()`` is used instead.
    ``SPARK_DIST_CLASSPATH`` is read from the process environment and, when
    absent there, from ``<spark_home>/conf/spark-env.sh``.

    Keys written to ``self.info``: ``python_version``, ``hadoop_version``,
    ``hadoop_detect``, ``hadoop_home``, ``spark_home``, ``spark_classpath``
    (list of ``:``-separated entries, or ``None``) and
    ``spark_classpath_source`` (``'env'`` or the spark-env.sh path).
    """
    hadoop_version = None
    hadoop_detect_from = None

    # Bind the name before the try block: if session creation itself raises,
    # the cleanup below must not fail with UnboundLocalError.
    spark_session = None
    try:
        spark_session = pyspark.sql.SparkSession.builder.getOrCreate()
        jvm = spark_session.sparkContext._gateway.jvm
        hadoop_version = jvm.org.apache.hadoop.util.VersionInfo.getVersion()
        hadoop_detect_from = 'spark'
    except Exception:
        # Best-effort probe: any failure falls through to system detection.
        pass

    # Only stop a session that was actually obtained.
    if spark_session is not None:
        self._stop(spark_session)

    if hadoop_version is None:
        hadoop_version = get_hadoop_version_from_system()
        hadoop_detect_from = 'system'

    if hadoop_version is None:
        logging.warning('Could not find a valid hadoop install.')

    hadoop_home = get_tool_home('hadoop', 'HADOOP_HOME', 'bin')[0]
    spark_home = get_tool_home('spark-submit', 'SPARK_HOME', 'bin')[0]

    spark_dist_classpath = os.environ.get('SPARK_DIST_CLASSPATH')
    spark_dist_classpath_source = 'env'

    if not spark_dist_classpath:
        # NOTE(review): os.path.join raises TypeError if spark_home is None;
        # confirm get_tool_home always yields a path here.
        spark_dist_classpath_source = os.path.join(spark_home, 'conf/spark-env.sh')
        if os.path.isfile(spark_dist_classpath_source):
            pattern = 'SPARK_DIST_CLASSPATH='
            with open(spark_dist_classpath_source) as s:
                # Every matching line is processed, so the last assignment
                # found in the file is the one that is kept.
                for line in s:
                    pos = line.find(pattern)
                    if pos >= 0:
                        spark_dist_classpath = line[pos + len(pattern):].strip()
                        # Presumably run_command goes through a shell so that
                        # values like $(hadoop classpath) get expanded - confirm.
                        # NOTE(review): text from spark-env.sh is interpolated
                        # into a command string; acceptable only while that
                        # file is trusted.
                        spark_dist_classpath = run_command(
                            f'echo {spark_dist_classpath}')[0]

    if hadoop_detect_from == 'system' and (not spark_dist_classpath):
        logging.warning(
            textwrap.dedent(""" SPARK_DIST_CLASSPATH not defined and spark installed without hadoop define SPARK_DIST_CLASSPATH in $SPARK_HOME/conf/spark-env.sh as follows: export SPARK_DIST_CLASSPATH=$(hadoop classpath) for more info refer to: https://spark.apache.org/docs/latest/hadoop-provided.html """))

    self.info['python_version'] = python_version()
    self.info['hadoop_version'] = hadoop_version
    self.info['hadoop_detect'] = hadoop_detect_from
    self.info['hadoop_home'] = hadoop_home
    self.info['spark_home'] = spark_home
    self.info['spark_classpath'] = spark_dist_classpath.split(
        ':') if spark_dist_classpath else None
    self.info['spark_classpath_source'] = spark_dist_classpath_source

    return
def set_info(self):
    """Store the Python and dask version strings in ``self.info``."""
    versions = (
        ('python_version', python_version()),
        ('dask_version', dask.__version__),
    )
    for key, value in versions:
        self.info[key] = value
def set_info(self):
    """Store the Python and pandas version strings in ``self.info``."""
    versions = (
        ('python_version', python_version()),
        ('pandas_version', pd.__version__),
    )
    for key, value in versions:
        self.info[key] = value