def _stop(self, spark_session=None):
    self.stopped = True
    try:
        sc_from_session = spark_session.sparkContext if spark_session else None
        sc_from_engine = self.context.sparkContext if self.context else None
        sc_from_module = pyspark.SparkContext._active_spark_context or None

        scs = [sc_from_session, sc_from_engine, sc_from_module]

        if self.context:
            self.context.stop()

        if spark_session:
            spark_session.stop()

        cls = pyspark.SparkContext

        # stop every context we could find, then clear the class-level handles
        for sc in scs:
            if sc:
                try:
                    sc.stop()
                    sc._gateway.shutdown()
                except Exception:
                    pass

        cls._active_spark_context = None
        cls._gateway = None
        cls._jvm = None
    except Exception as e:
        print(e)
        logging.warning(f'Could not fully stop the {self.engine_type} context')
def load_with_pandas(self, kwargs):
    logging.warning("Fallback dataframe reader")

    # conversion of *some* pyspark arguments to pandas
    kwargs.pop('inferSchema', None)
    kwargs['header'] = 'infer' if kwargs.get('header') else None
    kwargs['prefix'] = '_c'

    return kwargs
def load_csv(self, path=None, provider=None, *args,
             sep=None, header=None, **kwargs):
    obj = None

    md = Resource(path, provider, sep=sep, header=header, **kwargs)

    # download if necessary
    md = get_local(md)

    options = md['options']

    # after collecting from metadata, or method call, define csv defaults
    options['header'] = options.get('header') or True
    options['inferSchema'] = options.get('inferSchema') or True
    options['sep'] = options.get('sep') or ','

    local = self.is_spark_local()

    # start the timer for logging
    ts_start = timer()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).csv(md['url'])
        elif md['service'] == 'file':
            logging.warning(
                'local file + spark cluster: loading using pandas reader',
                extra={'md': to_dict(md)})

            # fallback to the pandas reader, then convert to spark
            df = pd.read_csv(
                md['url'],
                sep=options['sep'],
                header=options['header'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).csv(md['url'])
        else:
            logging.error(
                f'Unknown resource service "{md["service"]}"',
                extra={'md': to_dict(md)})
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    self.load_log(md, options, ts_start)
    return obj
def load_parquet(self, path=None, provider=None, *args,
                 mergeSchema=None, **kwargs):
    obj = None

    md = Resource(path, provider, format='parquet',
                  mergeSchema=mergeSchema, **kwargs)

    # download if necessary
    md = get_local(md)

    options = md['options']

    # after collecting from metadata, or method call, define parquet defaults
    options['mergeSchema'] = options.get('mergeSchema') or True

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).parquet(md['url'])
        elif md['service'] == 'file':
            logging.warning(
                'local file + spark cluster: loading using pandas reader',
                extra={'md': to_dict(md)})

            # fallback to the pandas reader, then convert to spark
            df = pd.read_parquet(md['url'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).parquet(md['url'])
        else:
            logging.error(
                f'Unknown resource service "{md["service"]}"',
                extra={'md': to_dict(md)})
            return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(str(e), extra={'md': md})

    return obj
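# Usage sketch (illustrative, not part of this module): `engine` stands for an
# instance of this class, and the paths/provider names below are hypothetical.
#
#   df = engine.load_csv('data/users.csv', sep=';', header=True)
#   df = engine.load_parquet('data/events.parquet', provider='minio')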
def directory_to_file(self, path):
    if os.path.exists(path) and os.path.isfile(path):
        return

    dirname = os.path.dirname(path)
    basename = os.path.basename(path)

    filename = list(
        filter(lambda x: x.startswith('part-'), os.listdir(path)))
    if len(filename) != 1:
        if len(filename) > 1:
            logging.warning('cannot convert if more than a partition present')
        return
    else:
        filename = filename[0]

    shutil.move(os.path.join(path, filename), dirname)
    if os.path.exists(path) and os.path.isdir(path):
        shutil.rmtree(path)

    shutil.move(
        os.path.join(dirname, filename),
        os.path.join(dirname, basename))
    return
def directory_to_file(path):
    if os.path.exists(path) and os.path.isfile(path):
        return

    dirname = os.path.dirname(path)
    basename = os.path.basename(path)

    filename = list(filter(lambda x: x.startswith('part-'), os.listdir(path)))
    if len(filename) != 1:
        if len(filename) > 1:
            logging.warning(
                'In local mode, save will not flatten the directory to file, '
                'if more than a partition present')
        return
    else:
        filename = filename[0]

    shutil.move(os.path.join(path, filename), dirname)
    if os.path.exists(path) and os.path.isdir(path):
        shutil.rmtree(path)

    shutil.move(
        os.path.join(dirname, filename),
        os.path.join(dirname, basename))
    return
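# Note (illustrative example, file names assumed): Spark writes e.g. `out.csv`
# as a directory holding a single `part-*` file plus metadata such as
# `_SUCCESS`. Calling directory_to_file('out.csv') moves that part file up and
# renames it, so `out.csv` becomes a plain file; with more than one partition
# present the directory is left untouched.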
def _getcols(self, *colnames):
    # warn about requested columns that are not in the dataframe
    for c in set(colnames) - set(self.df.columns):
        logging.warning(f'Column not found: {c}')

    return list(set(colnames) & set(self.df.columns))
def hll_init(self, k=12):
    logging.warning("Consider using hll_init_agg instead: "
                    "ex: .groupby('g').agg(A.hll_init_agg())")
    return self.apply(functions.hll_init(k))
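# Usage sketch (the accessor chain shown is assumed, not verified against this
# codebase; column and group names are hypothetical):
#
#   df.cols.get('user_id').hll_init()          # per-row HLL sketch column
#   df.groupby('g').agg(A.hll_init_agg())      # preferred: one sketch per group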
def set_info(self):
    hadoop_version = None
    hadoop_detect_from = None
    spark_session = None
    try:
        spark_session = pyspark.sql.SparkSession.builder.getOrCreate()
        hadoop_version = spark_session.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion()
        hadoop_detect_from = 'spark'
    except Exception:
        pass

    self._stop(spark_session)

    if hadoop_version is None:
        hadoop_version = get_hadoop_version_from_system()
        hadoop_detect_from = 'system'

    if hadoop_version is None:
        logging.warning('Could not find a valid hadoop install.')

    hadoop_home = get_tool_home('hadoop', 'HADOOP_HOME', 'bin')[0]
    spark_home = get_tool_home('spark-submit', 'SPARK_HOME', 'bin')[0]

    spark_dist_classpath = os.environ.get('SPARK_DIST_CLASSPATH')
    spark_dist_classpath_source = 'env'

    if not spark_dist_classpath:
        spark_dist_classpath_source = os.path.join(spark_home, 'conf/spark-env.sh')
        if os.path.isfile(spark_dist_classpath_source):
            with open(spark_dist_classpath_source) as s:
                for line in s:
                    pattern = 'SPARK_DIST_CLASSPATH='
                    pos = line.find(pattern)
                    if pos >= 0:
                        spark_dist_classpath = line[pos + len(pattern):].strip()
                        spark_dist_classpath = run_command(
                            f'echo {spark_dist_classpath}')[0]

    if hadoop_detect_from == 'system' and (not spark_dist_classpath):
        logging.warning(
            textwrap.dedent("""
                SPARK_DIST_CLASSPATH not defined and spark installed without hadoop
                define SPARK_DIST_CLASSPATH in $SPARK_HOME/conf/spark-env.sh as follows:

                   export SPARK_DIST_CLASSPATH=$(hadoop classpath)

                for more info refer to:
                https://spark.apache.org/docs/latest/hadoop-provided.html
            """))

    self.info['python_version'] = python_version()
    self.info['hadoop_version'] = hadoop_version
    self.info['hadoop_detect'] = hadoop_detect_from
    self.info['hadoop_home'] = hadoop_home
    self.info['spark_home'] = spark_home
    self.info['spark_classpath'] = (spark_dist_classpath.split(':')
                                    if spark_dist_classpath else None)
    self.info['spark_classpath_source'] = spark_dist_classpath_source

    return
def detect_submit_params(self, services=None):
    assert isinstance(services, (type(None), str, list, set))
    services = [services] if isinstance(services, str) else services
    services = services or []

    # if service is a string, make a resource out of it
    resources = [
        s if isinstance(s, dict) else Resource(service=s) for s in services
    ]

    # create a dictionary of services and versions
    services = {}
    for r in resources:
        services[r['service']] = r['version']

    submit_types = [
        'jars', 'packages', 'repositories', 'py-files', 'files', 'conf'
    ]

    submit_objs = dict()
    for submit_type in submit_types:
        submit_objs[submit_type] = {} if submit_type == 'conf' else []

    if not services:
        return submit_objs

    services = dict(sorted(services.items()))

    # get hadoop, and configured metadata services
    hadoop_version = self.info['hadoop_version']

    #### submit: repositories
    repositories = submit_objs['repositories']

    #### submit: jars
    jars = submit_objs['jars']

    #### submit: packages
    packages = submit_objs['packages']

    for s, v in services.items():
        if s == 'mysql':
            packages.append(f'mysql:mysql-connector-java:{v}')
        elif s == 'sqlite':
            packages.append(f'org.xerial:sqlite-jdbc:{v}')
        elif s == 'postgres':
            packages.append(f'org.postgresql:postgresql:{v}')
        elif s == 'oracle':
            vv = v.split('.') if v else [0, 0, 0, 0]
            repositories.append('http://maven.icm.edu.pl/artifactory/repo/')
            repositories.append('https://maven.xwiki.org/externals')
            if vv[0] == '12' and vv[1] == '2':
                packages.append(f'com.oracle.jdbc:ojdbc8:{v}')
            elif vv[0] == '12' and vv[1] == '1':
                packages.append(f'com.oracle.jdbc:ojdbc7:{v}')
            elif vv[0] == '11':
                packages.append(f'com.oracle.jdbc:ojdbc6:{v}')
            else:
                logging.warning(
                    f'could not autodetect the oracle '
                    f'ojdbc driver to install for {s}, version {v}')
        elif s == 'mssql':
            packages.append(f'com.microsoft.sqlserver:mssql-jdbc:{v}')
        elif s == 'mongodb':
            packages.append(
                f'org.mongodb.spark:mongo-spark-connector_2.11:{v}')
        elif s == 'clickhouse':
            packages.append(f'ru.yandex.clickhouse:clickhouse-jdbc:{v}')
        elif s == 's3a':
            if hadoop_version:
                packages.append(
                    f'org.apache.hadoop:hadoop-aws:{hadoop_version}')
            else:
                logging.warning('The Hadoop installation is not detected. '
                                'Could not load hadoop-aws (s3a) package ')

    #### submit: conf
    conf = submit_objs['conf']

    for v in resources:
        if v['service'] == 's3a':
            service_url = 'http://{}:{}'.format(v['host'], v['port'])
            s3a = "org.apache.hadoop.fs.s3a.S3AFileSystem"

            conf["spark.hadoop.fs.s3a.endpoint"] = service_url
            conf["spark.hadoop.fs.s3a.access.key"] = v['user']
            conf["spark.hadoop.fs.s3a.secret.key"] = v['password']
            conf["spark.hadoop.fs.s3a.impl"] = s3a
            conf["spark.hadoop.fs.s3a.path.style.access"] = "true"
            break

    return submit_objs
def detect_submit_params(self, services=None):
    assert isinstance(services, (type(None), str, list, set))
    services = [services] if isinstance(services, str) else services
    services = services or []

    # if service is a string, make a resource out of it
    resources = [
        s if isinstance(s, dict) else Resource(service=s) for s in services
    ]

    # create a dictionary of services and versions
    services = {}
    for r in resources:
        services[r['service']] = r['version']

    submit_types = [
        'jars', 'packages', 'repositories', 'py-files', 'files', 'conf'
    ]

    submit_objs = dict()
    for submit_type in submit_types:
        submit_objs[submit_type] = []

    if not services:
        return submit_objs

    services = dict(sorted(services.items()))

    # get hadoop, and configured metadata services
    hadoop_version = self.info['hadoop_version']

    # submit: repositories
    repositories = submit_objs['repositories']

    # submit: jars
    jars = submit_objs['jars']

    # submit: packages
    packages = submit_objs['packages']

    for s, v in services.items():
        if s == 'mysql':
            packages.append(f'mysql:mysql-connector-java:{v}')
        elif s == 'sqlite':
            packages.append(f'org.xerial:sqlite-jdbc:{v}')
        elif s == 'postgres':
            packages.append(f'org.postgresql:postgresql:{v}')
        elif s == 'oracle':
            packages.append(f'com.oracle.ojdbc:ojdbc8:{v}')
        elif s == 'mssql':
            packages.append(f'com.microsoft.sqlserver:mssql-jdbc:{v}')
        elif s == 'clickhouse':
            packages.append(f'ru.yandex.clickhouse:clickhouse-jdbc:{v}')
        elif s == 's3a':
            if hadoop_version:
                # packages.append(f"org.apache.hadoop:hadoop-aws:{hadoop_version}")
                pass
            else:
                logging.warning('The Hadoop installation is not detected. '
                                'Could not load hadoop-aws (s3a) package ')
        elif s == 'file':
            pass
        elif s == 'hdfs':
            pass
        else:
            logging.warning(
                f'could not autodetect driver to install for {s}, version {v}')

    # submit: conf
    conf = submit_objs['conf']

    for v in resources:
        if v['service'] == 's3a':
            service_url = 'http://{}:{}'.format(v['host'], v['port'])
            s3a = "org.apache.hadoop.fs.s3a.S3AFileSystem"

            conf.append(("spark.hadoop.fs.s3a.endpoint", service_url))
            conf.append(("spark.hadoop.fs.s3a.access.key", v['user']))
            conf.append(("spark.hadoop.fs.s3a.secret.key", v['password']))
            conf.append(("spark.hadoop.fs.s3a.impl", s3a))
            break

    return submit_objs
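# Usage sketch (assumed wiring, not part of this module): the returned dict can
# be folded into PYSPARK_SUBMIT_ARGS before the Spark JVM gateway starts.
# `engine` is a hypothetical instance of this class; the `conf` entries are
# (key, value) tuples as built above.
#
#   submit = engine.detect_submit_params(['postgres', 's3a'])
#   args = []
#   if submit['packages']:
#       args += ['--packages', ','.join(submit['packages'])]
#   if submit['repositories']:
#       args += ['--repositories', ','.join(submit['repositories'])]
#   for key, value in submit['conf']:
#       args += ['--conf', f'{key}={value}']
#   os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(args + ['pyspark-shell'])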