Example #1
    def _stop(self, spark_session=None):
        self.stopped = True
        try:
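            # gather every SparkContext handle we can reach: from the given
            # session, from the engine's own context, and from pyspark's
            # module-level singleton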
            sc_from_session = spark_session.sparkContext if spark_session else None
            sc_from_engine = self.context.sparkContext if self.context else None
            sc_from_module = pyspark.SparkContext._active_spark_context or None

            scs = [sc_from_session, sc_from_engine, sc_from_module]

            if self.context:
                self.context.stop()

            if spark_session:
                spark_session.stop()

            cls = pyspark.SparkContext

            for sc in scs:
                if sc:
                    try:
                        sc.stop()
                        sc._gateway.shutdown()
                    except Exception:
                        pass

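            # clear pyspark's class-level singletons so a fresh context
            # can be created later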
            cls._active_spark_context = None
            cls._gateway = None
            cls._jvm = None
        except Exception as e:
            logging.warning(
                f'Could not fully stop the {self.engine_type} context: {e}')
Example #2
    def load_with_pandas(self, kwargs):
        logging.warning("Fallback dataframe reader")

        # conversion of *some* pyspark arguments to pandas
        kwargs.pop('inferSchema', None)

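        # pandas expects header='infer' or None rather than a boolean, and
        # prefix='_c' mirrors spark's default column names (_c0, _c1, ...)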
        kwargs['header'] = 'infer' if kwargs.get('header') else None
        kwargs['prefix'] = '_c'

        return kwargs
Example #3
    def load_csv(self,
                 path=None,
                 provider=None,
                 *args,
                 sep=None,
                 header=None,
                 **kwargs):

        obj = None

        md = Resource(path, provider, sep=sep, header=header, **kwargs)

        # download if necessary
        md = get_local(md)

        options = md['options']

        # after collecting from metadata, or method call, define csv defaults
        # (explicit None checks so header=False or inferSchema=False passed
        #  by the caller are not silently overridden)
        if options.get('header') is None:
            options['header'] = True
        if options.get('inferSchema') is None:
            options['inferSchema'] = True
        options['sep'] = options.get('sep') or ','

        local = self.is_spark_local()

        # start the timer for logging
        ts_start = timer()
        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).csv(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})

                df = pd.read_csv(md['url'],
                                 sep=options['sep'],
                                 header=options['header'])
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).csv(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        self.load_log(md, options, ts_start)
        return obj
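
A minimal usage sketch (hedged): the engine variable, file path, and separator below are illustrative and not taken from the source.

# illustrative only: 'engine' is an instance of the class defining load_csv
df = engine.load_csv('data/users.csv', sep=';', header=True)
if df is not None:
    df.show(5)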
Example #4
    def load_parquet(self,
                     path=None,
                     provider=None,
                     *args,
                     mergeSchema=None,
                     **kwargs):
        obj = None

        md = Resource(path,
                      provider,
                      format='parquet',
                      mergeSchema=mergeSchema,
                      **kwargs)

        # download if necessary
        md = get_local(md)

        options = md['options']

        # after collecting from metadata, or method call, define parquet defaults
        # (explicit None check so mergeSchema=False is not overridden)
        if options.get('mergeSchema') is None:
            options['mergeSchema'] = True

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).parquet(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})
                #fallback to the pandas reader, then convert to spark
                df = pd.read_parquet(md['url'])
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(str(e), extra={'md': md})

        return obj
Example #5
    def directory_to_file(self, path):
        if os.path.exists(path) and os.path.isfile(path):
            return

        dirname = os.path.dirname(path)
        basename = os.path.basename(path)

        filename = list(
            filter(lambda x: x.startswith('part-'), os.listdir(path)))
        if len(filename) != 1:
            if len(filename) > 1:
                logging.warning(
                    'cannot convert if more than one partition is present')
            return
        else:
            filename = filename[0]

        shutil.move(os.path.join(path, filename), dirname)
        if os.path.exists(path) and os.path.isdir(path):
            shutil.rmtree(path)

        shutil.move(os.path.join(dirname, filename),
                    os.path.join(dirname, basename))
        return
Example #6
def directory_to_file(path):
    if os.path.exists(path) and os.path.isfile(path):
        return

    dirname = os.path.dirname(path)
    basename = os.path.basename(path)

    filename = list(filter(lambda x: x.startswith('part-'), os.listdir(path)))
    if len(filename) != 1:
        if len(filename) > 1:
            logging.warning('In local mode, save will not flatten the '
                            'directory to a file if more than one '
                            'partition is present')
        return
    else:
        filename = filename[0]

    shutil.move(os.path.join(path, filename), dirname)
    if os.path.exists(path) and os.path.isdir(path):
        shutil.rmtree(path)

    shutil.move(os.path.join(dirname, filename),
                os.path.join(dirname, basename))
    return
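
For context, a hedged sketch of how this helper pairs with a single-partition Spark write (the df variable and output path are illustrative):

# illustrative only: spark writes a directory '/tmp/out.csv/' holding one
# part-* file; directory_to_file() then flattens it into a plain file
df.coalesce(1).write.mode('overwrite').csv('/tmp/out.csv', header=True)
directory_to_file('/tmp/out.csv')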
Example #7
    def _getcols(self, *colnames):
        for c in set(colnames) - set(self.df.columns):
            logging.warning(f'Column not found: {c}')

        return list(set(colnames) & set(self.df.columns))
Example #8
    def hll_init(self, k=12):
        logging.warning("Consider using hll_init_agg instead: "
                        "ex: .groupby('g').agg(A.hll_init_agg())")
        return self.apply(functions.hll_init(k))
Example #9
    def set_info(self):
        hadoop_version = None
        hadoop_detect_from = None
        spark_session = None
        try:
            spark_session = pyspark.sql.SparkSession.builder.getOrCreate()
            jvm = spark_session.sparkContext._gateway.jvm
            hadoop_version = jvm.org.apache.hadoop.util.VersionInfo.getVersion()
            hadoop_detect_from = 'spark'
        except Exception:
            pass

        self._stop(spark_session)

        if hadoop_version is None:
            hadoop_version = get_hadoop_version_from_system()
            hadoop_detect_from = 'system'

        if hadoop_version is None:
            logging.warning('Could not find a valid hadoop install.')

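        # locate the hadoop and spark installs (via HADOOP_HOME / SPARK_HOME
        # or the corresponding tools found on the system)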
        hadoop_home = get_tool_home('hadoop', 'HADOOP_HOME', 'bin')[0]
        spark_home = get_tool_home('spark-submit', 'SPARK_HOME', 'bin')[0]

        spark_dist_classpath = os.environ.get('SPARK_DIST_CLASSPATH')
        spark_dist_classpath_source = 'env'

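        # fall back to parsing SPARK_DIST_CLASSPATH out of
        # $SPARK_HOME/conf/spark-env.sh when it is not set in the environment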
        if not spark_dist_classpath:
            spark_dist_classpath_source = os.path.join(spark_home,
                                                       'conf/spark-env.sh')
            if os.path.isfile(spark_dist_classpath_source):
                with open(spark_dist_classpath_source) as s:
                    for line in s:
                        pattern = 'SPARK_DIST_CLASSPATH='
                        pos = line.find(pattern)
                        if pos >= 0:
                            spark_dist_classpath = line[pos +
                                                        len(pattern):].strip()
                            spark_dist_classpath = run_command(
                                f'echo {spark_dist_classpath}')[0]

        if hadoop_detect_from == 'system' and (not spark_dist_classpath):
            logging.warning(
                textwrap.dedent("""
                        SPARK_DIST_CLASSPATH not defined and spark installed without hadoop
                        define SPARK_DIST_CLASSPATH in $SPARK_HOME/conf/spark-env.sh as follows:

                           export SPARK_DIST_CLASSPATH=$(hadoop classpath)

                        for more info refer to:
                        https://spark.apache.org/docs/latest/hadoop-provided.html
                    """))

        self.info['python_version'] = python_version()
        self.info['hadoop_version'] = hadoop_version
        self.info['hadoop_detect'] = hadoop_detect_from
        self.info['hadoop_home'] = hadoop_home
        self.info['spark_home'] = spark_home
        self.info['spark_classpath'] = spark_dist_classpath.split(
            ':') if spark_dist_classpath else None
        self.info['spark_classpath_source'] = spark_dist_classpath_source

        return
Example #10
    def detect_submit_params(self, services=None):
        assert (isinstance(services, (type(None), str, list, set)))
        services = [services] if isinstance(services, str) else services
        services = services or []

        # if service is a string, make a resource out of it

        resources = [
            s if isinstance(s, dict) else Resource(service=s) for s in services
        ]

        # create a dictionary of services and versions,
        services = {}
        for r in resources:
            services[r['service']] = r['version']

        submit_types = [
            'jars', 'packages', 'repositories', 'py-files', 'files', 'conf'
        ]

        submit_objs = dict()
        for submit_type in submit_types:
            submit_objs[submit_type] = {} if submit_type == 'conf' else []

        if not services:
            return submit_objs

        services = dict(sorted(services.items()))

        # get hadoop, and configured metadata services
        hadoop_version = self.info['hadoop_version']

        #### submit: repositories
        repositories = submit_objs['repositories']

        #### submit: jars
        jars = submit_objs['jars']

        #### submit: packages
        packages = submit_objs['packages']

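        # map each requested service to the maven coordinates of its
        # jdbc driver or connector package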
        for s, v in services.items():
            if s == 'mysql':
                packages.append(f'mysql:mysql-connector-java:{v}')
            elif s == 'sqlite':
                packages.append(f'org.xerial:sqlite-jdbc:{v}')
            elif s == 'postgres':
                packages.append(f'org.postgresql:postgresql:{v}')
            elif s == 'oracle':
                vv = v.split('.') if v else [0, 0, 0, 0]
                repositories.append(
                    'http://maven.icm.edu.pl/artifactory/repo/')
                repositories.append('https://maven.xwiki.org/externals')
                if vv[0] == '12' and vv[1] == '2':
                    packages.append(f'com.oracle.jdbc:ojdbc8:{v}')
                elif vv[0] == '12' and vv[1] == '1':
                    packages.append(f'com.oracle.jdbc:ojdbc7:{v}')
                elif vv[0] == '11':
                    packages.append(f'com.oracle.jdbc:ojdbc6:{v}')
                else:
                    logging.warning(
                        f'could not autodetect the oracle '
                        f'ojdbc driver to install for {s}, version {v}')
            elif s == 'mssql':
                packages.append(f'com.microsoft.sqlserver:mssql-jdbc:{v}')
            elif s == 'mongodb':
                packages.append(
                    f'org.mongodb.spark:mongo-spark-connector_2.11:{v}')
            elif s == 'clickhouse':
                packages.append(f'ru.yandex.clickhouse:clickhouse-jdbc:{v}')
            elif s == 's3a':
                if hadoop_version:
                    packages.append(
                        f"org.apache.hadoop:hadoop-aws:{hadoop_version}")
                else:
                    logging.warning('The Hadoop installation is not detected. '
                                    'Could not load hadoop-aws (s3a) package ')

        #### submit: conf
        conf = submit_objs['conf']

        for v in resources:
            if v['service'] == 's3a':
                service_url = 'http://{}:{}'.format(v['host'], v['port'])
                s3a = "org.apache.hadoop.fs.s3a.S3AFileSystem"

                conf["spark.hadoop.fs.s3a.endpoint"] = service_url
                conf["spark.hadoop.fs.s3a.access.key"] = v['user']
                conf["spark.hadoop.fs.s3a.secret.key"] = v['password']
                conf["spark.hadoop.fs.s3a.impl"] = s3a
                conf["spark.hadoop.fs.s3a.path.style.access"] = "true"
                break

        return submit_objs
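
As a hedged illustration (not part of the source), the structure returned above could be flattened into spark-submit arguments roughly as follows; the helper name is made up:

def submit_args(submit_objs):
    # join detected packages, repositories and conf entries into
    # spark-submit style command-line arguments
    args = []
    if submit_objs['packages']:
        args += ['--packages', ','.join(submit_objs['packages'])]
    if submit_objs['repositories']:
        args += ['--repositories', ','.join(submit_objs['repositories'])]
    for key, value in submit_objs['conf'].items():
        args += ['--conf', f'{key}={value}']
    return args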
Example #11
    def detect_submit_params(self, services=None):
        assert (isinstance(services, (type(None), str, list, set)))
        services = [services] if isinstance(services, str) else services
        services = services or []

        # if service is a string, make a resource out of it

        resources = [
            s if isinstance(s, dict) else Resource(service=s) for s in services
        ]

        # create a dictionary of services and versions,
        services = {}
        for r in resources:
            services[r['service']] = r['version']

        submit_types = [
            'jars', 'packages', 'repositories', 'py-files', 'files', 'conf'
        ]

        submit_objs = dict()
        for submit_type in submit_types:
            submit_objs[submit_type] = []

        if not services:
            return submit_objs

        services = dict(sorted(services.items()))

        # get hadoop, and configured metadata services
        hadoop_version = self.info['hadoop_version']

        # submit: repositories
        repositories = submit_objs['repositories']

        # submit: jars
        jars = submit_objs['jars']

        # submit: packages
        packages = submit_objs['packages']

        for s, v in services.items():
            if s == 'mysql':
                packages.append(f'mysql:mysql-connector-java:{v}')
            elif s == 'sqlite':
                packages.append(f'org.xerial:sqlite-jdbc:{v}')
            elif s == 'postgres':
                packages.append(f'org.postgresql:postgresql:{v}')
            elif s == 'oracle':
                packages.append(f'com.oracle.ojdbc:ojdbc8:{v}')
            elif s == 'mssql':
                packages.append(f'com.microsoft.sqlserver:mssql-jdbc:{v}')
            elif s == 'clickhouse':
                packages.append(f'ru.yandex.clickhouse:clickhouse-jdbc:{v}')
            elif s == 's3a':
                if hadoop_version:
                    #packages.append(f"org.apache.hadoop:hadoop-aws:{hadoop_version}")
                    pass
                else:
                    logging.warning('The Hadoop installation is not detected. '
                                    'Could not load hadoop-aws (s3a) package ')
            elif s == 'file':
                pass
            elif s == 'hdfs':
                pass
            else:
                logging.warning(
                    f'could not autodetect driver to install for {s}, version {v}'
                )

        # submit: conf
        conf = submit_objs['conf']

        for v in resources:
            if v['service'] == 's3a':
                service_url = 'http://{}:{}'.format(v['host'], v['port'])
                s3a = "org.apache.hadoop.fs.s3a.S3AFileSystem"

                conf.append(("spark.hadoop.fs.s3a.endpoint", service_url))
                conf.append(("spark.hadoop.fs.s3a.access.key", v['user']))
                conf.append(("spark.hadoop.fs.s3a.secret.key", v['password']))
                conf.append(("spark.hadoop.fs.s3a.impl", s3a))

                break

        return submit_objs
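
Note that this variant collects conf as a list of (key, value) tuples, whereas the version in Example #10 stores it as a dict, so callers must handle the two shapes differently.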