Example #1
    def _stop(self, spark_session=None):
        self.stopped = True
        try:
            sc_from_session = spark_session.sparkContext if spark_session else None
            sc_from_engine = self.context.sparkContext if self.context else None
            sc_from_module = pyspark.SparkContext._active_spark_context or None

            scs = [sc_from_session, sc_from_engine, sc_from_module]

            if self.context:
                self.context.stop()

            if spark_session:
                spark_session.stop()

            cls = pyspark.SparkContext

            for sc in scs:
                if sc:
                    try:
                        sc.stop()
                        sc._gateway.shutdown()
                    except Exception:
                        # best-effort: ignore failures while stopping each context
                        pass

            cls._active_spark_context = None
            cls._gateway = None
            cls._jvm = None
        except Exception as e:
            print(e)
            logging.warning(
                f'Could not fully stop the {self.engine_type} context')
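A minimal teardown sketch in plain PySpark (no engine wrapper; the local master and variable names are assumptions) showing what the method above does: stop the session and context, shut down the py4j gateway, and clear the cached class attributes so a fresh context can be created.

    # Sketch only: plain-PySpark equivalent of the teardown above.
    import pyspark
    from pyspark.sql import SparkSession

    session = SparkSession.builder.master('local[1]').getOrCreate()
    sc = session.sparkContext

    session.stop()
    sc._gateway.shutdown()                          # also close the py4j gateway
    pyspark.SparkContext._active_spark_context = None
    pyspark.SparkContext._gateway = None
    pyspark.SparkContext._jvm = None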
Example #2
    def list(self, provider):
        if isinstance(provider, YamlDict):
            md = provider.to_dict()
        elif isinstance(provider, str):
            md = get_metadata(self._rootdir, self._metadata, None, provider)
        elif isinstance(provider, dict):
            md = provider
        else:
            logging.warning(f'{str(provider)} cannot be used to reference a provider')
            return []

        try:
            if md['service'] in ['local', 'file']:
                d = []
                for f in os.listdir(md['provider_path']):
                    d.append(os.path.join(md['provider_path'], f))
                return d
            elif md['service'] == 'hdfs':
                sc = self._ctx._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

                obj = fs.listStatus(Path(md['url']))
                tables = [obj[i].getPath().getName() for i in range(len(obj))]
                return tables

            elif md['format'] == 'jdbc':
                if md['service'] == 'mssql':
                    query = "(SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE') as query"
                elif md['service'] == 'oracle':
                    query = "(SELECT table_name FROM all_tables WHERE owner='schema_name') as query"
                elif md['service'] == 'mysql':
                    query = f"(SELECT table_name FROM information_schema.tables where table_schema='{md['database']}') as query"
                elif md['service'] == 'pgsql':
                    query = "(SELECT table_name FROM information_schema.tables) as query"
                else:
                    # vanilla query ... for other databases
                    query = "(SELECT table_name FROM information_schema.tables) as query"

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                return [x.TABLE_NAME for x in obj.select('TABLE_NAME').collect()]
            else:
                logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
                return []
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e
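A usage sketch of the provider shapes accepted above (the dict keys mirror the lookups in the method; the `engine` object and the connection values are placeholders):

    # Sketch: calling list() with inline provider dicts (values are placeholders).
    local_files = engine.list({'service': 'file',
                               'provider_path': '/tmp/data'})

    pg_tables = engine.list({'service': 'pgsql',
                             'format': 'jdbc',
                             'url': 'jdbc:postgresql://localhost:5432/mydb',
                             'database': 'mydb',
                             'driver': 'org.postgresql.Driver',
                             'username': 'user',
                             'password': 'secret'})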
Example #3
    def get_detected_submit_lists(self, detect=True):
        
        submit_types = ['jars', 'packages', 'py-files']
        
        submit_objs = dict()
        for submit_type in submit_types:
            submit_objs[submit_type] = []

        if not detect:
            return submit_objs

        # get hadoop, and configured metadata services
        hadoop_version = self._info['hadoop_version']

        providers = self._metadata['providers']
        services = {v['service'] for v in providers.values()}
        services = sorted(list(services))

        #### submit: jars
        jars = submit_objs['jars']
            
        if 'oracle' in services:
            jar = 'http://www.datanucleus.org/downloads/maven2/'
            jar += 'oracle/ojdbc6/11.2.0.3/ojdbc6-11.2.0.3.jar'
            jars.append(jar)
        
        #### submit: packages
        packages = submit_objs['packages']
        
        for v in services:
            if v == 'mysql':
                packages.append('mysql:mysql-connector-java:8.0.12')
            elif v == 'sqlite':
                packages.append('org.xerial:sqlite-jdbc:3.25.2')
            elif v == 'postgres':
                packages.append('org.postgresql:postgresql:42.2.5')
            elif v == 'mssql':
                packages.append('com.microsoft.sqlserver:mssql-jdbc:6.4.0.jre8')
            elif v == 'minio':
                if hadoop_version:
                    packages.append(f"org.apache.hadoop:hadoop-aws:{hadoop_version}")
                else:
                    logging.warning('Hadoop is not detected. '
                                    'Could not load hadoop-aws package ')
        
        #### submit: py-files
        # nothing is appended here; the py-files list stays empty
        pyfiles = submit_objs['py-files']
        
        #### print debug
        
        for submit_type in submit_types:
            if submit_objs[submit_type]:
                print(f'Loading detected {submit_type}:')
                for i in submit_objs[submit_type]:
                    print(f'  -  {i}')

        return submit_objs
Example #4
    def set_info(self):
        hadoop_version = None
        hadoop_detect_from = None
        try:
            session = pyspark.sql.SparkSession.builder.getOrCreate()
            hadoop_version = session.sparkContext._gateway.jvm.org.apache.hadoop.util.VersionInfo.getVersion()
            hadoop_detect_from = 'spark'
            self.stop(session)
        except Exception as e:
            # hadoop version could not be detected through a spark session
            print(e)
        
        if hadoop_version is None:
            hadoop_version = get_hadoop_version_from_system()
            hadoop_detect_from = 'system'
        
        if hadoop_version is None:
            logging.warning('Could not find a valid hadoop install.')

        hadoop_home = get_tool_home('hadoop', 'HADOOP_HOME', 'bin')[0]
        spark_home = get_tool_home('spark-submit', 'SPARK_HOME', 'bin')[0]
        
        spark_dist_classpath = os.environ.get('SPARK_DIST_CLASSPATH')
        spark_dist_classpath_source = 'env'
        
        if not spark_dist_classpath:
            spark_dist_classpath_source = os.path.join(spark_home, 'conf/spark-env.sh')
            if os.path.isfile(spark_dist_classpath_source):
                with open(spark_dist_classpath_source) as s:
                    for line in s:
                        pattern = 'SPARK_DIST_CLASSPATH='
                        pos = line.find(pattern)
                        if pos >= 0:
                            spark_dist_classpath = line[pos + len(pattern):].strip()
                            spark_dist_classpath = run_command(f'echo {spark_dist_classpath}')[0]

        if hadoop_detect_from == 'system' and (not spark_dist_classpath):
            logging.warning(textwrap.dedent("""
                    SPARK_DIST_CLASSPATH is not defined and Spark was installed without Hadoop.
                    Define SPARK_DIST_CLASSPATH in $SPARK_HOME/conf/spark-env.sh as follows:

                        export SPARK_DIST_CLASSPATH=$(hadoop classpath)

                    For more info refer to:
                    https://spark.apache.org/docs/latest/hadoop-provided.html
                """))
        
        self._info['python_version'] = python_version()
        self._info['hadoop_version'] = hadoop_version
        self._info['hadoop_detect'] = hadoop_detect_from
        self._info['hadoop_home'] = hadoop_home
        self._info['spark_home'] = spark_home
        self._info['spark_classpath'] = spark_dist_classpath.split(':') if spark_dist_classpath else None
        self._info['spark_classpath_source'] = spark_dist_classpath_source

        return
Example #5
 def load_with_pandas(self, kargs):
     logging.warning("Fallback dataframe reader")
     
     #conversion of *some* pyspark arguments to pandas
     kargs.pop('inferSchema', None)
     
     kargs['header'] = 'infer' if kargs.get('header') else None
     kargs['prefix'] = '_c'
     
     return kargs
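A short usage sketch, assuming the translated kwargs are passed straight to pd.read_csv (the `engine` and `md` names are placeholders; the `prefix` option is only honoured by pandas releases older than 2.0):

    # Sketch: translating pyspark reader options, then reading with pandas.
    import pandas as pd

    kargs = engine.load_with_pandas({'inferSchema': True})
    # kargs is now {'header': None, 'prefix': '_c'} since no header was requested
    df = pd.read_csv(md['url'], **kargs)            # 'prefix' requires pandas < 2.0
    sdf = engine.context.createDataFrame(df)        # back to a spark dataframe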
Example #6
    def save_with_pandas(self, md, kargs):
        if not self.is_spark_local():
            logging.warning("Fallback dataframe writer")
        
        if os.path.exists(md['url']) and os.path.isdir(md['url']):
            shutil.rmtree(md['url'])
                               
        #conversion of *some* pyspark arguments to pandas
        kargs.pop('mode', None)
        kargs['index'] = False
        
        kargs['header'] = False if kargs.get('header') is None else kargs.get('header')

        return kargs
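The matching write-side sketch, assuming the cleaned kwargs are handed to pandas DataFrame.to_csv after collecting the Spark dataframe (`engine`, `md` and `sdf` are placeholders):

    # Sketch: translating pyspark writer options, then writing with pandas.
    kargs = engine.save_with_pandas(md, {'mode': 'overwrite', 'header': True})
    # kargs is now {'header': True, 'index': False}; 'mode' has no pandas equivalent
    sdf.toPandas().to_csv(md['url'], **kargs)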
Example #7
    def load_csv(self,
                 path=None,
                 provider=None,
                 *args,
                 sep=None,
                 header=None,
                 **kwargs):

        obj = None

        md = Resource(path, provider, sep=sep, header=header, **kwargs)

        options = md['options']

        # after collecting from metadata or the method call, define csv defaults
        options['header'] = True if options.get('header') is None else options['header']
        options['inferSchema'] = True if options.get('inferSchema') is None else options['inferSchema']
        options['sep'] = options.get('sep') or ','

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).csv(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})

                # pandas expects header=0/None rather than a boolean
                df = pd.read_csv(md['url'],
                                 sep=options['sep'],
                                 header=0 if options['header'] else None)
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).csv(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        return obj
Example #8
    def set_conf_kv(self, conf):
        # appname
        if self._metadata['engine']['jobname']:
            logging.warning('deprecated: metadata engine/jobname is generated')
        conf.setAppName(self._name)

        # set master
        master_url = self._metadata['engine']['master']
        conf.setMaster(master_url)

        # set kv conf from metadata
        conf_md = self._metadata['engine']['config']

        for k, v in conf_md.items():
            # only scalar values are forwarded to the spark configuration
            if isinstance(v, (bool, int, float, str)):
                conf.set(k, v)
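A minimal sketch of the metadata shape this method reads and one way the populated SparkConf might be used afterwards (the `engine` object, its `_metadata`/`_name` attributes, and the session-building step are assumptions):

    # Sketch: metadata shape expected by set_conf_kv, and one way to use the conf.
    from pyspark import SparkConf
    from pyspark.sql import SparkSession

    engine._name = 'my-job'
    engine._metadata = {'engine': {
        'jobname': None,                 # setting a value only triggers the warning
        'master': 'local[*]',
        'config': {
            'spark.sql.shuffle.partitions': 8,
            'spark.ui.enabled': False,
        },
    }}

    conf = SparkConf()
    engine.set_conf_kv(conf)
    spark = SparkSession.builder.config(conf=conf).getOrCreate()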
Example #9
    def load_parquet(self,
                     path=None,
                     provider=None,
                     *args,
                     mergeSchema=None,
                     **kwargs):

        obj = None

        md = Resource(path,
                      provider,
                      format='parquet',
                      mergeSchema=mergeSchema,
                      **kwargs)

        options = md['options']

        # after collecting from metadata or the method call, define parquet defaults
        options['mergeSchema'] = True if options.get('mergeSchema') is None else options['mergeSchema']

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).parquet(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})
                #fallback to the pandas reader, then convert to spark
                df = pd.read_parquet(md['url'])
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        return obj
Example #10
 def directory_to_file(self, path, ext):
     if os.path.exists(path) and os.path.isfile(path):
         return
     
     dirname = os.path.dirname(path)
     basename = os.path.basename(path)

     filename = [f for f in os.listdir(path) if f.endswith(ext)]
     if len(filename) != 1:
         logging.warning('cannot convert: expected exactly one partition file')
         return
     else:
         filename = filename[0]
                            
     shutil.move(os.path.join(path, filename), dirname)
     if os.path.exists(path) and os.path.isdir(path):
         shutil.rmtree(path)

     shutil.move(os.path.join(dirname, filename), os.path.join(dirname, basename))
     return
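A usage sketch, assuming the Spark output was written with a single partition (the paths and the `engine` object are placeholders): the helper collapses Spark's part-file directory into a plain file with the original name.

    # Sketch: collapse spark's part-file output directory into a single flat file.
    sdf.coalesce(1).write.mode('overwrite').csv('/tmp/report.csv', header=True)
    engine.directory_to_file('/tmp/report.csv', '.csv')
    # /tmp/report.csv is now a regular csv file instead of a directory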
Example #11
 def stop(self, spark_session=None):
     try:
         spark_session = spark_session or self._ctx
         sc = None
         if spark_session:
             sc = spark_session.sparkContext
             spark_session.stop()
             
         cls = pyspark.SparkContext
         sc = sc or cls._active_spark_context
         
         if sc:
             sc.stop()
             sc._gateway.shutdown()
             
         cls._active_spark_context = None
         cls._gateway = None
         cls._jvm = None
     except Exception as e:
         print(e)
         logging.warning('Could not fully stop the engine context')
Example #12
    def list(self, provider, path=''):

        df_schema = T.StructType([
                T.StructField('name', T.StringType(), True),
                T.StructField('type', T.StringType(), True)])

        df_empty = self._ctx.createDataFrame(data=(), schema=df_schema)
                      
        if isinstance(provider, YamlDict):
            md = provider.to_dict()
        elif isinstance(provider, str):
            md = resource.metadata(self._rootdir, self._metadata, None, provider)
        elif isinstance(provider, dict):
            md = provider
        else:
            logging.warning(f'{str(provider)} cannot be used to reference a provider')
            return df_empty

        try:
            if md['service'] in ['local', 'file']:
                lst = []
                rootpath = os.path.join(md['provider_path'], path)
                for f in os.listdir(rootpath):
                    fullpath = os.path.join(rootpath, f)
                    if os.path.isfile(fullpath):
                        obj_type = 'FILE'
                    elif os.path.isdir(fullpath):
                        obj_type = 'DIRECTORY'
                    elif os.path.islink(fullpath):
                        obj_type = 'LINK'
                    elif os.path.ismount(fullpath):
                        obj_type = 'MOUNT'
                    else:
                        obj_type = 'UNDEFINED'
                
                    obj_name = f
                    lst += [(obj_name, obj_type)]
                return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
            elif md['service'] in ['hdfs', 'minio']:
                sc = self._ctx._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

                provider_path = md['provider_path'] if md['service'] == 'hdfs' else '/'
                obj = fs.listStatus(Path(os.path.join(provider_path, path)))
                
                lst = []
                
                for i in range(len(obj)):
                    if obj[i].isFile():
                        obj_type = 'FILE'
                    elif obj[i].isDirectory():
                        obj_type = 'DIRECTORY'
                    else:
                        obj_type = 'UNDEFINED'
                
                    obj_name = obj[i].getPath().getName()
                    lst += [(obj_name, obj_type)]
                return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
            elif md['format'] == 'jdbc':
                # remove options from database, if any
                database = md["database"].split('?')[0]
                schema = md['schema']
                if md['service'] == 'mssql':
                    query = f"""
                        ( SELECT table_name, table_type 
                          FROM INFORMATION_SCHEMA.TABLES 
                          WHERE table_schema='{schema}'
                        ) as query
                        """
                elif md['service'] == 'oracle':
                    # all_tables has no table_schema/table_type columns, and oracle
                    # does not accept "as" for table aliases
                    query = f"""
                        ( SELECT table_name, 'TABLE' as table_type
                          FROM all_tables
                          WHERE owner='{schema}'
                        ) query
                        """
                elif md['service'] == 'mysql':
                    query = f"""
                        ( SELECT table_name, table_type 
                          FROM information_schema.tables 
                          WHERE table_schema='{schema}'
                        ) as query
                        """
                elif md['service'] == 'postgres':
                    query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables 
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
                else:
                    # vanilla query ... for other databases
                    query = """
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                            ) as query
                            """

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                lst = [(x.TABLE_NAME, x.TABLE_TYPE) for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect()]
                return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty
            else:
                logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
                return  df_empty
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e
Example #13
    def detect_submit_params(self, services=None):
        assert isinstance(services, (type(None), str, list, set))
        services = [services] if isinstance(services, str) else services
        services = services or []

        # if service is a string, make a resource out of it

        resources = [
            s if isinstance(s, dict) else Resource(service=s) for s in services
        ]
        services = set([r['service'] for r in resources])

        submit_types = [
            'jars', 'packages', 'repositories', 'py-files', 'files', 'conf'
        ]

        submit_objs = dict()
        for submit_type in submit_types:
            submit_objs[submit_type] = []

        if not services:
            return submit_objs

        services = sorted(list(services))

        # get hadoop, and configured metadata services
        hadoop_version = self.info['hadoop_version']

        #### submit: jars
        jars = submit_objs['jars']

        if 'oracle' in services:
            jar = 'http://www.datanucleus.org/downloads/maven2/'
            jar += 'oracle/ojdbc6/11.2.0.3/ojdbc6-11.2.0.3.jar'
            jars.append(jar)

        #### submit: packages
        packages = submit_objs['packages']

        for v in services:
            if v == 'mysql':
                packages.append('mysql:mysql-connector-java:8.0.12')
            elif v == 'sqlite':
                packages.append('org.xerial:sqlite-jdbc:3.25.2')
            elif v == 'postgres':
                packages.append('org.postgresql:postgresql:42.2.5')
            elif v == 'mssql':
                packages.append(
                    'com.microsoft.sqlserver:mssql-jdbc:6.4.0.jre8')
            elif v == 's3a':
                if hadoop_version:
                    packages.append(
                        f"org.apache.hadoop:hadoop-aws:{hadoop_version}")
                else:
                    logging.warning('The Hadoop installation is not detected. '
                                    'Could not load hadoop-aws (s3a) package ')

        #### submit: conf
        conf = submit_objs['conf']

        for v in resources:
            if v['service'] == 's3a':
                service_url = 'http://{}:{}'.format(v['host'], v['port'])
                s3a = "org.apache.hadoop.fs.s3a.S3AFileSystem"

                conf.append(("spark.hadoop.fs.s3a.endpoint", service_url))
                conf.append(("spark.hadoop.fs.s3a.access.key", v['user']))
                conf.append(("spark.hadoop.fs.s3a.secret.key", v['password']))
                conf.append(("spark.hadoop.fs.s3a.impl", s3a))
                conf.append(("spark.hadoop.fs.s3a.path.style.access", "true"))
                break

        return submit_objs
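A hedged sketch of how the detected lists might be consumed; turning them into PYSPARK_SUBMIT_ARGS is an assumption here, not something shown in the example, and `engine` is a placeholder for the object owning the method.

    # Sketch: one possible way to consume the returned dict (assumption, not the
    # project's own submit logic).
    import os

    submit = engine.detect_submit_params(['postgres', 's3a'])

    args = []
    if submit['packages']:
        args += ['--packages', ','.join(submit['packages'])]
    if submit['jars']:
        args += ['--jars', ','.join(submit['jars'])]
    for key, value in submit['conf']:
        args += ['--conf', f'{key}={value}']

    os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join(args + ['pyspark-shell'])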