Example #1
    def start_context(self, conf):
        try:
            # the dask engine context is simply the dask.dataframe module
            return dask.dataframe
        except Exception as e:
            print(e)
            logging.error('Could not start the engine context')
            return None
Example #2
    def save(self,
             obj,
             path=None,
             provider=None,
             *args,
             format=None,
             mode=None,
             **kwargs):

        md = Resource(path, provider, format=format, mode=mode, **kwargs)

        if md['format'] == 'csv':
            return self.save_csv(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'tsv':
            kwargs['sep'] = '\t'
            return self.save_csv(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'json':
            return self.save_json(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'jsonl':
            return self.save_json(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'parquet':
            return self.save_parquet(obj, path, provider, mode=mode, **kwargs)
        elif md['format'] == 'jdbc':
            return self.save_jdbc(obj, path, provider, mode=mode, **kwargs)
        else:
            logging.error(f'Unknown format "{md["format"]}"',
                          extra={'md': md})
            return False
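
Usage note: the save dispatcher above is an instance method of the engine. A minimal sketch, assuming datafaucet exposes the engine factory as datafaucet.engine (as the message in Example #7 suggests) and that a provider alias named 'local' exists; both names and the file paths are placeholders for illustration, not part of the example:

import datafaucet as dfc

# hypothetical setup: the 'spark' engine type and 'local' provider alias are placeholders
engine = dfc.engine('spark')
df = engine.load('users.csv', 'local')                   # format inferred from the extension
engine.save(df, 'users.parquet', 'local')                # dispatched to save_parquet
engine.save(df, 'users.tsv', 'local', format='tsv', mode='overwrite')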
Example #3
    def load_event_log(self,
                       path=None,
                       provider=None,
                       versionAsOf=None,
                       *args,
                       **kwargs):
        obj = None

        md = Resource(path, provider, format='event_log', **kwargs)

        options = md['options']

        try:
            if md['service'] in ['hdfs', 's3a']:
                version = self.find_version(versionAsOf, path, provider)
                if not version:
                    logging.error('No version of data detected',
                                  extra={'md': md})
                    return obj
                version = version.strftime('%Y-%m-%d-%H-%M-%S')
                url = f'{md["url"]}/_version={version}'
                obj = self.context.read.options(**options).parquet(url)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(str(e), extra={'md': md})

        return obj
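
A hedged usage sketch for the versioned event-log loader above, reusing the hypothetical engine from the sketch under Example #2; the path, provider alias, and the assumption that versionAsOf is a datetime are illustrative only:

from datetime import datetime

# hypothetical call: load the event log as it existed at a given point in time
df = engine.load_event_log('events/clicks', 'dwh',
                           versionAsOf=datetime(2021, 6, 1))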
Example #4
    def start_session(self, conf):
        try:
            # init the spark session
            session = pyspark.sql.SparkSession.builder.config(
                conf=conf).getOrCreate()

            # store the spark session
            self.session = session

            # fix SQLContext for back compatibility
            initialize_spark_sql_context(session, session.sparkContext)

            # pyspark set log level method
            # (this will not suppress WARN before starting the context)
            session.sparkContext.setLogLevel("ERROR")

            # bootstrap datafaucet.zip in the cluster
            if not self.is_spark_local():
                dir_path = os.path.dirname(os.path.realpath(__file__))
                filename = os.path.abspath(
                    os.path.join(dir_path, 'dist/datafaucet.zip'))
                session.sparkContext.addPyFile(filename)

            # collect configuration
            self.conf = dict(session.sparkContext.getConf().getAll())

            # set the engine version
            self.version = session.version

            # set environment
            self.env = self.get_environment()

            # set info
            self.info['spark_classpath'] = self.info['spark_classpath'][
                0].split(' ')
            self.info = YamlDict(self.info)

            # set version if spark is loaded
            logging.notice(
                f'Engine context {self.engine_type}:{self.version} successfully started'
            )

            # session is running
            self.stopped = False

        except Exception as e:
            print(e)
            logging.error('Could not start the engine context')
            return None
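
start_session receives a pyspark SparkConf. A minimal sketch of building one (the app name and master URL are placeholders, and the engine instance is hypothetical):

from pyspark import SparkConf

conf = (SparkConf()
        .setAppName('datafaucet-session')   # placeholder app name
        .setMaster('local[*]'))             # placeholder; a yarn or spark:// master also works
# a hypothetical engine instance would then call:
# engine.start_session(conf)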
Example #5
    def load(self, path=None, provider=None, *args, format=None, **kwargs):

        md = Resource(path, provider, format=format, **kwargs)

        if md['format'] == 'csv':
            return self.load_csv(path, provider, **kwargs)
        elif md['format'] == 'json':
            return self.load_json(path, provider, **kwargs)
        elif md['format'] == 'parquet':
            return self.load_parquet(path, provider, **kwargs)
        elif md['format'] == 'jdbc':
            return self.load_jdbc(path, provider, **kwargs)
        else:
            logging.error(f'Unknown resource format "{md["format"]}"',
                          extra={'md': to_dict(md)})
        return None
Example #6
    def save_plus(self, obj, path=None, provider=None, **kwargs):
        md = Resource(path, provider, **kwargs)

        prep_start = timer()
        options = md['options'] or {}

        if md['date_partition'] and md['date_column']:
            tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
            obj = dataframe.add_datetime_columns(obj,
                                                 column=md['date_column'],
                                                 tzone=tzone)
            kwargs['partitionBy'] = ['_date'] + kwargs.get(
                'partitionBy', options.get('partitionBy', []))

        if md['update_column']:
            obj = dataframe.add_update_column(obj, tzone=self._timezone)

        if md['hash_column']:
            obj = dataframe.add_hash_column(obj,
                                            cols=md['hash_column'],
                                            exclude_cols=[
                                                '_date', '_datetime',
                                                '_updated', '_hash', '_state'
                                            ])

        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                       md['date_end'], md['date_window'])

        obj = dataframe.cache(obj, md['cache'])

        num_rows = obj.count()
        num_cols = len(obj.columns)

        # force 1 file per partition, just before saving
        obj = obj.repartition(1, *kwargs['partitionBy']) if kwargs.get(
            'partitionBy') else obj.repartition(1)
        # obj = obj.coalesce(1)

        prep_end = timer()

        core_start = timer()
        result = self.save_dataframe(obj, md, **kwargs)
        core_end = timer()

        log_data = {
            'md': dict(md),
            'mode': kwargs.get('mode', options.get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': core_end - prep_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }

        logging.info(log_data) if result else logging.error(log_data)

        return result
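
A hedged sketch of how the metadata keys read above (date_column, date_partition, update_column, hash_column) could be supplied through the resource kwargs; whether they come from the call or from the provider metadata is an assumption, and the column name and provider alias are placeholders:

# hypothetical call
engine.save_plus(df, 'events/clicks', 'dwh',
                 date_column='created_at',   # source timestamp column
                 date_partition=True,        # adds _date and partitions by it
                 update_column=True,         # adds an _updated timestamp
                 hash_column=True)           # adds a _hash column used for diffs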
Example #7
def Engine(engine_type=None, *args, **kwargs):
    global _engines

    if engine_type:
        if engine_type in _engines.keys():
            cls = _engines[engine_type]
            cls(*args, **kwargs)
        else:
            print('Could not create the Engine:')
            print('No matching engine type in', ', '.join(_engines.keys()))

    engine = _singleton['instance']

    if not engine:
        logging.error(
            'No Engine running yet. \n'
            'try datafaucet.engine(...) or datafaucet.project.load(...)')

    return engine
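
The factory keeps a module-level singleton. A sketch of the intended call pattern, assuming datafaucet exposes the factory as datafaucet.engine and that 'spark' is one of the registered engine types (both assumptions):

import datafaucet as dfc

# first call with a type creates and registers the engine instance
engine = dfc.engine('spark')      # assumed engine type
# later calls without arguments return the same singleton
assert dfc.engine() is engine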
Example #8
    def info(self):
        if not self.loaded:
            logging.error("No project profile loaded. " +
                          "Execute datafaucet.project.load(...) first.")
            return None

        return YamlDict({
            'version': __version__,
            'username': self._username,
            'session_name': self._session_name,
            'session_id': self._session_id,
            'profile': self._profile,
            'rootdir': paths.rootdir(),
            'script_path': self._script_path,
            'dotenv_path': self._dotenv_path,
            'notebooks_files': self._notebook_files,
            'python_files': self._python_files,
            'metadata_files': self._metadata_files,
            'repository': self._repo
        })
Example #9
    def load_mongo(self, path=None, provider=None, *args, **kwargs):
        obj = None

        md = Resource(path, provider, format='mongo', **kwargs)

        options = md['options']

        try:
            if md['service'] == 'mongodb':
                obj = self.context.read \
                    .format('mongo') \
                    .option('uri', md['url']) \
                    .options(**options)

                # load the data
                obj = obj.load(**kwargs)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(str(e), extra={'md': md})

        return obj
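
A hedged sketch for the mongo loader; the provider alias is a placeholder assumed to resolve to a mongodb:// url, and the collection naming is an assumption:

# hypothetical call: 'events' collection on a provider aliased 'mongo_dwh'
df = engine.load_mongo('events', 'mongo_dwh')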
Example #10
    def save_mongo(self, obj, path=None, provider=None, *args, **kwargs):
        md = Resource(path, provider, format='mongo', **kwargs)

        options = md['options']

        try:
            if md['service'] == 'mongodb':
                obj.write \
                    .format('mongo') \
                    .option('uri', md['url']) \
                    .options(**options) \
                    .mode(options['mode']) \
                    .save(**kwargs)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True
Example #11
    def load_jdbc(self, path=None, provider=None, *args, **kwargs):
        obj = None

        md = Resource(path, provider, format='jdbc', **kwargs)

        options = md['options']

        try:
            if md['service'] in [
                    'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse',
                    'oracle'
            ]:
                obj = self.context.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['table']) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .options(**options)
                # load the data from jdbc
                obj = obj.load(**kwargs)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(str(e), extra={'md': md})

        return obj
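
A hedged usage sketch for the jdbc loader; the table and provider alias are placeholders, assuming the provider metadata supplies url, driver, user, and password:

# hypothetical call against a provider aliased 'warehouse'
df = engine.load_jdbc('public.users', 'warehouse')
df.printSchema()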
Example #12
    def read(self, file_paths=None):
        """
        Return all profiles, stored in a nested dictionary
        Profiles are merged over the list provided of provided metadata files to read.
        The order in the list of metadata files determines how profile properties are override
        :param file_paths: list of yaml files paths
        :return: dict of profiles
        """

        # empty profiles, before start reading
        profiles = {}

        if not file_paths:
            file_paths = []

        self._info['files'] = []
        for filename in file_paths:
            if os.path.isfile(filename):
                with open(filename, 'r') as f:
                    docs = []
                    try:
                        docs = list(yaml.load_all(f))
                        self._info['files'].append(filename)
                    except yaml.YAMLError as e:
                        if hasattr(e, 'problem_mark'):
                            mark = e.problem_mark
                            logging.error(
                                'Error loading yml file {} at position ({}:{}): skipping file'
                                .format(filename, mark.line + 1,
                                        mark.column + 1))
                    finally:
                        for doc in docs:
                            doc['profile'] = doc.get('profile', 'default')
                            profiles[doc['profile']] = merge(
                                profiles.get(doc['profile'], {}), doc)

        self._info['profiles'] = sorted(list(profiles.keys()))

        return profiles
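
A minimal sketch of the multi-document yaml layout read above; the content is illustrative, but it shows how each document becomes one profile keyed by its 'profile' field (defaulting to 'default'):

import yaml

sample = """
profile: default
engine:
  type: spark
---
profile: prod
engine:
  master: yarn
"""
# each yaml document becomes one profile entry
docs = list(yaml.safe_load_all(sample))
print([d.get('profile', 'default') for d in docs])   # ['default', 'prod']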
Example #13
    def start_context(self, conf):
        try:
            # init the spark session
            session = pyspark.sql.SparkSession.builder.config(
                conf=conf).getOrCreate()

            # fix SQLContext for back compatibility
            self.initialize_spark_sql_context(session, session.sparkContext)

            # pyspark set log level method
            # (this will not suppress WARN before starting the context)
            session.sparkContext.setLogLevel("ERROR")

            # set the engine version
            self.version = session.version

            # set environment
            self.env = self.get_environment()

            return session
        except Exception as e:
            print(e)
            logging.error('Could not start the engine context')
            return None
Example #14
    def save_parquet(self,
                     obj,
                     path=None,
                     provider=None,
                     *args,
                     mode=None,
                     **kwargs):

        result = True
        md = Resource(path, provider, format='parquet', mode=mode, **kwargs)
        options = md['options']

        # after collecting from metadata, or method call, define defaults
        options['mode'] = options['mode'] or 'overwrite'

        local = self.is_spark_local()

        ts_start = timer()
        try:
            #three approaches: file-local, local+cluster, and service
            if md['service'] == 'file' and local:
                obj.coalesce(1).write\
                    .format('parquet')\
                    .mode(options['mode'])\
                    .options(**options)\
                    .parquet(md['url'])

            elif md['service'] == 'file':
                if os.path.exists(md['url']) and os.path.isdir(md['url']):
                    shutil.rmtree(md['url'])

                # save with pandas (spark write modes do not apply here)
                obj.toPandas().to_parquet(md['url'])

            elif md['service'] in ['hdfs', 's3a']:
                obj.write\
                     .format('parquet')\
                     .mode(options['mode'])\
                     .options(**options)\
                     .parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                result = False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            result = False

        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        self.save_log(md, options, ts_start)
        return result
Example #15
    def load_jdbc(self, path=None, provider=None, *args, **kwargs):
        # return None
        obj = None

        md = Resource(path, provider, format='jdbc', **kwargs)

        options = md['options']

        # start the timer for logging
        ts_start = timer()

        # cluster mode and local sqlite db: use pandas/sqlite
        if md['service'] == 'sqlite':
            local = self.is_spark_local()
            if not local:
                con = sqlite3.connect(md['database'])
                pdf = pd.read_sql(f"select * from {md['table']}", con=con)
                obj = self.session.createDataFrame(pdf)
                self.load_log(md, options, ts_start)
                return obj

        # all the other cases:
        try:
            if md['service'] in [
                    'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse',
                    'oracle'
            ]:
                obj = self.session.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['table']) \
                    .option("driver", md['driver'])

                if md['user']:
                    obj = obj.option("user", md['user'])

                if md['password']:
                    obj = obj.option('password', md['password'])

                obj = obj.options(**options)

                # load the data from jdbc
                obj = obj.load(**kwargs)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        self.load_log(md, options, ts_start)
        return obj
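
The sqlite branch above is just pandas plus createDataFrame. A standalone sketch of that path; the database file and table name are placeholders, and `spark` is assumed to be an active SparkSession:

import sqlite3
import pandas as pd

# read the table with pandas, then hand it over to Spark
con = sqlite3.connect('local.db')                      # placeholder database file
pdf = pd.read_sql('select * from events', con=con)     # placeholder table
sdf = spark.createDataFrame(pdf)                       # `spark`: active SparkSession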
Example #16
    def load_plus(self,
                  path=None,
                  provider=None,
                  catch_exception=True,
                  **kwargs):
        md = Resource(path, provider, **kwargs)

        core_start = timer()
        obj = self.load_dataframe(md, catch_exception, **kwargs)
        core_end = timer()
        if obj is None:
            return obj

        prep_start = timer()
        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                       md['date_end'], md['date_window'])

        # partition and sorting (hmmm, needed?)
        if date_column and date_column in obj.columns:
            obj = obj.repartition(date_column)

        if '_updated' in obj.columns:
            obj = obj.sortWithinPartitions(F.desc('_updated'))

        num_rows = obj.count()
        num_cols = len(obj.columns)

        obj = dataframe.cache(obj, md['cache'])

        prep_end = timer()

        log_data = {
            'md': md,
            'mode': kwargs.get('mode',
                               md.get('options', {}).get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': prep_end - core_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }
        logging.info(log_data) if obj is not None else logging.error(log_data)

        obj.__name__ = path
        return obj
Example #17
    def load_csv(self,
                 path=None,
                 provider=None,
                 *args,
                 sep=None,
                 header=None,
                 **kwargs):

        #return None
        obj = None

        md = Resource(path, provider, sep=sep, header=header, **kwargs)

        # download if necessary
        md = get_local(md)

        options = md['options']

        # after collecting from metadata, or method call, define csv defaults
        options['header'] = options.get('header') or True
        options['inferSchema'] = options.get('inferSchema') or True
        options['sep'] = options.get('sep') or ','

        local = self.is_spark_local()

        # start the timer for logging
        ts_start = timer()
        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).csv(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    f'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})

                # pandas expects a row index (or None), not a bool, for header
                df = pd.read_csv(md['url'],
                                 sep=options['sep'],
                                 header=0 if options['header'] else None)
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).csv(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(e, extra={'md': md})

        self.load_log(md, options, ts_start)
        return obj
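
A hedged usage sketch for the csv loader; the path, provider alias, and separator are placeholders, and explicit keyword arguments override the metadata defaults set above:

# hypothetical call with an explicit separator
df = engine.load_csv('raw/users.csv', 'local', sep=';', header=True)
df.show(5)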
Example #18
    def save_jdbc(self,
                  obj,
                  path=None,
                  provider=None,
                  *args,
                  mode=None,
                  **kwargs):

        result = True
        md = Resource(path, provider, format='jdbc', mode=mode, **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define jdbc defaults
        options['mode'] = options['mode'] or 'overwrite'

        ts_start = timer()
        try:
            #three approaches: local, cluster, and service
            if md['service'] in [
                    'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse',
                    'oracle'
            ]:
                obj.write \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['table']) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .options(**options) \
                    .mode(options['mode'])\
                    .save()
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                result = False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            result = False

        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        self.save_log(md, options, ts_start)
        return result
Example #19
    def load_parquet(self,
                     path=None,
                     provider=None,
                     *args,
                     mergeSchema=None,
                     **kwargs):
        obj = None

        md = Resource(path,
                      provider,
                      format='parquet',
                      mergeSchema=mergeSchema,
                      **kwargs)

        # download if necessary
        md = get_local(md)

        options = md['options']

        # after collecting from metadata, or method call, define parquet defaults
        options['mergeSchema'] = options.get('mergeSchema') or True

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if md['service'] == 'file' and local:
                obj = self.context.read.options(**options).parquet(md['url'])
            elif md['service'] == 'file':
                logging.warning(
                    f'local file + spark cluster: loading using pandas reader',
                    extra={'md': to_dict(md)})
                #fallback to the pandas reader, then convert to spark
                df = pd.read_parquet(md['url'])
                obj = self.context.createDataFrame(df)
            elif md['service'] in ['hdfs', 's3a']:
                obj = self.context.read.options(**options).parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return obj

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error(str(e), extra={'md': md})

        return obj
Example #20
    def save_event_log(self,
                       obj,
                       path=None,
                       provider=None,
                       *args,
                       mode=None,
                       partitionBy=None,
                       **kwargs):

        md = Resource(path, provider, format='event_log', mode=mode, **kwargs)
        options = md['options']

        # after collecting from metadata, or method call, define defaults
        options['mode'] = options['mode'] or 'append'
        try:
            if md['service'] in ['hdfs', 's3a']:
                obj = dataframe.add_version_column(obj)
                partitionBy = ['_version'] + (partitionBy or [])
                obj.write\
                    .format('parquet')\
                    .mode(options['mode'])\
                    .partitionBy(partitionBy)\
                    .options(**options)\
                    .parquet(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True
Example #21
    def save_csv(self,
                 obj,
                 path=None,
                 provider=None,
                 *args,
                 mode=None,
                 sep=None,
                 header=None,
                 **kwargs):

        result = True

        md = Resource(path,
                      provider,
                      format='csv',
                      mode=mode,
                      sep=sep,
                      header=header,
                      **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define csv defaults
        options['header'] = options.get('header', None) or 'true'
        options['sep'] = options.get('sep', None) or ','
        options['mode'] = options.get('mode', None) or 'overwrite'

        pcols = options.pop('partitionBy', None) or []
        pcols = pcols if isinstance(pcols, (list, tuple)) else [pcols]

        local = self.is_spark_local()

        ts_start = timer()
        try:
            # three approaches: file+local, file+cluster, and service
            if md['service'] == 'file' and local:
                obj.coalesce(1).write \
                    .partitionBy(*pcols) \
                    .format('csv') \
                    .mode(options['mode']) \
                    .options(**options) \
                    .csv(md['url'], **options)
                directory_to_file(md['url'])

            elif md['service'] == 'file':
                if os.path.exists(md['url']) and os.path.isdir(md['url']):
                    shutil.rmtree(md['url'])

                # save with pandas (spark write modes do not apply here)
                obj.toPandas().to_csv(md['url'],
                                      header=bool(options['header']),
                                      sep=options['sep'])

            elif md['service'] in ['hdfs', 's3a']:
                obj.write \
                    .partitionBy(*pcols) \
                    .format('csv') \
                    .mode(options['mode']) \
                    .options(**options) \
                    .csv(md['url'], **options)
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                result = False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            result = False

        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        self.save_log(md, options, ts_start)
        return result
Example #22
    def save_jdbc(self,
                  obj,
                  path=None,
                  provider=None,
                  *args,
                  mode=None,
                  **kwargs):

        result = True
        md = Resource(path, provider, format='jdbc', mode=mode, **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define jdbc defaults
        options['mode'] = options.get('mode', None) or 'overwrite'

        # avoid multi-processing and distributed writes on sqlite
        if md['service'] == 'sqlite':
            local = self.is_spark_local()
            if not local:
                raise ValueError(
                    'write to sqlite can only be done from a local cluster')
                #todo:
                # sketched solution obj.toPandas().to_sql(md['url']

            #calesce to a single writer
            obj = obj.coalesce(1)

        # partition is meaningless here
        pcols = options.pop('partitionBy', None) or []

        ts_start = timer()
        try:
            if md['service'] in [
                    'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse',
                    'oracle'
            ]:
                obj = obj.write \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", md['table']) \
                    .option("driver", md['driver'])

                if md['user']:
                    obj = obj.option("user", md['user'])

                if md['password']:
                    obj = obj.option('password', md['password'])

                obj.options(**options).mode(options['mode']).save()

            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                result = False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
            result = False

        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        self.save_log(md, options, ts_start)
        return result
Example #23
    def list(self, provider, path=''):
        df_schema = T.StructType([
            T.StructField('name', T.StringType(), True),
            T.StructField('type', T.StringType(), True)
        ])

        df_empty = self.context.createDataFrame(data=(), schema=df_schema)
        md = Resource(path, provider)

        try:
            if md['service'] in ['local', 'file']:
                lst = []
                rootpath = os.path.join(md['url'], path)
                for f in os.listdir(rootpath):
                    fullpath = os.path.join(rootpath, f)
                    if os.path.isfile(fullpath):
                        obj_type = 'FILE'
                    elif os.path.isdir(fullpath):
                        obj_type = 'DIRECTORY'
                    elif os.path.islink(fullpath):
                        obj_type = 'LINK'
                    elif os.path.ismount(fullpath):
                        obj_type = 'MOUNT'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = f
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['service'] in ['hdfs', 'minio', 's3a']:
                sc = self.context._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']),
                                    sc._jsc.hadoopConfiguration())
                obj = fs.listStatus(Path(md['url']))
                lst = []
                for i in range(len(obj)):
                    if obj[i].isFile():
                        obj_type = 'FILE'
                    elif obj[i].isDirectory():
                        obj_type = 'DIRECTORY'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = obj[i].getPath().getName()
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['format'] == 'jdbc':
                # remove options from database, if any
                database = md["database"].split('?')[0]
                schema = md['schema']
                if md['service'] == 'mssql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'oracle':
                    query = f"""
                            ( SELECT table_name, table_type
                             FROM all_tables
                             WHERE owner='{schema}'
                            ) as query
                            """
                elif md['service'] == 'mysql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'postgres':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema = '{schema}'
                            ) as query
                            """
                else:
                    # vanilla query ... for other databases
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                            ) as query
                            """

                obj = self.context.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                lst = []
                for x in obj.select('table_name', 'table_type').collect():
                    lst.append((x.table_name, x.table_type))

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            else:
                logging.error({
                    'md':
                    md,
                    'error_msg':
                    f'List resource on service "{md["service"]}" not implemented'
                })
                return df_empty
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return df_empty
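
A hedged sketch of the listing helper; the provider aliases are placeholders. Whatever the backing service, the result is a Spark DataFrame with the columns name and type:

# hypothetical calls
engine.list('local', path='data').show()    # files, directories, links, mounts
engine.list('warehouse').show()             # tables of a jdbc provider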
Example #24
    def copy(self, md_src, md_trg, mode='append'):
        # timer
        timer_start = timer()

        # src dataframe
        df_src = self.load(md_src)

        # if not path on target, get it from src
        if not md_trg['resource_path']:
            md_trg = resource.metadata(self._rootdir, self._metadata,
                                       md_src['resource_path'],
                                       md_trg['provider_alias'])

        # logging
        log_data = {
            'src_hash': md_src['hash'],
            'src_path': md_src['resource_path'],
            'trg_hash': md_trg['hash'],
            'trg_path': md_trg['resource_path'],
            'mode': mode,
            'updated': False,
            'records_read': 0,
            'records_add': 0,
            'records_del': 0,
            'columns': 0,
            'time': timer() - timer_start
        }

        # could not read source, log error and return
        if df_src is None:
            logging.error(log_data)
            return

        num_rows = df_src.count()
        num_cols = len(df_src.columns)

        # empty source, log notice and return
        if num_rows == 0 and mode == 'append':
            log_data['time'] = timer() - timer_start
            logging.notice(log_data)
            return

        # overwrite target, save, log notice/error and return
        if mode == 'overwrite':
            if md_trg['state_column']:
                df_src = df_src.withColumn('_state', F.lit(0))

            result = self.save(df_src, md_trg, mode=mode)

            log_data['time'] = timer() - timer_start
            log_data['records_read'] = num_rows
            log_data['records_add'] = num_rows
            log_data['columns'] = num_cols

            logging.notice(log_data) if result else logging.error(log_data)
            return

        # trg dataframe (if exists)
        try:
            df_trg = self.load(md_trg, catch_exception=False)
        except:
            df_trg = dataframe.empty(df_src)

        # de-dup (exclude the _updated column)

        # create a view from the extracted log
        df_trg = dataframe.view(df_trg)

        # capture added records
        df_add = dataframe.diff(
            df_src, df_trg,
            ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_add = df_add.count()

        # capture deleted records
        rows_del = 0
        if md_trg['state_column']:
            df_del = dataframe.diff(
                df_trg, df_src,
                ['_date', '_datetime', '_updated', '_hash', '_state'])
            rows_del = df_del.count()

        updated = (rows_add + rows_del) > 0

        num_cols = len(df_add.columns)
        num_rows = max(df_src.count(), df_trg.count())

        # save diff
        if updated:
            if md_trg['state_column']:
                df_add = df_add.withColumn('_state', F.lit(0))
                df_del = df_del.withColumn('_state', F.lit(1))

                df = df_add.union(df_del)
            else:
                df = df_add

            result = self.save(df, md_trg, mode=mode)
        else:
            result = True

        log_data.update({
            'updated': updated,
            'records_read': num_rows,
            'records_add': rows_add,
            'records_del': rows_del,
            'columns': num_cols,
            'time': timer() - timer_start
        })

        logging.notice(log_data) if result else logging.error(log_data)
Example #25
    def list(self, provider, path=None, **kwargs):
        df_schema = T.StructType([
            T.StructField('name', T.StringType(), True),
            T.StructField('type', T.StringType(), True)
        ])

        df_empty = self.context.createDataFrame(data=(), schema=df_schema)

        md = Resource(path, provider, **kwargs)

        try:
            if md['service'] in ['local', 'file']:
                lst = []
                rootpath = md['url']
                for f in os.listdir(rootpath):
                    fullpath = os.path.join(rootpath, f)
                    if os.path.isfile(fullpath):
                        obj_type = 'FILE'
                    elif os.path.isdir(fullpath):
                        obj_type = 'DIRECTORY'
                    elif os.path.islink(fullpath):
                        obj_type = 'LINK'
                    elif os.path.ismount(fullpath):
                        obj_type = 'MOUNT'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = f
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['service'] in ['hdfs', 's3a']:
                sc = self.context._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem

                parsed = urnparse(md['url'])
                if md['service'] == 's3a':
                    path = parsed.path.split('/')
                    url = 's3a://' + path[0]
                    path = '/' + '/'.join(path[1:]) if len(path) > 1 else '/'

                if md['service'] == 'hdfs':
                    host_port = f"{parsed.host}:{parsed.port}" if parsed.port else parsed.hosts
                    url = f'hdfs://{host_port}'
                    path = '/' + parsed.path

                try:
                    fs = FileSystem.get(URI(url),
                                        sc._jsc.hadoopConfiguration())
                    obj = fs.listStatus(Path(path))
                except:
                    logging.error(f'An error occurred accessing {url}{path}')
                    obj = []

                lst = []
                for i in range(len(obj)):
                    if obj[i].isFile():
                        obj_type = 'FILE'
                    elif obj[i].isDirectory():
                        obj_type = 'DIRECTORY'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = obj[i].getPath().getName()
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['format'] == 'jdbc':
                # remove options from database, if any

                database = md["database"].split('?')[0]
                schema = md['schema']
                table = md['table']

                if database and table:
                    try:
                        obj = self.context.read \
                        .format('jdbc') \
                        .option('url', md['url']) \
                        .option("dbtable", table) \
                        .option("driver", md['driver']) \
                        .option("user", md['user']) \
                        .option('password', md['password']) \
                        .load()
                        info = [(i.name, i.dataType.simpleString())
                                for i in obj.schema]
                    except:
                        info = []

                    if info:
                        return self.context.createDataFrame(
                            info, ['name', 'type'])

                if md['service'] == 'mssql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM INFORMATION_SCHEMA.TABLES
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'oracle':
                    query = f"""
                            ( SELECT table_name, table_type
                             FROM all_tables
                             WHERE owner='{schema}'
                            ) as query
                            """
                elif md['service'] == 'mysql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'postgres':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema = '{schema}'
                            ) as query
                            """
                else:
                    # vanilla query ... for other databases
                    query = f"""
                                ( SELECT table_name, table_type
                                  FROM information_schema.tables
                                ) as query
                                """

                obj = self.context.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['user']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                lst = []
                for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect():
                    lst.append((x.TABLE_NAME, x.TABLE_TYPE))

                if lst:
                    df = self.context.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            else:
                logging.error({
                    'md':
                    md,
                    'error_msg':
                    f'List resource on service "{md["service"]}" not implemented'
                })
                return df_empty
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return df_empty
Example #26
    def save_json(self,
                  obj,
                  path=None,
                  provider=None,
                  *args,
                  mode=None,
                  lines=None,
                  **kwargs):

        md = Resource(path,
                      provider,
                      format='json',
                      mode=mode,
                      lines=lines,
                      **kwargs)

        options = md['options']

        # after collecting from metadata, or method call, define json defaults
        options['mode'] = options['mode'] or 'overwrite'
        options['lines'] = options['lines'] or True

        local = self.is_spark_local()

        try:
            #three approaches: local, cluster, and service
            if local and md['service'] == 'file' and options['lines']:
                obj.coalesce(1).write\
                    .format('json')\
                    .mode(options['mode'])\
                    .options(**options)\
                    .json(md['url'])
                self.directory_to_file(md['url'])

            elif md['service'] == 'file':
                # fallback, use pandas
                # save single files, not directories
                if os.path.exists(md['url']) and os.path.isdir(md['url']):
                    shutil.rmtree(md['url'])

                # save with pandas (spark write modes do not apply here)
                obj.toPandas().to_json(md['url'],
                                       orient='records',
                                       lines=options['lines'])

            elif md['service'] in ['hdfs', 's3a']:
                obj.write\
                    .format('json')\
                    .mode(options['mode'])\
                    .options(**options)\
                    .json(md['url'])
            else:
                logging.error(f'Unknown resource service "{md["service"]}"',
                              extra={'md': to_dict(md)})
                return False

        except AnalysisException as e:
            logging.error(str(e), extra={'md': md})
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return True