def start_context(self, conf):
    try:
        return dask.dataframe
    except Exception as e:
        print(e)
        logging.error('Could not start the engine context')
        return None
def save(self, obj, path=None, provider=None, *args,
         format=None, mode=None, **kwargs):
    md = Resource(path, provider, format=format, mode=mode, **kwargs)

    if md['format'] == 'csv':
        return self.save_csv(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'tsv':
        kwargs['sep'] = '\t'
        return self.save_csv(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'json':
        return self.save_json(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'jsonl':
        return self.save_json(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'parquet':
        return self.save_parquet(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'jdbc':
        return self.save_jdbc(obj, path, provider, mode=mode, **kwargs)
    else:
        logging.error(f'Unknown resource format "{md["format"]}"',
                      extra={'md': md})
        return False
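# Usage sketch for the save() dispatcher above (illustration only; the engine
# instance `engine`, the dataframe `df` and the output paths are hypothetical,
# not part of the library code). Resource() infers the format from the path or
# provider, and save() routes to the matching writer:
#
#     engine.save(df, 'exports/users.parquet')   # -> save_parquet
#     engine.save(df, 'exports/users.tsv')       # -> save_csv with sep='\t'
#     engine.save(df, 'exports/users.xyz')       # unknown format: logs an
#                                                # error and returns False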
def load_event_log(self, path=None, provider=None,
                   versionAsOf=None, *args, **kwargs):
    obj = None

    md = Resource(path, provider, format='event_log', **kwargs)
    options = md['options']

    try:
        if md['service'] in ['hdfs', 's3a']:
            version = self.find_version(versionAsOf, path, provider)
            if not version:
                logging.error('No version of data detected',
                              extra={'md': md})
                return obj

            version = version.strftime('%Y-%m-%d-%H-%M-%S')
            url = f'{md["url"]}/_version={version}'
            obj = self.context.read.options(**options).parquet(url)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})

        return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(str(e), extra={'md': md})

    return obj
def start_session(self, conf):
    try:
        # init the spark session
        session = pyspark.sql.SparkSession.builder.config(
            conf=conf).getOrCreate()

        # store the spark session
        self.session = session

        # fix SQLContext for back compatibility
        initialize_spark_sql_context(session, session.sparkContext)

        # pyspark set log level method
        # (this will not suppress WARN before starting the context)
        session.sparkContext.setLogLevel("ERROR")

        # bootstrap datafaucet.zip in the cluster
        if not self.is_spark_local():
            dir_path = os.path.dirname(os.path.realpath(__file__))
            filename = os.path.abspath(
                os.path.join(dir_path, 'dist/datafaucet.zip'))
            session.sparkContext.addPyFile(filename)

        # collect configuration
        self.conf = dict(session.sparkContext.getConf().getAll())

        # set the engine version
        self.version = session.version

        # set environment
        self.env = self.get_environment()

        # set info
        self.info['spark_classpath'] = self.info['spark_classpath'][0].split(' ')
        self.info = YamlDict(self.info)

        # set version if spark is loaded
        logging.notice(
            f'Engine context {self.engine_type}:{self.version} '
            'successfully started')

        # session is running
        self.stopped = False
    except Exception as e:
        print(e)
        logging.error('Could not start the engine context')
        return None
def load(self, path=None, provider=None, *args, format=None, **kwargs):
    md = Resource(path, provider, format=format, **kwargs)

    if md['format'] == 'csv':
        return self.load_csv(path, provider, **kwargs)
    elif md['format'] == 'json':
        return self.load_json(path, provider, **kwargs)
    elif md['format'] == 'parquet':
        return self.load_parquet(path, provider, **kwargs)
    elif md['format'] == 'jdbc':
        return self.load_jdbc(path, provider, **kwargs)
    else:
        logging.error(f'Unknown resource format "{md["format"]}"',
                      extra={'md': to_dict(md)})

    return None
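# Usage sketch for load() (illustration only; `engine`, the paths and the
# provider alias are hypothetical). When a path has no recognizable extension,
# the format keyword can be passed explicitly and is forwarded to Resource():
#
#     df = engine.load('landing/daily_dump', format='json')   # -> load_json
#     df = engine.load('sales', provider='warehouse')          # jdbc provider
#                                                               # -> load_jdbc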
def save_plus(self, obj, path=None, provider=None, **kwargs):
    md = Resource(path, provider, **kwargs)

    prep_start = timer()
    options = md['options'] or {}

    if md['date_partition'] and md['date_column']:
        tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
        obj = dataframe.add_datetime_columns(obj,
                                             column=md['date_column'],
                                             tzone=tzone)
        kwargs['partitionBy'] = ['_date'] + kwargs.get(
            'partitionBy', options.get('partitionBy', []))

    if md['update_column']:
        obj = dataframe.add_update_column(obj, tzone=self._timezone)

    if md['hash_column']:
        obj = dataframe.add_hash_column(obj,
                                        cols=md['hash_column'],
                                        exclude_cols=[
                                            '_date', '_datetime', '_updated',
                                            '_hash', '_state'
                                        ])

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                   md['date_end'], md['date_window'])

    obj = dataframe.cache(obj, md['cache'])

    num_rows = obj.count()
    num_cols = len(obj.columns)

    # force 1 file per partition, just before saving
    obj = obj.repartition(1, *kwargs['partitionBy']) if kwargs.get(
        'partitionBy') else obj.repartition(1)
    # obj = obj.coalesce(1)

    prep_end = timer()

    core_start = timer()
    result = self.save_dataframe(obj, md, **kwargs)
    core_end = timer()

    log_data = {
        'md': dict(md),
        'mode': kwargs.get('mode', options.get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': core_end - prep_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }

    logging.info(log_data) if result else logging.error(log_data)

    return result
def Engine(engine_type=None, *args, **kwargs):
    global _engines

    if engine_type:
        if engine_type in _engines.keys():
            cls = _engines[engine_type]
            cls(*args, **kwargs)
        else:
            print('Could not create the Engine:')
            print('No matching engine type in', ', '.join(_engines.keys()))

    engine = _singleton['instance']
    if not engine:
        logging.error(
            'No Engine running yet. \n'
            'try datafaucet.engine(...) or datafaucet.project.load(...)')

    return engine
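# A minimal, self-contained sketch of the registry/singleton pattern used by
# Engine() above. DaskEngine and SparkEngine are stand-in names for
# illustration only; in the library, each concrete engine class registers
# itself in _engines and stores the running instance in _singleton['instance'],
# which is why Engine() can discard the constructor result and read the
# singleton afterwards.

_engines = {}
_singleton = {'instance': None}

class BaseEngine:
    def __init__(self, *args, **kwargs):
        # the constructor registers the new engine as the current singleton
        _singleton['instance'] = self

class DaskEngine(BaseEngine):
    pass

class SparkEngine(BaseEngine):
    pass

# concrete engine classes are registered by type name
_engines['dask'] = DaskEngine
_engines['spark'] = SparkEngine

# Engine('spark', ...) instantiates and registers; Engine() returns the instance
spark_engine = _engines['spark']()
assert _singleton['instance'] is spark_engine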
def info(self):
    if not self.loaded:
        logging.error("No project profile loaded. "
                      "Execute datafaucet.project.load(...) first.")
        return None

    return YamlDict({
        'version': __version__,
        'username': self._username,
        'session_name': self._session_name,
        'session_id': self._session_id,
        'profile': self._profile,
        'rootdir': paths.rootdir(),
        'script_path': self._script_path,
        'dotenv_path': self._dotenv_path,
        'notebooks_files': self._notebook_files,
        'python_files': self._python_files,
        'metadata_files': self._metadata_files,
        'repository': self._repo
    })
def load_mongo(self, path=None, provider=None, *args, **kwargs):
    obj = None

    md = Resource(path, provider, format='mongo', **kwargs)
    options = md['options']

    try:
        if md['service'] == 'mongodb':
            obj = self.context.read \
                .format('mongo') \
                .option('uri', md['url']) \
                .options(**options)

            # load the data
            obj = obj.load(**kwargs)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})

        return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(str(e), extra={'md': md})

    return obj
def save_mongo(self, obj, path=None, provider=None, *args, **kwargs):
    md = Resource(path, provider, format='mongo', **kwargs)
    options = md['options']

    try:
        if md['service'] == 'mongodb':
            obj.write \
                .format('mongo') \
                .option('uri', md['url']) \
                .options(**options) \
                .mode(options['mode']) \
                .save(**kwargs)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            return False
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True
def load_jdbc(self, path=None, provider=None, *args, **kwargs):
    obj = None

    md = Resource(path, provider, format='jdbc', **kwargs)
    options = md['options']

    try:
        if md['service'] in [
                'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse', 'oracle'
        ]:
            obj = self.context.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option("dbtable", md['table']) \
                .option("driver", md['driver']) \
                .option("user", md['user']) \
                .option('password', md['password']) \
                .options(**options)

            # load the data from jdbc
            obj = obj.load(**kwargs)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})

        return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(str(e), extra={'md': md})

    return obj
def read(self, file_paths=None):
    """
    Return all profiles, stored in a nested dictionary.
    Profiles are merged over the provided list of metadata files to read.
    The order of the files in the list determines how profile properties
    are overridden.

    :param file_paths: list of yaml file paths
    :return: dict of profiles
    """

    # empty profiles, before starting to read
    profiles = {}

    if not file_paths:
        file_paths = []

    self._info['files'] = []

    for filename in file_paths:
        if os.path.isfile(filename):
            with open(filename, 'r') as f:
                try:
                    docs = list(yaml.load_all(f))
                    self._info['files'].append(filename)
                except yaml.YAMLError as e:
                    docs = []
                    if hasattr(e, 'problem_mark'):
                        mark = e.problem_mark
                        logging.error(
                            "Error loading yml file {} at position ({}:{}): "
                            "skipping file".format(filename, mark.line + 1,
                                                   mark.column + 1))
                finally:
                    for doc in docs:
                        doc['profile'] = doc.get('profile', 'default')
                        profiles[doc['profile']] = merge(
                            profiles.get(doc['profile'], {}), doc)

    self._info['profiles'] = sorted(list(profiles.keys()))
    return profiles
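# A small, runnable sketch of the multi-document profile layout that read()
# above consumes. The profile names and keys below are assumptions for
# illustration; each YAML document carries an optional 'profile' key
# (defaulting to 'default'), and documents from later files are merged over
# earlier ones, profile by profile.

import yaml

sample = """
engine:
  type: spark
---
profile: prod
engine:
  type: spark
  master: yarn
"""

# the same per-document parsing step used in read(): one dict per YAML document
for doc in yaml.safe_load_all(sample):
    profile = doc.get('profile', 'default')
    print(profile, doc)
# -> default {'engine': {'type': 'spark'}}
# -> prod {'profile': 'prod', 'engine': {'type': 'spark', 'master': 'yarn'}}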
def start_context(self, conf):
    try:
        # init the spark session
        session = pyspark.sql.SparkSession.builder.config(
            conf=conf).getOrCreate()

        # fix SQLContext for back compatibility
        self.initialize_spark_sql_context(session, session.sparkContext)

        # pyspark set log level method
        # (this will not suppress WARN before starting the context)
        session.sparkContext.setLogLevel("ERROR")

        # set the engine version
        self.version = session.version

        # set environment
        self.env = self.get_environment()

        return session
    except Exception as e:
        print(e)
        logging.error('Could not start the engine context')
        return None
def save_parquet(self, obj, path=None, provider=None, *args,
                 mode=None, **kwargs):
    result = True
    md = Resource(path, provider, format='parquet', mode=mode, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define defaults
    options['mode'] = options['mode'] or 'overwrite'

    local = self.is_spark_local()

    ts_start = timer()
    try:
        # three approaches: file+local, file+cluster, and service
        if md['service'] == 'file' and local:
            obj.coalesce(1).write \
                .format('parquet') \
                .mode(options['mode']) \
                .options(**options) \
                .parquet(md['url'])
        elif md['service'] == 'file':
            if os.path.exists(md['url']) and os.path.isdir(md['url']):
                shutil.rmtree(md['url'])

            # save with pandas (pandas to_parquet always overwrites the file)
            obj.toPandas().to_parquet(md['url'])
        elif md['service'] in ['hdfs', 's3a']:
            obj.write \
                .format('parquet') \
                .mode(options['mode']) \
                .options(**options) \
                .parquet(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            result = False

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
        result = False
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    self.save_log(md, options, ts_start)
    return result
def load_jdbc(self, path=None, provider=None, *args, **kwargs):
    obj = None

    md = Resource(path, provider, format='jdbc', **kwargs)
    options = md['options']

    # start the timer for logging
    ts_start = timer()

    # cluster mode and local sqlite db: use pandas/sqlite
    if md['service'] == 'sqlite':
        local = self.is_spark_local()
        if not local:
            con = sqlite3.connect(md['database'])
            pdf = pd.read_sql(f"select * from {md['table']}", con=con)
            obj = self.session.createDataFrame(pdf)

            self.load_log(md, options, ts_start)
            return obj

    # all the other cases:
    try:
        if md['service'] in [
                'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse', 'oracle'
        ]:
            obj = self.session.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option("dbtable", md['table']) \
                .option("driver", md['driver'])

            if md['user']:
                obj = obj.option("user", md['user'])

            if md['password']:
                obj = obj.option('password', md['password'])

            obj = obj.options(**options)

            # load the data from jdbc
            obj = obj.load(**kwargs)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            return obj

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    self.load_log(md, options, ts_start)
    return obj
def load_plus(self, path=None, provider=None, catch_exception=True, **kwargs):
    md = Resource(path, provider, **kwargs)

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kwargs)
    core_end = timer()
    if obj is None:
        return obj

    prep_start = timer()

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                   md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])

    prep_end = timer()

    log_data = {
        'md': md,
        'mode': kwargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }

    logging.info(log_data) if obj is not None else logging.error(log_data)

    obj.__name__ = path
    return obj
def load_csv(self, path=None, provider=None, *args,
             sep=None, header=None, **kwargs):
    obj = None

    md = Resource(path, provider, sep=sep, header=header, **kwargs)

    # download if necessary
    md = get_local(md)

    options = md['options']

    # after collecting from metadata, or method call, define csv defaults
    options['header'] = options.get('header') or True
    options['inferSchema'] = options.get('inferSchema') or True
    options['sep'] = options.get('sep') or ','

    local = self.is_spark_local()

    # start the timer for logging
    ts_start = timer()
    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).csv(md['url'])
        elif md['service'] == 'file':
            logging.warning(
                'local file + spark cluster: loading using pandas reader',
                extra={'md': to_dict(md)})

            df = pd.read_csv(md['url'],
                             sep=options['sep'],
                             header=options['header'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).csv(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    self.load_log(md, options, ts_start)
    return obj
def save_jdbc(self, obj, path=None, provider=None, *args, mode=None, **kwargs):
    result = True
    md = Resource(path, provider, format='jdbc', mode=mode, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define jdbc defaults
    options['mode'] = options['mode'] or 'overwrite'

    ts_start = timer()
    try:
        # three approaches: local, cluster, and service
        if md['service'] in [
                'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse', 'oracle'
        ]:
            obj.write \
                .format('jdbc') \
                .option('url', md['url']) \
                .option("dbtable", md['table']) \
                .option("driver", md['driver']) \
                .option("user", md['user']) \
                .option('password', md['password']) \
                .options(**options) \
                .mode(options['mode']) \
                .save()
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            result = False

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
        result = False
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    self.save_log(md, options, ts_start)
    return result
def load_parquet(self, path=None, provider=None, *args,
                 mergeSchema=None, **kwargs):
    obj = None

    md = Resource(path, provider, format='parquet',
                  mergeSchema=mergeSchema, **kwargs)

    # download if necessary
    md = get_local(md)

    options = md['options']

    # after collecting from metadata, or method call, define parquet defaults
    options['mergeSchema'] = options.get('mergeSchema') or True

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).parquet(md['url'])
        elif md['service'] == 'file':
            logging.warning(
                'local file + spark cluster: loading using pandas reader',
                extra={'md': to_dict(md)})

            # fallback to the pandas reader, then convert to spark
            df = pd.read_parquet(md['url'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).parquet(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})

        return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(str(e), extra={'md': md})

    return obj
def save_event_log(self, obj, path=None, provider=None, *args,
                   mode=None, partitionBy=None, **kwargs):
    md = Resource(path, provider, format='event_log', mode=mode, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define defaults
    options['mode'] = options['mode'] or 'append'

    try:
        if md['service'] in ['hdfs', 's3a']:
            obj = dataframe.add_version_column(obj)
            partitionBy = ['_version'] + (partitionBy or [])

            obj.write \
                .format('parquet') \
                .mode(options['mode']) \
                .partitionBy(partitionBy) \
                .options(**options) \
                .parquet(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            return False

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True
def save_csv(self, obj, path=None, provider=None, *args,
             mode=None, sep=None, header=None, **kwargs):
    result = True
    md = Resource(path, provider, format='csv', mode=mode,
                  sep=sep, header=header, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define csv defaults
    options['header'] = options.get('header', None) or 'true'
    options['sep'] = options.get('sep', None) or ','
    options['mode'] = options.get('mode', None) or 'overwrite'

    pcols = options.pop('partitionBy', None) or []
    pcols = pcols if isinstance(pcols, (list, tuple)) else [pcols]

    local = self.is_spark_local()

    ts_start = timer()
    try:
        # three approaches: file+local, file+cluster, and service
        if md['service'] == 'file' and local:
            obj.coalesce(1).write \
                .partitionBy(*pcols) \
                .format('csv') \
                .mode(options['mode']) \
                .options(**options) \
                .csv(md['url'], **options)
            directory_to_file(md['url'])
        elif md['service'] == 'file':
            if os.path.exists(md['url']) and os.path.isdir(md['url']):
                shutil.rmtree(md['url'])

            # save with pandas, mapping spark-style options to pandas arguments
            obj.toPandas().to_csv(
                md['url'],
                mode='w' if options['mode'] == 'overwrite' else 'a',
                header=(options['header'] == 'true'),
                sep=options['sep'])
        elif md['service'] in ['hdfs', 's3a']:
            obj.write \
                .partitionBy(*pcols) \
                .format('csv') \
                .mode(options['mode']) \
                .options(**options) \
                .csv(md['url'], **options)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            result = False

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
        result = False
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    self.save_log(md, options, ts_start)
    return result
def save_jdbc(self, obj, path=None, provider=None, *args, mode=None, **kwargs):
    result = True
    md = Resource(path, provider, format='jdbc', mode=mode, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define jdbc defaults
    options['mode'] = options.get('mode', None) or 'overwrite'

    # avoid multi-processing and distributed writes on sqlite
    if md['service'] == 'sqlite':
        local = self.is_spark_local()
        if not local:
            raise ValueError(
                'write to sqlite can only be done from a local cluster')
            # todo:
            # sketched solution: obj.toPandas().to_sql(md['url']...

    # coalesce to a single writer
    obj = obj.coalesce(1)

    # partitioning is meaningless here
    pcols = options.pop('partitionBy', None) or []

    ts_start = timer()
    try:
        if md['service'] in [
                'sqlite', 'mysql', 'postgres', 'mssql', 'clickhouse', 'oracle'
        ]:
            writer = obj.write \
                .format('jdbc') \
                .option('url', md['url']) \
                .option("dbtable", md['table']) \
                .option("driver", md['driver'])

            if md['user']:
                writer = writer.option("user", md['user'])

            if md['password']:
                writer = writer.option('password', md['password'])

            writer.options(**options).mode(options['mode']).save()
        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            result = False

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
        result = False
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    self.save_log(md, options, ts_start)
    return result
def list(self, provider, path=''):
    df_schema = T.StructType([
        T.StructField('name', T.StringType(), True),
        T.StructField('type', T.StringType(), True)
    ])

    df_empty = self.context.createDataFrame(data=(), schema=df_schema)

    md = Resource(path, provider)

    try:
        if md['service'] in ['local', 'file']:
            lst = []
            rootpath = os.path.join(md['url'], path)
            for f in os.listdir(rootpath):
                fullpath = os.path.join(rootpath, f)
                if os.path.isfile(fullpath):
                    obj_type = 'FILE'
                elif os.path.isdir(fullpath):
                    obj_type = 'DIRECTORY'
                elif os.path.islink(fullpath):
                    obj_type = 'LINK'
                elif os.path.ismount(fullpath):
                    obj_type = 'MOUNT'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = f
                lst += [(obj_name, obj_type)]

            if lst:
                df = self.context.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        elif md['service'] in ['hdfs', 'minio', 's3a']:
            sc = self.context._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem

            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())
            obj = fs.listStatus(Path(md['url']))

            lst = []
            for i in range(len(obj)):
                if obj[i].isFile():
                    obj_type = 'FILE'
                elif obj[i].isDirectory():
                    obj_type = 'DIRECTORY'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = obj[i].getPath().getName()
                lst += [(obj_name, obj_type)]

            if lst:
                df = self.context.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        elif md['format'] == 'jdbc':
            # remove options from database, if any
            database = md["database"].split('?')[0]
            schema = md['schema']

            if md['service'] == 'mssql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'oracle':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM all_tables
                          WHERE owner='{schema}'
                        ) as query
                        """
            elif md['service'] == 'mysql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'postgres':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
            else:
                # vanilla query ... for other databases
                query = """
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                        ) as query
                        """

            obj = self.context.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option("dbtable", query) \
                .option("driver", md['driver']) \
                .option("user", md['user']) \
                .option('password', md['password']) \
                .load()

            # load the data from jdbc
            lst = []
            for x in obj.select('table_name', 'table_type').collect():
                lst.append((x.table_name, x.table_type))

            if lst:
                df = self.context.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df
        else:
            logging.error({
                'md': md,
                'error_msg':
                f'List resource on service "{md["service"]}" not implemented'
            })
            return df_empty
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return df_empty
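# Self-contained sketch of the JDBC subquery trick used in list() above:
# Spark's JDBC reader accepts a parenthesized SELECT aliased as a table in the
# 'dbtable' option, so the catalog lookup is wrapped as "( ... ) as query".
# The schema name below is a placeholder for illustration.

schema = 'public'  # hypothetical schema name

query = f"""
        ( SELECT table_name, table_type
          FROM information_schema.tables
          WHERE table_schema = '{schema}'
        ) as query
        """

# the resulting string is what gets passed to .option("dbtable", query)
print(query)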
def copy(self, md_src, md_trg, mode='append'):
    # timer
    timer_start = timer()

    # src dataframe
    df_src = self.load(md_src)

    # if no path on target, get it from src
    if not md_trg['resource_path']:
        md_trg = resource.metadata(self._rootdir, self._metadata,
                                   md_src['resource_path'],
                                   md_trg['provider_alias'])

    # logging
    log_data = {
        'src_hash': md_src['hash'],
        'src_path': md_src['resource_path'],
        'trg_hash': md_trg['hash'],
        'trg_path': md_trg['resource_path'],
        'mode': mode,
        'updated': False,
        'records_read': 0,
        'records_add': 0,
        'records_del': 0,
        'columns': 0,
        'time': timer() - timer_start
    }

    # could not read source, log error and return
    if df_src is None:
        logging.error(log_data)
        return

    num_rows = df_src.count()
    num_cols = len(df_src.columns)

    # empty source, log notice and return
    if num_rows == 0 and mode == 'append':
        log_data['time'] = timer() - timer_start
        logging.notice(log_data)
        return

    # overwrite target, save, log notice/error and return
    if mode == 'overwrite':
        if md_trg['state_column']:
            df_src = df_src.withColumn('_state', F.lit(0))

        result = self.save(df_src, md_trg, mode=mode)

        log_data['time'] = timer() - timer_start
        log_data['records_read'] = num_rows
        log_data['records_add'] = num_rows
        log_data['columns'] = num_cols

        logging.notice(log_data) if result else logging.error(log_data)
        return

    # trg dataframe (if exists)
    try:
        df_trg = self.load(md_trg, catch_exception=False)
    except:
        df_trg = dataframe.empty(df_src)

    # de-dup (exclude the _updated column)

    # create a view from the extracted log
    df_trg = dataframe.view(df_trg)

    # capture added records
    df_add = dataframe.diff(
        df_src, df_trg,
        ['_date', '_datetime', '_updated', '_hash', '_state'])
    rows_add = df_add.count()

    # capture deleted records
    rows_del = 0
    if md_trg['state_column']:
        df_del = dataframe.diff(
            df_trg, df_src,
            ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_del = df_del.count()

    updated = (rows_add + rows_del) > 0

    num_cols = len(df_add.columns)
    num_rows = max(df_src.count(), df_trg.count())

    # save diff
    if updated:
        if md_trg['state_column']:
            df_add = df_add.withColumn('_state', F.lit(0))
            df_del = df_del.withColumn('_state', F.lit(1))
            df = df_add.union(df_del)
        else:
            df = df_add

        result = self.save(df, md_trg, mode=mode)
    else:
        result = True

    log_data.update({
        'updated': updated,
        'records_read': num_rows,
        'records_add': rows_add,
        'records_del': rows_del,
        'columns': num_cols,
        'time': timer() - timer_start
    })

    logging.notice(log_data) if result else logging.error(log_data)
def list(self, provider, path=None, **kwargs):
    df_schema = T.StructType([
        T.StructField('name', T.StringType(), True),
        T.StructField('type', T.StringType(), True)
    ])

    df_empty = self.context.createDataFrame(data=(), schema=df_schema)

    md = Resource(path, provider, **kwargs)

    try:
        if md['service'] in ['local', 'file']:
            lst = []
            rootpath = md['url']
            for f in os.listdir(rootpath):
                fullpath = os.path.join(rootpath, f)
                if os.path.isfile(fullpath):
                    obj_type = 'FILE'
                elif os.path.isdir(fullpath):
                    obj_type = 'DIRECTORY'
                elif os.path.islink(fullpath):
                    obj_type = 'LINK'
                elif os.path.ismount(fullpath):
                    obj_type = 'MOUNT'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = f
                lst += [(obj_name, obj_type)]

            if lst:
                df = self.context.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        elif md['service'] in ['hdfs', 's3a']:
            sc = self.context._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem

            parsed = urnparse(md['url'])

            if md['service'] == 's3a':
                path = parsed.path.split('/')
                url = 's3a://' + path[0]
                path = '/' + '/'.join(path[1:]) if len(path) > 1 else '/'

            if md['service'] == 'hdfs':
                host_port = f"{parsed.host}:{parsed.port}" if parsed.port else parsed.hosts
                url = f'hdfs://{host_port}'
                path = '/' + parsed.path

            try:
                fs = FileSystem.get(URI(url), sc._jsc.hadoopConfiguration())
                obj = fs.listStatus(Path(path))
            except:
                logging.error(f'An error occurred accessing {url}{path}')
                obj = []

            lst = []
            for i in range(len(obj)):
                if obj[i].isFile():
                    obj_type = 'FILE'
                elif obj[i].isDirectory():
                    obj_type = 'DIRECTORY'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = obj[i].getPath().getName()
                lst += [(obj_name, obj_type)]

            if lst:
                df = self.context.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        elif md['format'] == 'jdbc':
            # remove options from database, if any
            database = md["database"].split('?')[0]
            schema = md['schema']
            table = md['table']

            if database and table:
                try:
                    obj = self.context.read \
                        .format('jdbc') \
                        .option('url', md['url']) \
                        .option("dbtable", table) \
                        .option("driver", md['driver']) \
                        .option("user", md['user']) \
                        .option('password', md['password']) \
                        .load()
                    info = [(i.name, i.dataType.simpleString())
                            for i in obj.schema]
                except:
                    info = []

                if info:
                    return self.context.createDataFrame(info, ['name', 'type'])

            if md['service'] == 'mssql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM INFORMATION_SCHEMA.TABLES
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'oracle':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM all_tables
                          WHERE owner='{schema}'
                        ) as query
                        """
            elif md['service'] == 'mysql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'postgres':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
            else:
                # vanilla query ... for other databases
                query = """
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                        ) as query
                        """

            obj = self.context.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option("dbtable", query) \
                .option("driver", md['driver']) \
                .option("user", md['user']) \
                .option('password', md['password']) \
                .load()

            # load the data from jdbc
            lst = []
            for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect():
                lst.append((x.TABLE_NAME, x.TABLE_TYPE))

            if lst:
                df = self.context.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df
        else:
            logging.error({
                'md': md,
                'error_msg':
                f'List resource on service "{md["service"]}" not implemented'
            })
            return df_empty
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return df_empty
def save_json(self, obj, path=None, provider=None, *args,
              mode=None, lines=None, **kwargs):
    md = Resource(path, provider, format='json', mode=mode,
                  lines=lines, **kwargs)

    options = md['options']

    # after collecting from metadata, or method call, define json defaults
    options['mode'] = options['mode'] or 'overwrite'
    options['lines'] = options['lines'] or True

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if local and md['service'] == 'file' and options['lines']:
            obj.coalesce(1).write \
                .format('json') \
                .mode(options['mode']) \
                .options(**options) \
                .json(md['url'])
            self.directory_to_file(md['url'])

        elif md['service'] == 'file':
            # fallback, use pandas
            # save single files, not directories
            if os.path.exists(md['url']) and os.path.isdir(md['url']):
                shutil.rmtree(md['url'])

            # save with pandas (json lines output requires orient='records')
            obj.toPandas().to_json(md['url'],
                                   orient='records',
                                   lines=options['lines'])

        elif md['service'] in ['hdfs', 's3a']:
            obj.write \
                .format('json') \
                .mode(options['mode']) \
                .options(**options) \
                .json(md['url'])

        else:
            logging.error(f'Unknown resource service "{md["service"]}"',
                          extra={'md': to_dict(md)})
            return False

    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True