def list(self, provider):
    if isinstance(provider, YamlDict):
        md = provider.to_dict()
    elif isinstance(provider, str):
        md = get_metadata(self._rootdir, self._metadata, None, provider)
    elif isinstance(provider, dict):
        md = provider
    else:
        logging.warning(f'{str(provider)} cannot be used to reference a provider')
        return []

    try:
        if md['service'] in ['local', 'file']:
            d = []
            for f in os.listdir(md['provider_path']):
                d.append(os.path.join(md['provider_path'], f))
            return d
        elif md['service'] == 'hdfs':
            sc = self._ctx._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

            obj = fs.listStatus(Path(md['url']))
            tables = [obj[i].getPath().getName() for i in range(len(obj))]
            return tables
        elif md['format'] == 'jdbc':
            if md['service'] == 'mssql':
                query = "(SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE') as query"
            elif md['service'] == 'oracle':
                query = "(SELECT table_name FROM all_tables WHERE owner='schema_name') as query"
            elif md['service'] == 'mysql':
                query = f"(SELECT table_name FROM information_schema.tables WHERE table_schema='{md['database']}') as query"
            else:
                # vanilla query, valid for pgsql and other information_schema-compliant databases
                query = "(SELECT table_name FROM information_schema.tables) as query"

            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', query) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .load()

            # collect the table names from the jdbc query
            return [x.TABLE_NAME for x in obj.select('TABLE_NAME').collect()]
        else:
            logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
            return []
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e
def read(file_paths=None):
    """
    Return all profiles, stored in a nested dictionary.
    Profiles are merged over the provided list of profile files;
    the list order determines how each named profile is overridden.

    :param file_paths: list of yaml files
    :return: dict of profiles
    """
    profiles = {}
    file_paths = file_paths or []

    for filename in file_paths:
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            docs = []
            try:
                # use the safe loader explicitly
                docs = list(yaml.load_all(f, Loader=yaml.SafeLoader))
            except yaml.YAMLError as e:
                if hasattr(e, 'problem_mark'):
                    mark = e.problem_mark
                    logging.error(
                        'Error loading yml file {} at position ({}:{}): skipping file'
                        .format(filename, mark.line + 1, mark.column + 1))
            for doc in docs:
                doc['profile'] = doc.get('profile', 'default')
                profiles[doc['profile']] = merge(profiles.get(doc['profile'], {}), doc)

    return profiles
def save(self, obj, path=None, provider=None, *args, format=None, mode=None, **kwargs):
    md = Resource(path, provider, format=format, mode=mode, **kwargs)

    if md['format'] == 'csv':
        return self.save_csv(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'tsv':
        kwargs['sep'] = '\t'
        return self.save_csv(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'json':
        return self.save_json(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'jsonl':
        return self.save_json(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'parquet':
        return self.save_parquet(obj, path, provider, mode=mode, **kwargs)
    elif md['format'] == 'jdbc':
        return self.save_jdbc(obj, path, provider, mode=mode, **kwargs)
    else:
        # report the unknown format, not the service
        logging.error(f'Unknown format "{md["format"]}"', extra={'md': md})
        return False
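# A minimal usage sketch for the save dispatcher above. The engine handle and
# the resource alias/provider names are hypothetical; path/provider pairs are
# resolved into a Resource through the loaded metadata profile.
#
#   engine.save(df, 'exports/users', provider='local-parquet')
#   engine.save(df, 'exports/users.tsv', provider='local-files')  # tsv implies sep='\t'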
def read(file_paths=None):
    """
    Return all profiles, stored in a nested dictionary.
    Profiles are merged over the provided list of metadata files;
    the order of the list determines how profile properties are overridden.

    :param file_paths: list of yaml file paths
    :return: dict of profiles
    """
    global loaded_md_files, profiles

    # empty the profiles before starting to read
    profiles = {}
    file_paths = file_paths or []
    loaded_md_files = []

    for filename in file_paths:
        if not os.path.isfile(filename):
            continue
        with open(filename, 'r') as f:
            docs = []
            try:
                # use the safe loader explicitly
                docs = list(yaml.load_all(f, Loader=yaml.SafeLoader))
                loaded_md_files.append(filename)
            except yaml.YAMLError as e:
                if hasattr(e, 'problem_mark'):
                    mark = e.problem_mark
                    logging.error(
                        'Error loading yml file {} at position ({}:{}): skipping file'
                        .format(filename, mark.line + 1, mark.column + 1))
            for doc in docs:
                doc['profile'] = doc.get('profile', 'default')
                profiles[doc['profile']] = merge(profiles.get(doc['profile'], {}), doc)

    return profiles
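# A sketch of how multi-document yaml metadata maps to profiles, assuming a
# hypothetical metadata.yml; merge() performs a recursive dict merge, and docs
# with the same profile name accumulate across the listed files, later files
# overriding earlier ones.
#
#   # metadata.yml
#   # ---
#   # profile: default
#   # engine:
#   #     type: spark
#   # ---
#   # profile: prod
#   # engine:
#   #     master: spark://cluster:7077
#
#   profiles = read(['metadata.yml', 'metadata.override.yml'])
#   profiles['prod']['engine']['master']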
def start_context(self, conf):
    try:
        # init the spark session
        session = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()

        # fix SQLContext for backward compatibility
        self.initialize_spark_sql_context(session, session.sparkContext)

        # pyspark set log level method
        # (this will not suppress WARN messages emitted before the context starts)
        session.sparkContext.setLogLevel("ERROR")
        return session
    except Exception as e:
        logging.error(f'Could not start the engine context: {e}')
        return None
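# Usage sketch: build a standard SparkConf and hand it to start_context. The
# property keys below are stock spark settings, not project-specific; the
# engine handle is an assumption.
#
#   conf = pyspark.SparkConf() \
#       .setAppName('datalabframework') \
#       .setMaster('local[*]')
#   session = engine.start_context(conf)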
def load(self, path=None, provider=None, *args, format=None, **kwargs):
    md = Resource(path, provider, format=format, **kwargs)

    if md['format'] == 'csv':
        return self.load_csv(path, provider, **kwargs)
    elif md['format'] == 'json':
        return self.load_json(path, provider, **kwargs)
    elif md['format'] == 'parquet':
        return self.load_parquet(path, provider, **kwargs)
    elif md['format'] == 'jdbc':
        return self.load_jdbc(path, provider, **kwargs)
    else:
        logging.error(f'Unknown resource format "{md["format"]}"', extra={'md': to_dict(md)})
        return None
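# Usage sketch, mirroring save(): the format is inferred from the resource
# metadata (the alias and provider names are hypothetical):
#
#   df = engine.load('raw/users.csv', provider='local-files')
#   df = engine.load('users', provider='my-postgres')   # resolved as jdbc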
def save(self, obj, path=None, provider=None, **kargs):
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = resource.metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path
    else:
        logging.error(f'{str(path)} cannot be used to reference a resource')
        return False

    prep_start = timer()
    options = md['options'] or {}

    if md['date_partition'] and md['date_column']:
        tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
        obj = dataframe.add_datetime_columns(obj, column=md['date_column'], tzone=tzone)
        kargs['partitionBy'] = ['_date'] + kargs.get('partitionBy', options.get('partitionBy', []))

    if md['update_column']:
        obj = dataframe.add_update_column(obj, tzone=self._timezone)

    if md['hash_column']:
        obj = dataframe.add_hash_column(obj, cols=md['hash_column'],
                                        exclude_cols=['_date', '_datetime', '_updated', '_hash', '_state'])

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'], md['date_end'], md['date_window'])
    obj = dataframe.cache(obj, md['cache'])

    num_rows = obj.count()
    num_cols = len(obj.columns)

    # force 1 file per partition, just before saving
    obj = obj.repartition(1, *kargs['partitionBy']) if kargs.get('partitionBy') else obj.repartition(1)
    # obj = obj.coalesce(1)

    prep_end = timer()

    core_start = timer()
    result = self.save_dataframe(obj, md, **kargs)
    core_end = timer()

    log_data = {
        'md': {i: md[i] for i in md if i != 'password'},
        'mode': kargs.get('mode', options.get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': core_end - prep_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    logging.info(log_data) if result else logging.error(log_data)

    return result
def info(self):
    if not self.loaded:
        logging.error("No project profile loaded. "
                      "Execute datalabframework.project.load(...) first.")
        return None

    return YamlDict({
        'version': __version__,
        'username': self._username,
        'session_name': self._session_name,
        'session_id': self._session_id,
        'profile': self._profile,
        'rootdir': paths.rootdir(),
        'script_path': self._script_path,
        'dotenv_path': self._dotenv_path,
        'notebooks_files': self._notebook_files,
        'python_files': self._python_files,
        'metadata_files': self._metadata_files,
        'repository': self._repo
    })
def Engine(engine_type=None, *args, **kwargs):
    global _engines

    if engine_type:
        if engine_type in _engines.keys():
            cls = _engines[engine_type]
            # instantiating the engine registers it as the singleton instance
            cls(*args, **kwargs)
        else:
            print('Could not create the Engine:')
            print('No matching engine type in', ', '.join(_engines.keys()))

    engine = _singleton['instance']
    if not engine:
        logging.error(
            'No Engine running yet. \n'
            'try datalabframework.engine(...) or datalabframework.project.load(...)')
    return engine
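# Usage sketch: instantiating a registered engine type stores it as the
# singleton; calling Engine() with no arguments returns the running instance.
# The 'spark' type name and the session_name parameter are assumptions about
# the _engines registry, not confirmed by this module.
#
#   Engine('spark', session_name='etl')
#   engine = Engine()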
def load_jdbc(self, path=None, provider=None, *args, **kwargs):
    obj = None

    md = Resource(path, provider, format='jdbc', **kwargs)
    options = md['options']

    try:
        if md['service'] in ['sqlite', 'mysql', 'postgres', 'mssql', 'oracle']:
            obj = self.context.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', md['table']) \
                .option('driver', md['driver']) \
                .option('user', md['user']) \
                .option('password', md['password']) \
                .options(**options)
            # load the data from jdbc
            obj = obj.load(**kwargs)
        else:
            logging.error(f'Unknown resource service "{md["service"]}"', extra={'md': to_dict(md)})
            return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    return obj
def load(self, path=None, provider=None, catch_exception=True, **kargs):
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = get_metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path
    else:
        logging.error(f'{str(path)} cannot be used to reference a resource')
        return None

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kargs)
    core_end = timer()
    if obj is None:
        return obj

    prep_start = timer()
    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])
    prep_end = timer()

    log_data = {
        'md': dict(md),
        'mode': kargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    # obj cannot be None here: the load failure path returned early above
    logging.info(log_data)

    return obj
def load_plus(self, path=None, provider=None, catch_exception=True, **kwargs):
    md = Resource(path, provider, **kwargs)

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kwargs)
    core_end = timer()
    if obj is None:
        return obj

    prep_start = timer()
    # date_column must be defined before filtering, as in load()
    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])
    prep_end = timer()

    log_data = {
        'md': md,
        'mode': kwargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    logging.info(log_data)

    obj.__name__ = path
    return obj
def save_parquet(self, obj, path=None, provider=None, *args, mode=None, **kwargs):
    md = Resource(path, provider, format='parquet', mode=mode, **kwargs)
    options = md['options']

    # after collecting from metadata, or method call, define parquet defaults
    options['mode'] = options.get('mode') or 'overwrite'

    local = self.is_spark_local()

    try:
        # three approaches: file+local, file+cluster, and service
        if md['service'] == 'file' and local:
            obj.coalesce(1).write \
                .format('parquet') \
                .mode(options['mode']) \
                .options(**options) \
                .parquet(md['url'])
        elif md['service'] == 'file':
            # fallback: save a single file via pandas, not a directory
            if os.path.exists(md['url']) and os.path.isdir(md['url']):
                shutil.rmtree(md['url'])
            # save with pandas (to_parquet takes no mode option)
            obj.toPandas().to_parquet(md['url'])
        elif md['service'] in ['hdfs', 's3a']:
            obj.write \
                .format('parquet') \
                .mode(options['mode']) \
                .options(**options) \
                .parquet(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"', extra={'md': to_dict(md)})
            return False
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True
def load_csv(self, path=None, provider=None, *args, sep=None, header=None, **kwargs):
    obj = None

    md = Resource(path, provider, sep=sep, header=header, **kwargs)
    options = md['options']

    # after collecting from metadata, or method call, define csv defaults
    # (check for None so an explicit False from the caller is preserved)
    if options.get('header') is None:
        options['header'] = True
    if options.get('inferSchema') is None:
        options['inferSchema'] = True
    options['sep'] = options.get('sep') or ','

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).csv(md['url'])
        elif md['service'] == 'file':
            logging.warning('local file + spark cluster: loading using pandas reader',
                            extra={'md': to_dict(md)})
            # pandas expects a header row index, not a boolean
            df = pd.read_csv(md['url'], sep=options['sep'],
                             header=0 if options['header'] else None)
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).csv(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"', extra={'md': to_dict(md)})
            return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    return obj
def load_parquet(self, path=None, provider=None, *args, mergeSchema=None, **kwargs):
    obj = None

    md = Resource(path, provider, format='parquet', mergeSchema=mergeSchema, **kwargs)
    options = md['options']

    # after collecting from metadata, or method call, define parquet defaults
    # (check for None so an explicit mergeSchema=False is preserved)
    if options.get('mergeSchema') is None:
        options['mergeSchema'] = True

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if md['service'] == 'file' and local:
            obj = self.context.read.options(**options).parquet(md['url'])
        elif md['service'] == 'file':
            logging.warning('local file + spark cluster: loading using pandas reader',
                            extra={'md': to_dict(md)})
            # fallback to the pandas reader, then convert to spark
            df = pd.read_parquet(md['url'])
            obj = self.context.createDataFrame(df)
        elif md['service'] in ['hdfs', 's3a']:
            obj = self.context.read.options(**options).parquet(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"', extra={'md': to_dict(md)})
            return obj
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error(e, extra={'md': md})

    return obj
def save_jdbc(self, obj, path=None, provider=None, *args, mode=None, **kwargs):
    md = Resource(path, provider, format='jdbc', mode=mode, **kwargs)
    options = md['options']

    # after collecting from metadata, or method call, define jdbc defaults
    options['mode'] = options.get('mode') or 'overwrite'

    try:
        if md['service'] in ['sqlite', 'mysql', 'postgres', 'mssql', 'oracle']:
            obj.write \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', md['table']) \
                .option('driver', md['driver']) \
                .option('user', md['user']) \
                .option('password', md['password']) \
                .options(**options) \
                .mode(options['mode']) \
                .save()
        else:
            logging.error(f'Unknown resource service "{md["service"]}"', extra={'md': to_dict(md)})
            return False
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True
def copy(self, md_src, md_trg, mode='append'):
    # timer
    timer_start = timer()

    # src dataframe
    df_src = self.load(md_src)

    # logging
    log_data = {
        'src_hash': md_src['hash'],
        'src_path': md_src['resource_path'],
        'trg_hash': md_trg['hash'],
        'trg_path': md_trg['resource_path'],
        'mode': mode,
        'updated': False,
        'records_read': 0,
        'records_add': 0,
        'records_del': 0,
        'columns': 0,
        'time': timer() - timer_start
    }

    # could not read source, log error and return
    if df_src is None:
        logging.error(log_data)
        return

    num_rows = df_src.count()
    num_cols = len(df_src.columns)

    # empty source, log notice and return
    if num_rows == 0 and mode == 'append':
        log_data['time'] = timer() - timer_start
        logging.notice(log_data)
        return

    # overwrite target, save, log notice/error and return
    if mode == 'overwrite':
        if md_trg['state_column']:
            df_src = df_src.withColumn('_state', F.lit(0))

        if md_trg['version_column']:
            df_src = dataframe.add_version_column(df_src, tzone=self._timezone)

        result = self.save(df_src, md_trg, mode=mode)

        log_data['time'] = timer() - timer_start
        log_data['records_read'] = num_rows
        log_data['records_add'] = num_rows
        log_data['columns'] = num_cols

        logging.notice(log_data) if result else logging.error(log_data)
        return

    # trg dataframe (if exists)
    try:
        if md_trg['version_column']:
            df_trg = self.load_cdc(md_trg, catch_exception=False)
        else:
            df_trg = self.load(md_trg, catch_exception=False)
    except Exception:
        df_trg = dataframe.empty(df_src)

    # if there is a schema change, create a new version, log notice/error and return
    if not dataframe.compare_schema(df_src, df_trg,
                                    ['_date', '_datetime', '_updated', '_hash', '_state', '_version']):
        if md_trg['state_column']:
            df_src = df_src.withColumn('_state', F.lit(0))

        if md_trg['version_column']:
            df_src = dataframe.add_version_column(df_src, tzone=self._timezone)

        result = self.save(df_src, md_trg, mode=mode)

        log_data['time'] = timer() - timer_start
        log_data['records_read'] = num_rows
        log_data['records_add'] = num_rows
        log_data['columns'] = num_cols

        logging.notice(log_data) if result else logging.error(log_data)
        return

    # de-dup (exclude the _updated column)
    # create a view from the extracted log
    df_trg = dataframe.view(df_trg)

    # capture added records
    df_add = dataframe.diff(df_src, df_trg,
                            ['_date', '_datetime', '_updated', '_hash', '_state', '_version'])
    rows_add = df_add.count()

    # capture deleted records
    rows_del = 0
    if md_trg['state_column']:
        df_del = dataframe.diff(df_trg, df_src,
                                ['_date', '_datetime', '_updated', '_hash', '_state', '_version'])
        rows_del = df_del.count()

    updated = (rows_add + rows_del) > 0

    num_cols = len(df_add.columns)
    num_rows = max(df_src.count(), df_trg.count())

    # save diff
    if updated:
        if md_trg['state_column']:
            df_add = df_add.withColumn('_state', F.lit(0))
            df_del = df_del.withColumn('_state', F.lit(1))
            df = df_add.union(df_del)
        else:
            df = df_add

        if md_trg['version_column']:
            version = self.find_version(md=md_trg)
            date = datetime.strptime(version, '%Y-%m-%d-%H-%M-%S') if version else None
            df = dataframe.add_version_column(df, version_time=date, tzone=self._timezone)

        result = self.save(df, md_trg, mode=mode)
    else:
        result = True

    log_data.update({
        'updated': updated,
        'records_read': num_rows,
        'records_add': rows_add,
        'records_del': rows_del,
        'columns': num_cols,
        'time': timer() - timer_start
    })

    logging.notice(log_data) if result else logging.error(log_data)
def save_dataframe(self, obj, md, **kargs):
    options = md.get('options', {})

    try:
        if md['service'] in ['local', 'file']:
            if md['format'] == 'csv':
                try:
                    obj.write.options(**options).csv(md['url'], **kargs)
                except Exception:
                    # fallback to the pandas writer
                    obj.toPandas().to_csv(md['url'], **kargs)
            elif md['format'] == 'json':
                try:
                    obj.write.options(**options).json(md['url'], **kargs)
                except Exception:
                    obj.toPandas().to_json(md['url'], **kargs)
            elif md['format'] == 'jsonl':
                try:
                    obj.write.options(**options).option('multiLine', True).json(md['url'], **kargs)
                except Exception:
                    obj.toPandas().to_json(md['url'], orient='records', lines=True, **kargs)
            elif md['format'] == 'parquet':
                try:
                    obj.write.options(**options).parquet(md['url'], **kargs)
                except Exception:
                    # pandas to_parquet takes no orient/lines options
                    obj.toPandas().to_parquet(md['url'], **kargs)
            else:
                logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                return False

        elif md['service'] in ['hdfs', 'minio']:
            if md['format'] == 'csv':
                obj.write.options(**options).csv(md['url'], **kargs)
            elif md['format'] == 'json':
                obj.write.options(**options).json(md['url'], **kargs)
            elif md['format'] == 'jsonl':
                obj.write.options(**options).option('multiLine', True).json(md['url'], **kargs)
            elif md['format'] == 'parquet':
                obj.write.options(**options).parquet(md['url'], **kargs)
            else:
                logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                return False

        elif md['service'] in ['sqlite', 'mysql', 'postgres', 'mssql', 'oracle']:
            obj.write \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', md['resource_path']) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .options(**options) \
                .save(**kargs)

        elif md['service'] == 'mongodb':
            obj.write \
                .format(md['format']) \
                .option('spark.mongodb.input.uri', md['url'] + '.' + md['resource_path']) \
                .options(**options) \
                .save(**kargs)

        elif md['service'] == 'elastic':
            mode = kargs.get('mode', None)
            obj = [row.asDict() for row in obj.collect()]
            elastic.write(obj, md['url'], mode, md['resource_path'],
                          options['settings'], options['mappings'])

        else:
            logging.error({'md': md, 'error_msg': f'Unknown service "{md["service"]}"'})
            return False
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True
def load_dataframe(self, md, catch_exception=True, **kargs):
    obj = None
    options = md['options']

    try:
        if md['service'] in ['local', 'file']:
            if md['format'] == 'csv':
                try:
                    obj = self._ctx.read.options(**options).csv(md['url'], **kargs)
                except Exception:
                    # fallback to the pandas reader, then convert to spark
                    obj = self._ctx.createDataFrame(pd.read_csv(md['url'], **kargs))
            elif md['format'] == 'json':
                try:
                    obj = self._ctx.read.options(**options).json(md['url'], **kargs)
                except Exception:
                    obj = self._ctx.createDataFrame(pd.read_json(md['url'], **kargs))
            elif md['format'] == 'jsonl':
                try:
                    obj = self._ctx.read.option('multiLine', True).options(**options).json(md['url'], **kargs)
                except Exception:
                    obj = self._ctx.createDataFrame(pd.read_json(md['url'], lines=True, **kargs))
            elif md['format'] == 'parquet':
                try:
                    obj = self._ctx.read.options(**options).parquet(md['url'], **kargs)
                except Exception:
                    obj = self._ctx.createDataFrame(pd.read_parquet(md['url'], **kargs))
            else:
                logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                return None

        elif md['service'] in ['hdfs', 'minio']:
            if md['format'] == 'csv':
                obj = self._ctx.read.options(**options).csv(md['url'], **kargs)
            elif md['format'] == 'json':
                obj = self._ctx.read.options(**options).json(md['url'], **kargs)
            elif md['format'] == 'jsonl':
                obj = self._ctx.read.option('multiLine', True).options(**options).json(md['url'], **kargs)
            elif md['format'] == 'parquet':
                obj = self._ctx.read.options(**options).parquet(md['url'], **kargs)
            else:
                logging.error({'md': md, 'error_msg': f'Unknown format "{md["format"]}"'})
                return None

        elif md['service'] in ['sqlite', 'mysql', 'postgres', 'mssql', 'oracle']:
            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', md['resource_path']) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .options(**options)
            # load the data from jdbc
            obj = obj.load(**kargs)

        elif md['service'] == 'mongodb':
            obj = self._ctx.read \
                .format(md['format']) \
                .option('spark.mongodb.input.uri', md['url'] + '.' + md['resource_path']) \
                .options(**options)
            # load the data
            obj = obj.load(**kargs)

        elif md['service'] == 'elastic':
            results = elastic.read(md['url'], options.get('query', {}))
            rows = [pyspark.sql.Row(**r) for r in results]
            obj = self._ctx.createDataFrame(rows)

        else:
            logging.error({'md': md, 'error_msg': f'Unknown service "{md["service"]}"'})
    except Exception as e:
        if catch_exception:
            logging.error({'md': md, 'error': str(e)})
            return None
        else:
            raise e

    return obj
def list(self, provider, path=''):
    df_schema = T.StructType([
        T.StructField('name', T.StringType(), True),
        T.StructField('type', T.StringType(), True)])

    df_empty = self._ctx.createDataFrame(data=(), schema=df_schema)

    if isinstance(provider, YamlDict):
        md = provider.to_dict()
    elif isinstance(provider, str):
        md = resource.metadata(self._rootdir, self._metadata, None, provider)
    elif isinstance(provider, dict):
        md = provider
    else:
        logging.warning(f'{str(provider)} cannot be used to reference a provider')
        return df_empty

    try:
        if md['service'] in ['local', 'file']:
            lst = []
            rootpath = os.path.join(md['provider_path'], path)
            for f in os.listdir(rootpath):
                fullpath = os.path.join(rootpath, f)
                if os.path.isfile(fullpath):
                    obj_type = 'FILE'
                elif os.path.isdir(fullpath):
                    obj_type = 'DIRECTORY'
                elif os.path.ismount(fullpath):
                    obj_type = 'MOUNT'
                elif os.path.islink(fullpath):
                    obj_type = 'LINK'
                else:
                    obj_type = 'UNDEFINED'
                lst += [(f, obj_type)]
            return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty

        elif md['service'] in ['hdfs', 'minio']:
            sc = self._ctx._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

            provider_path = md['provider_path'] if md['service'] == 'hdfs' else '/'
            obj = fs.listStatus(Path(os.path.join(provider_path, path)))

            lst = []
            for i in range(len(obj)):
                if obj[i].isFile():
                    obj_type = 'FILE'
                elif obj[i].isDirectory():
                    obj_type = 'DIRECTORY'
                else:
                    obj_type = 'UNDEFINED'
                lst += [(obj[i].getPath().getName(), obj_type)]
            return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty

        elif md['format'] == 'jdbc':
            # remove options from the database name, if any
            database = md['database'].split('?')[0]
            schema = md['schema']
            if md['service'] == 'mssql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM INFORMATION_SCHEMA.TABLES
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'oracle':
                # oracle exposes table_name/table_type per owner via all_catalog
                query = f"""
                        ( SELECT table_name, table_type
                          FROM all_catalog
                          WHERE owner='{schema}'
                        ) as query
                        """
            elif md['service'] == 'mysql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'postgres':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
            else:
                # vanilla query ... for other databases
                query = """
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                        ) as query
                        """

            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', query) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .load()

            # collect the table list from the jdbc query
            lst = [(x.TABLE_NAME, x.TABLE_TYPE) for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect()]
            return self._ctx.createDataFrame(lst, ['name', 'type']) if lst else df_empty

        else:
            logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
            return df_empty
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e
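# Usage sketch (provider aliases are hypothetical): every branch returns a
# two-column (name, type) dataframe, so results can be inspected uniformly:
#
#   engine.list('local-files', path='raw').show()
#   engine.list('my-postgres').show()   # tables and views of the configured schema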
def save_json(self, obj, path=None, provider=None, *args, mode=None, lines=None, **kwargs):
    md = Resource(path, provider, format='json', mode=mode, lines=lines, **kwargs)
    options = md['options']

    # after collecting from metadata, or method call, define json defaults
    # (check for None so an explicit lines=False is preserved)
    options['mode'] = options.get('mode') or 'overwrite'
    if options.get('lines') is None:
        options['lines'] = True

    local = self.is_spark_local()

    try:
        # three approaches: local, cluster, and service
        if local and md['service'] == 'file' and options['lines']:
            obj.coalesce(1).write \
                .format('json') \
                .mode(options['mode']) \
                .options(**options) \
                .json(md['url'])
            self.directory_to_file(md['url'])
        elif md['service'] == 'file':
            # fallback, use pandas
            # save single files, not directories
            if os.path.exists(md['url']) and os.path.isdir(md['url']):
                shutil.rmtree(md['url'])
            # save with pandas (to_json has no mode option;
            # lines=True requires orient='records')
            obj.toPandas().to_json(md['url'], orient='records', lines=options['lines'])
        elif md['service'] in ['hdfs', 's3a']:
            obj.write \
                .format('json') \
                .mode(options['mode']) \
                .options(**options) \
                .json(md['url'])
        else:
            logging.error(f'Unknown resource service "{md["service"]}"', extra={'md': to_dict(md)})
            return False
    except AnalysisException as e:
        logging.error(str(e), extra={'md': md})
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e

    return True
def copy(self, md_src, md_trg, mode='append'):
    # timer
    timer_start = timer()

    # src dataframe
    df_src = self.load(md_src)

    # if no path on target, get it from src
    if not md_trg['resource_path']:
        md_trg = resource.metadata(self._rootdir, self._metadata,
                                   md_src['resource_path'], md_trg['provider_alias'])

    # logging
    log_data = {
        'src_hash': md_src['hash'],
        'src_path': md_src['resource_path'],
        'trg_hash': md_trg['hash'],
        'trg_path': md_trg['resource_path'],
        'mode': mode,
        'updated': False,
        'records_read': 0,
        'records_add': 0,
        'records_del': 0,
        'columns': 0,
        'time': timer() - timer_start
    }

    # could not read source, log error and return
    if df_src is None:
        logging.error(log_data)
        return

    num_rows = df_src.count()
    num_cols = len(df_src.columns)

    # empty source, log notice and return
    if num_rows == 0 and mode == 'append':
        log_data['time'] = timer() - timer_start
        logging.notice(log_data)
        return

    # overwrite target, save, log notice/error and return
    if mode == 'overwrite':
        if md_trg['state_column']:
            df_src = df_src.withColumn('_state', F.lit(0))

        result = self.save(df_src, md_trg, mode=mode)

        log_data['time'] = timer() - timer_start
        log_data['records_read'] = num_rows
        log_data['records_add'] = num_rows
        log_data['columns'] = num_cols

        logging.notice(log_data) if result else logging.error(log_data)
        return

    # trg dataframe (if exists)
    try:
        df_trg = self.load(md_trg, catch_exception=False)
    except Exception:
        df_trg = dataframe.empty(df_src)

    # de-dup (exclude the _updated column)
    # create a view from the extracted log
    df_trg = dataframe.view(df_trg)

    # capture added records
    df_add = dataframe.diff(df_src, df_trg,
                            ['_date', '_datetime', '_updated', '_hash', '_state'])
    rows_add = df_add.count()

    # capture deleted records
    rows_del = 0
    if md_trg['state_column']:
        df_del = dataframe.diff(df_trg, df_src,
                                ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_del = df_del.count()

    updated = (rows_add + rows_del) > 0

    num_cols = len(df_add.columns)
    num_rows = max(df_src.count(), df_trg.count())

    # save diff
    if updated:
        if md_trg['state_column']:
            df_add = df_add.withColumn('_state', F.lit(0))
            df_del = df_del.withColumn('_state', F.lit(1))
            df = df_add.union(df_del)
        else:
            df = df_add

        result = self.save(df, md_trg, mode=mode)
    else:
        result = True

    log_data.update({
        'updated': updated,
        'records_read': num_rows,
        'records_add': rows_add,
        'records_del': rows_del,
        'columns': num_cols,
        'time': timer() - timer_start
    })

    logging.notice(log_data) if result else logging.error(log_data)
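# Usage sketch for the incremental copy (the aliases and the resource.metadata
# call shape are assumptions): load the source, diff it against the target
# view, and append only the added/deleted rows.
#
#   md_src = resource.metadata(rootdir, metadata, 'events', 'landing')
#   md_trg = resource.metadata(rootdir, metadata, 'events', 'datalake')
#   engine.copy(md_src, md_trg, mode='append')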