def load_raw_cdc(self, path=None, provider=None, catch_exception=True, **kargs):
    """
    Load all versions of the cdc database.
    :return: list of dataframes, one per version
    """
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = get_metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path

    try:
        versions = self.list(md)
        versions = [version for version in versions if '_version=' in version]
    except Exception:
        versions = []

    dataframes = []
    for version in versions:
        md_copy = copy.deepcopy(md)
        md_copy['url'] += '/' + version
        obj = self.load(path=md_copy, catch_exception=catch_exception, **kargs)
        dataframes.append(obj)

    return dataframes
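# Usage sketch (hypothetical, not part of the original module): assuming an
# engine instance `engine` and a CDC resource alias '.etl.sales' defined in the
# project metadata, loading every stored version could look like:
#
#   dfs = engine.load_raw_cdc('.etl.sales', provider='datalake')
#   for df in dfs:
#       df.printSchema()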
def list(self, provider):
    if isinstance(provider, YamlDict):
        md = provider.to_dict()
    elif isinstance(provider, str):
        md = get_metadata(self._rootdir, self._metadata, None, provider)
    elif isinstance(provider, dict):
        md = provider
    else:
        logging.warning(f'{str(provider)} cannot be used to reference a provider')
        return []

    try:
        if md['service'] in ['local', 'file']:
            d = []
            for f in os.listdir(md['provider_path']):
                d.append(os.path.join(md['provider_path'], f))
            return d

        elif md['service'] == 'hdfs':
            sc = self._ctx._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())
            obj = fs.listStatus(Path(md['url']))
            tables = [obj[i].getPath().getName() for i in range(len(obj))]
            return tables

        elif md['format'] == 'jdbc':
            if md['service'] == 'mssql':
                query = "(SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE') as query"
            elif md['service'] == 'oracle':
                query = "(SELECT table_name FROM all_tables WHERE owner='schema_name') as query"
            elif md['service'] == 'mysql':
                query = f"(SELECT table_name FROM information_schema.tables WHERE table_schema='{md['database']}') as query"
            elif md['service'] == 'pgsql':
                query = "(SELECT table_name FROM information_schema.tables) as query"
            else:
                # vanilla query ... for other databases
                query = "(SELECT table_name FROM information_schema.tables) as query"

            # load the list of tables via jdbc
            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', query) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .load()

            return [x.TABLE_NAME for x in obj.select('TABLE_NAME').collect()]

        else:
            logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
            return []
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e
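# Usage sketch (hypothetical): listing the objects exposed by a provider alias
# 'warehouse' defined in the project metadata; the return value is a plain list
# of table names (jdbc) or paths (local/hdfs):
#
#   names = engine.list('warehouse')
#   print(names)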
def load_cdc(self, path=None, provider=None, date=None, catch_exception=True, **kargs):
    """
    Load the cdc database at the given date. If date is None, the latest version is loaded.
    :param date: the query datetime object
    """
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = get_metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path

    version = self.find_version(date, md)

    md_copy = copy.deepcopy(md)
    md_copy['url'] += '/_version=' + version

    obj = self.load(path=md_copy, catch_exception=catch_exception, **kargs)

    if date is not None:
        obj = obj.filter(F.col('_updated') <= date)
        obj = dataframe.view(obj)

    return obj
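# Usage sketch (hypothetical): loading the CDC state as of a point in time;
# `date` is assumed to be a datetime comparable with the '_updated' column:
#
#   from datetime import datetime
#   df = engine.load_cdc('.etl.sales', date=datetime(2019, 1, 31))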
def load(self, path=None, provider=None, catch_exception=True, **kargs):
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = get_metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kargs)
    core_end = timer()
    if obj is None:
        return obj

    prep_start = timer()

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(
        obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])

    prep_end = timer()

    log_data = {
        'md': dict(md),
        'mode': kargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    if obj is not None:
        logging.info(log_data)
    else:
        logging.error(log_data)

    return obj
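# Usage sketch (hypothetical): loading a resource by alias; date filtering and
# caching are driven by the resource metadata (date_column, date_start,
# date_end, date_window, cache):
#
#   df = engine.load('.etl.sales', provider='datalake')
#   if df is not None:
#       print(df.count(), len(df.columns))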
def save(self, obj, path=None, provider=None, **kargs):
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = get_metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path

    prep_start = timer()

    if md['date_partition'] and md['date_column']:
        tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
        obj = dataframe.add_datetime_columns(obj, column=md['date_column'], tzone=tzone)
        kargs['partitionBy'] = ['_date'] + kargs.get('partitionBy', md.get('options', {}).get('partitionBy', []))

    if md['version_column']:
        kargs['partitionBy'] = ['_version'] + kargs.get('partitionBy', [])

    if md['update_column']:
        obj = dataframe.add_update_column(obj, tzone=self._timezone)

    if md['hash_column']:
        obj = dataframe.add_hash_column(obj, cols=md['hash_column'],
                                        exclude_cols=['_date', '_datetime', '_updated', '_hash', '_state', '_version'])

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(
        obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    obj = dataframe.cache(obj, md['cache'])

    num_rows = obj.count()
    num_cols = len(obj.columns)

    # force 1 file per partition, just before saving
    obj = obj.repartition(1, *kargs['partitionBy']) if kargs.get('partitionBy') else obj.repartition(1)
    # obj = obj.coalesce(1)

    prep_end = timer()

    core_start = timer()
    result = self.save_dataframe(obj, md, **kargs)
    core_end = timer()

    log_data = {
        'md': dict(md),
        'mode': kargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': core_end - prep_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    if result:
        logging.info(log_data)
    else:
        logging.error(log_data)

    return result
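# Usage sketch (hypothetical): saving a dataframe to a resource alias; extra
# keyword arguments such as mode or partitionBy are forwarded to save_dataframe:
#
#   ok = engine.save(df, '.etl.sales_out', provider='datalake', mode='overwrite')
#   assert ok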
def resource(self, path=None, provider=None, md=dict()):
    if not self.profile:
        raise ValueError("No project profile loaded. Try first: datalabframework.project.load(...)")

    md = resource.get_metadata(paths.rootdir(), self._metadata, path, provider, md)
    return md
def list(self, provider, path=''):
    df_schema = T.StructType([
        T.StructField('name', T.StringType(), True),
        T.StructField('type', T.StringType(), True)
    ])

    df_empty = self._ctx.createDataFrame(data=(), schema=df_schema)

    if isinstance(provider, str):
        md = get_metadata(self._rootdir, self._metadata, None, provider)
    elif isinstance(provider, dict):
        md = provider
    else:
        logging.warning(f'{str(provider)} cannot be used to reference a provider')
        return df_empty

    try:
        if md['service'] in ['local', 'file']:
            lst = []
            rootpath = os.path.join(md['provider_path'], path)
            for f in os.listdir(rootpath):
                fullpath = os.path.join(rootpath, f)
                if os.path.isfile(fullpath):
                    obj_type = 'FILE'
                elif os.path.isdir(fullpath):
                    obj_type = 'DIRECTORY'
                elif os.path.islink(fullpath):
                    obj_type = 'LINK'
                elif os.path.ismount(fullpath):
                    obj_type = 'MOUNT'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = f
                lst += [(obj_name, obj_type)]

            if lst:
                df = self._ctx.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        elif md['service'] in ['hdfs', 'minio', 's3a']:
            sc = self._ctx._sc
            URI = sc._gateway.jvm.java.net.URI
            Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
            FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
            fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

            provider_path = md['provider_path'] if md['service'] == 'hdfs' else '/'
            obj = fs.listStatus(Path(os.path.join(provider_path, path)))

            lst = []
            for i in range(len(obj)):
                if obj[i].isFile():
                    obj_type = 'FILE'
                elif obj[i].isDirectory():
                    obj_type = 'DIRECTORY'
                else:
                    obj_type = 'UNDEFINED'

                obj_name = obj[i].getPath().getName()
                lst += [(obj_name, obj_type)]

            if lst:
                df = self._ctx.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        elif md['format'] == 'jdbc':
            # remove options from the database name, if any
            database = md['database'].split('?')[0]
            schema = md['schema']
            if md['service'] == 'mssql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM INFORMATION_SCHEMA.TABLES
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'oracle':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM all_tables
                          WHERE table_schema='{schema}'
                        ) as query
                        """
            elif md['service'] == 'mysql':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema='{database}'
                        ) as query
                        """
            elif md['service'] == 'postgres':
                query = f"""
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                          WHERE table_schema = '{schema}'
                        ) as query
                        """
            else:
                # vanilla query ... for other databases
                query = """
                        ( SELECT table_name, table_type
                          FROM information_schema.tables
                        ) as query
                        """

            # load the list of tables via jdbc
            obj = self._ctx.read \
                .format('jdbc') \
                .option('url', md['url']) \
                .option('dbtable', query) \
                .option('driver', md['driver']) \
                .option('user', md['username']) \
                .option('password', md['password']) \
                .load()

            lst = []
            for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect():
                lst.append((x.TABLE_NAME, x.TABLE_TYPE))

            if lst:
                df = self._ctx.createDataFrame(lst, ['name', 'type'])
            else:
                df = df_empty

            return df

        else:
            logging.error({
                'md': md,
                'error_msg': f'List resource on service "{md["service"]}" not implemented'
            })
            return df_empty
    except Exception as e:
        logging.error({'md': md, 'error_msg': str(e)})
        raise e
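# Usage sketch (hypothetical): unlike the list-returning variant above, this
# method returns a Spark dataframe with 'name' and 'type' columns:
#
#   engine.list('datalake', path='landing/sales').show()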