Example #1
0
 def load_raw_cdc(self, path=None, provider=None, catch_exception=True, **kargs):
     """
     Load all version of the cdc database
     
     :return: list of dataframes at each versions 
     """
     
     if isinstance(path, YamlDict):
         md = path.to_dict()
     elif isinstance(path, str):
         md = get_metadata(self._rootdir, self._metadata, path, provider)
     elif isinstance(path, dict):
         md = path
         
     try:
         versions = self.list(md)
         versions = [version for version in versions if '_version=' in version]
     except:
         versions = []
         
     dataframes = []
     for version in versions:
         md_copy = copy.deepcopy(md)
         md_copy['url'] += '/' + version
         obj = self.load(path=md_copy, catch_exception=catch_exception, **kargs)
         dataframes.append(obj)
         
     return dataframes
Example #2
0
    def list(self, provider):
        if isinstance(provider, YamlDict):
            md = provider.to_dict()
        elif isinstance(provider, str):
            md = get_metadata(self._rootdir, self._metadata, None, provider)
        elif isinstance(provider, dict):
            md = provider
        else:
            logging.warning(f'{str(provider)} cannot be used to reference a provider')
            return []

        try:
            if md['service'] in ['local', 'file']:
                d = []
                for f in os.listdir(md['provider_path']):
                    d.append(os.path.join(md['provider_path'], f))
                return d
            elif md['service'] == 'hdfs':
                sc = self._ctx._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']), sc._jsc.hadoopConfiguration())

                obj = fs.listStatus(Path(md['url']))
                tables = [obj[i].getPath().getName() for i in range(len(obj))]
                return tables

            elif md['format'] == 'jdbc':
                if md['service'] == 'mssql':
                    query = "(SELECT table_name FROM INFORMATION_SCHEMA.TABLES WHERE TABLE_TYPE = 'BASE TABLE') as query"
                elif md['service'] == 'oracle':
                    query = "(SELECT table_name FROM all_tables WHERE owner='schema_name') as query"
                elif md['service'] == 'mysql':
                    query = f"(SELECT table_name FROM information_schema.tables where table_schema='{md['database']}') as query"
                elif md['service'] == 'pgsql':
                    query = f"(SELECT table_name FROM information_schema.tables) as query"
                else:
                    # vanilla query ... for other databases
                    query = f"(SELECT table_name FROM information_schema.tables) as query"

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                return [x.TABLE_NAME for x in obj.select('TABLE_NAME').collect()]
            else:
                logging.error({'md': md, 'error_msg': f'List resource on service "{md["service"]}" not implemented'})
                return []
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return []
Example #3
0
    def load_cdc(self,
                 path=None,
                 provider=None,
                 date=None,
                 catch_exception=True,
                 **kargs):
        """
        Load the cdc database at date. If date is null, the latest one will be loaded

        :param date: the query datetime object
        """

        if isinstance(path, YamlDict):
            md = path.to_dict()
        elif isinstance(path, str):
            md = get_metadata(self._rootdir, self._metadata, path, provider)
        elif isinstance(path, dict):
            md = path

        version = self.find_version(date, md)
        md_copy = copy.deepcopy(md)
        md_copy['url'] += '/_version=' + version
        obj = self.load(path=md_copy, catch_exception=catch_exception, **kargs)
        if date != None:
            obj = obj.filter(F.col('_updated') <= date)
        obj = dataframe.view(obj)

        return obj
Example #4
0
    def load(self, path=None, provider=None, catch_exception=True, **kargs):
        if isinstance(path, YamlDict):
            md = path.to_dict()
        elif isinstance(path, str):
            md = get_metadata(self._rootdir, self._metadata, path, provider)
        elif isinstance(path, dict):
            md = path

        core_start = timer()
        obj = self.load_dataframe(md, catch_exception, **kargs)
        core_end = timer()
        if obj is None:
            return obj

        prep_start = timer()
        date_column = '_date' if md['date_partition'] else md['date_column']
                
        obj = dataframe.filter_by_date(
            obj,
            date_column,
            md['date_start'],
            md['date_end'],
            md['date_window'])

        # partition and sorting (hmmm, needed?)
        if date_column and date_column in obj.columns:
            obj = obj.repartition(date_column)

        if '_updated' in obj.columns:
            obj = obj.sortWithinPartitions(F.desc('_updated'))

        num_rows = obj.count()
        num_cols = len(obj.columns)

        obj = dataframe.cache(obj, md['cache'])

        prep_end = timer()

        log_data = {
            'md': dict(md),
            'mode': kargs.get('mode', md.get('options', {}).get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': prep_end - core_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }
        logging.info(log_data) if obj is not None else logging.error(log_data)

        return obj
Example #5
0
    def save(self, obj, path=None, provider=None, **kargs):

        if isinstance(path, YamlDict):
            md = path.to_dict()
        elif isinstance(path, str):
            md = get_metadata(self._rootdir, self._metadata, path, provider)
        elif isinstance(path, dict):
            md = path

        prep_start = timer()

        if md['date_partition'] and md['date_column']:
            tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
            obj = dataframe.add_datetime_columns(obj, column=md['date_column'], tzone=tzone)
            kargs['partitionBy'] = ['_date'] + kargs.get('partitionBy', md.get('options', {}).get('partitionBy', []))

        if md['version_column']:
            kargs['partitionBy'] = ['_version'] + kargs.get('partitionBy', [])

        if md['update_column']:
            obj = dataframe.add_update_column(obj, tzone=self._timezone)

        if md['hash_column']:
            obj = dataframe.add_hash_column(obj, cols=md['hash_column'],
                                            exclude_cols=['_date', '_datetime', '_updated', '_hash', '_state', '_version'])

        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(
            obj,
            date_column,
            md['date_start'],
            md['date_end'],
            md['date_window'])

        obj = dataframe.cache(obj, md['cache'])

        num_rows = obj.count()
        num_cols = len(obj.columns)

        # force 1 file per partition, just before saving
        obj = obj.repartition(1, *kargs['partitionBy']) if kargs.get('partitionBy') else obj.repartition(1)
        # obj = obj.coalesce(1)

        prep_end = timer()

        core_start = timer()
        result = self.save_dataframe(obj, md, **kargs)
        core_end = timer()

        log_data = {
            'md': dict(md),
            'mode': kargs.get('mode', md.get('options', {}).get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': core_end - prep_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }

        logging.info(log_data) if result else logging.error(log_data)

        return result
Example #6
0
    def resource(self, path=None, provider=None, md=dict()):
        if not self.profile:
            raise ValueError("No project profile loaded. Try first: datalabframework.project.load(...)") 

        md = resource.get_metadata(paths.rootdir(), self._metadata , path, provider, md)
        return md
Example #7
0
    def list(self, provider, path=''):
        df_schema = T.StructType([
            T.StructField('name', T.StringType(), True),
            T.StructField('type', T.StringType(), True)
        ])

        df_empty = self._ctx.createDataFrame(data=(), schema=df_schema)

        if isinstance(provider, str):
            md = get_metadata(self._rootdir, self._metadata, None, provider)
        elif isinstance(provider, dict):
            md = provider
        else:
            logging.warning(
                f'{str(provider)} cannot be used to reference a provider')
            return df_empty

        try:
            if md['service'] in ['local', 'file']:
                lst = []
                rootpath = os.path.join(md['provider_path'], path)
                for f in os.listdir(rootpath):
                    fullpath = os.path.join(rootpath, f)
                    if os.path.isfile(fullpath):
                        obj_type = 'FILE'
                    elif os.path.isdir(fullpath):
                        obj_type = 'DIRECTORY'
                    elif os.path.islink(fullpath):
                        obj_type = 'LINK'
                    elif os.path.ismount(fullpath):
                        obj_type = 'MOUNT'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = f
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self._ctx.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty
                return df

            elif md['service'] in ['hdfs', 'minio', 's3a']:
                sc = self._ctx._sc
                URI = sc._gateway.jvm.java.net.URI
                Path = sc._gateway.jvm.org.apache.hadoop.fs.Path
                FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem
                fs = FileSystem.get(URI(md['url']),
                                    sc._jsc.hadoopConfiguration())
                provider_path = md['provider_path'] if md[
                    'service'] == 'hdfs' else '/'
                obj = fs.listStatus(Path(os.path.join(provider_path, path)))
                lst = []

                for i in range(len(obj)):
                    if obj[i].isFile():
                        obj_type = 'FILE'
                    elif obj[i].isDirectory():
                        obj_type = 'DIRECTORY'
                    else:
                        obj_type = 'UNDEFINED'

                    obj_name = obj[i].getPath().getName()
                    lst += [(obj_name, obj_type)]

                if lst:
                    df = self._ctx.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            elif md['format'] == 'jdbc':
                # remove options from database, if any
                database = md["database"].split('?')[0]
                schema = md['schema']
                if md['service'] == 'mssql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM INFORMATION_SCHEMA.TABLES
                              WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'oracle':
                    query = f"""
                            ( SELECT table_name, table_type
                             FROM all_tables
                             WHERE table_schema='{schema}'
                            ) as query
                            """
                elif md['service'] == 'mysql':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema='{database}'
                            ) as query
                            """
                elif md['service'] == 'postgres':
                    query = f"""
                            ( SELECT table_name, table_type
                              FROM information_schema.tables
                              WHERE table_schema = '{schema}'
                            ) as query
                            """
                else:
                    # vanilla query ... for other databases
                    query = f"""
                                ( SELECT table_name, table_type
                                  FROM information_schema.tables'
                                ) as query
                                """

                obj = self._ctx.read \
                    .format('jdbc') \
                    .option('url', md['url']) \
                    .option("dbtable", query) \
                    .option("driver", md['driver']) \
                    .option("user", md['username']) \
                    .option('password', md['password']) \
                    .load()

                # load the data from jdbc
                lst = []
                for x in obj.select('TABLE_NAME', 'TABLE_TYPE').collect():
                    lst.append((x.TABLE_NAME, x.TABLE_TYPE))

                if lst:
                    df = self._ctx.createDataFrame(lst, ['name', 'type'])
                else:
                    df = df_empty

                return df

            else:
                logging.error({
                    'md':
                    md,
                    'error_msg':
                    f'List resource on service "{md["service"]}" not implemented'
                })
                return df_empty
        except Exception as e:
            logging.error({'md': md, 'error_msg': str(e)})
            raise e

        return df_empty