Example #1
    def set_submit_args(self):
        submit_args = ''

        for k in self.submit.keys() - {'conf'}:
            s = ",".join(self.submit[k])
            submit_args += f' --{k} {s}' if s else ''

        # submit config options one by one
        for c in self.submit['conf']:
            submit_args += f' --conf {c[0]}={c[1]}'

        # print debug
        for k in self.submit.keys():
            if self.submit[k]:
                logging.notice(f'Configuring {k}:')
                for e in self.submit[k]:
                    v = e
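                    # redact values whose key ends in '.key' (secrets) before logging them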
                    if isinstance(e, tuple):
                        if len(e) > 1 and str(e[0]).endswith('.key'):
                            e = (e[0], '****** (redacted)')
                        v = ' : '.join(list([str(x) for x in e]))
                    if k == 'conf':
                        logging.info(f'  -  {v}')
                    else:
                        logging.notice(f'  -  {v}')

        # set PYSPARK_SUBMIT_ARGS env variable
        submit_args = f'{submit_args} pyspark-shell'
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
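
For reference, PYSPARK_SUBMIT_ARGS is read when pyspark launches the JVM gateway, so it has to be exported before any Spark context is created, which is what the last two lines do. A minimal standalone sketch of the same idea (the submit dictionary below is hypothetical, not taken from the source):

    import os

    # hypothetical submit options, mirroring the structure used by the method above
    submit = {
        'packages': ['org.postgresql:postgresql:42.2.5'],
        'jars': [],
        'conf': [('spark.driver.memory', '2g')],
    }

    submit_args = ''
    for key in submit.keys() - {'conf'}:
        joined = ','.join(submit[key])
        submit_args += f' --{key} {joined}' if joined else ''
    for k, v in submit['conf']:
        submit_args += f' --conf {k}={v}'

    # the value must end with 'pyspark-shell' and be set before pyspark is started
    os.environ['PYSPARK_SUBMIT_ARGS'] = f'{submit_args} pyspark-shell'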
Example #2
    def start_session(self, conf):
        try:
            # init the spark session
            session = pyspark.sql.SparkSession.builder.config(
                conf=conf).getOrCreate()

            # store the spark session
            self.session = session

            # fix SQLContext for backward compatibility
            initialize_spark_sql_context(session, session.sparkContext)

            # set the pyspark log level
            # (this will not suppress WARN messages emitted before the context starts)
            session.sparkContext.setLogLevel("ERROR")

            # bootstrap datafaucet.zip in the cluster
            if not self.is_spark_local():
                dir_path = os.path.dirname(os.path.realpath(__file__))
                filename = os.path.abspath(
                    os.path.join(dir_path, 'dist/datafaucet.zip'))
                session.sparkContext.addPyFile(filename)

            # collect configuration
            self.conf = dict(session.sparkContext.getConf().getAll())

            # set the engine version
            self.version = session.version

            # set environment
            self.env = self.get_environment()

            # set info
            self.info['spark_classpath'] = self.info['spark_classpath'][
                0].split(' ')
            self.info = YamlDict(self.info)

            # notify that the engine context has started
            logging.notice(
                f'Engine context {self.engine_type}:{self.version} successfully started'
            )

            # session is running
            self.stopped = False

        except Exception as e:
            logging.error(f'Could not start the engine context: {e}')
            return None
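
Stripped of the datafaucet bookkeeping, the core of start_session is plain PySpark; a minimal sketch for a local session (the addPyFile path is a placeholder):

    import pyspark
    from pyspark.sql import SparkSession

    conf = pyspark.SparkConf().setAppName('example').setMaster('local[*]')
    session = SparkSession.builder.config(conf=conf).getOrCreate()

    # reduce driver log noise; messages emitted before this call are not affected
    session.sparkContext.setLogLevel('ERROR')

    # on a non-local cluster, extra python code can be shipped to the executors
    # session.sparkContext.addPyFile('/path/to/datafaucet.zip')

    print(session.version)
    print(dict(session.sparkContext.getConf().getAll()).get('spark.app.name'))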
Example #3
    def copy(self, md_src, md_trg, mode='append'):
        # timer
        timer_start = timer()

        # src dataframe
        df_src = self.load(md_src)

        # if no path on target, get it from src
        if not md_trg['resource_path']:
            md_trg = resource.metadata(self._rootdir, self._metadata,
                                       md_src['resource_path'],
                                       md_trg['provider_alias'])

        # logging
        log_data = {
            'src_hash': md_src['hash'],
            'src_path': md_src['resource_path'],
            'trg_hash': md_trg['hash'],
            'trg_path': md_trg['resource_path'],
            'mode': mode,
            'updated': False,
            'records_read': 0,
            'records_add': 0,
            'records_del': 0,
            'columns': 0,
            'time': timer() - timer_start
        }

        # could not read source, log error and return
        if df_src is None:
            logging.error(log_data)
            return

        num_rows = df_src.count()
        num_cols = len(df_src.columns)

        # empty source, log notice and return
        if num_rows == 0 and mode == 'append':
            log_data['time'] = timer() - timer_start
            logging.notice(log_data)
            return

        # overwrite target, save, log notice/error and return
        if mode == 'overwrite':
            if md_trg['state_column']:
                df_src = df_src.withColumn('_state', F.lit(0))

            result = self.save(df_src, md_trg, mode=mode)

            log_data['time'] = timer() - timer_start
            log_data['records_read'] = num_rows
            log_data['records_add'] = num_rows
            log_data['columns'] = num_cols

            if result:
                logging.notice(log_data)
            else:
                logging.error(log_data)
            return

        # trg dataframe (if exists)
        try:
            df_trg = self.load(md_trg, catch_exception=False)
        except Exception:
            df_trg = dataframe.empty(df_src)

        # de-dup: create a view from the extracted log (excluding the _updated column)
        df_trg = dataframe.view(df_trg)

        # capture added records
        df_add = dataframe.diff(
            df_src, df_trg,
            ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_add = df_add.count()

        # capture deleted records
        rows_del = 0
        if md_trg['state_column']:
            df_del = dataframe.diff(
                df_trg, df_src,
                ['_date', '_datetime', '_updated', '_hash', '_state'])
            rows_del = df_del.count()

        updated = (rows_add + rows_del) > 0

        num_cols = len(df_add.columns)
        num_rows = max(df_src.count(), df_trg.count())

        # save diff
        if updated:
            if md_trg['state_column']:
                df_add = df_add.withColumn('_state', F.lit(0))
                df_del = df_del.withColumn('_state', F.lit(1))

                df = df_add.union(df_del)
            else:
                df = df_add

            result = self.save(df, md_trg, mode=mode)
        else:
            result = True

        log_data.update({
            'updated': updated,
            'records_read': num_rows,
            'records_add': rows_add,
            'records_del': rows_del,
            'columns': num_cols,
            'time': timer() - timer_start
        })

        if result:
            logging.notice(log_data)
        else:
            logging.error(log_data)
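
Both diffs above rely on the project's dataframe.diff helper, which acts as a set difference that ignores the listed bookkeeping columns. A rough standalone sketch of that idea, assuming Spark 2.4+ and that ignoring a column simply means dropping it before comparing (the real helper may well preserve the full schema):

    from pyspark.sql import DataFrame

    def diff_sketch(df_a: DataFrame, df_b: DataFrame, exclude=None) -> DataFrame:
        # rows of df_a that do not appear in df_b, ignoring the excluded columns
        exclude = set(exclude or [])
        cols = [c for c in df_a.columns if c not in exclude and c in df_b.columns]
        return df_a.select(cols).exceptAll(df_b.select(cols))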
Example #4
    def load(self, profile='default', rootpath=None):
        """
        Performs the following steps:
            - set rootdir for the given project
            - import variables from <rootdir>/.env (if present)
            - load the `profile` from the metadata files
            - setup and start the data engine

        :param profile: load the given metadata profile (default: 'default')

        :param rootpath: root directory of the loaded project.
               Default behaviour: search the parent directories for a
               '__main__.py' or 'main.ipynb' file; the directory containing
               such a file is taken as the project root path. If nothing is
               found, the current working directory is used as the rootpath.

        :return: None

        Notes about metadata configuration:

        1)  Metadata files are merged, so you can split the information across
            multiple files as long as their names end with `metadata.yml` or
            `metadata.yaml`.

            For example: `metadata.yml`, `abc.metadata.yaml`, `abc_metadata.yml`
            are all valid metadata file names.

        2)  All metadata files in all subdirectories from the project root directory 
            are loaded, unless the directory contains a file `metadata.ignore.yml`

        3)  Metadata files can provide multiple profile configurations,
            by separating each profile configuration with a document marker
            (a line with `---`, see https://yaml.org/spec/1.2/spec.html#YAML)

        4)  Each metadata profile can be broken down into multiple yaml files.
            When loading the files, all configuration belonging to the same
            profile will be merged.

        5)  All metadata profiles inherit the settings from profile 'default'

        Metadata files are composed of 6 sections:
            - profile
            - variables
            - providers
            - resources
            - engine
            - loggers

        For more information about metadata configuration,
        type `help(datafaucet.project.metadata)`    
        """

        if self.loaded and self._no_reload:
            logging.notice(f"Profile {self._profile} already loaded. "
                           "Skipping project.load()")
            return self

        # set rootpath
        paths.set_rootdir(rootpath)

        # set loaded to false
        self.loaded = False

        # set username
        self._username = getpass.getuser()

        # get repo data
        self._repo = repo_data()

        # set session name
        L = [self._profile, self._repo.get('name')]
        self._session_name = '-'.join([x for x in L if x])

        # set session id
        self._session_id = hex(uuid.uuid1().int >> 64)

        # get currently running script path
        self._script_path = files.get_script_path(paths.rootdir())

        # set dotenv default file, check the file exists
        self._dotenv_path = files.get_dotenv_path(paths.rootdir())

        # get files
        self._metadata_files = files.get_metadata_files(paths.rootdir())
        self._notebook_files = files.get_jupyter_notebook_files(
            paths.rootdir())
        self._python_files = files.get_python_files(paths.rootdir())

        # metadata defaults
        dir_path = os.path.dirname(os.path.realpath(__file__))
        default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
        project_md_files = abspath(self._metadata_files, paths.rootdir())

        # load metadata
        try:
            md_paths = default_md_files + project_md_files
            dotenv_path = abspath(self._dotenv_path, paths.rootdir())

            metadata.load(profile, md_paths, dotenv_path)
        except ValueError as e:
            print(e)

        # bail if no metadata
        if metadata.profile is None:
            raise ValueError('No valid metadata to load.')

        # set profile from metadata
        self._profile_name = metadata.info()['active']

        # add rootpath to the list of python sys paths
        if paths.rootdir() not in sys.path:
            sys.path.append(paths.rootdir())

        # stop existing engine
        if self._engine:
            self._engine.stop()

        # services
        services = dict()

        all_aliases = list(metadata.profile()['providers'].keys())

        # get services from aliases
        for alias in all_aliases:
            r = Resource(alias)
            services[r['service']] = r

        # get one service from each type to
        # load drivers, jars etc via the engine init
        services = list(services.values())

        # initialize the engine
        md = metadata.profile()['engine']
        engines.Engine(md['type'],
                       session_name=self._session_name,
                       session_id=self._session_id,
                       master=md['master'],
                       timezone=md['timezone'],
                       jars=md['submit']['jars'],
                       packages=md['submit']['packages'],
                       pyfiles=md['submit']['py-files'],
                       files=md['submit']['files'],
                       repositories=md['submit']['repositories'],
                       conf=md['submit']['conf'],
                       services=services)

        # initialize logging
        logging.init(metadata.profile()['loggers'], self._session_id,
                     self._username, self._script_path, self._repo['name'],
                     self._repo['hash'])

        # set loaded to True
        self.loaded = True

        # return object
        return self
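
The profile layout described in the docstring (notes 3 to 5) boils down to multi-document YAML. The snippet below illustrates the mechanism with a hypothetical metadata.yml; the exact keys datafaucet accepts come from its schemas, which are not reproduced here:

    import textwrap
    import yaml  # PyYAML

    # hypothetical metadata.yml with two profiles separated by a document marker
    metadata_yml = textwrap.dedent("""
        profile: default
        engine:
            type: spark
            master: 'local[*]'
        ---
        profile: prod
        engine:
            master: yarn
        """)

    # each YAML document holds one profile (per the docstring, datafaucet merges
    # non-default profiles on top of 'default'; this sketch only parses them)
    for doc in yaml.safe_load_all(metadata_yml):
        print(doc.get('profile'), doc.get('engine'))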
Example #5
    def __init__(self,
                 session_name=None,
                 session_id=0,
                 master='local[*]',
                 timezone=None,
                 repositories=None,
                 jars=None,
                 packages=None,
                 files=None,
                 services=None,
                 conf=None,
                 detect=True):

        # call base class
        # stop the previous instance,
        # register self as the new instance
        super().__init__('spark', session_name, session_id)

        # bundle all submit options in a dictionary
        self.submit = {
            'jars': [jars] if isinstance(jars, str) else jars or [],
            'packages': ([packages] if isinstance(packages, str)
                         else packages or []),
            'files': [files] if isinstance(files, str) else files or [],
            'repositories': ([repositories] if isinstance(repositories, str)
                             else repositories or []),
            'conf': [conf] if isinstance(conf, tuple) else conf or [],
        }

        # suppress INFO logging for java_gateway
        python_logging.getLogger('py4j.java_gateway').setLevel(
            python_logging.ERROR)

        # collect info
        self.set_info()

        # detect packages and configuration from services
        if detect:
            detected = self.detect_submit_params(services)
            self.submit = merge(detected, self.submit)

        # set submit args via env variable
        self.set_submit_args()

        # set other spark-related environment variables
        self.set_env_variables()

        # set spark conf object
        logging.notice(f"Connecting to spark master: {master}")

        conf = pyspark.SparkConf()
        self.set_conf_timezone(conf, timezone)

        # set session name
        conf.setAppName(session_name)

        # set master
        conf.setMaster(master)

        # config options passed through the API call go via the SparkConf object
        for c in self.submit['conf']:
            k, v, *_ = list(c) + ['']
            if isinstance(v, (bool, int, float, str)):
                conf.set(k, v)

        # stop the current session if running
        self.stop()

        # start spark
        self.start_session(conf)
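
One detail worth noting: the loop over self.submit['conf'] uses an unpacking idiom that tolerates both ('key', 'value') pairs and bare one-element tuples. A small illustration of just that idiom (the config entries are illustrative):

    # each entry may be ('key', 'value') or ('key',); a missing value becomes ''
    for c in [('spark.driver.memory', '2g'), ('spark.some.flag',)]:
        k, v, *_ = list(c) + ['']
        print(repr(k), repr(v))
    # prints:
    #   'spark.driver.memory' '2g'
    #   'spark.some.flag' ''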
Example #6
    def save_scd(self,
                 obj,
                 path=None,
                 provider=None,
                 *args,
                 format=None,
                 mode=None,
                 merge_on=None,
                 where=None,
                 **kwargs):

        result = True
        md = Resource(path, provider, format=format, mode=mode, **kwargs)

        options = md['options']

        # after collecting options from the metadata or the method call, apply defaults
        options['mode'] = options.get('mode', None) or 'append'
        format = md['format'] or 'parquet'

        where = where or []
        where = where if isinstance(where, (list, tuple)) else [where]

        ts_start = timer()

        num_rows = obj.count()
        num_cols = len(obj.columns)

        # empty source, nothing to append: return
        if num_rows == 0 and options['mode'] == 'append':
            return True

        # overwrite target, save, log notice/error and return
        if options['mode'] == 'overwrite':
            obj = obj.withColumn('_state', F.lit(0))
            obj = dataframe.add_update_column(obj, '_updated')

            result = self.save(obj, md, mode=options['mode'])
            self.save_log(md, options, ts_start)
            return result

        # append
        df_src = obj

        # trg dataframe (if exists)
        df_trg = self.load(md, format=format) or dataframe.empty(df_src)

        if '_state' not in df_trg.columns:
            df_trg = df_trg.withColumn('_state', F.lit(0))

        if '_updated' not in df_trg.columns:
            df_trg = dataframe.add_update_column(df_trg, '_updated')

        # filter src and trg (mainly for speed: the diff only compares the selected portion of the records)
        for predicate in where:
            df_src = df_src.filter(predicate)
            df_trg = df_trg.filter(predicate)

        # create a view from the extracted log
        df_trg = dataframe.view(df_trg, merge_on=merge_on)

        # schema change: add new columns
        added_cols = set(df_src.columns) - set(df_trg.columns)
        added_cols = {
            x.name: x.dataType
            for x in list(df_src.schema) if x.name in added_cols
        }
        for c, t in added_cols.items():
            df_trg = df_trg.withColumn(c, F.lit(None).cast(t))

        # schema change: removed columns
        # no need to do anything, diff will take care of that

        # capture added records
        df_add = dataframe.diff(df_src, df_trg, ['_updated', '_state'])

        # capture deleted records
        df_del = dataframe.diff(df_trg, df_src, ['_updated', '_state'])

        # capture updated records
        cnt_upd = 0
        if merge_on is not None:
            on = merge_on if isinstance(merge_on,
                                        (list, tuple)) else [merge_on]
            cnt_upd = df_add.join(df_del, on=on).count()

        cnt_del = df_del.count() - cnt_upd
        cnt_add = df_add.count() - cnt_upd

        logging.notice(
            f'merge on={merge_on}, updated={cnt_upd}, added={cnt_add}, deleted={cnt_del}'
        )

        # _state marks the row type in the saved diff: 0 for added rows, 1 for deleted rows
        df_add = df_add.withColumn('_state', F.lit(0))
        df_del = df_del.withColumn('_state', F.lit(1))

        df = df_add.union(df_del)
        df = dataframe.add_update_column(df, '_updated')

        result = self.save(df, md, format=format, **options)
        return result
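
The three counters follow from treating an update as a record that appears in both diffs: a changed row shows up once in df_add (its new version) and once in df_del (its old version), so rows matched on merge_on are counted as updates and subtracted from each side. A worked example of that arithmetic (numbers are illustrative only):

    # illustrative scenario: 3 rows changed, 2 genuinely new, 1 genuinely removed
    rows_in_add_diff = 5       # 3 changed (new versions) + 2 new
    rows_in_del_diff = 4       # 3 changed (old versions) + 1 removed
    rows_matched_on_key = 3    # what df_add.join(df_del, on=merge_on).count() would return

    cnt_upd = rows_matched_on_key           # 3 updated
    cnt_add = rows_in_add_diff - cnt_upd    # 2 added
    cnt_del = rows_in_del_diff - cnt_upd    # 1 deleted
    assert (cnt_upd, cnt_add, cnt_del) == (3, 2, 1)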