def set_submit_args(self):
    submit_args = ''

    for k in self.submit.keys() - {'conf'}:
        s = ",".join(self.submit[k])
        submit_args += f' --{k} {s}' if s else ''

    # submit config options one by one
    for c in self.submit['conf']:
        submit_args += f' --conf {c[0]}={c[1]}'

    # print debug
    for k in self.submit.keys():
        if self.submit[k]:
            logging.notice(f'Configuring {k}:')
            for e in self.submit[k]:
                v = e
                if isinstance(e, tuple):
                    if len(e) > 1 and str(e[0]).endswith('.key'):
                        e = (e[0], '****** (redacted)')
                    v = ' : '.join([str(x) for x in e])
                if k == 'conf':
                    logging.info(f' - {v}')
                else:
                    logging.notice(f' - {v}')

    # set PYSPARK_SUBMIT_ARGS env variable
    submit_args = f'{submit_args} pyspark-shell'
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
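# --- Usage sketch (illustrative, not part of the class above) ---
# The variable built by set_submit_args() has the same shape as a
# spark-submit command line and must end with 'pyspark-shell'; pyspark
# reads it when the JVM gateway is launched, so it has to be set before
# the session starts. The package coordinates and paths below are made
# up for illustration only.
import os

os.environ['PYSPARK_SUBMIT_ARGS'] = (
    ' --packages org.postgresql:postgresql:42.2.5'
    ' --jars /opt/jars/extra.jar'
    ' --conf spark.driver.memory=4g'
    ' pyspark-shell')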
def start_session(self, conf):
    try:
        # init the spark session
        session = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()

        # store the spark session
        self.session = session

        # fix SQLContext for back compatibility
        initialize_spark_sql_context(session, session.sparkContext)

        # pyspark set log level method
        # (this will not suppress WARN before starting the context)
        session.sparkContext.setLogLevel("ERROR")

        # bootstrap datafaucet.zip in the cluster
        if not self.is_spark_local():
            dir_path = os.path.dirname(os.path.realpath(__file__))
            filename = os.path.abspath(
                os.path.join(dir_path, 'dist/datafaucet.zip'))
            session.sparkContext.addPyFile(filename)

        # collect configuration
        self.conf = dict(session.sparkContext.getConf().getAll())

        # set the engine version
        self.version = session.version

        # set environment
        self.env = self.get_environment()

        # set info
        self.info['spark_classpath'] = self.info['spark_classpath'][0].split(' ')
        self.info = YamlDict(self.info)

        # set version if spark is loaded
        logging.notice(
            f'Engine context {self.engine_type}:{self.version} successfully started')

        # session is running
        self.stopped = False
    except Exception as e:
        print(e)
        logging.error('Could not start the engine context')
        return None
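# --- Minimal sketch of the same pattern with plain pyspark ---
# (assumes pyspark is installed and a local master; 'demo-app' is a
# made-up application name, not something defined by this project)
from pyspark import SparkConf
from pyspark.sql import SparkSession

conf = SparkConf().setAppName('demo-app').setMaster('local[*]')
session = SparkSession.builder.config(conf=conf).getOrCreate()
session.sparkContext.setLogLevel("ERROR")
print(session.version)
session.stop()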
def copy(self, md_src, md_trg, mode='append'):
    # timer
    timer_start = timer()

    # src dataframe
    df_src = self.load(md_src)

    # if no path on target, get it from src
    if not md_trg['resource_path']:
        md_trg = resource.metadata(self._rootdir, self._metadata,
                                   md_src['resource_path'],
                                   md_trg['provider_alias'])

    # logging
    log_data = {
        'src_hash': md_src['hash'],
        'src_path': md_src['resource_path'],
        'trg_hash': md_trg['hash'],
        'trg_path': md_trg['resource_path'],
        'mode': mode,
        'updated': False,
        'records_read': 0,
        'records_add': 0,
        'records_del': 0,
        'columns': 0,
        'time': timer() - timer_start
    }

    # could not read source, log error and return
    if df_src is None:
        logging.error(log_data)
        return

    num_rows = df_src.count()
    num_cols = len(df_src.columns)

    # empty source, log notice and return
    if num_rows == 0 and mode == 'append':
        log_data['time'] = timer() - timer_start
        logging.notice(log_data)
        return

    # overwrite target, save, log notice/error and return
    if mode == 'overwrite':
        if md_trg['state_column']:
            df_src = df_src.withColumn('_state', F.lit(0))

        result = self.save(df_src, md_trg, mode=mode)

        log_data['time'] = timer() - timer_start
        log_data['records_read'] = num_rows
        log_data['records_add'] = num_rows
        log_data['columns'] = num_cols

        logging.notice(log_data) if result else logging.error(log_data)
        return

    # trg dataframe (if exists)
    try:
        df_trg = self.load(md_trg, catch_exception=False)
    except Exception:
        df_trg = dataframe.empty(df_src)

    # de-dup (exclude the _updated column)
    # create a view from the extracted log
    df_trg = dataframe.view(df_trg)

    # capture added records
    df_add = dataframe.diff(
        df_src, df_trg,
        ['_date', '_datetime', '_updated', '_hash', '_state'])
    rows_add = df_add.count()

    # capture deleted records
    rows_del = 0
    if md_trg['state_column']:
        df_del = dataframe.diff(
            df_trg, df_src,
            ['_date', '_datetime', '_updated', '_hash', '_state'])
        rows_del = df_del.count()

    updated = (rows_add + rows_del) > 0

    num_cols = len(df_add.columns)
    num_rows = max(df_src.count(), df_trg.count())

    # save diff
    if updated:
        if md_trg['state_column']:
            df_add = df_add.withColumn('_state', F.lit(0))
            df_del = df_del.withColumn('_state', F.lit(1))
            df = df_add.union(df_del)
        else:
            df = df_add

        result = self.save(df, md_trg, mode=mode)
    else:
        result = True

    log_data.update({
        'updated': updated,
        'records_read': num_rows,
        'records_add': rows_add,
        'records_del': rows_del,
        'columns': num_cols,
        'time': timer() - timer_start
    })

    logging.notice(log_data) if result else logging.error(log_data)
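# --- Sketch of the add/delete detection idea with plain pyspark ---
# dataframe.diff() in this project also excludes bookkeeping columns
# (_date, _datetime, _updated, _hash, _state); here df_src and df_trg
# are small made-up frames with an identical schema.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()

df_src = spark.createDataFrame([(1, 'a'), (2, 'b'), (3, 'c')], ['id', 'val'])
df_trg = spark.createDataFrame([(1, 'a'), (2, 'x')], ['id', 'val'])

df_add = df_src.subtract(df_trg)  # rows in src that are not in trg
df_del = df_trg.subtract(df_src)  # rows in trg that are not in src

print(df_add.count(), df_del.count())
spark.stop()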
def load(self, profile='default', rootpath=None):
    """
    Performs the following steps:
        - set rootdir for the given project
        - import variables from <rootdir>/.env (if present)
        - load the `profile` from the metadata files
        - setup and start the data engine

    :param profile: load the given metadata profile (default: 'default')
    :param rootpath: root directory for the loaded project
           default behaviour: search parent dirs to detect rootdir by
           looking for a '__main__.py' or 'main.ipynb' file. When such
           a file is found, the corresponding directory is the root
           path for the project. If nothing is found, the current
           working directory will be the rootpath

    :return: None

    Notes about metadata configuration:

    1)  Metadata files are merged up, so you can split the information
        in multiple files as long as they end with `metadata.yml`.
        For example: `metadata.yml`, `abc.metadata.yaml`, `abc_metadata.yml`
        are all valid metadata file names.

    2)  All metadata files in all subdirectories from the project root
        directory are loaded, unless the directory contains a file
        `metadata.ignore.yml`.

    3)  Metadata files can provide multiple profile configurations,
        by separating each profile configuration with a Document Marker
        (a line with `---`) (see https://yaml.org/spec/1.2/spec.html#YAML).

    4)  Each metadata profile can be broken down into multiple yaml files.
        When loading the files, all configuration belonging to the same
        profile will be merged.

    5)  All metadata profiles inherit the settings from profile 'default'.

    Metadata files are composed of 6 sections:
        - profile
        - variables
        - providers
        - resources
        - engine
        - loggers

    For more information about metadata configuration,
    type `help(datafaucet.project.metadata)`
    """

    if self.loaded and self._no_reload:
        logging.notice(
            f"Profile {self._profile} already loaded. "
            "Skipping project.load()")
        return self

    # set rootpath
    paths.set_rootdir(rootpath)

    # set loaded to false
    self.loaded = False

    # set username
    self._username = getpass.getuser()

    # get repo data
    self._repo = repo_data()

    # set session name
    L = [self._profile, self._repo.get('name')]
    self._session_name = '-'.join([x for x in L if x])

    # set session id
    self._session_id = hex(uuid.uuid1().int >> 64)

    # get currently running script path
    self._script_path = files.get_script_path(paths.rootdir())

    # set dotenv default file, check the file exists
    self._dotenv_path = files.get_dotenv_path(paths.rootdir())

    # get files
    self._metadata_files = files.get_metadata_files(paths.rootdir())
    self._notebook_files = files.get_jupyter_notebook_files(paths.rootdir())
    self._python_files = files.get_python_files(paths.rootdir())

    # metadata defaults
    dir_path = os.path.dirname(os.path.realpath(__file__))
    default_md_files = [os.path.join(dir_path, 'schemas/default.yml')]
    project_md_files = abspath(self._metadata_files, paths.rootdir())

    # load metadata
    try:
        md_paths = default_md_files + project_md_files
        dotenv_path = abspath(self._dotenv_path, paths.rootdir())
        metadata.load(profile, md_paths, dotenv_path)
    except ValueError as e:
        print(e)

    # bail if no metadata
    if metadata.profile is None:
        raise ValueError('No valid metadata to load.')

    # set profile from metadata
    self._profile_name = metadata.info()['active']

    # add rootpath to the list of python sys paths
    if paths.rootdir() not in sys.path:
        sys.path.append(paths.rootdir())

    # stop existing engine
    if self._engine:
        self._engine.stop()

    # services
    services = dict()
    all_aliases = list(metadata.profile()['providers'].keys())

    # get services from aliases
    for alias in all_aliases:
        r = Resource(alias)
        services[r['service']] = r

    # get one service from each type to
    # load drivers, jars etc via the engine init
    services = list(services.values())

    # initialize the engine
    md = metadata.profile()['engine']
    engines.Engine(
        md['type'],
        session_name=self._session_name,
        session_id=self._session_id,
        master=md['master'],
        timezone=md['timezone'],
        jars=md['submit']['jars'],
        packages=md['submit']['packages'],
        pyfiles=md['submit']['py-files'],
        files=md['submit']['files'],
        repositories=md['submit']['repositories'],
        conf=md['submit']['conf'],
        services=services)

    # initialize logging
    logging.init(metadata.profile()['loggers'], self._session_id,
                 self._username, self._script_path,
                 self._repo['name'], self._repo['hash'])

    # set loaded to True
    self.loaded = True

    # return object
    return self
def __init__(self, session_name=None, session_id=0, master='local[*]',
             timezone=None, repositories=None, jars=None, packages=None,
             files=None, services=None, conf=None, detect=True):

    # call base class:
    # stop the previous instance,
    # register self as the new instance
    super().__init__('spark', session_name, session_id)

    # bundle all submit options in a dictionary
    self.submit = {
        'jars': [jars] if isinstance(jars, str) else jars or [],
        'packages': [packages] if isinstance(packages, str) else packages or [],
        'files': [files] if isinstance(files, str) else files or [],
        'repositories': [repositories] if isinstance(repositories, str) else repositories or [],
        'conf': [conf] if isinstance(conf, tuple) else conf or [],
    }

    # suppress INFO logging for java_gateway
    python_logging.getLogger('py4j.java_gateway').setLevel(python_logging.ERROR)

    # collect info
    self.set_info()

    # detect packages and configuration from services
    if detect:
        detected = self.detect_submit_params(services)
        self.submit = merge(detected, self.submit)

    # set submit args via env variable
    self.set_submit_args()

    # set other spark-related environment variables
    self.set_env_variables()

    # set spark conf object
    logging.notice(f"Connecting to spark master: {master}")

    conf = pyspark.SparkConf()
    self.set_conf_timezone(conf, timezone)

    # set session name
    conf.setAppName(session_name)

    # set master
    conf.setMaster(master)

    # config passed through the api call goes via the conf object
    for c in self.submit['conf']:
        k, v, *_ = list(c) + ['']
        if isinstance(v, (bool, int, float, str)):
            conf.set(k, v)

    # stop the current session if running
    self.stop()

    # start spark
    self.start_session(conf)
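# --- Hedged usage sketch ---
# Mirrors the engines.Engine(...) call made in project.load() above and
# assumes the module is importable as datafaucet.engines; the jar path,
# package coordinates and conf values are made up for illustration.
from datafaucet import engines

engine = engines.Engine(
    'spark',
    session_name='demo',
    master='local[*]',
    jars=['/opt/jars/extra.jar'],
    packages=['org.postgresql:postgresql:42.2.5'],
    conf=[('spark.driver.memory', '4g')])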
def save_scd(self, obj, path=None, provider=None, *args,
             format=None, mode=None, merge_on=None, where=None, **kwargs):

    result = True
    md = Resource(path, provider, format=format, mode=mode, **kwargs)

    options = md['options']

    # after collecting from metadata or method call, define defaults
    options['mode'] = options.get('mode', None) or 'append'
    format = md['format'] or 'parquet'

    where = where or []
    where = where if isinstance(where, (list, tuple)) else [where]

    ts_start = timer()

    num_rows = obj.count()
    num_cols = len(obj.columns)

    # empty source, log notice and return
    if num_rows == 0 and mode == 'append':
        return True

    # overwrite target, save, log notice/error and return
    if options['mode'] == 'overwrite':
        obj = obj.withColumn('_state', F.lit(0))
        obj = dataframe.add_update_column(obj, '_updated')

        result = self.save(obj, md, mode=options['mode'])
        self.save_log(md, options, ts_start)
        return True

    # append
    df_src = obj

    # trg dataframe (if exists)
    df_trg = self.load(md, format=format) or dataframe.empty(df_src)

    if '_state' not in df_trg.columns:
        df_trg = df_trg.withColumn('_state', F.lit(0))

    if '_updated' not in df_trg.columns:
        df_trg = dataframe.add_update_column(df_trg, '_updated')

    # filter src and trg (mainly for speed: reduce the diff time by
    # comparing only a portion of all records)
    for predicate in where:
        df_src = df_src.filter(predicate)
        df_trg = df_trg.filter(predicate)

    # create a view from the extracted log
    df_trg = dataframe.view(df_trg, merge_on=merge_on)

    # schema change: add new columns
    added_cols = set(df_src.columns) - set(df_trg.columns)
    added_cols = {
        x.name: x.dataType
        for x in list(df_src.schema) if x.name in added_cols
    }
    for c, t in added_cols.items():
        df_trg = df_trg.withColumn(c, F.lit(None).cast(t))

    # schema change: removed columns
    # no need to do anything, diff will take care of that

    # capture added records
    df_add = dataframe.diff(df_src, df_trg, ['_updated', '_state'])

    # capture deleted records
    df_del = dataframe.diff(df_trg, df_src, ['_updated', '_state'])

    # capture updated records
    cnt_upd = 0
    if merge_on is not None:
        on = merge_on if isinstance(merge_on, (list, tuple)) else [merge_on]
        cnt_upd = df_add.join(df_del, on=on).count()

    cnt_del = df_del.count() - cnt_upd
    cnt_add = df_add.count() - cnt_upd

    logging.notice(
        f'merge on={merge_on}, updated={cnt_upd}, added={cnt_add}, deleted={cnt_del}')

    df_add = df_add.withColumn('_state', F.lit(0))
    df_del = df_del.withColumn('_state', F.lit(1))

    df = df_add.union(df_del)
    df = dataframe.add_update_column(df, '_updated')

    result = self.save(df, md, format=format, **options)
    return result
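# --- Hedged usage sketch ---
# Assumes 'engine' is a running datafaucet spark engine exposing this
# method and 'df' is an existing Spark DataFrame; the path, provider
# alias, key and predicate below are made up for illustration.
# merge_on identifies the business key used to detect updates, while
# 'where' restricts which records take part in the diff.
result = engine.save_scd(
    df,
    path='customers',
    provider='datalake',
    format='parquet',
    mode='append',
    merge_on='customer_id',
    where=["country = 'DE'"])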