def register(cls, alias):
    # register the engine class under both its class name and the given alias
    global _engines
    _engines[cls.__name__] = cls
    _engines[alias] = cls
    logging.info(f'Registering names {cls.__name__}, {alias} for class {cls}')
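# A minimal usage sketch for register(). DummyEngine and the 'dummy' alias are
# illustrative assumptions; the sketch only shows that the class becomes
# reachable in the module-level _engines dict under both names.
def _example_register():
    class DummyEngine:
        pass

    register(DummyEngine, 'dummy')
    assert _engines['DummyEngine'] is DummyEngine
    assert _engines['dummy'] is DummyEngine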
def set_submit_args(self):
    submit_args = ''

    # build a '--<key> a,b,c' option for every key except 'conf'
    for k in self.submit.keys() - {'conf'}:
        s = ",".join(self.submit[k])
        submit_args += f' --{k} {s}' if s else ''

    # submit config options one by one
    for c in self.submit['conf']:
        submit_args += f' --conf {c[0]}={c[1]}'

    # print debug
    for k in self.submit.keys():
        if self.submit[k]:
            logging.notice(f'Configuring {k}:')
            for e in self.submit[k]:
                v = e
                if isinstance(e, tuple):
                    # redact values of config keys that look like secrets
                    if len(e) > 1 and str(e[0]).endswith('.key'):
                        e = (e[0], '****** (redacted)')
                    v = ' : '.join(str(x) for x in e)
                if k == 'conf':
                    logging.info(f' - {v}')
                else:
                    logging.notice(f' - {v}')

    # set PYSPARK_SUBMIT_ARGS env variable
    submit_args = f'{submit_args} pyspark-shell'
    os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
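# Sketch of what set_submit_args() produces, assuming a submit dict with
# 'jars', 'packages', and 'conf' entries (the exact key names are an
# assumption for illustration; any key other than 'conf' is comma-joined
# into a single --<key> option).
def _example_set_submit_args(engine):
    engine.submit = {
        'jars': ['/opt/jars/custom.jar'],
        'packages': ['org.postgresql:postgresql:42.2.5'],
        'conf': [('spark.executor.memory', '4g')],
    }
    engine.set_submit_args()
    # os.environ['PYSPARK_SUBMIT_ARGS'] now reads roughly:
    #   ' --jars /opt/jars/custom.jar'
    #   ' --packages org.postgresql:postgresql:42.2.5'
    #   ' --conf spark.executor.memory=4g pyspark-shell'
    # (the --jars/--packages order may vary: keys() - {'conf'} is a set)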
def save_plus(self, obj, path=None, provider=None, **kwargs):
    md = Resource(path, provider, **kwargs)

    prep_start = timer()
    options = md['options'] or {}

    if md['date_partition'] and md['date_column']:
        tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
        obj = dataframe.add_datetime_columns(obj,
                                             column=md['date_column'],
                                             tzone=tzone)
        kwargs['partitionBy'] = ['_date'] + kwargs.get(
            'partitionBy', options.get('partitionBy', []))

    if md['update_column']:
        obj = dataframe.add_update_column(obj, tzone=self._timezone)

    if md['hash_column']:
        obj = dataframe.add_hash_column(obj,
                                        cols=md['hash_column'],
                                        exclude_cols=['_date', '_datetime',
                                                      '_updated', '_hash',
                                                      '_state'])

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                   md['date_end'], md['date_window'])

    obj = dataframe.cache(obj, md['cache'])

    num_rows = obj.count()
    num_cols = len(obj.columns)

    # force 1 file per partition, just before saving
    obj = obj.repartition(1, *kwargs['partitionBy']) if kwargs.get(
        'partitionBy') else obj.repartition(1)
    # obj = obj.coalesce(1)

    prep_end = timer()

    core_start = timer()
    result = self.save_dataframe(obj, md, **kwargs)
    core_end = timer()

    log_data = {
        'md': dict(md),
        'mode': kwargs.get('mode', options.get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': core_end - prep_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    if result:
        logging.info(log_data)
    else:
        logging.error(log_data)

    return result
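# Call sketch for save_plus(). The engine instance, DataFrame df, and the
# 'data/events' path are hypothetical; mode and partitionBy flow through
# kwargs to save_dataframe.
def _example_save_plus(engine, df):
    ok = engine.save_plus(df, path='data/events',
                          mode='overwrite', partitionBy=['country'])
    # When the resource has date_partition and date_column set, a '_date'
    # column is derived and prepended to partitionBy before writing.
    return ok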
def load_plus(self, path=None, provider=None, catch_exception=True, **kwargs):
    md = Resource(path, provider, **kwargs)

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kwargs)
    core_end = timer()

    if obj is None:
        return obj

    prep_start = timer()

    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                   md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])

    prep_end = timer()

    log_data = {
        'md': md,
        'mode': kwargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    # obj cannot be None here (handled by the early return above)
    logging.info(log_data)

    obj.__name__ = path
    return obj
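# Call sketch for load_plus(). Path and date arguments are hypothetical;
# a None return signals that load_dataframe failed (with catch_exception
# swallowing the error), otherwise the frame comes back date-filtered,
# repartitioned by the date column, and cached per md['cache'].
def _example_load_plus(engine):
    df = engine.load_plus('data/events',
                          date_start='2019-01-01',
                          date_end='2019-02-01')
    if df is None:
        logging.error('load failed for data/events')
    return df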
def save_log(self, md, options, ts_start):
    ts_end = timer()
    log_data = {'md': md, 'options': options, 'time': ts_end - ts_start}
    logging.info('save', extra=log_data)
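# Timing sketch for save_log(), assuming timer is timeit.default_timer as in
# the functions above; md and options are attached to the log record via
# the stdlib `extra` mechanism.
def _example_save_log(engine, md, options):
    ts_start = timer()
    # ... perform the save here ...
    engine.save_log(md, options, ts_start)
    # emits a 'save' INFO record carrying md, options, and the elapsed time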
def bar():
    logging.info('bar')


def foo():
    logging.info('foo')
    bar()