def save(self, obj, path=None, provider=None, **kargs):
    # resolve the resource metadata from whatever form 'path' was given in
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = resource.metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path

    prep_start = timer()
    options = md['options'] or {}

    # add '_date'/'_datetime' columns and partition by '_date', if requested
    if md['date_partition'] and md['date_column']:
        tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
        obj = dataframe.add_datetime_columns(obj, column=md['date_column'], tzone=tzone)
        kargs['partitionBy'] = ['_date'] + kargs.get('partitionBy', options.get('partitionBy', []))

    if md['update_column']:
        obj = dataframe.add_update_column(obj, tzone=self._timezone)

    if md['hash_column']:
        obj = dataframe.add_hash_column(obj, cols=md['hash_column'],
                                        exclude_cols=['_date', '_datetime', '_updated', '_hash', '_state'])

    # restrict to the requested date range / window before writing
    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(
        obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    obj = dataframe.cache(obj, md['cache'])

    num_rows = obj.count()
    num_cols = len(obj.columns)

    # force 1 file per partition, just before saving
    obj = obj.repartition(1, *kargs['partitionBy']) if kargs.get('partitionBy') else obj.repartition(1)
    # obj = obj.coalesce(1)

    prep_end = timer()

    core_start = timer()
    result = self.save_dataframe(obj, md, **kargs)
    core_end = timer()

    # log the write, with timings split into prep and core (the actual save)
    log_data = {
        'md': {i: md[i] for i in md if i != 'password'},
        'mode': kargs.get('mode', options.get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': core_end - prep_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    logging.info(log_data) if result else logging.error(log_data)

    return result
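# A minimal usage sketch for save(), as a hedged assumption: the surrounding
# module presumably imports `timer` (e.g. timeit.default_timer), `logging`,
# and the `resource`/`dataframe` helpers; the `engine` instance, the
# 'sales.daily' alias and the 'region' column below are purely illustrative
# and not part of this codebase.
#
#   df = spark.createDataFrame([(1, 'eu'), (2, 'us')], ['id', 'region'])
#   engine.save(df, 'sales.daily', mode='overwrite', partitionBy=['region'])
#
# Extra keyword arguments such as 'mode' and 'partitionBy' are forwarded to
# save_dataframe() via **kargs; '_date' is prepended to partitionBy when the
# resource metadata enables date partitioning.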
def load(self, path=None, provider=None, catch_exception=True, **kargs):
    # resolve the resource metadata from whatever form 'path' was given in
    if isinstance(path, YamlDict):
        md = path.to_dict()
    elif isinstance(path, str):
        md = get_metadata(self._rootdir, self._metadata, path, provider)
    elif isinstance(path, dict):
        md = path

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kargs)
    core_end = timer()
    if obj is None:
        return obj

    prep_start = timer()

    # restrict to the requested date range / window
    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(
        obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])
    prep_end = timer()

    # log the read, with timings split into core (the actual load) and prep
    log_data = {
        'md': dict(md),
        'mode': kargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    logging.info(log_data) if obj is not None else logging.error(log_data)

    return obj
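# A minimal usage sketch for load(), under the same assumptions as above
# (hypothetical `engine` object, illustrative 'sales.daily' metadata alias):
#
#   df = engine.load('sales.daily')
#
# Note that the date filtering (date_start, date_end, date_window) is driven
# by the resolved resource metadata (md), not by call-time arguments; **kargs
# is forwarded to load_dataframe(). F.desc() assumes pyspark.sql.functions is
# imported as F at module level.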
def load_plus(self, path=None, provider=None, catch_exception=True, **kwargs):
    md = Resource(path, provider, **kwargs)

    core_start = timer()
    obj = self.load_dataframe(md, catch_exception, **kwargs)
    core_end = timer()
    if obj is None:
        return obj

    prep_start = timer()

    # restrict to the requested date range / window
    date_column = '_date' if md['date_partition'] else md['date_column']
    obj = dataframe.filter_by_date(
        obj, date_column, md['date_start'], md['date_end'], md['date_window'])

    # partition and sorting (hmmm, needed?)
    if date_column and date_column in obj.columns:
        obj = obj.repartition(date_column)

    if '_updated' in obj.columns:
        obj = obj.sortWithinPartitions(F.desc('_updated'))

    num_rows = obj.count()
    num_cols = len(obj.columns)

    obj = dataframe.cache(obj, md['cache'])
    prep_end = timer()

    # log the read, with timings split into core (the actual load) and prep
    log_data = {
        'md': md,
        'mode': kwargs.get('mode', md.get('options', {}).get('mode')),
        'records': num_rows,
        'columns': num_cols,
        'time': prep_end - core_start,
        'time_core': core_end - core_start,
        'time_prep': prep_end - prep_start
    }
    logging.info(log_data) if obj is not None else logging.error(log_data)

    obj.__name__ = path
    return obj
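# load_plus() mirrors load() but resolves metadata through
# Resource(path, provider, **kwargs) instead of the rootdir/metadata lookup.
# A hedged usage sketch, with the same illustrative alias as above:
#
#   df = engine.load_plus('sales.daily')
#   print(df.__name__)   # the original path argument is attached to the result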