Example #1
import logging

# module-level registry mapping engine names and aliases to classes
_engines = {}

def register(cls, alias):
    global _engines

    _engines[cls.__name__] = cls
    _engines[alias] = cls

    logging.info('Registering names %s, %s for class %s',
                 cls.__name__, alias, cls)
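A minimal usage sketch of the registry above (the DummyEngine class and
the 'dummy' alias are hypothetical, used here only for illustration):

class DummyEngine:
    pass

register(DummyEngine, 'dummy')

# the class is now reachable both by its class name and by its alias
assert _engines['DummyEngine'] is DummyEngine
assert _engines['dummy'] is DummyEngine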
Example #2
    def set_submit_args(self):
        submit_args = ''

        for k in self.submit.keys() - {'conf'}:
            s = ",".join(self.submit[k])
            submit_args += f' --{k} {s}' if s else ''

        # submit config options one by one
        for c in self.submit['conf']:
            submit_args += f' --conf {c[0]}={c[1]}'

        # print a debug summary of all submit options
        # (logging.notice is a project-specific log level, not part of
        # the standard library's logging module)
        for k in self.submit.keys():
            if self.submit[k]:
                logging.notice(f'Configuring {k}:')
                for e in self.submit[k]:
                    v = e
                    if isinstance(e, tuple):
                        if len(e) > 1 and str(e[0]).endswith('.key'):
                            e = (e[0], '****** (redacted)')
                        v = ' : '.join(list([str(x) for x in e]))
                    if k == 'conf':
                        logging.info(f'  -  {v}')
                    else:
                        logging.notice(f'  -  {v}')

        # set the PYSPARK_SUBMIT_ARGS env variable;
        # pyspark expects the value to end with 'pyspark-shell'
        submit_args = f'{submit_args} pyspark-shell'
        os.environ['PYSPARK_SUBMIT_ARGS'] = submit_args
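For context, pyspark reads PYSPARK_SUBMIT_ARGS when it launches the JVM
gateway, so set_submit_args() must run before the first SparkContext or
SparkSession is created. A minimal sketch (the memory setting is a
made-up example):

import os

os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--conf spark.executor.memory=4g pyspark-shell')

from pyspark.sql import SparkSession

# the gateway launched here picks up PYSPARK_SUBMIT_ARGS
spark = SparkSession.builder.getOrCreate()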
Example #3
    def save_plus(self, obj, path=None, provider=None, **kwargs):
        md = Resource(path, provider, **kwargs)

        prep_start = timer()
        options = md['options'] or {}

        if md['date_partition'] and md['date_column']:
            tzone = 'UTC' if self._timestamps == 'naive' else self._timezone
            obj = dataframe.add_datetime_columns(obj,
                                                 column=md['date_column'],
                                                 tzone=tzone)
            kwargs['partitionBy'] = ['_date'] + kwargs.get(
                'partitionBy', options.get('partitionBy', []))

        if md['update_column']:
            obj = dataframe.add_update_column(obj, tzone=self._timezone)

        if md['hash_column']:
            obj = dataframe.add_hash_column(obj,
                                            cols=md['hash_column'],
                                            exclude_cols=[
                                                '_date', '_datetime',
                                                '_updated', '_hash', '_state'
                                            ])

        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                       md['date_end'], md['date_window'])

        obj = dataframe.cache(obj, md['cache'])

        num_rows = obj.count()
        num_cols = len(obj.columns)

        # force 1 file per partition, just before saving
        obj = obj.repartition(1, *kwargs['partitionBy']) if kwargs.get(
            'partitionBy') else obj.repartition(1)
        # obj = obj.coalesce(1)

        prep_end = timer()

        core_start = timer()
        result = self.save_dataframe(obj, md, **kwargs)
        core_end = timer()

        log_data = {
            'md': dict(md),
            'mode': kwargs.get('mode', options.get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': core_end - prep_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }

        if result:
            logging.info(log_data)
        else:
            logging.error(log_data)

        return result
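The timer() calls above are presumably timeit.default_timer, the usual
choice for wall-clock interval measurements. A self-contained sketch of
the same prep/core timing pattern:

from timeit import default_timer as timer

prep_start = timer()
# ... prepare the dataframe (add columns, filter, repartition) ...
prep_end = timer()

core_start = timer()
# ... perform the actual save ...
core_end = timer()

log_data = {
    'time': core_end - prep_start,       # total elapsed
    'time_core': core_end - core_start,  # save only
    'time_prep': prep_end - prep_start,  # preparation only
}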
Example #4
    def load_plus(self,
                  path=None,
                  provider=None,
                  catch_exception=True,
                  **kwargs):
        md = Resource(path, provider, **kwargs)

        core_start = timer()
        obj = self.load_dataframe(md, catch_exception, **kwargs)
        core_end = timer()
        if obj is None:
            return obj

        prep_start = timer()
        date_column = '_date' if md['date_partition'] else md['date_column']
        obj = dataframe.filter_by_date(obj, date_column, md['date_start'],
                                       md['date_end'], md['date_window'])

        # repartition by the date column and sort within partitions so
        # the most recently updated records come first
        if date_column and date_column in obj.columns:
            obj = obj.repartition(date_column)

        if '_updated' in obj.columns:
            obj = obj.sortWithinPartitions(F.desc('_updated'))

        num_rows = obj.count()
        num_cols = len(obj.columns)

        obj = dataframe.cache(obj, md['cache'])

        prep_end = timer()

        log_data = {
            'md': md,
            'mode': kwargs.get('mode',
                               md.get('options', {}).get('mode')),
            'records': num_rows,
            'columns': num_cols,
            'time': prep_end - core_start,
            'time_core': core_end - core_start,
            'time_prep': prep_end - prep_start
        }
        # obj cannot be None here (the early return above covers that case)
        logging.info(log_data)

        obj.__name__ = path
        return obj
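A hypothetical call site (the engine instance, path and provider names
are made up for illustration):

df = engine.load_plus(path='events', provider='local')
if df is not None:
    df.show()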
Example #5
    def save_log(self, md, options, ts_start):
        ts_end = timer()

        log_data = {'md': md, 'options': options, 'time': ts_end - ts_start}
        logging.info('save', extra=log_data)
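A sketch of how save_log() would be driven from another method of the
same class (timer is assumed to be timeit.default_timer, consistent
with the previous examples; md and options are placeholders):

from timeit import default_timer as timer

ts_start = timer()
# ... perform the save ...
self.save_log(md, options, ts_start)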
Example #6
def bar():
    logging.info('bar')
Example #7
def foo():
    logging.info('foo')
    bar()
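As written, foo() and bar() log at INFO level, which the root logger
drops by default (the stdlib default threshold is WARNING). A minimal
sketch that makes the messages visible:

import logging

logging.basicConfig(level=logging.INFO)

foo()
# INFO:root:foo
# INFO:root:bar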