Example #1
    def _run_object_import(self, force, last_update, flush, full_history):
        workers = self.lconfig.get('workers')
        # if we're using multiple workers, break the oids up by the worker
        # batch size; each worker then breaks its batch into smaller
        # sql batch size batches. Otherwise (single threaded), use the
        # sql batch size directly.
        w_batch_size = self.lconfig.get('worker_batch_size')
        s_batch_size = self.lconfig.get('batch_size')

        # store the time right before the ETL job starts, so that on the
        # next run we can catch delta changes between the next ETL start
        # and this one
        new_delta_ts = utcnow()
        # get list of oids which we plan to update
        oids, save_delta_ts = self._delta_force(force, last_update)

        msg = 'Getting Full History' if full_history else \
            'Getting Objects - Current Values'
        if HAS_JOBLIB and workers > 1:
            logger.debug('%s (%s@%s)' % (msg, workers, w_batch_size))
            runner = Parallel(n_jobs=workers)
            func = delayed(get_objects)
            result = runner(func(
                cube=self._cube, oids=batch,
                full_history=full_history, flush=flush,
                cube_name=self.name, config=self.config,
                config_file=self.config_file,
                config_key=self.config_key,
                container=type(self.objects),
                container_config=self.container_config,
                proxy=type(self.proxy),
                proxy_config=self.proxy_config)
                for batch in batch_gen(oids, w_batch_size))
            # merge list of lists (batched) into single list
            result = [i for l in result for i in l]
            if not flush:
                self.objects.extend(result)
        else:
            logger.debug('%s (%s@%s)' % (msg, workers, s_batch_size))
            result = []
            _s = 0
            for i, batch in enumerate(batch_gen(oids, s_batch_size)):
                _e = _s + s_batch_size
                logger.debug('batch %s: %s-%s of %s' % (i, _s, _e, len(oids)))
                if full_history:
                    _ = self._activity_get_objects(oids=batch, flush=flush)
                else:
                    _ = self._get_objects(oids=batch, flush=flush)
                result.extend(_)
                _s = _e

        # save new delta_ts:
        if flush and save_delta_ts:
            self.container.proxy.update_delta_ts(new_delta_ts)

        if flush:
            return result
        else:
            return self
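The multi-worker branch above follows a common joblib fan-out pattern: split the oids into worker-sized batches, dispatch one delayed get_objects call per batch, then flatten the per-batch result lists. Below is a minimal, self-contained sketch of that pattern, assuming a hypothetical process_batch worker and a local chunks helper in place of metrique's get_objects and batch_gen; it illustrates the idiom, not metrique's actual code.

from joblib import Parallel, delayed


def chunks(seq, size):
    # stand-in for metrique's batch_gen: yield fixed-size slices of seq
    for i in range(0, len(seq), size):
        yield seq[i:i + size]


def process_batch(batch):
    # hypothetical worker; in the code above this role is played by
    # get_objects(), which re-batches and fetches/flushes the objects
    return [oid * 2 for oid in batch]


if __name__ == '__main__':
    oids = list(range(10))
    workers, w_batch_size = 2, 3
    runner = Parallel(n_jobs=workers)
    # one delayed call per worker-sized batch of oids
    result = runner(delayed(process_batch)(batch)
                    for batch in chunks(oids, w_batch_size))
    # merge the list of per-batch lists into a single flat list
    result = [item for sublist in result for item in sublist]
    print(result)  # [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]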
Example #2
def test_batch_gen():
    '''
    Verify that batch_gen handles None/empty input and splits an
    iterable into fixed-size batches.
    '''
    from metrique.utils import batch_gen

    try:
        next(batch_gen(None, 1))
    except StopIteration:
        pass
    else:
        assert False

    assert len(next(batch_gen([1], 1))) == 1

    assert len(next(batch_gen([1, 2], 1))) == 1
    assert len(next(batch_gen([1, 2], 2))) == 2

    assert len(tuple(batch_gen([1, 2], 1))) == 2
    assert len(tuple(batch_gen([1, 2], 2))) == 1

    assert len(tuple(batch_gen(range(50), 2))) == 25
    assert len(tuple(batch_gen(range(50), 5))) == 10

    assert len(tuple(batch_gen(range(100000), 1))) == 100000
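The assertions above pin down batch_gen's contract: a None (or empty) input yields no batches, every batch has at most the requested size, and an iterable of n items produces ceil(n / size) batches. A minimal generator satisfying those assertions might look like the sketch below; this is an assumed illustration, not metrique.utils.batch_gen's actual implementation.

def batch_gen(data, batch_size):
    # assumed sketch, not metrique's code: yield fixed-size slices of data
    if not data:
        return  # None or empty input yields no batches
    data = list(data)  # accept any iterable, e.g. range()
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]


# the sketch satisfies the same assertions as the test above, e.g.:
assert len(tuple(batch_gen(range(50), 5))) == 10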
Example #3
def get_objects(cube,
                oids,
                full_history,
                flush=False,
                cube_name=None,
                config=None,
                config_file=None,
                config_key=None,
                container=None,
                container_config=None,
                proxy=None,
                proxy_config=None,
                **kwargs):
    # force a single worker to avoid 'nested' (invalid) joblib runs.
    kwargs['workers'] = 1
    m = pyclient(cube=cube,
                 name=cube_name,
                 config=config,
                 config_file=config_file,
                 config_key=config_key,
                 container=container,
                 container_config=container_config,
                 proxy=proxy,
                 proxy_config=proxy_config,
                 **kwargs)
    results = []
    batch_size = m.lconfig.get('batch_size')
    for batch in batch_gen(oids, batch_size):
        if full_history:
            _ = m._activity_get_objects(oids=batch, flush=flush)
        else:
            _ = m._get_objects(oids=batch, flush=flush)
        results.extend(_)
    return results
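Note the two-level batching: the caller splits the oids by worker_batch_size before dispatching, and each worker re-splits its share by the (typically smaller) batch_size here. A quick standalone sketch with made-up sizes shows how the two levels relate (chunks again stands in for batch_gen):

def chunks(seq, size):
    # stand-in for batch_gen: yield fixed-size slices of seq
    for i in range(0, len(seq), size):
        yield seq[i:i + size]


oids = list(range(100))
worker_batch_size, batch_size = 25, 10  # illustrative sizes, not defaults
for w, worker_batch in enumerate(chunks(oids, worker_batch_size)):
    # each "worker" receives 25 oids and re-splits them by batch_size
    inner = [len(b) for b in chunks(worker_batch, batch_size)]
    print('worker %s processes batches of %s' % (w, inner))
    # -> worker 0 processes batches of [10, 10, 5], etc.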
Example #4
def get_objects(cube, oids, full_history, flush=False, cube_name=None,
                config=None, config_file=None, config_key=None,
                container=None, container_config=None,
                proxy=None, proxy_config=None, **kwargs):
    # force a single worker to avoid 'nested' (invalid) joblib runs.
    kwargs['workers'] = 1
    m = pyclient(cube=cube, name=cube_name, config=config,
                 config_file=config_file, config_key=config_key,
                 container=container, container_config=container_config,
                 proxy=proxy, proxy_config=proxy_config, **kwargs)
    results = []
    batch_size = m.lconfig.get('batch_size')
    for batch in batch_gen(oids, batch_size):
        if full_history:
            _ = m._activity_get_objects(oids=batch, flush=flush)
        else:
            _ = m._get_objects(oids=batch, flush=flush)
        results.extend(_)
    return results
Example #5
    def _run_object_import(self, force, last_update, flush, full_history):
        workers = self.lconfig.get('workers')
        # if we're using multiple workers, break the oids up by the worker
        # batch size; each worker then breaks its batch into smaller
        # sql batch size batches. Otherwise (single threaded), use the
        # sql batch size directly.
        w_batch_size = self.lconfig.get('worker_batch_size')
        s_batch_size = self.lconfig.get('batch_size')

        # store the time right before the ETL job starts, so that on the
        # next run we can catch delta changes between the next ETL start
        # and this one
        new_delta_ts = utcnow()
        # get list of oids which we plan to update
        oids, save_delta_ts = self._delta_force(force, last_update)

        msg = 'Getting Full History' if full_history else \
            'Getting Objects - Current Values'
        if HAS_JOBLIB and workers > 1:
            logger.debug('%s (%s@%s)' % (msg, workers, w_batch_size))
            runner = Parallel(n_jobs=workers)
            func = delayed(get_objects)
            result = runner(
                func(cube=self._cube,
                     oids=batch,
                     full_history=full_history,
                     flush=flush,
                     cube_name=self.name,
                     config=self.config,
                     config_file=self.config_file,
                     config_key=self.config_key,
                     container=type(self.objects),
                     container_config=self.container_config,
                     proxy=type(self.proxy),
                     proxy_config=self.proxy_config)
                for batch in batch_gen(oids, w_batch_size))
            # merge list of lists (batched) into single list
            result = [i for l in result for i in l]
            if not flush:
                self.objects.extend(result)
        else:
            logger.debug('%s (%s@%s)' % (msg, workers, s_batch_size))
            result = []
            _s = 0
            for i, batch in enumerate(batch_gen(oids, s_batch_size)):
                _e = _s + s_batch_size
                logger.debug('batch %s: %s-%s of %s' % (i, _s, _e, len(oids)))
                if full_history:
                    _ = self._activity_get_objects(oids=batch, flush=flush)
                else:
                    _ = self._get_objects(oids=batch, flush=flush)
                result.extend(_)
                _s = _e

        # save new delta_ts:
        if flush and save_delta_ts:
            self.container.proxy.update_delta_ts(new_delta_ts)

        if flush:
            return result
        else:
            return self