def _run_object_import(self, force, last_update, flush, full_history): workers = self.lconfig.get('workers') # if we're using multiple workers, break the oids # according to worker batchsize, then each worker will # break the batch into smaller sql batch size batches # otherwise, single threaded, use sql batch size w_batch_size = self.lconfig.get('worker_batch_size') s_batch_size = self.lconfig.get('batch_size') # store the time right before the ETL job starts, # so next run, we can catch delta changes b/w # next ETL start and previous (this) new_delta_ts = utcnow() # get list of oids which we plan to update oids, save_delta_ts = self._delta_force(force, last_update) msg = 'Getting Full History' if full_history else \ 'Getting Objects - Current Values' if HAS_JOBLIB and workers > 1: logger.debug('%s (%s@%s)' % (msg, workers, w_batch_size)) runner = Parallel(n_jobs=workers) func = delayed(get_objects) result = runner(func( cube=self._cube, oids=batch, full_history=full_history, flush=flush, cube_name=self.name, config=self.config, config_file=self.config_file, config_key=self.config_key, container=type(self.objects), container_config=self.container_config, proxy=type(self.proxy), proxy_config=self.proxy_config) for batch in batch_gen(oids, w_batch_size)) # merge list of lists (batched) into single list result = [i for l in result for i in l] if not flush: self.objects.extend(result) else: logger.debug('%s (%s@%s)' % (msg, workers, w_batch_size)) result = [] _s = 0 for i, batch in enumerate(batch_gen(oids, s_batch_size)): _e = _s + s_batch_size logger.debug('batch %s: %s-%s of %s' % (i, _s, _e, len(oids))) if full_history: _ = self._activity_get_objects(oids=batch, flush=flush) else: _ = self._get_objects(oids=batch, flush=flush) result.extend(_) _s = _e # save new delta_ts: if flush and save_delta_ts: self.container.proxy.update_delta_ts(new_delta_ts) if flush: return result else: return self
def test_batch_gen():
    """Verify batch_gen splits iterables into fixed-size batches."""
    from metrique.utils import batch_gen

    # a None iterable must produce no batches at all
    exhausted = False
    try:
        next(batch_gen(None, 1))
    except StopIteration:
        exhausted = True
    assert exhausted

    # each yielded batch holds at most batch_size items
    assert len(next(batch_gen([1], 1))) == 1
    assert len(next(batch_gen([1, 2], 1))) == 1
    assert len(next(batch_gen([1, 2], 2))) == 2

    # total batch count == ceil(len(iterable) / batch_size)
    cases = [
        ([1, 2], 1, 2),
        ([1, 2], 2, 1),
        (range(50), 2, 25),
        (range(50), 5, 10),
        (range(100000), 1, 100000),
    ]
    for seq, size, expected_batches in cases:
        assert len(tuple(batch_gen(seq, size))) == expected_batches
def get_objects(cube, oids, full_history, flush=False, cube_name=None,
                config=None, config_file=None, config_key=None,
                container=None, container_config=None, proxy=None,
                proxy_config=None, **kwargs):
    """Worker entry point: pull objects for a batch of oids.

    Builds a fresh pyclient in this (sub)process, then walks the given
    oids in batch_size chunks, pulling either full activity history or
    current values, and returns the collected objects as a single list.
    """
    # force a single worker to avoid 'nested' (invalid) joblib runs.
    kwargs['workers'] = 1
    client = pyclient(cube=cube, name=cube_name, config=config,
                      config_file=config_file, config_key=config_key,
                      container=container,
                      container_config=container_config,
                      proxy=proxy, proxy_config=proxy_config, **kwargs)
    # choose the puller once; it doesn't change per batch
    if full_history:
        pull = client._activity_get_objects
    else:
        pull = client._get_objects
    collected = []
    chunk_size = client.lconfig.get('batch_size')
    for chunk in batch_gen(oids, chunk_size):
        collected.extend(pull(oids=chunk, flush=flush))
    return collected
def _run_object_import(self, force, last_update, flush, full_history):
    """Fetch this cube's objects, batched, optionally in parallel.

    :param force: passed through to self._delta_force to force which
                  oids get (re)loaded
    :param last_update: passed through to self._delta_force as the
                        lower bound for delta loads
    :param flush: if True, flush pulled objects and return the flushed
                  results; if False, accumulate them on self.objects
                  and return self
    :param full_history: if True, pull complete activity history;
                         otherwise pull only current object values
    """
    workers = self.lconfig.get('workers')
    # if we're using multiple workers, break the oids
    # according to worker batchsize, then each worker will
    # break the batch into smaller sql batch size batches
    # otherwise, single threaded, use sql batch size
    w_batch_size = self.lconfig.get('worker_batch_size')
    s_batch_size = self.lconfig.get('batch_size')
    # store the time right before the ETL job starts,
    # so next run, we can catch delta changes b/w
    # next ETL start and previous (this)
    new_delta_ts = utcnow()
    # get list of oids which we plan to update
    oids, save_delta_ts = self._delta_force(force, last_update)
    msg = 'Getting Full History' if full_history else \
        'Getting Objects - Current Values'
    if HAS_JOBLIB and workers > 1:
        logger.debug('%s (%s@%s)' % (msg, workers, w_batch_size))
        # joblib spawns separate processes; pass types (not instances)
        # for container/proxy so each worker can rebuild its own
        runner = Parallel(n_jobs=workers)
        func = delayed(get_objects)
        result = runner(
            func(cube=self._cube, oids=batch, full_history=full_history,
                 flush=flush, cube_name=self.name, config=self.config,
                 config_file=self.config_file,
                 config_key=self.config_key,
                 container=type(self.objects),
                 container_config=self.container_config,
                 proxy=type(self.proxy), proxy_config=self.proxy_config)
            for batch in batch_gen(oids, w_batch_size))
        # merge list of lists (batched) into single list
        result = [i for l in result for i in l]
        if not flush:
            # keep pulled objects on this instance instead of flushing
            self.objects.extend(result)
    else:
        # BUGFIX: log the sql batch size actually used in this branch;
        # previously logged the (irrelevant) worker batch size
        logger.debug('%s (%s@%s)' % (msg, workers, s_batch_size))
        result = []
        _s = 0
        for i, batch in enumerate(batch_gen(oids, s_batch_size)):
            # BUGFIX: advance by the real batch length so the final
            # (possibly short) batch doesn't overstate the range logged
            _e = _s + len(batch)
            logger.debug('batch %s: %s-%s of %s' % (i, _s, _e, len(oids)))
            if full_history:
                _ = self._activity_get_objects(oids=batch, flush=flush)
            else:
                _ = self._get_objects(oids=batch, flush=flush)
            result.extend(_)
            _s = _e
    # save new delta_ts:
    # only persist the new delta timestamp when we actually flushed
    # and _delta_force told us this run should advance the marker
    if flush and save_delta_ts:
        self.container.proxy.update_delta_ts(new_delta_ts)
    if flush:
        return result
    else:
        return self