def test_batch_gen():
    '''Sanity check the batch_gen API and its batching arithmetic.'''
    from metriqueu.utils import batch_gen

    # test arg signature, aka API
    try:
        next(batch_gen(None, 1))
    except StopIteration:
        pass

    assert len(next(batch_gen([1], 1))) == 1
    assert len(next(batch_gen([1, 2], 1))) == 1
    assert len(next(batch_gen([1, 2], 2))) == 2

    assert len(tuple(batch_gen([1, 2], 1))) == 2
    assert len(tuple(batch_gen([1, 2], 2))) == 1

    assert len(tuple(batch_gen(range(50), 2))) == 25
    assert len(tuple(batch_gen(range(50), 5))) == 10

    assert len(tuple(batch_gen(range(100000), 1))) == 100000
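# For reference, a minimal sketch of what metriqueu.utils.batch_gen is assumed
# to do, derived only from the assertions above: yield the input in lists of
# at most `size` items, and yield nothing for a None/empty input. This is an
# illustration of the expected behavior, not the library's implementation.
def _batch_gen_sketch(data, size):
    if not data:
        return
    data = list(data)
    for i in range(0, len(data), size):
        yield data[i:i + size]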
def get_objects(self, force=None, last_update=None, parse_timestamp=None,
                save=False, autosnap=True):
    '''
    Extract routine for SQL based cubes.

    :param force: force querying all objects (True) or only those
                  passed in as a list
    :param last_update: manual override for the 'changed since' date
    :param parse_timestamp: flag to convert timestamp timezones in-line
    '''
    logger.debug('Fetching Objects - Current Values')
    objects = []
    start = utcnow()
    # determine which oids we will query
    oids = self._delta_force(force, last_update, parse_timestamp)

    # set the 'index' of sql columns so we can extract out
    # the sql rows and know which column maps to which field
    field_order = tuple(self.fields)

    max_workers = self.config.max_workers
    batch_size = self.config.sql_batch_size
    if max_workers > 1:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            futures = []
            kwargs = self.config
            kwargs.pop('cube', None)  # if in self.config, ignore it
            for batch in batch_gen(oids, batch_size):
                f = ex.submit(get_objects, cube=self._cube, oids=batch,
                              field_order=field_order, start=start,
                              save=save, cube_name=self.name,
                              autosnap=autosnap, **kwargs)
                futures.append(f)
            objects = []
            for future in as_completed(futures):
                try:
                    objs = future.result()
                    objects.extend(objs)
                except Exception as e:
                    tb = traceback.format_exc()
                    logger.error('Extract Error: %s\n%s' % (e, tb))
                    del tb, e
    else:
        # respect the global batch size, even if sql batch
        # size is not set
        for batch in batch_gen(oids, batch_size):
            objs = self._get_objects(oids=batch, field_order=field_order,
                                     start=start, save=save,
                                     autosnap=autosnap)
            objects.extend(objs)
    logger.debug('... current values objects get - done')
    return objects
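# Illustrative call patterns for get_objects, based on the docstring above
# (the `cube` instance here is hypothetical):
#
#   cube.get_objects()                 # delta run: only oids changed since the last update
#   cube.get_objects(force=True)       # re-query every object
#   cube.get_objects(force=[1, 2, 3])  # query only the listed oids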
def get_full_history(self, force=None, last_update=None,
                     parse_timestamp=None, save=False, autosnap=False):
    '''
    Some fields change depending on when activity_import runs (e.g.
    "last_updated" style fields whose changes are not tracked in the
    activity history), so object hashes never match those of previous
    runs. To avoid duplicate states, all existing object states are
    removed and imported fresh.
    '''
    logger.debug('Extracting Objects - Full History')
    oids = self._delta_force(force, last_update, parse_timestamp)
    logger.debug("Updating %s objects" % len(oids))

    sql_batch_size = self.config.sql_batch_size
    max_workers = self.config.max_workers
    objects = []
    if max_workers > 1:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            kwargs = self.config
            kwargs.pop('cube', None)  # ends up in config; ignore it
            # map each future back to its batch so a failure can be
            # reported against the oids that actually failed
            futures = {}
            for batch in batch_gen(oids, sql_batch_size):
                f = ex.submit(get_full_history, cube=self._cube,
                              oids=batch, save=save, cube_name=self.name,
                              autosnap=autosnap, **kwargs)
                futures[f] = batch
            for future in as_completed(futures):
                try:
                    objs = future.result()
                    objects.extend(objs)
                except Exception as e:
                    tb = traceback.format_exc()
                    logger.error(
                        'Activity Import Error: %s\n%s\n%s' % (
                            e, tb, futures[future]))
                    del tb, e
    else:
        for batch in batch_gen(oids, sql_batch_size):
            objs = self._activity_get_objects(oids=batch, save=save,
                                              autosnap=autosnap)
            objects.extend(objs)
    return objects
def insert_bulk(_cube, docs, size=-1):
    # little reason to batch insert...
    # http://stackoverflow.com/questions/16753366
    # and after testing, it seems splitting things
    # up more slows things down.
    if size <= 0:
        _cube.insert(docs, manipulate=False)
    else:
        for batch in batch_gen(docs, size):
            _cube.insert(batch, manipulate=False)
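# Illustrative usage of insert_bulk; `_cube` stands for any pymongo-style
# collection exposing insert(docs, manipulate=False):
#
#   docs = [{'_oid': i, 'value': i * 2} for i in range(1000)]
#   insert_bulk(_cube, docs)            # single bulk insert (default)
#   insert_bulk(_cube, docs, size=100)  # insert in batches of 100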
def activity_import(self, oids=None, logfile=None, cube=None, owner=None):
    '''
    WARNING: Do NOT run extract while activity import is running;
    it might result in data corruption.

    Run the activity import for a given cube, if the cube supports it.

    Essentially, recreate object histories from a cube's
    'activity history' table row data, and dump those pre-calculated
    historical state object copies into the timeline.

    :param oids:
        - None: import for all ids
        - list of ids: import for the ids in the list
    '''
    logger_orig = self.logger
    if logfile:
        self.debug_set(self.config.debug, False, logfile)

    if oids is None:
        oids = self.find('_oid == exists(True)', fields='_oid',
                         date='~', cube=cube, owner=owner)
        oids = sorted(oids._oid.unique())

    max_workers = self.config.max_workers
    batch_size = self.config.batch_size
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_activity_import, self, oids=batch,
                             cube=cube, owner=owner)
                   for batch in batch_gen(oids, batch_size)]
        saved = []
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                tb = traceback.format_exc()
                self.logger.error('Activity Import Error: %s\n%s' % (e, tb))
                del tb
            else:
                saved.extend(result)
                self.logger.info(
                    '%i of %i extracted' % (len(saved), len(oids)))

    failed = set(oids) - set(saved)
    result = {'saved': sorted(saved), 'failed': sorted(failed)}
    self.logger.debug(result)
    # reset logger
    self.logger = logger_orig
    return result
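# Illustrative call patterns for activity_import, based on the docstring
# above (the `client` instance is hypothetical):
#
#   client.activity_import()                      # rebuild history for all oids
#   client.activity_import(oids=[101, 102, 103])  # rebuild history for selected oids
#   client.activity_import(logfile='activity.log')  # assumed: route debug output to a file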
def save(self, objects, cube=None, owner=None):
    '''
    Save a list of objects to the given metrique.cube.

    Returns a list of object ids (_id|_oid) saved.

    :param list objects: list of dictionary-like objects to be stored
    :param string cube: cube name
    :param string owner: username of cube owner
    :rtype: list - list of object ids saved
    '''
    batch_size = self.config.batch_size

    olen = len(objects) if objects else None
    if not olen:
        self.logger.info("... No objects to save")
        return []
    else:
        self.logger.info("Saving %s objects" % len(objects))

    # get 'now' utc timezone aware datetime object
    # FIXME IMPORTANT timestamp should really be taken before extract
    now = utcnow(tz_aware=True)

    cmd = self.get_cmd(owner, cube, 'save')
    if (batch_size <= 0) or (olen <= batch_size):
        saved = self._post(cmd, objects=objects, mtime=now)
    else:
        saved = []
        k = 0
        for batch in batch_gen(objects, batch_size):
            _saved = self._post(cmd, objects=batch, mtime=now)
            saved.extend(_saved)
            k += len(batch)
            self.logger.info("... %i of %i posted" % (k, olen))

    if saved:
        objects = [o for o in objects if o['_oid'] in saved]
        # journal locally as well
        if self.config.journal:
            # journal objects locally if any were saved
            for o in sorted(objects, key=itemgetter('_oid')):
                dump = {'owner': self.owner, 'name': self.name,
                        'when': utcnow(), 'object': o}
                ojson = json.dumps(dump, default=json_encode,
                                   ensure_ascii=True,
                                   encoding="ISO-8859-1")
                self.journal.debug(ojson)

    self.logger.info("... Saved %s NEW docs" % len(saved))
    return sorted(saved)
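# Illustrative usage of save (the `client` instance is hypothetical):
#
#   objects = [{'_oid': 1, 'x': 10}, {'_oid': 2, 'x': 20}]
#   saved = client.save(objects)  # sorted list of the object ids actually saved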
def _save_default(self, objects, start_time, owner, cube, autosnap):
    batch_size = self.config.batch_size
    cmd = self.get_cmd(owner, cube, 'save')
    olen = len(objects) if objects else None
    if (batch_size <= 0) or (olen <= batch_size):
        saved = self._post(cmd, objects=objects, start_time=start_time,
                           autosnap=autosnap)
    else:
        saved = []
        k = 0
        for batch in batch_gen(objects, batch_size):
            _saved = self._post(cmd, objects=batch, start_time=start_time,
                                autosnap=autosnap)
            saved.extend(_saved)
            k += len(batch)
    return saved
def _extract_threaded(self, id_delta, field_order):
    with ThreadPoolExecutor(max_workers=self.config.max_workers) as ex:
        futures = [ex.submit(self._extract, batch, field_order)
                   for batch in batch_gen(id_delta, self.config.batch_size)]
        saved = []
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                tb = traceback.format_exc()
                self.logger.error('Extract Error: %s\n%s' % (e, tb))
                del tb
            else:
                saved.extend(result)
                self.logger.info(
                    '%i of %i extracted' % (len(saved), len(id_delta)))

    failed = set(id_delta) - set(saved)
    result = {'saved': sorted(saved), 'failed': sorted(failed)}
    self.logger.debug(result)
    return result
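# The submit/as_completed pattern shared by the methods above, reduced to a
# self-contained sketch. The worker and the inline batching helper are
# stand-ins, not part of this codebase.
from concurrent.futures import ThreadPoolExecutor, as_completed


def _noop_worker(batch):
    # stand-in for self._extract / _activity_import / etc.
    return list(batch)


def run_batched(oids, batch_size=100, max_workers=4, worker=_noop_worker):
    def batches(seq, size):
        seq = list(seq)
        for i in range(0, len(seq), size):
            yield seq[i:i + size]

    saved = []
    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(worker, b) for b in batches(oids, batch_size)]
        for future in as_completed(futures):
            saved.extend(future.result())
    return saved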