Example #1
def test_batch_gen():
    '''Sanity-check the batching behaviour of metriqueu.utils.batch_gen.'''
    from metriqueu.utils import batch_gen

    # test arg signature, aka API

    # a None source must not raise anything other than StopIteration
    try:
        next(batch_gen(None, 1))
    except StopIteration:
        pass

    assert len(next(batch_gen([1], 1))) == 1

    assert len(next(batch_gen([1, 2], 1))) == 1
    assert len(next(batch_gen([1, 2], 2))) == 2

    assert len(tuple(batch_gen([1, 2], 1))) == 2
    assert len(tuple(batch_gen([1, 2], 2))) == 1

    assert len(tuple(batch_gen(range(50), 2))) == 25
    assert len(tuple(batch_gen(range(50), 5))) == 10

    assert len(tuple(batch_gen(range(100000), 1))) == 100000
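
The assertions above pin down the contract: batch_gen(data, batch_size) yields
successive chunks of at most batch_size items, and a None or empty source
yields no batches at all. A minimal sketch that satisfies those assertions
(not the actual metriqueu implementation, which may differ) could look like:

def batch_gen(data, batch_size):
    # yield successive slices of at most batch_size items;
    # a None or empty source yields nothing
    if not data:
        return
    data = list(data)
    for i in range(0, len(data), batch_size):
        yield data[i:i + batch_size]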
Example #2
    def get_objects(self, force=None, last_update=None, parse_timestamp=None,
                    save=False, autosnap=True):
        '''
        Extract routine for SQL-based cubes.

        :param force:
            flag to query for all objects (True) or only those passed in
            as a list
        :param last_update: manual override for the 'changed since' date
        :param parse_timestamp: flag to convert timestamp timezones in-line
        '''
        logger.debug('Fetching Objects - Current Values')
        objects = []
        start = utcnow()

        # determine which oids we will query
        oids = self._delta_force(force, last_update, parse_timestamp)

        # fix the order of the sql columns so that when we extract
        # the sql rows we know which column maps to which field
        field_order = tuple(self.fields)

        max_workers = self.config.max_workers
        batch_size = self.config.sql_batch_size
        if max_workers > 1:
            with ProcessPoolExecutor(max_workers=max_workers) as ex:
                futures = []
                # copy so popping 'cube' doesn't mutate the shared config
                kwargs = dict(self.config)
                kwargs.pop('cube', None)  # if in self.config, ignore it
                for batch in batch_gen(oids, batch_size):
                    f = ex.submit(get_objects, cube=self._cube, oids=batch,
                                  field_order=field_order, start=start,
                                  save=save, cube_name=self.name,
                                  autosnap=autosnap, **kwargs)
                    futures.append(f)
                for future in as_completed(futures):
                    try:
                        objs = future.result()
                        objects.extend(objs)
                    except Exception as e:
                        tb = traceback.format_exc()
                        logger.error('Extract Error: %s\n%s' % (e, tb))
                        del tb, e
        else:
            # respect the global batch size, even if sql batch
            # size is not set
            for batch in batch_gen(oids, batch_size):
                objs = self._get_objects(oids=batch, field_order=field_order,
                                         start=start, save=save, 
                                         autosnap=autosnap)
                objects.extend(objs)
        logger.debug('... current values objects get - done')
        return objects
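
The parallel branch above follows a common pattern: split the oids into
batches with batch_gen, submit one worker call per batch to a
ProcessPoolExecutor, then gather results with as_completed. A stripped-down
sketch of just that pattern; _square_batch, run_batched and the sample data
are made up for illustration, only batch_gen comes from metriqueu:

from concurrent.futures import ProcessPoolExecutor, as_completed

from metriqueu.utils import batch_gen


def _square_batch(batch):
    # stand-in worker; the real code submits get_objects() per batch
    return [x * x for x in batch]


def run_batched(values, batch_size=10, max_workers=2):
    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_square_batch, batch)
                   for batch in batch_gen(values, batch_size)]
        for future in as_completed(futures):
            # batches complete in any order; collect as they finish
            results.extend(future.result())
    return results

For example, run_batched(list(range(100))) returns the squares of 0..99,
grouped by whichever batch finished first.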
Example #3
    def get_full_history(self, force=None, last_update=None,
                         parse_timestamp=None, save=False, autosnap=False):
        '''
        Fields can change depending on when activity_import is run;
        "last_updated" type fields, for example, have no activity
        tracked for them, so we always end up with different hash
        values. To stay consistent, we always remove all existing
        object states and import fresh.
        '''
        logger.debug('Extracting Objects - Full History')

        oids = self._delta_force(force, last_update, parse_timestamp)
        logger.debug("Updating %s objects" % len(oids))

        sql_batch_size = self.config.sql_batch_size
        max_workers = self.config.max_workers
        objects = []
        if max_workers > 1:
            with ProcessPoolExecutor(max_workers=max_workers) as ex:
                # map each future to its batch so failures can be
                # reported against the oids that actually failed
                futures = {}
                # copy so popping 'cube' doesn't mutate the shared config
                kwargs = dict(self.config)
                kwargs.pop('cube', None)  # ends up in config; ignore it
                for batch in batch_gen(oids, sql_batch_size):
                    f = ex.submit(get_full_history, cube=self._cube,
                                  oids=batch, save=save, cube_name=self.name,
                                  autosnap=autosnap, **kwargs)
                    futures[f] = batch
                for future in as_completed(futures):
                    try:
                        objs = future.result()
                        objects.extend(objs)
                    except Exception as e:
                        tb = traceback.format_exc()
                        logger.error(
                            'Activity Import Error: %s\n%s\n%s' % (
                                e, tb, futures[future]))
                        del tb, e
        else:
            for batch in batch_gen(oids, sql_batch_size):
                objs = self._activity_get_objects(oids=batch, save=save,
                                                  autosnap=autosnap)
                objects.extend(objs)
        return objects
Example #4
def insert_bulk(_cube, docs, size=-1):
    # there is little reason to batch inserts...
    # http://stackoverflow.com/questions/16753366
    # ... and after testing, splitting things up
    # further seems to slow things down.
    if size <= 0:
        _cube.insert(docs, manipulate=False)
    else:
        for batch in batch_gen(docs, size):
            _cube.insert(batch, manipulate=False)
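
A hedged usage sketch for insert_bulk as defined above: _StubCube is a made-up
stand-in for the pymongo-backed cube collection, used only to show how the
size argument switches between one bulk insert and chunked inserts (assuming
batch_gen splits a 10-item list into chunks of 4, 4 and 2):

class _StubCube(object):
    # hypothetical stand-in for the real collection;
    # records how many docs each insert() call received
    def __init__(self):
        self.calls = []

    def insert(self, docs, manipulate=False):
        self.calls.append(len(docs))


docs = [{'_oid': i} for i in range(10)]

cube = _StubCube()
insert_bulk(cube, docs)          # size=-1: a single insert of all 10 docs
assert cube.calls == [10]

cube = _StubCube()
insert_bulk(cube, docs, size=4)  # batched: 4 + 4 + 2
assert cube.calls == [4, 4, 2]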
Example #5
def activity_import(self, oids=None, logfile=None, cube=None, owner=None):
    '''
    WARNING: Do NOT run extract while activity import is running;
             it might result in data corruption.
    Run the activity import for a given cube, if the cube supports it.

    Essentially, recreate object histories from
    a cube's 'activity history' table row data,
    and dump those pre-calculated historical
    state object copies into the timeline.

    :param oids:
        - None: import for all ids
        - list of ids: import for ids in the list
    '''
    # remember the current logger so it can be restored afterwards,
    # whether or not a logfile override is requested
    logger_orig = self.logger
    if logfile:
        self.debug_set(self.config.debug, False, logfile)

    if oids is None:
        oids = self.find('_oid == exists(True)', fields='_oid', date='~',
                         cube=cube, owner=owner)
        oids = sorted(oids._oid.unique())

    max_workers = self.config.max_workers
    batch_size = self.config.batch_size

    with ThreadPoolExecutor(max_workers=max_workers) as ex:
        futures = [ex.submit(_activity_import, self, oids=batch,
                             cube=cube, owner=owner)
                   for batch in batch_gen(oids, batch_size)]

    saved = []
    for future in as_completed(futures):
        try:
            result = future.result()
        except Exception as e:
            tb = traceback.format_exc()
            self.logger.error('Activity Import Error: %s\n%s' % (e, tb))
            del tb
        else:
            saved.extend(result)
            self.logger.info(
                '%i of %i imported' % (len(saved), len(oids)))
    failed = set(oids) - set(saved)
    result = {'saved': sorted(saved), 'failed': sorted(failed)}
    self.logger.debug(result)
    # reset logger
    self.logger = logger_orig
    return result
Example #6
def save(self, objects, cube=None, owner=None):
    '''
    Save a list of objects to the given metrique.cube.
    Returns a list of the object ids (_id|_oid) saved.

    :param list objects: list of dictionary-like objects to be stored
    :param string cube: cube name
    :param string owner: username of cube owner
    :rtype: list - list of object ids saved
    '''
    batch_size = self.config.batch_size

    olen = len(objects) if objects else None
    if not olen:
        self.logger.info("... No objects to save")
        return []
    else:
        self.logger.info("Saving %s objects" % len(objects))

    # get 'now' utc timezone aware datetime object
    # FIXME IMPORTANT timestamp should be really taken before extract
    now = utcnow(tz_aware=True)

    cmd = self.get_cmd(owner, cube, 'save')
    if (batch_size <= 0) or (olen <= batch_size):
        saved = self._post(cmd, objects=objects, mtime=now)
    else:
        saved = []
        k = 0
        for batch in batch_gen(objects, batch_size):
            _saved = self._post(cmd, objects=batch, mtime=now)
            saved.extend(_saved)
            k += len(batch)
            self.logger.info("... %i of %i posted" % (k, olen))

    if saved:
        objects = [o for o in objects if o['_oid'] in saved]
        # journal locally as well
        if self.config.journal:
            # journal objects locally if any were saved
            for o in sorted(objects, key=itemgetter('_oid')):
                dump = {'owner': self.owner, 'name': self.name,
                        'when': utcnow(), 'object': o}
                ojson = json.dumps(dump, default=json_encode,
                                   ensure_ascii=True,
                                   encoding="ISO-8859-1")
                self.journal.debug(ojson)
    self.logger.info("... Saved %s NEW docs" % len(saved))
    return sorted(saved)
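
The batched branch of save() reduces to a small loop: chunk the objects with
batch_gen, post each chunk, and log running progress. A minimal sketch of
that loop; post_in_batches is hypothetical and 'post' stands in for the real
self._post() call:

import logging

from metriqueu.utils import batch_gen

logger = logging.getLogger(__name__)


def post_in_batches(objects, batch_size, post):
    # 'post' stands in for self._post() and is expected to return
    # the list of object ids it actually saved
    saved, posted = [], 0
    for batch in batch_gen(objects, batch_size):
        saved.extend(post(batch))
        posted += len(batch)
        logger.info("... %i of %i posted", posted, len(objects))
    return saved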
Example #7
def _save_default(self, objects, start_time, owner, cube, autosnap):
    # post everything at once, or in batch_size chunks when the
    # object list is larger than a single batch
    batch_size = self.config.batch_size
    cmd = self.get_cmd(owner, cube, 'save')
    olen = len(objects) if objects else None
    if (batch_size <= 0) or (olen <= batch_size):
        saved = self._post(cmd, objects=objects, start_time=start_time,
                           autosnap=autosnap)
    else:
        saved = []
        k = 0
        for batch in batch_gen(objects, batch_size):
            _saved = self._post(cmd, objects=batch, start_time=start_time,
                                autosnap=autosnap)
            saved.extend(_saved)
            k += len(batch)
    return saved
Example #8
    def _extract_threaded(self, id_delta, field_order):
        with ThreadPoolExecutor(max_workers=self.config.max_workers) as ex:
            futures = [ex.submit(self._extract, batch, field_order)
                       for batch in batch_gen(id_delta,
                                              self.config.batch_size)]
        saved = []
        for future in as_completed(futures):
            try:
                result = future.result()
            except Exception as e:
                tb = traceback.format_exc()
                self.logger.error('Extract Error: %s\n%s' % (e, tb))
                del tb
            else:
                saved.extend(result)
                self.logger.info(
                    '%i of %i extracted' % (len(saved), len(id_delta)))
        failed = set(id_delta) - set(saved)
        result = {'saved': sorted(saved), 'failed': sorted(failed)}
        self.logger.debug(result)
        return result