Example #1
    # assumes module-level imports: from bson.objectid import ObjectId
    #                               from metriqued.utils import jsonhash
    def prepare_objects(self, _cube, objects, mtime):
        '''
        :param _cube: mongodb collection proxy for the target cube
        :param list objects: list of dicts that will be converted to mongodb docs
        :param int mtime: timestamp to apply as _start for objects

        Do some basic object validation and add a _start timestamp value.
        '''
        new_obj_hashes = []
        for obj in objects:
            _start = obj.pop('_start', None)
            _end = obj.pop('_end', None)

            if _end is not None and _start is None:
                self._raise(400, "objects with _end must have _start")
            if not _start:
                _start = mtime
            if not isinstance(_start, (int, float)):
                self._raise(400, "_start must be float/int")
            if not isinstance(_end, (int, float)) and _end is not None:
                self._raise(400, "_end must be float/int/None")

            if '_id' in obj:
                self._raise(400, "_id field CAN NOT be defined: %s" % obj)
            if '_hash' in obj:
                self._raise(400, "_hash field CAN NOT be defined: %s" % obj)
            if '_oid' not in obj:
                self._raise(400, "_oid field MUST be defined: %s" % obj)

            # hash the object (minus _start/_end)
            _hash = jsonhash(obj)
            obj['_hash'] = _hash
            if _end is None:
                new_obj_hashes.append(_hash)

            # add back _start and _end properties
            obj['_start'] = _start
            obj['_end'] = _end

            # we want to avoid serializing in and out later
            obj['_id'] = str(ObjectId())

        # FIXME: refactor this to split new_obj_hashes into batches;
        # a single mongodb $in lookup is bounded by the 16MB max
        # BSON size of the query spec doc (see the batching sketch
        # after this example); get the estimated size, as follows:
        # est_size_hashes = estimate_obj_size(_hashes)

        # Filter out objects whose most recent version did not change
        docs = _cube.find({'_hash': {'$in': new_obj_hashes},
                           '_end': None},
                          fields={'_hash': 1, '_id': 0})
        _dup_hashes = set(doc['_hash'] for doc in docs)
        objects = [obj for obj in objects if obj['_hash'] not in _dup_hashes]
        # wrap in list() so the result is a list under python 3 too,
        # where filter() is lazy
        objects = list(filter(None, objects))
        return objects
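
The FIXME above points at a real constraint: the whole query spec, including the $in array of hashes, must fit inside MongoDB's 16MB BSON document limit, so a very large new_obj_hashes list should be looked up in batches. Below is a minimal sketch of that refactor, assuming the same _cube.find interface used above; the function name and batch size are illustrative, not part of metriqued:

def find_dup_hashes(_cube, new_obj_hashes, batch_size=10000):
    '''Look up currently-open docs whose _hash is unchanged, querying in
    batches so each $in spec stays well under the 16MB BSON limit.'''
    dup_hashes = set()
    for i in range(0, len(new_obj_hashes), batch_size):
        batch = new_obj_hashes[i:i + batch_size]
        docs = _cube.find({'_hash': {'$in': batch}, '_end': None},
                          fields={'_hash': 1, '_id': 0})
        dup_hashes.update(doc['_hash'] for doc in docs)
    return dup_hashes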
Example #2
def test_jsonhash():
    from copy import copy
    from metriqued.utils import jsonhash

    dct = {'a': [3, 2, 1], 'z': ['a', 'c', 'b', 1], 'b': {1: [], 3: {}}}

    dct_sorted_z = copy(dct)
    dct_sorted_z['z'] = sorted(dct_sorted_z['z'])

    dct_diff = copy(dct)
    del dct_diff['z']

    DCT = '541d0fa961265d976d9a27e8632787875dc58406'
    DCT_SORTED_Z = 'ca4631674276933bd251bd4bc86372138a841a4b'
    DCT_DIFF = '07d6c518867fb6b6c77c0ec1d835fb800419fc24'

    assert dct != dct_sorted_z

    assert jsonhash(dct) == DCT
    assert jsonhash(dct_sorted_z) == DCT_SORTED_Z
    assert jsonhash(dct_diff) == DCT_DIFF

    # list sort order is part of an object's identity
    assert jsonhash(dct) != jsonhash(dct_sorted_z)
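
The test pins down two properties of jsonhash: dict key order must not affect the digest, but list element order must. Hashing a canonical JSON dump with sorted keys behaves exactly this way; the sketch below is a hypothetical illustration of that idea, not metriqued's actual implementation, and it is not expected to reproduce the digest constants asserted above:

import hashlib
import json

def jsonhash_sketch(obj):
    # sort_keys=True canonicalizes dict key order, so {'a': 1, 'b': 2}
    # and {'b': 2, 'a': 1} serialize (and hash) identically; list order
    # is preserved in the dump, so reordering a list changes the digest
    dump = json.dumps(obj, sort_keys=True)
    return hashlib.sha1(dump.encode('utf8')).hexdigest()

assert jsonhash_sketch({'a': 1, 'b': 2}) == jsonhash_sketch({'b': 2, 'a': 1})
assert jsonhash_sketch({'z': [1, 2]}) != jsonhash_sketch({'z': [2, 1]})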