def _get_string(data, position, obj_end, dummy):
    """Decode a BSON string element into a Python unicode string.

    The element starts with a 4-byte length (read through ``_UNPACK_INT``)
    that counts the payload plus its trailing NUL byte.

    :Parameters:
      - `data`: buffer holding the encoded element
      - `position`: offset of the 4-byte length prefix
      - `obj_end`: offset the string must not run past
      - `dummy`: unused

    Returns a ``(text, next_position)`` tuple.  Raises
    :exc:`InvalidBSON` when the length is smaller than 1, does not fit
    before `obj_end`, or the last counted byte is not NUL.
    """
    size = _UNPACK_INT(data[position:position + 4])[0]
    start = position + 4
    if size < 1 or obj_end - start < size:
        raise InvalidBSON("invalid string length")
    # The final counted byte is the terminator, not part of the text.
    terminator = start + size - 1
    if data[terminator:terminator + 1] != b"\x00":
        raise InvalidBSON("invalid end of string")
    text = _utf_8_decode(data[start:terminator], None, True)[0]
    return text, terminator + 1
def _bson_to_dict(data, as_class, tz_aware, uuid_subtype):
    """Decode the single BSON document stored in `data`.

    :Parameters:
      - `data`: the encoded document; its length must equal the size
        recorded in its first four bytes
      - `as_class`, `tz_aware`, `uuid_subtype`: passed through unchanged to
        ``_elements_to_dict``

    Returns a ``(document, remainder)`` tuple where `remainder` is
    ``data[obj_size:]`` (empty, because the size must match exactly).

    Raises :exc:`InvalidBSON` if the size prefix cannot be read, the
    recorded size disagrees with ``len(data)``, or the final byte is not
    the end-of-object marker.
    """
    try:
        obj_size = struct.unpack("<i", data[:4])[0]
    except struct.error as exc:
        # Fewer than four bytes available: report malformed input with the
        # same exception type as every other validation failure here.
        raise InvalidBSON(str(exc))
    length = len(data)
    if length < obj_size:
        raise InvalidBSON("objsize too large")
    if obj_size != length or data[obj_size - 1:obj_size] != ZERO:
        raise InvalidBSON("bad eoo")
    # Strip the 4-byte size prefix and the trailing end-of-object byte.
    elements = data[4:obj_size - 1]
    return (_elements_to_dict(elements, as_class, tz_aware, uuid_subtype),
            data[obj_size:])
def decode_all(data, codec_options=DEFAULT_CODEC_OPTIONS):
    """Decode a buffer of concatenated BSON documents into a list.

    `data` must be a string of concatenated, valid, BSON-encoded
    documents.

    :Parameters:
      - `data`: BSON data
      - `codec_options` (optional): An instance of
        :class:`~bson.codec_options.CodecOptions`.

    Returns the list of decoded documents, in buffer order.  Raises
    ``_CODEC_OPTIONS_TYPE_ERROR`` when `codec_options` is not a
    ``CodecOptions`` instance; any other failure while decoding surfaces
    as :exc:`InvalidBSON`.

    .. versionchanged:: 3.0
       Removed `compile_re` option: PyMongo now always represents BSON
       regular expressions as :class:`~bson.regex.Regex` objects. Use
       :meth:`~bson.regex.Regex.try_compile` to attempt to convert from a
       BSON regular expression to a Python regular expression object.

       Replaced `as_class`, `tz_aware`, and `uuid_subtype` options with
       `codec_options`.

    .. versionchanged:: 2.7
       Added `compile_re` option. If set to False, PyMongo represented BSON
       regular expressions as :class:`~bson.regex.Regex` objects instead of
       attempting to compile BSON regular expressions as Python native
       regular expressions, thus preventing errors for some incompatible
       patterns, see `PYTHON-500`_.

    .. _PYTHON-500: https://jira.mongodb.org/browse/PYTHON-500
    """
    if not isinstance(codec_options, CodecOptions):
        raise _CODEC_OPTIONS_TYPE_ERROR

    documents = []
    offset = 0
    last_index = len(data) - 1
    try:
        while offset < last_index:
            size = _UNPACK_INT(data[offset:offset + 4])[0]
            if len(data) - offset < size:
                raise InvalidBSON("invalid object size")
            next_offset = offset + size
            eoo = next_offset - 1
            if data[eoo:next_offset] != b"\x00":
                raise InvalidBSON("bad eoo")
            documents.append(
                _elements_to_dict(data, offset + 4, eoo, codec_options))
            offset = next_offset
        return documents
    except InvalidBSON:
        raise
    except Exception:
        # Surface every other failure as InvalidBSON, keeping the traceback.
        _, exc_value, exc_tb = sys.exc_info()
        reraise(InvalidBSON, exc_value, exc_tb)
def _get_string(data, view, position, obj_end, opts, dummy):
    """Decode a BSON string element into a Python unicode string.

    The 4-byte length read at `position` counts the payload plus its
    trailing NUL byte.  The text bytes are sliced from `view` and decoded
    with ``opts.unicode_decode_error_handler``.

    Returns a ``(text, next_position)`` tuple.  Raises
    :exc:`InvalidBSON` when the length is smaller than 1, does not fit
    before `obj_end`, or the last counted byte is not zero.
    """
    size = _UNPACK_INT_FROM(data, position)[0]
    start = position + 4
    if size < 1 or obj_end - start < size:
        raise InvalidBSON("invalid string length")
    terminator = start + size - 1
    if data[terminator] != 0:
        raise InvalidBSON("invalid end of string")
    decoded = _utf_8_decode(
        view[start:terminator], opts.unicode_decode_error_handler, True)
    return decoded[0], terminator + 1
def decode_all(data, as_class=dict, tz_aware=True,
               uuid_subtype=OLD_UUID_SUBTYPE, compile_re=True,
               codec_options=None):
    """Decode BSON data to multiple documents.

    `data` must be a string of concatenated, valid, BSON-encoded
    documents.

    :Parameters:
      - `data`: BSON data
      - `as_class` (optional): the class to use for the resulting
        documents
      - `tz_aware` (optional): if ``True``, return timezone-aware
        :class:`~datetime.datetime` instances
      - `uuid_subtype` (optional): The BSON representation to use for
        UUIDs. See the :mod:`bson.binary` module for all options.
      - `compile_re` (optional): if ``False``, don't attempt to compile
        BSON regular expressions into Python regular expressions. Return
        instances of :class:`~bson.regex.Regex` instead. Can avoid
        :exc:`~bson.errors.InvalidBSON` errors when receiving
        Python-incompatible regular expressions, for example from
        ``currentOp``
      - `codec_options` (optional): when given, must be a ``CodecOptions``
        instance; its ``document_class``, ``tz_aware`` and
        ``uuid_representation`` override `as_class`, `tz_aware` and
        `uuid_subtype`.

    Returns the list of decoded documents.  Raises
    :exc:`InvalidBSON` for malformed input; any other exception raised
    while decoding is converted to :exc:`InvalidBSON` with the original
    traceback attached.

    .. versionchanged:: 2.7
       Added `compile_re` option.
    .. versionadded:: 1.9
    """
    if codec_options is not None:
        if not isinstance(codec_options, CodecOptions):
            raise _CODEC_OPTIONS_TYPE_ERROR
        as_class = codec_options.document_class
        tz_aware = codec_options.tz_aware
        uuid_subtype = codec_options.uuid_representation

    docs = []
    position = 0
    end = len(data) - 1
    try:
        while position < end:
            obj_size = struct.unpack("<i", data[position:position + 4])[0]
            if obj_size < 5:
                # A document needs at least its 4-byte size and the EOO
                # byte.  Without this guard a zero or negative size never
                # advances `position` and the loop would not terminate.
                raise InvalidBSON("objsize too small")
            if len(data) - position < obj_size:
                raise InvalidBSON("objsize too large")
            if data[position + obj_size - 1:position + obj_size] != ZERO:
                raise InvalidBSON("bad eoo")
            elements = data[position + 4:position + obj_size - 1]
            position += obj_size
            docs.append(_elements_to_dict(elements, as_class, tz_aware,
                                          uuid_subtype, compile_re))
        return docs
    except InvalidBSON:
        raise
    except Exception:
        # Change exception type to InvalidBSON but preserve traceback.
        _, exc_value, exc_tb = sys.exc_info()
        raise InvalidBSON(str(exc_value)).with_traceback(exc_tb)
def _get_string(
    data: Any, view: Any, position: int, obj_end: int, opts: CodecOptions, dummy: Any
) -> Tuple[str, int]:
    """Decode a BSON string element into a Python ``str``.

    The 4-byte length read at `position` counts the payload plus its
    trailing NUL byte.  The text bytes are sliced from `view` and decoded
    with ``opts.unicode_decode_error_handler``.

    Returns ``(text, next_position)``.  Raises ``InvalidBSON`` when the
    length is smaller than 1, does not fit before `obj_end`, or the last
    counted byte is not zero.
    """
    size = _UNPACK_INT_FROM(data, position)[0]
    start = position + 4
    if size < 1 or obj_end - start < size:
        raise InvalidBSON("invalid string length")
    terminator = start + size - 1
    if data[terminator] != 0:
        raise InvalidBSON("invalid end of string")
    decoded = _utf_8_decode(
        view[start:terminator], opts.unicode_decode_error_handler, True
    )
    return decoded[0], terminator + 1
def _get_object(data, position, obj_end, opts):
    """Decode a BSON subdocument to opts.document_class or bson.dbref.DBRef.

    Reads the 4-byte size at `position`, checks the end-of-object byte and
    that the subdocument ends before `obj_end`, then decodes the elements
    with ``_elements_to_dict``.  A decoded document containing a ``$ref``
    key is wrapped in a ``DBRef`` built from its ``$ref``, ``$id`` and
    ``$db`` entries, with the remaining keys passed along.

    Returns ``(value, next_position)``.  Raises :exc:`InvalidBSON` on a
    missing end-of-object byte or an over-long subdocument.
    """
    size = _UNPACK_INT(data[position:position + 4])[0]
    next_position = position + size
    eoo = next_position - 1
    if data[eoo:next_position] != b"\x00":
        raise InvalidBSON("bad eoo")
    if eoo >= obj_end:
        raise InvalidBSON("invalid object length")

    document = _elements_to_dict(data, position + 4, eoo, opts,
                                 subdocument=True)
    if "$ref" not in document:
        return document, next_position

    # Pop the reference fields in order; whatever is left travels with
    # the DBRef as its final argument.
    ref = document.pop("$ref")
    ref_id = document.pop("$id", None)
    ref_db = document.pop("$db", None)
    return (DBRef(ref, ref_id, ref_db, document), next_position)
def _get_string(data, position, obj_end, opts, dummy):
    """Decode a BSON string element.

    The 4-byte length read at `position` counts the payload plus its
    trailing NUL byte.  With ``opts.use_unicode`` set the payload is
    decoded with ``opts.unicode_decode_error_handler``; otherwise the raw
    payload bytes (without the terminator) are returned undecoded.

    Returns a ``(value, next_position)`` tuple in both modes, where
    `next_position` is the offset just past the terminator.  Raises
    :exc:`InvalidBSON` when the length is smaller than 1, does not fit
    before `obj_end`, or the last counted byte is not NUL.
    """
    length = _UNPACK_INT(data[position:position + 4])[0]
    position += 4
    if length < 1 or obj_end - position < length:
        raise InvalidBSON("invalid string length")
    end = position + length - 1
    # Validate the terminator in both modes so malformed input is rejected
    # consistently.
    if data[end:end + 1] != b"\x00":
        raise InvalidBSON("invalid end of string")
    if opts.use_unicode:
        return _utf_8_decode(data[position:end],
                             opts.unicode_decode_error_handler,
                             True)[0], end + 1
    # Raw mode: hand back only this element's bytes and the real offset of
    # the next element, rather than the remainder of the whole buffer.
    return data[position:end], end + 1
def _get_object_size(data, position, obj_end):
    """Validate and return a BSON document's size.

    Returns ``(size, eoo_index)`` where `eoo_index` is the offset of the
    document's final byte.  Raises :exc:`InvalidBSON` if the size prefix
    cannot be unpacked, the final byte is not zero, the document reaches
    `obj_end` or beyond, or -- for a document starting at offset 0 -- the
    size differs from `obj_end`.
    """
    try:
        size = _UNPACK_INT_FROM(data, position)[0]
    except struct.error as err:
        raise InvalidBSON(str(err))

    eoo_index = position + size - 1
    if data[eoo_index] != 0:
        raise InvalidBSON("bad eoo")

    # A document at offset 0 is the top-level one, so its size must also
    # match the total exactly.
    is_top_level = position == 0
    if eoo_index >= obj_end or (is_top_level and size != obj_end):
        raise InvalidBSON("invalid object length")
    return size, eoo_index
def _bson_to_dict(data, as_class, tz_aware, uuid_subtype, compile_re):
    """Decode the single BSON document stored in `data`.

    The decoding options are forwarded unchanged to
    ``_elements_to_dict``.  Returns ``(document, remainder)`` where
    `remainder` is the slice of `data` past the recorded size.

    Raises :exc:`InvalidBSON` if the size prefix cannot be unpacked, the
    recorded size disagrees with ``len(data)``, or the final byte is not
    the end-of-object marker.
    """
    try:
        (declared_size,) = struct.unpack("<i", data[:4])
    except struct.error as err:
        raise InvalidBSON(str(err))

    available = len(data)
    if declared_size > available:
        raise InvalidBSON("objsize too large")
    if not (declared_size == available
            and data[declared_size - 1:declared_size] == ZERO):
        raise InvalidBSON("bad eoo")

    # Everything between the size prefix and the end-of-object byte.
    body = data[4:declared_size - 1]
    document = _elements_to_dict(
        body, as_class, tz_aware, uuid_subtype, compile_re)
    return document, data[declared_size:]
def _get_binary(data, position, as_class, tz_aware, uuid_subtype, compile_re):
    """Decode a BSON binary element.

    Reads the length and the one-byte subtype, then returns
    ``(value, next_position)``:

    - subtype 2 carries a second, inner length that must equal the outer
      length minus 4, otherwise :exc:`InvalidBSON` is raised;
    - subtypes 3 and 4 become :class:`uuid.UUID` when ``_use_uuid`` is
      true, with the byte layout chosen by `uuid_subtype`;
    - subtype 0 is returned as a plain slice of `data` when ``PY3`` is
      true;
    - everything else is wrapped in ``Binary``.
    """
    length, position = _get_int(data, position)
    subtype = ord(data[position:position + 1])
    position += 1

    if subtype == 2:
        inner_length, position = _get_int(data, position)
        if inner_length != length - 4:
            raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
        length = inner_length

    payload_end = position + length
    payload = data[position:payload_end]

    if subtype in (3, 4) and _use_uuid:
        if uuid_subtype == JAVA_LEGACY:
            # Java legacy: each 8-byte half is stored reversed.
            value = uuid.UUID(bytes=payload[0:8][::-1] + payload[8:16][::-1])
        elif uuid_subtype == CSHARP_LEGACY:
            # C# legacy: little-endian field layout.
            value = uuid.UUID(bytes_le=payload)
        else:
            # Python layout.
            value = uuid.UUID(bytes=payload)
        return (value, payload_end)

    # Python3 special case. Decode subtype 0 to 'bytes'.
    if PY3 and subtype == 0:
        return payload, payload_end
    return Binary(payload, subtype), payload_end
def _get_array(
        data, position, name, as_class, tz_aware, uuid_subtype, compile_re):
    """Decode a BSON array element into a Python list.

    Reads the array's 4-byte size at `position`, verifies the
    end-of-object byte, then decodes each element with the matching
    entry of ``_element_getter``.  Element keys are skipped rather than
    decoded.

    Returns ``(items, next_position)``.  Raises :exc:`InvalidBSON` when
    the end-of-object byte is missing; an element type with no getter is
    reported through ``_raise_unknown_type``.
    """
    size = struct.unpack("<i", data[position:position + 4])[0]
    eoo = position + size - 1
    if data[eoo:eoo + 1] != ZERO:
        raise InvalidBSON("bad eoo")

    cursor = position + 4
    stop = eoo - 1
    items = []

    # Bind globals and attributes to locals once, outside the loop.
    push = items.append
    find = data.index
    decoders = _element_getter

    while cursor < stop:
        type_byte = data[cursor:cursor + 1]
        # Jump past the NUL-terminated key without decoding it.
        cursor = find(ZERO, cursor) + 1
        try:
            value, cursor = decoders[type_byte](
                data, cursor, name, as_class, tz_aware, uuid_subtype,
                compile_re)
        except KeyError:
            _raise_unknown_type(type_byte, name)
        push(value)
    return items, cursor + 1
def decode_file_iter(file_obj, codec_options=DEFAULT_CODEC_OPTIONS):
    """Lazily decode BSON documents read from a file object.

    Works similarly to the decode_all function, but reads from the file
    object in chunks and parses bson in chunks, yielding one document at a
    time.

    :Parameters:
      - `file_obj`: A file object containing BSON data.
      - `codec_options` (optional): An instance of
        :class:`~bson.codec_options.CodecOptions`.

    Raises :exc:`InvalidBSON` if the file ends part-way through a size
    prefix; each document's bytes are validated by ``_bson_to_dict``.

    .. versionchanged:: 3.0
       Replaced `as_class`, `tz_aware`, and `uuid_subtype` options with
       `codec_options`.

    .. versionadded:: 2.8
    """
    while True:
        header = file_obj.read(4)
        if not header:
            # Clean end of file.
            return
        if len(header) != 4:
            raise InvalidBSON("cut off in middle of objsize")
        # The recorded size includes the four header bytes already read.
        remaining = _UNPACK_INT_FROM(header, 0)[0] - 4
        document_bytes = header + file_obj.read(max(0, remaining))
        yield _bson_to_dict(document_bytes, codec_options)
def _get_binary(data, position, dummy0, opts, dummy1):
    """Decode a BSON binary to bson.binary.Binary or python UUID.

    Unpacks the length and subtype from the five bytes at `position` and
    returns ``(value, next_position)``:

    - subtype 2 carries a second, inner length that must equal the outer
      length minus 4, otherwise :exc:`InvalidBSON` is raised;
    - subtypes 3 and 4 become :class:`uuid.UUID`, with the byte layout
      chosen by ``opts.uuid_representation``;
    - subtype 0 is returned as a plain slice of `data` when ``PY3`` is
      true;
    - everything else is wrapped in ``Binary``.
    """
    length, subtype = _UNPACK_LENGTH_SUBTYPE(data[position:position + 5])
    position += 5

    if subtype == 2:
        inner_length = _UNPACK_INT(data[position:position + 4])[0]
        position += 4
        if inner_length != length - 4:
            raise InvalidBSON("invalid binary (st 2) - lengths don't match!")
        length = inner_length

    end = position + length

    if subtype in (3, 4):
        representation = opts.uuid_representation
        payload = data[position:end]
        if representation == JAVA_LEGACY:
            # Java legacy: each 8-byte half is stored reversed.
            value = uuid.UUID(bytes=payload[0:8][::-1] + payload[8:16][::-1])
        elif representation == CSHARP_LEGACY:
            # C# legacy: little-endian field layout.
            value = uuid.UUID(bytes_le=payload)
        else:
            # Python layout.
            value = uuid.UUID(bytes=payload)
        return value, end

    # Python3 special case. Decode subtype 0 to 'bytes'.
    if PY3 and subtype == 0:
        return data[position:end], end
    return Binary(data[position:end], subtype), end
def _get_array(data, position, obj_end, opts, element_name):
    """Decode a BSON array to python list.

    Reads the array's 4-byte size at `position`, verifies the
    end-of-object byte, then decodes each element with the matching
    entry of ``_ELEMENT_GETTER``.  Element keys are skipped rather than
    decoded.

    Returns ``(items, next_position)``.  Raises :exc:`InvalidBSON` when
    the end-of-object byte is missing; an element type with no getter is
    reported through ``_raise_unknown_type``.
    """
    size = _UNPACK_INT(data[position:position + 4])[0]
    eoo = position + size - 1
    if data[eoo:eoo + 1] != b"\x00":
        raise InvalidBSON("bad eoo")

    cursor = position + 4
    stop = eoo - 1
    items = []

    # Bind globals and attributes to locals once, outside the loop.
    push = items.append
    find = data.index
    decoders = _ELEMENT_GETTER

    while cursor < stop:
        type_byte = data[cursor:cursor + 1]
        # Jump past the NUL-terminated key without decoding it.
        cursor = find(b'\x00', cursor) + 1
        try:
            value, cursor = decoders[type_byte](
                data, cursor, obj_end, opts, element_name)
        except KeyError:
            _raise_unknown_type(type_byte, element_name)
        push(value)
    return items, cursor + 1
def _get_c_string(data, length=None): if length is None: try: length = data.index("\x00") except ValueError: raise InvalidBSON() return (unicode(data[:length], "utf-8"), data[length + 1:])
def _get_int(data, position, as_class=None, tz_aware=False, unsigned=False): format = unsigned and "I" or "i" try: value = struct.unpack("<%s" % format, data[position:position + 4])[0] except struct.error: raise InvalidBSON() position += 4 return value, position
def _get_int(data, as_class=None, tz_aware=False, unsigned=False): format = unsigned and "I" or "i" try: value = struct.unpack("<%s" % format, data[:4])[0] except struct.error: raise InvalidBSON() return (value, data[4:])
def _bson_to_dict(data, opts):
    """Decode a BSON string to document_class.

    `data` must hold exactly one document: the size in its first four
    bytes has to equal ``len(data)`` and its last byte has to be NUL.

    Raises :exc:`InvalidBSON` when the size prefix cannot be unpacked or
    either check fails; any other exception raised while decoding the
    elements is re-raised as :exc:`InvalidBSON` through ``reraise``.
    """
    try:
        declared_size = _UNPACK_INT(data[:4])[0]
    except struct.error as err:
        raise InvalidBSON(str(err))

    if declared_size != len(data):
        raise InvalidBSON("invalid object size")
    eoo = declared_size - 1
    if data[eoo:declared_size] != b"\x00":
        raise InvalidBSON("bad eoo")

    try:
        return _elements_to_dict(data, 4, eoo, opts)
    except InvalidBSON:
        raise
    except Exception:
        # Surface every other failure as InvalidBSON, keeping the traceback.
        _, exc_value, exc_tb = sys.exc_info()
        reraise(InvalidBSON, exc_value, exc_tb)
def _get_boolean(data, view, position, dummy0, dummy1, dummy2): """Decode a BSON true/false to python True/False.""" end = position + 1 boolean_byte = data[position:end] if boolean_byte == b'\x00': return False, end elif boolean_byte == b'\x01': return True, end raise InvalidBSON('invalid boolean value: %r' % boolean_byte)
def _elements_to_dict(data, position, obj_end, opts):
    """Decode a BSON document.

    Builds a fresh ``opts.document_class()`` and fills it with every
    ``(key, value)`` produced by ``_iterate_elements``.

    Raises :exc:`InvalidBSON` unless the position reported after the last
    element (or `position` itself when there are no elements) equals
    `obj_end`.
    """
    document = opts.document_class()
    cursor = position
    elements = _iterate_elements(data, position, obj_end, opts)
    for key, value, cursor in elements:
        document[key] = value
    if cursor != obj_end:
        raise InvalidBSON('bad object or element length')
    return document
def _get_code_w_scope(data, position, obj_end, opts, element_name):
    """Decode a BSON code_w_scope to bson.code.Code.

    The 4-byte total size at `position` fixes where the element ends.
    Inside that span a string (the code) is decoded first, followed by a
    document (the scope).

    Returns ``(Code(code, scope), next_position)``.  Raises
    :exc:`InvalidBSON` if the scope does not end exactly at the boundary
    given by the total size.
    """
    total_size = _UNPACK_INT(data[position:position + 4])[0]
    boundary = position + total_size
    source, cursor = _get_string(
        data, position + 4, boundary, opts, element_name)
    scope, cursor = _get_object(data, cursor, boundary, opts, element_name)
    if cursor != boundary:
        raise InvalidBSON('scope outside of javascript code boundaries')
    return Code(source, scope), cursor
def _get_int(data, position, as_class=None, tz_aware=False,
             uuid_subtype=OLD_UUID_SUBTYPE, compile_re=True, unsigned=False):
    """Read a little-endian 32-bit integer from `data` at `position`.

    :Parameters:
      - `data`: buffer to read from
      - `position`: offset of the first of the four bytes
      - `as_class`, `tz_aware`, `uuid_subtype`, `compile_re`: unused
      - `unsigned` (optional): interpret the bytes as unsigned when true

    Returns ``(value, next_position)``.  Raises :exc:`InvalidBSON` when
    four bytes are not available at `position`.
    """
    # BSON integers are little-endian.  The explicit "<" prefix selects
    # little-endian, standard-size decoding; without it ``struct`` falls
    # back to the host's native byte order and alignment.
    fmt = "<I" if unsigned else "<i"
    try:
        value = struct.unpack(fmt, data[position:position + 4])[0]
    except struct.error:
        raise InvalidBSON()
    position += 4
    return value, position
def _get_code_w_scope(
    data: Any, view: Any, position: int, obj_end: int, opts: CodecOptions, element_name: str
) -> Tuple[Code, int]:
    """Decode a BSON code_w_scope to bson.code.Code.

    The 4-byte total size at `position` fixes where the element ends.
    Inside that span a string (the code) is decoded first, followed by a
    document (the scope).

    Returns ``(Code(code, scope), next_position)``.  Raises
    ``InvalidBSON`` if the scope does not end exactly at the boundary
    given by the total size.
    """
    total_size = _UNPACK_INT_FROM(data, position)[0]
    boundary = position + total_size
    source, cursor = _get_string(
        data, view, position + 4, boundary, opts, element_name
    )
    scope, cursor = _get_object(data, view, cursor, boundary, opts, element_name)
    if cursor != boundary:
        raise InvalidBSON("scope outside of javascript code boundaries")
    return Code(source, scope), cursor
def decode_file_iter(file_obj, as_class=dict, tz_aware=True,
                     uuid_subtype=OLD_UUID_SUBTYPE):
    """Decode bson data from a file to multiple documents as a generator.

    Works similarly to the decode_all function, but reads from the file
    object in chunks and parses bson in chunks, yielding one document at a
    time.

    :Parameters:
      - `file_obj`: A file object containing BSON data.
      - `as_class` (optional): the class to use for the resulting
        documents
      - `tz_aware` (optional): if ``True``, return timezone-aware
        :class:`~datetime.datetime` instances
      - `uuid_subtype` (optional): passed through to
        ``_elements_to_dict``

    Raises :exc:`InvalidBSON` when the file ends inside a size prefix or
    a document body, when a recorded size is below the 5-byte minimum, or
    when a document does not end with the end-of-object byte.

    .. versionadded:: 2.5
    """
    while True:
        # Read size of next object.
        size_data = file_obj.read(4)
        if len(size_data) == 0:
            break  # Finished with file normally.
        elif len(size_data) != 4:
            raise InvalidBSON("cut off in middle of objsize")
        obj_size = struct.unpack("<i", size_data)[0]
        if obj_size < 5:
            # The obj_size should at least be big enough to encode the
            # obj_size and EOO itself, even on a zero-sized elements.
            raise InvalidBSON("objsize too small")
        # Actual data for elements is total size - size_prefix - suffix,
        # but we read the suffix together with the element to reduce the
        # number of reads.
        elements_size = obj_size - 4
        elements = file_obj.read(elements_size)
        if len(elements) != elements_size:
            raise InvalidBSON("objsize too large")
        # Compare a one-byte slice, not an index: indexing bytes on
        # Python 3 yields an int, which never equals ZERO.
        if elements[-1:] != ZERO:
            raise InvalidBSON("bad eoo")
        yield _elements_to_dict(elements[:-1], as_class, tz_aware,
                                uuid_subtype)
def decode_all(data, as_class=dict, tz_aware=True,
               uuid_subtype=OLD_UUID_SUBTYPE, compile_re=True):
    """Decode BSON data to multiple documents.

    `data` must be a string of concatenated, valid, BSON-encoded
    documents.

    :Parameters:
      - `data`: BSON data
      - `as_class` (optional): the class to use for the resulting
        documents
      - `tz_aware` (optional): if ``True``, return timezone-aware
        :class:`~datetime.datetime` instances
      - `uuid_subtype` (optional): passed through to
        ``_elements_to_dict``
      - `compile_re` (optional): if ``False``, don't attempt to compile
        BSON regular expressions into Python regular expressions. Return
        instances of :class:`~bson.regex.Regex` instead. Can avoid
        :exc:`~bson.errors.InvalidBSON` errors when receiving
        Python-incompatible regular expressions, for example from
        ``currentOp``

    Returns the list of decoded documents.  Raises
    :exc:`InvalidBSON` for malformed input; any other exception raised
    while decoding is re-raised as :exc:`InvalidBSON` through
    ``reraise``.

    .. versionchanged:: 2.7
       Added `compile_re` option.
    .. versionadded:: 1.9
    """
    docs = []
    position = 0
    end = len(data) - 1
    try:
        while position < end:
            # "<i": BSON sizes are little-endian regardless of the host's
            # native byte order.
            obj_size = struct.unpack("<i", data[position:position + 4])[0]
            if obj_size < 5:
                # A document needs at least its 4-byte size and the EOO
                # byte.  Without this guard a zero or negative size never
                # advances `position` and the loop would not terminate.
                raise InvalidBSON("objsize too small")
            if len(data) - position < obj_size:
                raise InvalidBSON("objsize too large")
            if data[position + obj_size - 1:position + obj_size] != ZERO:
                raise InvalidBSON("bad eoo")
            elements = data[position + 4:position + obj_size - 1]
            position += obj_size
            docs.append(_elements_to_dict(elements, as_class, tz_aware,
                                          uuid_subtype, compile_re))
        return docs
    except InvalidBSON:
        raise
    except Exception as e:
        # Change exception type to InvalidBSON but preserve traceback.
        reraise(InvalidBSON, InvalidBSON(e), sys.exc_info()[2])
def _get_object(data, position, as_class, tz_aware, uuid_subtype):
    """Decode a BSON subdocument starting at `position`.

    Reads the 4-byte size, checks the end-of-object byte, and decodes the
    bytes in between with ``_elements_to_dict``.  A decoded document
    containing a ``$ref`` key is wrapped in a ``DBRef`` built from its
    ``$ref``, ``$id`` and ``$db`` entries, with the remaining keys passed
    along.

    Returns ``(value, next_position)``.  Raises :exc:`InvalidBSON` when
    the end-of-object byte is missing.
    """
    size = struct.unpack("<i", data[position:position + 4])[0]
    next_position = position + size
    if data[next_position - 1:next_position] != ZERO:
        raise InvalidBSON("bad eoo")

    # Everything between the size prefix and the end-of-object byte.
    body = data[position + 4:next_position - 1]
    document = _elements_to_dict(body, as_class, tz_aware, uuid_subtype)
    if "$ref" not in document:
        return document, next_position

    # Pop the reference fields in order; whatever is left travels with
    # the DBRef as its final argument.
    ref = document.pop("$ref")
    ref_id = document.pop("$id", None)
    ref_db = document.pop("$db", None)
    return (DBRef(ref, ref_id, ref_db, document), next_position)
def _elements_to_dict(data, view, position, obj_end, opts, result=None):
    """Decode a BSON document into result.

    When `result` is ``None`` a fresh ``opts.document_class()`` is
    created; otherwise the supplied mapping is filled in place.  Elements
    are decoded one at a time with ``_element_to_dict`` while the current
    position is below ``obj_end - 1``.

    Returns the populated mapping.  Raises :exc:`InvalidBSON` unless
    decoding stops exactly at `obj_end`.
    """
    document = opts.document_class() if result is None else result
    limit = obj_end - 1
    cursor = position
    while cursor < limit:
        key, value, cursor = _element_to_dict(
            data, view, cursor, obj_end, opts)
        document[key] = value
    if cursor != obj_end:
        raise InvalidBSON('bad object or element length')
    return document
def _get_boolean( data: Any, view: Any, position: int, dummy0: Any, dummy1: Any, dummy2: Any ) -> Tuple[bool, int]: """Decode a BSON true/false to python True/False.""" end = position + 1 boolean_byte = data[position:end] if boolean_byte == b"\x00": return False, end elif boolean_byte == b"\x01": return True, end raise InvalidBSON("invalid boolean value: %r" % boolean_byte)
def _get_c_string(data, position, length=None): if length is None: try: end = data.index(ZERO, position) except ValueError: raise InvalidBSON() else: end = position + length value = data[position:end].decode("utf-8") position = end + 1 return value, position