def _do_batched_insert(collection_name, docs, check_keys, safe, last_error_args, continue_on_error, opts, ctx): """Insert `docs` using multiple batches. """ def _insert_message(insert_message, send_safe): """Build the insert message with header and GLE. """ request_id, final_message = __pack_message(2002, insert_message) if send_safe: request_id, error_message, _ = __last_error( collection_name, last_error_args) final_message += error_message return request_id, final_message send_safe = safe or not continue_on_error last_error = None data = StringIO() data.write(struct.pack("<i", int(continue_on_error))) data.write(bson._make_c_string(collection_name)) message_length = begin_loc = data.tell() has_docs = False to_send = [] for doc in docs: encoded = bson.BSON.encode(doc, check_keys, opts) encoded_length = len(encoded) too_large = (encoded_length > ctx.max_bson_size) message_length += encoded_length if message_length < ctx.max_message_size and not too_large: data.write(encoded) to_send.append(doc) has_docs = True continue if has_docs: # We have enough data, send this message. try: request_id, msg = _insert_message(data.getvalue(), send_safe) ctx.legacy_write(request_id, msg, 0, send_safe, to_send) # Exception type could be OperationFailure or a subtype # (e.g. DuplicateKeyError) except OperationFailure as exc: # Like it says, continue on error... if continue_on_error: # Store exception details to re-raise after the final batch. last_error = exc # With unacknowledged writes just return at the first error. elif not safe: return # With acknowledged writes raise immediately. else: raise if too_large: _raise_document_too_large("insert", encoded_length, ctx.max_bson_size) message_length = begin_loc + encoded_length data.seek(begin_loc) data.truncate() data.write(encoded) to_send = [doc] if not has_docs: raise InvalidOperation("cannot do an empty bulk insert") request_id, msg = _insert_message(data.getvalue(), safe) ctx.legacy_write(request_id, msg, 0, safe, to_send) # Re-raise any exception stored due to continue_on_error if last_error is not None: raise last_error
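# A standalone sketch of the size-driven batching rule used above, with the
# wire-protocol details stripped away. Nothing here is PyMongo API: the limits
# and the batch_encoded_docs name are made up for illustration, and the inputs
# are assumed to be already-encoded BSON byte strings.
MAX_BSON_SIZE = 16 * 1024 * 1024      # stand-in for ctx.max_bson_size
MAX_MESSAGE_SIZE = 48 * 1000 * 1000   # stand-in for ctx.max_message_size


def batch_encoded_docs(encoded_docs, header_size,
                       max_bson_size=MAX_BSON_SIZE,
                       max_message_size=MAX_MESSAGE_SIZE):
    """Group encoded documents into message-sized batches.

    Same idea as the loop above: reject any single document larger than
    max_bson_size, and start a new batch whenever the running message length
    would reach max_message_size.
    """
    batches = []
    current = []
    message_length = header_size
    for encoded in encoded_docs:
        encoded_length = len(encoded)
        if encoded_length > max_bson_size:
            raise ValueError("document too large: %d bytes" % encoded_length)
        message_length += encoded_length
        if message_length < max_message_size:
            current.append(encoded)
            continue
        # The pending batch is full: emit it and start over with this doc.
        if current:
            batches.append(current)
        current = [encoded]
        message_length = header_size + encoded_length
    if not current:
        raise ValueError("cannot do an empty bulk insert")
    batches.append(current)
    return batches


# Three small "documents" against a tiny message limit split into two batches.
print([len(batch) for batch in
       batch_encoded_docs([b"x" * 10, b"y" * 20, b"z" * 30],
                          header_size=16, max_message_size=64)])  # [2, 1]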
class GridIn(object): """Class to write data to GridFS. """ def __init__(self, root_collection, **kwargs): """Write a file to GridFS Application developers should generally not need to instantiate this class directly - instead see the methods provided by :class:`~gridfs.GridFS`. Raises :class:`TypeError` if `root_collection` is not an instance of :class:`~pymongo.collection.Collection`. Any of the file level options specified in the `GridFS Spec <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as keyword arguments. Any additional keyword arguments will be set as additional fields on the file document. Valid keyword arguments include: - ``"_id"``: unique ID for this file (default: :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must not have already been used for another file - ``"filename"``: human name for the file - ``"contentType"`` or ``"content_type"``: valid mime-type for the file - ``"chunkSize"`` or ``"chunk_size"``: size of each of the chunks, in bytes (default: 256 kb) - ``"encoding"``: encoding used for this file. In Python 2, any :class:`unicode` that is written to the file will be converted to a :class:`str`. In Python 3, any :class:`str` that is written to the file will be converted to :class:`bytes`. :Parameters: - `root_collection`: root collection to write to - `**kwargs` (optional): file level options (see above) """ if not isinstance(root_collection, Collection): raise TypeError("root_collection must be an " "instance of Collection") # Handle alternative naming if "content_type" in kwargs: kwargs["contentType"] = kwargs.pop("content_type") if "chunk_size" in kwargs: kwargs["chunkSize"] = kwargs.pop("chunk_size") # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) root_collection.chunks.ensure_index([("files_id", ASCENDING), ("n", ASCENDING)], unique=True) object.__setattr__(self, "_coll", root_collection) object.__setattr__(self, "_chunks", root_collection.chunks) object.__setattr__(self, "_file", kwargs) object.__setattr__(self, "_buffer", StringIO()) object.__setattr__(self, "_position", 0) object.__setattr__(self, "_chunk_number", 0) object.__setattr__(self, "_closed", False) @property def closed(self): """Is this file closed? """ return self._closed _id = _create_property("_id", "The ``'_id'`` value for this file.", read_only=True) filename = _create_property("filename", "Name of this file.") content_type = _create_property("contentType", "Mime-type for this file.") length = _create_property("length", "Length (in bytes) of this file.", closed_only=True) chunk_size = _create_property("chunkSize", "Chunk size for this file.", read_only=True) upload_date = _create_property("uploadDate", "Date that this file was uploaded.", closed_only=True) md5 = _create_property("md5", "MD5 of the contents of this file " "(generated on the server).", closed_only=True) def __getattr__(self, name): if name in self._file: return self._file[name] raise AttributeError("GridIn object has no attribute '%s'" % name) def __setattr__(self, name, value): object.__setattr__(self, name, value) if self._closed: self._coll.files.update({"_id": self._file["_id"]}, {"$set": {name: value}}, safe=True) def __flush_data(self, data): """Flush `data` to a chunk. 
""" if not data: return assert(len(data) <= self.chunk_size) chunk = {"files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data)} self._chunks.insert(chunk) self._chunk_number += 1 self._position += len(data) def __flush_buffer(self): """Flush the buffer contents out to a chunk. """ self.__flush_data(self._buffer.getvalue()) self._buffer.close() self._buffer = StringIO() def __flush(self): """Flush the file to the database. """ self.__flush_buffer() md5 = self._coll.database.command("filemd5", self._id, root=self._coll.name)["md5"] self._file["md5"] = md5 self._file["length"] = self._position self._file["uploadDate"] = datetime.datetime.utcnow() try: return self._coll.files.insert(self._file, safe=True) except DuplicateKeyError: raise FileExists("file with _id %r already exists" % self._id) def close(self): """Flush the file and close it. A closed file cannot be written any more. Calling :meth:`close` more than once is allowed. """ if not self._closed: self.__flush() object.__setattr__(self, "_closed", True) def write(self, data): """Write data to the file. There is no return value. `data` can be either a string of bytes or a file-like object (implementing :meth:`read`). If the file has an :attr:`encoding` attribute, `data` can also be a :class:`unicode` (:class:`str` in python 3) instance, which will be encoded as :attr:`encoding` before being written. Due to buffering, the data may not actually be written to the database until the :meth:`close` method is called. Raises :class:`ValueError` if this file is already closed. Raises :class:`TypeError` if `data` is not an instance of :class:`str` (:class:`bytes` in python 3), a file-like object, or an instance of :class:`unicode` (:class:`str` in python 3). Unicode data is only allowed if the file has an :attr:`encoding` attribute. :Parameters: - `data`: string of bytes or file-like object to be written to the file .. versionadded:: 1.9 The ability to write :class:`unicode`, if the file has an :attr:`encoding` attribute. """ if self._closed: raise ValueError("cannot write to a closed file") try: # file-like read = data.read except AttributeError: # string if not isinstance(data, string_types): raise TypeError("can only write strings or file-like objects") if isinstance(data, unicode): try: data = data.encode(self.encoding) except AttributeError: raise TypeError("must specify an encoding for file in " "order to write %s" % (text_type.__name__,)) read = StringIO(data).read if self._buffer.tell() > 0: # Make sure to flush only when _buffer is complete space = self.chunk_size - self._buffer.tell() if space: to_write = read(space) self._buffer.write(to_write) if len(to_write) < space: return # EOF or incomplete self.__flush_buffer() to_write = read(self.chunk_size) while to_write and len(to_write) == self.chunk_size: self.__flush_data(to_write) to_write = read(self.chunk_size) self._buffer.write(to_write) def writelines(self, sequence): """Write a sequence of strings to the file. Does not add seperators. """ for line in sequence: self.write(line) def __enter__(self): """Support for the context manager protocol. """ return self def __exit__(self, exc_type, exc_val, exc_tb): """Support for the context manager protocol. Close the file and allow exceptions to propagate. """ self.close() # propagate exceptions return False
class GridIn(object): """Class to write data to GridFS. """ def __init__(self, root_collection, session=None, disable_md5=False, **kwargs): """Write a file to GridFS Application developers should generally not need to instantiate this class directly - instead see the methods provided by :class:`~gridfs.GridFS`. Raises :class:`TypeError` if `root_collection` is not an instance of :class:`~pymongo.collection.Collection`. Any of the file level options specified in the `GridFS Spec <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as keyword arguments. Any additional keyword arguments will be set as additional fields on the file document. Valid keyword arguments include: - ``"_id"``: unique ID for this file (default: :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must not have already been used for another file - ``"filename"``: human name for the file - ``"contentType"`` or ``"content_type"``: valid mime-type for the file - ``"chunkSize"`` or ``"chunk_size"``: size of each of the chunks, in bytes (default: 255 kb) - ``"encoding"``: encoding used for this file. In Python 2, any :class:`unicode` that is written to the file will be converted to a :class:`str`. In Python 3, any :class:`str` that is written to the file will be converted to :class:`bytes`. :Parameters: - `root_collection`: root collection to write to - `session` (optional): a :class:`~pymongo.client_session.ClientSession` to use for all commands - `disable_md5` (optional): When True, an MD5 checksum will not be computed for the uploaded file. Useful in environments where MD5 cannot be used for regulatory or other reasons. Defaults to False. - `**kwargs` (optional): file level options (see above) .. versionchanged:: 3.6 Added ``session`` parameter. .. versionchanged:: 3.0 `root_collection` must use an acknowledged :attr:`~pymongo.collection.Collection.write_concern` """ if not isinstance(root_collection, Collection): raise TypeError("root_collection must be an " "instance of Collection") if not root_collection.write_concern.acknowledged: raise ConfigurationError('root_collection must use ' 'acknowledged write_concern') # Handle alternative naming if "content_type" in kwargs: kwargs["contentType"] = kwargs.pop("content_type") if "chunk_size" in kwargs: kwargs["chunkSize"] = kwargs.pop("chunk_size") coll = _clear_entity_type_registry( root_collection, read_preference=ReadPreference.PRIMARY) if not disable_md5: kwargs["md5"] = hashlib.md5() # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) object.__setattr__(self, "_session", session) object.__setattr__(self, "_coll", coll) object.__setattr__(self, "_chunks", coll.chunks) object.__setattr__(self, "_file", kwargs) object.__setattr__(self, "_buffer", StringIO()) object.__setattr__(self, "_position", 0) object.__setattr__(self, "_chunk_number", 0) object.__setattr__(self, "_closed", False) object.__setattr__(self, "_ensured_index", False) def __create_index(self, collection, index_key, unique): doc = collection.find_one(projection={"_id": 1}, session=self._session) if doc is None: try: index_keys = [ index_spec['key'] for index_spec in collection.list_indexes( session=self._session) ] except OperationFailure: index_keys = [] if index_key not in index_keys: collection.create_index(index_key.items(), unique=unique, session=self._session) def __ensure_indexes(self): if not object.__getattribute__(self, "_ensured_index"): self.__create_index(self._coll.files, _F_INDEX, False) 
self.__create_index(self._coll.chunks, _C_INDEX, True) object.__setattr__(self, "_ensured_index", True) def abort(self): """Remove all chunks/files that may have been uploaded and close. """ self._coll.chunks.delete_many({"files_id": self._file['_id']}, session=self._session) self._coll.files.delete_one({"_id": self._file['_id']}, session=self._session) object.__setattr__(self, "_closed", True) @property def closed(self): """Is this file closed? """ return self._closed _id = _grid_in_property("_id", "The ``'_id'`` value for this file.", read_only=True) filename = _grid_in_property("filename", "Name of this file.") name = _grid_in_property("filename", "Alias for `filename`.") content_type = _grid_in_property("contentType", "Mime-type for this file.") length = _grid_in_property("length", "Length (in bytes) of this file.", closed_only=True) chunk_size = _grid_in_property("chunkSize", "Chunk size for this file.", read_only=True) upload_date = _grid_in_property("uploadDate", "Date that this file was uploaded.", closed_only=True) md5 = _grid_in_property("md5", "MD5 of the contents of this file " "if an md5 sum was created.", closed_only=True) def __getattr__(self, name): if name in self._file: return self._file[name] raise AttributeError("GridIn object has no attribute '%s'" % name) def __setattr__(self, name, value): # For properties of this instance like _buffer, or descriptors set on # the class like filename, use regular __setattr__ if name in self.__dict__ or name in self.__class__.__dict__: object.__setattr__(self, name, value) else: # All other attributes are part of the document in db.fs.files. # Store them to be sent to server on close() or if closed, send # them now. self._file[name] = value if self._closed: self._coll.files.update_one({"_id": self._file["_id"]}, {"$set": { name: value }}) def __flush_data(self, data): """Flush `data` to a chunk. """ self.__ensure_indexes() if 'md5' in self._file: self._file['md5'].update(data) if not data: return assert (len(data) <= self.chunk_size) chunk = { "files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data) } try: self._chunks.insert_one(chunk, session=self._session) except DuplicateKeyError: self._raise_file_exists(self._file['_id']) self._chunk_number += 1 self._position += len(data) def __flush_buffer(self): """Flush the buffer contents out to a chunk. """ self.__flush_data(self._buffer.getvalue()) self._buffer.close() self._buffer = StringIO() def __flush(self): """Flush the file to the database. """ try: self.__flush_buffer() if "md5" in self._file: self._file["md5"] = self._file["md5"].hexdigest() # The GridFS spec says length SHOULD be an Int64. self._file["length"] = Int64(self._position) self._file["uploadDate"] = datetime.datetime.utcnow() return self._coll.files.insert_one(self._file, session=self._session) except DuplicateKeyError: self._raise_file_exists(self._id) def _raise_file_exists(self, file_id): """Raise a FileExists exception for the given file_id.""" raise FileExists("file with _id %r already exists" % file_id) def close(self): """Flush the file and close it. A closed file cannot be written any more. Calling :meth:`close` more than once is allowed. """ if not self._closed: self.__flush() object.__setattr__(self, "_closed", True) def read(self, size=-1): raise io.UnsupportedOperation('read') def readable(self): return False def seekable(self): return False def write(self, data): """Write data to the file. There is no return value. 
`data` can be either a string of bytes or a file-like object (implementing :meth:`read`). If the file has an :attr:`encoding` attribute, `data` can also be a :class:`unicode` (:class:`str` in python 3) instance, which will be encoded as :attr:`encoding` before being written. Due to buffering, the data may not actually be written to the database until the :meth:`close` method is called. Raises :class:`ValueError` if this file is already closed. Raises :class:`TypeError` if `data` is not an instance of :class:`str` (:class:`bytes` in python 3), a file-like object, or an instance of :class:`unicode` (:class:`str` in python 3). Unicode data is only allowed if the file has an :attr:`encoding` attribute. :Parameters: - `data`: string of bytes or file-like object to be written to the file """ if self._closed: raise ValueError("cannot write to a closed file") try: # file-like read = data.read except AttributeError: # string if not isinstance(data, (text_type, bytes)): raise TypeError("can only write strings or file-like objects") if isinstance(data, text_type): try: data = data.encode(self.encoding) except AttributeError: raise TypeError("must specify an encoding for file in " "order to write %s" % (text_type.__name__, )) read = StringIO(data).read if self._buffer.tell() > 0: # Make sure to flush only when _buffer is complete space = self.chunk_size - self._buffer.tell() if space: try: to_write = read(space) except: self.abort() raise self._buffer.write(to_write) if len(to_write) < space: return # EOF or incomplete self.__flush_buffer() to_write = read(self.chunk_size) while to_write and len(to_write) == self.chunk_size: self.__flush_data(to_write) to_write = read(self.chunk_size) self._buffer.write(to_write) def writelines(self, sequence): """Write a sequence of strings to the file. Does not add seperators. """ for line in sequence: self.write(line) def writeable(self): return True def __enter__(self): """Support for the context manager protocol. """ return self def __exit__(self, exc_type, exc_val, exc_tb): """Support for the context manager protocol. Close the file and allow exceptions to propagate. """ self.close() # propagate exceptions return False
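# The abort() method added in this version makes it possible to clean up a
# partially uploaded file when the source stream fails. A sketch, assuming a
# local mongod, an arbitrary "reports" database, and an existing "big.bin"
# file whose read() might raise part-way through:
from pymongo import MongoClient
from gridfs import GridFS

client = MongoClient()
fs = GridFS(client.reports)

grid_in = fs.new_file(filename="big.bin", chunk_size=255 * 1024)
try:
    with open("big.bin", "rb") as source:
        grid_in.write(source)       # streams the file chunk by chunk
    grid_in.close()
except Exception:
    # Remove any chunks already inserted and mark the GridIn closed so a
    # half-written file never shows up in fs.files.
    grid_in.abort()
    raise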
def _do_batched_write_command(namespace, operation, command, docs, check_keys, uuid_subtype, client): """Execute a batch of insert, update, or delete commands. """ max_bson_size = client.max_bson_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes # XXX: This should come from the server - SERVER-10643 max_cmd_size = max_bson_size + 16382 ordered = command.get('ordered', True) buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b("\x00\x00\x00\x00\xd4\x07\x00\x00")) # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 def send_message(): """Finalize and send the current OP_QUERY message. """ # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = random.randint(MIN_INT32, MAX_INT32) buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) return client._send_message((request_id, buf.getvalue()), with_last_error=True, command=True) # If there are multiple batches we'll # merge results in the caller. results = [] idx = 0 idx_offset = 0 has_docs = False for doc in docs: has_docs = True # Encode the current operation key = b(str(idx)) value = bson.BSON.encode(doc, check_keys, uuid_subtype) # Send a batch? if (buf.tell() + len(key) + len(value) + 2) >= max_cmd_size: if not idx: if operation == _INSERT: raise InvalidDocument("BSON document too large (%d bytes)" " - the connected server supports" " BSON document sizes up to %d" " bytes." % (len(value), max_bson_size)) # There's nothing intelligent we can say # about size for update and remove raise InvalidDocument("command document too large") result = send_message() results.append((idx_offset, result)) if ordered and "writeErrors" in result: return results # Truncate back to the start of list elements buf.seek(list_start + 4) buf.truncate() idx_offset += idx idx = 0 key = b('0') buf.write(_BSONOBJ) buf.write(key) buf.write(_ZERO_8) buf.write(value) idx += 1 if not has_docs: raise InvalidOperation("cannot do an empty bulk write") results.append((idx_offset, send_message())) return results
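# send_message() above relies on one technique throughout: reserve space for
# a length field, write the variable-sized payload, then seek back and patch
# the length in (it does this for the message, the command document, and the
# documents array). A self-contained illustration of just that pattern - not
# the real wire format, the 4-byte prefix is the only structure here:
import struct
from io import BytesIO


def length_prefixed(payload):
    buf = BytesIO()
    buf.write(b"\x00\x00\x00\x00")        # placeholder for the int32 length
    buf.write(payload)
    length = buf.tell()
    buf.seek(0)
    buf.write(struct.pack("<i", length))  # little-endian int32, as in BSON
    return buf.getvalue()


msg = length_prefixed(b"hello")
assert struct.unpack("<i", msg[:4])[0] == len(msg) == 9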
def _do_batched_write_command(namespace, operation, command, docs, check_keys, opts, ctx): """Create the next batched insert, update, or delete command. """ max_bson_size = ctx.max_bson_size max_write_batch_size = ctx.max_write_batch_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes. # Server guarantees there is enough room: SERVER-10643. max_cmd_size = max_bson_size + _COMMAND_OVERHEAD buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b"\x00\x00\x00\x00\xd4\x07\x00\x00") # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 to_send = [] idx = 0 for doc in docs: # Encode the current operation key = b(str(idx)) value = bson.BSON.encode(doc, check_keys, opts) # Is there enough room to add this document? max_cmd_size accounts for # the two trailing null bytes. enough_data = (buf.tell() + len(key) + len(value)) >= max_cmd_size enough_documents = (idx >= max_write_batch_size) if enough_data or enough_documents: if not idx: write_op = "insert" if operation == _INSERT else None _raise_document_too_large(write_op, len(value), max_bson_size) break buf.write(_BSONOBJ) buf.write(key) buf.write(_ZERO_8) buf.write(value) to_send.append(doc) idx += 1 # Finalize the current OP_QUERY message. # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = _randint() buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) return request_id, buf.getvalue(), to_send
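# Unlike the older variants, this version builds only one message and returns
# the documents it managed to fit, leaving the rest to the caller. A
# hypothetical driver loop (send_in_batches and ctx.send are invented names,
# not PyMongo API) showing how such a function would be driven:
def send_in_batches(namespace, operation, command, docs, check_keys, opts, ctx):
    remaining = list(docs)
    while remaining:
        request_id, msg, to_send = _do_batched_write_command(
            namespace, operation, command, remaining, check_keys, opts, ctx)
        ctx.send(request_id, msg)              # put the message on the wire
        remaining = remaining[len(to_send):]   # retry whatever didn't fit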
class GridIn(object): """Class to write data to GridFS. """ def __init__(self, root_collection, **kwargs): """Write a file to GridFS Application developers should generally not need to instantiate this class directly - instead see the methods provided by :class:`~gridfs.GridFS`. Raises :class:`TypeError` if `root_collection` is not an instance of :class:`~pymongo.collection.Collection`. Any of the file level options specified in the `GridFS Spec <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as keyword arguments. Any additional keyword arguments will be set as additional fields on the file document. Valid keyword arguments include: - ``"_id"``: unique ID for this file (default: :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must not have already been used for another file - ``"filename"``: human name for the file - ``"contentType"`` or ``"content_type"``: valid mime-type for the file - ``"chunkSize"`` or ``"chunk_size"``: size of each of the chunks, in bytes (default: 255 kb) - ``"encoding"``: encoding used for this file. In Python 2, any :class:`unicode` that is written to the file will be converted to a :class:`str`. In Python 3, any :class:`str` that is written to the file will be converted to :class:`bytes`. :Parameters: - `root_collection`: root collection to write to - `**kwargs` (optional): file level options (see above) .. versionchanged:: 3.0 `root_collection` must use an acknowledged :attr:`~pymongo.collection.Collection.write_concern` """ if not isinstance(root_collection, Collection): raise TypeError("root_collection must be an " "instance of Collection") # With w=0, 'filemd5' might run before the final chunks are written. if not root_collection.write_concern.acknowledged: raise ConfigurationError('root_collection must use ' 'acknowledged write_concern') # Handle alternative naming if "content_type" in kwargs: kwargs["contentType"] = kwargs.pop("content_type") if "chunk_size" in kwargs: kwargs["chunkSize"] = kwargs.pop("chunk_size") # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) object.__setattr__(self, "_coll", root_collection) object.__setattr__(self, "_chunks", root_collection.chunks) object.__setattr__(self, "_file", kwargs) object.__setattr__(self, "_buffer", StringIO()) object.__setattr__(self, "_position", 0) object.__setattr__(self, "_chunk_number", 0) object.__setattr__(self, "_closed", False) object.__setattr__(self, "_ensured_index", False) def _ensure_index(self): if not object.__getattribute__(self, "_ensured_index"): try: self._coll.chunks.create_index( [("files_id", ASCENDING), ("n", ASCENDING)], unique=True) except OperationFailure as exc: if not (exc.code in UNAUTHORIZED_CODES or "authorized" in str(exc)): raise exc object.__setattr__(self, "_ensured_index", True) @property def closed(self): """Is this file closed? 
""" return self._closed _id = _grid_in_property("_id", "The ``'_id'`` value for this file.", read_only=True) filename = _grid_in_property("filename", "Name of this file.") name = _grid_in_property("filename", "Alias for `filename`.") content_type = _grid_in_property("contentType", "Mime-type for this file.") length = _grid_in_property("length", "Length (in bytes) of this file.", closed_only=True) chunk_size = _grid_in_property("chunkSize", "Chunk size for this file.", read_only=True) upload_date = _grid_in_property("uploadDate", "Date that this file was uploaded.", closed_only=True) md5 = _grid_in_property("md5", "MD5 of the contents of this file " "(generated on the server).", closed_only=True) def __getattr__(self, name): if name in self._file: return self._file[name] raise AttributeError("GridIn object has no attribute '%s'" % name) def __setattr__(self, name, value): # For properties of this instance like _buffer, or descriptors set on # the class like filename, use regular __setattr__ if name in self.__dict__ or name in self.__class__.__dict__: object.__setattr__(self, name, value) else: # All other attributes are part of the document in db.fs.files. # Store them to be sent to server on close() or if closed, send # them now. self._file[name] = value if self._closed: self._coll.files.update_one({"_id": self._file["_id"]}, {"$set": {name: value}}) def __flush_data(self, data): """Flush `data` to a chunk. """ # Ensure the index, even if there's nothing to write, so # the filemd5 command always succeeds. self._ensure_index() if not data: return assert(len(data) <= self.chunk_size) chunk = {"files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data)} try: self._chunks.insert_one(chunk) except DuplicateKeyError: self._raise_file_exists(self._file['_id']) self._chunk_number += 1 self._position += len(data) def __flush_buffer(self): """Flush the buffer contents out to a chunk. """ self.__flush_data(self._buffer.getvalue()) self._buffer.close() self._buffer = StringIO() def __flush(self): """Flush the file to the database. """ try: self.__flush_buffer() db = self._coll.database md5 = db.command( "filemd5", self._id, root=self._coll.name, read_preference=ReadPreference.PRIMARY)["md5"] self._file["md5"] = md5 self._file["length"] = self._position self._file["uploadDate"] = datetime.datetime.utcnow() return self._coll.files.insert_one(self._file) except DuplicateKeyError: self._raise_file_exists(self._id) def _raise_file_exists(self, file_id): """Raise a FileExists exception for the given file_id.""" raise FileExists("file with _id %r already exists" % file_id) def close(self): """Flush the file and close it. A closed file cannot be written any more. Calling :meth:`close` more than once is allowed. """ if not self._closed: self.__flush() object.__setattr__(self, "_closed", True) def write(self, data): """Write data to the file. There is no return value. `data` can be either a string of bytes or a file-like object (implementing :meth:`read`). If the file has an :attr:`encoding` attribute, `data` can also be a :class:`unicode` (:class:`str` in python 3) instance, which will be encoded as :attr:`encoding` before being written. Due to buffering, the data may not actually be written to the database until the :meth:`close` method is called. Raises :class:`ValueError` if this file is already closed. Raises :class:`TypeError` if `data` is not an instance of :class:`str` (:class:`bytes` in python 3), a file-like object, or an instance of :class:`unicode` (:class:`str` in python 3). 
Unicode data is only allowed if the file has an :attr:`encoding` attribute. :Parameters: - `data`: string of bytes or file-like object to be written to the file """ if self._closed: raise ValueError("cannot write to a closed file") try: # file-like read = data.read except AttributeError: # string if not isinstance(data, (text_type, bytes)): raise TypeError("can only write strings or file-like objects") if isinstance(data, text_type): try: data = data.encode(self.encoding) except AttributeError: raise TypeError("must specify an encoding for file in " "order to write %s" % (text_type.__name__,)) read = StringIO(data).read if self._buffer.tell() > 0: # Make sure to flush only when _buffer is complete space = self.chunk_size - self._buffer.tell() if space: to_write = read(space) self._buffer.write(to_write) if len(to_write) < space: return # EOF or incomplete self.__flush_buffer() to_write = read(self.chunk_size) while to_write and len(to_write) == self.chunk_size: self.__flush_data(to_write) to_write = read(self.chunk_size) self._buffer.write(to_write) def writelines(self, sequence): """Write a sequence of strings to the file. Does not add seperators. """ for line in sequence: self.write(line) def __enter__(self): """Support for the context manager protocol. """ return self def __exit__(self, exc_type, exc_val, exc_tb): """Support for the context manager protocol. Close the file and allow exceptions to propagate. """ self.close() # propagate exceptions return False
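# _ensure_index above amounts to one create_index call with authorization
# errors tolerated. The standalone equivalent with plain PyMongo (the database
# name is arbitrary; the real method only ignores specific "unauthorized"
# error codes, which is simplified to a blanket OperationFailure here):
from pymongo import MongoClient, ASCENDING
from pymongo.errors import OperationFailure

chunks = MongoClient().grid_example.fs.chunks
try:
    # The unique (files_id, n) index is what turns a re-inserted chunk into a
    # DuplicateKeyError, which __flush_data reports as FileExists.
    chunks.create_index([("files_id", ASCENDING), ("n", ASCENDING)],
                        unique=True)
except OperationFailure:
    pass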
def _do_batched_insert(collection_name, docs, check_keys, safe, last_error_args, continue_on_error, opts, sock_info): """Insert `docs` using multiple batches. """ def _insert_message(insert_message, send_safe): """Build the insert message with header and GLE. """ request_id, final_message = __pack_message(2002, insert_message) if send_safe: request_id, error_message, _ = __last_error(collection_name, last_error_args) final_message += error_message return request_id, final_message send_safe = safe or not continue_on_error last_error = None data = StringIO() data.write(struct.pack("<i", int(continue_on_error))) data.write(bson._make_c_string(collection_name)) message_length = begin_loc = data.tell() has_docs = False for doc in docs: encoded = bson.BSON.encode(doc, check_keys, opts) encoded_length = len(encoded) too_large = (encoded_length > sock_info.max_bson_size) message_length += encoded_length if message_length < sock_info.max_message_size and not too_large: data.write(encoded) has_docs = True continue if has_docs: # We have enough data, send this message. try: request_id, msg = _insert_message(data.getvalue(), send_safe) sock_info.legacy_write(request_id, msg, 0, send_safe) # Exception type could be OperationFailure or a subtype # (e.g. DuplicateKeyError) except OperationFailure as exc: # Like it says, continue on error... if continue_on_error: # Store exception details to re-raise after the final batch. last_error = exc # With unacknowledged writes just return at the first error. elif not safe: return # With acknowledged writes raise immediately. else: raise if too_large: raise DocumentTooLarge("BSON document too large (%d bytes)" " - the connected server supports" " BSON document sizes up to %d" " bytes." % (encoded_length, sock_info.max_bson_size)) message_length = begin_loc + encoded_length data.seek(begin_loc) data.truncate() data.write(encoded) if not has_docs: raise InvalidOperation("cannot do an empty bulk insert") request_id, msg = _insert_message(data.getvalue(), safe) sock_info.legacy_write(request_id, msg, 0, safe) # Re-raise any exception stored due to continue_on_error if last_error is not None: raise last_error
def _do_batched_write_command(namespace, operation, command, docs, check_keys, opts, ctx): """Execute a batch of insert, update, or delete commands. """ max_bson_size = ctx.max_bson_size max_write_batch_size = ctx.max_write_batch_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes. # Server guarantees there is enough room: SERVER-10643. max_cmd_size = max_bson_size + _COMMAND_OVERHEAD ordered = command.get('ordered', True) buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b"\x00\x00\x00\x00\xd4\x07\x00\x00") # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 to_send = [] def send_message(): """Finalize and send the current OP_QUERY message. """ # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = _randint() buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) return ctx.write_command(request_id, buf.getvalue(), to_send) # If there are multiple batches we'll # merge results in the caller. results = [] idx = 0 idx_offset = 0 has_docs = False for doc in docs: has_docs = True # Encode the current operation key = b(str(idx)) value = bson.BSON.encode(doc, check_keys, opts) # Send a batch? enough_data = (buf.tell() + len(key) + len(value) + 2) >= max_cmd_size enough_documents = (idx >= max_write_batch_size) if enough_data or enough_documents: if not idx: write_op = "insert" if operation == _INSERT else None _raise_document_too_large( write_op, len(value), max_bson_size) result = send_message() results.append((idx_offset, result)) if ordered and "writeErrors" in result: return results # Truncate back to the start of list elements buf.seek(list_start + 4) buf.truncate() idx_offset += idx idx = 0 key = b'0' to_send = [] buf.write(_BSONOBJ) buf.write(key) buf.write(_ZERO_8) buf.write(value) to_send.append(doc) idx += 1 if not has_docs: raise InvalidOperation("cannot do an empty bulk write") results.append((idx_offset, send_message())) return results
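# Each entry in `results` is (idx_offset, reply), and writeErrors indices in a
# reply are relative to that batch. A hypothetical sketch of the merge the
# caller performs (merge_batch_results is an invented name; the "n",
# "writeErrors" and "index" keys follow the server's write-command reply):
def merge_batch_results(results):
    merged = {"n": 0, "writeErrors": []}
    for offset, reply in results:
        merged["n"] += reply.get("n", 0)
        for error in reply.get("writeErrors", []):
            error = dict(error)
            error["index"] += offset   # back to a position in the full docs list
            merged["writeErrors"].append(error)
    return merged


print(merge_batch_results([
    (0, {"n": 2, "writeErrors": []}),
    (2, {"n": 1, "writeErrors": [{"index": 1, "code": 11000,
                                  "errmsg": "duplicate key"}]}),
]))  # {'n': 3, 'writeErrors': [{'index': 3, 'code': 11000, ...}]}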
def _do_batched_write_command(namespace, operation, command, docs, check_keys, uuid_subtype, client): """Execute a batch of insert, update, or delete commands. """ max_bson_size = client.max_bson_size max_write_batch_size = client.max_write_batch_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes # XXX: This should come from the server - SERVER-10643 max_cmd_size = max_bson_size + 16382 ordered = command.get('ordered', True) buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b("\x00\x00\x00\x00\xd4\x07\x00\x00")) # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 def send_message(): """Finalize and send the current OP_QUERY message. """ # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = random.randint(MIN_INT32, MAX_INT32) buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) return client._send_message((request_id, buf.getvalue()), with_last_error=True, command=True) # If there are multiple batches we'll # merge results in the caller. results = [] idx = 0 idx_offset = 0 has_docs = False for doc in docs: has_docs = True # Encode the current operation key = b(str(idx)) value = bson.BSON.encode(doc, check_keys, uuid_subtype) # Send a batch? enough_data = (buf.tell() + len(key) + len(value) + 2) >= max_cmd_size enough_documents = (idx >= max_write_batch_size) if enough_data or enough_documents: if not idx: if operation == _INSERT: raise DocumentTooLarge("BSON document too large (%d bytes)" " - the connected server supports" " BSON document sizes up to %d" " bytes." % (len(value), max_bson_size)) # There's nothing intelligent we can say # about size for update and remove raise DocumentTooLarge("command document too large") result = send_message() results.append((idx_offset, result)) if ordered and "writeErrors" in result: return results # Truncate back to the start of list elements buf.seek(list_start + 4) buf.truncate() idx_offset += idx idx = 0 key = b('0') buf.write(_BSONOBJ) buf.write(key) buf.write(_ZERO_8) buf.write(value) idx += 1 if not has_docs: raise InvalidOperation("cannot do an empty bulk write") results.append((idx_offset, send_message())) return results
class GridIn(object):
    def __init__(self, client, root_collection, **kwargs):
        self.client = client
        self.root_collection = root_collection
        self._files = self.client.connection(files_coll(self.root_collection))
        self._chunks = self.client.connection(chunks_coll(self.root_collection))

        # Handle alternative naming
        if "content_type" in kwargs:
            kwargs["contentType"] = kwargs.pop("content_type")
        if "chunk_size" in kwargs:
            kwargs["chunkSize"] = kwargs.pop("chunk_size")

        # Defaults
        kwargs["_id"] = kwargs.get("_id", ObjectId())
        kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)

        self._file = kwargs
        self._chunk_number = 0
        self._position = 0
        self._buffer = StringIO()

    def __flush_data(self, data):
        # Fire-and-forget insert of a single chunk document.
        def no_check(*args, **kwargs):
            pass

        chunk = {"files_id": self._file['_id'],
                 "n": self._chunk_number,
                 "data": Binary(data)}
        self._chunks.insert(chunk, callback=no_check)
        self._chunk_number += 1
        self._position += len(data)

    def __flush_buffer(self):
        self.__flush_data(self._buffer.getvalue())
        self._buffer.close()
        self._buffer = StringIO()

    def write(self, data, callback=None, **kwargs):
        self._file.update(kwargs)
        try:
            # file-like
            read = data.read
        except AttributeError:
            # string
            read = StringIO(data).read

        if self._buffer.tell() > 0:
            # Top the buffered partial chunk up to chunkSize before flushing.
            space = self._file['chunkSize'] - self._buffer.tell()
            if space:
                to_write = read(space)
                self._buffer.write(to_write)
                if len(to_write) < space:
                    return  # EOF
            self.__flush_buffer()

        to_write = read(self._file['chunkSize'])
        while to_write and len(to_write) == self._file['chunkSize']:
            self.__flush_data(to_write)
            to_write = read(self._file['chunkSize'])
        self._buffer.write(to_write)

    def __flush(self):
        self.__flush_buffer()

        def no_check(*args, **kwargs):
            pass

        def cb_md5(*args, **kwargs):
            try:
                self._file['md5'] = args[0]['md5']
            except Exception:
                pass

        # Record the total number of bytes flushed, not the size of the last
        # write() argument.
        self._file["length"] = self._position
        self._file["uploadDate"] = datetime.datetime.utcnow()
        self._files.insert(self._file, callback=no_check)
        self.client.command('filemd5', self._file['_id'],
                            root=self.root_collection, callback=cb_md5)

    def close(self):
        self.__flush()
class GridIn(object): """Class to write data to GridFS. """ def __init__(self, root_collection, **kwargs): """Write a file to GridFS Application developers should generally not need to instantiate this class directly - instead see the methods provided by :class:`~gridfs.GridFS`. Raises :class:`TypeError` if `root_collection` is not an instance of :class:`~pymongo.collection.Collection`. Any of the file level options specified in the `GridFS Spec <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as keyword arguments. Any additional keyword arguments will be set as additional fields on the file document. Valid keyword arguments include: - ``"_id"``: unique ID for this file (default: :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must not have already been used for another file - ``"filename"``: human name for the file - ``"contentType"`` or ``"content_type"``: valid mime-type for the file - ``"chunkSize"`` or ``"chunk_size"``: size of each of the chunks, in bytes (default: 255 kb) - ``"encoding"``: encoding used for this file. In Python 2, any :class:`unicode` that is written to the file will be converted to a :class:`str`. In Python 3, any :class:`str` that is written to the file will be converted to :class:`bytes`. :Parameters: - `root_collection`: root collection to write to - `**kwargs` (optional): file level options (see above) .. versionchanged:: 3.0 `root_collection` must use an acknowledged :attr:`~pymongo.collection.Collection.write_concern` """ if not isinstance(root_collection, Collection): raise TypeError("root_collection must be an " "instance of Collection") # With w=0, 'filemd5' might run before the final chunks are written. if not root_collection.write_concern.acknowledged: raise ConfigurationError('root_collection must use ' 'acknowledged write_concern') # Handle alternative naming if "content_type" in kwargs: kwargs["contentType"] = kwargs.pop("content_type") if "chunk_size" in kwargs: kwargs["chunkSize"] = kwargs.pop("chunk_size") kwargs['md5'] = md5() # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) object.__setattr__(self, "_coll", root_collection) object.__setattr__(self, "_chunks", root_collection.chunks) object.__setattr__(self, "_file", kwargs) object.__setattr__(self, "_buffer", StringIO()) object.__setattr__(self, "_position", 0) object.__setattr__(self, "_chunk_number", 0) object.__setattr__(self, "_closed", False) object.__setattr__(self, "_ensured_index", False) def __create_index(self, collection, index, unique): doc = collection.find_one(projection={"_id": 1}) if doc is None: try: indexes = list(collection.list_indexes()) except OperationFailure: indexes = [] if index not in indexes: collection.create_index(index, unique=unique) def __ensure_indexes(self): if not object.__getattribute__(self, "_ensured_index"): self.__create_index(self._coll.files, _F_INDEX, False) self.__create_index(self._coll.chunks, _C_INDEX, True) object.__setattr__(self, "_ensured_index", True) def abort(self): """Remove all chunks/files that may have been uploaded and close. """ self._coll.chunks.delete_many({"files_id": self._file['_id']}) self._coll.files.delete_one({"_id": self._file['_id']}) object.__setattr__(self, "_closed", True) @property def closed(self): """Is this file closed? 
""" return self._closed _id = _grid_in_property("_id", "The ``'_id'`` value for this file.", read_only=True) filename = _grid_in_property("filename", "Name of this file.") name = _grid_in_property("filename", "Alias for `filename`.") content_type = _grid_in_property("contentType", "Mime-type for this file.") length = _grid_in_property("length", "Length (in bytes) of this file.", closed_only=True) chunk_size = _grid_in_property("chunkSize", "Chunk size for this file.", read_only=True) upload_date = _grid_in_property("uploadDate", "Date that this file was uploaded.", closed_only=True) md5 = _grid_in_property("md5", "MD5 of the contents of this file " "(generated on the server).", closed_only=True) def __getattr__(self, name): if name in self._file: return self._file[name] raise AttributeError("GridIn object has no attribute '%s'" % name) def __setattr__(self, name, value): # For properties of this instance like _buffer, or descriptors set on # the class like filename, use regular __setattr__ if name in self.__dict__ or name in self.__class__.__dict__: object.__setattr__(self, name, value) else: # All other attributes are part of the document in db.fs.files. # Store them to be sent to server on close() or if closed, send # them now. self._file[name] = value if self._closed: self._coll.files.update_one({"_id": self._file["_id"]}, {"$set": {name: value}}) def __flush_data(self, data): """Flush `data` to a chunk. """ # Ensure the index, even if there's nothing to write, so # the filemd5 command always succeeds. self.__ensure_indexes() self._file['md5'].update(data) if not data: return assert(len(data) <= self.chunk_size) chunk = {"files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data)} try: self._chunks.insert_one(chunk) except DuplicateKeyError: self._raise_file_exists(self._file['_id']) self._chunk_number += 1 self._position += len(data) def __flush_buffer(self): """Flush the buffer contents out to a chunk. """ self.__flush_data(self._buffer.getvalue()) self._buffer.close() self._buffer = StringIO() def __flush(self): """Flush the file to the database. """ try: self.__flush_buffer() self._file['md5'] = self._file["md5"].hexdigest() self._file["length"] = self._position self._file["uploadDate"] = datetime.datetime.utcnow() return self._coll.files.insert_one(self._file) except DuplicateKeyError: self._raise_file_exists(self._id) def _raise_file_exists(self, file_id): """Raise a FileExists exception for the given file_id.""" raise FileExists("file with _id %r already exists" % file_id) def close(self): """Flush the file and close it. A closed file cannot be written any more. Calling :meth:`close` more than once is allowed. """ if not self._closed: self.__flush() object.__setattr__(self, "_closed", True) def write(self, data): """Write data to the file. There is no return value. `data` can be either a string of bytes or a file-like object (implementing :meth:`read`). If the file has an :attr:`encoding` attribute, `data` can also be a :class:`unicode` (:class:`str` in python 3) instance, which will be encoded as :attr:`encoding` before being written. Due to buffering, the data may not actually be written to the database until the :meth:`close` method is called. Raises :class:`ValueError` if this file is already closed. Raises :class:`TypeError` if `data` is not an instance of :class:`str` (:class:`bytes` in python 3), a file-like object, or an instance of :class:`unicode` (:class:`str` in python 3). Unicode data is only allowed if the file has an :attr:`encoding` attribute. 
:Parameters: - `data`: string of bytes or file-like object to be written to the file """ if self._closed: raise ValueError("cannot write to a closed file") try: # file-like read = data.read except AttributeError: # string if not isinstance(data, (text_type, bytes)): raise TypeError("can only write strings or file-like objects") if isinstance(data, text_type): try: data = data.encode(self.encoding) except AttributeError: raise TypeError("must specify an encoding for file in " "order to write %s" % (text_type.__name__,)) read = StringIO(data).read if self._buffer.tell() > 0: # Make sure to flush only when _buffer is complete space = self.chunk_size - self._buffer.tell() if space: try: to_write = read(space) except: self.abort() raise self._buffer.write(to_write) if len(to_write) < space: return # EOF or incomplete self.__flush_buffer() to_write = read(self.chunk_size) while to_write and len(to_write) == self.chunk_size: self.__flush_data(to_write) to_write = read(self.chunk_size) self._buffer.write(to_write) def writelines(self, sequence): """Write a sequence of strings to the file. Does not add seperators. """ for line in sequence: self.write(line) def __enter__(self): """Support for the context manager protocol. """ return self def __exit__(self, exc_type, exc_val, exc_tb): """Support for the context manager protocol. Close the file and allow exceptions to propagate. """ self.close() # propagate exceptions return False
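# The buffering scheme in write() above is easy to check in isolation: top a
# partly filled buffer up to exactly chunk_size and flush it, stream full
# chunks straight through, and keep any short tail buffered. A standalone
# sketch - chunked_copy, read, flush and buffer are stand-ins for write(),
# data.read, __flush_data and self._buffer, not GridFS API:
from io import BytesIO


def chunked_copy(read, flush, chunk_size, buffer):
    if buffer.tell() > 0:
        space = chunk_size - buffer.tell()
        if space:
            piece = read(space)
            buffer.write(piece)
            if len(piece) < space:
                return buffer           # EOF or incomplete: keep buffering
        flush(buffer.getvalue())        # buffer now holds one full chunk
        buffer = BytesIO()

    piece = read(chunk_size)
    while piece and len(piece) == chunk_size:
        flush(piece)                    # full chunks bypass the buffer
        piece = read(chunk_size)
    buffer.write(piece)                 # short tail waits for the next call
    return buffer


source = BytesIO(b"a" * 11)
buffer = BytesIO(b"xx")
buffer.seek(0, 2)                       # GridIn's buffer sits at its end
chunks = []
buffer = chunked_copy(source.read, chunks.append, 4, buffer)
print(chunks, buffer.getvalue())        # [b'xxaa', b'aaaa', b'aaaa'] b'a'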
def _do_batched_write_command(namespace, operation, command, docs, check_keys, opts, ctx): """Create the next batched insert, update, or delete command. """ max_bson_size = ctx.max_bson_size max_write_batch_size = ctx.max_write_batch_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes. # Server guarantees there is enough room: SERVER-10643. max_cmd_size = max_bson_size + _COMMAND_OVERHEAD buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b"\x00\x00\x00\x00\xd4\x07\x00\x00") # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 to_send = [] idx = 0 for doc in docs: # Encode the current operation key = b(str(idx)) value = bson.BSON.encode(doc, check_keys, opts) # Is there enough room to add this document? max_cmd_size accounts for # the two trailing null bytes. enough_data = (buf.tell() + len(key) + len(value)) >= max_cmd_size enough_documents = (idx >= max_write_batch_size) if enough_data or enough_documents: if not idx: write_op = "insert" if operation == _INSERT else None _raise_document_too_large( write_op, len(value), max_bson_size) break buf.write(_BSONOBJ) buf.write(key) buf.write(_ZERO_8) buf.write(value) to_send.append(doc) idx += 1 # Finalize the current OP_QUERY message. # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = _randint() buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) return request_id, buf.getvalue(), to_send
def _do_batched_insert(collection_name, docs, check_keys, safe, last_error_args, continue_on_error, opts, ctx): """Insert `docs` using multiple batches. """ def _insert_message(insert_message, send_safe): """Build the insert message with header and GLE. """ request_id, final_message = __pack_message(2002, insert_message) if send_safe: request_id, error_message, _ = __last_error(collection_name, last_error_args) final_message += error_message return request_id, final_message send_safe = safe or not continue_on_error last_error = None data = StringIO() data.write(struct.pack("<i", int(continue_on_error))) data.write(_make_c_string(collection_name)) message_length = begin_loc = data.tell() has_docs = False to_send = [] encode = _dict_to_bson # Make local compress = ctx.compress and not (safe or send_safe) for doc in docs: encoded = encode(doc, check_keys, opts) encoded_length = len(encoded) too_large = (encoded_length > ctx.max_bson_size) message_length += encoded_length if message_length < ctx.max_message_size and not too_large: data.write(encoded) to_send.append(doc) has_docs = True continue if has_docs: # We have enough data, send this message. try: if compress: rid, msg = None, data.getvalue() else: rid, msg = _insert_message(data.getvalue(), send_safe) ctx.legacy_bulk_insert( rid, msg, 0, send_safe, to_send, compress) # Exception type could be OperationFailure or a subtype # (e.g. DuplicateKeyError) except OperationFailure as exc: # Like it says, continue on error... if continue_on_error: # Store exception details to re-raise after the final batch. last_error = exc # With unacknowledged writes just return at the first error. elif not safe: return # With acknowledged writes raise immediately. else: raise if too_large: _raise_document_too_large( "insert", encoded_length, ctx.max_bson_size) message_length = begin_loc + encoded_length data.seek(begin_loc) data.truncate() data.write(encoded) to_send = [doc] if not has_docs: raise InvalidOperation("cannot do an empty bulk insert") if compress: request_id, msg = None, data.getvalue() else: request_id, msg = _insert_message(data.getvalue(), safe) ctx.legacy_bulk_insert(request_id, msg, 0, safe, to_send, compress) # Re-raise any exception stored due to continue_on_error if last_error is not None: raise last_error
def _do_batched_insert(collection_name, docs, check_keys, safe,
                       last_error_args, continue_on_error, uuid_subtype,
                       client):
    """Insert `docs` using multiple batches.
    """
    def _insert_message(insert_message, send_safe):
        """Build the insert message with header and GLE.
        """
        request_id, final_message = __pack_message(2002, insert_message)
        if send_safe:
            request_id, error_message, _ = __last_error(
                collection_name, last_error_args)
            final_message += error_message
        return request_id, final_message

    send_safe = safe or not continue_on_error
    last_error = None
    data = StringIO()
    data.write(struct.pack("<i", int(continue_on_error)))
    data.write(bson._make_c_string(collection_name))
    message_length = begin_loc = data.tell()
    has_docs = False
    for doc in docs:
        encoded = bson.BSON.encode(doc, check_keys, uuid_subtype)
        encoded_length = len(encoded)
        too_large = (encoded_length > client.max_bson_size)
        message_length += encoded_length
        if message_length < client.max_message_size and not too_large:
            data.write(encoded)
            has_docs = True
            continue

        if has_docs:
            # We have enough data, send this message.
            try:
                client._send_message(
                    _insert_message(data.getvalue(), send_safe), send_safe)
            # Exception type could be OperationFailure or a subtype
            # (e.g. DuplicateKeyError)
            except OperationFailure as exc:
                # Like it says, continue on error...
                if continue_on_error:
                    # Store exception details to re-raise after the final
                    # batch.
                    last_error = exc
                # With unacknowledged writes just return at the first error.
                elif not safe:
                    return
                # With acknowledged writes raise immediately.
                else:
                    raise

        if too_large:
            raise DocumentTooLarge("BSON document too large (%d bytes)"
                                   " - the connected server supports"
                                   " BSON document sizes up to %d"
                                   " bytes." % (encoded_length,
                                                client.max_bson_size))

        message_length = begin_loc + encoded_length
        data.seek(begin_loc)
        data.truncate()
        data.write(encoded)

    if not has_docs:
        raise InvalidOperation("cannot do an empty bulk insert")

    # Send the final (possibly only) batch and re-raise any error deferred
    # by continue_on_error.
    client._send_message(_insert_message(data.getvalue(), safe), safe)
    if last_error is not None:
        raise last_error
def _do_batched_write_command(namespace, operation, command, docs, check_keys, opts, ctx): """Execute a batch of insert, update, or delete commands. """ max_bson_size = ctx.max_bson_size max_write_batch_size = ctx.max_write_batch_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes. # Server guarantees there is enough room: SERVER-10643. max_cmd_size = max_bson_size + _COMMAND_OVERHEAD ordered = command.get('ordered', True) buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b"\x00\x00\x00\x00\xd4\x07\x00\x00") # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 to_send = [] def send_message(): """Finalize and send the current OP_QUERY message. """ # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = _randint() buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) return ctx.write_command(request_id, buf.getvalue(), to_send) # If there are multiple batches we'll # merge results in the caller. results = [] idx = 0 idx_offset = 0 has_docs = False for doc in docs: has_docs = True # Encode the current operation key = b(str(idx)) value = bson.BSON.encode(doc, check_keys, opts) # Send a batch? enough_data = (buf.tell() + len(key) + len(value) + 2) >= max_cmd_size enough_documents = (idx >= max_write_batch_size) if enough_data or enough_documents: if not idx: write_op = "insert" if operation == _INSERT else None _raise_document_too_large(write_op, len(value), max_bson_size) result = send_message() results.append((idx_offset, result)) if ordered and "writeErrors" in result: return results # Truncate back to the start of list elements buf.seek(list_start + 4) buf.truncate() idx_offset += idx idx = 0 key = b'0' to_send = [] buf.write(_BSONOBJ) buf.write(key) buf.write(_ZERO_8) buf.write(value) to_send.append(doc) idx += 1 if not has_docs: raise InvalidOperation("cannot do an empty bulk write") results.append((idx_offset, send_message())) return results
class GridIn(object): """Class to write data to GridFS. """ def __init__(self, root_collection, **kwargs): """Write a file to GridFS Application developers should generally not need to instantiate this class directly - instead see the methods provided by :class:`~gridfs.GridFS`. Raises :class:`TypeError` if `root_collection` is not an instance of :class:`~pymongo.collection.Collection`. Any of the file level options specified in the `GridFS Spec <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as keyword arguments. Any additional keyword arguments will be set as additional fields on the file document. Valid keyword arguments include: - ``"_id"``: unique ID for this file (default: :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must not have already been used for another file - ``"filename"``: human name for the file - ``"contentType"`` or ``"content_type"``: valid mime-type for the file - ``"chunkSize"`` or ``"chunk_size"``: size of each of the chunks, in bytes (default: 256 kb) - ``"encoding"``: encoding used for this file. In Python 2, any :class:`unicode` that is written to the file will be converted to a :class:`str`. In Python 3, any :class:`str` that is written to the file will be converted to :class:`bytes`. If you turn off write-acknowledgment for performance reasons, it is critical to wrap calls to :meth:`write` and :meth:`close` within a single request: >>> from pymongo import MongoClient >>> from gridfs import GridFS >>> client = MongoClient(w=0) # turn off write acknowledgment >>> fs = GridFS(client.database) >>> gridin = fs.new_file() >>> request = client.start_request() >>> try: ... for i in range(10): ... gridin.write('foo') ... gridin.close() ... finally: ... request.end() In Python 2.5 and later this code can be simplified with a with-statement, see :doc:`/examples/requests` for more information. :Parameters: - `root_collection`: root collection to write to - `**kwargs` (optional): file level options (see above) """ if not isinstance(root_collection, Collection): raise TypeError("root_collection must be an " "instance of Collection") # Handle alternative naming if "content_type" in kwargs: kwargs["contentType"] = kwargs.pop("content_type") if "chunk_size" in kwargs: kwargs["chunkSize"] = kwargs.pop("chunk_size") # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) object.__setattr__(self, "_coll", root_collection) object.__setattr__(self, "_chunks", root_collection.chunks) object.__setattr__(self, "_file", kwargs) object.__setattr__(self, "_buffer", StringIO()) object.__setattr__(self, "_position", 0) object.__setattr__(self, "_chunk_number", 0) object.__setattr__(self, "_closed", False) object.__setattr__(self, "_ensured_index", False) def _ensure_index(self): if not object.__getattribute__(self, "_ensured_index"): self._coll.chunks.ensure_index( [("files_id", ASCENDING), ("n", ASCENDING)], unique=True) object.__setattr__(self, "_ensured_index", True) @property def closed(self): """Is this file closed? 
""" return self._closed _id = _create_property("_id", "The ``'_id'`` value for this file.", read_only=True) filename = _create_property("filename", "Name of this file.") name = _create_property("filename", "Alias for `filename`.") content_type = _create_property("contentType", "Mime-type for this file.") length = _create_property("length", "Length (in bytes) of this file.", closed_only=True) chunk_size = _create_property("chunkSize", "Chunk size for this file.", read_only=True) upload_date = _create_property("uploadDate", "Date that this file was uploaded.", closed_only=True) md5 = _create_property("md5", "MD5 of the contents of this file " "(generated on the server).", closed_only=True) def __getattr__(self, name): if name in self._file: return self._file[name] raise AttributeError("GridIn object has no attribute '%s'" % name) def __setattr__(self, name, value): # For properties of this instance like _buffer, or descriptors set on # the class like filename, use regular __setattr__ if name in self.__dict__ or name in self.__class__.__dict__: object.__setattr__(self, name, value) else: # All other attributes are part of the document in db.fs.files. # Store them to be sent to server on close() or if closed, send # them now. self._file[name] = value if self._closed: self._coll.files.update({"_id": self._file["_id"]}, {"$set": {name: value}}, **self._coll._get_wc_override()) def __flush_data(self, data): """Flush `data` to a chunk. """ # Ensure the index, even if there's nothing to write, so # the filemd5 command always succeeds. self._ensure_index() if not data: return assert(len(data) <= self.chunk_size) chunk = {"files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data)} try: self._chunks.insert(chunk) except DuplicateKeyError: self._raise_file_exists(self._file['_id']) self._chunk_number += 1 self._position += len(data) def __flush_buffer(self): """Flush the buffer contents out to a chunk. """ self.__flush_data(self._buffer.getvalue()) self._buffer.close() self._buffer = StringIO() def __flush(self): """Flush the file to the database. """ try: self.__flush_buffer() db = self._coll.database # See PYTHON-417, "Sharded GridFS fails with exception: chunks out # of order." Inserts via mongos, even if they use a single # connection, can succeed out-of-order due to the writebackListener. # We mustn't call "filemd5" until all inserts are complete, which # we ensure by calling getLastError (and ignoring the result). db.error() md5 = db.command( "filemd5", self._id, root=self._coll.name)["md5"] self._file["md5"] = md5 self._file["length"] = self._position self._file["uploadDate"] = datetime.datetime.utcnow() return self._coll.files.insert(self._file, **self._coll._get_wc_override()) except DuplicateKeyError: self._raise_file_exists(self._id) def _raise_file_exists(self, file_id): """Raise a FileExists exception for the given file_id.""" raise FileExists("file with _id %r already exists" % file_id) def close(self): """Flush the file and close it. A closed file cannot be written any more. Calling :meth:`close` more than once is allowed. """ if not self._closed: self.__flush() object.__setattr__(self, "_closed", True) def write(self, data): """Write data to the file. There is no return value. `data` can be either a string of bytes or a file-like object (implementing :meth:`read`). If the file has an :attr:`encoding` attribute, `data` can also be a :class:`unicode` (:class:`str` in python 3) instance, which will be encoded as :attr:`encoding` before being written. 
Due to buffering, the data may not actually be written to the database until the :meth:`close` method is called. Raises :class:`ValueError` if this file is already closed. Raises :class:`TypeError` if `data` is not an instance of :class:`str` (:class:`bytes` in python 3), a file-like object, or an instance of :class:`unicode` (:class:`str` in python 3). Unicode data is only allowed if the file has an :attr:`encoding` attribute. :Parameters: - `data`: string of bytes or file-like object to be written to the file .. versionadded:: 1.9 The ability to write :class:`unicode`, if the file has an :attr:`encoding` attribute. """ if self._closed: raise ValueError("cannot write to a closed file") try: # file-like read = data.read except AttributeError: # string if not isinstance(data, string_types): raise TypeError("can only write strings or file-like objects") if isinstance(data, unicode): try: data = data.encode(self.encoding) except AttributeError: raise TypeError("must specify an encoding for file in " "order to write %s" % (text_type.__name__,)) read = StringIO(data).read if self._buffer.tell() > 0: # Make sure to flush only when _buffer is complete space = self.chunk_size - self._buffer.tell() if space: to_write = read(space) self._buffer.write(to_write) if len(to_write) < space: return # EOF or incomplete self.__flush_buffer() to_write = read(self.chunk_size) while to_write and len(to_write) == self.chunk_size: self.__flush_data(to_write) to_write = read(self.chunk_size) self._buffer.write(to_write) def writelines(self, sequence): """Write a sequence of strings to the file. Does not add separators. """ for line in sequence: self.write(line) def __enter__(self): """Support for the context manager protocol. """ return self def __exit__(self, exc_type, exc_val, exc_tb): """Support for the context manager protocol. Close the file and allow exceptions to propagate. """ self.close() # propagate exceptions return False
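A short usage sketch of the class above, assuming a mongod reachable with default settings (the database name, filename, and contents are invented): GridIn instances are normally created through GridFS.new_file(), keyword arguments become fields on the file document, and the context-manager support defined above calls close() on exit.

from pymongo import MongoClient
from gridfs import GridFS

client = MongoClient()                       # assumes a local mongod
fs = GridFS(client.my_database)              # "my_database" is illustrative

# Extra keyword arguments (filename, encoding) are stored on the file document.
with fs.new_file(filename="hello.txt", encoding="utf-8") as grid_in:
    grid_in.write(b"hello ")                 # byte strings are written as-is
    grid_in.write(u"world")                  # text is encoded with "encoding"

# length, md5 and uploadDate become readable once the file is closed.
print("stored %s, %d bytes" % (grid_in._id, grid_in.length))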
class GridIn(object): """Class to write data to GridFS. """ def __init__(self, root_collection, **kwargs): """Write a file to GridFS Application developers should generally not need to instantiate this class directly - instead see the methods provided by :class:`~gridfs.GridFS`. Raises :class:`TypeError` if `root_collection` is not an instance of :class:`~pymongo.collection.Collection`. Any of the file level options specified in the `GridFS Spec <http://dochub.mongodb.org/core/gridfsspec>`_ may be passed as keyword arguments. Any additional keyword arguments will be set as additional fields on the file document. Valid keyword arguments include: - ``"_id"``: unique ID for this file (default: :class:`~bson.objectid.ObjectId`) - this ``"_id"`` must not have already been used for another file - ``"filename"``: human name for the file - ``"contentType"`` or ``"content_type"``: valid mime-type for the file - ``"chunkSize"`` or ``"chunk_size"``: size of each of the chunks, in bytes (default: 256 kb) - ``"encoding"``: encoding used for this file. In Python 2, any :class:`unicode` that is written to the file will be converted to a :class:`str`. In Python 3, any :class:`str` that is written to the file will be converted to :class:`bytes`. If you turn off write-acknowledgment for performance reasons, it is critical to wrap calls to :meth:`write` and :meth:`close` within a single request: >>> from pymongo import MongoClient >>> from gridfs import GridFS >>> client = MongoClient(w=0) # turn off write acknowledgment >>> fs = GridFS(client.database) >>> gridin = fs.new_file() >>> request = client.start_request() >>> try: ... for i in range(10): ... gridin.write('foo') ... gridin.close() ... finally: ... request.end() In Python 2.5 and later this code can be simplified with a with-statement, see :doc:`/examples/requests` for more information. :Parameters: - `root_collection`: root collection to write to - `**kwargs` (optional): file level options (see above) """ if not isinstance(root_collection, Collection): raise TypeError("root_collection must be an " "instance of Collection") # Handle alternative naming if "content_type" in kwargs: kwargs["contentType"] = kwargs.pop("content_type") if "chunk_size" in kwargs: kwargs["chunkSize"] = kwargs.pop("chunk_size") # Defaults kwargs["_id"] = kwargs.get("_id", ObjectId()) kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE) root_collection.chunks.ensure_index([("files_id", ASCENDING), ("n", ASCENDING)], unique=True) object.__setattr__(self, "_coll", root_collection) object.__setattr__(self, "_chunks", root_collection.chunks) object.__setattr__(self, "_file", kwargs) object.__setattr__(self, "_buffer", StringIO()) object.__setattr__(self, "_position", 0) object.__setattr__(self, "_chunk_number", 0) object.__setattr__(self, "_closed", False) @property def closed(self): """Is this file closed?
""" return self._closed _id = _create_property("_id", "The ``'_id'`` value for this file.", read_only=True) filename = _create_property("filename", "Name of this file.") name = _create_property("filename", "Alias for `filename`.") content_type = _create_property("contentType", "Mime-type for this file.") length = _create_property("length", "Length (in bytes) of this file.", closed_only=True) chunk_size = _create_property("chunkSize", "Chunk size for this file.", read_only=True) upload_date = _create_property("uploadDate", "Date that this file was uploaded.", closed_only=True) md5 = _create_property("md5", "MD5 of the contents of this file " "(generated on the server).", closed_only=True) def __getattr__(self, name): if name in self._file: return self._file[name] raise AttributeError("GridIn object has no attribute '%s'" % name) def __setattr__(self, name, value): # For properties of this instance like _buffer, or descriptors set on # the class like filename, use regular __setattr__ if name in self.__dict__ or name in self.__class__.__dict__: object.__setattr__(self, name, value) else: # All other attributes are part of the document in db.fs.files. # Store them to be sent to server on close() or if closed, send # them now. self._file[name] = value if self._closed: self._coll.files.update({"_id": self._file["_id"]}, {"$set": { name: value }}, **self._coll._get_wc_override()) def __flush_data(self, data): """Flush `data` to a chunk. """ if not data: return assert (len(data) <= self.chunk_size) chunk = { "files_id": self._file["_id"], "n": self._chunk_number, "data": Binary(data) } self._chunks.insert(chunk) self._chunk_number += 1 self._position += len(data) def __flush_buffer(self): """Flush the buffer contents out to a chunk. """ self.__flush_data(self._buffer.getvalue()) self._buffer.close() self._buffer = StringIO() def __flush(self): """Flush the file to the database. """ try: self.__flush_buffer() db = self._coll.database # See PYTHON-417, "Sharded GridFS fails with exception: chunks out # of order." Inserts via mongos, even if they use a single # connection, can succeed out-of-order due to the writebackListener. # We mustn't call "filemd5" until all inserts are complete, which # we ensure by calling getLastError (and ignoring the result). db.error() md5 = db.command("filemd5", self._id, root=self._coll.name)["md5"] self._file["md5"] = md5 self._file["length"] = self._position self._file["uploadDate"] = datetime.datetime.utcnow() return self._coll.files.insert(self._file, **self._coll._get_wc_override()) except DuplicateKeyError: raise FileExists("file with _id %r already exists" % self._id) def close(self): """Flush the file and close it. A closed file cannot be written any more. Calling :meth:`close` more than once is allowed. """ if not self._closed: self.__flush() object.__setattr__(self, "_closed", True) def write(self, data): """Write data to the file. There is no return value. `data` can be either a string of bytes or a file-like object (implementing :meth:`read`). If the file has an :attr:`encoding` attribute, `data` can also be a :class:`unicode` (:class:`str` in python 3) instance, which will be encoded as :attr:`encoding` before being written. Due to buffering, the data may not actually be written to the database until the :meth:`close` method is called. Raises :class:`ValueError` if this file is already closed. 
Raises :class:`TypeError` if `data` is not an instance of :class:`str` (:class:`bytes` in python 3), a file-like object, or an instance of :class:`unicode` (:class:`str` in python 3). Unicode data is only allowed if the file has an :attr:`encoding` attribute. :Parameters: - `data`: string of bytes or file-like object to be written to the file .. versionadded:: 1.9 The ability to write :class:`unicode`, if the file has an :attr:`encoding` attribute. """ if self._closed: raise ValueError("cannot write to a closed file") try: # file-like read = data.read except AttributeError: # string if not isinstance(data, string_types): raise TypeError("can only write strings or file-like objects") if isinstance(data, text_type): try: data = data.encode(self.encoding) except AttributeError: raise TypeError("must specify an encoding for file in " "order to write %s" % (text_type.__name__, )) read = StringIO(data).read if self._buffer.tell() > 0: # Make sure to flush only when _buffer is complete space = self.chunk_size - self._buffer.tell() if space: to_write = read(space) self._buffer.write(to_write) if len(to_write) < space: return # EOF or incomplete self.__flush_buffer() to_write = read(self.chunk_size) while to_write and len(to_write) == self.chunk_size: self.__flush_data(to_write) to_write = read(self.chunk_size) self._buffer.write(to_write) def writelines(self, sequence): """Write a sequence of strings to the file. Does not add separators. """ for line in sequence: self.write(line) def __enter__(self): """Support for the context manager protocol. """ return self def __exit__(self, exc_type, exc_val, exc_tb): """Support for the context manager protocol. Close the file and allow exceptions to propagate. """ self.close() # propagate exceptions return False
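As the __setattr__ handling above implies, arbitrary keyword arguments and attribute assignments become fields on the fs.files document, and assignments made after close() are sent to the server immediately. A hedged sketch (the database, field names, and values are invented):

from pymongo import MongoClient
from gridfs import GridFS

db = MongoClient().my_database               # "my_database" is illustrative
fs = GridFS(db)

grid_in = fs.new_file(filename="report.csv", owner="analytics",
                      encoding="utf-8")
grid_in.tags = ["quarterly", "2014"]         # kept in _file until close()
grid_in.write(u"a,b,c\n1,2,3\n")
grid_in.close()                              # inserts the fs.files document

grid_in.reviewed = True                      # already closed: updates fs.files now
doc = db.fs.files.find_one({"_id": grid_in._id})
assert doc["owner"] == "analytics" and doc["reviewed"] is True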
def _do_batched_write_command(namespace, operation, command, docs, check_keys, uuid_subtype, client): """Execute a batch of insert, update, or delete commands. """ max_bson_size = client.max_bson_size # Max BSON object size + 16k - 2 bytes for ending NUL bytes # XXX: This should come from the server - SERVER-10643 max_cmd_size = max_bson_size + 16382 ordered = command.get('ordered', True) buf = StringIO() # Save space for message length and request id buf.write(_ZERO_64) # responseTo, opCode buf.write(b("\x00\x00\x00\x00\xd4\x07\x00\x00")) # No options buf.write(_ZERO_32) # Namespace as C string buf.write(b(namespace)) buf.write(_ZERO_8) # Skip: 0, Limit: -1 buf.write(_SKIPLIM) # Where to write command document length command_start = buf.tell() buf.write(bson.BSON.encode(command)) # Start of payload buf.seek(-1, 2) # Work around some Jython weirdness. buf.truncate() try: buf.write(_OP_MAP[operation]) except KeyError: raise InvalidOperation('Unknown command') if operation in (_UPDATE, _DELETE): check_keys = False # Where to write list document length list_start = buf.tell() - 4 def send_message(): """Finalize and send the current OP_QUERY message. """ # Close list and command documents buf.write(_ZERO_16) # Write document lengths and request id length = buf.tell() buf.seek(list_start) buf.write(struct.pack('<i', length - list_start - 1)) buf.seek(command_start) buf.write(struct.pack('<i', length - command_start)) buf.seek(4) request_id = random.randint(MIN_INT32, MAX_INT32) buf.write(struct.pack('<i', request_id)) buf.seek(0) buf.write(struct.pack('<i', length)) try: result = client._send_message((request_id, buf.getvalue()), with_last_error=True, command=True) except OperationFailure as exc: # If we were called from the bulk API we could be # many batches in. We have to update the indexes of # failed documents in the error document, using the # full offset including any previous batches. Do # that and re-raise in the caller. details = exc.error_document if not details: # Some error not related to write commands # (e.g. kerberos failure). Re-raise immediately. raise return True, details return not result.get('ok'), result