# Standard-library imports used by Writer; the Hadoop I/O helpers referenced
# below (DataOutputStream, FileOutputStream, DataOutputBuffer, Metadata, Text,
# CodecPool, writeVInt, hadoopClassName, SYNC_ESCAPE, SYNC_INTERVAL, VERSION)
# are assumed to be defined elsewhere in this module/package.
import os

from hashlib import md5
from time import time
from uuid import uuid1


class Writer(object):
    # Flush a compressed block once the buffered keys and values reach this size.
    COMPRESSION_BLOCK_SIZE = 1000000

    def __init__(self, path, key_class, value_class, metadata, compress=False, block_compress=False):
        if os.path.exists(path):
            raise IOError("File %s already exists." % path)

        self._key_class = key_class
        self._value_class = value_class
        self._compress = compress
        self._block_compress = block_compress

        if not metadata:
            metadata = Metadata()
        self._metadata = metadata

        if self._compress or self._block_compress:
            self._codec = CodecPool().getCompressor()
        else:
            self._codec = None

        self._last_sync = 0
        self._block = None

        self._stream = DataOutputStream(FileOutputStream(path))

        # sync is 16 random bytes
        self._sync = md5('%s@%d' % (uuid1().bytes, int(time() * 1000))).digest()

        self._writeFileHeader()

    def close(self):
        # Flush any buffered block before closing a block-compressed file.
        if self._block_compress:
            self.sync()
        self._stream.close()

    def getCompressionCodec(self):
        return self._codec

    def getKeyClass(self):
        return self._key_class

    def getKeyClassName(self):
        return hadoopClassName(self._key_class)

    def getValueClass(self):
        return self._value_class

    def getValueClassName(self):
        return hadoopClassName(self._value_class)

    def isBlockCompressed(self):
        return self._block_compress

    def isCompressed(self):
        return self._compress

    def getLength(self):
        return self._stream.getPos()

    def append(self, key, value):
        if type(key) != self._key_class:
            raise IOError("Wrong key class %s is not %s" % (type(key), self._key_class))
        if type(value) != self._value_class:
            raise IOError("Wrong value class %s is not %s" % (type(value), self._value_class))

        # Serialize key and value through their Writable write() methods.
        key_buffer = DataOutputBuffer()
        key.write(key_buffer)

        value_buffer = DataOutputBuffer()
        value.write(value_buffer)

        self.appendRaw(key_buffer.toByteArray(), value_buffer.toByteArray())

    def appendRaw(self, key, value):
        if self._block_compress:
            # Buffer the record into the current block; the block is
            # compressed and written out by sync() once it is large enough.
            if self._block:
                records, keys_len, keys, values_len, values = self._block
            else:
                keys_len = DataOutputBuffer()
                keys = DataOutputBuffer()
                values_len = DataOutputBuffer()
                values = DataOutputBuffer()
                records = 0

            writeVInt(keys_len, len(key))
            keys.write(key)

            writeVInt(values_len, len(value))
            values.write(value)

            records += 1

            self._block = (records, keys_len, keys, values_len, values)
            current_block_size = keys.getSize() + values.getSize()
            if current_block_size >= self.COMPRESSION_BLOCK_SIZE:
                self.sync()
        else:
            if self._compress:
                value = self._codec.compress(value)

            key_length = len(key)
            value_length = len(value)

            self._checkAndWriteSync()
            # Record layout: total length, key length, key bytes, value bytes.
            self._stream.writeInt(key_length + value_length)
            self._stream.writeInt(key_length)
            self._stream.write(key)
            self._stream.write(value)

    def sync(self):
        if self._last_sync != self._stream.getPos():
            # Emit the escape plus the 16-byte sync marker so readers can
            # re-align themselves when seeking into the middle of the file.
            self._stream.writeInt(SYNC_ESCAPE)
            self._stream.write(self._sync)
            self._last_sync = self._stream.getPos()

        if self._block_compress and self._block:
            def _writeBuffer(data_buf):
                buf = self._codec.compress(data_buf.toByteArray())
                writeVInt(self._stream, len(buf))
                self._stream.write(buf)

            records, keys_len, keys, values_len, values = self._block

            writeVInt(self._stream, records)

            _writeBuffer(keys_len)
            _writeBuffer(keys)
            _writeBuffer(values_len)
            _writeBuffer(values)

            self._block = None

    def _writeFileHeader(self):
        self._stream.write(VERSION)
        Text.writeString(self._stream, self.getKeyClassName())
        Text.writeString(self._stream, self.getValueClassName())
        self._stream.writeBoolean(self._compress)
        self._stream.writeBoolean(self._block_compress)

        if self._codec:
            Text.writeString(self._stream, 'org.apache.hadoop.io.compress.DefaultCodec')

        self._metadata.write(self._stream)
        self._stream.write(self._sync)

    def _checkAndWriteSync(self):
        if self._stream.getPos() >= (self._last_sync + SYNC_INTERVAL):
            self.sync()
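
# Illustrative usage sketch; not part of the original module. _BytesWritable
# is hypothetical -- real callers would pass one of the package's Writable
# implementations (e.g. Text), which follow the same write(data_output)
# contract. It also assumes the buffer handed to write() exposes the
# writeInt()/write() methods that Writer itself uses above.
if __name__ == '__main__':
    class _BytesWritable(object):
        """Hypothetical minimal Writable: length-prefixed raw bytes."""
        def __init__(self, data=''):
            self.data = data

        def write(self, data_output):
            data_output.writeInt(len(self.data))
            data_output.write(self.data)

    writer = Writer('/tmp/example.seq', _BytesWritable, _BytesWritable,
                    None, block_compress=True)
    writer.append(_BytesWritable('key-1'), _BytesWritable('value-1'))
    writer.append(_BytesWritable('key-2'), _BytesWritable('value-2'))
    writer.close()  # flushes the final buffered block via sync()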