def process_file(self, path, st, cache): safe_path = make_path_safe(path) # Is it a hard link? if st.st_nlink > 1: source = self.hard_links.get((st.st_ino, st.st_dev)) if (st.st_ino, st.st_dev) in self.hard_links: item = self.stat_attrs(st, path) item.update({b'path': safe_path, b'source': source}) self.add_item(item) return else: self.hard_links[st.st_ino, st.st_dev] = safe_path path_hash = self.key.id_hash(os.path.join(self.cwd, path).encode('utf-8', 'surrogateescape')) ids = cache.file_known_and_unchanged(path_hash, st) chunks = None if ids is not None: # Make sure all ids are available for id_ in ids: if not cache.seen_chunk(id_): break else: chunks = [cache.chunk_incref(id_, self.stats) for id_ in ids] # Only chunkify the file if needed if chunks is None: with open(path, 'rb') as fd: chunks = [] for chunk in chunkify(fd, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed): chunks.append(cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats)) cache.memorize_file(path_hash, st, [c[0] for c in chunks]) item = {b'path': safe_path, b'chunks': chunks} item.update(self.stat_attrs(st, path)) self.stats.nfiles += 1 self.add_item(item)
def flush(self, flush=False): if self.buffer.tell() == 0: return self.buffer.seek(0) chunks = list(bytes(s) for s in chunkify(self.buffer, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)) self.buffer.seek(0) self.buffer.truncate(0) # Leave the last parital chunk in the buffer unless flush is True end = None if flush or len(chunks) == 1 else -1 for chunk in chunks[:end]: self.chunks.append(self.write_chunk(chunk)) if end == -1: self.buffer.write(chunks[-1])
def flush_items(self, flush=False): if self.items.tell() == 0: return self.items.seek(0) chunks = list(bytes(s) for s in chunkify(self.items, WINDOW_SIZE, CHUNK_MASK, CHUNK_MIN, self.key.chunk_seed)) self.items.seek(0) self.items.truncate() for chunk in chunks[:-1]: id, _, _ = self.cache.add_chunk(self.key.id_hash(chunk), chunk, self.stats) self.items_ids.append(id) if flush or len(chunks) == 1: id, _, _ = self.cache.add_chunk(self.key.id_hash(chunks[-1]), chunks[-1], self.stats) self.items_ids.append(id) else: self.items.write(chunks[-1])
def test_chunkify(self): data = b'0' * 1024 * 1024 * 15 + b'Y' parts = [bytes(c) for c in chunkify(BytesIO(data), 2, 0x3, 2, 0)] self.assert_equal(len(parts), 2) self.assert_equal(b''.join(parts), data) self.assert_equal( [bytes(c) for c in chunkify(BytesIO(b''), 2, 0x3, 2, 0)], []) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 0) ], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 1) ], [ b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz' ]) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 2) ], [ b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz' ]) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 0) ], [b'foobarboobaz' * 3]) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 1) ], [ b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz' ]) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 2) ], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz']) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 0) ], [b'foobarboobaz' * 3]) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 1) ], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) self.assert_equal([ bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 2) ], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
def test_chunkify(self): data = b'0' * 1024 * 1024 * 15 + b'Y' parts = [bytes(c) for c in chunkify(BytesIO(data), 2, 0x3, 2, 0)] self.assert_equal(len(parts), 2) self.assert_equal(b''.join(parts), data) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b''), 2, 0x3, 2, 0)], []) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 0)], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz']) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 1)], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz']) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 2, 0x3, 2, 2)], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz']) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 0)], [b'foobarboobaz' * 3]) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 1)], [b'foobar', b'boo', b'bazfo', b'obar', b'boo', b'bazfo', b'obar', b'boobaz']) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 3, 2)], [b'foo', b'barboobaz', b'foo', b'barboobaz', b'foo', b'barboobaz']) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 0)], [b'foobarboobaz' * 3]) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 1)], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz']) self.assert_equal([bytes(c) for c in chunkify(BytesIO(b'foobarboobaz' * 3), 3, 0x3, 4, 2)], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])