def test_fanout_behaviour():
    # Exercise hashsplit level assignment for several hashbits/fanout
    # combinations, comparing the legacy hashsplit_iter API against the
    # newer HashSplitter.  NOTE(review): assumes split_test_objs maps a
    # required bit count to content that splits with exactly that many
    # bits -- defined elsewhere in this file; confirm against its builder.
    with no_lingering_errors():
        # hashbits/fanout are module-level globals; the loops below rebind
        # them so the nested helpers pick up the current values.
        global hashbits
        global fanout
        # Split data via the old API; returns [(blob_len, level), ...].
        levels = lambda data: [(len(b), l)
                               for b, l in hashsplit.hashsplit_iter(
                                   [BytesIO(data)], True, None, fanout=fanout)]
        def hslevels(data):
            # Split data via HashSplitter; returns [(blob_len, level), ...].
            global hashbits
            global fanout
            return [(len(b), l)
                    for b, l in HashSplitter([BytesIO(data)], bits=hashbits,
                                             fanbits=int(math.log(fanout, 2)))]
        def sb(pfx, n):
            # Return tuple with split content and expected level (from table),
            # prefixed by pfx null bytes.
            global fanout
            needed = hashbits + int(math.log(fanout, 2)) * n
            # internal algorithm ignores one bit after the split bits,
            # adjust for that (if n > 0):
            if n:
                needed += 1
            return (b'\x00' * pfx + split_test_objs[needed], n)
        def end(n):
            # Trailing null bytes that never trigger a split (level 0).
            return (b'\x00' * n, 0)
        # check a given sequence is handled correctly
        def check(objs):
            # old API allows only hashbits == 13
            if hashbits == 13:
                WVPASSEQ(levels(b''.join([x[0] for x in objs])),
                         [(len(x[0]), x[1]) for x in objs])
            WVPASSEQ(hslevels(b''.join([x[0] for x in objs])),
                     [(len(x[0]), x[1]) for x in objs])
        for hashbits in (13, 14, 15):
            # max_blob is a tuple of the max blob size (4 << hashbits bytes)
            # and its expected level (0)
            max_blob = (b'\x00' * (4 << hashbits), 0)
            for fanout in (2, 4):
                # never split - just max blobs
                check([max_blob] * 4)
                check([sb(0, 0)])
                check([max_blob, sb(1, 3), max_blob])
                check([sb(13, 1)])
                check([sb(13, 1), end(200)])
        # the final checks run with the last loop values: hashbits == 15
        fanout = 2
        check([sb(0, 1), sb(30, 2), sb(20, 0), sb(10, 5)])
        check([sb(0, 1), sb(30, 2), sb(20, 0), sb(10, 5), end(10)])
def test_fanout_behaviour():
    """Check hashsplit_iter's blob sizes and levels with a deterministic
    split function.

    Replaces _helpers.splitbuf with a predictable stand-in and shrinks
    BLOB_MAX/BLOB_READ_SIZE so split points are easy to construct.
    """
    # Drop in replacement for bupsplit, but splitting if the int value of a
    # byte >= BUP_BLOBBITS
    basebits = _helpers.blobbits()
    def splitbuf(buf):
        # Return (offset, level) of the first "large" byte, or (0, 0) if
        # no byte in buf reaches basebits.
        ofs = 0
        for b in buf:
            b = byte_int(b)
            ofs += 1
            if b >= basebits:
                return ofs, b
        return 0, 0

    with no_lingering_errors():
        # Save every global we monkey-patch, then restore in a finally
        # block.  The original restored them only on the success path, so
        # a failing WVPASSEQ leaked the patched values into later tests.
        old_splitbuf = _helpers.splitbuf
        old_BLOB_MAX = hashsplit.BLOB_MAX
        old_BLOB_READ_SIZE = hashsplit.BLOB_READ_SIZE
        old_fanout = hashsplit.fanout
        _helpers.splitbuf = splitbuf
        hashsplit.BLOB_MAX = 4
        hashsplit.BLOB_READ_SIZE = 10
        hashsplit.fanout = 2
        try:
            levels = lambda f: [
                (len(b), l) for b, l in hashsplit.hashsplit_iter([f], True, None)
            ]
            # Return a string of n null bytes
            z = lambda n: b'\x00' * n
            # Return a byte which will be split with a level of n
            sb = lambda n: bytes_from_uint(basebits + n)

            split_never = BytesIO(z(16))
            split_first = BytesIO(z(1) + sb(3) + z(14))
            split_end = BytesIO(z(13) + sb(1) + z(2))
            split_many = BytesIO(
                sb(1) + z(3) + sb(2) + z(4) + sb(0) + z(4) + sb(5) + z(1))
            WVPASSEQ(levels(split_never), [(4, 0), (4, 0), (4, 0), (4, 0)])
            WVPASSEQ(levels(split_first), [(2, 3), (4, 0), (4, 0), (4, 0), (2, 0)])
            WVPASSEQ(levels(split_end), [(4, 0), (4, 0), (4, 0), (2, 1), (2, 0)])
            WVPASSEQ(levels(split_many), [(1, 1), (4, 2), (4, 0), (1, 0),
                                          (4, 0), (1, 5), (1, 0)])
        finally:
            # Always undo the monkey-patching, even on assertion failure.
            _helpers.splitbuf = old_splitbuf
            hashsplit.BLOB_MAX = old_BLOB_MAX
            hashsplit.BLOB_READ_SIZE = old_BLOB_READ_SIZE
            hashsplit.fanout = old_fanout
def test_fanout_behaviour():
    """Check hashsplit_iter's blob sizes and levels with a deterministic
    split function (Python 2 variant using ord/chr and str buffers).
    """
    # Drop in replacement for bupsplit, but splitting if the int value of a
    # byte >= BUP_BLOBBITS
    basebits = _helpers.blobbits()
    def splitbuf(buf):
        # Return (offset, level) of the first "large" byte, or (0, 0) if
        # no byte in buf reaches basebits.
        ofs = 0
        for c in buf:
            ofs += 1
            if ord(c) >= basebits:
                return ofs, ord(c)
        return 0, 0

    with no_lingering_errors():
        # Save every global we monkey-patch, then restore in a finally
        # block.  The original restored them only on the success path, so
        # a failing WVPASSEQ leaked the patched values into later tests.
        old_splitbuf = _helpers.splitbuf
        old_BLOB_MAX = hashsplit.BLOB_MAX
        old_BLOB_READ_SIZE = hashsplit.BLOB_READ_SIZE
        old_fanout = hashsplit.fanout
        _helpers.splitbuf = splitbuf
        hashsplit.BLOB_MAX = 4
        hashsplit.BLOB_READ_SIZE = 10
        hashsplit.fanout = 2
        try:
            levels = lambda f: [(len(b), l)
                                for b, l in hashsplit.hashsplit_iter([f], True, None)]
            # Return a string of n null bytes
            z = lambda n: '\x00' * n
            # Return a byte which will be split with a level of n
            sb = lambda n: chr(basebits + n)

            split_never = BytesIO(z(16))
            split_first = BytesIO(z(1) + sb(3) + z(14))
            split_end = BytesIO(z(13) + sb(1) + z(2))
            split_many = BytesIO(sb(1) + z(3) + sb(2) + z(4) +
                                 sb(0) + z(4) + sb(5) + z(1))
            WVPASSEQ(levels(split_never), [(4, 0), (4, 0), (4, 0), (4, 0)])
            WVPASSEQ(levels(split_first), [(2, 3), (4, 0), (4, 0), (4, 0), (2, 0)])
            WVPASSEQ(levels(split_end), [(4, 0), (4, 0), (4, 0), (2, 1), (2, 0)])
            WVPASSEQ(levels(split_many), [(1, 1), (4, 2), (4, 0), (1, 0),
                                          (4, 0), (1, 5), (1, 0)])
        finally:
            # Always undo the monkey-patching, even on assertion failure.
            _helpers.splitbuf = old_splitbuf
            hashsplit.BLOB_MAX = old_BLOB_MAX
            hashsplit.BLOB_READ_SIZE = old_BLOB_READ_SIZE
            hashsplit.fanout = old_fanout
def split(opt, files, parent, out, pack_writer):
    """Hashsplit files and emit the requested objects via pack_writer.

    opt selects the output mode (-b/-t/-c/-n/--noop/--copy); files is a
    sequence of file-like objects; parent is the parent commit id (or
    None); out is a binary stream for hex ids and copied data.  Returns
    the new commit id, or None when no commit was created.
    """
    # Hack around lack of nonlocal vars in python 2
    total_bytes = [0]
    def prog(filenum, nbytes):
        # Progress callback handed to the hashsplit machinery.
        total_bytes[0] += nbytes
        if filenum > 0:
            qprogress('Splitting: file #%d, %d kbytes\r'
                      % (filenum + 1, total_bytes[0] // 1024))
        else:
            qprogress('Splitting: %d kbytes\r' % (total_bytes[0] // 1024))

    new_blob = pack_writer.new_blob
    new_tree = pack_writer.new_tree
    if opt.blobs:
        shalist = hashsplit.split_to_blobs(new_blob, files,
                                           keep_boundaries=opt.keep_boundaries,
                                           progress=prog)
        for sha, size, level in shalist:
            out.write(hexlify(sha) + b'\n')
            reprogress()
    elif opt.tree or opt.commit or opt.name:
        if opt.name: # insert dummy_name which may be used as a restore target
            mode, sha = \
                hashsplit.split_to_blob_or_tree(new_blob, new_tree, files,
                                                keep_boundaries=opt.keep_boundaries,
                                                progress=prog)
            splitfile_name = git.mangle_name(b'data', hashsplit.GIT_MODE_FILE, mode)
            shalist = [(mode, splitfile_name, sha)]
        else:
            shalist = \
                hashsplit.split_to_shalist(new_blob, new_tree, files,
                                           keep_boundaries=opt.keep_boundaries,
                                           progress=prog)
        tree = new_tree(shalist)
    else:
        last = 0
        it = hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries,
                                      progress=prog)
        for blob, level in it:
            hashsplit.total_split += len(blob)
            if opt.copy:
                # BUG FIX: the original did sys.stdout.write(str(blob)),
                # which under Python 3 writes the bytes repr ("b'...'")
                # to the text stream; write the raw bytes to the binary
                # output stream instead.
                out.write(blob)
            megs = hashsplit.total_split // 1024 // 1024
            # Track megabyte boundaries; qprogress in prog() handles display.
            if not opt.quiet and last != megs:
                last = megs

    if opt.verbose:
        log('\n')
    if opt.tree:
        out.write(hexlify(tree) + b'\n')

    commit = None
    if opt.commit or opt.name:
        msg = b'bup split\n\nGenerated by command:\n%r\n' % compat.get_argvb()
        userline = b'%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = pack_writer.new_commit(tree, parent, userline, opt.date,
                                        None, userline, opt.date, None, msg)
        if opt.commit:
            out.write(hexlify(commit) + b'\n')

    return commit
hashsplit.split_to_blob_or_tree(pack_writer.new_blob, pack_writer.new_tree, files, keep_boundaries=opt.keep_boundaries, progress=prog) splitfile_name = git.mangle_name('data', hashsplit.GIT_MODE_FILE, mode) shalist = [(mode, splitfile_name, sha)] else: shalist = hashsplit.split_to_shalist( pack_writer.new_blob, pack_writer.new_tree, files, keep_boundaries=opt.keep_boundaries, progress=prog) tree = pack_writer.new_tree(shalist) else: last = 0 it = hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries, progress=prog) for (blob, level) in it: hashsplit.total_split += len(blob) if opt.copy: sys.stdout.write(str(blob)) megs = hashsplit.total_split/1024/1024 if not opt.quiet and last != megs: last = megs if opt.verbose: log('\n') if opt.tree: print tree.encode('hex') if opt.commit or opt.name: msg = 'bup split\n\nGenerated by command:\n%r\n' % sys.argv
keep_boundaries=opt.keep_boundaries, progress=prog) splitfile_name = git.mangle_name('data', hashsplit.GIT_MODE_FILE, mode) shalist = [(mode, splitfile_name, sha)] else: shalist = hashsplit.split_to_shalist( pack_writer.new_blob, pack_writer.new_tree, files, keep_boundaries=opt.keep_boundaries, progress=prog) tree = pack_writer.new_tree(shalist) else: last = 0 it = hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries, progress=prog) for (blob, level) in it: hashsplit.total_split += len(blob) if opt.copy: sys.stdout.write(str(blob)) megs = hashsplit.total_split / 1024 / 1024 if not opt.quiet and last != megs: last = megs if opt.verbose: log('\n') if opt.tree: print tree.encode('hex') if opt.commit or opt.name: msg = 'bup split\n\nGenerated by command:\n%r\n' % sys.argv
def main(argv):
    """Entry point for `bup split` (pack_writer/client variant).

    Parses options, hashsplits stdin/files/git-ids, and writes blobs,
    a tree, and/or a commit either locally or via a remote client.
    """
    o = options.Options(optspec)
    opt, flags, extra = o.parse_bytes(argv[1:])
    if opt.name: opt.name = argv_bytes(opt.name)
    if opt.remote: opt.remote = argv_bytes(opt.remote)
    if opt.verbose is None: opt.verbose = 0

    # Validate mutually-exclusive / required option combinations.
    if not (opt.blobs or opt.tree or opt.commit or opt.name or
            opt.noop or opt.copy):
        o.fatal("use one or more of -b, -t, -c, -n, --noop, --copy")
    if opt.copy and (opt.blobs or opt.tree):
        o.fatal('--copy is incompatible with -b, -t')
    if (opt.noop or opt.copy) and (opt.commit or opt.name):
        o.fatal('--noop and --copy are incompatible with -c, -n')
    if opt.blobs and (opt.tree or opt.commit or opt.name):
        o.fatal('-b is incompatible with -t, -c, -n')
    if extra and opt.git_ids:
        o.fatal("don't provide filenames when using --git-ids")

    if opt.verbose >= 2:
        git.verbose = opt.verbose - 1
        opt.bench = 1

    max_pack_size = None
    if opt.max_pack_size:
        max_pack_size = parse_num(opt.max_pack_size)
    max_pack_objects = None
    if opt.max_pack_objects:
        max_pack_objects = parse_num(opt.max_pack_objects)

    # NOTE: these assignments mutate module-level state in hashsplit/client.
    if opt.fanout:
        hashsplit.fanout = parse_num(opt.fanout)
    if opt.blobs:
        # -b means flat blobs, no tree fanout at all.
        hashsplit.fanout = 0
    if opt.bwlimit:
        client.bwlimit = parse_num(opt.bwlimit)
    if opt.date:
        date = parse_date_or_fatal(opt.date, o.fatal)
    else:
        date = time.time()

    # Hack around lack of nonlocal vars in python 2
    total_bytes = [0]
    def prog(filenum, nbytes):
        # Progress callback handed to the hashsplit machinery.
        total_bytes[0] += nbytes
        if filenum > 0:
            qprogress('Splitting: file #%d, %d kbytes\r'
                      % (filenum + 1, total_bytes[0] // 1024))
        else:
            qprogress('Splitting: %d kbytes\r' % (total_bytes[0] // 1024))

    is_reverse = environ.get(b'BUP_SERVER_REVERSE')
    if is_reverse and opt.remote:
        o.fatal("don't use -r in reverse mode; it's automatic")
    start_time = time.time()

    if opt.name and not valid_save_name(opt.name):
        o.fatal("'%r' is not a valid branch name." % opt.name)
    refname = opt.name and b'refs/heads/%s' % opt.name or None

    # Choose destination: none (--noop/--copy), remote client, or local repo.
    if opt.noop or opt.copy:
        cli = pack_writer = oldref = None
    elif opt.remote or is_reverse:
        git.check_repo_or_die()
        cli = client.Client(opt.remote)
        oldref = refname and cli.read_ref(refname) or None
        pack_writer = cli.new_packwriter(compression_level=opt.compress,
                                         max_pack_size=max_pack_size,
                                         max_pack_objects=max_pack_objects)
    else:
        git.check_repo_or_die()
        cli = None
        oldref = refname and git.read_ref(refname) or None
        pack_writer = git.PackWriter(compression_level=opt.compress,
                                     max_pack_size=max_pack_size,
                                     max_pack_objects=max_pack_objects)

    input = byte_stream(sys.stdin)

    if opt.git_ids:
        # the input is actually a series of git object ids that we should retrieve
        # and split.
        #
        # This is a bit messy, but basically it converts from a series of
        # CatPipe.get() iterators into a series of file-type objects.
        # It would be less ugly if either CatPipe.get() returned a file-like object
        # (not very efficient), or split_to_shalist() expected an iterator instead
        # of a file.
        cp = git.CatPipe()
        class IterToFile:
            # Minimal read()-only file adapter over an iterator of chunks.
            def __init__(self, it):
                self.it = iter(it)
            def read(self, size):
                # size is ignored; each read returns the next chunk (or b'').
                v = next(self.it, None)
                return v or b''
        def read_ids():
            # Yield one IterToFile per object id read from stdin.
            while 1:
                line = input.readline()
                if not line:
                    break
                if line:
                    line = line.strip()
                try:
                    it = cp.get(line.strip())
                    next(it, None)  # skip the file info
                except KeyError as e:
                    add_error('error: %s' % e)
                    continue
                yield IterToFile(it)
        files = read_ids()
    else:
        # the input either comes from a series of files or from stdin.
        files = extra and (open(argv_bytes(fn), 'rb') for fn in extra) or [input]

    if pack_writer:
        new_blob = pack_writer.new_blob
        new_tree = pack_writer.new_tree
    elif opt.blobs or opt.tree:
        # --noop mode
        new_blob = lambda content: git.calc_hash(b'blob', content)
        new_tree = lambda shalist: git.calc_hash(b'tree', git.tree_encode(shalist))

    sys.stdout.flush()
    out = byte_stream(sys.stdout)

    if opt.blobs:
        shalist = hashsplit.split_to_blobs(new_blob, files,
                                           keep_boundaries=opt.keep_boundaries,
                                           progress=prog)
        for (sha, size, level) in shalist:
            out.write(hexlify(sha) + b'\n')
            reprogress()
    elif opt.tree or opt.commit or opt.name:
        if opt.name: # insert dummy_name which may be used as a restore target
            mode, sha = \
                hashsplit.split_to_blob_or_tree(new_blob, new_tree, files,
                                                keep_boundaries=opt.keep_boundaries,
                                                progress=prog)
            splitfile_name = git.mangle_name(b'data', hashsplit.GIT_MODE_FILE, mode)
            shalist = [(mode, splitfile_name, sha)]
        else:
            shalist = hashsplit.split_to_shalist(
                          new_blob, new_tree, files,
                          keep_boundaries=opt.keep_boundaries, progress=prog)
        tree = new_tree(shalist)
    else:
        last = 0
        it = hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries,
                                      progress=prog)
        for (blob, level) in it:
            hashsplit.total_split += len(blob)
            if opt.copy:
                # NOTE(review): str(blob) on bytes yields its repr under
                # Python 3 -- this likely should be out.write(blob); confirm.
                sys.stdout.write(str(blob))
            megs = hashsplit.total_split // 1024 // 1024
            if not opt.quiet and last != megs:
                last = megs

    if opt.verbose:
        log('\n')
    if opt.tree:
        out.write(hexlify(tree) + b'\n')
    if opt.commit or opt.name:
        msg = b'bup split\n\nGenerated by command:\n%r\n' % compat.get_argvb()
        # NOTE(review): `ref` is assigned but never used (refname above
        # already holds this value).
        ref = opt.name and (b'refs/heads/%s' % opt.name) or None
        userline = b'%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = pack_writer.new_commit(tree, oldref, userline, date, None,
                                        userline, date, None, msg)
        if opt.commit:
            out.write(hexlify(commit) + b'\n')

    if pack_writer:
        pack_writer.close()  # must close before we can update the ref

    if opt.name:
        if cli:
            cli.update_ref(refname, commit, oldref)
        else:
            git.update_ref(refname, commit, oldref)

    if cli:
        cli.close()

    secs = time.time() - start_time
    size = hashsplit.total_split
    if opt.bench:
        log('bup: %.2f kbytes in %.2f secs = %.2f kbytes/sec\n'
            % (size / 1024, secs, size / 1024 / secs))

    if saved_errors:
        log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
        sys.exit(1)
def main(argv):
    """Entry point for `bup split` (repo-abstraction variant).

    Like the pack_writer version, but goes through a repo object from
    from_opts() and supports per-repo blobbits configuration.
    """
    o = options.Options(optspec)
    opt, flags, extra = o.parse_bytes(argv[1:])
    if opt.name: opt.name = argv_bytes(opt.name)
    if opt.verbose is None: opt.verbose = 0

    # Validate mutually-exclusive / required option combinations.
    if not (opt.blobs or opt.tree or opt.commit or opt.name or
            opt.noop or opt.copy):
        o.fatal("use one or more of -b, -t, -c, -n, --noop, --copy")
    if opt.copy and (opt.blobs or opt.tree):
        o.fatal('--copy is incompatible with -b, -t')
    if (opt.noop or opt.copy) and (opt.commit or opt.name):
        o.fatal('--noop and --copy are incompatible with -c, -n')
    if opt.blobs and (opt.tree or opt.commit or opt.name):
        o.fatal('-b is incompatible with -t, -c, -n')
    if extra and opt.git_ids:
        o.fatal("don't provide filenames when using --git-ids")

    if opt.verbose >= 2:
        git.verbose = opt.verbose - 1
        opt.bench = 1

    fanout = None
    if opt.fanout:
        # This used to be in hashsplit, but that's just confusing;
        # hashsplit now defaults to the real default (16) if 0 (or
        # None) is passed, but keep the command-line compatible...
        fanout = parse_num(opt.fanout) or 128
    blobbits = None
    if opt.blobbits:
        blobbits = parse_num(opt.blobbits)
    if opt.bwlimit:
        client.bwlimit = parse_num(opt.bwlimit)
    if opt.date:
        date = parse_date_or_fatal(opt.date, o.fatal)
    else:
        date = time.time()

    # Hack around lack of nonlocal vars in python 2
    total_bytes = [0]
    def prog(filenum, nbytes):
        # Progress callback handed to the hashsplit machinery.
        total_bytes[0] += nbytes
        if filenum > 0:
            qprogress('Splitting: file #%d, %d kbytes\r'
                      % (filenum + 1, total_bytes[0] // 1024))
        else:
            qprogress('Splitting: %d kbytes\r' % (total_bytes[0] // 1024))

    start_time = time.time()

    if opt.name and not valid_save_name(opt.name):
        o.fatal("'%r' is not a valid branch name." % opt.name)
    refname = opt.name and b'refs/heads/%s' % opt.name or None

    if opt.noop or opt.copy:
        repo = oldref = None
    else:
        repo = from_opts(opt)
        oldref = refname and repo.read_ref(refname) or None
        # Repo-configured blobbits is the default; an explicit --blobbits
        # on the command line overrides it (with a warning).
        repobits = repo.config(b'bup.blobbits', opttype='int') or hashsplit.BUP_BLOBBITS
        if not blobbits:
            blobbits = repobits
        else:
            print("overriding repo blobbits %d from cmdline with %d"
                  % (repobits, blobbits))

    input = byte_stream(sys.stdin)

    if opt.git_ids:
        # the input is actually a series of git object ids that we should retrieve
        # and split.
        #
        # This is a bit messy, but basically it converts from a series of
        # repo.cat() iterators into a series of file-type objects.
        # It would be less ugly if either repo.cat() returned a file-like object
        # (not very efficient), or split_to_shalist() expected an iterator instead
        # of a file.
        class IterToFile:
            # Minimal read()-only file adapter over an iterator of chunks.
            def __init__(self, it):
                self.it = iter(it)
            def read(self, size):
                # size is ignored; each read returns the next chunk (or b'').
                v = next(self.it, None)
                return v or b''
        def read_ids():
            # Yield one IterToFile per object id read from stdin.
            while 1:
                line = input.readline()
                if not line:
                    break
                if line:
                    line = line.strip()
                try:
                    it = repo.cat(line.strip())
                    next(it, None)  # skip the file info
                except KeyError as e:
                    add_error('error: %s' % e)
                    continue
                yield IterToFile(it)
        files = read_ids()
    else:
        # the input either comes from a series of files or from stdin.
        files = extra and (open(argv_bytes(fn), 'rb') for fn in extra) or [input]

    if repo:
        write_data = repo.write_data
        write_tree = repo.write_tree
    elif opt.blobs or opt.tree:
        # --noop mode
        write_data = lambda content: git.calc_hash(b'blob', content)
        write_tree = lambda shalist: git.calc_hash(b'tree', git.tree_encode(shalist))

    sys.stdout.flush()
    out = byte_stream(sys.stdout)

    if opt.blobs:
        shalist = hashsplit.split_to_blobs(write_data, files,
                                           keep_boundaries=opt.keep_boundaries,
                                           progress=prog, blobbits=blobbits)
        for (sha, size, level) in shalist:
            out.write(hexlify(sha) + b'\n')
            reprogress()
    elif opt.tree or opt.commit or opt.name:
        if opt.name: # insert dummy_name which may be used as a restore target
            mode, sha = \
                hashsplit.split_to_blob_or_tree(write_data, write_tree, files,
                                                keep_boundaries=opt.keep_boundaries,
                                                progress=prog, fanout=fanout,
                                                blobbits=blobbits)
            splitfile_name = git.mangle_name(b'data', hashsplit.GIT_MODE_FILE, mode)
            shalist = [(mode, splitfile_name, sha)]
        else:
            shalist = hashsplit.split_to_shalist(
                          write_data, write_tree, files,
                          keep_boundaries=opt.keep_boundaries,
                          progress=prog, fanout=fanout, blobbits=blobbits)
        tree = write_tree(shalist)
    else:
        last = 0
        it = hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries,
                                      progress=prog, fanout=fanout,
                                      blobbits=blobbits)
        for (blob, level) in it:
            hashsplit.total_split += len(blob)
            if opt.copy:
                # NOTE(review): str(blob) on bytes yields its repr under
                # Python 3 -- this likely should be out.write(blob); confirm.
                sys.stdout.write(str(blob))
            megs = hashsplit.total_split // 1024 // 1024
            if not opt.quiet and last != megs:
                last = megs

    if opt.verbose:
        log('\n')
    if opt.tree:
        out.write(hexlify(tree) + b'\n')
    if opt.commit or opt.name:
        msg = b'bup split\n\nGenerated by command:\n%r\n' % compat.get_argvb()
        # NOTE(review): `ref` is assigned but never used (refname above
        # already holds this value).
        ref = opt.name and (b'refs/heads/%s' % opt.name) or None
        userline = b'%s <%s@%s>' % (userfullname(), username(), hostname())
        commit = repo.write_commit(tree, oldref, userline, date, None,
                                   userline, date, None, msg)
        if opt.commit:
            out.write(hexlify(commit) + b'\n')

    if opt.name and repo:
        repo.update_ref(refname, commit, oldref)

    if repo:
        repo.close()

    secs = time.time() - start_time
    size = hashsplit.total_split
    if opt.bench:
        log('bup: %.2f kbytes in %.2f secs = %.2f kbytes/sec\n'
            % (size / 1024, secs, size / 1024 / secs))

    if saved_errors:
        log('WARNING: %d errors encountered while saving.\n' % len(saved_errors))
        sys.exit(1)
shalist = [(mode, splitfile_name, sha)] else: shalist = hashsplit.split_to_shalist( write_data, write_tree, files, keep_boundaries=opt.keep_boundaries, progress=prog, fanout=fanout, blobbits=blobbits) tree = write_tree(shalist) else: last = 0 it = hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries, progress=prog, fanout=fanout, blobbits=blobbits) for (blob, level) in it: hashsplit.total_split += len(blob) if opt.copy: sys.stdout.write(str(blob)) megs = hashsplit.total_split // 1024 // 1024 if not opt.quiet and last != megs: last = megs if opt.verbose: log('\n') if opt.tree: out.write(hexlify(tree) + b'\n') if opt.commit or opt.name:
elif opt.remote or is_reverse: cli = client.Client(opt.remote, opt.port) oldref = refname and cli.read_ref(refname) or None pack_writer = cli.new_packwriter() else: cli = None oldref = refname and git.read_ref(refname) or None pack_writer = git.PackWriter() files = extra and (open(fn) for fn in extra) or [sys.stdin] if pack_writer: shalist = hashsplit.split_to_shalist(pack_writer, files) tree = pack_writer.new_tree(shalist) else: last = 0 for (blob, bits) in hashsplit.hashsplit_iter(files): hashsplit.total_split += len(blob) if opt.copy: sys.stdout.write(str(blob)) megs = hashsplit.total_split/1024/1024 if not opt.quiet and last != megs: progress('%d Mbytes read\r' % megs) last = megs progress('%d Mbytes read, done.\n' % megs) if opt.verbose: log('\n') if opt.blobs: for (mode,name,bin) in shalist: print bin.encode('hex') if opt.tree:
continue yield IterToFile(it) files = read_ids() else: # the input either comes from a series of files or from stdin. files = extra and (open(fn) for fn in extra) or [sys.stdin] if pack_writer: shalist = hashsplit.split_to_shalist(pack_writer, files, keep_boundaries=opt.keep_boundaries, progress=prog) tree = pack_writer.new_tree(shalist) else: last = 0 for (blob, bits) in hashsplit.hashsplit_iter(files, keep_boundaries=opt.keep_boundaries, progress=prog): hashsplit.total_split += len(blob) if opt.copy: sys.stdout.write(str(blob)) megs = hashsplit.total_split/1024/1024 if not opt.quiet and last != megs: progress('%d Mbytes read\r' % megs) last = megs progress('%d Mbytes read, done.\n' % megs) if opt.verbose: log('\n') if opt.blobs: for (mode,name,bin) in shalist: print bin.encode('hex')