def sync_indexes(self):
    self.check_busy()
    conn = self.conn
    mkdirp(self.cachedir)
    # All cached idxs are extra until proven otherwise
    extra = set()
    for f in os.listdir(self.cachedir):
        debug1('%s\n' % f)
        if f.endswith('.idx'):
            extra.add(f)
    needed = set()
    conn.write('list-indexes\n')
    for line in linereader(conn):
        if not line:
            break
        assert(line.find('/') < 0)
        parts = line.split(' ')
        idx = parts[0]
        if len(parts) == 2 and parts[1] == 'load' and idx not in extra:
            # If the server requests that we load an idx and we don't
            # already have a copy of it, it is needed
            needed.add(idx)
        # Any idx that the server has heard of is proven not extra
        extra.discard(idx)
    self.check_ok()
    debug1('client: removing extra indexes: %s\n' % extra)
    for idx in extra:
        os.unlink(os.path.join(self.cachedir, idx))
    debug1('client: server requested load of: %s\n' % needed)
    for count, idx in enumerate(needed):
        progress('Receiving indexes from server: %d/%d\r'
                 % (count, len(needed)))
        self.sync_index(idx)
    git.auto_midx(self.cachedir)
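# A minimal standalone sketch (hypothetical data, not bup's wire-protocol
# code) of the cache-reconciliation logic in sync_indexes above: every
# cached .idx starts out "extra"; any idx the server mentions is no longer
# extra; any idx the server marks "load" that we don't already have is
# "needed".
def _reconcile_sketch():
    cached = {'a.idx', 'b.idx'}
    server_lines = ['a.idx', 'c.idx load']
    extra, needed = set(cached), set()
    for line in server_lines:
        parts = line.split(' ')
        idx = parts[0]
        if len(parts) == 2 and parts[1] == 'load' and idx not in cached:
            needed.add(idx)
        extra.discard(idx)
    assert extra == {'b.idx'}   # safe to delete from the local cache
    assert needed == {'c.idx'}  # must be fetched via sync_index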
def _suggest_packs(self):
    ob = self._busy
    if ob:
        assert(ob == 'receive-objects-v2')
        self.conn.write('\xff\xff\xff\xff')  # suspend receive-objects-v2
    suggested = []
    for line in linereader(self.conn):
        if not line:
            break
        debug2('%s\n' % line)
        if line.startswith('index '):
            idx = line[6:]
            debug1('client: received index suggestion: %s\n'
                   % git.shorten_hash(idx))
            suggested.append(idx)
        else:
            assert(line.endswith('.idx'))
            debug1('client: completed writing pack, idx: %s\n'
                   % git.shorten_hash(line))
            suggested.append(line)
    self.check_ok()
    if ob:
        self._busy = None
    idx = None
    for count, idx in enumerate(suggested):
        progress('Receiving indexes from server: %d/%d\r'
                 % (count, len(suggested)))
        self.sync_index(idx)
    progress('Receiving indexes from server: %d indexes received.\r'
             % len(suggested))
    git.auto_midx(self.cachedir)
    if ob:
        self._busy = ob
        self.conn.write('%s\n' % ob)
    return idx
def send_index(self, name, f, send_size):
    self._require_command(b'send-index')
    #debug1('requesting %r\n' % name)
    self.check_busy()
    self.conn.write(b'send-index %s\n' % name)
    n = struct.unpack('!I', self.conn.read(4))[0]
    assert(n)
    send_size(n)
    count = 0
    progress('Receiving index from server: %d/%d\r' % (count, n))
    for b in chunkyreader(self.conn, n):
        f.write(b)
        count += len(b)
        qprogress('Receiving index from server: %d/%d\r' % (count, n))
    progress('Receiving index from server: %d/%d, done.\n' % (count, n))
    self.check_ok()
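# Sketch of the length-prefixed framing used by send-index above: the server
# replies with a 4-byte big-endian ("!I") byte count, then exactly that many
# payload bytes, which the client streams out in chunks.
import struct
from io import BytesIO

def _framing_sketch():
    payload = b'idx-bytes'
    wire = struct.pack('!I', len(payload)) + payload
    conn = BytesIO(wire)
    n = struct.unpack('!I', conn.read(4))[0]
    assert conn.read(n) == payload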
def sync_index(self, name):
    #debug1('requesting %r\n' % name)
    self.check_busy()
    mkdirp(self.cachedir)
    fn = os.path.join(self.cachedir, name)
    if os.path.exists(fn):
        msg = ("won't request existing .idx, try `bup bloom --check %s`"
               % fn)
        raise ClientError(msg)
    self.conn.write('send-index %s\n' % name)
    n = struct.unpack('!I', self.conn.read(4))[0]
    assert(n)
    with atomically_replaced_file(fn, 'w') as f:
        count = 0
        progress('Receiving index from server: %d/%d\r' % (count, n))
        for b in chunkyreader(self.conn, n):
            f.write(b)
            count += len(b)
            qprogress('Receiving index from server: %d/%d\r' % (count, n))
        progress('Receiving index from server: %d/%d, done.\n' % (count, n))
    self.check_ok()
def main(argv):

    # Hack around lack of nonlocal vars in python 2
    _nonlocal = {}

    o = options.Options(optspec)
    opt, flags, extra = o.parse_bytes(argv[1:])

    if opt.indexfile:
        opt.indexfile = argv_bytes(opt.indexfile)
    if opt.name:
        opt.name = argv_bytes(opt.name)
    if opt.remote:
        opt.remote = argv_bytes(opt.remote)
    if opt.strip_path:
        opt.strip_path = argv_bytes(opt.strip_path)

    git.check_repo_or_die()
    if not (opt.tree or opt.commit or opt.name):
        o.fatal("use one or more of -t, -c, -n")
    if not extra:
        o.fatal("no filenames given")

    extra = [argv_bytes(x) for x in extra]

    opt.progress = (istty2 and not opt.quiet)
    opt.smaller = parse_num(opt.smaller or 0)
    if opt.bwlimit:
        client.bwlimit = parse_num(opt.bwlimit)

    if opt.date:
        date = parse_date_or_fatal(opt.date, o.fatal)
    else:
        date = time.time()

    if opt.strip and opt.strip_path:
        o.fatal("--strip is incompatible with --strip-path")

    graft_points = []
    if opt.graft:
        if opt.strip:
            o.fatal("--strip is incompatible with --graft")
        if opt.strip_path:
            o.fatal("--strip-path is incompatible with --graft")
        for option, parameter in flags:
            if option == "--graft":
                parameter = argv_bytes(parameter)
                splitted_parameter = parameter.split(b'=')
                if len(splitted_parameter) != 2:
                    o.fatal("a graft point must be of the form old_path=new_path")
                old_path, new_path = splitted_parameter
                if not (old_path and new_path):
                    o.fatal("a graft point cannot be empty")
                graft_points.append((resolve_parent(old_path),
                                     resolve_parent(new_path)))

    is_reverse = environ.get(b'BUP_SERVER_REVERSE')
    if is_reverse and opt.remote:
        o.fatal("don't use -r in reverse mode; it's automatic")

    name = opt.name
    if name and not valid_save_name(name):
        o.fatal("'%s' is not a valid branch name" % path_msg(name))
    refname = name and b'refs/heads/%s' % name or None
    if opt.remote or is_reverse:
        try:
            cli = client.Client(opt.remote)
        except client.ClientError as e:
            log('error: %s' % e)
            sys.exit(1)
        oldref = refname and cli.read_ref(refname) or None
        w = cli.new_packwriter(compression_level=opt.compress)
    else:
        cli = None
        oldref = refname and git.read_ref(refname) or None
        w = git.PackWriter(compression_level=opt.compress)

    handle_ctrl_c()

    # Metadata is stored in a file named .bupm in each directory.  The
    # first metadata entry will be the metadata for the current directory.
    # The remaining entries will be for each of the other directory
    # elements, in the order they're listed in the index.
    #
    # Since the git tree elements are sorted according to
    # git.shalist_item_sort_key, the metalist items are accumulated as
    # (sort_key, metadata) tuples, and then sorted when the .bupm file is
    # created.  The sort_key should have been computed using the element's
    # mangled name and git mode (after hashsplitting), but the code doesn't
    # actually do that; it uses the element's real name and mode instead.
    # This makes things a bit more difficult when reading it back, see
    # vfs.ordered_tree_entries().

    # Maintain a stack of information representing the current location in
    # the archive being constructed.  The current path is recorded in
    # parts, which will be something like ['', 'home', 'someuser'], and
    # the accumulated content and metadata for each of the dirs in parts
    # is stored in parallel stacks in shalists and metalists.

    parts = []  # Current archive position (stack of dir names).
    shalists = []  # Hashes for each dir in paths.
    metalists = []  # Metadata for each dir in paths.

    def _push(part, metadata):
        # Enter a new archive directory -- make it the current directory.
        parts.append(part)
        shalists.append([])
        metalists.append([(b'', metadata)])  # This dir's metadata (no name).

    def _pop(force_tree, dir_metadata=None):
        # Leave the current archive directory and add its tree to its parent.
        assert(len(parts) >= 1)
        part = parts.pop()
        shalist = shalists.pop()
        metalist = metalists.pop()
        # FIXME: only test if collision is possible (i.e. given --strip, etc.)?
        if force_tree:
            tree = force_tree
        else:
            names_seen = set()
            clean_list = []
            metaidx = 1  # entry at 0 is for the dir
            for x in shalist:
                name = x[1]
                if name in names_seen:
                    parent_path = b'/'.join(parts) + b'/'
                    add_error('error: ignoring duplicate path %s in %s'
                              % (path_msg(name), path_msg(parent_path)))
                    if not stat.S_ISDIR(x[0]):
                        del metalist[metaidx]
                else:
                    names_seen.add(name)
                    clean_list.append(x)
                    if not stat.S_ISDIR(x[0]):
                        metaidx += 1
            if dir_metadata:  # Override the original metadata pushed for this dir.
                metalist = [(b'', dir_metadata)] + metalist[1:]
            sorted_metalist = sorted(metalist, key=lambda x: x[0])
            metadata = b''.join([m[1].encode() for m in sorted_metalist])
            metadata_f = BytesIO(metadata)
            mode, id = hashsplit.split_to_blob_or_tree(w.new_blob, w.new_tree,
                                                       [metadata_f],
                                                       keep_boundaries=False)
            clean_list.append((mode, b'.bupm', id))
            tree = w.new_tree(clean_list)
        if shalists:
            shalists[-1].append((GIT_MODE_TREE,
                                 git.mangle_name(part,
                                                 GIT_MODE_TREE, GIT_MODE_TREE),
                                 tree))
        return tree

    _nonlocal['count'] = 0
    _nonlocal['subcount'] = 0
    _nonlocal['lastremain'] = None

    def progress_report(n):
        _nonlocal['subcount'] += n
        cc = _nonlocal['count'] + _nonlocal['subcount']
        pct = total and (cc * 100.0 / total) or 0
        now = time.time()
        elapsed = now - tstart
        kps = elapsed and int(cc / 1024. / elapsed)
        kps_frac = 10 ** int(math.log(kps + 1, 10) - 1)
        kps = int(kps / kps_frac) * kps_frac
        if cc:
            remain = elapsed * 1.0 / cc * (total - cc)
        else:
            remain = 0.0
        if (_nonlocal['lastremain'] and (remain > _nonlocal['lastremain'])
              and ((remain - _nonlocal['lastremain'])
                   / _nonlocal['lastremain'] < 0.05)):
            remain = _nonlocal['lastremain']
        else:
            _nonlocal['lastremain'] = remain
        hours = int(remain / 60 / 60)
        mins = int(remain / 60 - hours * 60)
        secs = int(remain - hours * 60 * 60 - mins * 60)
        if elapsed < 30:
            remainstr = ''
            kpsstr = ''
        else:
            kpsstr = '%dk/s' % kps
            if hours:
                remainstr = '%dh%dm' % (hours, mins)
            elif mins:
                remainstr = '%dm%d' % (mins, secs)
            else:
                remainstr = '%ds' % secs
        qprogress('Saving: %.2f%% (%d/%dk, %d/%d files) %s %s\r'
                  % (pct, cc / 1024, total / 1024, fcount, ftotal,
                     remainstr, kpsstr))

    indexfile = opt.indexfile or git.repo(b'bupindex')
    r = index.Reader(indexfile)
    try:
        msr = index.MetaStoreReader(indexfile + b'.meta')
    except IOError as ex:
        if ex.errno != EACCES:
            raise
        log('error: cannot access %r; have you run bup index?'
            % path_msg(indexfile))
        sys.exit(1)
    hlink_db = hlinkdb.HLinkDB(indexfile + b'.hlink')

    def already_saved(ent):
        return ent.is_valid() and w.exists(ent.sha) and ent.sha

    def wantrecurse_pre(ent):
        return not already_saved(ent)

    def wantrecurse_during(ent):
        return not already_saved(ent) or ent.sha_missing()

    def find_hardlink_target(hlink_db, ent):
        if hlink_db and not stat.S_ISDIR(ent.mode) and ent.nlink > 1:
            link_paths = hlink_db.node_paths(ent.dev, ent.ino)
            if link_paths:
                return link_paths[0]

    total = ftotal = 0
    if opt.progress:
        for (transname, ent) in r.filter(extra, wantrecurse=wantrecurse_pre):
            if not (ftotal % 10024):
                qprogress('Reading index: %d\r' % ftotal)
            exists = ent.exists()
            hashvalid = already_saved(ent)
            ent.set_sha_missing(not hashvalid)
            if not opt.smaller or ent.size < opt.smaller:
                if exists and not hashvalid:
                    total += ent.size
            ftotal += 1
        progress('Reading index: %d, done.\n' % ftotal)
        hashsplit.progress_callback = progress_report

    # Root collisions occur when strip or graft options map more than one
    # path to the same directory (paths which originally had separate
    # parents).  When that situation is detected, use empty metadata for
    # the parent.  Otherwise, use the metadata for the common parent.
    # Collision example: "bup save ... --strip /foo /foo/bar /bar".
    #
    # FIXME: Add collision tests, or handle collisions some other way.
    # FIXME: Detect/handle strip/graft name collisions (other than root),
    # i.e. if '/foo/bar' and '/bar' both map to '/'.
    first_root = None
    root_collision = None
    tstart = time.time()
    fcount = 0
    lastskip_name = None
    lastdir = b''
    for (transname, ent) in r.filter(extra, wantrecurse=wantrecurse_during):
        (dir, file) = os.path.split(ent.name)
        exists = (ent.flags & index.IX_EXISTS)
        hashvalid = already_saved(ent)
        wasmissing = ent.sha_missing()
        oldsize = ent.size
        if opt.verbose:
            if not exists:
                status = 'D'
            elif not hashvalid:
                if ent.sha == index.EMPTY_SHA:
                    status = 'A'
                else:
                    status = 'M'
            else:
                status = ' '
            if opt.verbose >= 2:
                log('%s %-70s\n' % (status, path_msg(ent.name)))
            elif not stat.S_ISDIR(ent.mode) and lastdir != dir:
                if not lastdir.startswith(dir):
                    log('%s %-70s\n' % (status,
                                        path_msg(os.path.join(dir, b''))))
                lastdir = dir

        if opt.progress:
            progress_report(0)
        fcount += 1

        if not exists:
            continue
        if opt.smaller and ent.size >= opt.smaller:
            if exists and not hashvalid:
                if opt.verbose:
                    log('skipping large file "%s"\n' % path_msg(ent.name))
                lastskip_name = ent.name
            continue

        assert(dir.startswith(b'/'))
        if opt.strip:
            dirp = stripped_path_components(dir, extra)
        elif opt.strip_path:
            dirp = stripped_path_components(dir, [opt.strip_path])
        elif graft_points:
            dirp = grafted_path_components(graft_points, dir)
        else:
            dirp = path_components(dir)

        # At this point, dirp contains a representation of the archive
        # path that looks like [(archive_dir_name, real_fs_path), ...].
        # So given "bup save ... --strip /foo/bar /foo/bar/baz", dirp
        # might look like this at some point:
        #   [('', '/foo/bar'), ('baz', '/foo/bar/baz'), ...].
        #
        # This dual representation supports stripping/grafting, where the
        # archive path may not have a direct correspondence with the
        # filesystem.  The root directory is represented by an initial
        # component named '', and any component that doesn't have a
        # corresponding filesystem directory (due to grafting, for
        # example) will have a real_fs_path of None, i.e. [('', None),
        # ...].

        if first_root == None:
            first_root = dirp[0]
        elif first_root != dirp[0]:
            root_collision = True

        # If switching to a new sub-tree, finish the current sub-tree.
        while parts > [x[0] for x in dirp]:
            _pop(force_tree=None)

        # If switching to a new sub-tree, start a new sub-tree.
        for path_component in dirp[len(parts):]:
            dir_name, fs_path = path_component
            # Not indexed, so just grab the FS metadata or use empty metadata.
            try:
                meta = metadata.from_path(fs_path, normalized=True) \
                    if fs_path else metadata.Metadata()
            except (OSError, IOError) as e:
                add_error(e)
                lastskip_name = dir_name
                meta = metadata.Metadata()
            _push(dir_name, meta)

        if not file:
            if len(parts) == 1:
                continue  # We're at the top level -- keep the current root dir
            # Since there's no filename, this is a subdir -- finish it.
            oldtree = already_saved(ent)  # may be None
            newtree = _pop(force_tree=oldtree)
            if not oldtree:
                if lastskip_name and lastskip_name.startswith(ent.name):
                    ent.invalidate()
                else:
                    ent.validate(GIT_MODE_TREE, newtree)
                ent.repack()
            if exists and wasmissing:
                _nonlocal['count'] += oldsize
            continue

        # it's not a directory
        if hashvalid:
            id = ent.sha
            git_name = git.mangle_name(file, ent.mode, ent.gitmode)
            git_info = (ent.gitmode, git_name, id)
            shalists[-1].append(git_info)
            sort_key = git.shalist_item_sort_key((ent.mode, file, id))
            meta = msr.metadata_at(ent.meta_ofs)
            meta.hardlink_target = find_hardlink_target(hlink_db, ent)
            # Restore the times that were cleared to 0 in the metastore.
            (meta.atime, meta.mtime, meta.ctime) = (ent.atime, ent.mtime,
                                                    ent.ctime)
            metalists[-1].append((sort_key, meta))
        else:
            id = None
            hlink = find_hardlink_target(hlink_db, ent)
            try:
                meta = metadata.from_path(ent.name, hardlink_target=hlink,
                                          normalized=True,
                                          after_stat=after_nondir_metadata_stat)
            except (OSError, IOError) as e:
                add_error(e)
                lastskip_name = ent.name
                continue
            if stat.S_IFMT(ent.mode) != stat.S_IFMT(meta.mode):
                # The mode changed since we indexed the file, this is bad.
                # This can cause two issues:
                # 1) We e.g. think the file is a regular file, but now it's
                #    something else (a device, socket, FIFO or symlink, etc.)
                #    and _read_ from it when we shouldn't.
                # 2) We then record it as valid, but don't update the index
                #    metadata, and on a subsequent save it has 'hashvalid'
                #    but is recorded as the file type from the index, when
                #    the content is something else ...
                # Avoid all of these consistency issues by just skipping such
                # things - it really ought to not happen anyway.
                add_error("%s: mode changed since indexing, skipping."
                          % path_msg(ent.name))
                lastskip_name = ent.name
                continue
            if stat.S_ISREG(ent.mode):
                try:
                    # If the file changes while we're reading it, then our
                    # reading may stop at some point, but the stat() above may
                    # have gotten a different size already.  Recalculate the
                    # meta size so that the repository records the accurate
                    # size in the metadata, even if the other stat() data
                    # might be slightly older than the file content (which we
                    # can't fix, this is inherently racy, but we can prevent
                    # the size mismatch.)
                    meta.size = 0
                    def new_blob(data):
                        meta.size += len(data)
                        return w.new_blob(data)
                    before_saving_regular_file(ent.name)
                    with hashsplit.open_noatime(ent.name) as f:
                        (mode, id) = hashsplit.split_to_blob_or_tree(
                            new_blob, w.new_tree, [f],
                            keep_boundaries=False)
                except (IOError, OSError) as e:
                    add_error('%s: %s' % (ent.name, e))
                    lastskip_name = ent.name
            elif stat.S_ISDIR(ent.mode):
                assert(0)  # handled above
            elif stat.S_ISLNK(ent.mode):
                mode, id = (GIT_MODE_SYMLINK, w.new_blob(meta.symlink_target))
            else:
                # Everything else should be fully described by its
                # metadata, so just record an empty blob, so the paths
                # in the tree and .bupm will match up.
                (mode, id) = (GIT_MODE_FILE, w.new_blob(b''))

            if id:
                ent.validate(mode, id)
                ent.repack()
                git_name = git.mangle_name(file, ent.mode, ent.gitmode)
                git_info = (mode, git_name, id)
                shalists[-1].append(git_info)
                sort_key = git.shalist_item_sort_key((ent.mode, file, id))
                metalists[-1].append((sort_key, meta))

        if exists and wasmissing:
            _nonlocal['count'] += oldsize
            _nonlocal['subcount'] = 0

    if opt.progress:
        pct = total and _nonlocal['count'] * 100.0 / total or 100
        progress('Saving: %.2f%% (%d/%dk, %d/%d files), done.    \n'
                 % (pct, _nonlocal['count'] / 1024, total / 1024,
                    fcount, ftotal))

    while len(parts) > 1:  # _pop() all the parts above the root
        _pop(force_tree=None)
    assert(len(shalists) == 1)
    assert(len(metalists) == 1)

    # Finish the root directory.
    tree = _pop(force_tree=None,
                # When there's a collision, use empty metadata for the root.
                dir_metadata=metadata.Metadata() if root_collision else None)

    sys.stdout.flush()
    out = byte_stream(sys.stdout)

    if opt.tree:
        out.write(hexlify(tree))
        out.write(b'\n')
    if opt.commit or name:
        if compat.py_maj > 2:
            # Strip b prefix from python 3 bytes reprs to preserve previous
            # format
            msgcmd = b'[%s]' % b', '.join([repr(argv_bytes(x))[1:].encode('ascii')
                                           for x in argv])
        else:
            msgcmd = repr(argv)
        msg = b'bup save\n\nGenerated by command:\n%s\n' % msgcmd
        userline = (b'%s <%s@%s>' % (userfullname(), username(), hostname()))
        commit = w.new_commit(tree, oldref, userline, date, None,
                              userline, date, None, msg)
        if opt.commit:
            out.write(hexlify(commit))
            out.write(b'\n')

    msr.close()
    w.close()  # must close before we can update the ref

    if opt.name:
        if cli:
            cli.update_ref(refname, commit, oldref)
        else:
            git.update_ref(refname, commit, oldref)

    if cli:
        cli.close()

    if saved_errors:
        log('WARNING: %d errors encountered while saving.\n'
            % len(saved_errors))
        sys.exit(1)
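# Standalone sketch (hypothetical values) of how _pop() assembles the .bupm
# blob above: metadata is collected as (sort_key, encoded_metadata) pairs,
# with the directory's own entry under the empty key so it sorts first, then
# sorted and concatenated before being hashsplit into the tree.
def _bupm_sketch():
    metalist = [(b'beta', b'<meta-beta>'),
                (b'', b'<dir-meta>'),
                (b'alpha', b'<meta-alpha>')]
    bupm = b''.join(m[1] for m in sorted(metalist, key=lambda x: x[0]))
    assert bupm == b'<dir-meta><meta-alpha><meta-beta>'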
def update_index(top, excluded_paths, exclude_rxs, xdev_exceptions):
    # tmax and start must be epoch nanoseconds.
    tmax = (time.time() - 1) * 10**9
    ri = index.Reader(indexfile)
    msw = index.MetaStoreWriter(indexfile + '.meta')
    wi = index.Writer(indexfile, msw, tmax)
    rig = IterHelper(ri.iter(name=top))
    tstart = int(time.time()) * 10**9

    hlinks = hlinkdb.HLinkDB(indexfile + '.hlink')

    fake_hash = None
    if opt.fake_valid:
        def fake_hash(name):
            return (GIT_MODE_FILE, index.FAKE_SHA)

    total = 0
    bup_dir = os.path.abspath(git.repo())
    index_start = time.time()
    for path, pst in recursive_dirlist([top],
                                       xdev=opt.xdev,
                                       bup_dir=bup_dir,
                                       excluded_paths=excluded_paths,
                                       exclude_rxs=exclude_rxs,
                                       xdev_exceptions=xdev_exceptions):
        if opt.verbose >= 2 or (opt.verbose == 1 and stat.S_ISDIR(pst.st_mode)):
            sys.stdout.write('%s\n' % path)
            sys.stdout.flush()
            elapsed = time.time() - index_start
            paths_per_sec = total / elapsed if elapsed else 0
            qprogress('Indexing: %d (%d paths/s)\r' % (total, paths_per_sec))
        elif not (total % 128):
            elapsed = time.time() - index_start
            paths_per_sec = total / elapsed if elapsed else 0
            qprogress('Indexing: %d (%d paths/s)\r' % (total, paths_per_sec))
        total += 1

        while rig.cur and rig.cur.name > path:  # deleted paths
            if rig.cur.exists():
                rig.cur.set_deleted()
                rig.cur.repack()
                if rig.cur.nlink > 1 and not stat.S_ISDIR(rig.cur.mode):
                    hlinks.del_path(rig.cur.name)
            rig.next()

        if rig.cur and rig.cur.name == path:  # paths that already existed
            need_repack = False
            if rig.cur.stale(pst, tstart, check_device=opt.check_device):
                try:
                    meta = metadata.from_path(path, statinfo=pst)
                except (OSError, IOError) as e:
                    add_error(e)
                    rig.next()
                    continue
                if not stat.S_ISDIR(rig.cur.mode) and rig.cur.nlink > 1:
                    hlinks.del_path(rig.cur.name)
                if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
                    hlinks.add_path(path, pst.st_dev, pst.st_ino)
                # Clear these so they don't bloat the store -- they're
                # already in the index (since they vary a lot and they're
                # fixed length).  If you've noticed "tmax", you might
                # wonder why it's OK to do this, since that code may
                # adjust (mangle) the index mtime and ctime -- producing
                # fake values which must not end up in a .bupm.  However,
                # it looks like that shouldn't be possible:  (1) When
                # "save" validates the index entry, it always reads the
                # metadata from the filesystem.  (2) Metadata is only
                # read/used from the index if hashvalid is true.  (3)
                # "faked" entries will be stale(), and so we'll invalidate
                # them below.
                meta.ctime = meta.mtime = meta.atime = 0
                meta_ofs = msw.store(meta)
                rig.cur.update_from_stat(pst, meta_ofs)
                rig.cur.invalidate()
                need_repack = True
            if not (rig.cur.flags & index.IX_HASHVALID):
                if fake_hash:
                    rig.cur.gitmode, rig.cur.sha = fake_hash(path)
                    rig.cur.flags |= index.IX_HASHVALID
                    need_repack = True
            if opt.fake_invalid:
                rig.cur.invalidate()
                need_repack = True
            if need_repack:
                rig.cur.repack()
            rig.next()
        else:  # new paths
            try:
                meta = metadata.from_path(path, statinfo=pst)
            except (OSError, IOError) as e:
                add_error(e)
                continue
            # See same assignment to 0, above, for rationale.
            meta.atime = meta.mtime = meta.ctime = 0
            meta_ofs = msw.store(meta)
            wi.add(path, pst, meta_ofs, hashgen=fake_hash)
            if not stat.S_ISDIR(pst.st_mode) and pst.st_nlink > 1:
                hlinks.add_path(path, pst.st_dev, pst.st_ino)

    elapsed = time.time() - index_start
    paths_per_sec = total / elapsed if elapsed else 0
    progress('Indexing: %d, done (%d paths/s).\n' % (total, paths_per_sec))

    hlinks.prepare_save()

    if ri.exists():
        ri.save()
        wi.flush()
        if wi.count:
            wr = wi.new_reader()
            if opt.check:
                log('check: before merging: oldfile\n')
                check_index(ri)
                log('check: before merging: newfile\n')
                check_index(wr)
            mi = index.Writer(indexfile, msw, tmax)
            for e in index.merge(ri, wr):
                # FIXME: shouldn't we remove deleted entries eventually?  When?
                mi.add_ixentry(e)
            ri.close()
            mi.close()
            wr.close()
        wi.abort()
    else:
        wi.close()

    msw.close()
    hlinks.commit_save()
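# General-pattern sketch (not bup's exact streaming or ordering) of the
# deleted-path sweep in update_index above: with the saved index and the
# fresh filesystem walk ordered consistently, entries seen only in the saved
# index are deletions and get set_deleted()/repack()'d.
def _deletions_sketch(old_sorted, new_paths):
    new = set(new_paths)  # the real code streams both sides instead
    return [p for p in old_sorted if p not in new]

assert _deletions_sketch(['/a', '/b', '/c'], ['/a', '/c']) == ['/b']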
def pfinal(count, total):
    progress('bup: merging indexes (%d/%d), done.\n' % (count, total))
def sweep(live_objects, existing_count, cat_pipe, opt):
    # Traverse all the packs, saving the (probably) live data.

    ns = Nonlocal()
    ns.stale_files = []

    def remove_stale_files(new_pack_prefix):
        if opt.verbose and new_pack_prefix:
            log('created ' + basename(new_pack_prefix) + '\n')
        for p in ns.stale_files:
            if opt.verbose:
                log('removing ' + basename(p) + '\n')
            os.unlink(p)
        ns.stale_files = []

    writer = git.PackWriter(objcache_maker=None,
                            compression_level=opt.compress,
                            run_midx=False,
                            on_pack_finish=remove_stale_files)

    # FIXME: sanity check .idx names vs .pack names?
    collect_count = 0
    for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
        if opt.verbose:
            qprogress('preserving live data (%d%% complete)\r'
                      % ((float(collect_count) / existing_count) * 100))
        idx = git.open_idx(idx_name)

        idx_live_count = 0
        for i in xrange(0, len(idx)):
            sha = idx.shatable[i * 20 : (i + 1) * 20]
            if live_objects.exists(sha):
                idx_live_count += 1

        collect_count += idx_live_count

        if idx_live_count == 0:
            if opt.verbose:
                log('deleting %s\n'
                    % git.repo_rel(basename(idx_name)))
            ns.stale_files.append(idx_name)
            ns.stale_files.append(idx_name[:-3] + 'pack')
            continue

        live_frac = idx_live_count / float(len(idx))
        if live_frac > ((100 - opt.threshold) / 100.0):
            if opt.verbose:
                log('keeping %s (%d%% live)\n'
                    % (git.repo_rel(basename(idx_name)), live_frac * 100))
            continue

        if opt.verbose:
            log('rewriting %s (%.2f%% live)\n'
                % (basename(idx_name), live_frac * 100))
        for i in xrange(0, len(idx)):
            sha = idx.shatable[i * 20 : (i + 1) * 20]
            if live_objects.exists(sha):
                item_it = cat_pipe.get(sha.encode('hex'))
                type = item_it.next()
                writer.write(sha, type, ''.join(item_it))

        ns.stale_files.append(idx_name)
        ns.stale_files.append(idx_name[:-3] + 'pack')

    if opt.verbose:
        progress('preserving live data (%d%% complete)\n'
                 % ((float(collect_count) / existing_count) * 100))

    # Nothing should have recreated midx/bloom yet.
    pack_dir = git.repo('objects/pack')
    assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
    assert(not glob.glob(os.path.join(pack_dir, '*.midx')))

    # try/catch should call writer.abort()?
    # This will finally run midx.
    writer.close()  # Can only change refs (if needed) after this.

    remove_stale_files(None)  # In case we didn't write to the writer.

    if opt.verbose:
        log('discarded %d%% of objects\n'
            % ((existing_count - count_objects(pack_dir))
               / float(existing_count) * 100))
def main():
    o = options.Options(optspec)
    opt, flags, extra = o.parse(sys.argv[1:])
    verbosity = opt.verbose if not opt.quiet else -1

    git.check_repo_or_die()

    if not extra:
        o.fatal('must specify at least one filename to restore')

    exclude_rxs = parse_rx_excludes(flags, o.fatal)

    owner_map = {}
    for map_type in ('user', 'group', 'uid', 'gid'):
        owner_map[map_type] = parse_owner_mappings(map_type, flags, o.fatal)

    if opt.outdir:
        mkdirp(opt.outdir)
        os.chdir(opt.outdir)

    repo = RemoteRepo(opt.remote) if opt.remote else LocalRepo()
    top = os.getcwd()
    hardlinks = {}
    for path in extra:
        if not valid_restore_path(path):
            add_error("path %r doesn't include a branch and revision" % path)
            continue
        try:
            resolved = vfs.resolve(repo, path, want_meta=True, follow=False)
        except vfs.IOError as e:
            add_error(e)
            continue
        if len(resolved) == 3 and resolved[2][0] == 'latest':
            # Follow latest symlink to the actual save
            try:
                resolved = vfs.resolve(repo, 'latest', parent=resolved[:-1],
                                       want_meta=True)
            except vfs.IOError as e:
                add_error(e)
                continue
            # Rename it back to 'latest'
            resolved = tuple(elt if i != 2 else ('latest',) + elt[1:]
                             for i, elt in enumerate(resolved))
        path_parent, path_name = os.path.split(path)
        leaf_name, leaf_item = resolved[-1]
        if not leaf_item:
            add_error('error: cannot access %r in %r'
                      % ('/'.join(name for name, item in resolved), path))
            continue
        if not path_name or path_name == '.':
            # Source is /foo/what/ever/ or /foo/what/ever/. -- extract
            # what/ever/* to the current directory, and if name == '.'
            # (i.e. /foo/what/ever/.), then also restore what/ever's
            # metadata to the current directory.
            treeish = vfs.item_mode(leaf_item)
            if not treeish:
                add_error('%r cannot be restored as a directory' % path)
            else:
                items = vfs.contents(repo, leaf_item, want_meta=True)
                dot, leaf_item = next(items, None)
                assert(dot == '.')
                for sub_name, sub_item in items:
                    restore(repo, '', sub_name, sub_item, top,
                            opt.sparse, opt.numeric_ids, owner_map,
                            exclude_rxs, verbosity, hardlinks)
                if path_name == '.':
                    leaf_item = vfs.augment_item_meta(repo, leaf_item,
                                                      include_size=True)
                    apply_metadata(leaf_item.meta, '.',
                                   opt.numeric_ids, owner_map)
        else:
            restore(repo, '', leaf_name, leaf_item, top,
                    opt.sparse, opt.numeric_ids, owner_map,
                    exclude_rxs, verbosity, hardlinks)

    if verbosity >= 0:
        progress('Restoring: %d, done.\n' % total_restored)
    die_if_errors()
def main(argv):
    global opt, par2_ok

    o = options.Options(optspec)
    opt, flags, extra = o.parse_bytes(argv[1:])
    opt.verbose = opt.verbose or 0

    par2_setup()
    if opt.par2_ok:
        if par2_ok:
            sys.exit(0)  # 'true' in sh
        else:
            sys.exit(1)
    if opt.disable_par2:
        par2_ok = 0

    git.check_repo_or_die()

    if extra:
        extra = [argv_bytes(x) for x in extra]
    else:
        debug('fsck: No filenames given: checking all packs.\n')
        extra = glob.glob(git.repo(b'objects/pack/*.pack'))

    sys.stdout.flush()
    out = byte_stream(sys.stdout)
    code = 0
    count = 0
    outstanding = {}
    for name in extra:
        if name.endswith(b'.pack'):
            base = name[:-5]
        elif name.endswith(b'.idx'):
            base = name[:-4]
        elif name.endswith(b'.par2'):
            base = name[:-5]
        elif os.path.exists(name + b'.pack'):
            base = name
        else:
            raise Exception('%r is not a pack file!' % name)
        (dir, last) = os.path.split(base)
        par2_exists = os.path.exists(base + b'.par2')
        if par2_exists and os.stat(base + b'.par2').st_size == 0:
            par2_exists = 0
        sys.stdout.flush()  # Not sure we still need this, but it'll flush out too
        debug('fsck: checking %r (%s)\n'
              % (last, par2_ok and par2_exists and 'par2' or 'git'))
        if not opt.verbose:
            progress('fsck (%d/%d)\r' % (count, len(extra)))

        if not opt.jobs:
            nc = do_pack(base, last, par2_exists, out)
            code = code or nc
            count += 1
        else:
            while len(outstanding) >= opt.jobs:
                (pid, nc) = os.wait()
                nc >>= 8
                if pid in outstanding:
                    del outstanding[pid]
                    code = code or nc
                    count += 1
            pid = os.fork()
            if pid:  # parent
                outstanding[pid] = 1
            else:  # child
                try:
                    sys.exit(do_pack(base, last, par2_exists, out))
                except Exception as e:
                    log('exception: %r\n' % e)
                    sys.exit(99)

    while len(outstanding):
        (pid, nc) = os.wait()
        nc >>= 8
        if pid in outstanding:
            del outstanding[pid]
            code = code or nc
            count += 1
        if not opt.verbose:
            progress('fsck (%d/%d)\r' % (count, len(extra)))

    if istty2:
        debug('fsck done. \n')
    sys.exit(code)
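# Sketch of the exit-status handling in the --jobs path above: os.wait()
# returns a 16-bit status word with the child's exit code in the high byte,
# hence "nc >>= 8".  (POSIX only; Python 3.9+ also offers
# os.waitstatus_to_exitcode().)
import os

def _wait_status_sketch():
    pid = os.fork()
    if pid == 0:
        os._exit(3)            # child exits with code 3
    _, status = os.wait()      # parent reaps it
    assert status >> 8 == 3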
def do_bloom(path, outfilename):
    global _first
    b = None
    if os.path.exists(outfilename) and not opt.force:
        b = bloom.ShaBloom(outfilename)
        if not b.valid():
            debug1("bloom: Existing invalid bloom found, regenerating.\n")
            b = None

    add = []
    rest = []
    add_count = 0
    rest_count = 0
    for i, name in enumerate(glob.glob('%s/*.idx' % path)):
        progress('bloom: counting: %d\r' % i)
        ix = git.open_idx(name)
        ixbase = os.path.basename(name)
        if b and (ixbase in b.idxnames):
            rest.append(name)
            rest_count += len(ix)
        else:
            add.append(name)
            add_count += len(ix)
    total = add_count + rest_count

    if not add:
        debug1("bloom: nothing to do.\n")
        return

    if b:
        if len(b) != rest_count:
            debug1("bloom: size %d != idx total %d, regenerating\n"
                   % (len(b), rest_count))
            b = None
        elif (b.bits < bloom.MAX_BLOOM_BITS and
              b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
            debug1("bloom: regenerating: adding %d entries gives "
                   "%.2f%% false positives.\n"
                   % (add_count, b.pfalse_positive(add_count)))
            b = None
        else:
            b = bloom.ShaBloom(outfilename, readwrite=True,
                               expected=add_count)
    if not b:  # Need all idxs to build from scratch
        add += rest
        add_count += rest_count
    del rest
    del rest_count

    msg = b is None and 'creating from' or 'adding'
    if not _first:
        _first = path
    dirprefix = (_first != path) and git.repo_rel(path) + ': ' or ''
    progress('bloom: %s%s %d file%s (%d object%s).\n'
             % (dirprefix, msg,
                len(add), len(add) != 1 and 's' or '',
                add_count, add_count != 1 and 's' or ''))

    tfname = None
    if b is None:
        tfname = os.path.join(path, 'bup.tmp.bloom')
        b = bloom.create(tfname, expected=add_count, k=opt.k)
    count = 0
    icount = 0
    for name in add:
        ix = git.open_idx(name)
        qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                  % (icount * 100.0 / add_count, icount, add_count))
        b.add_idx(ix)
        count += 1
        icount += len(ix)

    # Currently, there's an open file object for tfname inside b.
    # Make sure it's closed before rename.
    b.close()

    if tfname:
        os.rename(tfname, outfilename)
def pfinal(count, total):
    if final_progress:
        progress('Reading indexes: %.2f%% (%d/%d), done.\n'
                 % (100, total, total))
    if not name or name == '.':
        # Source is /foo/what/ever/ or /foo/what/ever/. -- extract
        # what/ever/* to the current directory, and if name == '.'
        # (i.e. /foo/what/ever/.), then also restore what/ever's
        # metadata to the current directory.
        if not isdir:
            add_error('%r: not a directory' % d)
        else:
            do_root(n, opt.sparse, owner_map,
                    restore_root_meta=(name == '.'))
    else:
        # Source is /foo/what/ever -- extract ./ever to cwd.
        if isinstance(n, vfs.FakeSymlink):
            # Source is actually /foo/what, i.e. a top-level commit
            # like /foo/latest, which is a symlink to ../.commit/SHA.
            # So dereference it, and restore ../.commit/SHA/. to
            # "./what/.".
            target = n.dereference()
            mkdirp(n.name)
            os.chdir(n.name)
            do_root(target, opt.sparse, owner_map)
        else:  # Not a directory or fake symlink.
            meta = find_dir_item_metadata_by_name(n.parent, n.name)
            do_node(n.parent, n, opt.sparse, owner_map, meta=meta)

if not opt.quiet:
    progress('Restoring: %d, done.\n' % total_restored)

if saved_errors:
    log('WARNING: %d errors encountered while restoring.\n'
        % len(saved_errors))
    sys.exit(1)
def do_bloom(path, outfilename, k, force):
    global _first
    assert k in (None, 4, 5)
    b = None
    if os.path.exists(outfilename) and not force:
        b = bloom.ShaBloom(outfilename)
        if not b.valid():
            debug1("bloom: Existing invalid bloom found, regenerating.\n")
            b = None

    add = []
    rest = []
    add_count = 0
    rest_count = 0
    for i, name in enumerate(glob.glob(b'%s/*.idx' % path)):
        progress('bloom: counting: %d\r' % i)
        ix = git.open_idx(name)
        ixbase = os.path.basename(name)
        if b and (ixbase in b.idxnames):
            rest.append(name)
            rest_count += len(ix)
        else:
            add.append(name)
            add_count += len(ix)

    if not add:
        debug1("bloom: nothing to do.\n")
        return

    if b:
        if len(b) != rest_count:
            debug1("bloom: size %d != idx total %d, regenerating\n"
                   % (len(b), rest_count))
            b = None
        elif k is not None and k != b.k:
            debug1("bloom: new k %d != existing k %d, regenerating\n"
                   % (k, b.k))
            b = None
        elif (b.bits < bloom.MAX_BLOOM_BITS[b.k] and
              b.pfalse_positive(add_count) > bloom.MAX_PFALSE_POSITIVE):
            debug1("bloom: regenerating: adding %d entries gives "
                   "%.2f%% false positives.\n"
                   % (add_count, b.pfalse_positive(add_count)))
            b = None
        else:
            b = bloom.ShaBloom(outfilename, readwrite=True,
                               expected=add_count)
    if not b:  # Need all idxs to build from scratch
        add += rest
        add_count += rest_count
    del rest
    del rest_count

    msg = b is None and 'creating from' or 'adding'
    if not _first:
        _first = path
    dirprefix = (_first != path) and git.repo_rel(path) + b': ' or b''
    progress('bloom: %s%s %d file%s (%d object%s).\r'
             % (path_msg(dirprefix), msg,
                len(add), len(add) != 1 and 's' or '',
                add_count, add_count != 1 and 's' or ''))

    tfname = None
    if b is None:
        tfname = os.path.join(path, b'bup.tmp.bloom')
        b = bloom.create(tfname, expected=add_count, k=k)
    count = 0
    icount = 0
    for name in add:
        ix = git.open_idx(name)
        qprogress('bloom: writing %.2f%% (%d/%d objects)\r'
                  % (icount * 100.0 / add_count, icount, add_count))
        b.add_idx(ix)
        count += 1
        icount += len(ix)

    # Currently, there's an open file object for tfname inside b.
    # Make sure it's closed before rename.
    b.close()

    if tfname:
        os.rename(tfname, outfilename)
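# Sketch of the math behind the regeneration check above.  For a Bloom
# filter with m bits, k hash functions, and n entries, the standard
# false-positive estimate is (1 - e^(-k*n/m))**k; when folding in add_count
# more entries would push that past the acceptable ceiling, do_bloom throws
# the old filter away and rebuilds.  (Illustrative formula only -- bup's
# ShaBloom.pfalse_positive() is the authoritative implementation.)
import math

def _pfalse_sketch(n, m_bits, k):
    return (1 - math.exp(-k * n / float(m_bits))) ** k

# e.g. one million entries in a 16Mbit filter with k=5 hashes:
assert 0 < _pfalse_sketch(10**6, 16 * 2**20, 5) < 1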
def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
          verbosity):
    # Traverse all the packs, saving the (probably) live data.

    ns = Nonlocal()
    ns.stale_files = []

    def remove_stale_files(new_pack_prefix):
        if verbosity and new_pack_prefix:
            log('created ' + basename(new_pack_prefix) + '\n')
        for p in ns.stale_files:
            if new_pack_prefix and p.startswith(new_pack_prefix):
                continue  # Don't remove the new pack file
            if verbosity:
                log('removing ' + basename(p) + '\n')
            os.unlink(p)
        if ns.stale_files:  # So git cat-pipe will close them
            cat_pipe.restart()
        ns.stale_files = []

    writer = git.PackWriter(objcache_maker=None,
                            compression_level=compression,
                            run_midx=False,
                            on_pack_finish=remove_stale_files)

    # FIXME: sanity check .idx names vs .pack names?
    collect_count = 0
    for idx_name in glob.glob(os.path.join(git.repo('objects/pack'), '*.idx')):
        if verbosity:
            qprogress('preserving live data (%d%% complete)\r'
                      % ((float(collect_count) / existing_count) * 100))
        idx = git.open_idx(idx_name)

        idx_live_count = 0
        for i in xrange(0, len(idx)):
            sha = idx.shatable[i * 20 : (i + 1) * 20]
            if live_objects.exists(sha):
                idx_live_count += 1

        collect_count += idx_live_count

        if idx_live_count == 0:
            if verbosity:
                log('deleting %s\n'
                    % git.repo_rel(basename(idx_name)))
            ns.stale_files.append(idx_name)
            ns.stale_files.append(idx_name[:-3] + 'pack')
            continue

        live_frac = idx_live_count / float(len(idx))
        if live_frac > ((100 - threshold) / 100.0):
            if verbosity:
                log('keeping %s (%d%% live)\n'
                    % (git.repo_rel(basename(idx_name)), live_frac * 100))
            continue

        if verbosity:
            log('rewriting %s (%.2f%% live)\n'
                % (basename(idx_name), live_frac * 100))
        for i in xrange(0, len(idx)):
            sha = idx.shatable[i * 20 : (i + 1) * 20]
            if live_objects.exists(sha):
                item_it = cat_pipe.get(sha.encode('hex'))
                type = item_it.next()
                writer.just_write(sha, type, ''.join(item_it))

        ns.stale_files.append(idx_name)
        ns.stale_files.append(idx_name[:-3] + 'pack')

    if verbosity:
        progress('preserving live data (%d%% complete)\n'
                 % ((float(collect_count) / existing_count) * 100))

    # Nothing should have recreated midx/bloom yet.
    pack_dir = git.repo('objects/pack')
    assert(not os.path.exists(os.path.join(pack_dir, 'bup.bloom')))
    assert(not glob.glob(os.path.join(pack_dir, '*.midx')))

    # try/catch should call writer.abort()?
    # This will finally run midx.
    writer.close()  # Can only change refs (if needed) after this.

    remove_stale_files(None)  # In case we didn't write to the writer.

    if verbosity:
        log('discarded %d%% of objects\n'
            % ((existing_count - count_objects(pack_dir, verbosity))
               / float(existing_count) * 100))
def sweep(live_objects, existing_count, cat_pipe, threshold, compression,
          verbosity):
    # Traverse all the packs, saving the (probably) live data.

    ns = Nonlocal()
    ns.stale_files = []

    def remove_stale_files(new_pack_prefix):
        if verbosity and new_pack_prefix:
            log('created ' + path_msg(basename(new_pack_prefix)) + '\n')
        for p in ns.stale_files:
            if new_pack_prefix and p.startswith(new_pack_prefix):
                continue  # Don't remove the new pack file
            if verbosity:
                log('removing ' + path_msg(basename(p)) + '\n')
            os.unlink(p)
        if ns.stale_files:  # So git cat-pipe will close them
            cat_pipe.restart()
        ns.stale_files = []

    writer = git.PackWriter(objcache_maker=None,
                            compression_level=compression,
                            run_midx=False,
                            on_pack_finish=remove_stale_files)
    try:
        # FIXME: sanity check .idx names vs .pack names?
        collect_count = 0
        for idx_name in glob.glob(os.path.join(git.repo(b'objects/pack'),
                                               b'*.idx')):
            if verbosity:
                qprogress('preserving live data (%d%% complete)\r'
                          % ((float(collect_count) / existing_count) * 100))
            with git.open_idx(idx_name) as idx:
                idx_live_count = 0
                for sha in idx:
                    if live_objects.exists(sha):
                        idx_live_count += 1

                collect_count += idx_live_count

                if idx_live_count == 0:
                    if verbosity:
                        log('deleting %s\n'
                            % path_msg(git.repo_rel(basename(idx_name))))
                    ns.stale_files.append(idx_name)
                    ns.stale_files.append(idx_name[:-3] + b'pack')
                    continue

                live_frac = idx_live_count / float(len(idx))
                if live_frac > ((100 - threshold) / 100.0):
                    if verbosity:
                        log('keeping %s (%d%% live)\n'
                            % (git.repo_rel(basename(idx_name)),
                               live_frac * 100))
                    continue

                if verbosity:
                    log('rewriting %s (%.2f%% live)\n'
                        % (basename(idx_name), live_frac * 100))
                for sha in idx:
                    if live_objects.exists(sha):
                        item_it = cat_pipe.get(hexlify(sha))
                        _, typ, _ = next(item_it)
                        writer.just_write(sha, typ, b''.join(item_it))

                ns.stale_files.append(idx_name)
                ns.stale_files.append(idx_name[:-3] + b'pack')

        if verbosity:
            progress('preserving live data (%d%% complete)\n'
                     % ((float(collect_count) / existing_count) * 100))

        # Nothing should have recreated midx/bloom yet.
        pack_dir = git.repo(b'objects/pack')
        assert(not os.path.exists(os.path.join(pack_dir, b'bup.bloom')))
        assert(not glob.glob(os.path.join(pack_dir, b'*.midx')))
    except BaseException as ex:
        with pending_raise(ex):
            writer.abort()

    # This will finally run midx.
    # Can only change refs (if needed) after this.
    writer.close()

    remove_stale_files(None)  # In case we didn't write to the writer.

    if verbosity:
        log('discarded %d%% of objects\n'
            % ((existing_count - count_objects(pack_dir, verbosity))
               / float(existing_count) * 100))
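# Minimal sketch (hypothetical counts) of the keep/rewrite decision shared
# by all the sweep() variants above: a pack is kept as-is when its live
# fraction exceeds (100 - threshold) percent; otherwise its live objects are
# copied into the new pack and the old .idx/.pack become stale.
def _should_keep_sketch(live_count, total_count, threshold=10):
    live_frac = live_count / float(total_count)
    return live_frac > ((100 - threshold) / 100.0)

assert _should_keep_sketch(95, 100)      # 95% live -> keep
assert not _should_keep_sketch(50, 100)  # 50% live -> rewrite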
def main(argv):
    o = options.Options(optspec)
    opt, flags, extra = o.parse_bytes(argv[1:])
    verbosity = (opt.verbose or 0) if not opt.quiet else -1
    if opt.remote:
        opt.remote = argv_bytes(opt.remote)
    if opt.outdir:
        opt.outdir = argv_bytes(opt.outdir)

    git.check_repo_or_die()

    if not extra:
        o.fatal('must specify at least one filename to restore')

    exclude_rxs = parse_rx_excludes(flags, o.fatal)

    owner_map = {}
    for map_type in ('user', 'group', 'uid', 'gid'):
        owner_map[map_type] = parse_owner_mappings(map_type, flags, o.fatal)

    if opt.outdir:
        mkdirp(opt.outdir)
        os.chdir(opt.outdir)

    repo = RemoteRepo(opt.remote) if opt.remote else LocalRepo()
    top = fsencode(os.getcwd())
    hardlinks = {}
    for path in [argv_bytes(x) for x in extra]:
        if not valid_restore_path(path):
            add_error("path %r doesn't include a branch and revision" % path)
            continue
        try:
            resolved = vfs.resolve(repo, path, want_meta=True, follow=False)
        except vfs.IOError as e:
            add_error(e)
            continue
        if len(resolved) == 3 and resolved[2][0] == b'latest':
            # Follow latest symlink to the actual save
            try:
                resolved = vfs.resolve(repo, b'latest', parent=resolved[:-1],
                                       want_meta=True)
            except vfs.IOError as e:
                add_error(e)
                continue
            # Rename it back to 'latest'
            resolved = tuple(elt if i != 2 else (b'latest',) + elt[1:]
                             for i, elt in enumerate(resolved))
        path_parent, path_name = os.path.split(path)
        leaf_name, leaf_item = resolved[-1]
        if not leaf_item:
            add_error('error: cannot access %r in %r'
                      % (b'/'.join(name for name, item in resolved), path))
            continue
        if not path_name or path_name == b'.':
            # Source is /foo/what/ever/ or /foo/what/ever/. -- extract
            # what/ever/* to the current directory, and if name == '.'
            # (i.e. /foo/what/ever/.), then also restore what/ever's
            # metadata to the current directory.
            treeish = vfs.item_mode(leaf_item)
            if not treeish:
                add_error('%r cannot be restored as a directory' % path)
            else:
                items = vfs.contents(repo, leaf_item, want_meta=True)
                dot, leaf_item = next(items, None)
                assert dot == b'.'
                for sub_name, sub_item in items:
                    restore(repo, b'', sub_name, sub_item, top,
                            opt.sparse, opt.numeric_ids, owner_map,
                            exclude_rxs, verbosity, hardlinks)
                if path_name == b'.':
                    leaf_item = vfs.augment_item_meta(repo, leaf_item,
                                                      include_size=True)
                    apply_metadata(leaf_item.meta, b'.',
                                   opt.numeric_ids, owner_map)
        else:
            restore(repo, b'', leaf_name, leaf_item, top,
                    opt.sparse, opt.numeric_ids, owner_map,
                    exclude_rxs, verbosity, hardlinks)

    if verbosity >= 0:
        progress('Restoring: %d, done.\n' % total_restored)
    die_if_errors()
def main():
    o = options.Options(optspec)
    opt, flags, extra = o.parse(sys.argv[1:])
    verbosity = opt.verbose if not opt.quiet else -1

    git.check_repo_or_die()

    if not extra:
        o.fatal('must specify at least one filename to restore')

    exclude_rxs = parse_rx_excludes(flags, o.fatal)

    owner_map = {}
    for map_type in ('user', 'group', 'uid', 'gid'):
        owner_map[map_type] = parse_owner_mappings(map_type, flags, o.fatal)

    if opt.outdir:
        mkdirp(opt.outdir)
        os.chdir(opt.outdir)

    repo = RemoteRepo(opt.remote) if opt.remote else LocalRepo()
    top = os.getcwd()
    hardlinks = {}
    for path in extra:
        if not valid_restore_path(path):
            add_error("path %r doesn't include a branch and revision" % path)
            continue
        try:
            resolved = vfs2.lresolve(repo, path, want_meta=True)
        except vfs2.IOError as e:
            add_error(e)
            continue
        path_parent, path_name = os.path.split(path)
        leaf_name, leaf_item = resolved[-1]
        if not leaf_item:
            add_error('error: cannot access %r in %r'
                      % ('/'.join(name for name, item in resolved), path))
            continue
        if not path_name or path_name == '.':
            # Source is /foo/what/ever/ or /foo/what/ever/. -- extract
            # what/ever/* to the current directory, and if name == '.'
            # (i.e. /foo/what/ever/.), then also restore what/ever's
            # metadata to the current directory.
            treeish = vfs2.item_mode(leaf_item)
            if not treeish:
                add_error('%r cannot be restored as a directory' % path)
            else:
                items = vfs2.contents(repo, leaf_item, want_meta=True)
                dot, leaf_item = next(items, None)
                assert(dot == '.')
                for sub_name, sub_item in items:
                    restore(repo, '', sub_name, sub_item, top,
                            opt.sparse, opt.numeric_ids, owner_map,
                            exclude_rxs, verbosity, hardlinks)
                if path_name == '.':
                    leaf_item = vfs2.augment_item_meta(repo, leaf_item,
                                                       include_size=True)
                    apply_metadata(leaf_item.meta, '.',
                                   opt.numeric_ids, owner_map)
        else:
            restore(repo, '', leaf_name, leaf_item, top,
                    opt.sparse, opt.numeric_ids, owner_map,
                    exclude_rxs, verbosity, hardlinks)

    if verbosity >= 0:
        progress('Restoring: %d, done.\n' % total_restored)
    die_if_errors()
def save_tree(opt, reader, hlink_db, msr, w):
    # Metadata is stored in a file named .bupm in each directory.  The
    # first metadata entry will be the metadata for the current directory.
    # The remaining entries will be for each of the other directory
    # elements, in the order they're listed in the index.
    #
    # Since the git tree elements are sorted according to
    # git.shalist_item_sort_key, the metalist items are accumulated as
    # (sort_key, metadata) tuples, and then sorted when the .bupm file is
    # created.  The sort_key should have been computed using the element's
    # mangled name and git mode (after hashsplitting), but the code doesn't
    # actually do that; it uses the element's real name and mode instead.
    # This makes things a bit more difficult when reading it back, see
    # vfs.ordered_tree_entries().

    # Maintain a stack of information representing the current location in
    # the archive being constructed.  The current path is recorded in
    # parts, which will be something like
    #   [StackDir(name=''), StackDir(name='home'), StackDir(name='someuser')],
    # and the accumulated content and metadata for files in the dirs is
    # stored in the .items member of the StackDir.

    stack = []

    def _push(part, metadata):
        # Enter a new archive directory -- make it the current directory.
        item = StackDir(part, metadata)
        stack.append(item)

    def _pop(force_tree=None, dir_metadata=None):
        # Leave the current archive directory and add its tree to its parent.
        item = stack.pop()
        # FIXME: only test if collision is possible (i.e. given --strip, etc.)?
        if force_tree:
            tree = force_tree
        else:
            names_seen = set()
            clean_list = []
            for x in item.items:
                name = x.name
                if name in names_seen:
                    parent_path = b'/'.join(x.name for x in stack) + b'/'
                    add_error('error: ignoring duplicate path %s in %s'
                              % (path_msg(name), path_msg(parent_path)))
                else:
                    names_seen.add(name)
                    clean_list.append(x)

            # If set, dir_metadata overrides the original metadata pushed
            # for this dir.
            if dir_metadata is None:
                dir_metadata = item.meta
            metalist = [(b'', dir_metadata)]
            metalist += [(git.shalist_item_sort_key((entry.mode, entry.name,
                                                     None)),
                          entry.meta)
                         for entry in clean_list
                         if entry.mode != GIT_MODE_TREE]
            metalist.sort(key=lambda x: x[0])
            metadata = BytesIO(b''.join(m[1].encode() for m in metalist))
            mode, id = hashsplit.split_to_blob_or_tree(w.new_blob, w.new_tree,
                                                       [metadata],
                                                       keep_boundaries=False)
            shalist = [(mode, b'.bupm', id)]
            shalist += [(entry.gitmode,
                         git.mangle_name(entry.name, entry.mode,
                                         entry.gitmode),
                         entry.oid)
                        for entry in clean_list]
            tree = w.new_tree(shalist)
        if stack:
            stack[-1].append(item.name, GIT_MODE_TREE, GIT_MODE_TREE,
                             tree, None)
        return tree

    # Hack around lack of nonlocal vars in python 2
    _nonlocal = {}
    _nonlocal['count'] = 0
    _nonlocal['subcount'] = 0
    _nonlocal['lastremain'] = None

    def progress_report(n):
        _nonlocal['subcount'] += n
        cc = _nonlocal['count'] + _nonlocal['subcount']
        pct = total and (cc * 100.0 / total) or 0
        now = time.time()
        elapsed = now - tstart
        kps = elapsed and int(cc / 1024. / elapsed)
        kps_frac = 10 ** int(math.log(kps + 1, 10) - 1)
        kps = int(kps / kps_frac) * kps_frac
        if cc:
            remain = elapsed * 1.0 / cc * (total - cc)
        else:
            remain = 0.0
        if (_nonlocal['lastremain'] and (remain > _nonlocal['lastremain'])
              and ((remain - _nonlocal['lastremain'])
                   / _nonlocal['lastremain'] < 0.05)):
            remain = _nonlocal['lastremain']
        else:
            _nonlocal['lastremain'] = remain
        hours = int(remain / 60 / 60)
        mins = int(remain / 60 - hours * 60)
        secs = int(remain - hours * 60 * 60 - mins * 60)
        if elapsed < 30:
            remainstr = ''
            kpsstr = ''
        else:
            kpsstr = '%dk/s' % kps
            if hours:
                remainstr = '%dh%dm' % (hours, mins)
            elif mins:
                remainstr = '%dm%d' % (mins, secs)
            else:
                remainstr = '%ds' % secs
        qprogress('Saving: %.2f%% (%d/%dk, %d/%d files) %s %s\r'
                  % (pct, cc / 1024, total / 1024, fcount, ftotal,
                     remainstr, kpsstr))

    def already_saved(ent):
        return ent.is_valid() and w.exists(ent.sha) and ent.sha

    def wantrecurse_pre(ent):
        return not already_saved(ent)

    def wantrecurse_during(ent):
        return not already_saved(ent) or ent.sha_missing()

    def find_hardlink_target(hlink_db, ent):
        if hlink_db and not stat.S_ISDIR(ent.mode) and ent.nlink > 1:
            link_paths = hlink_db.node_paths(ent.dev, ent.ino)
            if link_paths:
                return link_paths[0]
        return None

    total = ftotal = 0
    if opt.progress:
        for transname, ent in reader.filter(opt.sources,
                                            wantrecurse=wantrecurse_pre):
            if not (ftotal % 10024):
                qprogress('Reading index: %d\r' % ftotal)
            exists = ent.exists()
            hashvalid = already_saved(ent)
            ent.set_sha_missing(not hashvalid)
            if not opt.smaller or ent.size < opt.smaller:
                if exists and not hashvalid:
                    total += ent.size
            ftotal += 1
        progress('Reading index: %d, done.\n' % ftotal)
        hashsplit.progress_callback = progress_report

    # Root collisions occur when strip or graft options map more than one
    # path to the same directory (paths which originally had separate
    # parents).  When that situation is detected, use empty metadata for
    # the parent.  Otherwise, use the metadata for the common parent.
    # Collision example: "bup save ... --strip /foo /foo/bar /bar".
    #
    # FIXME: Add collision tests, or handle collisions some other way.
    # FIXME: Detect/handle strip/graft name collisions (other than root),
    # i.e. if '/foo/bar' and '/bar' both map to '/'.
    first_root = None
    root_collision = None
    tstart = time.time()
    fcount = 0
    lastskip_name = None
    lastdir = b''
    for transname, ent in reader.filter(opt.sources,
                                        wantrecurse=wantrecurse_during):
        (dir, file) = os.path.split(ent.name)
        exists = (ent.flags & index.IX_EXISTS)
        hashvalid = already_saved(ent)
        wasmissing = ent.sha_missing()
        oldsize = ent.size
        if opt.verbose:
            if not exists:
                status = 'D'
            elif not hashvalid:
                if ent.sha == index.EMPTY_SHA:
                    status = 'A'
                else:
                    status = 'M'
            else:
                status = ' '
            if opt.verbose >= 2:
                log('%s %-70s\n' % (status, path_msg(ent.name)))
            elif not stat.S_ISDIR(ent.mode) and lastdir != dir:
                if not lastdir.startswith(dir):
                    log('%s %-70s\n' % (status,
                                        path_msg(os.path.join(dir, b''))))
                lastdir = dir

        if opt.progress:
            progress_report(0)
        fcount += 1

        if not exists:
            continue
        if opt.smaller and ent.size >= opt.smaller:
            if exists and not hashvalid:
                if opt.verbose:
                    log('skipping large file "%s"\n' % path_msg(ent.name))
                lastskip_name = ent.name
            continue

        assert(dir.startswith(b'/'))
        if opt.strip:
            dirp = stripped_path_components(dir, opt.sources)
        elif opt.strip_path:
            dirp = stripped_path_components(dir, [opt.strip_path])
        elif opt.grafts:
            dirp = grafted_path_components(opt.grafts, dir)
        else:
            dirp = path_components(dir)

        # At this point, dirp contains a representation of the archive
        # path that looks like [(archive_dir_name, real_fs_path), ...].
        # So given "bup save ... --strip /foo/bar /foo/bar/baz", dirp
        # might look like this at some point:
        #   [('', '/foo/bar'), ('baz', '/foo/bar/baz'), ...].
        #
        # This dual representation supports stripping/grafting, where the
        # archive path may not have a direct correspondence with the
        # filesystem.  The root directory is represented by an initial
        # component named '', and any component that doesn't have a
        # corresponding filesystem directory (due to grafting, for
        # example) will have a real_fs_path of None, i.e. [('', None),
        # ...].

        if first_root == None:
            first_root = dirp[0]
        elif first_root != dirp[0]:
            root_collision = True

        # If switching to a new sub-tree, finish the current sub-tree.
        while [x.name for x in stack] > [x[0] for x in dirp]:
            _pop()

        # If switching to a new sub-tree, start a new sub-tree.
        for path_component in dirp[len(stack):]:
            dir_name, fs_path = path_component
            # Not indexed, so just grab the FS metadata or use empty metadata.
            try:
                meta = metadata.from_path(fs_path, normalized=True) \
                    if fs_path else metadata.Metadata()
            except (OSError, IOError) as e:
                add_error(e)
                lastskip_name = dir_name
                meta = metadata.Metadata()
            _push(dir_name, meta)

        if not file:
            if len(stack) == 1:
                continue  # We're at the top level -- keep the current root dir
            # Since there's no filename, this is a subdir -- finish it.
            oldtree = already_saved(ent)  # may be None
            newtree = _pop(force_tree=oldtree)
            if not oldtree:
                if lastskip_name and lastskip_name.startswith(ent.name):
                    ent.invalidate()
                else:
                    ent.validate(GIT_MODE_TREE, newtree)
                ent.repack()
            if exists and wasmissing:
                _nonlocal['count'] += oldsize
            continue

        # it's not a directory
        if hashvalid:
            meta = msr.metadata_at(ent.meta_ofs)
            meta.hardlink_target = find_hardlink_target(hlink_db, ent)
            # Restore the times that were cleared to 0 in the metastore.
            (meta.atime, meta.mtime, meta.ctime) = (ent.atime, ent.mtime,
                                                    ent.ctime)
            stack[-1].append(file, ent.mode, ent.gitmode, ent.sha, meta)
        else:
            id = None
            hlink = find_hardlink_target(hlink_db, ent)
            try:
                meta = metadata.from_path(ent.name, hardlink_target=hlink,
                                          normalized=True,
                                          after_stat=after_nondir_metadata_stat)
            except (OSError, IOError) as e:
                add_error(e)
                lastskip_name = ent.name
                continue
            if stat.S_IFMT(ent.mode) != stat.S_IFMT(meta.mode):
                # The mode changed since we indexed the file, this is bad.
                # This can cause two issues:
                # 1) We e.g. think the file is a regular file, but now it's
                #    something else (a device, socket, FIFO or symlink, etc.)
                #    and _read_ from it when we shouldn't.
                # 2) We then record it as valid, but don't update the index
                #    metadata, and on a subsequent save it has 'hashvalid'
                #    but is recorded as the file type from the index, when
                #    the content is something else ...
                # Avoid all of these consistency issues by just skipping such
                # things - it really ought to not happen anyway.
                add_error("%s: mode changed since indexing, skipping."
                          % path_msg(ent.name))
                lastskip_name = ent.name
                continue
            if stat.S_ISREG(ent.mode):
                try:
                    # If the file changes while we're reading it, then our
                    # reading may stop at some point, but the stat() above may
                    # have gotten a different size already.  Recalculate the
                    # meta size so that the repository records the accurate
                    # size in the metadata, even if the other stat() data
                    # might be slightly older than the file content (which we
                    # can't fix, this is inherently racy, but we can prevent
                    # the size mismatch.)
                    meta.size = 0
                    def new_blob(data):
                        meta.size += len(data)
                        return w.new_blob(data)
                    before_saving_regular_file(ent.name)
                    with hashsplit.open_noatime(ent.name) as f:
                        (mode, id) = hashsplit.split_to_blob_or_tree(
                            new_blob, w.new_tree, [f],
                            keep_boundaries=False)
                except (IOError, OSError) as e:
                    add_error('%s: %s' % (ent.name, e))
                    lastskip_name = ent.name
            elif stat.S_ISDIR(ent.mode):
                assert(0)  # handled above
            elif stat.S_ISLNK(ent.mode):
                mode, id = (GIT_MODE_SYMLINK, w.new_blob(meta.symlink_target))
            else:
                # Everything else should be fully described by its
                # metadata, so just record an empty blob, so the paths
                # in the tree and .bupm will match up.
                (mode, id) = (GIT_MODE_FILE, w.new_blob(b''))

            if id:
                ent.validate(mode, id)
                ent.repack()
                stack[-1].append(file, ent.mode, ent.gitmode, id, meta)

        if exists and wasmissing:
            _nonlocal['count'] += oldsize
            _nonlocal['subcount'] = 0

    if opt.progress:
        pct = total and _nonlocal['count'] * 100.0 / total or 100
        progress('Saving: %.2f%% (%d/%dk, %d/%d files), done.    \n'
                 % (pct, _nonlocal['count'] / 1024, total / 1024,
                    fcount, ftotal))

    while len(stack) > 1:  # _pop() all the parts above the root
        _pop()

    # Finish the root directory.
    # When there's a collision, use empty metadata for the root.
    tree = _pop(dir_metadata=metadata.Metadata() if root_collision else None)

    return tree
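# Sketch of the stack-unwinding comparison in save_tree above: Python
# compares lists lexicographically, so the archive-directory stack is popped
# while its names sort after the target path's components, e.g. when leaving
# /home/alice for /home.
def _unwind_sketch():
    stack = [b'', b'home', b'alice']
    target = [b'', b'home']
    while stack > target:
        stack.pop()  # the real code calls _pop() to emit the finished tree
    assert stack == [b'', b'home']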