def best_desc(self, root_id):
    """Return a (cached) VolDesc describing the volume *root_id*.

    Prefers a real filesystem path, built relative to the longest
    non-private mount point whose internal path is a prefix of the
    root's internal path; otherwise falls back to this object's own
    description with is_fs_path=False.
    """
    if root_id in self._best_desc:
        return self._best_desc[root_id]

    intpath = fsdecode(self.root_info[root_id].path)
    matches = [
        mi for mi in self.minfos
        if not mi.private and path_isprefix(mi.internal_path, intpath)]
    if matches:
        # The longest internal path is the most specific mount point.
        best = max(matches, key=lambda mi: len(mi.internal_path))
        base, intbase, is_fs_path = best.mpoint, best.internal_path, True
    else:
        base, intbase, is_fs_path = self.desc, '/', False

    desc = VolDesc(
        os.path.normpath(
            os.path.join(base, os.path.relpath(intpath, intbase))),
        is_fs_path)
    self._best_desc[root_id] = desc
    return desc
def device_info(self):
    """Enumerate btrfs filesystems visible to blkid.

    Returns a dict mapping filesystem UUID to DeviceInfo(label, devices);
    a UUID seen on several devices (btrfs raid) accumulates all of them.
    Also records how many filesystems carry each label in
    self._label_occurs (used to detect ambiguous labels).
    """
    info = {}
    label_counts = Counter()
    out = subprocess.check_output(
        ['blkid', '-s', 'LABEL', '-s', 'UUID', '-t', 'TYPE=btrfs'])
    for line in out.splitlines():
        dev_b, label, uuid_b = BLKID_RE.match(line).groups()
        uuid = UUID(hex=uuid_b.decode('ascii'))
        dev = fsdecode(dev_b)
        if label is not None:
            try:
                label = label.decode('ascii')
            except UnicodeDecodeError:
                # Don't try to guess the encoding; keep the raw bytes.
                pass
        if uuid in info:
            # Same filesystem on another device: btrfs raid.
            assert info[uuid].label == label
            info[uuid].devices.append(dev)
        else:
            label_counts[label] += 1
            info[uuid] = DeviceInfo(label, [dev])
    self._label_occurs = dict(label_counts)
    return info
def dedup_tracked1(sess, tt, ofile_reserved, query, fs):
    """Deduplicate one batch of size-grouped tracked inodes.

    For each commonality group yielded by *query* (inodes sharing a size),
    progressively narrow down candidates: mini-hash -> fiemap hash ->
    full crypto hash, then clone identical files onto a single extent set.
    Progress is reported through *tt*; stale DB records are deleted from
    *sess* and successful dedups are recorded as DedupEvent rows.

    NOTE(review): *ofile_reserved* is a count of file descriptors assumed
    to be needed elsewhere; it pads the RLIMIT_OFILE requirement below.
    """
    space_gain = 0
    ofile_soft, ofile_hard = resource.getrlimit(resource.RLIMIT_OFILE)
    # Hopefully close any files we left around
    gc.collect()

    for comm1 in query:
        size = comm1.size
        tt.update(comm1=comm1)
        # First narrowing pass: bucket inodes by a cheap mini-hash.
        by_mh = defaultdict(list)
        for inode in comm1.inodes:
            # XXX Need to cope with deleted inodes.
            # We cannot find them in the search-new pass, not without doing
            # some tracking of directory modifications to poke updated
            # directories to find removed elements.

            # rehash everytime for now
            # I don't know enough about how inode transaction numbers are
            # updated (as opposed to extent updates) to be able to actually
            # cache the result
            try:
                pathb = inode.vol.live.lookup_one_path(inode)
            except IOError as e:
                if e.errno != errno.ENOENT:
                    raise
                # We have a stale record for a removed inode
                # XXX If an inode number is reused and the second instance
                # is below the size cutoff, we won't update the .size
                # attribute and we won't get an IOError to notify us
                # either.  Inode reuse does happen (with and without
                # inode_cache), so this branch isn't enough to rid us of
                # all stale entries.  We can also get into trouble with
                # regular file inodes being replaced by some other kind of
                # inode.
                sess.delete(inode)
                continue
            with closing(fopenat(inode.vol.live.fd, pathb)) as rfile:
                by_mh[mini_hash_from_file(inode, rfile)].append(inode)

        tt.update(mhash=None)
        for inodes in by_mh.itervalues():
            inode_count = len(inodes)
            if inode_count < 2:
                continue
            # Second narrowing pass: if every candidate already shares the
            # same extent layout (fiemap hash), there is nothing to clone.
            fies = set()
            for inode in inodes:
                try:
                    pathb = inode.vol.live.lookup_one_path(inode)
                except IOError as e:
                    if e.errno != errno.ENOENT:
                        raise
                    # Stale record, same as above.
                    sess.delete(inode)
                    continue
                with closing(fopenat(inode.vol.live.fd, pathb)) as rfile:
                    fies.add(fiemap_hash_from_file(rfile))

            if len(fies) < 2:
                continue

            files = []
            fds = []
            # For description only
            fd_names = {}
            fd_inodes = {}
            by_hash = defaultdict(list)

            # Make sure the open-files rlimit can accommodate this group,
            # raising the soft limit if the hard limit allows it.
            # XXX I have no justification for doubling inode_count
            ofile_req = 2 * inode_count + ofile_reserved
            if ofile_req > ofile_soft:
                if ofile_req <= ofile_hard:
                    resource.setrlimit(
                        resource.RLIMIT_OFILE, (ofile_req, ofile_hard))
                    ofile_soft = ofile_req
                else:
                    # Can't raise the limit enough; skip this whole group.
                    tt.notify(
                        "Too many duplicates (%d at size %d), "
                        "would bring us over the open files limit (%d, %d)."
                        % (inode_count, size, ofile_soft, ofile_hard))
                    for inode in inodes:
                        if inode.has_updates:
                            query.skipped.append(inode)
                    continue

            for inode in inodes:
                # Open everything rw, we can't pick one for the source side
                # yet because the crypto hash might eliminate it.
                # We may also want to defragment the source.
                try:
                    pathb = inode.vol.live.lookup_one_path(inode)
                    path = fsdecode(pathb)
                except IOError as e:
                    if e.errno == errno.ENOENT:
                        # Stale record for a removed inode.
                        sess.delete(inode)
                        continue
                    raise
                try:
                    afile = fopenat_rw(inode.vol.live.fd, pathb)
                except IOError as e:
                    if e.errno == errno.ETXTBSY:
                        # The file contains the image of a running process,
                        # we can't open it in write mode.
                        tt.notify("File %r is busy, skipping" % path)
                    elif e.errno == errno.EACCES:
                        # Could be SELinux or immutability
                        tt.notify("Access denied on %r, skipping" % path)
                    elif e.errno == errno.ENOENT:
                        # The file was moved or unlinked by a racing process
                        tt.notify("File %r may have moved, skipping" % path)
                    else:
                        raise
                    query.skipped.append(inode)
                    continue

                # It's not completely guaranteed we have the right inode,
                # there may still be race conditions at this point.
                # Gets re-checked below (tell and fstat).
                fd = afile.fileno()
                fd_inodes[fd] = inode
                fd_names[fd] = path
                files.append(afile)
                fds.append(fd)

            with ExitStack() as stack:
                for afile in files:
                    stack.enter_context(closing(afile))
                # Enter this context last
                # NOTE(review): presumably so immutability is dropped
                # before the files are closed — confirm against ImmutableFDs.
                immutability = stack.enter_context(ImmutableFDs(fds))

                # Final narrowing pass: bucket by full crypto hash.
                # With a false positive, some kind of cmp pass that compares
                # all files at once might be more efficient that hashing.
                for afile in files:
                    fd = afile.fileno()
                    inode = fd_inodes[fd]
                    if fd in immutability.fds_in_write_use:
                        tt.notify("File %r is in use, skipping" % fd_names[fd])
                        query.skipped.append(inode)
                        continue
                    hasher = hashlib.sha1()
                    for buf in iter(lambda: afile.read(BUFSIZE), b""):
                        hasher.update(buf)

                    # Gets rid of a race condition
                    st = os.fstat(fd)
                    if st.st_ino != inode.ino:
                        query.skipped.append(inode)
                        continue
                    if st.st_dev != inode.vol.live.st_dev:
                        query.skipped.append(inode)
                        continue

                    size1 = afile.tell()
                    if size1 != size:
                        if size1 < inode.vol.size_cutoff:
                            # if we didn't delete this inode, it would cause
                            # spurious comm groups in all future invocations.
                            sess.delete(inode)
                        else:
                            query.skipped.append(inode)
                        continue

                    by_hash[hasher.digest()].append(afile)

                tt.update(fhash=None)
                for fileset in by_hash.itervalues():
                    if len(fileset) < 2:
                        continue
                    # First file of the set becomes the clone source.
                    sfile = fileset[0]
                    sfd = sfile.fileno()
                    sdesc = fd_inodes[sfd].vol.live.describe_path(
                        fd_names[sfd])
                    # Commented out, defragmentation can unshare extents.
                    # It can also disable compression as a side-effect.
                    if False:
                        defragment(sfd)
                    dfiles = fileset[1:]
                    dfiles_successful = []
                    for dfile in dfiles:
                        dfd = dfile.fileno()
                        ddesc = fd_inodes[dfd].vol.live.describe_path(
                            fd_names[dfd])
                        if not cmp_files(sfile, dfile):
                            # Probably a bug since we just used a crypto hash
                            tt.notify("Files differ: %r %r" % (sdesc, ddesc))
                            assert False, (sdesc, ddesc)
                            continue
                        if clone_data(dest=dfd, src=sfd, check_first=True):
                            tt.notify(
                                "Deduplicated:\n- %r\n- %r" % (sdesc, ddesc))
                            dfiles_successful.append(dfile)
                            space_gain += size
                            tt.update(space_gain=space_gain)
                        elif False:
                            # Often happens when there are multiple files with
                            # the same extents, plus one with the same size and
                            # mini-hash but a difference elsewhere.
                            # We hash the same extents multiple times, but
                            # I assume the data is shared in the vfs cache.
                            tt.notify(
                                "Did not deduplicate (same extents): %r %r"
                                % (sdesc, ddesc))
                    if dfiles_successful:
                        # Record the successful dedup (source + dests).
                        evt = DedupEvent(
                            fs=fs.impl, item_size=size, created=system_now())
                        sess.add(evt)
                        for afile in [sfile] + dfiles_successful:
                            inode = fd_inodes[afile.fileno()]
                            evti = DedupEventInode(
                                event=evt, ino=inode.ino, vol=inode.vol)
                            sess.add(evti)
                        sess.commit()

    tt.format(None)