def test1(a, b):
    d = mdiff.textdiff(a, b)
    if not d:
        raise ValueError("empty")
    c = mdiff.patches(a, [d])
    if c != b:
        raise ValueError("bad")
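# Usage sketch (not from the original sources): a minimal round-trip of the
# API exercised by test1 above, assuming mercurial.mdiff is importable and
# that textdiff()/patches() operate on byte strings. Sample inputs are made up.
from mercurial import mdiff

old = b"line one\nline two\n"
new = b"line one\nline 2\n"
delta = mdiff.textdiff(old, new)          # binary bdiff-style delta
assert mdiff.patches(old, [delta]) == new  # applying the delta restores new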
def make_delta(self, history_chain, old_file, new_file, out_file_name):
    """ Make a new binary change blob and write it into out_file_name. """
    if len(history_chain) == 0:
        #print "DOING FULL INSERT"
        return self.make_full_insert(new_file, out_file_name)

    #print "MAKING DELTA"
    in_file = open(new_file, 'rb')
    raw_new = None
    try:
        raw_new = in_file.read()
    finally:
        in_file.close()

    parent = NULL_SHA
    in_old = open(old_file, 'rb')
    try:
        raw_old = in_old.read()
        values = compress(mdiff.textdiff(raw_old, raw_new))
        parent = history_chain[0][0]
        out_file = open(out_file_name, 'wb')
        try:
            if values[0]:
                out_file.write(values[0])
            out_file.write(values[1])
        finally:
            out_file.close()
    finally:
        in_old.close()
    return parent
def diff(self, delta_object):
    def flatten(s):
        return s if isinstance(s, bytes) else bytes(s)
    return textdiff(
        flatten(delta_object.raw_data) if delta_object else b'',
        flatten(self.raw_data))
def unicode_make_patch(old_text, new_text):
    """ Helper wrapper around make_patch() which takes unicode strings. """
    values = compress(mdiff.textdiff(old_text.encode('utf8'),
                                     new_text.encode('utf8')))
    if values[0]:
        return ''.join(values)
    return values[1]
def assert_bdiff_applies(self, a, b):
    d = mdiff.textdiff(a, b)
    c = a
    if d:
        c = mdiff.patches(a, [d])
    self.assertEqual(
        c, b,
        ("bad diff+patch result from\n %r to\n "
         "%r: \nbdiff: %r\npatched: %r" % (a, b, d, c[:200])))
def revdiff(self, rev1, rev2):
    validaterev(rev1)
    validaterev(rev2)

    node1 = self.node(rev1)
    node2 = self.node(rev2)

    return mdiff.textdiff(self.revision(node1, raw=True),
                          self.revision(node2, raw=True))
def revdiff(self, rev1, rev2):
    """return or calculate a delta between two revisions"""
    if rev1 > self.repotiprev and rev2 > self.repotiprev:
        return self.revlog2.revdiff(self.revlog2.rev(self.node(rev1)),
                                    self.revlog2.rev(self.node(rev2)))
    elif rev1 <= self.repotiprev and rev2 <= self.repotiprev:
        return revlog.revlog.revdiff(self, rev1, rev2)

    return mdiff.textdiff(self.revision(self.node(rev1)),
                          self.revision(self.node(rev2)))
def revdiff(self, rev1, rev2):
    """return or calculate a delta between two revisions"""
    if rev1 > self.repotiprev and rev2 > self.repotiprev:
        return self.revlog2.revdiff(
            self.revlog2.rev(self.node(rev1)),
            self.revlog2.rev(self.node(rev2)))
    elif rev1 <= self.repotiprev and rev2 <= self.repotiprev:
        return revlog.revlog.revdiff(self, rev1, rev2)

    return mdiff.textdiff(self.revision(self.node(rev1)),
                          self.revision(self.node(rev2)))
def get_delta(self, src, dst):
    """
    Calculate the delta between two strings.

    :param src: Source string
    :param dst: Destination string
    :return: (<type>, delta)
    """
    delta = textdiff(src, dst)
    if len(delta) >= len(dst):
        return self.T_FILE, dst
    else:
        return self.T_BDIFF, delta
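# Hedged sketch of the size check used by get_delta above, with made-up
# inputs: when the binary delta is at least as large as the new text, storing
# the full text (the T_FILE branch) is the cheaper choice. Assumes textdiff
# here is mercurial's mdiff.textdiff.
from mercurial.mdiff import textdiff

src = b"completely different contents\n"
dst = b"xy\n"
delta = textdiff(src, dst)
store_full = len(delta) >= len(dst)   # True here: the delta cannot beat dst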
def fastdelta(mf, mfgetter, base, changes):
    """Given a base manifest text as an array.array and a list of changes
    relative to that text, compute a delta that can be used by revlog.
    """
    delta = []
    dstart = None
    dend = None
    dline = [""]
    start = 0
    # zero copy representation of base as a buffer
    addbuf = util.buffer(base)

    changes = list(changes)
    if len(changes) < 1000:
        # start with a readonly loop that finds the offset of
        # each line and creates the deltas
        for f, todelete in changes:
            # bs will either be the index of the item or the insert point
            start, end = manifest._msearch(addbuf, f, start)
            if not todelete:
                h, fl = mfgetter(f)
                l = "%s\0%s%s\n" % (f, revlog.hex(h), fl)
            else:
                if start == end:
                    # item we want to delete was not found, error out
                    raise AssertionError(
                        (("failed to remove %s from manifest") % f))
                l = ""
            if dstart is not None and dstart <= start and dend >= start:
                if dend < end:
                    dend = end
                if l:
                    dline.append(l)
            else:
                if dstart is not None:
                    delta.append([dstart, dend, "".join(dline)])
                dstart = start
                dend = end
                dline = [l]

        if dstart is not None:
            delta.append([dstart, dend, "".join(dline)])
        # apply the delta to the base, and get a delta for addrevision
        deltatext, arraytext = manifest._addlistdelta(base, delta)
    else:
        # For large changes, it's much cheaper to just build the text and
        # diff it.
        arraytext = bytearray(mf.text())
        deltatext = mdiff.textdiff(util.buffer(base),
                                   util.buffer(arraytext))

    return arraytext, deltatext
def showdiff(self, a, b):
    bin = mdiff.textdiff(a, b)
    pos = 0
    q = 0
    actions = []
    while pos < len(bin):
        p1, p2, l = struct.unpack(">lll", bin[pos:pos + 12])
        pos += 12
        if p1:
            actions.append(a[q:p1])
        actions.append(diffreplace(p1, p2, a[p1:p2], bin[pos:pos + l]))
        pos += l
        q = p2
    if q < len(a):
        actions.append(a[q:])
    return actions
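# Reference sketch (not from the original sources) of the hunk stream that
# showdiff() walks above: each hunk is a 12-byte big-endian header giving the
# start and end offsets into the old text plus the length of the replacement
# data, followed by that many bytes of new data. Assumes mercurial.mdiff's
# bdiff-style binary delta format.
import struct
from mercurial import mdiff

def iterhunks(delta):
    """Yield (start, end, newdata) triples from a binary bdiff delta."""
    pos = 0
    while pos < len(delta):
        start, end, length = struct.unpack(">lll", delta[pos:pos + 12])
        pos += 12
        yield start, end, delta[pos:pos + length]
        pos += length

for start, end, data in iterhunks(mdiff.textdiff(b"a\nb\nc\n", b"a\nx\nc\n")):
    print(start, end, data)   # expected: 2 4 b'x\n'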
def repackdata(self, ledger, target):
    ui = self.repo.ui
    maxchainlen = ui.configint('packs', 'maxchainlen', 1000)

    byfile = {}
    for entry in ledger.entries.itervalues():
        if entry.datasource:
            byfile.setdefault(entry.filename, {})[entry.node] = entry

    count = 0
    repackprogress = ui.makeprogress(_("repacking data"), unit=self.unit,
                                     total=len(byfile))
    for filename, entries in sorted(byfile.iteritems()):
        repackprogress.update(count)

        ancestors = {}
        nodes = list(node for node in entries)
        nohistory = []
        buildprogress = ui.makeprogress(_("building history"), unit='nodes',
                                        total=len(nodes))
        for i, node in enumerate(nodes):
            if node in ancestors:
                continue
            buildprogress.update(i)
            try:
                ancestors.update(
                    self.fullhistory.getancestors(filename, node,
                                                  known=ancestors))
            except KeyError:
                # Since we're packing data entries, we may not have the
                # corresponding history entries for them. It's not a big
                # deal, but the entries won't be delta'd perfectly.
                nohistory.append(node)
        buildprogress.complete()

        # Order the nodes children first, so we can produce reverse deltas
        orderednodes = list(reversed(self._toposort(ancestors)))
        if len(nohistory) > 0:
            ui.debug('repackdata: %d nodes without history\n' %
                     len(nohistory))
        orderednodes.extend(sorted(nohistory))

        # Filter orderednodes to just the nodes we want to serialize (it
        # currently also has the edge nodes' ancestors).
        orderednodes = list(
            filter(lambda node: node in nodes, orderednodes))

        # Garbage collect old nodes:
        if self.garbagecollect:
            neworderednodes = []
            for node in orderednodes:
                # If the node is old and is not in the keepset, we skip it,
                # and mark as garbage collected
                if ((filename, node) not in self.keepkeys and
                        self.isold(self.repo, filename, node)):
                    entries[node].gced = True
                    continue
                neworderednodes.append(node)
            orderednodes = neworderednodes

        # Compute delta bases for nodes:
        deltabases = {}
        nobase = set()
        referenced = set()
        nodes = set(nodes)
        processprogress = ui.makeprogress(_("processing nodes"),
                                          unit='nodes',
                                          total=len(orderednodes))
        for i, node in enumerate(orderednodes):
            processprogress.update(i)
            # Find delta base
            # TODO: allow delta'ing against most recent descendant instead
            # of immediate child
            deltatuple = deltabases.get(node, None)
            if deltatuple is None:
                deltabase, chainlen = nullid, 0
                deltabases[node] = (nullid, 0)
                nobase.add(node)
            else:
                deltabase, chainlen = deltatuple
                referenced.add(deltabase)

            # Use available ancestor information to inform our delta choices
            ancestorinfo = ancestors.get(node)
            if ancestorinfo:
                p1, p2, linknode, copyfrom = ancestorinfo

                # The presence of copyfrom means we're at a point where the
                # file was copied from elsewhere. So don't attempt to do any
                # deltas with the other file.
                if copyfrom:
                    p1 = nullid

                if chainlen < maxchainlen:
                    # Record this child as the delta base for its parents.
                    # This may be non optimal, since the parents may have
                    # many children, and this will only choose the last one.
                    # TODO: record all children and try all deltas to find
                    # best
                    if p1 != nullid:
                        deltabases[p1] = (node, chainlen + 1)
                    if p2 != nullid:
                        deltabases[p2] = (node, chainlen + 1)

        # experimental config: repack.chainorphansbysize
        if ui.configbool('repack', 'chainorphansbysize'):
            orphans = nobase - referenced
            orderednodes = self._chainorphans(ui, filename, orderednodes,
                                              orphans, deltabases)

        # Compute deltas and write to the pack
        for i, node in enumerate(orderednodes):
            deltabase, chainlen = deltabases[node]
            # Compute delta
            # TODO: Optimize the deltachain fetching. Since we're
            # iterating over the different versions of the file, we may
            # be fetching the same deltachain over and over again.
            if deltabase != nullid:
                deltaentry = self.data.getdelta(filename, node)
                delta, deltabasename, origdeltabase, meta = deltaentry
                size = meta.get(constants.METAKEYSIZE)
                if (deltabasename != filename or origdeltabase != deltabase
                        or size is None):
                    deltabasetext = self.data.get(filename, deltabase)
                    original = self.data.get(filename, node)
                    size = len(original)
                    delta = mdiff.textdiff(deltabasetext, original)
            else:
                delta = self.data.get(filename, node)
                size = len(delta)
                meta = self.data.getmeta(filename, node)

            # TODO: don't use the delta if it's larger than the fulltext
            if constants.METAKEYSIZE not in meta:
                meta[constants.METAKEYSIZE] = size
            target.add(filename, node, deltabase, delta, meta)

            entries[node].datarepacked = True

        processprogress.complete()
        count += 1

    repackprogress.complete()
    target.close(ledger=ledger)
def revdiff(self, node1, node2):
    return mdiff.textdiff(self.revision(node1),
                          self.revision(node2))
def diff(self, delta_object):
    def flatten(s):
        return s if isinstance(s, str) else str(s)
    return textdiff(flatten(delta_object.raw_data) if delta_object else '',
                    flatten(self.raw_data))
def make_patch(old_text, new_text):
    """ Return raw patch bytes which transform old_text into new_text. """
    values = compress(mdiff.textdiff(old_text, new_text))
    if values[0]:
        return ''.join(values)
    return values[1]
def diff(self, other):
    return textdiff(other.data if other else '', self.data)
def diff(self, other):
    return mdiff.textdiff(other.data if other else '', self.data)
def repackdata(self, ledger, target):
    ui = self.repo.ui
    maxchainlen = ui.configint('packs', 'maxchainlen', 1000)

    byfile = {}
    for entry in ledger.entries.itervalues():
        if entry.datasource:
            byfile.setdefault(entry.filename, {})[entry.node] = entry

    count = 0
    for filename, entries in sorted(byfile.iteritems()):
        ui.progress(_("repacking data"), count, unit=self.unit,
                    total=len(byfile))

        ancestors = {}
        nodes = list(node for node in entries.iterkeys())
        nohistory = []
        for i, node in enumerate(nodes):
            if node in ancestors:
                continue
            ui.progress(_("building history"), i, unit='nodes',
                        total=len(nodes))
            try:
                ancestors.update(self.fullhistory.getancestors(filename,
                                                               node,
                                                               known=ancestors))
            except KeyError:
                # Since we're packing data entries, we may not have the
                # corresponding history entries for them. It's not a big
                # deal, but the entries won't be delta'd perfectly.
                nohistory.append(node)
        ui.progress(_("building history"), None)

        # Order the nodes children first, so we can produce reverse deltas
        orderednodes = list(reversed(self._toposort(ancestors)))
        if len(nohistory) > 0:
            ui.debug('repackdata: %d nodes without history\n' %
                     len(nohistory))
        orderednodes.extend(sorted(nohistory))

        # Filter orderednodes to just the nodes we want to serialize (it
        # currently also has the edge nodes' ancestors).
        orderednodes = filter(lambda node: node in nodes, orderednodes)

        # Garbage collect old nodes:
        if self.garbagecollect:
            neworderednodes = []
            for node in orderednodes:
                # If the node is old and is not in the keepset, we skip it,
                # and mark as garbage collected
                if ((filename, node) not in self.keepkeys and
                        self.isold(self.repo, filename, node)):
                    entries[node].gced = True
                    continue
                neworderednodes.append(node)
            orderednodes = neworderednodes

        # Compute delta bases for nodes:
        deltabases = {}
        nobase = set()
        referenced = set()
        nodes = set(nodes)
        for i, node in enumerate(orderednodes):
            ui.progress(_("processing nodes"), i, unit='nodes',
                        total=len(orderednodes))
            # Find delta base
            # TODO: allow delta'ing against most recent descendant instead
            # of immediate child
            deltatuple = deltabases.get(node, None)
            if deltatuple is None:
                deltabase, chainlen = nullid, 0
                deltabases[node] = (nullid, 0)
                nobase.add(node)
            else:
                deltabase, chainlen = deltatuple
                referenced.add(deltabase)

            # Use available ancestor information to inform our delta choices
            ancestorinfo = ancestors.get(node)
            if ancestorinfo:
                p1, p2, linknode, copyfrom = ancestorinfo

                # The presence of copyfrom means we're at a point where the
                # file was copied from elsewhere. So don't attempt to do any
                # deltas with the other file.
                if copyfrom:
                    p1 = nullid

                if chainlen < maxchainlen:
                    # Record this child as the delta base for its parents.
                    # This may be non optimal, since the parents may have
                    # many children, and this will only choose the last one.
                    # TODO: record all children and try all deltas to find
                    # best
                    if p1 != nullid:
                        deltabases[p1] = (node, chainlen + 1)
                    if p2 != nullid:
                        deltabases[p2] = (node, chainlen + 1)

        # experimental config: repack.chainorphansbysize
        if ui.configbool('repack', 'chainorphansbysize', True):
            orphans = nobase - referenced
            orderednodes = self._chainorphans(ui, filename, orderednodes,
                                              orphans, deltabases)

        # Compute deltas and write to the pack
        for i, node in enumerate(orderednodes):
            deltabase, chainlen = deltabases[node]
            # Compute delta
            # TODO: Optimize the deltachain fetching. Since we're
            # iterating over the different versions of the file, we may
            # be fetching the same deltachain over and over again.
            meta = None
            if deltabase != nullid:
                deltaentry = self.data.getdelta(filename, node)
                delta, deltabasename, origdeltabase, meta = deltaentry
                size = meta.get(constants.METAKEYSIZE)
                if (deltabasename != filename or origdeltabase != deltabase
                        or size is None):
                    deltabasetext = self.data.get(filename, deltabase)
                    original = self.data.get(filename, node)
                    size = len(original)
                    delta = mdiff.textdiff(deltabasetext, original)
            else:
                delta = self.data.get(filename, node)
                size = len(delta)
                meta = self.data.getmeta(filename, node)

            # TODO: don't use the delta if it's larger than the fulltext
            if constants.METAKEYSIZE not in meta:
                meta[constants.METAKEYSIZE] = size
            target.add(filename, node, deltabase, delta, meta)

            entries[node].datarepacked = True

        ui.progress(_("processing nodes"), None)
        count += 1

    ui.progress(_("repacking data"), None)
    target.close(ledger=ledger)
def _addrawrevision(
    self,
    node,
    revisiondata,
    transaction,
    linkrev,
    p1,
    p2,
    storedelta=None,
    flags=0,
):
    if self._pathid is None:
        res = self._db.execute(
            'INSERT INTO filepath (path) VALUES (?)', (self._path,)
        )
        self._pathid = res.lastrowid

    # For simplicity, always store a delta against p1.
    # TODO we need a lot more logic here to make behavior reasonable.

    if storedelta:
        deltabase, delta = storedelta

        if isinstance(deltabase, int):
            deltabase = self.node(deltabase)
    else:
        assert revisiondata is not None
        deltabase = p1

        if deltabase == nullid:
            delta = revisiondata
        else:
            delta = mdiff.textdiff(
                self.revision(self.rev(deltabase)), revisiondata
            )

    # File index stores a pointer to its delta and the parent delta.
    # The parent delta is stored via a pointer to the fileindex PK.
    if deltabase == nullid:
        baseid = None
    else:
        baseid = self._revisions[deltabase].rid

    # Deltas are stored with a hash of their content. This allows
    # us to de-duplicate. The table is configured to ignore conflicts
    # and it is faster to just insert and silently noop than to look
    # first.
    deltahash = hashutil.sha1(delta).digest()

    if self._compengine == b'zstd':
        deltablob = self._cctx.compress(delta)
        compression = COMPRESSION_ZSTD
    elif self._compengine == b'zlib':
        deltablob = zlib.compress(delta)
        compression = COMPRESSION_ZLIB
    elif self._compengine == b'none':
        deltablob = delta
        compression = COMPRESSION_NONE
    else:
        raise error.ProgrammingError(
            b'unhandled compression engine: %s' % self._compengine
        )

    # Don't store compressed data if it isn't practical.
    if len(deltablob) >= len(delta):
        deltablob = delta
        compression = COMPRESSION_NONE

    deltaid = insertdelta(self._db, compression, deltahash, deltablob)

    rev = len(self)

    if p1 == nullid:
        p1rev = nullrev
    else:
        p1rev = self._nodetorev[p1]

    if p2 == nullid:
        p2rev = nullrev
    else:
        p2rev = self._nodetorev[p2]

    rid = self._db.execute(
        'INSERT INTO fileindex ('
        ' pathid, revnum, node, p1rev, p2rev, linkrev, flags, '
        ' deltaid, deltabaseid) '
        ' VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
        (
            self._pathid,
            rev,
            node,
            p1rev,
            p2rev,
            linkrev,
            flags,
            deltaid,
            baseid,
        ),
    ).lastrowid

    entry = revisionentry(
        rid=rid,
        rev=rev,
        node=node,
        p1rev=p1rev,
        p2rev=p2rev,
        p1node=p1,
        p2node=p2,
        linkrev=linkrev,
        flags=flags,
    )

    self._nodetorev[node] = rev
    self._revtonode[rev] = node
    self._revisions[node] = entry

    return node
def revdiff(self, node1, node2):
    return mdiff.textdiff(self.revision(node1, raw=True),
                          self.revision(node2, raw=True))
def revdiff(self, node1, node2):
    return mdiff.textdiff(self.rawdata(node1), self.rawdata(node2))
def d():
    for pair in textpairs:
        mdiff.textdiff(*pair)
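# Hedged sketch of how a closure like d() above might be driven outside
# whatever benchmark harness defines it, using timeit and a made-up workload.
# The textpairs contents here are hypothetical.
import timeit
from mercurial import mdiff

textpairs = [(b"a\nb\nc\n", b"a\nx\nc\n")] * 1000

def d():
    for pair in textpairs:
        mdiff.textdiff(*pair)

print(timeit.timeit(d, number=10))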