def test_different_compression():
    """A higher compression level must produce a smaller delta than a lower one."""
    alphabet = (string.ascii_letters + string.digits).encode()
    original = alphabet * 1000
    modified = alphabet * 900 + string.ascii_letters.encode() * 100 + alphabet * 100

    # Encode at max compression and verify the round-trip.
    high_delta = xdelta3.encode(original, modified, xdelta3.Flags.COMPLEVEL_9)
    assert xdelta3.decode(original, high_delta) == modified

    # Encode at minimum compression and verify the round-trip.
    low_delta = xdelta3.encode(original, modified, xdelta3.Flags.COMPLEVEL_1)
    assert xdelta3.decode(original, low_delta) == modified

    # Level 9 should beat level 1 on delta size.
    assert len(high_delta) < len(low_delta)
def write_content_diff(out, in1, in2):
    """Write a gzip-compressed stream of per-content diff records to *out*.

    For each (id, v1, v2) triple produced by iterdiff, a CDIFF_HEAD header is
    written followed by v1 and either an xdelta3 patch (MAGIC_XDELTA3), or the
    raw v2 blob when no delta is found (MAGIC_RAW) or when the produced delta
    fails to round-trip (MAGIC_UGLY).
    """
    remap = {
        iterdiff.PROLOGUE: CONTENT_PROLOGUE,
        iterdiff.EPILOGUE: CONTENT_EPILOGUE,
        CONTENT_PROLOGUE: None,  # fail-fast on struct.pack
        CONTENT_EPILOGUE: None,  # fail-fast on struct.pack
    }
    # Default compresslevel of 9 is okay as gzip costs nothing compared to XML parsing of source.
    # Note, xdelta from b'' may be still smaller than the original blob.
    with gzip.GzipFile(fileobj=out, mode='wb') as gz:
        for content_id, old_blob, new_blob in iterdiff.iter_content_diff(in1, in2):
            content_id = remap.get(content_id, content_id)
            try:
                patch = xdelta3.encode(old_blob, new_blob)
                # FIXME: workaround for bug https://github.com/samuelcolvin/xdelta3-python/issues/2
                try:
                    # Epic Fail if this comparison is ever False.
                    roundtrips = xdelta3.decode(old_blob, patch) == new_blob
                except xdelta3.XDeltaError:
                    roundtrips = False
                magic = MAGIC_XDELTA3 if roundtrips else MAGIC_UGLY
            except xdelta3.NoDeltaFound:
                magic, patch = MAGIC_RAW, None
            payload = patch if magic == MAGIC_XDELTA3 else new_blob
            gz.write(CDIFF_HEAD.pack(magic, content_id, len(old_blob), len(payload)))
            gz.write(old_blob)
            gz.write(payload)
def test_readme():
    """Round-trip the example strings from the README."""
    original = b'wonderful string to demonstrate xdelta3, much of these two strings is the same.'
    changed = b'different string to demonstrate xdelta3, much of these two strings is the same.'
    patch = xdelta3.encode(original, changed)
    rebuilt = xdelta3.decode(original, patch)
    assert rebuilt == changed
def xdelta3dec(source_bytes, patch_bytes):
    """Apply an xdelta3 patch to source bytes and return the reconstructed data.

    Args:
        source_bytes: original data; may be empty/None once the source stream
            is exhausted, in which case everything remaining comes from the patch.
        patch_bytes: the xdelta3 patch to apply. Must be non-empty — every
            write and read must have a patch file written and read.

    Returns:
        The decoded bytes.

    Raises:
        ValueError: if patch_bytes is empty; either the patch file is corrupt
            or there is a bug.
    """
    # If source bytes are finished, everything remaining is patch bytes.
    if not source_bytes:
        source_bytes = b''
    # Raise explicitly instead of `assert`, which is stripped under `python -O`
    # and would let corrupt input through silently.
    if not patch_bytes:
        raise ValueError('empty xdelta3 patch: corrupt patch file or internal bug')
    # These are memoryviews and the python xdelta3 lib doesn't like that...
    return xdelta3.decode(bytes(source_bytes), patch_bytes)
def test_large_decode():
    """Encode/decode round-trip over the large downloaded binary fixtures."""
    here = Path(__file__).parent
    try:
        first = (here / 'b1.bin').read_bytes()
        second = (here / 'b2.bin').read_bytes()
    except FileNotFoundError as e:
        # Fixtures are fetched by a make target, not checked into the repo.
        raise RuntimeError(
            'file required for test not found, run `make download-test-files`'
        ) from e
    patch = xdelta3.encode(first, second)
    rebuilt = xdelta3.decode(first, patch)
    assert second == rebuilt
def read_content_diff(fd):
    """Yield (content_id, v1, v2) records from a stream written by write_content_diff.

    Records carrying the MAGIC_XDELTA3 marker have their second blob decoded
    against the first via xdelta3 before being yielded.

    Raises:
        RuntimeError: on an unknown magic value or a truncated record.
    """
    with gzip.GzipFile(fileobj=fd, mode='rb') as gz:
        while True:
            header = gz.read(CDIFF_HEAD.size)
            if not header:
                break  # clean end of stream
            magic, content_id, len1, len2 = CDIFF_HEAD.unpack(header)
            blob1 = gz.read(len1)
            blob2 = gz.read(len2)
            truncated = len(blob1) != len1 or len(blob2) != len2
            if magic not in MAGIC_LIST or truncated:
                raise RuntimeError('Bad format', magic, content_id, len1, len(blob1), len2, len(blob2))
            if magic == MAGIC_XDELTA3:
                blob2 = xdelta3.decode(blob1, blob2)
            yield content_id, blob1, blob2
curl https://ocw.mit.edu/ans7870/6/6.006/s08/lecturenotes/files/t8.shakespeare.txt > shakespeare.txt
cp shakespeare.txt shakespeare_changed.txt
vim shakespeare_changed.txt
(and make some changes to shakespeare_changed.txt)
python performance.py
"""
from pathlib import Path
from statistics import mean, stdev
from time import time

import xdelta3

# Load the original and hand-edited corpora as raw bytes.
v1 = Path('shakespeare.txt').read_bytes()
v2 = Path('shakespeare_changed.txt').read_bytes()

times = []
for i in range(50):
    start = time()
    # Time a full encode (fastest compression level) + decode round trip.
    delta = xdelta3.encode(v1, v2, xdelta3.Flags.COMPLEVEL_1)
    v22 = xdelta3.decode(v1, delta)
    time_taken = (time() - start) * 1000  # milliseconds
    times.append(time_taken)
    print(f'{i + 1:3} result_match={v2 == v22} time={time_taken:0.1f}ms')

print(f'\noriginal length: {len(v1)}')
print(f'changed length: {len(v2)}')
print(f'delta length: {len(delta)}')
print(
    f'mean time taken to encode and decode: {mean(times):0.3f}ms, stdev {stdev(times):0.3f}ms'
)
def test_decode_error():
    """Decoding with swapped arguments must raise XDeltaError with the known message."""
    with pytest.raises(xdelta3.XDeltaError) as exc_info:
        xdelta3.decode(expected_delta, value_one)
    message = exc_info.value.args[0]
    assert message == 'Error occur executing xdelta3: XD3_INVALID_INPUT'
def test_long_random():
    """Round-trip a long pseudo-random payload wrapped with extra bytes."""
    payload = base64.b32encode(os.urandom(1000))
    wrapped = b'x' + payload + b'x'
    patch = xdelta3.encode(payload, wrapped)
    rebuilt = xdelta3.decode(payload, patch)
    assert wrapped == rebuilt
def test_encode_decode():
    """Encoding yields the known delta; decoding restores the target exactly."""
    patch = xdelta3.encode(value_one, value_two)
    assert patch == expected_delta
    restored = xdelta3.decode(value_one, patch)
    assert restored == value_two
def load_human_data(self, start_idx=None, end_idx=None, ids=None):
    """
    Load human backups and event data for the specified humans.

    Ex : Calling with start_idx=1 and end_idx=4 will load data for the second,
    third and fourth humans.

    Args:
        start_idx (int, optional): Index (starting at 0) of the first human to load.
            If unspecified, loading will start at first human.
        end_idx (int, optional): Index (starting at 0) of the last human to load
            plus one. If unspecified, humans up until the last one will be loaded.
        ids (list, optional): Explicit human id strings (suffix after the last
            ":" is the 1-based human index); when given, these take precedence
            over start_idx/end_idx. Defaults to None (a mutable `[]` default
            would be shared across calls).

    Returns:
        tuple: (human_backups, humans_events) where human_backups maps
        timestamp -> {human name -> rebuilt Human object} and humans_events
        maps human name -> list of event dicts sorted by time.
    """
    if start_idx is None:
        start_idx = 0
    if end_idx is None:
        end_idx = self.get_nb_humans()
    assert start_idx < end_idx
    # If we pass in a specific set of human ids, go get those ones, otherwise batch
    if ids:
        idxs = [int(i.split(":")[-1]) - 1 for i in ids]
    else:
        idxs = range(start_idx, end_idx)
    human_backups = {}
    humans_events = {}
    latest_human_buffers = [None] * self.get_nb_humans()
    print("loading humans from delta buffer...")
    # first, quickly load all the raw data, we'll rebuild the full objects afterwards
    for day_idx in tqdm.tqdm(range(self.get_nb_days())):
        for hour_idx in range(24):
            for idx in idxs:
                # here, we assume the data is always encoded in a delta-since-last format
                if latest_human_buffers[idx] is None or not self.is_delta[day_idx, hour_idx, idx]:
                    # a missing baseline must coincide with a non-delta frame
                    assert not self.is_delta[day_idx, hour_idx, idx]
                    latest_human_buffers[idx] = self.dataset[day_idx, hour_idx, idx]
                    human_buffer = latest_human_buffers[idx]
                else:
                    human_delta = self.dataset[day_idx, hour_idx, idx]
                    human_buffer = xdelta3.decode(latest_human_buffers[idx], human_delta)
                    latest_human_buffers[idx] = human_buffer
                # NOTE(review): pickle.loads on stored buffers — safe only if the
                # delta buffer is trusted local data; confirm it is never remote input.
                human_data = pickle.loads(human_buffer)
                timestamp = human_data.env.timestamp
                human_data.conf = self.conf
                if timestamp not in human_backups:
                    human_backups[timestamp] = {}
                human_backups[timestamp][human_data.name] = human_data
    human_constr_args = [k for k in inspect.getfullargspec(Human.__init__)[0] if k != "self"]
    for timestamp, humans in human_backups.items():
        # time to recreate the (approx) full object w/ its member functions
        for human_name, human_dump in humans.items():
            new_env = Env(human_dump.env.initial_timestamp)
            new_env._now = human_dump.env.now
            human_dump.env = new_env
            human_dump.rng = human_dump.init_seed  # to keep same init construction state
            human_obj = Human(*[getattr(human_dump, k) for k in human_constr_args])
            # override all attributes except the blacklist/dummy ones
            for attr_name in human_obj.__dict__.keys():
                if attr_name != "env" and attr_name not in human_dump.blacklisted_attribs \
                        and attr_name != "known_connections" and attr_name != "intervened_behavior":
                    setattr(human_obj, attr_name, getattr(human_dump, attr_name))
            human_obj.name = human_dump.name
            humans[human_name] = human_obj
        # now, extract human event data
        for human_name, human in humans.items():
            # NOTE(review): this dict is re-created at every timestamp, so only
            # events carried by the latest dump per human survive — presumably
            # later dumps contain the cumulative event log; TODO confirm.
            humans_events[human.name] = {}
            for event in human._events:
                humans_events[human.name][(event["time"], event["event_type"])] = event
            human._events = []
    # finally, ensure events are sorted by timestamp for each human
    for human_id, human_events in humans_events.items():
        events = list(human_events.values())
        events.sort(key=lambda e: e["time"])
        humans_events[human_id] = events
    return human_backups, humans_events