def convert_archive(archive):
    """Convert an archive into an immutable, install-ordered contents set.

    Walks the archive's members, rewrites symlink-traversing paths to their
    resolved targets, fills in missing directories, and returns an
    :obj:`contents.OrderedContentsSet` ordered dirs-first, then regular files
    in their original archive order, then everything else.

    :param archive: archive object consumable by ``archive_to_fsobj``
        (presumably a tarfile-like object — confirm against callers).
    :return: immutable ``contents.OrderedContentsSet`` of fs objects.
    """
    # regarding the usage of del in this function... bear in mind these sets
    # could easily have 10k -> 100k entries in extreme cases; thus the del
    # usage, explicitly trying to ensure we don't keep refs long term.

    # this one is a bit fun.
    raw = list(archive_to_fsobj(archive))
    # we use the data source as the unique key to get position.
    # NOTE: this assumes each regular file's .data object is unique per entry;
    # duplicates would silently collapse to the last index seen.
    files_ordering = list(enumerate(x for x in raw if x.is_reg))
    files_ordering = {x.data: idx for idx, x in files_ordering}
    t = contents.contentsSet(raw, mutable=True)
    del raw, archive

    # first rewrite affected syms.
    # Fixed-point loop: each pass finds one symlink that has children living
    # under its (unresolved) location, re-homes those children under the
    # resolved target, then restarts iteration — we cannot keep iterating a
    # set we just mutated.  The for/else idiom exits the while loop only when
    # a full pass finds nothing left to rewrite.
    raw_syms = t.links()
    syms = contents.contentsSet(raw_syms)
    while True:
        for x in sorted(syms):
            affected = syms.child_nodes(x.location)
            if not affected:
                continue
            syms.difference_update(affected)
            syms.update(affected.change_offset(x.location, x.resolved_target))
            del affected
            break
        else:
            break

    # swap the original (unrewritten) symlinks for the rewritten ones.
    t.difference_update(raw_syms)
    t.update(syms)
    del raw_syms
    # reverse sort so deeper/later locations are processed first below.
    syms = sorted(syms, reverse=True)
    # ok, syms are correct.  now we get the rest.
    # we shift the readds into a separate list so that we don't reinspect
    # them on later runs; this slightly reduces the working set.
    additions = []
    for x in syms:
        affected = t.child_nodes(x.location)
        if not affected:
            continue
        t.difference_update(affected)
        additions.extend(affected.change_offset(x.location, x.resolved_target))

    t.update(additions)
    t.add_missing_directories()

    # finally... an insane sort.
    # Ordering contract: directories first (mutually ordered by cmp), then
    # non-dir/non-reg entries, then regular files in original archive order
    # (via files_ordering).  NOTE(review): `cmp` is not a Python 3 builtin —
    # presumably imported from a compatibility module; confirm at file top.
    def sort_func(x, y):
        if x.is_dir:
            if not y.is_dir:
                return -1
            return cmp(x, y)
        elif y.is_dir:
            return +1
        elif x.is_reg:
            if y.is_reg:
                return cmp(files_ordering[x.data], files_ordering[y.data])
            return +1
        elif y.is_reg:
            return -1
        return cmp(x, y)

    # sorted_cmp: comparator-based sort helper (not stdlib) — presumably a
    # snakeoil wrapper around functools.cmp_to_key; verify.
    return contents.OrderedContentsSet(sorted_cmp(t, sort_func), mutable=False)