else: head, *tail = path if head in data: child = data[head] if isinstance(child, list): for element in child: self.remove_path(element, tail) else: self.remove_path(child, tail) def rewrite(self, d, *args, file=None, **kwargs): try: type = d['type'] except KeyError: return d if type in self.paths: data = json.loads(json.dumps(d)) paths = self.paths[type] for path in paths: self.remove_path(data, path) return data else: return d print(f'Removing meaningless `id` properties ...') r = JSONIDRemovalRewriter() rewrite_output_files(r, parallel=True) print('Done')
return d else: print(f'failed to rewrite JSON value: {d!r}') raise Exception(f'failed to rewrite JSON value ({kwargs}): {d!r}') if __name__ == '__main__': if len(sys.argv) < 2: cmd = sys.argv[0] print(f''' Usage: {cmd} URI_PREFIX MAP_FILE_NAME Process all json files in the output path (configured with the GETTY_PIPELINE_OUTPUT environment variable), rewriting URIs that have the specified URI_PREFIX to urn:uuid: URIs that are specified in the MAP_FILE_NAME JSON file. '''.lstrip()) sys.exit(1) prefix = sys.argv[1] map_file = sys.argv[2] print(f'Rewriting URIs to UUIDs ...') start_time = time.time() r = UUIDRewriter(prefix, map_file) rewrite_output_files(r, update_filename=True, verify_uuid=True, parallel=True, ignore_errors=True) if map_file: r.persist_map() cur = time.time() elapsed = cur - start_time print(f'Done (%.1fs)' % (elapsed,))
# Command-line entry point: load a JSON rewrite map and apply it to the
# output files through a JSONValueRewriter.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        cmd = sys.argv[0]
        print(f''' Usage: {cmd} URI_REWRITE_MAP.json '''.lstrip())
        sys.exit(1)

    rewrite_map_filename = sys.argv[1]

    # Any arguments after the map file name are forwarded as `files`.
    rewrite_options = {}
    if len(sys.argv) > 2:
        rewrite_options['files'] = sys.argv[2:]

    print(f'Rewriting post-sales URIs ...')
    start_time = time.time()

    with open(rewrite_map_filename, 'r') as map_fh:
        post_sale_rewrite_map = json.load(map_fh)
    # Debugging aid, left disabled:
    # print('Post sales rewrite map:')
    # pprint.pprint(post_sale_rewrite_map)

    rewriter = JSONValueRewriter(post_sale_rewrite_map, prefix=True)

    # When all map keys share a prefix longer than 20 characters, pass a
    # regex matching that literal prefix as `content_filter_re`.
    shared_prefix = os.path.commonprefix([*post_sale_rewrite_map.keys()])
    if len(shared_prefix) > 20:
        rewrite_options['content_filter_re'] = re.compile(re.escape(shared_prefix))

    rewrite_output_files(rewriter, parallel=True, concurrency=8, **rewrite_options)

    print(f'Done (%.1fs)' % (time.time() - start_time))