def localize_css(dom, path, prefix='css'):
    ''' Find out all css files loaded for this page, replace urls with local files. '''
    # create a subfolder if necessary
    css_path = os.path.join(path, prefix)
    if not os.path.exists(css_path):
        os.mkdir(css_path)
    css_set = set()
    ret = FBParser.css.css_in_html(dom, dir=css_path, prefix=prefix + '/')
    dom = ret['source']
    css_set.update(ret['csses'])
    # urls embedded in JSON use escaped slashes, hence the '\/' in the prefix
    ret = FBParser.css.css_in_json(dom, dir=css_path, prefix=prefix + '\/')
    dom = ret['source']
    css_set.update(ret['csses'])
    images = set()
    # localize images in the css files
    for css_file in css_set:
        css = FBParser.get_content(os.path.join(path, css_file), encoding='ascii')
        ret = FBParser.img.img_in_css(css, dir=css_path, prefix=prefix + '/')
        FBParser.save_content(
            ret['source'], os.path.join(path, css_file), encoding='ascii')
        images.update(ret['images'])
    # NOTE: `filename` is the module-level name set in the __main__ block below
    FBParser.save_content(
        '\n'.join(list(images)),
        os.path.join(path, filename.rstrip('html') + 'cssimage_list'),
        encoding='ascii')
    FBParser.save_content(
        '\n'.join(list(css_set)),
        os.path.join(path, filename.rstrip('html') + 'css_list'),
        encoding='ascii')
    return dom
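
# A minimal sketch, not part of the original script: localize_css() above writes
# the <page>cssimage_list and <page>css_list manifests, one relative path per
# line. This hypothetical helper shows how such a manifest could be read back
# using the same FBParser.get_content call the rest of the script uses.
def read_manifest(path, filename, suffix):
    """Return the non-empty lines of <page><suffix> (e.g. 'css_list') as a list."""
    manifest = os.path.join(path, filename.rstrip('html') + suffix)
    if not os.path.isfile(manifest):
        return []
    content = FBParser.get_content(manifest, encoding='ascii')
    return [line for line in content.split('\n') if line]
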
def get_css_selectors(path, filename):
    ''' Get all the selectors from a list of css files.
        Call this AFTER localizing all css resources and css_list is in place. '''
    # first collect css selectors from css files
    selectors = {}
    css_files = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'css_list'),
        encoding='ascii')
    css_files = css_files.split('\n')
    for css_file in css_files:
        # entries found in JSON are recorded with escaped slashes ('css\/...');
        # strip the backslashes to recover the local path
        css_file = css_file.replace('\\', '')
        css = FBParser.get_content(os.path.join(path, css_file), encoding='ascii')
        ret = FBParser.css.selector_index(css)
        for key in ret.keys():
            if key in selectors:
                selectors[key].update(ret[key])
            else:
                selectors[key] = ret[key]
    return selectors
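
# Hedged usage sketch (not in the original script): selector_index() is assumed
# to map each css selector to a set-like value, since the loop above merges
# values with .update(). The merged index can then be queried directly; the
# selector name below is purely illustrative.
#
#     selectors = get_css_selectors(path, filename)
#     if '.profilePic' in selectors:
#         print selectors['.profilePic']
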
def anonym_images(dom, path, filename):
    ''' Anonymize images and regenerate file names. '''
    img_files = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'img_list'),
        encoding='ascii')
    img_files = img_files.split('\n')
    images = {}
    if os.path.isfile(
            os.path.join(path, filename.rstrip('html') + 'img_mapping')):
        # already have garbled images, only replace filenames in dom
        img_mapping = FBParser.get_content(
            os.path.join(path, filename.rstrip('html') + 'img_mapping'),
            encoding='ascii')
        img_mapping = img_mapping.split('\n')
        for mapping in img_mapping:
            img_file, new_file = mapping.split(': ')
            dom = dom.replace(img_file, new_file)
    else:
        # garbled image not generated yet
        for img_file in img_files:
            ext = os.path.splitext(img_file)[1]
            prefix = os.path.split(img_file)[0]
            new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
            images[img_file] = new_file
            dom = dom.replace(img_file, new_file)
            FBParser.save_resource(os.path.join(path, img_file), path, new_file)
        st_mapping = []
        for key, val in images.items():
            st_mapping.append(key + ': ' + val)
        FBParser.save_content(
            '\n'.join(st_mapping),
            os.path.join(path, filename.rstrip('html') + 'img_mapping'),
            encoding='ascii')
        for image in images:
            images[image] = os.path.join(path, images[image])
        FBParser.garble_image.garble(images.values())
    return dom
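
# Hedged sketch (illustrative helper, not part of the original script): parse the
# '<original>: <anonymized>' lines that anonym_images() above writes to the
# <page>img_mapping file back into a dict. The helper name is hypothetical.
def _load_img_mapping(path, filename):
    """Return {original_image_path: anonymized_image_path} from img_mapping."""
    content = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'img_mapping'),
        encoding='ascii')
    mapping = {}
    for line in content.split('\n'):
        if ': ' in line:
            old, new = line.split(': ', 1)
            mapping[old] = new
    return mapping
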
def anonym_images(dom, path, filename):
    ''' Anonymize images and regenerate file names. '''
    img_files = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'img_list'),
        encoding='ascii')
    img_files = img_files.split('\n')
    images = {}
    for img_file in img_files:
        ext = os.path.splitext(img_file)[1]
        prefix = os.path.split(img_file)[0]
        new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
        images[img_file] = new_file
        dom = dom.replace(img_file, new_file)
        FBParser.save_resource(os.path.join(path, img_file), path, new_file)
        os.remove(os.path.join(path, img_file))
    st_mapping = []
    for image in images:
        images[image] = os.path.join(path, images[image])
    FBParser.garble_image.garble(images.values())
    return dom
    f.close()
    os.remove(log)
    for entry in entries:
        url, file = entry.rstrip('\n').split()
        try:
            urlretrieve(url, os.path.join(dir, file))
        except ValueError as err:
            err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
    err_log.close()


# main
if __name__ == '__main__':
    args = get_args()
    path, filename = os.path.split(args.file)
    dom = FBParser.get_content(args.file)
    if args.action == 'pretty':
        pretty_dom = FBParser.dom.prettify(dom)
        FBParser.save_content(
            pretty_dom, os.path.join(path, 'pretty-' + filename))
    if args.action == "convert":
        dom = FBParser.js.remove_cavalry(dom)
        dom = FBParser.dom.decss_injected(dom)
        dom = localize_css(dom, path)
        dom = localize_js(dom, path)
        dom = localize_img(dom, path)
        local_dom = localize_misc(dom, path)
        retry_resource(path)
    os.remove(log)
    for entry in entries:
        url, file = entry.rstrip('\n').split()
        try:
            urlretrieve(url, os.path.join(dir, file))
        except ValueError as err:
            pass
            # err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
    # err_log.close()


# main
if __name__ == '__main__':
    args = get_args()
    path, filename = os.path.split(args.file)
    dom = FBParser.get_content(args.file, encoding='latin1')
    if args.action == 'pretty':
        pretty_dom = FBParser.dom.prettify(dom)
        FBParser.save_content(
            pretty_dom, os.path.join(path, 'pretty-' + filename))
    if args.action == "convert":
        dom = decavalry(dom)
        dom = FBParser.js.remove_cavalry(dom)
        dom = FBParser.dom.descript_injected(dom)
        dom = localize_css(dom, path)
        dom = localize_js(dom, path)
        dom = localize_img(dom, path)
        local_dom = localize_misc(dom, path)
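
# Assumed invocation (hedged): get_args() is defined elsewhere in this script,
# so the exact flag names below are a guess based on args.file and args.action
# used above; only the two action values 'pretty' and 'convert' come from the code.
#
#     python <this script> --action pretty  --file saved_pages/page.html
#     python <this script> --action convert --file saved_pages/page.html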