def localize_css(dom, path, prefix='css'):
    '''
    Find out all css files loaded for this page, replace urls with local
    files.

    dom    -- page source; every FBParser pass returns a dict whose
              'source' entry becomes the new dom.
    path   -- directory of the page; css files live in path/prefix.
    prefix -- name of the css subfolder, also the base of the prefix
              handed to the FBParser passes.

    Returns the dom produced by the two css passes.

    NOTE: this function reads the module-level name `filename`; it is not
    a parameter. `filename.rstrip('html')` removes trailing h/t/m/l
    characters, so 'page.html' becomes 'page.' and the list of css files
    is written to 'page.css_list' inside `path`.
    '''
    # create a subfolder if necessary
    css_dir = os.path.join(path, prefix)
    if not os.path.exists(css_dir):
        os.mkdir(css_dir)

    # pass 1: css referenced from the html itself
    html_pass = FBParser.css.css_in_html(dom, dir=css_dir, prefix=prefix + '/')
    dom = html_pass['source']
    stylesheets = set(html_pass['csses'])

    # pass 2: css referenced from json; the prefix carries a backslash
    # before the slash (presumably to match escaped urls -- TODO confirm)
    json_pass = FBParser.css.css_in_json(dom, dir=css_dir, prefix=prefix + '\/')
    dom = json_pass['source']
    stylesheets.update(json_pass['csses'])

    # localize images in the css files, rewriting each css file in place
    css_images = set()
    for sheet in stylesheets:
        sheet_path = os.path.join(path, sheet)
        sheet_text = FBParser.get_content(sheet_path, encoding='ascii')
        img_pass = FBParser.img.img_in_css(sheet_text, dir=css_dir, prefix='')
        FBParser.save_content(img_pass['source'], sheet_path, encoding='ascii')
        css_images.update(img_pass['images'])
    # css_images is collected but not written anywhere in this version
    # (the 'cssimage_list' save was commented out in the original).

    listing = '\n'.join(stylesheets)
    listing_path = os.path.join(path, filename.rstrip('html') + 'css_list')
    FBParser.save_content(listing, listing_path, encoding='ascii')
    return dom
def localize_img(dom, path, prefix='img'):
    '''
    Find out all images loaded for this page, replace urls with local
    files.

    dom    -- page source handed to FBParser.img.img_in_html.
    path   -- directory of the page; images live in path/prefix.
    prefix -- name of the image subfolder; prefix + '/' is passed on as
              the prefix for the image pass.

    Returns the 'source' entry of the image pass result.

    NOTE: this function reads the module-level name `filename`; it is not
    a parameter. `filename.rstrip('html')` removes trailing h/t/m/l
    characters, so 'page.html' becomes 'page.' and the image list is
    written to 'page.img_list' inside `path`, one entry per line.
    '''
    # create a subfolder if necessary
    img_dir = os.path.join(path, prefix)
    if not os.path.exists(img_dir):
        os.mkdir(img_dir)

    img_pass = FBParser.img.img_in_html(dom, dir=img_dir, prefix=prefix + '/')
    localized = img_pass['source']

    listing = '\n'.join(img_pass['images'])
    listing_path = os.path.join(path, filename.rstrip('html') + 'img_list')
    FBParser.save_content(listing, listing_path, encoding='ascii')
    return localized
def localize_js(dom, path, prefix='js'):
    '''
    Find out all js files loaded for this page, replace urls with local
    files.

    dom    -- page source; every FBParser pass returns a dict whose
              'source' entry becomes the new dom.
    path   -- directory of the page; js files live in path/prefix.
    prefix -- name of the js subfolder, also the base of the prefix
              handed to the FBParser passes.

    Returns the dom produced by the two js passes.

    NOTE: this function reads the module-level name `filename`; it is not
    a parameter. `filename.rstrip('html')` removes trailing h/t/m/l
    characters, so 'page.html' becomes 'page.' and the list of js files
    is written to 'page.js_list' inside `path`.
    '''
    # create a subfolder if necessary
    js_dir = os.path.join(path, prefix)
    if not os.path.exists(js_dir):
        os.mkdir(js_dir)

    # pass 1: scripts referenced from the html itself
    html_pass = FBParser.js.js_in_html(dom, dir=js_dir, prefix=prefix + '/')
    dom = html_pass['source']
    scripts = set(html_pass['javascripts'])

    # pass 2: scripts referenced from json; the prefix carries a backslash
    # before the slash (presumably to match escaped urls -- TODO confirm)
    json_pass = FBParser.js.js_in_json(dom, dir=js_dir, prefix=prefix + '\/')
    dom = json_pass['source']
    scripts.update(json_pass['javascripts'])

    listing = '\n'.join(scripts)
    listing_path = os.path.join(path, filename.rstrip('html') + 'js_list')
    FBParser.save_content(listing, listing_path, mode='w', encoding='ascii')
    return dom
def localize_css(dom, path, prefix='css'):
    '''
    Find out all css files loaded for this page, replace urls with local
    files.

    dom    -- page source; every FBParser pass returns a dict whose
              'source' entry becomes the new dom.
    path   -- directory of the page; css files live in path/prefix.
    prefix -- name of the css subfolder, also the base of the prefix
              handed to the FBParser passes (including the image pass
              run over each css file).

    Returns the dom produced by the two css passes.

    NOTE: this function reads the module-level name `filename`; it is not
    a parameter. `filename.rstrip('html')` removes trailing h/t/m/l
    characters, so 'page.html' becomes 'page.'; two listings are then
    written inside `path`: 'page.cssimage_list' and 'page.css_list'.
    '''
    # create a subfolder if necessary
    css_dir = os.path.join(path, prefix)
    if not os.path.exists(css_dir):
        os.mkdir(css_dir)

    # pass 1: css referenced from the html itself
    html_pass = FBParser.css.css_in_html(dom, dir=css_dir, prefix=prefix + '/')
    dom = html_pass['source']
    stylesheets = set(html_pass['csses'])

    # pass 2: css referenced from json; the prefix carries a backslash
    # before the slash (presumably to match escaped urls -- TODO confirm)
    json_pass = FBParser.css.css_in_json(dom, dir=css_dir, prefix=prefix + '\/')
    dom = json_pass['source']
    stylesheets.update(json_pass['csses'])

    # localize images in the css files, rewriting each css file in place
    css_images = set()
    for sheet in stylesheets:
        sheet_path = os.path.join(path, sheet)
        sheet_text = FBParser.get_content(sheet_path, encoding='ascii')
        img_pass = FBParser.img.img_in_css(
            sheet_text, dir=css_dir, prefix=prefix + '/')
        FBParser.save_content(img_pass['source'], sheet_path, encoding='ascii')
        css_images.update(img_pass['images'])

    # both listings share the same stem derived from the page file name
    stem = filename.rstrip('html')
    FBParser.save_content(
        '\n'.join(css_images),
        os.path.join(path, stem + 'cssimage_list'),
        encoding='ascii')
    FBParser.save_content(
        '\n'.join(stylesheets),
        os.path.join(path, stem + 'css_list'),
        encoding='ascii')
    return dom
def anonym_images(dom, path, filename):
    '''
    Anonymize images and regenerate file names.

    dom      -- page source in which image names are replaced.
    path     -- directory holding the page, its '<stem>img_list' file and
                (after the first run) its '<stem>img_mapping' file.
    filename -- page file name; `filename.rstrip('html')` gives the stem
                (trailing h/t/m/l characters removed, e.g. 'page.html'
                -> 'page.').

    If the mapping file already exists, only the names in dom are
    replaced, using its 'old: new' lines. Otherwise every listed image is
    copied to a new 'anonym_<random>' name next to the original, the
    mapping file is written, and the new copies are passed to
    FBParser.garble_image.garble.

    Returns the dom with image names replaced.
    '''
    stem = filename.rstrip('html')
    list_path = os.path.join(path, stem + 'img_list')
    mapping_path = os.path.join(path, stem + 'img_mapping')

    listing = FBParser.get_content(list_path, encoding='ascii')
    # Drop blank entries: an empty list file splits to [''], and
    # dom.replace('', new) would insert `new` between every character.
    img_files = [name for name in listing.split('\n') if name.strip()]

    if os.path.isfile(mapping_path):
        # already have garbled images, only replace filenames in dom
        mapping_text = FBParser.get_content(mapping_path, encoding='ascii')
        for mapping in mapping_text.split('\n'):
            if not mapping.strip():
                # a blank line (or an empty mapping file) carries no pair
                continue
            img_file, new_file = mapping.split(': ')
            dom = dom.replace(img_file, new_file)
        return dom

    # garbled image not generated yet
    images = {}
    for img_file in img_files:
        if img_file in images:
            # A repeated entry was already renamed in dom; generating a
            # second copy would leave the first one unrecorded/ungarbled.
            continue
        ext = os.path.splitext(img_file)[1]
        prefix = os.path.split(img_file)[0]
        new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
        images[img_file] = new_file
        dom = dom.replace(img_file, new_file)
        FBParser.save_resource(os.path.join(path, img_file), path, new_file)

    st_mapping = [key + ': ' + val for key, val in images.items()]
    FBParser.save_content(
        '\n'.join(st_mapping), mapping_path, encoding='ascii')

    # garble the freshly created copies (addressed relative to `path`)
    if images:
        garble_targets = [os.path.join(path, new_file)
                          for new_file in images.values()]
        FBParser.garble_image.garble(garble_targets)
    return dom
urlretrieve(url, os.path.join(dir, file)) except ValueError, err: err_log.write(url + ' ' + os.path.join(subdir, file) + '\n') err_log.close() # main if __name__ == '__main__': args = get_args() path, filename = os.path.split(args.file) dom = FBParser.get_content(args.file) if args.action == 'pretty': pretty_dom = FBParser.dom.prettify(dom) FBParser.save_content( pretty_dom, os.path.join(path, 'pretty-' + filename)) if args.action == "convert": dom = FBParser.js.remove_cavalry(dom) dom = FBParser.dom.decss_injected(dom) dom = localize_css(dom, path) dom = localize_js(dom, path) dom = localize_img(dom, path) local_dom = localize_misc(dom, path) retry_resource(path) dom_13 = local_dom print "css 1, javascript 3" ret = FBParser.dom.unload_pagelets(dom_13) html_13 = ret['html'] pagelets = ret['pagelets']
except ValueError, err: pass # err_log.write(url + ' ' + os.path.join(subdir, file) + '\n') # err_log.close() # main if __name__ == '__main__': args = get_args() path, filename = os.path.split(args.file) dom = FBParser.get_content(args.file, encoding='latin1') if args.action == 'pretty': pretty_dom = FBParser.dom.prettify(dom) FBParser.save_content( pretty_dom, os.path.join(path, 'pretty-' + filename)) if args.action == "convert": dom = decavalry(dom) dom = FBParser.js.remove_cavalry(dom) dom = FBParser.dom.descript_injected(dom) dom = localize_css(dom, path) dom = localize_js(dom, path) dom = localize_img(dom, path) local_dom = localize_misc(dom, path) retry_resource(path) dom_13 = local_dom ret = FBParser.dom.unload_pagelets(dom_13) pagelets = ret['pagelets'] dom_12 = FBParser.dom.descript_onclick(dom_13)