def localize_css(dom, path, prefix='css'):
    '''
    Find out all css files loaded for this page, replace urls with
    local files.

    dom    -- page source as a string; returned with css urls rewritten.
    path   -- directory where the page and its resource subfolders live.
    prefix -- subfolder name (and url path prefix) for downloaded css.

    NOTE(review): reads the module-level global `filename` (bound in
    the __main__ block) to name the css_list file.  Also note that
    `filename.rstrip('html')` strips *any* trailing 'h'/'t'/'m'/'l'
    characters, not just a '.html' suffix -- confirm page filenames
    never end in those letters before the extension.
    '''
    # create a subfolder if necessary
    css_path = os.path.join(path, prefix)
    if not os.path.exists(css_path):
        os.mkdir(css_path)
    css_set = set()
    # css referenced from plain html markup ...
    ret = FBParser.css.css_in_html(dom, dir=css_path, prefix=prefix + '/')
    dom = ret['source']
    css_set.update(ret['csses'])
    # ... and css referenced from inline json, where '/' appears as '\/'
    ret = FBParser.css.css_in_json(dom, dir=css_path, prefix=prefix + '\/')
    dom = ret['source']
    css_set.update(ret['csses'])
    images = set()
    # localize images in the css files
    for css_file in css_set:
        css = FBParser.get_content(
            os.path.join(path, css_file), encoding='ascii')
        ret = FBParser.img.img_in_css(css, dir=css_path, prefix='')
        FBParser.save_content(
            ret['source'], os.path.join(path, css_file), encoding='ascii')
        images.update(ret['images'])
    # record the list of localized css files next to the page
    FBParser.save_content(
        '\n'.join(list(css_set)),
        os.path.join(path, filename.rstrip('html') + 'css_list'),
        encoding='ascii')
    return dom
def localize_misc(dom, path, prefix='misc'):
    '''
    A few standalone resources to localize, including an xml, an ico
    and an iframe; afterwards redirect every remaining external href
    to about:blank.

    dom    -- page source as a string; returned rewritten.
    path   -- directory where the page and its resource subfolders live.
    prefix -- subfolder name (and url path prefix) for the resources.

    Fixes over the previous revision: regexes are raw strings (the old
    search regex contained a stray in-string line continuation), and
    all three matches are guarded against None -- previously a missing
    opensearch link or favicon raised AttributeError.
    '''
    misc_path = os.path.join(path, prefix)
    if not os.path.exists(misc_path):
        os.mkdir(misc_path)

    def _localize_first(dom, regex):
        # Download the first match (named group 'url') into misc_path
        # and point the page at the local copy; no-op when absent.
        m = regex.search(dom)
        if m is None:
            return dom
        url = m.group('url')
        file = FBParser.url_to_file(url)
        FBParser.save_resource(url, misc_path, file)
        return dom.replace(url, os.path.join(prefix, file))

    # opensearch description xml (whitespace-tolerant between attrs)
    dom = _localize_first(dom, re.compile(
        r'<link rel="search" type="application/opensearchdescription\+xml"'
        r'\s+href="(?P<url>http://.*?\.xml)" title="Facebook">'))
    # shortcut icon
    dom = _localize_first(dom, re.compile(
        r'<link rel="shortcut icon" href="(?P<url>http://.*?\.ico)">'))
    # embedded iframe (may legitimately be absent)
    dom = _localize_first(dom, re.compile(
        r'<iframe src="(?P<url>http://.*?\.html)"'))
    # redirect the rest of the hrefs to about:blank (most of them are
    # hyperlinks); call this after localize_css!
    dom = re.sub('href="http://.+?"', 'href="about:blank"', dom)
    return dom
def localize_css(dom, path, prefix='css'):
    '''
    Download every css file referenced by the page and rewrite the
    page so it loads the local copies instead; images referenced from
    the stylesheets are localized as well.
    '''
    # make sure the css subfolder exists
    stylesheet_dir = os.path.join(path, prefix)
    if not os.path.exists(stylesheet_dir):
        os.mkdir(stylesheet_dir)

    stylesheets = set()
    # stylesheets referenced from html markup ('/' separator) ...
    result = FBParser.css.css_in_html(
        dom, dir=stylesheet_dir, prefix=prefix + '/')
    dom = result['source']
    stylesheets.update(result['csses'])
    # ... and from inline json, where '/' appears escaped as '\/'
    result = FBParser.css.css_in_json(
        dom, dir=stylesheet_dir, prefix=prefix + '\/')
    dom = result['source']
    stylesheets.update(result['csses'])

    # localize the images each downloaded stylesheet refers to
    image_urls = set()
    for sheet in stylesheets:
        sheet_path = os.path.join(path, sheet)
        content = FBParser.get_content(sheet_path, encoding='ascii')
        result = FBParser.img.img_in_css(
            content, dir=stylesheet_dir, prefix=prefix + '/')
        FBParser.save_content(result['source'], sheet_path, encoding='ascii')
        image_urls.update(result['images'])

    # persist the image and stylesheet inventories beside the page
    # (uses the module-level global `filename` set in __main__)
    FBParser.save_content(
        '\n'.join(list(image_urls)),
        os.path.join(path, filename.rstrip('html') + 'cssimage_list'),
        encoding='ascii')
    FBParser.save_content(
        '\n'.join(list(stylesheets)),
        os.path.join(path, filename.rstrip('html') + 'css_list'),
        encoding='ascii')
    return dom
def localize_img(dom, path, prefix='img'):
    '''
    Download every image loaded by the page and point the page at the
    local copies.
    '''
    # make sure the image subfolder exists
    image_dir = os.path.join(path, prefix)
    if not os.path.exists(image_dir):
        os.mkdir(image_dir)
    result = FBParser.img.img_in_html(
        dom, dir=image_dir, prefix=prefix + '/')
    # keep an inventory of the downloaded images beside the page
    # (uses the module-level global `filename` set in __main__)
    FBParser.save_content(
        '\n'.join(list(result['images'])),
        os.path.join(path, filename.rstrip('html') + 'img_list'),
        encoding='ascii')
    return result['source']
def get_css_selectors(path, filename):
    '''
    Collect every selector from the page's css files into one index.
    Call this AFTER localizing all css resources so css_list is in
    place.
    '''
    selectors = {}
    # css_list holds one stylesheet path per line
    listing = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'css_list'),
        encoding='ascii')
    for entry in listing.split('\n'):
        # entries coming from json sources carry escaped slashes
        entry = entry.replace('\\', '')
        stylesheet = FBParser.get_content(
            os.path.join(path, entry), encoding='ascii')
        index = FBParser.css.selector_index(stylesheet)
        # merge this stylesheet's index into the accumulated one
        for selector in index:
            if selector in selectors:
                selectors[selector].update(index[selector])
            else:
                selectors[selector] = index[selector]
    return selectors
def localize_js(dom, path, prefix='js'):
    '''
    Download every js file loaded by the page and rewrite the page to
    use the local copies.
    '''
    # make sure the js subfolder exists
    script_dir = os.path.join(path, prefix)
    if not os.path.exists(script_dir):
        os.mkdir(script_dir)

    scripts = set()
    # scripts referenced from html markup ('/' separator) ...
    result = FBParser.js.js_in_html(
        dom, dir=script_dir, prefix=prefix + '/')
    dom = result['source']
    scripts.update(result['javascripts'])
    # ... and from inline json, where '/' appears escaped as '\/'
    result = FBParser.js.js_in_json(
        dom, dir=script_dir, prefix=prefix + '\/')
    dom = result['source']
    scripts.update(result['javascripts'])

    # keep an inventory of the downloaded scripts beside the page
    # (uses the module-level global `filename` set in __main__)
    FBParser.save_content(
        '\n'.join(list(scripts)),
        os.path.join(path, filename.rstrip('html') + 'js_list'),
        mode='w', encoding='ascii')
    return dom
def anonym_images(dom, path, filename):
    '''
    Anonymize images and regenerate file names: every image listed in
    img_list is copied under a random name, the page is rewritten to
    match, the original file is removed, and the renamed copies are
    garbled.

    dom      -- page source as a string; returned rewritten.
    path     -- directory where the page and its images live.
    filename -- page file name, used to locate img_list.

    (Removed an unused local `st_mapping` left over from an earlier
    revision; no behavior change.)
    '''
    img_files = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'img_list'),
        encoding='ascii')
    img_files = img_files.split('\n')
    images = {}
    for img_file in img_files:
        ext = os.path.splitext(img_file)[1]
        prefix = os.path.split(img_file)[0]
        # 40 random bits keeps collisions unlikely without long names
        new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
        images[img_file] = new_file
        dom = dom.replace(img_file, new_file)
        FBParser.save_resource(os.path.join(path, img_file), path, new_file)
        os.remove(os.path.join(path, img_file))
    # garble the renamed image files in place
    for image in images:
        images[image] = os.path.join(path, images[image])
    FBParser.garble_image.garble(images.values())
    return dom
def anonym_images(dom, path, filename):
    '''
    Anonymize images and regenerate file names.

    Idempotent: when an img_mapping file already exists, the images
    were garbled on a previous run, so only the filenames in the dom
    are rewritten; otherwise new random names are generated, the
    mapping is saved, and the renamed copies are garbled.
    '''
    # full list of localized image files for this page
    img_files = FBParser.get_content(
        os.path.join(path, filename.rstrip('html') + 'img_list'),
        encoding='ascii')
    img_files = img_files.split('\n')
    images = {}
    if os.path.isfile(
            os.path.join(path, filename.rstrip('html') + 'img_mapping')):
        # already have garbled images, only replace filenames in dom
        img_mapping = FBParser.get_content(
            os.path.join(path, filename.rstrip('html') + 'img_mapping'),
            encoding='ascii')
        img_mapping = img_mapping.split('\n')
        for mapping in img_mapping:
            # each mapping line is 'original: anonymized'
            img_file, new_file = mapping.split(': ')
            dom = dom.replace(img_file, new_file)
    else:
        # garbled image not generated yet
        for img_file in img_files:
            ext = os.path.splitext(img_file)[1]
            prefix = os.path.split(img_file)[0]
            # random 40-bit name hides the original file name
            new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
            images[img_file] = new_file
            dom = dom.replace(img_file, new_file)
            FBParser.save_resource(
                os.path.join(path, img_file), path, new_file)
        # persist the original -> anonymized mapping so reruns can
        # skip regeneration (else-scoped: running it on the
        # mapping-exists path would clobber the mapping with '')
        st_mapping = []
        for key, val in images.items():
            st_mapping.append(key + ': ' + val)
        FBParser.save_content(
            '\n'.join(st_mapping),
            os.path.join(path, filename.rstrip('html') + 'img_mapping'),
            encoding='ascii')
        # garble the renamed image contents in place
        for image in images:
            images[image] = os.path.join(path, images[image])
        FBParser.garble_image.garble(images.values())
    return dom
# NOTE(review): the lines down to err_log.close() are the tail of a
# download-retry helper whose `def` (and the f / log / entries /
# err_log / dir / subdir bindings) lies outside this chunk -- confirm
# the enclosing scope against the full file before re-indenting.
f.close()
os.remove(log)
for entry in entries:
    # each retry-log entry is '<url> <local file>'
    url, file = entry.rstrip('\n').split()
    try:
        urlretrieve(url, os.path.join(dir, file))
    except ValueError, err:
        # record failed downloads for later inspection
        err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
err_log.close()


# main
if __name__ == '__main__':
    args = get_args()
    # `filename` is read as a module global by the localize_* helpers
    path, filename = os.path.split(args.file)
    dom = FBParser.get_content(args.file)
    if args.action == 'pretty':
        # only reformat the page for human inspection
        pretty_dom = FBParser.dom.prettify(dom)
        FBParser.save_content(
            pretty_dom, os.path.join(path, 'pretty-' + filename))
    if args.action == "convert":
        # strip facebook loader scaffolding, then localize resources
        # (css first -- localize_misc blanks remaining hrefs after it)
        dom = FBParser.js.remove_cavalry(dom)
        dom = FBParser.dom.decss_injected(dom)
        dom = localize_css(dom, path)
        dom = localize_js(dom, path)
        dom = localize_img(dom, path)
        local_dom = localize_misc(dom, path)
        retry_resource(path)
# NOTE(review): the lines down to the commented err_log.close() are the
# tail of a download-retry helper whose `def` (and the log / entries /
# dir bindings) lies outside this chunk -- confirm the enclosing scope
# against the full file before re-indenting.  Failures are now skipped
# silently (best-effort retry); the error-log lines were disabled.
os.remove(log)
for entry in entries:
    # each retry-log entry is '<url> <local file>'
    url, file = entry.rstrip('\n').split()
    try:
        urlretrieve(url, os.path.join(dir, file))
    except ValueError, err:
        pass
        # err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
# err_log.close()


# main
if __name__ == '__main__':
    args = get_args()
    # `filename` is read as a module global by the localize_* helpers
    path, filename = os.path.split(args.file)
    # page source is read as latin1
    dom = FBParser.get_content(args.file, encoding='latin1')
    if args.action == 'pretty':
        # only reformat the page for human inspection
        pretty_dom = FBParser.dom.prettify(dom)
        FBParser.save_content(
            pretty_dom, os.path.join(path, 'pretty-' + filename))
    if args.action == "convert":
        # strip facebook loader scaffolding, then localize resources
        # (css first -- localize_misc blanks remaining hrefs after it)
        dom = decavalry(dom)
        dom = FBParser.js.remove_cavalry(dom)
        dom = FBParser.dom.descript_injected(dom)
        dom = localize_css(dom, path)
        dom = localize_js(dom, path)
        dom = localize_img(dom, path)
        local_dom = localize_misc(dom, path)