Example #1
0
def localize_css(dom, path, prefix='css'):
  '''
  Point every stylesheet reference in the page at a local copy.

  dom    -- page source as a string
  path   -- directory the page lives in; the css subfolder is created here
  prefix -- name of the subfolder (also used as the url prefix in the page)

  Returns the rewritten page source.

  NOTE: reads the module-level name `filename` (it is not a parameter) to
  build the name of the css list file, so that global must be set before
  this function is called.
  '''
  # make sure the target subfolder exists
  target_dir = os.path.join(path, prefix)
  if not os.path.exists(target_dir):
    os.mkdir(target_dir)

  # Two scans over the page: one for html references, one for references
  # inside json, where the slash after the prefix is backslash-escaped.
  scans = (
    (FBParser.css.css_in_html, prefix + '/'),
    (FBParser.css.css_in_json, prefix + '\\/'),
  )
  stylesheets = set()
  for scan, url_prefix in scans:
    found = scan(dom, dir=target_dir, prefix=url_prefix)
    dom = found['source']
    stylesheets.update(found['csses'])

  # rewrite image references inside each collected css file, in place
  css_images = set()
  for sheet in stylesheets:
    sheet_path = os.path.join(path, sheet)
    content = FBParser.get_content(sheet_path, encoding='ascii')
    rewritten = FBParser.img.img_in_css(content, dir=target_dir, prefix='')
    FBParser.save_content(
      rewritten['source'],
      os.path.join(path, sheet),
      encoding='ascii')
    css_images.update(rewritten['images'])
  # The image names are gathered above but this version does not write a
  # 'cssimage_list' file; only the css list is persisted.

  listing = '\n'.join(stylesheets)
  list_file = os.path.join(path, filename.rstrip('html') + 'css_list')
  FBParser.save_content(listing, list_file, encoding='ascii')
  return dom
Example #2
0
def localize_misc(dom, path, prefix='misc'):
  '''
  Localize a few standalone resources: the opensearch description xml,
  the shortcut icon and an iframe html document.

  dom    -- page source as a string
  path   -- directory the page lives in; the misc subfolder is created here
  prefix -- name of the subfolder (also used as the url prefix in the page)

  For each resource the first matching url is saved through
  FBParser.save_resource and every occurrence of that url in the page is
  replaced by `prefix/<local file name>`. A resource that does not occur in
  the page is skipped instead of raising, so all three are optional.

  Finally every remaining `href="http://..."` is redirected to about:blank
  (most of them are hyperlinks), so call this AFTER localize_css.

  Returns the rewritten page source.
  '''
  misc_path = os.path.join(path, prefix)
  if not os.path.exists(misc_path):
    os.mkdir(misc_path)

  # Raw strings keep the regex escapes (\+ and \.) literal.
  patterns = (
    # opensearch description
    r'<link rel="search" type="application/opensearchdescription\+xml" '
    r'href="(?P<url>http://.*?\.xml)" title="Facebook">',
    # favicon
    r'<link rel="shortcut icon" href="(?P<url>http://.*?\.ico)">',
    # iframe document
    r'<iframe src="(?P<url>http://.*?\.html)"',
  )
  for pattern in patterns:
    # search the current dom: earlier replacements are already applied
    match = re.search(pattern, dom)
    if match is None:
      continue
    url = match.group('url')
    local_name = FBParser.url_to_file(url)
    FBParser.save_resource(url, misc_path, local_name)
    dom = dom.replace(url, os.path.join(prefix, local_name))

  # redirect the rest of the hrefs to about:blank (most of them are hyperlinks)
  # (call this after localize_css!)
  dom = re.sub(r'href="http://.+?"', 'href="about:blank"', dom)
  return dom
Example #3
0
def localize_css(dom, path, prefix='css'):
  '''
  Rewrite all stylesheet references of the page to local files and record
  what was collected.

  dom    -- page source as a string
  path   -- directory the page lives in; the css subfolder is created here
  prefix -- name of the subfolder (also used as the url prefix)

  Writes two list files next to the page, one name per line:
  `<stem>cssimage_list` (images referenced from the css files) and
  `<stem>css_list` (the css files themselves).

  Returns the rewritten page source.

  NOTE: `<stem>` is derived from the module-level name `filename`, which is
  not a parameter and must be set before this function is called.
  '''
  # create the subfolder on first use
  folder = os.path.join(path, prefix)
  if not os.path.exists(folder):
    os.mkdir(folder)

  collected = set()

  # references in plain html
  html_pass = FBParser.css.css_in_html(dom, dir=folder, prefix=prefix + '/')
  dom = html_pass['source']
  collected |= set(html_pass['csses'])

  # references embedded in json (slash is backslash-escaped there)
  json_pass = FBParser.css.css_in_json(dom, dir=folder, prefix=prefix + '\\/')
  dom = json_pass['source']
  collected |= set(json_pass['csses'])

  # localize images in the css files, rewriting each file in place
  image_names = set()
  for name in collected:
    location = os.path.join(path, name)
    text = FBParser.get_content(location, encoding='ascii')
    outcome = FBParser.img.img_in_css(text, dir=folder, prefix=prefix + '/')
    FBParser.save_content(
      outcome['source'],
      os.path.join(path, name),
      encoding='ascii')
    image_names.update(outcome['images'])

  # persist both listings
  for names, suffix in ((image_names, 'cssimage_list'), (collected, 'css_list')):
    FBParser.save_content(
      '\n'.join(names),
      os.path.join(path, filename.rstrip('html') + suffix),
      encoding='ascii')
  return dom
Example #4
0
def localize_img(dom, path, prefix='img'):
  '''
  Rewrite all image references of the page to local files.

  dom    -- page source as a string
  path   -- directory the page lives in; the image subfolder is created here
  prefix -- name of the subfolder (also used as the url prefix)

  The collected image names are written, one per line, to `<stem>img_list`
  next to the page. Returns the rewritten page source.

  NOTE: `<stem>` is derived from the module-level name `filename`, which is
  not a parameter and must be set before this function is called.
  '''
  # create the subfolder on first use
  img_dir = os.path.join(path, prefix)
  if not os.path.exists(img_dir):
    os.mkdir(img_dir)

  scan = FBParser.img.img_in_html(dom, dir=img_dir, prefix=prefix + '/')
  localized = scan['source']

  listing = '\n'.join(scan['images'])
  list_file = os.path.join(path, filename.rstrip('html') + 'img_list')
  FBParser.save_content(listing, list_file, encoding='ascii')
  return localized
Example #5
0
def get_css_selectors(path, filename):
  '''
  Get all the selectors from a list of css files.
  Call this AFTER localizing all css resources and css_list is in place.

  path     -- directory holding the page and its `<stem>css_list` file
  filename -- page file name; trailing 'h', 't', 'm', 'l' characters are
              stripped to form `<stem>`

  Returns a dict merging the per-file results of
  FBParser.css.selector_index: values of keys seen in several files are
  combined with `update`.

  Blank lines in css_list (an empty list file, or a trailing newline) are
  skipped; otherwise the directory `path` itself would be read as a css file.
  '''
  selectors = {}
  listing = FBParser.get_content(
    os.path.join(path, filename.rstrip('html') + 'css_list'),
    encoding='ascii')
  for css_file in listing.split('\n'):
    # Drop backslashes from the stored name (presumably left over from the
    # backslash-escaped prefix used for json references -- confirm).
    css_file = css_file.replace('\\', '')
    if not css_file.strip():
      continue
    css = FBParser.get_content(os.path.join(path, css_file), encoding='ascii')
    index = FBParser.css.selector_index(css)
    for key, found in index.items():
      if key in selectors:
        selectors[key].update(found)
      else:
        selectors[key] = found
  return selectors
Example #6
0
def localize_js(dom, path, prefix='js'):
  '''
  Rewrite all javascript references of the page to local files.

  dom    -- page source as a string
  path   -- directory the page lives in; the js subfolder is created here
  prefix -- name of the subfolder (also used as the url prefix)

  The collected script names are written, one per line, to `<stem>js_list`
  next to the page. Returns the rewritten page source.

  NOTE: `<stem>` is derived from the module-level name `filename`, which is
  not a parameter and must be set before this function is called.
  '''
  # create the subfolder on first use
  script_dir = os.path.join(path, prefix)
  if not os.path.exists(script_dir):
    os.mkdir(script_dir)

  # Two scans over the page: html references first, then references inside
  # json, where the slash after the prefix is backslash-escaped.
  scans = (
    (FBParser.js.js_in_html, prefix + '/'),
    (FBParser.js.js_in_json, prefix + '\\/'),
  )
  scripts = set()
  for scan, url_prefix in scans:
    outcome = scan(dom, dir=script_dir, prefix=url_prefix)
    dom = outcome['source']
    scripts.update(outcome['javascripts'])

  listing = '\n'.join(scripts)
  list_file = os.path.join(path, filename.rstrip('html') + 'js_list')
  FBParser.save_content(listing, list_file, mode='w', encoding='ascii')
  return dom
Example #7
0
def anonym_images(dom, path, filename):
  '''
  Anonymize images and regenerate file names.

  dom      -- page source as a string
  path     -- directory holding the page and its `<stem>img_list` file
  filename -- page file name; trailing 'h', 't', 'm', 'l' characters are
              stripped to form `<stem>`

  Every image named in img_list is copied to a randomly named
  `anonym_<number>` file in the same subfolder, the original is removed,
  and all references in the page are switched to the new name. The new
  files are then handed to FBParser.garble_image.garble.

  Blank lines in img_list (an empty list file, or a trailing newline) are
  skipped: replacing the empty string in the page would splice the new name
  between every character of it.

  Returns the rewritten page source.
  '''
  listing = FBParser.get_content(
    os.path.join(path, filename.rstrip('html') + 'img_list'),
    encoding='ascii')
  garbled_paths = []
  for img_file in listing.split('\n'):
    if not img_file.strip():
      continue
    subdir = os.path.split(img_file)[0]
    ext = os.path.splitext(img_file)[1]
    new_file = subdir + '/anonym_' + str(random.getrandbits(40)) + ext
    dom = dom.replace(img_file, new_file)
    # copy under the new name, then drop the original
    old_location = os.path.join(path, img_file)
    FBParser.save_resource(old_location, path, new_file)
    os.remove(old_location)
    garbled_paths.append(os.path.join(path, new_file))
  FBParser.garble_image.garble(garbled_paths)
  return dom
Example #8
0
def anonym_images(dom, path, filename):
  '''
  Anonymize images and regenerate file names.

  dom      -- page source as a string
  path     -- directory holding the page and its list files
  filename -- page file name; trailing 'h', 't', 'm', 'l' characters are
              stripped to form `<stem>`

  If `<stem>img_mapping` already exists, the garbled images are assumed to
  be in place and only the file names in the page are replaced, using the
  `old: new` pairs stored there.

  Otherwise every image named in `<stem>img_list` is copied to a randomly
  named `anonym_<number>` file in the same subfolder, the page is updated,
  the mapping is written to `<stem>img_mapping`, and the new files are
  handed to FBParser.garble_image.garble.

  Blank lines in either file are skipped: a blank img_list entry would
  splice the new name between every character of the page, and a blank
  mapping line cannot be unpacked into a pair.

  Returns the rewritten page source.
  '''
  stem = filename.rstrip('html')
  mapping_file = os.path.join(path, stem + 'img_mapping')

  if os.path.isfile(mapping_file):
    # already have garbled images, only replace filenames in dom
    mapping_text = FBParser.get_content(mapping_file, encoding='ascii')
    for line in mapping_text.split('\n'):
      if not line.strip():
        continue
      img_file, new_file = line.split(': ')
      dom = dom.replace(img_file, new_file)
    return dom

  # garbled images not generated yet; img_list is only needed on this path
  listing = FBParser.get_content(
    os.path.join(path, stem + 'img_list'),
    encoding='ascii')
  mapping_lines = []
  garbled_paths = []
  for img_file in listing.split('\n'):
    if not img_file.strip():
      continue
    subdir = os.path.split(img_file)[0]
    ext = os.path.splitext(img_file)[1]
    new_file = subdir + '/anonym_' + str(random.getrandbits(40)) + ext
    dom = dom.replace(img_file, new_file)
    FBParser.save_resource(os.path.join(path, img_file), path, new_file)
    mapping_lines.append(img_file + ': ' + new_file)
    garbled_paths.append(os.path.join(path, new_file))
  FBParser.save_content(
    '\n'.join(mapping_lines),
    mapping_file,
    encoding='ascii')
  FBParser.garble_image.garble(garbled_paths)
  return dom
Example #9
0
      f.close()
      os.remove(log)
      for entry in entries:
        url, file = entry.rstrip('\n').split()
        try:
          urlretrieve(url, os.path.join(dir, file))
        except ValueError, err:
          err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
  err_log.close()


# main
# Script entry point: load the page named on the command line and run the
# requested action on it.
if __name__ == '__main__':
  args = get_args()
  # `path`, `filename` and `dom` are bound at module level here. localize_*
  # helpers that reference a bare `filename` depend on this binding.
  path, filename = os.path.split(args.file)
  dom = FBParser.get_content(args.file)

  # 'pretty': write a prettified copy next to the input as 'pretty-<name>'
  if args.action == 'pretty':
    pretty_dom = FBParser.dom.prettify(dom)
    FBParser.save_content(
      pretty_dom,
      os.path.join(path, 'pretty-' + filename))

  # 'convert': clean up the page, then localize its resources step by step.
  # Each step takes the dom produced by the previous one, so order matters.
  if args.action == "convert":
    dom = FBParser.js.remove_cavalry(dom)
    dom = FBParser.dom.decss_injected(dom)
    dom = localize_css(dom, path)
    dom = localize_js(dom, path)
    dom = localize_img(dom, path)
    # NOTE(review): local_dom is not used in the visible part of the script
    # -- confirm it is saved somewhere further down.
    local_dom = localize_misc(dom, path)
    retry_resource(path)
Example #10
0
      os.remove(log)
      for entry in entries:
        url, file = entry.rstrip('\n').split()
        try:
          urlretrieve(url, os.path.join(dir, file))
        except ValueError, err:
          pass
          # err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
  # err_log.close()


# main
# Script entry point: load the page named on the command line and run the
# requested action on it.
if __name__ == '__main__':
  args = get_args()
  # `path`, `filename` and `dom` are bound at module level here. localize_*
  # helpers that reference a bare `filename` depend on this binding.
  path, filename = os.path.split(args.file)
  # the input page is read as latin1
  dom = FBParser.get_content(args.file, encoding='latin1')

  # 'pretty': write a prettified copy next to the input as 'pretty-<name>'
  if args.action == 'pretty':
    pretty_dom = FBParser.dom.prettify(dom)
    FBParser.save_content(
      pretty_dom,
      os.path.join(path, 'pretty-' + filename))

  # 'convert': clean up the page, then localize its resources step by step.
  # Each step takes the dom produced by the previous one, so order matters.
  if args.action == "convert":
    dom = decavalry(dom)
    dom = FBParser.js.remove_cavalry(dom)
    dom = FBParser.dom.descript_injected(dom)
    dom = localize_css(dom, path)
    dom = localize_js(dom, path)
    dom = localize_img(dom, path)
    # NOTE(review): local_dom is not used in the visible part of the script
    # -- confirm it is saved somewhere further down.
    local_dom = localize_misc(dom, path)