コード例 #1
0
def localize_css(dom, path, prefix='css'):
  '''
  Find out all css files loaded for this page, replace urls with local files.

  The page source is passed through FBParser.css.css_in_html and then
  FBParser.css.css_in_json, both given the subfolder path/prefix as their
  target directory. Every css file they report is run through
  FBParser.img.img_in_css and written back in place. Finally two
  newline-separated list files are saved in path: the css files found
  ('...css_list') and the images found inside them ('...cssimage_list').

  NOTE(review): `filename` is not a parameter. It is read from module scope
  (the script's __main__ block assigns one), so calling this function
  without such a global raises NameError when the list files are written.

  Args:
    dom: page source as a string.
    path: directory of the page; the css subfolder is created inside it.
    prefix: name of the css subfolder, also used as the url prefix.

  Returns:
    The page source as returned by the two FBParser.css passes.
  '''
  # create a subfolder if necessary
  css_path = os.path.join(path, prefix)
  if not os.path.exists(css_path):
    os.mkdir(css_path)

  css_set = set()
  ret = FBParser.css.css_in_html(dom, dir=css_path, prefix=prefix + '/')
  dom = ret['source']
  css_set.update(ret['csses'])
  # The json pass gets a backslash-escaped separator. '\\/' is the same
  # two-character string the old '\/' literal produced, without relying on
  # an invalid escape sequence.
  ret = FBParser.css.css_in_json(dom, dir=css_path, prefix=prefix + '\\/')
  dom = ret['source']
  css_set.update(ret['csses'])

  # localize images in the css files
  # NOTE(review): get_css_selectors strips backslashes from css_list entries
  # before opening them, this loop does not -- confirm what 'csses' holds.
  images = set()
  for css_file in css_set:
    css_file_path = os.path.join(path, css_file)
    css = FBParser.get_content(css_file_path, encoding='ascii')
    ret = FBParser.img.img_in_css(css, dir=css_path, prefix=prefix + '/')
    FBParser.save_content(ret['source'], css_file_path, encoding='ascii')
    images.update(ret['images'])

  # rstrip('html') strips trailing 'h', 't', 'm', 'l' characters (not the
  # literal suffix); kept as is because the readers of these list files
  # build the name the same way.
  stem = filename.rstrip('html')
  # Sets iterate in arbitrary order; sort so the list files are reproducible.
  FBParser.save_content(
    '\n'.join(sorted(images)),
    os.path.join(path, stem + 'cssimage_list'),
    encoding='ascii')
  FBParser.save_content(
    '\n'.join(sorted(css_set)),
    os.path.join(path, stem + 'css_list'),
    encoding='ascii')
  return dom
コード例 #2
0
def get_css_selectors(path, filename):
  '''
  Get all the selectors from a list of css files.
  Call this AFTER localizing all css resources and css_list is in place.

  Args:
    path: directory holding the css_list file; each listed css file is
      resolved relative to it.
    filename: page file name; the list file name is filename with trailing
      'h', 't', 'm', 'l' characters stripped, followed by 'css_list'.

  Returns:
    A dict merging the results of FBParser.css.selector_index over all
    listed css files: the first file to produce a key supplies its value,
    later files update() that value.
  '''
  selectors = {}
  css_list = FBParser.get_content(
    os.path.join(path, filename.rstrip('html') + 'css_list'),
    encoding='ascii')
  for css_file in css_list.split('\n'):
    # presumably left over from json-escaped separators; the files on disk
    # are addressed without backslashes
    css_file = css_file.replace('\\', '')
    # An empty list file or a trailing newline yields a blank entry, which
    # would resolve to the directory itself instead of a css file.
    if not css_file.strip():
      continue
    css = FBParser.get_content(os.path.join(path, css_file), encoding='ascii')
    for key, found in FBParser.css.selector_index(css).items():
      if key in selectors:
        selectors[key].update(found)
      else:
        selectors[key] = found
  return selectors
コード例 #3
0
def anonym_images(dom, path, filename):
  '''
  Anonymize images and regenerate file names.

  Reads the page's 'img_list' file (one image path per line). If an
  'img_mapping' file already exists, only the file names inside dom are
  replaced according to it. Otherwise every listed image gets a new random
  'anonym_<number>' name in the same folder, is saved under that name via
  FBParser.save_resource, the old-to-new mapping is written to
  'img_mapping', and the new files are passed to
  FBParser.garble_image.garble.

  Args:
    dom: page source as a string.
    path: directory containing the page, its list files and its images.
    filename: page file name; list file names are built from filename with
      trailing 'h', 't', 'm', 'l' characters stripped.

  Returns:
    dom with every listed image path replaced by its anonymized path.
  '''
  stem = filename.rstrip('html')
  img_files = FBParser.get_content(
    os.path.join(path, stem + 'img_list'),
    encoding='ascii')
  # Drop blank entries (empty list file, trailing newline): replacing the
  # empty string would insert the new name between every character of dom.
  img_files = [name for name in img_files.split('\n') if name.strip()]

  mapping_path = os.path.join(path, stem + 'img_mapping')
  if os.path.isfile(mapping_path):
    # already have garbled images, only replace filenames in dom
    img_mapping = FBParser.get_content(mapping_path, encoding='ascii')
    for mapping in img_mapping.split('\n'):
      if not mapping.strip():
        continue  # blank line: nothing to unpack
      img_file, new_file = mapping.split(': ')
      if not img_file:
        continue  # entry with no original name; same empty-replace hazard
      dom = dom.replace(img_file, new_file)
    return dom

  # garbled image not generated yet
  images = {}
  for img_file in img_files:
    ext = os.path.splitext(img_file)[1]
    prefix = os.path.split(img_file)[0]
    new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
    images[img_file] = new_file
    dom = dom.replace(img_file, new_file)
    FBParser.save_resource(os.path.join(path, img_file), path, new_file)

  # persist the mapping so later runs take the replace-only branch above
  st_mapping = [key + ': ' + val for key, val in images.items()]
  FBParser.save_content(
    '\n'.join(st_mapping),
    mapping_path,
    encoding='ascii')

  # garble works on the renamed copies, addressed relative to path
  garbled = [os.path.join(path, new_file) for new_file in images.values()]
  FBParser.garble_image.garble(garbled)
  return dom
コード例 #4
0
def anonym_images(dom, path, filename):
  '''
  Anonymize images and regenerate file names.

  Reads the page's 'img_list' file (one image path per line). Every listed
  image gets a new random 'anonym_<number>' name in the same folder, is
  saved under that name via FBParser.save_resource, and the original file
  is deleted. The renamed files are then passed to
  FBParser.garble_image.garble.

  Args:
    dom: page source as a string.
    path: directory containing the page, its list file and its images.
    filename: page file name; the list file name is filename with trailing
      'h', 't', 'm', 'l' characters stripped, followed by 'img_list'.

  Returns:
    dom with every listed image path replaced by its anonymized path.
  '''
  img_files = FBParser.get_content(
    os.path.join(path, filename.rstrip('html') + 'img_list'),
    encoding='ascii')
  images = {}
  for img_file in img_files.split('\n'):
    # Blank entries (empty list file, trailing newline) must be skipped:
    # replacing the empty string would insert the new name between every
    # character of dom, and os.remove would be pointed at the directory.
    if not img_file.strip():
      continue
    ext = os.path.splitext(img_file)[1]
    prefix = os.path.split(img_file)[0]
    new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
    images[img_file] = new_file
    dom = dom.replace(img_file, new_file)
    original = os.path.join(path, img_file)
    FBParser.save_resource(original, path, new_file)
    # only the anonymized copy is kept
    os.remove(original)

  # garble works on the renamed copies, addressed relative to path
  garbled = [os.path.join(path, new_file) for new_file in images.values()]
  FBParser.garble_image.garble(garbled)
  return dom
コード例 #5
0
      f.close()
      os.remove(log)
      for entry in entries:
        url, file = entry.rstrip('\n').split()
        try:
          urlretrieve(url, os.path.join(dir, file))
        except ValueError, err:
          err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
  err_log.close()


# main
if __name__ == '__main__':
  # Script entry point: load the file named on the command line and run the
  # action selected by args.action.
  args = get_args()
  # path and filename become module-level names here.
  path, filename = os.path.split(args.file)
  dom = FBParser.get_content(args.file)

  # 'pretty': write a prettified copy next to the input as 'pretty-<filename>'.
  if args.action == 'pretty':
    pretty_dom = FBParser.dom.prettify(dom)
    FBParser.save_content(
      pretty_dom,
      os.path.join(path, 'pretty-' + filename))

  # 'convert': feed the page through the clean-up and localize_* steps in
  # order, each step receiving the previous step's output.
  if args.action == "convert":
    dom = FBParser.js.remove_cavalry(dom)
    dom = FBParser.dom.decss_injected(dom)
    dom = localize_css(dom, path)
    dom = localize_js(dom, path)
    dom = localize_img(dom, path)
    # NOTE(review): local_dom is not used in the lines visible here --
    # confirm it is saved further down.
    local_dom = localize_misc(dom, path)
    # presumably re-attempts resources that failed earlier -- TODO confirm
    retry_resource(path)
コード例 #6
0
      os.remove(log)
      for entry in entries:
        url, file = entry.rstrip('\n').split()
        try:
          urlretrieve(url, os.path.join(dir, file))
        except ValueError, err:
          pass
          # err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
  # err_log.close()


# main
if __name__ == '__main__':
  # Script entry point: load the file named on the command line and run the
  # action selected by args.action.
  args = get_args()
  # path and filename become module-level names here.
  path, filename = os.path.split(args.file)
  # the input is read with an explicit latin1 encoding
  dom = FBParser.get_content(args.file, encoding='latin1')

  # 'pretty': write a prettified copy next to the input as 'pretty-<filename>'.
  if args.action == 'pretty':
    pretty_dom = FBParser.dom.prettify(dom)
    FBParser.save_content(
      pretty_dom,
      os.path.join(path, 'pretty-' + filename))

  # 'convert': feed the page through the clean-up and localize_* steps in
  # order, each step receiving the previous step's output.
  if args.action == "convert":
    dom = decavalry(dom)
    dom = FBParser.js.remove_cavalry(dom)
    dom = FBParser.dom.descript_injected(dom)
    dom = localize_css(dom, path)
    dom = localize_js(dom, path)
    dom = localize_img(dom, path)
    # NOTE(review): local_dom is not used in the lines visible here --
    # confirm it is saved further down.
    local_dom = localize_misc(dom, path)