Example #1
0
def localize_css(dom, path, prefix='css'):
  '''
  Point the css references of a page at local files.

  dom    -- page source to scan; the rewritten source is returned
  path   -- directory of the page; the subfolder path/prefix is created
            when missing and handed to the FBParser helpers as `dir`
  prefix -- subfolder name, also used as prefix of the rewritten urls

  Every css file reported by the scanners is re-read, passed through
  FBParser.img.img_in_css and written back.  The names of the css files
  are saved, one per line, in a '<page>css_list' file inside path; that
  name is built from the module level `filename`, which is not a
  parameter of this function.
  '''
  target_dir = os.path.join(path, prefix)
  if not os.path.exists(target_dir):
    os.mkdir(target_dir)

  # first pass: css referenced from the html markup
  found = FBParser.css.css_in_html(dom, dir=target_dir, prefix=prefix + '/')
  dom = found['source']
  stylesheets = set(found['csses'])

  # second pass: css referenced from json, where the slash is written '\/'
  found = FBParser.css.css_in_json(dom, dir=target_dir, prefix=prefix + '\/')
  dom = found['source']
  stylesheets.update(found['csses'])

  # rewrite the image references inside each css file, in place
  css_images = set()
  for name in stylesheets:
    css_file_path = os.path.join(path, name)
    text = FBParser.get_content(css_file_path, encoding='ascii')
    found = FBParser.img.img_in_css(text, dir=target_dir, prefix='')
    FBParser.save_content(found['source'], css_file_path, encoding='ascii')
    css_images.update(found['images'])
  # css_images is collected but not written anywhere: saving it as
  # '<page>cssimage_list' is currently disabled.

  listing = '\n'.join(stylesheets)
  # NOTE: rstrip('html') removes trailing 'h', 't', 'm', 'l' characters,
  # not the literal suffix 'html'.
  list_path = os.path.join(path, filename.rstrip('html') + 'css_list')
  FBParser.save_content(listing, list_path, encoding='ascii')
  return dom
Example #2
0
def localize_img(dom, path, prefix='img'):
  '''
  Point the image references of a page at local files.

  dom    -- page source to scan; the rewritten source is returned
  path   -- directory of the page; the subfolder path/prefix is created
            when missing and handed to FBParser.img.img_in_html as `dir`
  prefix -- subfolder name, also used as prefix of the rewritten urls

  The images reported by the scanner are saved, one per line, in a
  '<page>img_list' file inside path; that name is built from the module
  level `filename`, which is not a parameter of this function.
  '''
  target_dir = os.path.join(path, prefix)
  if not os.path.exists(target_dir):
    os.mkdir(target_dir)

  scanned = FBParser.img.img_in_html(dom, dir=target_dir, prefix=prefix + '/')
  dom = scanned['source']

  listing = '\n'.join(scanned['images'])
  # NOTE: rstrip('html') removes trailing 'h', 't', 'm', 'l' characters,
  # not the literal suffix 'html'.
  list_path = os.path.join(path, filename.rstrip('html') + 'img_list')
  FBParser.save_content(listing, list_path, encoding='ascii')
  return dom
Example #3
0
def localize_js(dom, path, prefix='js'):
  '''
  Point the javascript references of a page at local files.

  dom    -- page source to scan; the rewritten source is returned
  path   -- directory of the page; the subfolder path/prefix is created
            when missing and handed to the FBParser helpers as `dir`
  prefix -- subfolder name, also used as prefix of the rewritten urls

  The scripts reported by both scanners are saved, one per line, in a
  '<page>js_list' file inside path; that name is built from the module
  level `filename`, which is not a parameter of this function.
  '''
  target_dir = os.path.join(path, prefix)
  if not os.path.exists(target_dir):
    os.mkdir(target_dir)

  # first pass: scripts referenced from the html markup
  found = FBParser.js.js_in_html(dom, dir=target_dir, prefix=prefix + '/')
  dom = found['source']
  scripts = set(found['javascripts'])

  # second pass: scripts referenced from json, where the slash is written '\/'
  found = FBParser.js.js_in_json(dom, dir=target_dir, prefix=prefix + '\/')
  dom = found['source']
  scripts.update(found['javascripts'])

  listing = '\n'.join(scripts)
  # NOTE: rstrip('html') removes trailing 'h', 't', 'm', 'l' characters,
  # not the literal suffix 'html'.
  list_path = os.path.join(path, filename.rstrip('html') + 'js_list')
  FBParser.save_content(listing, list_path, mode='w', encoding='ascii')
  return dom
Example #4
0
def localize_css(dom, path, prefix='css'):
  '''
  Point the css references of a page at local files.

  dom    -- page source to scan; the rewritten source is returned
  path   -- directory of the page; the subfolder path/prefix is created
            when missing and handed to the FBParser helpers as `dir`
  prefix -- subfolder name, also used as prefix of the rewritten urls

  Every css file reported by the scanners is re-read, passed through
  FBParser.img.img_in_css and written back.  Two listings are saved in
  path, one entry per line: '<page>cssimage_list' with the images
  reported for the css files and '<page>css_list' with the css files
  themselves.  Both names are built from the module level `filename`,
  which is not a parameter of this function.
  '''
  target_dir = os.path.join(path, prefix)
  if not os.path.exists(target_dir):
    os.mkdir(target_dir)

  # first pass: css referenced from the html markup
  found = FBParser.css.css_in_html(dom, dir=target_dir, prefix=prefix + '/')
  dom = found['source']
  stylesheets = set(found['csses'])

  # second pass: css referenced from json, where the slash is written '\/'
  found = FBParser.css.css_in_json(dom, dir=target_dir, prefix=prefix + '\/')
  dom = found['source']
  stylesheets.update(found['csses'])

  # rewrite the image references inside each css file, in place
  css_images = set()
  for name in stylesheets:
    css_file_path = os.path.join(path, name)
    text = FBParser.get_content(css_file_path, encoding='ascii')
    found = FBParser.img.img_in_css(text, dir=target_dir, prefix=prefix + '/')
    FBParser.save_content(found['source'], css_file_path, encoding='ascii')
    css_images.update(found['images'])

  # NOTE: rstrip('html') removes trailing 'h', 't', 'm', 'l' characters,
  # not the literal suffix 'html'.
  image_listing = '\n'.join(css_images)
  image_list_path = os.path.join(
    path, filename.rstrip('html') + 'cssimage_list')
  FBParser.save_content(image_listing, image_list_path, encoding='ascii')

  css_listing = '\n'.join(stylesheets)
  css_list_path = os.path.join(path, filename.rstrip('html') + 'css_list')
  FBParser.save_content(css_listing, css_list_path, encoding='ascii')
  return dom
Example #5
0
def anonym_images(dom, path, filename):
  '''
  Anonymize images and regenerate file names.

  dom      -- page source whose image references are rewritten
  path     -- directory holding the page, its '<page>img_list' file and
              (after the first run) its '<page>img_mapping' file
  filename -- page file name; trailing 'h', 't', 'm', 'l' characters are
              removed with str.rstrip to build the names of those files

  First run (no mapping file yet): every entry of the image list gets a
  new name 'anonym_<random number>' with the same extension in the same
  subfolder, the reference in dom is replaced, the pair is handed to
  FBParser.save_resource, all pairs are written to the mapping file as
  'old: new' lines and the new files are passed to
  FBParser.garble_image.garble.

  Later runs (mapping file present): only the references in dom are
  replaced, using the recorded pairs.

  Blank lines in either file are ignored.  Returns the rewritten dom.
  '''
  stem = filename.rstrip('html')
  listing = FBParser.get_content(
    os.path.join(path, stem + 'img_list'),
    encoding='ascii')
  # An empty list file (or a trailing newline) yields '' entries; these
  # must be dropped because dom.replace('', new) would insert the new
  # name between every character of dom.
  img_files = [name for name in listing.split('\n') if name.strip()]

  mapping_path = os.path.join(path, stem + 'img_mapping')
  if os.path.isfile(mapping_path):
    # already have garbled images, only replace filenames in dom
    recorded = FBParser.get_content(mapping_path, encoding='ascii')
    for mapping in recorded.split('\n'):
      if not mapping.strip():
        # an empty mapping file is what a run without images leaves behind
        continue
      img_file, new_file = mapping.split(': ')
      dom = dom.replace(img_file, new_file)
  else:  # garbled image not generated yet
    images = {}
    for img_file in img_files:
      ext = os.path.splitext(img_file)[1]
      prefix = os.path.split(img_file)[0]
      new_file = prefix + '/anonym_' + str(random.getrandbits(40)) + ext
      images[img_file] = new_file
      dom = dom.replace(img_file, new_file)
      FBParser.save_resource(os.path.join(path, img_file), path, new_file)
    FBParser.save_content(
      '\n'.join(key + ': ' + val for key, val in images.items()),
      mapping_path,
      encoding='ascii')
    # garble works on the new files, addressed relative to path
    FBParser.garble_image.garble(
      [os.path.join(path, new_file) for new_file in images.values()])
  return dom
Example #6
0
          urlretrieve(url, os.path.join(dir, file))
        except ValueError, err:
          err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
  err_log.close()


# main
if __name__ == '__main__':
  # Script entry point: read the page given on the command line and run
  # the action selected by args.action on it.
  args = get_args()
  # `path` and `filename` are bound at module level here.
  # NOTE(review): the localize_* helpers are called without a file name;
  # presumably they read this module-level `filename` -- confirm.
  path, filename = os.path.split(args.file)
  dom = FBParser.get_content(args.file)

  # 'pretty': save a prettified copy next to the input as 'pretty-<filename>'
  if args.action == 'pretty':
    pretty_dom = FBParser.dom.prettify(dom)
    FBParser.save_content(
      pretty_dom,
      os.path.join(path, 'pretty-' + filename))

  # 'convert': each step takes the dom produced by the previous one, so
  # the order of the calls below matters.
  if args.action == "convert":
    dom = FBParser.js.remove_cavalry(dom)
    dom = FBParser.dom.decss_injected(dom)
    dom = localize_css(dom, path)
    dom = localize_js(dom, path)
    dom = localize_img(dom, path)
    local_dom = localize_misc(dom, path)
    retry_resource(path)
    # NOTE(review): the '13' suffix presumably matches the
    # "css 1, javascript 3" label printed below -- confirm its meaning.
    dom_13 = local_dom
    # Python 2 print statement
    print "css 1, javascript 3"
    ret = FBParser.dom.unload_pagelets(dom_13)
    html_13 = ret['html']
    pagelets = ret['pagelets']
Example #7
0
        except ValueError, err:
          pass
          # err_log.write(url + ' ' + os.path.join(subdir, file) + '\n')
  # err_log.close()


# main
if __name__ == '__main__':
  # Script entry point: read the page given on the command line and run
  # the action selected by args.action on it.
  args = get_args()
  # `path` and `filename` are bound at module level here.
  # NOTE(review): the localize_* helpers are called without a file name;
  # presumably they read this module-level `filename` -- confirm.
  path, filename = os.path.split(args.file)
  # the page is read with an explicit latin1 encoding
  dom = FBParser.get_content(args.file, encoding='latin1')

  # 'pretty': save a prettified copy next to the input as 'pretty-<filename>'
  if args.action == 'pretty':
    pretty_dom = FBParser.dom.prettify(dom)
    FBParser.save_content(
      pretty_dom,
      os.path.join(path, 'pretty-' + filename))

  # 'convert': each step takes the dom produced by the previous one, so
  # the order of the calls below matters.
  if args.action == "convert":
    dom = decavalry(dom)
    dom = FBParser.js.remove_cavalry(dom)
    dom = FBParser.dom.descript_injected(dom)
    dom = localize_css(dom, path)
    dom = localize_js(dom, path)
    dom = localize_img(dom, path)
    local_dom = localize_misc(dom, path)
    retry_resource(path)
    # NOTE(review): meaning of the '13' / '12' suffixes is not visible
    # here -- confirm against the rest of the script.
    dom_13 = local_dom
    ret = FBParser.dom.unload_pagelets(dom_13)
    pagelets = ret['pagelets']
    # dom_12 is derived from dom_13, not from the unload_pagelets result
    dom_12 = FBParser.dom.descript_onclick(dom_13)