def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
    """Fetch and cache a page.

    Args:
        key_name: Hash to use to store the cached page.
        base_url: The hostname of the page that's being mirrored.
        translated_address: The URL of the mirrored page on this site.
        mirrored_url: The URL of the original page. Hostname should match
            the base_url.

    Returns:
        A new MirroredContent object, if the page was successfully retrieved.
        None if any errors occurred or the content could not be retrieved.
    """
    # Refuse to mirror one of our own hosts, which would recurse forever.
    if base_url in MIRROR_HOSTS:
        logging.warning('Encountered recursive request for "%s"; ignoring',
                        mirrored_url)
        return None

    logging.debug("Fetching '%s'", mirrored_url)
    try:
        response = urlfetch.fetch(mirrored_url)
    except (urlfetch.Error, apiproxy_errors.Error):
        logging.exception("Could not fetch URL")
        return None

    # Lower-case every header name and drop the ones we never mirror.
    adjusted_headers = dict(
        (key.lower(), value)
        for key, value in response.headers.iteritems()
        if key.lower() not in IGNORE_HEADERS)

    content = response.content
    page_content_type = adjusted_headers.get("content-type", "")
    # startswith() because the header may carry a '; charset=UTF-8' suffix.
    if page_content_type.startswith(tuple(TRANSFORMED_CONTENT_TYPES)):
        content = transform_content.TransformContent(
            base_url, mirrored_url, content)

    # Cap the cached payload at the memcache entry limit.
    if len(content) > MAX_CONTENT_SIZE:
        logging.warning('Content is over 1MB; truncating')
        content = content[:MAX_CONTENT_SIZE]

    new_content = MirroredContent(
        base_url=base_url,
        original_address=mirrored_url,
        translated_address=translated_address,
        status=response.status_code,
        headers=adjusted_headers,
        data=content)

    # add() fails if the key already exists or on a memcache error; either
    # way we still serve the freshly fetched content.
    if not memcache.add(key_name, new_content, time=EXPIRATION_DELTA_SECONDS):
        logging.error(
            'memcache.add failed: key_name = "%s", '
            'original_url = "%s"', key_name, mirrored_url)
    return new_content
def fetch_and_store(key_name, base_url, translated_address, mirrored_url):
    """Fetch and cache a page.

    Args:
        key_name: Hash to use to store the cached page.
        base_url: The hostname of the page that's being mirrored.
        translated_address: The URL of the mirrored page on this site.
        mirrored_url: The URL of the original page. Hostname should match
            the base_url.

    Returns:
        A new MirroredContent object, if the page was successfully retrieved.
        None if any errors occurred or the content could not be retrieved.
    """
    logging.debug("Fetching '%s'", mirrored_url)
    try:
        response = requests.get(mirrored_url)
    except Exception:  # Deliberate catch-all: any fetch failure means "no page".
        logging.exception("Could not fetch URL")
        return None

    # Lower-case every header name and drop the ones we never mirror.
    adjusted_headers = {}
    for key, value in response.headers.items():
        adjusted_key = key.lower()
        if adjusted_key not in IGNORE_HEADERS:
            adjusted_headers[adjusted_key] = value

    content = response.content
    # requests returns bytes; the transform and JSON serialization below
    # both need text.
    if isinstance(content, bytes):
        content = content.decode('utf-8', 'ignore')

    page_content_type = adjusted_headers.get("content-type", "")
    for content_type in TRANSFORMED_CONTENT_TYPES:
        # startswith() because there could be a 'charset=UTF-8' in the header.
        if page_content_type.startswith(content_type):
            content = transform_content.TransformContent(
                base_url, mirrored_url, content)
            break

    new_content = MirroredContent(base_url=base_url,
                                  original_address=mirrored_url,
                                  translated_address=translated_address,
                                  status=response.status_code,
                                  headers=adjusted_headers,
                                  data=content)

    # Do not cache content over 1MB.
    if len(content) < MAX_CONTENT_SIZE:
        # Serialize via __dict__ so the cached value is plain JSON.
        serialized = json.dumps(new_content.__dict__)
        if not r.set(key_name, serialized):
            logging.error(
                'memcache.add failed: key_name = "%s", '
                'original_url = "%s"', key_name, mirrored_url)
    else:
        logging.warning("Content is over 1MB; not memcached")
    return new_content
def _RunTransformTest(self, base_url, accessed_url, original, expected):
    """Substitute *original* into a battery of URL-bearing HTML/CSS snippets
    and assert TransformContent rewrites each to the snippet built from
    *expected*.

    Args:
        base_url: Hostname of the mirrored site, passed to TransformContent.
        accessed_url: URL of the page being transformed.
        original: URL text embedded in each test snippet.
        expected: URL text the transform is expected to produce.
    """
    tag_tests = [
        '<img src="%s"/>',
        "<img src='%s'/>",
        "<img src=%s/>",
        "<img src=\"%s'/>",
        "<img src='%s\"/>",
        "<img src \t= '%s'/>",
        "<img src \t= \t '%s'/>",
        "<img src = '%s'/>",
        '<a href="%s">',
        "<a href='%s'>",
        "<a href=%s>",
        "<a href=\"%s'>",
        "<a href='%s\">",
        "<a href \t = \t'%s'>",
        "<a href \t = '%s'>",
        "<a href = \t'%s'>",
        "<td background=%s>",
        "<td background='%s'>",
        '<td background="%s">',
        '<form action="%s">',
        "<form action='%s'>",
        "<form action=%s>",
        "<form action=\"%s'>",
        "<form action='%s\">",
        "<form action \t = \t'%s'>",
        "<form action \t = '%s'>",
        "<form action = \t'%s'>",
        "@import '%s';",
        "@import '%s'\nnext line here",
        "@import \t '%s';",
        "@import %s;",
        "@import %s",
        '@import "%s";',
        '@import "%s"\nnext line here',
        "@import url(%s)",
        "@import url('%s')",
        '@import url("%s")',
        "background: transparent url(%s) repeat-x left;",
        'background: transparent url("%s") repeat-x left;',
        "background: transparent url('%s') repeat-x left;",
        '<meta http-equiv="Refresh" content="0; URL=%s">',
    ]
    for tag in tag_tests:
        test = tag % original
        correct = tag % expected
        result = transform_content.TransformContent(
            base_url, accessed_url, test)
        logging.info(
            "Test with\n"
            "Accessed: %s\n"
            "Input : %s\n"
            "Received: %s\n"
            "Expected: %s",
            accessed_url, test, result, correct)
        if result != correct:
            logging.info("FAIL")
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual(correct, result)
def fetch_and_store(base_url, translated_address, mirrored_url, user_agent,
                    referer, ip):
    """Fetch and cache a page.

    Args:
        base_url: The hostname of the page that's being mirrored.
        translated_address: The URL of the mirrored page on this site.
        mirrored_url: The URL of the original page. Hostname should match
            the base_url.
        user_agent: User-Agent header value forwarded to the origin server.
        referer: Referer header value forwarded to the origin server.
        ip: Client IP address, forwarded in a 'Real-Ip' header.

    Returns:
        A new MirroredContent object, if the page was successfully retrieved.
        None if any errors occurred or the content could not be retrieved.
    """
    try:
        # Forward the client's identifying headers to the origin server.
        response = urlfetch.fetch(mirrored_url, headers={
            'User-Agent': user_agent,
            'App-Engine': 'true',
            'Real-Ip': ip,
            'Referer': referer
        })
    except (urlfetch.Error, apiproxy_errors.Error):
        # logging.exception (not info) so the traceback is recorded,
        # consistent with the other fetch helpers in this file.
        logging.exception("Could not fetch URL")
        return None

    # Lower-case every header name and drop the ones we never mirror.
    adjusted_headers = {}
    for key, value in response.headers.iteritems():
        adjusted_key = key.lower()
        if adjusted_key not in IGNORE_HEADERS:
            adjusted_headers[adjusted_key] = value

    content = response.content
    page_content_type = adjusted_headers.get("content-type", "")
    for content_type in TRANSFORMED_CONTENT_TYPES:
        # startswith() because there could be a 'charset=UTF-8' in the header.
        if page_content_type.startswith(content_type):
            content = transform_content.TransformContent(
                base_url, mirrored_url, content)
            break

    new_content = MirroredContent(base_url=base_url,
                                  original_address=mirrored_url,
                                  translated_address=translated_address,
                                  status=response.status_code,
                                  headers=adjusted_headers,
                                  data=content)
    return new_content
def fetch_and_store(key_name, base_url, translated_address, mirrored_url,
                    postdata=None, ChineseWordsencoding=True, whitelist=''):
    """Fetch and cache a page.

    Args:
        key_name: Hash to use to store the cached page.
        base_url: The hostname of the page that's being mirrored.
        translated_address: The URL of the mirrored page on this site.
        mirrored_url: The URL of the original page. Hostname should match
            the base_url.
        postdata: Optional form data; when present the page is fetched with
            a POST instead of a GET (handled by post_and_get_content).
        ChineseWordsencoding: Passed through to TransformContent; presumably
            controls re-encoding of Chinese text -- TODO confirm.
        whitelist: Passed through to TransformContent; presumably hosts or
            patterns to leave untouched -- TODO confirm.

    Returns:
        A new MirroredContent object, if the page was successfully retrieved.
        None if any errors occurred or the content could not be retrieved.
    """
    # Refuse to mirror one of our own hosts, which would recurse forever.
    if base_url in mirror_const.MIRROR_HOSTS:
        logging.warning(
            u'Encountered recursive request for "%s"; ignoring', mirrored_url)
        return None

    logging.debug(u"Fetching '%s'", mirrored_url)
    try:
        response = post_and_get_content(mirrored_url, postdata)
    except (urlfetch.Error, apiproxy_errors.Error):
        logging.exception("Could not fetch URL")
        return None

    # Lower-case every header name and drop the ones we never mirror.
    adjusted_headers = {}
    for key, value in response['headers'].iteritems():
        adjusted_key = key.lower()
        if adjusted_key not in mirror_const.IGNORE_HEADERS:
            adjusted_headers[adjusted_key] = value

    content = response['content']
    page_content_type = adjusted_headers.get("content-type", "")
    for content_type in mirror_const.TRANSFORMED_CONTENT_TYPES:
        # Startswith() because there could be a 'charset=UTF-8' in the header.
        if page_content_type.startswith(content_type):
            is_html = page_content_type.startswith('text/html')
            if is_html:
                # Log the final (post-redirect) URL used as transform base.
                logging.error(u'transform:%s' % response['geturl'])
            content = transform_content.TransformContent(
                base_url, response['geturl'], content, is_html,
                ChineseWordsencoding, whitelist)
            break

    # If the transformed content is over 1MB, truncate it (yikes!)
    if len(content) > mirror_const.MAX_CONTENT_SIZE:
        logging.warning('Content is over 1MB; truncating')
        content = content[:mirror_const.MAX_CONTENT_SIZE]

    new_content = MirroredContent(base_url=base_url,
                                  original_address=mirrored_url,
                                  translated_address=translated_address,
                                  status=response['status_code'],
                                  headers=adjusted_headers,
                                  data=content)
    # NOTE(review): a commented-out memcache write block was removed here;
    # results are currently recomputed on every request.
    return new_content
def _RunTransformTest(self, base_url, accessed_url, original, expected):
    """Substitute *original* into a battery of URL-bearing HTML/CSS snippets
    and assert TransformContent rewrites each to the snippet built from the
    (possibly base64-wrapped) *expected* URL.

    Args:
        base_url: Hostname of the mirrored site, passed to TransformContent.
        accessed_url: URL of the page being transformed.
        original: URL text embedded in each test snippet.
        expected: URL text the transform is expected to produce; wrapped in
            the mirror's '/<b64>' form unless it is a fragment or a
            javascript: URL.
    """
    tag_tests = [
        '<img src="%s"/>',
        "<img src='%s'/>",
        "<img src=%s/>",
        "<img src=\"%s'/>",
        "<img src='%s\"/>",
        "<img src \t= '%s'/>",
        "<img src \t= \t '%s'/>",
        "<img src = '%s'/>",
        '<a href="%s">',
        "<a href='%s'>",
        "<a href=%s>",
        "<a href=\"%s'>",
        "<a href='%s\">",
        "<a href \t = \t'%s'>",
        "<a href \t = '%s'>",
        "<a href = \t'%s'>",
        "<td background=%s>",
        "<td background='%s'>",
        '<td background="%s">',
        '<form action="%s">',
        "<form action='%s'>",
        "<form action=%s>",
        "<form action=\"%s'>",
        "<form action='%s\">",
        "<form action \t = \t'%s'>",
        "<form action \t = '%s'>",
        "<form action = \t'%s'>",
        "@import '%s';",
        "@import '%s'\nnext line here",
        "@import \t '%s';",
        "@import %s;",
        "@import %s",
        '@import "%s";',
        '@import "%s"\nnext line here',
        "@import url(%s)",
        "@import url('%s')",
        '@import url("%s")',
        "background: transparent url(%s) repeat-x left;",
        'background: transparent url("%s") repeat-x left;',
        "background: transparent url('%s') repeat-x left;",
        '<meta http-equiv="Refresh" content="0; URL=%s">',
        'url(%s)',
        'src="%s" ',
        'style="background:url(%s)'
    ]
    # Fragment-only and javascript: URLs are never base64-rewritten.
    no_b64_encoding = (expected.lstrip().startswith('#')
                       or expected.lstrip().lower().startswith('javascript'))
    # The old `cond and a or b` idiom silently picks the wrong branch when
    # the middle value is falsy; use real conditionals instead.
    if original.startswith('https'):
        scheme = 'https'
    else:
        scheme = urlparse.urlparse(accessed_url).scheme
    if not no_b64_encoding:
        expected = '/' + b64.uri_b64encode('%s:/' % scheme + expected)
    for tag in tag_tests:
        test = tag % original
        correct = tag % expected
        result = transform_content.TransformContent(base_url, accessed_url,
                                                    test)
        logging.error("Test with\n"
                      "Accessed: %s\n"
                      "Input : %s\n"
                      "Received: %s\n"
                      "Expected: %s",
                      accessed_url, test, result, correct)
        if result != correct:
            logging.info("FAIL")
        # assertEquals is a deprecated alias (removed in Python 3.12);
        # use assertEqual.
        self.assertEqual(correct, result)
def fetch_and_store(key_name, base_url, translated_address, mirrored_url,
                    host, handler, shorturl):
    """Fetch and cache a page.

    Args:
      key_name: Hash to use to store the cached page.
      base_url: The hostname of the page that's being mirrored.
      translated_address: The URL of the mirrored page on this site.
      mirrored_url: The URL of the original page. Hostname should match
        the base_url.
      host: Hostname of this mirror, spliced into the injected JavaScript.
      handler: Request handler; its method, headers and POST data are
        forwarded to the origin server.
      shorturl: Short path prefix of this mirror, spliced into the injected
        JavaScript so runtime-generated URLs route back through the proxy.

    Returns:
      A new MirroredContent object, if the page was successfully retrieved.
      None if any errors occurred or the content could not be retrieved.
    """
    logging.debug(
        "+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    logging.debug("Fetching '%s' , base_url is '%s' ,UA is %s ", mirrored_url,
                  base_url, handler.request.headers["User-Agent"])
    # Forward the client's User-Agent; fall back to a mobile Safari UA.
    headers = {
        # 'content-type': 'application/json',
        'User-Agent': handler.request.headers["User-Agent"] or
        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1',
        # 'Referer': 'http://'+base_url,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    }
    try:
        # Replay the incoming request verb against the origin server.
        if handler.request.method == 'GET':
            response = requests.get(mirrored_url, headers=headers)
        if handler.request.method == 'POST':
            logging.debug('handler.request postitems : %s' %
                          handler.request.POST.items())
            response = requests.post(mirrored_url, headers=headers,
                                     data=handler.request.POST.items())
    except Exception:
        logging.exception("Could not fetch URL")
        return None
    # Lower-case every header name and drop the ones we never mirror.
    adjusted_headers = {}
    for key, value in response.headers.iteritems():
        adjusted_key = key.lower()
        if adjusted_key not in IGNORE_HEADERS:
            adjusted_headers[adjusted_key] = value
    content = response.content
    page_content_type = adjusted_headers.get("content-type", "")
    logging.info('page_content_type is %s' % page_content_type)
    for content_type in TRANSFORMED_CONTENT_TYPES:
        # startswith() because there could be a 'charset=UTF-8' in the header.
        if page_content_type.startswith(content_type):
            content = transform_content.TransformContent(
                base_url, mirrored_url, content, shorturl)
            if page_content_type.startswith(
                    "text/html"):  # Hook all requests; rewrite ajax addresses.
                content = content.replace('document.domain="qq.com";',
                                          'void(0);')  # Work around WeChat's script.
                content = content.replace(
                    '</body>',
                    '<script type="text/javascript" src="http://pingjs.qq.com/h5/stats.js" name="MTAH5" sid="500324497" cid="500331564" opts="{"senseHash":false}" ></script></body>'
                )  # Work around WeChat's script.
                content = content.replace(
                    'location.href.indexOf("safe=0") == -1 ',
                    'false')  # Work around WeChat's script.
                # content = content.replace('"2",','"3",')  # Work around eqx's script.
                # Inject monkey-patches for XHR, appendChild and localStorage
                # so dynamically generated URLs are routed back through this
                # mirror's '/<shorturl>/<base_url>' path scheme.
                content = content.replace(
                    "<head>", """<head>
<meta name="referrer" content="never">
<script>
function do_poster_script_onload(el){
console.log('script el:',el);
if (el.outerHTML.indexOf('http://')<0 && el.outerHTML.indexOf('https://')<0 && el.src.indexOf(base_url)<0){
var path = el.src.replace('http://""" + host + """','');//
el.src = '/""" + shorturl + """/""" + base_url + """'+ path;
//el.onloadstart = null;
}
}
(function() {
var base_url = '""" + base_url + """';
var proxied = window.XMLHttpRequest.prototype.open;
window.XMLHttpRequest.prototype.open = function() {
//console.log( arguments );
if (arguments[1].indexOf('http://')<0 && arguments[1].indexOf('https://')<0) {arguments[1]='http://'+base_url+arguments[1]}
arguments[1] = arguments[1].replace('http://','/""" + shorturl + """/')
//console.log( 'arguments xhr:',arguments );
return proxied.apply(this, [].slice.call(arguments));
};
var proxied_append = HTMLElement.prototype.appendChild;
HTMLElement.prototype.appendChild = function() {
//console.log( 'appendChild:', arguments );
for (var i in arguments){
var el = arguments[i];
//debugger;
if (el.tagName==='SCRIPT'){
//debugger;
if (el.outerHTML.indexOf('http://')<0 && el.outerHTML.indexOf('https://')<0 && el.src.indexOf(base_url)<0){
var path = el.src.replace('http://""" + host + """','');//
console.log('path:',path);
if (path==='') {
el.onbeforeonload = function(){
//console.log('onbeforeonloadNew:',el);
}
el.onerror = function(e){
//console.log('onerror',e)
}
}else{
el.src = '/""" + shorturl + """/""" + base_url + """'+ path;
}
}
}
}
//if (arguments[1].indexOf('http://')<0) {arguments[1]='http://'+arguments[1]}
//arguments[1] = arguments[1].replace('http://','/')
//console.log( 'arguments append:',arguments );
return proxied_append.apply(this, [].slice.call(arguments));
};
/*
var proxied_onload = HTMLElement.prototype.onload;
HTMLElement.prototype.onload = function(){
var result = proxied_onload.apply(this, [].slice.call(arguments));
//do_poster_script_onload(this);
//console.log('poster onload');
return result;
}*/
var D = window.localStorage.getItem;
window.localStorage.getItem = function() {
var result = D.call(window.localStorage, [].slice.call(arguments));
//console.log('getItem',result);
if (result) result = result.replace('document.domain="qq.com";','void(0);');//#微信的烂代码
return result;
};
})();
</script>
""")
            break
    new_content = MirroredContent(base_url=base_url,
                                  original_address=mirrored_url,
                                  translated_address=translated_address,
                                  status=response.status_code,
                                  headers=adjusted_headers,
                                  data=content)
    # Do not memcache content over 1MB
    if len(content) < MAX_CONTENT_SIZE:
        if not memcache.set(key_name, new_content):
            logging.error(
                'memcache.add failed: key_name = "%s", '
                'original_url = "%s"', key_name, mirrored_url)
    else:
        logging.warning("Content is over %s ; not memcached" %
                        MAX_CONTENT_SIZE)
    return new_content