def get_meta_refresh(response):
    """Look for an http-equiv refresh directive in an HTML response.

    Only the first 4096 characters of ``response.text`` are inspected.
    They are passed through ``remove_entities``, and anything matched by
    ``html_comment_re``, ``html_noscript_re`` or ``html_script_re`` is
    dropped before ``_meta_refresh_re`` is searched.

    Return a tuple ``(interval, url)``: the interval as a float and the
    target url joined against ``response.url``. When the pattern is not
    found, return ``(None, None)``.
    """
    head = remove_entities(response.text[0:4096])
    head = html_comment_re.sub(u'', head)
    head = html_noscript_re.sub(u'', head)
    head = html_script_re.sub(u'', head)

    match = _meta_refresh_re.search(head)
    if not match:
        return (None, None)

    interval = float(match.group('int'))
    # The captured url may be wrapped in spaces or quotes; trim them
    # before converting and re-quoting it.
    raw_url = match.group('url').strip(' "\'')
    quoted_url = requote_url(to_str(raw_url, response.encoding))
    absolute_url = urlparse.urljoin(response.url, quoted_url)
    return (interval, absolute_url)
def get_meta_refresh(response):
    """Parse the http-equiv refresh parameter from the given HTML response.

    The leading 4096 characters of ``response.text`` are run through
    ``remove_entities``; matches of ``html_comment_re``,
    ``html_noscript_re`` and ``html_script_re`` are then removed, in that
    order, before searching with ``_meta_refresh_re``.

    Return a tuple ``(interval, url)``, or ``(None, None)`` when nothing
    matches.
    """
    markup = remove_entities(response.text[0:4096])
    for pattern in (html_comment_re, html_noscript_re, html_script_re):
        markup = pattern.sub(u'', markup)

    found = _meta_refresh_re.search(markup)
    if not found:
        return (None, None)

    delay = float(found.group('int'))
    # Trim surrounding spaces and quote characters from the captured url.
    target = found.group('url').strip(' "\'')
    target = requote_url(to_str(target, response.encoding))
    # Resolve the target against the url of the response itself.
    target = urlparse.urljoin(response.url, target)
    return (delay, target)
def test_remove_entities(self):
    # The result must always be a unicode object.
    for raw in ('no entities', 'Price: £100!'):
        self.assertIsInstance(remove_entities(raw), unicode)

    # Each case is (input text, keyword arguments, expected result); the
    # cases are checked one by one in the order they are listed.
    cases = (
        # regular conversions
        (u'As low as £100!', {}, u'As low as \xa3100!'),
        ('As low as £100!', {}, u'As low as \xa3100!'),
        ('redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant',
         {},
         u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant'),
        # keep some entities
        ('<b>Low < High & Medium £ six</b>',
         {'keep': ['lt', 'amp']},
         u'<b>Low < High & Medium \xa3 six</b>'),
        # illegal entities
        ('a < b &illegal; c � six',
         {'remove_illegal': False},
         u'a < b &illegal; c � six'),
        ('a < b &illegal; c � six',
         {'remove_illegal': True},
         u'a < b c six'),
        ('x≤y', {}, u'x\u2264y'),
        # browser hack for numeric character references in the 80-9F range
        ('x™y', {'encoding': 'cp1252'}, u'x\u2122y'),
        # encoding
        ('x\x99™™y', {'encoding': 'cp1252'}, u'x\u2122\u2122\u2122y'),
    )
    for raw, options, expected in cases:
        self.assertEqual(remove_entities(raw, **options), expected)
def test_remove_entities(self):
    def expect(expected, text, **options):
        # Run remove_entities on ``text`` and compare with ``expected``.
        self.assertEqual(remove_entities(text, **options), expected)

    # The return value is always a unicode object.
    self.assertIsInstance(remove_entities('no entities'), unicode)
    self.assertIsInstance(remove_entities('Price: £100!'), unicode)

    # Regular conversions.
    expect(u'As low as \xa3100!', u'As low as £100!')
    expect(u'As low as \xa3100!', 'As low as £100!')
    expect(
        u'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold \xbdoz solid crucifix pendant',
        'redirectTo=search&searchtext=MR0221Y&aff=buyat&affsrc=d_data&cm_mmc=buyat-_-ELECTRICAL & SEASONAL-_-MR0221Y-_-9-carat gold ½oz solid crucifix pendant',
    )

    # Entities named in ``keep`` are left alone.
    expect(
        u'<b>Low < High & Medium \xa3 six</b>',
        '<b>Low < High & Medium £ six</b>',
        keep=['lt', 'amp'],
    )

    # Illegal entities, with and without removal.
    expect(
        u'a < b &illegal; c � six',
        'a < b &illegal; c � six',
        remove_illegal=False,
    )
    expect(
        u'a < b c six',
        'a < b &illegal; c � six',
        remove_illegal=True,
    )
    expect(u'x\u2264y', 'x≤y')

    # Browser hack for numeric character references in the 80-9F range.
    expect(u'x\u2122y', 'x™y', encoding='cp1252')

    # Explicit encoding.
    expect(u'x\u2122\u2122\u2122y', 'x\x99™™y', encoding='cp1252')