def test_regular_url(self):
    """
    A plain ASCII url (http://www.google.com) is expected to come out of the
    UTF8 to ASCII converter unchanged.
    """
    expected = 'http://www.google.com'
    converted = convert_utf8_url_to_ascii(u'http://www.google.com')
    self.assertEqual(converted, expected)
def test_unicode_with_params_url(self):
    """
    Test the UTF8 to ASCII converter on a url with unicode characters in the
    user name and host, plus a password and a port
    (http://Åsa:abc123@➡.ws:81/admin).

    The non-ASCII user name is expected to be percent-encoded as UTF-8, the
    host is expected in its punycode form, and the password, port and path
    are expected to be left untouched.
    """
    url = u'http://Åsa:abc123@➡.ws:81/admin'
    # The expected value must keep the "password@host" part verbatim.
    ascii_url = 'http://%C3%85sa:abc123@xn--hgi.ws:81/admin'
    self.assertEqual(convert_utf8_url_to_ascii(url), ascii_url)
def test_unicode_url(self):
    """
    A url whose host contains a unicode character (http://➡.ws/admin) is
    expected to be converted to its punycode host form, with the ASCII path
    left as is.
    """
    expected = 'http://xn--hgi.ws/admin'
    converted = convert_utf8_url_to_ascii(u'http://➡.ws/admin')
    self.assertEqual(converted, expected)
def test_quoted_with_params_url(self):
    """
    A url written with escaped UTF-8 bytes in both host and path, followed by
    an already percent-encoded slash, is run through the UTF8 to ASCII
    converter.

    The host is expected in punycode form, the path bytes are expected to be
    percent-encoded, and the existing percent-encoded segment is expected to
    be kept as is.
    """
    expected = 'http://xn--hgi.ws/%E2%99%A5/%2F'
    converted = convert_utf8_url_to_ascii('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5/%2F')
    self.assertEqual(converted, expected)
def test_quoted_url(self):
    """
    A url written with escaped UTF-8 bytes in both host and path is run
    through the UTF8 to ASCII converter.

    The host is expected in punycode form and the path bytes are expected to
    be percent-encoded.
    """
    expected = 'http://xn--hgi.ws/%E2%99%A5'
    converted = convert_utf8_url_to_ascii('http://\xe2\x9e\xa1.ws/\xe2\x99\xa5')
    self.assertEqual(converted, expected)
def save_errors_raw_data_to_db(self, errors, batch_outdir):
    """
    Fetch the raw html of every error'd url and save it below the batch
    output directory.

    Each url is converted with convert_utf8_url_to_ascii and then fetched
    with fetch_html_content. A url that cannot be converted or fetched is
    logged through self.log_error and skipped, so one failure does not stop
    the remaining urls from being collected. The collected (url, html) pairs
    are handed to self.save_raw_data_to_path in a single call.

    :param errors: iterable of 3-item entries; only the first item (the url)
        is used here
    :param batch_outdir: directory onto which ERRORS_RAW_DATA_DIR is joined
        to build the output path
    """
    raw_data = []
    for url, _, _ in errors:
        try:
            url = convert_utf8_url_to_ascii(url)
            html_content = fetch_html_content(url)
            raw_data.append((url, html_content))
        except Exception:
            # Best effort: log and move on to the next url. Catching Exception
            # (rather than a bare except) lets KeyboardInterrupt and
            # SystemExit still stop the run.
            # NOTE(review): url is wrapped in a list so format() renders its
            # repr - presumably to keep non-ASCII urls from breaking the log
            # message; confirm before changing.
            self.log_error(u"Could not fetch raw html data for error'd url: {0} (Reason: {1})".format([url], traceback.format_exc()))

    raw_data_dir = os.path.join(batch_outdir, ERRORS_RAW_DATA_DIR)
    self.log_info(u"Writing raw html data to {0}".format(raw_data_dir))
    self.save_raw_data_to_path(raw_data, raw_data_dir)