def unexpected_http_status(self, code, resp):
    """
    Resolve responses that need special treatment for this service.

    A 302 is answered from its Location header; a 200 is re-fetched with
    GET and the target is scraped from the iframe on the returned page.
    Every other status is delegated to the parent class.

    Returns the target URL as a UTF-8 encoded byte string (200 case) or
    the raw Location header value (302 case).

    Raises NoRedirectException when the Location points back at the
    service itself, BlockedException when it points at yahoo.com, and
    ServiceException for anything that cannot be interpreted.
    """
    if resp.status == 302:
        location = resp.getheader("Location")
        if not location:
            # Without this guard a missing header would be returned as
            # None and treated as a resolved URL by the caller.
            raise exceptions.ServiceException(
                "No Location header after HTTP status 302")
        if "sharedby" in location or "visibli" in location:
            raise exceptions.NoRedirectException()
        if location.startswith("http://yahoo.com"):
            raise exceptions.BlockedException("Banned (location=%s)" % location)
        # Guess it be an override for site that busts out iframes
        return location

    if resp.status != 200:
        return super(BaseVisbliService, self).unexpected_http_status(code, resp)

    # The first response carried no usable body, so ask again with GET.
    resp, data = self._http_get(code)
    if resp.status != 200:
        raise exceptions.ServiceException(
            "HTTP status changed from 200 to %i on second request" % resp.status)

    match = re.search(r'<iframe id="[^"]+" src="([^"]+)">', data)
    if not match:
        if 'Undefined index: HTTP_USER_AGENT' in data:
            raise exceptions.ServiceException(
                "Website broken about user-agent")
        raise exceptions.ServiceException("No iframe url found")

    # Decode before unescaping so entities and non-ASCII bytes mix safely.
    url = match.group(1).decode("utf-8")
    url = HTMLParser.HTMLParser().unescape(url).encode("utf-8")
    return url
def _fetch_blocked(self, code):
    """
    Re-request *code* with GET and cut the target URL out of the page.

    The URL is the text between self.BLOCKED_STRING_START and the next
    self.BLOCKED_STRING_END. It is returned HTML-unescaped as a UTF-8
    encoded byte string.

    Raises CodeBlockedException on an empty body, BlockedException when
    self.RATE_LIMIT_STRING is present, and ServiceException when the
    status is not 200 or a marker is missing.
    """
    response = self._http_fetch(code, "GET")
    body = response.read()

    if response.status != 200:
        raise exceptions.ServiceException(
            "HTTP status changed from 200 to %i on second request" % response.status)
    if not body:
        raise exceptions.CodeBlockedException(
            "Empty response on status 200")
    if self.RATE_LIMIT_STRING in body:
        raise exceptions.BlockedException()

    start = body.find(self.BLOCKED_STRING_START)
    if start < 0:
        raise exceptions.ServiceException(
            "Unexpected response on status 200")
    # Drop everything up to and including the start marker.
    remainder = body[start + len(self.BLOCKED_STRING_START):]

    end = remainder.find(self.BLOCKED_STRING_END)
    if end < 0:
        raise exceptions.ServiceException(
            "Unexpected response on status 200")

    raw_url = remainder[:end].decode("utf-8")
    return HTMLParser.HTMLParser().unescape(raw_url).encode("utf-8")
def fetch(self, code):
    """
    Resolve *code* and return the long URL it points to.

    Status handling:
      200 -> delegated to self._fetch_200
      301 -> the Location header, or self._preview when the X-tiny
             header starts with "aff"
      302 -> CodeBlockedException
      404 -> NoRedirectException
      500 -> connection is closed, then ServiceException
      else -> ServiceException
    """
    resp = self._http_fetch(code)
    status = resp.status

    if status == 200:
        return self._fetch_200(code)

    if status == 301:
        location = resp.getheader("Location")
        if not location:
            raise exceptions.CodeBlockedException(
                "No Location header after HTTP status 301")
        tiny = resp.getheader("X-tiny")
        if tiny and tiny[:3] == "aff":
            return self._preview(code)
        return location

    if status == 302:
        raise exceptions.CodeBlockedException()
    if status == 404:
        raise exceptions.NoRedirectException()
    if status == 500:
        # Some "errorhelp" URLs result in HTTP status 500, which goes away
        # when trying a different server
        self._conn.close()
        raise exceptions.ServiceException("HTTP status 500")

    # Every branch above returns or raises, so no trailing return is needed.
    raise exceptions.ServiceException("Unknown HTTP status %i" % status)
def fetch(self, code):
    """
    Resolve *code* and return the long URL it points to.

    A 301 with reason "Moved" yields the Location header; a 301 with
    reason "Moved Permanently" closes the connection and raises
    CodeBlockedException. A 302 is handed to self._parse_warning_url.
    403 raises BlockedException, 404 NoRedirectException, 410
    CodeBlockedException, anything else ServiceException.
    """
    response = self._http_fetch(code)
    status = response.status

    if status == 301:
        target = response.getheader("Location")
        if not target:
            raise exceptions.ServiceException(
                "No Location header after HTTP status 301")
        reason = response.reason
        if reason == "Moved":
            # Normal bit.ly redirect
            return target
        if reason == "Moved Permanently":
            # Weird "bundles" redirect, forces connection close despite
            # sending Keep-Alive header
            self._conn.close()
            raise exceptions.CodeBlockedException()
        raise exceptions.ServiceException(
            "Unknown HTTP reason %s after HTTP status 301" % reason)

    if status == 302:
        target = response.getheader("Location")
        if not target:
            raise exceptions.ServiceException(
                "No Location header after HTTP status 302")
        return self._parse_warning_url(code, target)

    if status == 403:
        raise exceptions.BlockedException()
    if status == 404:
        raise exceptions.NoRedirectException()
    if status == 410:
        raise exceptions.CodeBlockedException()

    raise exceptions.ServiceException("Unknown HTTP status %i" % status)
def _parse_warning_url(self, code, url):
    """
    Extract the real target from a bitly.com warning-page redirect.

    *url* must be http://bitly.com/a/warning with exactly one "url" and
    one "hash" query parameter, and the hash must equal *code*.

    Returns the value of the "url" parameter.
    Raises ServiceException if any of those conditions does not hold.
    """
    parsed = urlparse.urlparse(url)
    if parsed.scheme != "http" or parsed.netloc != "bitly.com" or parsed.path != "/a/warning":
        raise exceptions.ServiceException(
            "Unexpected Location header after HTTP status 302")

    query = urlparse.parse_qs(parsed.query)
    # parse_qs maps each name to a list; require exactly one value each.
    for key in ("url", "hash"):
        if len(query.get(key, ())) != 1:
            raise exceptions.ServiceException(
                "Unexpected Location header after HTTP status 302")

    if query["hash"][0] != code:
        raise exceptions.ServiceException(
            "Hash mismatch for HTTP status 302")
    return query["url"][0]
def _parse_json(self, data):
    """
    Decode an API response and return its "longUrl" field.

    Raises ServiceException when *data* is not valid JSON, is not a JSON
    object of kind "urlshortener#url", or has no "status" field.
    Raises CodeBlockedException (carrying the status) when the response
    is well-formed but has no "longUrl".
    """
    try:
        data = json.loads(data)
    except ValueError:
        raise exceptions.ServiceException("Could not decode response")

    # A list, number or string is valid JSON too; reject it here instead of
    # letting the key lookups below fail with a TypeError.
    if not isinstance(data, dict) or data.get("kind") != "urlshortener#url":
        raise exceptions.ServiceException("No/bad type given")
    if "status" not in data:
        raise exceptions.ServiceException("No status given")
    if "longUrl" not in data:
        raise exceptions.CodeBlockedException("Status: %s" % data["status"])
    return data["longUrl"]
def _fetch_200(self, code):
    """
    Handle a code whose first request answered with HTTP 200.

    The page is fetched again with GET and passed to the matching page
    parser: self._parse_errorhelp for the "Redirecting..." page,
    self._parse_tinyurl_redirect for the TinyURL-to-TinyURL notice.

    Raises ServiceException when the second request is not a 200 or the
    page is neither of the two known kinds.
    """
    response, body = self._http_get(code)
    if response.status != 200:
        raise exceptions.ServiceException(
            "HTTP status changed from 200 to %i on second request" % response.status)

    if "<title>Redirecting...</title>" in body:
        return self._parse_errorhelp(code, body)
    if "Error: TinyURL redirects to a TinyURL." in body:
        return self._parse_tinyurl_redirect(body)

    raise exceptions.ServiceException(
        "Unexpected response on status 200")
def _preview(self, code):
    """
    Read the target URL from the preview page of *code*.

    Requests "preview.php?num=<code>" with GET and extracts the href of
    the "Proceed to this site." link.

    Returns the HTML-unescaped URL as a UTF-8 encoded byte string.
    Raises ServiceException when the page is not a 200 or has no such link.
    """
    resp = self._http_fetch("preview.php?num=" + code, "GET")
    data = resp.read()
    if resp.status != 200:
        raise exceptions.ServiceException(
            "Unexpected HTTP status %i on preview page" % resp.status)

    match = re.search(
        "<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>",
        data, re.DOTALL)
    if not match:
        raise exceptions.ServiceException("No redirect on preview page")

    # Decode first: unescaping or encoding a byte string that holds
    # non-ASCII bytes triggers an implicit ASCII decode and fails.
    url = match.group(1).decode("utf-8")
    return HTMLParser.HTMLParser().unescape(url).encode("utf-8")
class HTTPService(Service): """ Httplib-based URL shortener client Abstract serivce class to help with using httplib. """ @abc.abstractproperty def url(self): """ Returns the base URL of the URL shortener """ def __init__(self): parsed_url = urlparse.urlparse(self.url) self._path = parsed_url.path or "/" self._conn = httplib.HTTPConnection(parsed_url.netloc, timeout=30) def _http_fetch(self, code, method="HEAD"): try: self._conn.request(method, self._path + code) resp = self._conn.getresponse() if method == "HEAD": resp.read() return resp except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e) except socket.error, e: self._conn.close() raise exceptions.ServiceException("Socket error: %s" % e)
class Googl(Service): """ http://goo.gl/ """ @property def rate_limit(self): return (1, 5) @property def charset(self): return "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" def __init__(self): host = "www.googleapis.com" version = platform.python_version_tuple() if int(version[0]) == 2 and int(version[1]) <= 5: self._conn = httplib.HTTPSConnection(host) else: self._conn = httplib.HTTPSConnection(host, timeout=30) def fetch(self, code): try: self._conn.request( "GET", "/urlshortener/v1/url?shortUrl=http://goo.gl/%s" % code) resp = self._conn.getresponse() data = resp.read() except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e) except socket.error, e: self._conn.close() raise exceptions.ServiceException("Socket error: %s" % e)
class YourlsService(Service): """ A service for installations of Yourls (http://yourls.org). """ @abc.abstractproperty def yourls_api_url(self): """ The endpoint of the Yourls API. The Yourls API is typically located at /yourls-api.php """ @abc.abstractproperty def yourls_url_convert(self): """ The value of the YOURLS_URL_CONVERT parameter. The YOUR_SULR_CONVERT parameter specifies what charset is used by the Yourls installation. """ @property def charset(self): if self.yourls_url_convert == 36: return "0123456789abcdefghijklmnopqrstuvwxyz" elif self.yourls_url_convert == 62: return "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" raise RuntimeError("Bad value for yourls_url_convert parameter") def __init__(self): parsed_url = urlparse.urlparse(self.yourls_api_url) self._path = parsed_url.path or "/" if parsed_url.scheme == "http": klass = httplib.HTTPConnection elif parsed_url.scheme == "https": klass = httplib.HTTPSConnection else: raise ValueError("Unknown scheme %s" % parsed_url.scheme) version = platform.python_version_tuple() if int(version[0]) == 2 and int(version[1]) <= 5: self._conn = klass(parsed_url.netloc) else: self._conn = klass(parsed_url.netloc, timeout=30) def fetch(self, code): params = {"action": "expand", "shorturl": code, "format": "simple"} try: self._conn.request("GET", self._path + "?" + urllib.urlencode(params)) resp = self._conn.getresponse() data = resp.read() except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e) except socket.error, e: self._conn.close() raise exceptions.ServiceException("Socket error: %s" % e)
def _preview(self, code, affiliate_url):
    """
    Read the target URL from the preview page of *code*.

    Fetches "preview.php?num=<code>" and extracts the href of the
    "Proceed to this site." link. An empty href is resolved through
    self._scrub_url(code, affiliate_url) instead.

    Returns the HTML-unescaped URL as a UTF-8 encoded byte string.
    Raises ServiceException when the page is not a 200 or has no such link.
    """
    response, body = self._http_get("preview.php?num=" + code)
    if response.status != 200:
        raise exceptions.ServiceException(
            "Unexpected HTTP status %i on preview page" % response.status)

    found = re.search(
        "<a id=\"redirecturl\" href=\"(.*?)\">Proceed to this site.</a>",
        body, re.DOTALL)
    if not found:
        raise exceptions.ServiceException("No redirect on preview page")

    target = found.group(1).decode("utf-8")
    if target == "":
        return self._scrub_url(code, affiliate_url)
    return HTMLParser.HTMLParser().unescape(target).encode("utf-8")
def _fetch_blocked(self, code):
    """
    Re-request *code* with GET and scrape the target from the safety
    warning page.

    The target is the href of the "btn ignore" link. It is returned
    HTML-unescaped as a UTF-8 encoded byte string.

    Raises ServiceException when the status is not 200 or the link is
    missing.
    """
    response = self._http_fetch(code, "GET")
    body = response.read()
    if response.status != 200:
        raise exceptions.ServiceException(
            "HTTP status changed from 200 to %i on second request" % response.status)

    found = re.search("<a class=\"btn ignore\" href=\"(.*?)\" title=", body)
    if not found:
        raise exceptions.ServiceException(
            "Could not find target URL in safety warning")

    target = found.group(1).decode("utf-8")
    return HTMLParser.HTMLParser().unescape(target).encode("utf-8")
def fetch(self, code):
    """
    Resolve *code* and return the long URL it points to.

    200 is delegated to self._fetch_blocked, 301 yields the Location
    header, 404 raises NoRedirectException, 502 raises
    CodeBlockedException and anything else raises ServiceException.
    """
    response = self._http_fetch(code)
    status = response.status

    if status == 200:
        return self._fetch_blocked(code)

    if status == 301:
        target = response.getheader("Location")
        if not target:
            raise exceptions.ServiceException(
                "No Location header after HTTP status 301")
        return target

    if status == 404:
        raise exceptions.NoRedirectException()
    if status == 502:
        raise exceptions.CodeBlockedException("HTTP status 502")

    raise exceptions.ServiceException("Unknown HTTP status %i" % status)
def _parse_tinyurl_redirect(self, data):
    """
    Extract the target from the "redirects back to a TinyURL" notice page.

    Returns the HTML-unescaped href as a UTF-8 encoded byte string.
    Raises ServiceException when the notice paragraph is not found.
    """
    match = re.search(
        "<p class=\"intro\">The URL you followed redirects back to a TinyURL and therefore we can't directly send you to the site\\. The URL it redirects to is <a href=\"(.*?)\">",
        data, re.DOTALL)
    if not match:
        raise exceptions.ServiceException(
            "No redirect on \"tinyurl redirect\" page on HTTP status 200")

    # Decode first: unescaping or encoding a byte string that holds
    # non-ASCII bytes triggers an implicit ASCII decode and fails.
    url = match.group(1).decode("utf-8")
    return HTMLParser.HTMLParser().unescape(url).encode("utf-8")
def fetch(self, code):
    """
    Resolve *code* by classifying the HTTP status of the response.

    The status is looked up, in this order, in self.http_status_redirect
    (return the Location header), self.http_status_no_redirect
    (NoRedirectException), self.http_status_code_blocked
    (CodeBlockedException) and self.http_status_blocked
    (BlockedException). Any other status raises ServiceException.
    """
    resp = self._http_fetch(code)
    status = resp.status

    if status in self.http_status_redirect:
        location = resp.getheader("Location")
        if not location:
            # Report the status actually received; the redirect set is
            # configurable and need not be 301.
            raise exceptions.ServiceException(
                "No Location header after HTTP status %i" % status)
        return location

    if status in self.http_status_no_redirect:
        raise exceptions.NoRedirectException()
    if status in self.http_status_code_blocked:
        raise exceptions.CodeBlockedException()
    if status in self.http_status_blocked:
        raise exceptions.BlockedException()

    raise exceptions.ServiceException("Unknown HTTP status %i" % status)
def unexpected_http_status(self, code, resp):
    """
    Resolve a code that answered 200 with a safety warning page.

    Statuses other than 200 are delegated to the parent class. For a 200
    the page is fetched with GET and the href of the "btn ignore" link is
    returned, HTML-unescaped, as a UTF-8 encoded byte string.

    Raises ServiceException when the second request is not a 200 or the
    link is missing.
    """
    if resp.status != 200:
        return super(Owly, self).unexpected_http_status(code, resp)

    response, body = self._http_get(code)
    if response.status != 200:
        raise exceptions.ServiceException(
            "HTTP status changed from 200 to %i on second request" % response.status)

    found = re.search("<a class=\"btn ignore\" href=\"(.*?)\" title=", body)
    if not found:
        raise exceptions.ServiceException(
            "Could not find target URL in safety warning")

    target = found.group(1).decode("utf-8")
    return HTMLParser.HTMLParser().unescape(target).encode("utf-8")
def fetch(self, code): try: self._conn.request( "GET", "/urlshortener/v1/url?shortUrl=http://goo.gl/%s" % code) resp = self._conn.getresponse() data = resp.read() except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e)
def fetch(self, code): params = {"action": "expand", "shorturl": code, "format": "simple"} try: self._conn.request("GET", self._path + "?" + urllib.urlencode(params)) resp = self._conn.getresponse() data = resp.read() except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e)
def _parse_preview(self, code, data):
    """
    Extract the target URL from a "Preview" page.

    *code* is accepted but not used. The href of the "biglink" anchor is
    returned, HTML-unescaped, as a UTF-8 encoded byte string.

    Raises ServiceException when the anchor is not found.
    """
    found = re.search(
        "<b>Click the link</b> if you'd like to proceed to the destination shown: -<br /><a href=\"(.*)\" class=\"biglink\">",
        data)
    if found is None:
        raise exceptions.ServiceException(
            "Could not find target URL in 'Preview' page")

    target = found.group(1).decode("utf-8")
    unescaped = HTMLParser.HTMLParser().unescape(target)
    return unescaped.encode("utf-8")
def unexpected_http_status(self, code, resp):
    """
    Resolve a code whose preview page is served with HTTP status 500.

    Statuses other than 500 are delegated to the parent class. For a 500
    the page is fetched with GET and the quoted long URL is returned,
    HTML-unescaped, as a UTF-8 encoded byte string.

    Raises ServiceException when the second request is not a 500 or the
    quoted URL is missing.
    """
    if resp.status != 500:
        return super(Snipurl, self).unexpected_http_status(code, resp)

    response, body = self._http_get(code)
    if response.status != 500:
        raise exceptions.ServiceException(
            "HTTP status changed from 500 to %i on second request" % response.status)

    found = re.search(
        "<p>You clicked on a snipped URL, which will take you to the following looong URL: </p> <div class=\"quote\"><span class=\"quotet\"></span><br/>(.*?)</div> <br />",
        body)
    if not found:
        raise exceptions.ServiceException(
            "Could not find target URL on preview page")

    target = found.group(1).decode("utf-8")
    return HTMLParser.HTMLParser().unescape(target).encode("utf-8")
def _http_fetch(self, code, method="HEAD"): try: self._conn.request(method, self._path + code) resp = self._conn.getresponse() if method == "HEAD": resp.read() return resp except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e)
def _parse_blocked(self, code, data):
    """
    Extract the original destination from a "Link Disabled" page.

    *code* is accepted but not used. The destination is returned,
    HTML-unescaped, as a UTF-8 encoded byte string.

    Raises ServiceException when the reference paragraph is not found and
    CodeBlockedException when the extracted URL is empty.
    """
    found = re.search(
        "<p>For reference and to help those fighting spam the original destination of this URL is given below \(we strongly recommend you don't visit it since it may damage your PC\): -<br />(.*)</p><h2>is\.gd</h2><p>is\.gd is a free service used to shorten long URLs\.",
        data)
    if found is None:
        raise exceptions.ServiceException(
            "Could not find target URL in 'Link Disabled' page")

    decoded = found.group(1).decode("utf-8")
    target = HTMLParser.HTMLParser().unescape(decoded).encode("utf-8")
    if target == "":
        raise exceptions.CodeBlockedException("Empty URL on preview")
    return target
def _parse_errorhelp(self, code, data):
    """
    Extract the target from an "errorhelp" meta-refresh page.

    The refresh URL must be http://tinyurl.com/errorb.php with exactly
    one "url" and one "path" query parameter, and the path must equal
    "/" + *code*.

    Returns the value of the "url" parameter.
    Raises ServiceException if any of those conditions does not hold.
    """
    found = re.search('<meta http-equiv="refresh" content="0;url=(.*?)">', data)
    if not found:
        raise exceptions.ServiceException(
            "No redirect on \"errorhelp\" page on HTTP status 200")

    refresh = urlparse.urlparse(found.group(1))
    is_errorb = (refresh.scheme == "http"
                 and refresh.netloc == "tinyurl.com"
                 and refresh.path == "/errorb.php")
    if not is_errorb:
        raise exceptions.ServiceException(
            "Unexpected redirect on \"errorhelp\" page on HTTP status 200"
        )

    query = urlparse.parse_qs(refresh.query)
    single_url = "url" in query and len(query["url"]) == 1
    single_path = "path" in query and len(query["path"]) == 1
    if not (single_url and single_path):
        raise exceptions.ServiceException(
            "Unexpected redirect on \"errorhelp\" page on HTTP status 200"
        )

    if query["path"][0] != ("/" + code):
        raise exceptions.ServiceException(
            "Code mismatch on \"errorhelp\" on HTTP status 200")
    return query["url"][0]
def _http_fetch(self, code, method): headers = self.http_headers if self.http_keepalive: headers["Connection"] = "Keep-Alive" else: headers["Connection"] = "close" try: self._conn.request(method, self._path + code, headers=headers) resp = self._conn.getresponse() result = (resp, resp.read()) if not self.http_keepalive: self._conn.close() return result except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e)
def unexpected_http_status(self, code, resp):
    """
    Resolve a code that answered 200 with an HTML page instead of a
    redirect.

    Statuses other than 200 are delegated to the parent class. For a 200
    the page is fetched with GET and classified by its content: empty
    body -> CodeBlockedException, rate-limit notice -> BlockedException,
    "Link Disabled" page -> self._parse_blocked, preview page ->
    self._parse_preview.

    Raises ServiceException when the second request is not a 200.
    """
    if resp.status != 200:
        return super(Isgd, self).unexpected_http_status(code, resp)

    response, body = self._http_get(code)
    if response.status != 200:
        raise exceptions.ServiceException(
            "HTTP status changed from 200 to %i on second request" % response.status)
    if not body:
        raise exceptions.CodeBlockedException(
            "Empty response on status 200")

    rate_limited = "<div id=\"main\"><p>Rate limit exceeded - please wait 1 minute before accessing more shortened URLs</p></div>" in body
    if rate_limited:
        raise exceptions.BlockedException()

    if "<div id=\"disabled\"><h2>Link Disabled</h2>" in body:
        return self._parse_blocked(code, body)

    if "<p>The full original link is shown below. <b>Click the link</b> if you'd like to proceed to the destination shown:" in body:
        return self._parse_preview(code, body)
    # NOTE(review): a page matching none of the markers falls through and
    # returns None in the visible code; confirm whether that is intended.
self._conn.request("GET", self._path + "?" + urllib.urlencode(params)) resp = self._conn.getresponse() data = resp.read() except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e) except socket.error, e: self._conn.close() raise exceptions.ServiceException("Socket error: %s" % e) if resp.status == 200: if data == "not found": raise exceptions.NoRedirectException() return data raise exceptions.ServiceException("Unexpected HTTP status %i" % resp.status) class Bitly(HTTPService): """ http://bit.ly/ """ @property def charset(self): return "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_" @property def url(self): return "http://bit.ly/" def fetch(self, code):
def unexpected_http_status(self, code, resp):
    """
    Reject a response by raising ServiceException with its HTTP status.

    *code* is accepted but not used.
    """
    message = "Unexpected HTTP status %i" % resp.status
    raise exceptions.ServiceException(message)
class HTTPService(Service): """ Httplib-based URL shortener client Abstract serivce class to help with using httplib. """ @abc.abstractproperty def url(self): """ Returns the base URL of the URL shortener """ @property def http_headers(self): """ Dictionary of additional HTTP headers to send with every request. """ return {} @property def http_keepalive(self): """ Whether to use HTTP persistent connections or not. If set to false, the connection will be forcibly closed after each request """ return True def __init__(self): parsed_url = urlparse.urlparse(self.url) self._path = parsed_url.path or "/" if parsed_url.scheme == "http": klass = httplib.HTTPConnection elif parsed_url.scheme == "https": klass = httplib.HTTPSConnection else: raise ValueError("Unknown scheme %s" % parsed_url.scheme) version = platform.python_version_tuple() if int(version[0]) == 2 and int(version[1]) <= 5: self._conn = klass(parsed_url.netloc) else: self._conn = klass(parsed_url.netloc, timeout=30) def _http_head(self, code): return self._http_fetch(code, "HEAD")[0] def _http_get(self, code): return self._http_fetch(code, "GET") def _http_fetch(self, code, method): headers = self.http_headers if self.http_keepalive: headers["Connection"] = "Keep-Alive" else: headers["Connection"] = "close" try: self._conn.request(method, self._path + code, headers=headers) resp = self._conn.getresponse() result = (resp, resp.read()) if not self.http_keepalive: self._conn.close() return result except httplib.HTTPException, e: self._conn.close() raise exceptions.ServiceException("HTTP exception: %s" % e) except socket.error, e: self._conn.close() raise exceptions.ServiceException("Socket error: %s" % e)