def _request(self, url, method): scheme, host = urlparse(url)[:2] scheme = scheme.lower() proxies = getproxies_environment() if scheme in proxies: scheme, host = urlparse(proxies[scheme])[:2] scheme = scheme.lower() kwargs = {} if version_info[1] >= 6: kwargs['timeout'] = self.timeout else: socket.setdefaulttimeout(self.timeout) if scheme == "https": conn = HTTPSConnection(host, **kwargs) else: conn = HTTPConnection(host, **kwargs) headers={} if method == 'GET': headers['Range'] = 'bytes=0-%s' % self.max_size try: try: conn.request(method.upper(), iri_to_uri(url), headers=headers) response = conn.getresponse() data = response.read(self.max_size) conn.close() except socket.error, e: raise HTTPException(e.message or e.args[1]) finally: if version_info[1] < 6: socket.setdefaulttimeout(None) contenttype = response.getheader('Content-Type', None) if contenttype: match = re.search('^charset=([a-zA-Z0-9-]+)', contenttype) try: if match: data = data.decode(match.group(1)) elif contenttype.startswith('text/'): data = data.decode('utf-8') except UnicodeDecodeError: guessed = detect(data) if guessed['confidence'] > 0.5: charset = guessed['encoding'] # Common guessing mistake: if charset.startswith('ISO-8859') and '\x92' in data: charset = 'windows-1252' data = unicode(data, charset, errors='replace') return response.status, response.reason, data, response.getheaders()
def _request(self, url, method): scheme, host = urlparse(url)[:2] scheme = scheme.lower() proxies = getproxies_environment() if scheme in proxies: scheme, host = urlparse(proxies[scheme])[:2] scheme = scheme.lower() kwargs = {} if version_info[1] >= 6: kwargs["timeout"] = self.timeout else: socket.setdefaulttimeout(self.timeout) if scheme == "https": conn = HTTPSConnection(host, **kwargs) else: conn = HTTPConnection(host, **kwargs) headers = {} if method == "GET": headers["Range"] = "bytes=0-%s" % self.max_size try: try: conn.request(method.upper(), iri_to_uri(url), headers=headers) response = conn.getresponse() data = response.read(self.max_size) conn.close() except socket.error, e: raise HTTPException(e.message or e.args[1]) finally: if version_info[1] < 6: socket.setdefaulttimeout(None) contenttype = response.getheader("Content-Type", None) if contenttype: match = re.search("^charset=([a-zA-Z0-9-]+)", contenttype) try: if match: data = data.decode(match.group(1)) elif contenttype.startswith("text/"): data = data.decode("utf-8") except UnicodeDecodeError: guessed = detect(data) if guessed["confidence"] > 0.5: charset = guessed["encoding"] # Common guessing mistake: if charset.startswith("ISO-8859") and "\x92" in data: charset = "windows-1252" data = unicode(data, charset, errors="replace") return response.status, response.reason, data, response.getheaders()
def draw(self, event, url, colour, width, height): if not urlparse(url).netloc: url = 'http://' + url if urlparse(url).scheme == 'file': event.addresponse(u'Are you trying to haxor me?') return if not urlparse(url).path: url += '/' try: f = urlopen(iri_to_uri(url)) except HTTPError, e: event.addresponse(u'Sorry, error fetching URL: %s', BaseHTTPRequestHandler.responses[e.code][0]) return
def get_html_parse_tree(url, data=None, headers=None, treetype='beautifulsoup'):
    """Request a URL, parse with html5lib, and return a parse tree from it.

    treetype selects the builder: 'beautifulsoup', 'etree', or any name
    html5lib's treebuilders registry knows.

    Raises ContentTypeException when the response is not HTML/XHTML.
    """
    # BUG FIX: a mutable dict default is shared across calls; default to
    # None and build a fresh dict per call instead.
    if headers is None:
        headers = {}
    req = urllib2.Request(iri_to_uri(url), data, headers)
    f = urllib2.urlopen(req)
    try:
        if f.info().gettype() not in ('text/html', 'application/xhtml+xml'):
            raise ContentTypeException("Content type isn't HTML, but "
                                       + f.info().gettype())
        data = f.read()
    finally:
        # BUG FIX: always release the connection — previously it leaked if
        # f.read() raised. (f.headers stays accessible after close.)
        f.close()

    encoding = None
    contentType = f.headers.get('content-type')
    if contentType:
        (mediaType, params) = cgi.parse_header(contentType)
        encoding = params.get('charset')

    # Transparently decompress compressed responses.
    compression = f.headers.get('content-encoding')
    if compression:
        if compression.lower() == "deflate":
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Some servers send raw deflate without the zlib header.
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif compression.lower() == "gzip":
            compressedstream = StringIO(data)
            gzipper = GzipFile(fileobj=compressedstream)
            data = gzipper.read()

    if treetype == "beautifulsoup":
        return BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)
    elif treetype == "etree":
        kwargs = {'tree': treebuilders.getTreeBuilder('etree', ElementTree)}
        # http://code.google.com/p/html5lib/issues/detail?id=138
        if ('namespaceHTMLElements'
                in inspect.getargspec(HTMLParser.__init__)[0]):
            kwargs['namespaceHTMLElements'] = False
        parser = HTMLParser(**kwargs)
    else:
        if treetype == "html5lib-beautifulsoup":
            treetype = "beautifulsoup"
        parser = HTMLParser(tree=treebuilders.getTreeBuilder(treetype))

    return parser.parse(data, encoding=encoding)
def get_html_parse_tree(url, data=None, headers=None, treetype='beautifulsoup'):
    """Request a URL, parse with html5lib, and return a parse tree from it.

    Raises ContentTypeException for non-HTML/XHTML responses; handles
    gzip- and deflate-encoded bodies transparently.
    """
    # BUG FIX: replaced the shared mutable default {} with None + a fresh
    # dict per invocation.
    headers = {} if headers is None else headers
    req = urllib2.Request(iri_to_uri(url), data, headers)
    f = urllib2.urlopen(req)
    try:
        mimetype = f.info().gettype()
        if mimetype not in ('text/html', 'application/xhtml+xml'):
            raise ContentTypeException("Content type isn't HTML, but "
                                       + mimetype)
        data = f.read()
    finally:
        # BUG FIX: the handle previously leaked when f.read() raised;
        # closing in finally covers every path.
        f.close()

    encoding = None
    contentType = f.headers.get('content-type')
    if contentType:
        (mediaType, params) = cgi.parse_header(contentType)
        encoding = params.get('charset')

    compression = f.headers.get('content-encoding')
    if compression:
        compression = compression.lower()
        if compression == "deflate":
            try:
                data = zlib.decompress(data)
            except zlib.error:
                # Raw deflate stream (no zlib wrapper) — common misconfig.
                data = zlib.decompress(data, -zlib.MAX_WBITS)
        elif compression == "gzip":
            gzipper = GzipFile(fileobj=StringIO(data))
            data = gzipper.read()

    if treetype == "beautifulsoup":
        return BeautifulSoup(data, convertEntities=BeautifulSoup.HTML_ENTITIES)

    if treetype == "etree":
        kwargs = {'tree': treebuilders.getTreeBuilder('etree', ElementTree)}
        # http://code.google.com/p/html5lib/issues/detail?id=138
        if ('namespaceHTMLElements'
                in inspect.getargspec(HTMLParser.__init__)[0]):
            kwargs['namespaceHTMLElements'] = False
        parser = HTMLParser(**kwargs)
    else:
        if treetype == "html5lib-beautifulsoup":
            treetype = "beautifulsoup"
        parser = HTMLParser(tree=treebuilders.getTreeBuilder(treetype))

    return parser.parse(data, encoding=encoding)
def translate (self, event, text, src_lang, dest_lang): dest_lang = self.language_code(dest_lang or self.dest_lang) src_lang = self.language_code(src_lang or '') if is_url(text): if urlparse(text).scheme in ('', 'http'): url = iri_to_uri(text) query = {'sl': src_lang, 'tl': dest_lang, 'u': url} event.addresponse(u'http://translate.google.com/translate?' + urlencode(query)) else: event.addresponse(u'I can only translate HTTP pages') return try: translated = self._translate(event, text, src_lang, dest_lang)[0] event.addresponse(translated) except TranslationException, e: event.addresponse(u"I couldn't translate that: %s.", unicode(e))
def translate(self, event, text, src_lang, dest_lang): dest_lang = self.language_code(dest_lang or self.dest_lang) src_lang = self.language_code(src_lang or '') if is_url(text): if urlparse(text).scheme in ('', 'http'): url = iri_to_uri(text) query = {'sl': src_lang, 'tl': dest_lang, 'u': url} event.addresponse(u'http://translate.google.com/translate?' + urlencode(query)) else: event.addresponse(u'I can only translate HTTP pages') return try: translated = self._translate(event, text, src_lang, dest_lang)[0] event.addresponse(translated) except TranslationException, e: event.addresponse(u"I couldn't translate that: %s.", unicode(e))