def get_http_data(url, header=None, data=None, useragent=FIREFOX_UA, referer=None, cookiejar=None):
    """Fetch *url* over HTTP and return the response body.

    Args:
        url: Address to request.
        header: Optional dict of extra request headers. Entries with a falsy
            value are skipped. Added after the standard headers, so they
            take precedence over ``Referer`` / ``User-Agent``.
        data: Optional request payload. Only sent when truthy.
        useragent: Value for the ``User-Agent`` header (skipped when falsy).
        referer: Value for the ``Referer`` header (skipped when falsy).
        cookiejar: Optional ``CookieJar`` used for the request. A fresh jar
            is created only when this is ``None``.

    Returns:
        The response body. When ``is_py3`` is true the body is decoded as
        UTF-8 if possible and otherwise returned as the undecoded bytes;
        when it is false the value of ``response.read()`` is returned as is.

    Raises:
        SystemExit: With exit status 5 when the request fails (``HTTPError``,
            ``URLError``, ``ValueError``) or the connection is lost while
            reading the response; the reason is logged first.
    """
    # Compare against None explicitly: CookieJar defines __len__, so an empty
    # jar handed in by the caller is falsy and would otherwise be replaced,
    # leaving the caller's jar without the cookies from this request.
    if cookiejar is None:
        cookiejar = CookieJar()

    log.debug("HTTP getting %r", url)
    starttime = time.time()

    opener = build_opener(HTTPCookieProcessor(cookiejar))

    try:
        # The payload goes through the constructor because Request.add_data()
        # no longer exists on Python 3.4+. The Request is built inside the
        # try block because on Python 3 the constructor itself raises
        # ValueError for a URL without a scheme.
        request = Request(url, data if data else None)
        standard_header = {'Referer': referer, 'User-Agent': useragent}
        # Standard headers first, then caller headers so the latter win.
        for headers in (standard_header, header or {}):
            for key, value in headers.items():
                if value:
                    request.add_header(key, value)
        response = opener.open(request)
    except HTTPError as e:
        log.error("Something wrong with that url")
        log.error("Error code: %s", e.code)
        sys.exit(5)
    except URLError as e:
        log.error("Something wrong with that url")
        log.error("Error code: %s", e.reason)
        sys.exit(5)
    except ValueError:
        log.error("Try adding http:// before the url")
        sys.exit(5)

    try:
        raw = response.read()
    except socket.error:
        log.error("Lost the connection to the server")
        sys.exit(5)
    finally:
        # Release the connection even when the read fails.
        response.close()

    body = raw
    if is_py3:
        try:
            body = raw.decode("utf-8")
        except UnicodeDecodeError:
            # Deliberate best effort: a body that is not valid UTF-8 is
            # returned undecoded.
            pass

    spent_time = time.time() - starttime
    # Measure the undecoded payload so the figures are bytes, not characters.
    size = len(raw)
    bps = 8 * size / max(spent_time, 0.001)

    log.debug("HTTP got %d bytes from %r in %.2fs (= %dbps)", size, url, spent_time, bps)
    return body