def strip_url_params3(url, strip=None):
    """Return *url* with selected query parameters removed.

    Parameters named in *strip* are dropped entirely; every surviving
    parameter keeps only its first value, so duplicates like ``a=1&a=2``
    collapse to ``a=1``.  The rest of the URL (scheme, host, path,
    fragment) is left untouched.

    Args:
        url: The URL to clean.
        strip: Iterable of parameter names to remove (default: none).

    Returns:
        The rebuilt URL string.
    """
    blocked = strip or []
    parts = urllib.parse.urlparse(url)
    params = urllib.parse.parse_qs(parts.query)
    # Keep first value only, skipping every blocked name.
    kept = {}
    for name, values in params.items():
        if name not in blocked:
            kept[name] = values[0]
    return parts._replace(query=urllib.parse.urlencode(kept)).geturl()
def _is_valid_proxy(proxy_source):
    """Check if is a valid proxy for a specific Source.

    Fetches the source's test URL through the given HTTP proxy and
    scans the response body for marker strings.

    Args:
        proxy_source: ``(proxy, source)`` pair; *source* must be a key
            of the module-level ``PROXY_MAP``.

    Returns:
        The same ``(proxy, source)`` tuple when the page validates,
        otherwise ``None`` (including on any fetch/decode error —
        deliberate best-effort filtering).
    """
    proxy, source = proxy_source
    handler = urllib.request.ProxyHandler({'http': proxy})
    opener = urllib.request.build_opener(handler)

    test = PROXY_MAP[source]
    url, valid, invalid = test[URL], test[VALID], test[INVALID]

    if source in VHOST:
        # Virtual-host sources: hit the vhost address but present the
        # original hostname via an unredirected Host header.
        parsed = urllib.parse.urlparse(url)
        original_host = parsed.netloc
        request = urllib.request.Request(
            parsed._replace(netloc=VHOST[source]).geturl())
        request.add_unredirected_header('Host', original_host)
    else:
        request = urllib.request.Request(url)

    try:
        response = opener.open(request, timeout=TIMEOUT)
        raw = response.read()
        if response.info().get('Content-Encoding') == 'gzip':
            body = gzip.open(io.BytesIO(raw)).read().decode('utf-8')
        else:
            body = raw.decode('utf-8')
    except Exception:
        # Any failure (connect, timeout, decode) means the proxy is unusable.
        return None

    # All "valid" markers must appear; no "invalid" marker may appear.
    if valid and invalid:
        is_valid = (all(marker in body for marker in valid)
                    and not any(marker in body for marker in invalid))
    elif valid:
        is_valid = all(marker in body for marker in valid)
    elif invalid:
        is_valid = not any(marker in body for marker in invalid)
    else:
        is_valid = False

    return (proxy, source) if is_valid else None