def loadTree(url):
    """Download ``url`` and return its body parsed with ``etree.HTML``.

    A new ``WebRequest`` is created, the call pauses for two seconds, and the
    page is then requested with the client's own ``header`` attribute.

    :param url: address of the page to download
    :return: the result of ``etree.HTML`` applied to the response content
    """
    client = WebRequest()
    # Fixed two-second pause before every request -- presumably throttling.
    time.sleep(2)
    # NOTE(review): the keyword used here is ``headers``; confirm it matches
    # the signature of WebRequest.get.
    response = client.get(url, headers=client.header)
    return etree.HTML(response.content)
def freeProxySecond(proxy_number=100):
    """Scrape proxies from 66ip (http://www.66ip.cn/).

    :param proxy_number: number of proxies to request from the site
    :return: generator yielding every ``"ip:port"`` token found in the
        response body
    """
    url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
        proxy_number)
    request = WebRequest()
    html = request.get(url).content
    if isinstance(html, bytes):
        # A str pattern cannot be applied to a bytes object on Python 3
        # (re raises TypeError). latin-1 maps each byte to one code point, so
        # decoding never fails whatever the page encoding is, and the ASCII
        # "ip:port" tokens come through unchanged.
        html = html.decode('latin-1')
    for proxy in re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', html):
        yield proxy
def run():
    """Validate a hard-coded proxy, then request the target URL through it.

    The JSON body of the response is decoded and its ``origin`` field is
    printed.
    """
    proxy = "27.208.24.164:8060"
    scheme = "http"
    targetUrl = "http://httpbin.org/ip"

    # The return value of validate() is not used here.
    validate(proxy, scheme, targetUrl)

    proxy_url = "{type}://{url}".format(type=scheme, url=proxy)
    client = WebRequest()
    response = client.get(url=targetUrl, proxies={scheme: proxy_url})
    payload = json.loads(response.content)
    print(payload['origin'])
def freeProxySixth():
    """Scrape free proxies from Xdaili (http://www.xdaili.cn/).

    Endpoint:
    http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10

    The scrape is best-effort: a failed request or an unexpected payload is
    logged as a warning and ends the generator instead of raising, and a row
    without usable ``ip``/``port`` entries is logged and skipped.

    :return: generator yielding ``"ip:port"`` strings
    """
    import logging

    log = logging.getLogger(__name__)
    url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
    request = WebRequest()
    try:
        # list() forces a non-iterable "rows" value to fail here, inside the
        # guarded section, rather than later in the loop.
        rows = list(request.get(url).json()['RESULT']['rows'])
    except Exception as e:
        # Broad on purpose: this is the boundary to the network and the JSON
        # decoder, and a scraper failure must not take the caller down.
        log.warning("freeProxySixth: could not load proxy list from %s: %s", url, e)
        return
    for row in rows:
        try:
            proxy = '{}:{}'.format(row['ip'], row['port'])
        except (KeyError, TypeError) as e:
            log.warning("freeProxySixth: skipping malformed row %r: %r", row, e)
            continue
        # Yield outside the try so that errors raised by the consumer are
        # never swallowed by this function.
        yield proxy
def getHtmlTree(url, **kwargs):
    """Fetch ``url`` with browser-like request headers and parse the body.

    :param url: address of the page to download
    :param kwargs: accepted but not used by this function
    :return: the result of ``etree.HTML`` applied to the response content
    """
    browser_headers = {
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    # NOTE(review): the keyword used here is ``header`` (singular); confirm it
    # matches the signature of WebRequest.get.
    response = WebRequest().get(url=url, header=browser_headers)
    return etree.HTML(response.content)
from proxy.ProxyValidator import validate
from util.WebRequest import WebRequest
import logging
import json

# Log record layout: timestamp, level, emitting function name, message.
format = "%(asctime)s - [%(levelname)s] - [%(funcName)s] - %(message)s"
logging.basicConfig(level=logging.INFO, format=format)

# Module-level client; run() below creates and uses its own instance.
wr = WebRequest()


def run():
    """Validate a hard-coded proxy, then request the target URL through it.

    The JSON body of the response is decoded and its ``origin`` field is
    printed.
    """
    proxy = "27.208.24.164:8060"
    scheme = "http"
    targetUrl = "http://httpbin.org/ip"

    # The return value of validate() is not used here.
    validate(proxy, scheme, targetUrl)

    proxy_url = "{type}://{url}".format(type=scheme, url=proxy)
    client = WebRequest()
    response = client.get(url=targetUrl, proxies={scheme: proxy_url})
    payload = json.loads(response.content)
    print(payload['origin'])


if __name__ == '__main__':
    run()