Ejemplo n.º 1
0
 def freeProxyFifth():
     """
     Scrape goubanjia http://www.goubanjia.com/free/gngn/index.shtml

     :return: generator yielding 'ip:port' strings
     """
     url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
     # The site hides junk digits/dots inside display:none nodes as
     # anti-scraping noise, and keeps the port in its own span, so both
     # must be excluded when the IP text is assembled.
     # Loop-invariant: built once instead of once per page.
     xpath_str = """.//*[not(contains(@style, 'display: none'))
                         and not(contains(@style, 'display:none'))
                         and not(contains(@class, 'port'))
                         ]/text()
                 """
     for page in range(1, 10):
         page_url = url.format(page=page)
         tree = getHtmlTree(page_url)
         proxy_list = tree.xpath('//td[@class="ip"]')
         for each_proxy in proxy_list:
             try:
                 # IP digits sit directly under <td> and inside
                 # div/span/p children; the port lives in its own span.
                 ip_addr = ''.join(each_proxy.xpath(xpath_str))
                 port = each_proxy.xpath(
                     ".//span[contains(@class, 'port')]/text()")[0]
                 yield '{}:{}'.format(ip_addr, port)
             except IndexError:
                 # Row without a recognizable port span: skip it.
                 continue
 def freeProxyThird(days=1):
     """
     Scrape ip181 http://www.ip181.com/

     :param days: kept for interface compatibility; not used here
     :return: generator yielding 'ip:port' strings
     """
     page_tree = getHtmlTree('http://www.ip181.com/')
     # The first <tr> is the table header, so it is skipped.
     for row in page_tree.xpath('//tr')[1:]:
         cells = row.xpath('./td/text()')
         yield ':'.join(cells[:2])
 def freeProxyFirst(page=10):
     """
     Scrape data5u http://www.data5u.com/

     :param page: page count — currently unused; a fixed URL list is crawled
     :return: generator yielding 'ip:port' strings
     """
     url_list = [
         'http://www.data5u.com/', 'http://www.data5u.com/free/',
         'http://www.data5u.com/free/gngn/index.shtml',
         'http://www.data5u.com/free/gnpt/index.shtml'
     ]
     for url in url_list:
         html_tree = getHtmlTree(url)
         # Each proxy entry is a <ul class="l2">; its first two <li>
         # text nodes are the IP and the port.
         ul_list = html_tree.xpath('//ul[@class="l2"]')
         for ul in ul_list:
             yield ':'.join(ul.xpath('.//li/text()')[0:2])
 def freeProxyFifth():
     """
     Scrape goubanjia http://www.goubanjia.com/free/gngn/index.shtml

     :return: generator yielding 'ip:port' strings
     """
     url = "http://www.goubanjia.com/free/gngn/index{page}.shtml"
     # This site injects hidden digits inside display:none nodes and
     # keeps the port in a separate span, so joining every text node
     # under the <td> would yield corrupted addresses.  Filter the
     # noise and extract the port explicitly instead.
     xpath_str = """.//*[not(contains(@style, 'display: none'))
                         and not(contains(@style, 'display:none'))
                         and not(contains(@class, 'port'))
                         ]/text()
                 """
     for page in range(1, 10):
         page_url = url.format(page=page)
         tree = getHtmlTree(page_url)
         proxy_list = tree.xpath('//td[@class="ip"]')
         for each_proxy in proxy_list:
             try:
                 ip_addr = ''.join(each_proxy.xpath(xpath_str))
                 port = each_proxy.xpath(
                     ".//span[contains(@class, 'port')]/text()")[0]
                 yield '{}:{}'.format(ip_addr, port)
             except IndexError:
                 # Row without a recognizable port span: skip it.
                 continue
 def freeProxyFourth():
     """
     Scrape xici proxy http://api.xicidaili.com/free2016.txt

     :return: generator yielding 'ip:port' strings
     """
     url_list = [
         'http://www.xicidaili.com/nn',  # high-anonymity proxies
         'http://www.xicidaili.com/nt',  # transparent proxies
     ]
     for each_url in url_list:
         page_tree = getHtmlTree(each_url)
         # Each row of the #ip_list table carries IP and port in its
         # first two <td> cells.
         for row in page_tree.xpath('.//table[@id="ip_list"]//tr'):
             fields = row.xpath('./td/text()')
             yield ':'.join(fields[:2])
Ejemplo n.º 6
0
 def freeProxyFirst(page=10):
     """
     Scrape data5u (wuyou proxy) http://www.data5u.com/

     :param page: page count — kept for interface compatibility; not used
     :return: generator yielding 'ip:port' strings
     """
     url_list = [
         'http://www.data5u.com/', 'http://www.data5u.com/free/',
         'http://www.data5u.com/free/gngn/index.shtml',
         'http://www.data5u.com/free/gnpt/index.shtml'
     ]
     for page_url in url_list:
         tree = getHtmlTree(page_url)
         # Each proxy entry is a <ul class="l2"> whose first two <li>
         # texts are the IP and the port.
         for entry in tree.xpath('//ul[@class="l2"]'):
             try:
                 parts = entry.xpath('.//li/text()')
                 yield ':'.join(parts[:2])
             except Exception:
                 # Best-effort scrape: malformed entries are skipped.
                 pass