Python getHtmlTree Examples

Programming Language: Python

Namespace/Package Name: utilFunction

Method/Function: getHtmlTree

Examples at hotexamples.com: 8

Python getHtmlTree - 8 examples found. These are the top-rated real-world Python examples of utilFunction.getHtmlTree, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

 def freeProxyThird(days=1):
     """
     Yield "ip:port" strings scraped from ip181 (http://www.ip181.com/).

     The original author noted that this source no longer works.

     :param days: unused by this function; kept so existing callers still work
     :return: generator of "ip:port" strings
     """
     url = 'http://www.ip181.com/'
     html_tree = getHtmlTree(url)
     try:
         # The first <tr> is skipped (presumably the table header row).
         tr_list = html_tree.xpath('//tr')[1:]
         for tr in tr_list:
             cells = tr.xpath('./td/text()')[0:2]
             # A row without both an ip and a port cell would otherwise be
             # emitted as "" or a bare ip, which is not a usable proxy.
             if len(cells) < 2:
                 continue
             yield ':'.join(cells)
     except Exception:
         # Deliberate best-effort: any failure while parsing this source
         # simply ends the generator instead of propagating to the caller.
         return

Example #2

Show file

 def freeProxySeventh(page_count=1):
     """
     Yield "ip:port" strings scraped from kuaidaili (https://www.kuaidaili.com).

     Both free listings (the "inha" and "intr" URLs) are crawled.

     :param page_count: number of pages to fetch from each listing; the
         default of 1 matches the previously hard-coded behaviour
     :return: generator of "ip:port" strings
     """
     url_list = [
         'https://www.kuaidaili.com/free/inha/{page}/',
         'https://www.kuaidaili.com/free/intr/{page}/'
     ]
     for url in url_list:
         for page in range(1, page_count + 1):
             page_url = url.format(page=page)
             tree = getHtmlTree(page_url)
             proxy_list = tree.xpath('.//table//tr')
             # The first row is skipped (presumably the table header).
             for tr in proxy_list[1:]:
                 cells = tr.xpath('./td/text()')[0:2]
                 # Skip rows lacking an ip or port cell rather than
                 # yielding "" or a bare ip.
                 if len(cells) < 2:
                     continue
                 yield ':'.join(cells)

Example #3

Show file

 def freeProxyTwelve(page_count=2):
     """
     Yield "ip:port" strings scraped from http://ip.jiangxianli.com/?page=

     Described by the original author as a free proxy library with a very
     large number of entries.

     :param page_count: number of listing pages to fetch, starting at page 1
     :return: generator of "ip:port" strings
     """
     for i in range(1, page_count + 1):
         url = 'http://ip.jiangxianli.com/?page={}'.format(i)
         html_tree = getHtmlTree(url)
         tr_list = html_tree.xpath(
             "/html/body/div[1]/div/div[1]/div[2]/table/tbody/tr")
         for tr in tr_list:
             # The ip is read from the 2nd cell and the port from the 3rd.
             ip = tr.xpath("./td[2]/text()")
             port = tr.xpath("./td[3]/text()")
             # A row missing either cell used to raise IndexError and abort
             # the whole generator; skip just that row instead.
             if not ip or not port:
                 continue
             yield ip[0] + ":" + port[0]

Example #4

Show file

 def crawl_data5u():
     """
     Yield "ip:port" strings scraped from http://www.data5u.com/.

     The port is not read as text. It is decoded from the second class name
     on the row's second span/li element: each character is replaced by its
     index in 'ABCDEFGHIZ', the indices are concatenated as decimal digits,
     and the resulting integer is shifted right by three bits.

     A row that raises while being parsed is printed and skipped.

     :return: generator of "ip:port" strings
     """
     tree = getHtmlTree('http://www.data5u.com/')
     # NOTE(review): looks like leftover debug output -- confirm before removing.
     print(tree)
     alphabet = 'ABCDEFGHIZ'
     for row in tree.xpath('//ul[@class="l2"]'):
         try:
             ip = row.xpath('./span[1]/li/text()')[0]
             class_attr = row.xpath('./span[2]/li/attribute::class')[0]
             encoded = class_attr.split(' ')[1]
             digits = 0
             for letter in encoded:
                 # Append the next decimal digit.
                 digits = digits * 10 + alphabet.index(letter)
             yield '{}:{}'.format(ip, digits >> 3)
         except Exception as e:
             print(e)

Example #5

Show file

 def freeProxyFirst(page=10):
     """
     Yield "ip:port" strings scraped from data5u (http://www.data5u.com/).

     The original author noted that almost none of these proxies work.

     :param page: number of pages (not used by the body of this function)
     :return: generator of strings built from the first two li texts of
         each matching row
     """
     sources = (
         'http://www.data5u.com/',
         'http://www.data5u.com/free/gngn/index.shtml',
         'http://www.data5u.com/free/gnpt/index.shtml',
     )
     for source in sources:
         rows = getHtmlTree(source).xpath('//ul[@class="l2"]')
         for row in rows:
             try:
                 fields = row.xpath('.//li/text()')
                 yield ':'.join(fields[0:2])
             except Exception as err:
                 # A failing row is reported on stdout and skipped.
                 print(err)

Example #6

Show file

 def freeProxyFourth(page_count=2):
     """
     Yield "ip:port" strings scraped from xicidaili (http://www.xicidaili.com).

     :param page_count: number of pages to fetch from each listing
     :return: generator of strings built from the first two td texts of
         each row
     """
     listings = (
         'http://www.xicidaili.com/nn/',  # high-anonymity proxies
         'http://www.xicidaili.com/nt/',  # transparent proxies
     )
     row_query = './/table[@id="ip_list"]//tr[position()>1]'
     for listing in listings:
         for page_no in range(1, page_count + 1):
             page_tree = getHtmlTree(listing + str(page_no))
             for row in page_tree.xpath(row_query):
                 try:
                     cells = row.xpath('./td/text()')
                     yield ':'.join(cells[0:2])
                 except Exception:
                     # Rows that fail to parse are silently skipped.
                     pass

Example #7

Show file

 def freeProxySecond(area=33, page=1):
     """
     Yield "ip:port" strings scraped from 66ip (http://www.66ip.cn/).

     :param area: number of area index pages to crawl (areaindex_1 up to
         areaindex_<area>); values above 33 are capped at 33. The original
         note says index 1 is Beijing, index 2 is Shanghai, and so on.
     :param page: highest page number to try within each area
     :return: generator of "ip:port" strings
     """
     area = min(area, 33)
     for area_index in range(1, area + 1):
         for i in range(1, page + 1):
             url = "http://www.66ip.cn/areaindex_{}/{}.html".format(
                 area_index, i)
             html_tree = getHtmlTree(url)
             tr_list = html_tree.xpath(
                 "//*[@id='footer']/div/table/tr[position()>1]")
             if not tr_list:
                 continue
             for tr in tr_list:
                 ip = tr.xpath("./td[1]/text()")
                 port = tr.xpath("./td[2]/text()")
                 # A row missing either cell used to raise IndexError and
                 # abort the whole generator; skip just that row instead.
                 if not ip or not port:
                     continue
                 yield ip[0] + ":" + port[0]
             # NOTE(review): only the first non-empty page of each area is
             # consumed, so later pages are fetched only when earlier ones
             # are empty -- confirm this is intended.
             break

Example #8

Show file

 def freeProxyFifth():
     """
     Yield "ip:port" strings scraped from goubanjia (http://www.goubanjia.com/).

     :return: generator of "ip:port" strings
     """
     # The site injects hidden decoy digits and "." characters, so the text
     # of elements such as <p style="display:none;"> must be filtered out.
     # The port element is excluded here too because it is read separately.
     xpath_str = """.//*[not(contains(@style, 'display: none'))
                                     and not(contains(@style, 'display:none'))
                                     and not(contains(@class, 'port'))
                                     ]/text()
                             """
     port_query = ".//span[contains(@class, 'port')]/text()"
     page = getHtmlTree("http://www.goubanjia.com/")
     for cell in page.xpath('//td[@class="ip"]'):
         try:
             # The ":" sits bare under the td while the other pieces live in
             # div/span/p children: assemble the ip first, then get the port.
             visible_parts = cell.xpath(xpath_str)
             ip_addr = ''.join(visible_parts)
             port = cell.xpath(port_query)[0]
             yield '{}:{}'.format(ip_addr, port)
         except Exception:
             # Cells that fail to parse are silently skipped.
             pass