コード例 #1
0
    def freeProxy09(page_count=4):
        """Fetch proxies from the free proxy list at ip.jiangxianli.com.

        Crawls several filtered listing views (by anonymity / protocol)
        for each page index.

        :param page_count: number of pages to crawl per URL pattern
        :return: generator yielding "ip:port" strings
        """
        url_pattern = [
            "http://ip.jiangxianli.com/?anonymity=1&page={}",
            "http://ip.jiangxianli.com/?anonymity=2&page={}",
            "http://ip.jiangxianli.com/?protocol=http&page={}",
            "http://ip.jiangxianli.com/?protocol=https&page={}",
        ]
        # Build the full crawl list: every pattern for every page index.
        url_list = [
            pattern.format(page_index)
            for page_index in range(1, page_count + 1)
            for pattern in url_pattern
        ]
        for url in url_list:
            html_tree = WebRequest().get(url).tree
            # Skip the header row, then join the first two cells (ip, port).
            for index, tr in enumerate(html_tree.xpath("//table//tr")):
                if index == 0:
                    continue
                yield ":".join(tr.xpath("./td/text()")[0:2]).strip()
コード例 #2
0
 def freeProxy01():
     """Fetch proxies from mimvp (https://proxy.mimvp.com/).

     The port column is rendered as an image; known image keys are
     mapped back to port numbers via a lookup table.
     :return: generator yielding "ip:port" strings
     """
     port_img_map = {
         'DMxMjg': '3128',
         'Dgw': '80',
         'DgwODA': '8080',
         'DgwOA': '808',
         'DgwMDA': '8000',
         'Dg4ODg': '8888',
         'DgwODE': '8081',
         'Dk5OTk': '9999'
     }
     pages = [
         'https://proxy.mimvp.com/freeopen',
         'https://proxy.mimvp.com/freeopen?proxy=in_tp'
     ]
     row_xpath = ".//table[@class='mimvp-tbl free-proxylist-tbl']/tbody/tr"
     for page_url in pages:
         tree = WebRequest().get(page_url).tree
         for row in tree.xpath(row_xpath):
             try:
                 ip = ''.join(row.xpath('./td[2]/text()'))
                 # The port key is embedded after "port=" in the image URL.
                 img_key = ''.join(
                     row.xpath('./td[3]/img/@src')).split("port=")[-1]
                 port = port_img_map.get(img_key[14:].replace('O0O', ''))
                 if port:
                     yield '%s:%s' % (ip, port)
             except Exception as e:
                 print(e)
コード例 #3
0
 def freeProxy01():
     """Fetch proxies from data5u (http://www.data5u.com/).

     Hardly any of these proxies still work. The port is obfuscated:
     the second span's CSS class encodes the digits via the alphabet
     'ABCDEFGHIZ', and the decoded number shifted right by 3 is the
     real port.
     :return: generator yielding "ip:port" strings
     """
     alphabet = 'ABCDEFGHIZ'
     pages = [
         'http://www.data5u.com/',
     ]
     for page in pages:
         tree = WebRequest().get(page).tree
         for ul in tree.xpath('//ul[@class="l2"]'):
             try:
                 ip = ul.xpath('./span[1]/li/text()')[0]
                 cls_attr = ul.xpath('./span[2]/li/attribute::class')[0]
                 encoded = cls_attr.split(' ')[1]
                 # Decode the class letters digit by digit, then >> 3.
                 value = 0
                 for ch in encoded:
                     value = value * 10 + alphabet.index(ch)
                 yield '{}:{}'.format(ip, value >> 3)
             except Exception as e:
                 print(e)
コード例 #4
0
 def freeProxy20():
     """Fetch proxies from premproxy.com (requested through MAINPROXY).

     :return: generator yielding Proxy objects (protocol forced to https)
     """
     source = 'premproxy.com'
     urls = [
         'https://premproxy.com/list/ip-port/1.htm',
         'https://premproxy.com/list/ip-port/2.htm',
         'https://premproxy.com/list/ip-port/3.htm',
     ]
     proxies = {'http': MAINPROXY, 'https': MAINPROXY}
     for url in urls:
         tree = WebRequest().get(url, proxies=proxies).tree
         if tree is None:
             # BUGFIX: `return None` here stopped the generator and
             # silently skipped every remaining URL; only this page
             # should be skipped.
             continue
         for r in tree.xpath('//ul[@id="ipportlist"]/li'):
             try:
                 ip = r.xpath('./li/text()')[0][:-1]  # strip trailing ':'
                 port = r.xpath('./li/span/text()')[0]
                 protocol = 'https'
                 yield Proxy(f'{protocol}://{ip}:{port}', source=source)
             except Exception as e:
                 print(type(e), e)
コード例 #5
0
    def freeProxy04():
        """Fetch proxies from goubanjia (http://www.goubanjia.com/).

        The page hides junk digits/dots inside elements styled
        display:none, and the real port is encoded in a span's class
        attribute (alphabet 'ABCDEFGHIZ', decoded value >> 3).
        :return: generator yielding "ip:port" strings
        """
        tree = WebRequest().get("http://www.goubanjia.com/").tree
        # Keep visible text only: drop display:none decoys and the fake
        # port element itself.
        visible_text_xpath = """.//*[not(contains(@style, 'display: none'))
                                        and not(contains(@style, 'display:none'))
                                        and not(contains(@class, 'port'))
                                        ]/text()
                                """

        # The port is encoded letter-by-letter in the class attribute.
        def _decode_port(encoded):
            digits = [str("ABCDEFGHIZ".find(ch)) for ch in encoded]
            return int("".join(digits)) >> 0x3

        for cell in tree.xpath('//td[@class="ip"]'):
            try:
                ip_addr = ''.join(cell.xpath(visible_text_xpath))
                encoded = cell.xpath(
                    ".//span[contains(@class, 'port')]/@class")[0].split()[-1]
                port = _decode_port(encoded.strip())
                yield '{}:{}'.format(ip_addr, int(port))
            except Exception:
                pass
コード例 #6
0
ファイル: proxyFetcher.py プロジェクト: LuWinter/proxy_pool
    def freeProxy04():
        """Fetch proxies from goubanjia (http://www.goubanjia.com/).

        The displayed port is a random decoy; the real port is encoded
        in the span's class attribute, e.g. <span class="port CFACE">.
        Each letter is a digit in the alphabet 'ABCDEFGHIZ', and the
        decoded number divided by 8 is the true port (CFACE -> 3128).
        :return: generator yielding "ip:port" strings
        """
        url = "http://www.goubanjia.com/"
        tree = WebRequest().get(url).tree
        proxy_list = tree.xpath('//td[@class="ip"]')
        # The page mixes in hidden digits/dots; keep only visible text
        # and skip the fake port element.
        xpath_str = """.//*[not(contains(@style, 'display: none'))
                                        and not(contains(@style, 'display:none'))
                                        and not(contains(@class, 'port'))
                                        ]/text()
                                """
        for each_proxy in proxy_list:
            try:
                ip_addr = ''.join(each_proxy.xpath(xpath_str))
                encoded = each_proxy.xpath(
                    ".//span[contains(@class, 'port')]/attribute::class"
                )[0].replace("port ", "")
                port = 0
                for letter in encoded:
                    # BUGFIX: the site's digit alphabet is 'ABCDEFGHIZ'
                    # (Z -> 9); `ord(letter) - ord('A')` mapped Z to 25
                    # and corrupted any port containing the digit 9.
                    port = port * 10 + "ABCDEFGHIZ".find(letter)
                port //= 8  # integer division, avoids a float intermediate
                yield '{}:{}'.format(ip_addr, int(port))
            except Exception:
                pass
コード例 #7
0
 def freeProxy01():
     """Fetch proxies from mimvp (https://proxy.mimvp.com/).

     The port is rendered as an image; each image is cached locally
     and OCR'd with pytesseract to recover the port number.
     :return: generator yielding "ip:port" strings
     """
     pages = [
         'https://proxy.mimvp.com/freeopen?proxy=in_hp',
         'https://proxy.mimvp.com/freeopen?proxy=out_hp'
     ]
     for page in pages:
         tree = WebRequest().get(page).tree
         rows = tree.xpath(
             ".//table[@class='mimvp-tbl free-proxylist-tbl']/tbody/tr")
         for row in rows:
             try:
                 ip = ''.join(row.xpath('./td[2]/text()'))
                 img_url = 'https://proxy.mimvp.com' + (''.join(
                     row.xpath('./td[3]/img/@src')))
                 img_name = img_url.split('port=')[-1]
                 img_path = os.path.abspath(
                     os.path.join(
                         os.path.dirname(__file__),
                         "../cache/freeopen_port_image/%s.png" % img_name))
                 # Download the port image only once per unique name.
                 if not os.path.exists(img_path):
                     urllib.request.urlretrieve(img_url, filename=img_path)
                 port = pytesseract.image_to_string(img_path)
                 if port:
                     yield '%s:%s' % (ip, port)
             except Exception as e:
                 print(e)
コード例 #8
0
 def freeProxy04():
     """Fetch proxies from dieniao (https://www.dieniao.com/FreeProxy.html)."""
     page = "https://www.dieniao.com/FreeProxy.html"
     tree = WebRequest().get(page, verify=False).tree
     rows = tree.xpath("//div[@class='free-main col-lg-12 col-md-12 col-sm-12 col-xs-12']/ul/li")
     # The first <li> is the header row.
     for row in rows[1:]:
         host = "".join(row.xpath('./span[1]/text()')).strip()
         port = "".join(row.xpath('./span[2]/text()')).strip()
         yield "%s:%s" % (host, port)
コード例 #9
0
 def freeProxy06():
     """Fetch HK proxies from PROXY11 (https://proxy11.com)."""
     api = "https://proxy11.com/api/demoweb/proxy.json?country=hk&speed=2000"
     try:
         payload = WebRequest().get(api).json
         for entry in payload.get("data", []):
             yield "%s:%s" % (entry.get("ip", ""), entry.get("port", ""))
     except Exception as e:
         print(e)
コード例 #10
0
 def freeProxy03():
     """Fetch proxies from kxdaili (http://www.kxdaili.com/)."""
     pages = ["http://www.kxdaili.com/dailiip.html",
              "http://www.kxdaili.com/dailiip/2/1.html"]
     for page in pages:
         tree = WebRequest().get(page).tree
         # Skip the table header row.
         for row in tree.xpath("//table[@class='active']//tr")[1:]:
             host = "".join(row.xpath('./td[1]/text()')).strip()
             port = "".join(row.xpath('./td[2]/text()')).strip()
             yield "%s:%s" % (host, port)
コード例 #11
0
 def freeProxy09(page_count=1):
     """Fetch Chinese proxies from ip.jiangxianli.com.

     :param page_count: number of listing pages to crawl
     """
     base = 'http://ip.jiangxianli.com/?country=中国&page={}'
     for page in range(1, page_count + 1):
         tree = WebRequest().get(base.format(page)).tree
         # Drop the header row, then join the first two cells (ip, port).
         for row in tree.xpath("//table//tr")[1:]:
             yield ":".join(row.xpath("./td/text()")[0:2]).strip()
コード例 #12
0
 def freeProxy12():
     """Fetch proxies from proxylistplus by regex over the raw HTML."""
     pattern = re.compile(
         r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     session = WebRequest()
     for page in ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']:
         resp = session.get(page, timeout=10)
         for host, port in pattern.findall(resp.text):
             yield host + ':' + port
コード例 #13
0
 def freeProxy02():
     """Fetch proxies from 66ip (http://www.66ip.cn/)."""
     tree = WebRequest().get("http://www.66ip.cn/", timeout=10).tree
     # The proxy data lives in the third <table>; skip its header row.
     for row in tree.xpath("(//table)[3]//tr")[1:]:
         host = "".join(row.xpath("./td[1]/text()")).strip()
         port = "".join(row.xpath("./td[2]/text()")).strip()
         yield "%s:%s" % (host, port)
コード例 #14
0
 def freeProxy06():
     """Fetch CN proxies from FateZero (http://proxylist.fatezero.org/).

     The endpoint returns one JSON document per line.
     """
     url = "http://proxylist.fatezero.org/proxy.list"
     try:
         body = WebRequest().get(url).text
         for line in body.split("\n"):
             info = json.loads(line)
             if info.get("country") == "CN":
                 yield "%s:%s" % (info.get("host", ""), info.get("port", ""))
     except Exception as e:
         print(e)
コード例 #15
0
 def freeProxy04():
     """Fetch proxies from shenjidaili (http://www.shenjidaili.com/)."""
     url = "http://www.shenjidaili.com/product/open/"
     tree = WebRequest().get(url).tree
     for table in tree.xpath("//table[@class='table table-hover text-white text-center table-borderless']"):
         # Skip the header row; the proxy string sits in the first cell.
         for row in table.xpath("./tr")[1:]:
             yield ''.join(row.xpath("./td[1]/text()")).strip()
コード例 #16
0
ファイル: proxyFetcher.py プロジェクト: LuWinter/proxy_pool
 def freeProxy06():
     """Fetch proxies from coderbusy (https://proxy.coderbusy.com/)."""
     for page in ['https://proxy.coderbusy.com/']:
         tree = WebRequest().get(page).tree
         rows = tree.xpath('.//table//tr')
         # Skip the header; join the first two cells (ip, port).
         for row in rows[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
コード例 #17
0
 def freeProxy10():
     """Fetch proxies from cn-proxy (site blocked in mainland China).

     :return: generator yielding "ip:port" strings
     """
     urls = ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']
     request = WebRequest()
     for url in urls:
         r = request.get(url, timeout=10)
         # BUGFIX: the original pattern used a bare `[\w\W]` (exactly one
         # character) between the ip and port cells, so any row with more
         # than one character of markup between them never matched. Use a
         # non-greedy `[\w\W]*?`, matching the sibling fetcher's pattern.
         proxies = re.findall(
             r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]*?<td>(\d+)</td>',
             r.text)
         for proxy in proxies:
             yield ':'.join(proxy)
コード例 #18
0
 def freeProxy05():
     """Fetch proxies from kuaidaili (https://www.kuaidaili.com)."""
     for page in ['https://www.kuaidaili.com/free/inha/',
                  'https://www.kuaidaili.com/free/intr/']:
         tree = WebRequest().get(page).tree
         rows = tree.xpath('.//table//tr')
         # Throttle: without this pause the second page returns no data.
         sleep(1)
         for row in rows[1:]:
             yield ':'.join(row.xpath('./td/text()')[0:2])
コード例 #19
0
 def freeProxy06():
     """Fetch CN proxies from coderbusy (https://proxy.coderbusy.com/).

     :return: generator yielding "ip:port" strings
     """
     urls = ['https://proxy.coderbusy.com/zh-hans/ops/country/cn.html']
     for url in urls:
         tree = WebRequest().get(url).tree
         proxy_list = tree.xpath('.//table//tr')
         for tr in proxy_list[1:]:
             ip = "".join(tr.xpath("./td[1]/text()")).strip()
             port = "".join(tr.xpath("./td[2]//text()")).strip()
             # BUGFIX: the original tested `if proxy:` on the already
             # formatted "ip:port" string, which always contains ':' and
             # is therefore never falsy; validate the parts instead so
             # rows with a missing ip or port are skipped.
             if ip and port:
                 yield '{}:{}'.format(ip, port)
コード例 #20
0
 def freeProxy06(page=2):
     """Fetch proxies from superfastip (https://www.superfastip.com/).

     :param page: number of API pages to request
     """
     api_tpl = "https://api.superfastip.com/ip/freeip?page={page}"
     for page_no in range(1, page + 1):
         try:
             payload = WebRequest().get(api_tpl.format(page=page_no)).json
             for item in payload.get("freeips", []):
                 yield "%s:%s" % (item.get("ip", ""), item.get("port", ""))
         except Exception as e:
             print(e)
コード例 #21
0
 def freeProxy06():
     """Fetch proxies from coderbusy, yielding full http:// URLs."""
     for page in ['https://proxy.coderbusy.com/']:
         tree = WebRequest().get(page).tree
         rows = tree.xpath('.//table//tr')
         for row in rows[1:]:
             try:
                 host = row.xpath('./td[1]/text()')[0]
                 port = row.xpath('./td[2]/a/text()')[0]
             except IndexError:
                 # Some rows (ads/separators) lack the expected cells.
                 continue
             yield 'http://' + host + ':' + port
コード例 #22
0
 def freeProxy26():
     """Fetch proxies from cool-proxy (http://cool-proxy.net/).

     :return: generator yielding "ip:port" strings
     """
     urls = ['http://cool-proxy.net/proxies.json']
     for url in urls:
         r = WebRequest().get(url, timeout=10)
         # CONSISTENCY FIX: every other fetcher in this module reads
         # `.json` as a property of the WebRequest response (it is not
         # callable); `r.json()` here would raise TypeError.
         proxy_json = r.json
         for entry in proxy_json:
             yield (entry.get('ip') + ':' + str(entry.get('port'))).strip()
コード例 #23
0
 def freeProxy09(page_count=10):
     """Fetch CN proxies from ip.jiangxianli.com, page by page.

     Stops early as soon as a page has no data rows.
     :param page_count: maximum number of pages to crawl
     """
     for page in range(1, page_count + 1):
         url = 'http://ip.jiangxianli.com/?country=中国&page={}'.format(page)
         rows = WebRequest().get(url).tree.xpath("//table//tr")
         # Only the header row left -> no more data; stop paging.
         if len(rows) <= 1:
             break
         for row in rows[1:]:
             yield ":".join(row.xpath("./td/text()")[0:2]).strip()
         sleep(1)  # be polite between page requests
コード例 #24
0
    def freeProxy04():
        """Fetch CN proxies from FreeProxyList (https://www.freeproxylists.net/zh/).

        The IP is hidden inside a URL-encoded <script> blob; it is
        decoded and the dotted quad extracted with a regex.
        """
        url = "https://www.freeproxylists.net/zh/?c=CN&pt=&pr=&a%5B%5D=0&a%5B%5D=1&a%5B%5D=2&u=50"
        tree = WebRequest().get(url, verify=False).tree
        from urllib import parse

        def _extract_ip(raw):
            # Unquote the script text, then grab the first IPv4 literal.
            matches = re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}',
                                 parse.unquote(raw))
            return matches[0] if matches else None

        rows = tree.xpath("//tr[@class='Odd']") + tree.xpath("//tr[@class='Even']")
        for row in rows:
            ip = _extract_ip("".join(row.xpath('./td[1]/script/text()')).strip())
            port = "".join(row.xpath('./td[2]/text()')).strip()
            if ip:
                yield "%s:%s" % (ip, port)
コード例 #25
0
 def freeProxy30():
     """Fetch proxies from xroxy (http://www.xroxy.com/); VPN needed.

     NOTE(review): relies on a module-level `proxies` mapping for the
     outbound request — confirm it is defined at import time.
     :return: generator yielding "ip:port" strings
     """
     urls = [
         'http://www.xroxy.com/proxylist.php?port=&type=&ssl=&country=&latency=&reliability=&'
         'sort=reliability&desc=true&pnum=%s#table' % i for i in range(20)
     ]
     for url in urls:
         r = WebRequest().get(url, timeout=10, proxies=proxies)
         html = r.tree
         # Data rows alternate between the 'row1' and 'row0' CSS classes.
         infos = []
         for row in html.xpath('//tr'):
             infos += row.cssselect('.row1') + row.cssselect('.row0')
         for info in infos:
             # Cleanup: dropped the unused `proxy_type` read and the
             # intermediate `ips` list; yield each entry directly.
             anchors = info.cssselect('a')
             proxy_ip = anchors[0].text.replace('\n', '').replace('\r', '')
             proxy_port = anchors[1].text
             yield (proxy_ip + ':' + proxy_port).strip()
コード例 #26
0
 def freeProxy34():
     """Fetch proxies from proxy-list.org; VPN needed.

     Each entry is emitted as a base64 string wrapped in a
     Proxy('...') script call; the wrapper is stripped and the payload
     decoded.
     :return: generator yielding "ip:port" strings
     """
     # FIX: the docstring must be the first statement of the function;
     # the original placed `import base64` before it, demoting the
     # docstring to a no-op string expression.
     import base64
     urls = [
         'https://proxy-list.org/english/index.php?p=%s' % i
         for i in range(1, 11)
     ]
     for url in urls:
         r = WebRequest().get(url, timeout=10, proxies=proxies)
         html = etree.HTML(
             r.text.replace('<script type="text/javascript">Proxy(\'',
                            '').replace('\')</script>', ''))
         infos = html.xpath('//*[@id="proxy-table"]/div[2]/div')[0].xpath(
             'ul')
         for info in infos:
             encoded = info.cssselect('li')[0].text
             yield base64.b64decode(encoded).decode('ascii').strip()
コード例 #27
0
 def freeProxy39():
     """Fetch proxies from proxynova (https://www.proxynova.com/); VPN needed.

     The IP cell is emitted by a document.write() script; the script
     wrapper is stripped before parsing the table rows.
     :return: generator yielding "ip:port" strings
     """
     urls = [
         'https://www.proxynova.com/proxy-server-list/country-cn/',
         'https://www.proxynova.com/proxy-server-list/'
     ]
     for url in urls:
         r = WebRequest().get(url, timeout=10, proxies=proxies)
         # Cleanup: removed the leftover debug `print(len(infos))` and
         # stray `pass` statements from the original.
         rows = etree.HTML(
             r.text.replace('<script>document.write(\'',
                            '').replace('\');</script>',
                                        '').replace('\n',
                                                    '')).cssselect('tr')
         rows.pop(0)  # drop the header row
         for row in rows:
             cells = row.cssselect('td')
             if len(cells) < 3:
                 continue
             proxy_ip = cells[0].cssselect('abbr')[0].text.replace(' ', '')
             proxy_port = cells[1].text.replace(' ', '')
             yield "{0}:{1}".format(proxy_ip, proxy_port).strip()
コード例 #28
0
 def freeProxy16():
     """Fetch SOCKS5 proxies from free-proxy.cz (requested via MAINPROXY).

     The IP is base64-encoded inside a small <script> on each row;
     it is pulled out of the quoted literal and decoded.
     :return: generator yielding Proxy objects
     """
     proxies = {'http': MAINPROXY, 'https': MAINPROXY}
     source = 'free-proxy.cz'
     urls = [
         'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all',
         'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/2',
         'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/3',
         'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/4',
         'http://free-proxy.cz/en/proxylist/country/all/socks5/ping/all/5',
     ]
     for url in urls:
         r = WebRequest().get(url, proxies=proxies)
         if r.response.status_code != 200:
             continue  # skip pages that failed to load
         for tr in r.tree.xpath('//table[@id="proxy_list"]//tr')[1:]:
             try:
                 ip_script = tr.xpath('./td[1]/script/text()')[0]
                 # FIX: raw string for the regex — the plain literal
                 # relied on '\w' being passed through as an invalid
                 # escape (a DeprecationWarning, future SyntaxError).
                 ip_base64 = re.search(r'(?:")([\w=]+)(?:")',
                                       ip_script).groups()[0]
                 ip = base64.b64decode(ip_base64).decode('utf8')
                 port = tr.xpath('./td[2]/span/text()')[0]
                 protocol = ''.join(tr.xpath('./td[3]/small/text()'))
                 yield Proxy(f'{protocol}://{ip}:{port}', source=source)
             except Exception as e:
                 print(e)
コード例 #29
0
 def freeProxy11():
     """Fetch proxies from proxy-list.org.

     Entries appear in the page source as Proxy('<base64>') script
     calls; each payload is extracted and decoded.
     """
     import base64
     request = WebRequest()
     for n in range(1, 10):
         page = 'https://proxy-list.org/english/index.php?p=%s' % n
         resp = request.get(page, timeout=10)
         for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
             yield base64.b64decode(encoded).decode()
コード例 #30
0
 def freeProxy16():
     """Fetch proxies from the fate0/proxylist mirror on staticdn.

     :return: generator yielding "ip:port" strings
     """
     url = 'http://raw.staticdn.net/fate0/proxylist/master/proxy.list'
     r = WebRequest().get(url, timeout=10)
     # Each line is one JSON document; the final line is empty, drop it.
     for line in r.text.split('\n')[:-1]:
         info = json.loads(line, strict=False)
         # BUGFIX: the original concatenated host and port with no ':'
         # separator, yielding e.g. "1.2.3.48080" instead of
         # "1.2.3.4:8080" as every other fetcher in this module does.
         yield "%s:%s" % (info['host'], info['port'])