Example #1
0
 def freeProxyWallThird():
     """proxylistplus.com fresh HTTP list; yields 'ip:port' strings."""
     pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     client = WebRequest()
     for page in ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']:
         resp = client.get(page)
         yield from (':'.join(pair) for pair in pattern.findall(resp.text))
Example #2
0
 def freeProxyWallSecond():
     """proxy-list.org pages 1-9; entries are base64-encoded 'ip:port'."""
     import base64
     client = WebRequest()
     pages = ['https://proxy-list.org/english/index.php?p=%s' %
              n for n in range(1, 10)]
     for page in pages:
         resp = client.get(page)
         for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
             yield base64.b64decode(encoded).decode()
Example #3
0
 def freeProxy12():
     """proxylistplus.com fresh list; yields 'ip:port' (10s timeout per page)."""
     pattern = re.compile(
         r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     client = WebRequest()
     for page in ['https://list.proxylistplus.com/Fresh-HTTP-Proxy-List-1']:
         resp = client.get(page, timeout=10)
         yield from (':'.join(pair) for pair in pattern.findall(resp.text))
Example #4
0
 def getCheckerproxy():
     """
     checkerproxy.net daily archive; yields row['addr'] for rows with
     type == 2 and kind == 2. Best-effort: network/JSON errors are swallowed.
     """
     # fix: the original bound the date string to `str`, shadowing the builtin
     today = datetime.datetime.today().strftime('%Y-%m-%d')
     url = 'https://checkerproxy.net/api/archive/{}'.format(today)
     request = WebRequest()
     try:
         res = request.get(url, timeout=10).json()
         for row in res:
             if row['type'] == 2 and row['kind'] == 2:
                 yield row['addr']
     except Exception:
         # deliberate best-effort source: a failed fetch just yields nothing
         pass
Example #5
0
    def run(self):
        """cn-proxy.com scraper; yields 'ip:port' strings."""
        pattern = re.compile(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>')
        client = WebRequest()
        for page in ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']:
            resp = client.get(page)
            yield from (':'.join(pair) for pair in pattern.findall(resp.text))
Example #6
0
 def freeProxyWallFourth():
     """free-proxy-list.net; parses the first <tbody> and de-duplicates rows."""
     client = WebRequest()
     for page in ['https://free-proxy-list.net']:
         resp = client.get(page)
         doc = etree.HTML(resp.text)
         table_body = doc.xpath('//tbody')[0]
         # first two cells of each row are ip and port; a set removes duplicates
         unique = set(':'.join([row[0].text, row[1].text]) for row in table_body)
         for entry in list(unique):
             yield entry
Example #7
0
    def run(self):
        """www.ip3366.net free list; yields 'ip:port' strings."""
        pattern = re.compile(
            r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
        client = WebRequest()
        for page in ['http://www.ip3366.net/free/']:
            resp = client.get(page)
            for pair in pattern.findall(resp.text):
                yield ":".join(pair)
Example #8
0
 def freeProxySixth():
     """
     Xun proxy http://www.xdaili.cn/
     :return: generator of 'ip:port'
     """
     url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
     request = WebRequest()
     # fix: the other freeProxySixth variants in this file guard the request
     # with try/except; without it a network or JSON error crashes the caller
     try:
         res = request.get(url, timeout=10).json()
         for row in res['RESULT']['rows']:
             yield '{}:{}'.format(row['ip'], row['port'])
     except Exception:
         pass
Example #9
0
 def freeProxy15():
     """xiladaili.com: scrape plain 'ip:port' strings from four sections."""
     pattern = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}")
     client = WebRequest()
     pages = ['http://www.xiladaili.com/putong/',
              "http://www.xiladaili.com/gaoni/",
              "http://www.xiladaili.com/http/",
              "http://www.xiladaili.com/https/"]
     for page in pages:
         resp = client.get(page, timeout=10)
         yield from (hit.strip() for hit in pattern.findall(resp.text))
Example #10
0
    def run(self):
        """proxy.coderbusy.com classical CN list; yields 'ip:port' strings."""
        urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
        request = WebRequest()
        for url in urls:
            r = request.get(url)
            # fix: raw string — the original non-raw pattern's \d escapes
            # raise DeprecationWarning/SyntaxWarning on modern Python
            proxies = re.findall(
                r'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>',
                r.text)
            for proxy in proxies:
                yield ':'.join(proxy)
Example #11
0
 def freeProxyTen():
     """
     Cloud proxy http://www.ip3366.net/free/ (pages 1-3).
     :return: generator of 'ip:port'
     """
     pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     client = WebRequest()
     pages = ['http://www.ip3366.net/free/?stype=1&page={}'.format(str(i)) for i in range(1, 4)]
     for page in pages:
         resp = client.get(page)
         for pair in pattern.findall(resp.text):
             yield ":".join(pair)
Example #12
0
 def freeProxyTen():
     """
     Cloud proxy http://www.ip3366.net/free/
     :return: generator of 'ip:port'
     """
     pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     client = WebRequest()
     for page in ['http://www.ip3366.net/free/']:
         resp = client.get(page)
         yield from (":".join(pair) for pair in pattern.findall(resp.text))
Example #13
0
 def freeProxyWallFirst():
     """
     Outside-the-wall site: cn-proxy.
     :return: generator of 'ip:port'
     """
     pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>')
     client = WebRequest()
     for page in ['http://cn-proxy.com/', 'http://cn-proxy.com/archives/218']:
         resp = client.get(page)
         for pair in pattern.findall(resp.text):
             yield ':'.join(pair)
Example #14
0
    def freeProxyEight():
        """mimiip.com: high-anonymity, anonymous and transparent CN lists."""
        pages = (['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)]     # CN high-anonymity
                 + ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)]    # CN anonymous
                 + ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)])  # CN transparent

        pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>')
        client = WebRequest()
        for page in pages:
            resp = client.get(page)
            yield from (':'.join(pair) for pair in pattern.findall(resp.text))
Example #15
0
 def freeProxyNinth():
     """
     Coder proxy https://proxy.coderbusy.com/
     :return: generator of 'ip:port'
     """
     urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
     request = WebRequest()
     for url in urls:
         r = request.get(url)
         # fix: raw string — the original non-raw pattern's \d escapes
         # raise DeprecationWarning/SyntaxWarning on modern Python
         proxies = re.findall(r'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>', r.text)
         for proxy in proxies:
             yield ':'.join(proxy)
Example #16
0
 def freeProxyWallFirst():
     """
     Outside-the-wall site: cn-proxy.
     :return: generator of 'ip:port'
     """
     client = WebRequest()
     pages = ('http://cn-proxy.com/', 'http://cn-proxy.com/archives/218')
     for page in pages:
         html = client.get(page).text
         for ip, port in re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W]<td>(\d+)</td>', html):
             yield ':'.join((ip, port))
Example #17
0
 def freeProxySecond(proxy_number=100):
     """
     Scrape proxy66 http://www.66ip.cn/
     :param proxy_number: how many proxies to request
     :return: generator of 'ip:port'
     """
     url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
         proxy_number)
     body = WebRequest().get(url).text
     yield from re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', body)
Example #18
0
 def freeProxyTen():
     """
     Cloud proxy http://www.ip3366.net/free/
     :return: generator of 'ip:port'
     """
     client = WebRequest()
     for page in ('http://www.ip3366.net/free/',):
         html = client.get(page).text
         for ip, port in re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>', html):
             yield ":".join([ip, port])
Example #19
0
 def freeProxyNinth():
     """
     Coder proxy https://proxy.coderbusy.com/ — discontinued.
     :return: generator of 'ip:port'
     """
     urls = ['https://proxy.coderbusy.com/classical/country/cn.aspx?page=1']
     request = WebRequest()
     for url in urls:
         r = request.get(url, timeout=10)
         # fix: raw string — the original non-raw pattern's \d escapes
         # raise DeprecationWarning/SyntaxWarning on modern Python
         proxies = re.findall(r'data-ip="(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})".+?>(\d+)</td>', r.text)
         for proxy in proxies:
             yield ':'.join(proxy)
Example #20
0
def getHtmlTree(url, proxy_ip):
    """Fetch *url* and return a BeautifulSoup tree (proxy_ip is currently unused)."""
    # TODO: route the request through proxy_ip once proxy support is wired up
    wr = WebRequest()
    time.sleep(2)  # throttle: 2s delay before every request
    page = wr.get(url, proxies=None).text
    return BeautifulSoup(page, features='lxml')
Example #21
0
    def run(self):
        """proxy-list.org pages 1-9; stops at the first non-200 response."""
        import base64
        client = WebRequest()
        pages = ['https://proxy-list.org/english/index.php?p=%s' % n for n in range(1, 10)]
        for page in pages:
            resp = client.get(page)
            if resp.status_code != 200:
                break
            for encoded in re.findall(r"Proxy\('(.*?)'\)", resp.text):
                yield base64.b64decode(encoded).decode()
 def freeProxySixth():
     """
     Scrape xdaili free proxies http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10
     :return: generator of 'ip:port'
     """
     url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
     client = WebRequest()
     try:
         payload = client.get(url).json()
         for entry in payload['RESULT']['rows']:
             yield '{}:{}'.format(entry['ip'], entry['port'])
     except Exception:
         # best-effort source: a failed fetch or bad payload yields nothing
         pass
Example #23
0
 def freeNordVPN():
     """
     nordvpn.com proxy directory (best-effort; errors are swallowed).
     :return: generator of 'ip:port'
     """
     url = 'https://nordvpn.com/wp-admin/admin-ajax.php?searchParameters%5B0%5D%5Bname%5D=proxy-country&searchParameters%5B0%5D%5Bvalue%5D=&searchParameters%5B1%5D%5Bname%5D=proxy-ports&searchParameters%5B1%5D%5Bvalue%5D=&searchParameters%5B3%5D%5Bname%5D=https&searchParameters%5B3%5D%5Bvalue%5D=on&limit=50000&action=getProxies&offset=0'
     client = WebRequest()
     try:
         for entry in client.get(url, timeout=10).json():
             yield '{}:{}'.format(entry['ip'], entry['port'])
     except Exception:
         pass
Example #24
0
 def freeProxyWallSecond():
     """
     https://proxy-list.org/english/index.php
     :return: generator of decoded 'ip:port' strings
     """
     import base64
     client = WebRequest()
     for n in range(1, 10):
         page = 'https://proxy-list.org/english/index.php?p=%s' % n
         for encoded in re.findall(r"Proxy\('(.*?)'\)", client.get(page).text):
             yield base64.b64decode(encoded).decode()
Example #25
0
 def freeProxyCustumOne():
     """
     31 proxy http://31f.cn/
     :return: generator of 'ip:port'
     """
     resp = WebRequest().get('http://31f.cn/', timeout=10)
     pairs = re.findall(
         r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>',
         resp.text)
     yield from (":".join(pair) for pair in pairs)
Example #26
0
    def myFreeProxy3():
        """Plain-text proxy APIs; yields one raw response line per proxy."""
        sources = [
            "https://api.proxyscrape.com?request=displayproxies&proxytype=http&timeout=7000&country=DE&anonymity=elite",
            "https://www.proxy-list.download/api/v1/get?type=https&anon=elite",
        ]
        client = WebRequest()
        for src in sources:
            resp = client.get(src, timeout=10)
            yield from resp.text.splitlines()
Example #27
0
 def freeProxyTen():
     """ip3366.net free lists (stype 1-4); yields 'ip:port' strings."""
     pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\s\S]*?<td>(\d+)</td>')
     pages = [
         "http://www.ip3366.net/free/?stype=1",
         "http://www.ip3366.net/free/?stype=2",
         "http://www.ip3366.net/free/?stype=3",
         "http://www.ip3366.net/free/?stype=4",
     ]
     client = WebRequest()
     for page in pages:
         resp = client.get(page)
         yield from (":".join(pair) for pair in pattern.findall(resp.text))
Example #28
0
 def freeProxy_shenji():
     """
     Shenji proxy
     http://www.shenjidaili.com/open/
     Yields 'ip:port' strings; prints and swallows any error.
     """
     try:
         url = 'http://www.shenjidaili.com/open/'
         request = WebRequest()
         r = request.get(url, timeout=10)
         # fix: escape the dots — an unescaped '.' matches any character,
         # so malformed cells could slip through as "proxies"
         proxies = re.findall(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})</td>', r.text)
         for proxy in proxies:
             yield proxy
     except Exception as e:
         print(e)
 def freeProxySecond(proxy_number=100):
     """
     Scrape proxy66 http://www.66ip.cn/
     :param proxy_number: how many proxies to request
     :return: generator of 'ip:port'
     """
     url = "http://www.66ip.cn/mo.php?sxb=&tqsl={}&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=".format(
         proxy_number)
     # .text is the decoded response body (.content would be raw bytes)
     body = WebRequest().get(url).text
     yield from re.findall(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}', body)
Example #30
0
 def freeProxySixth():
     """
     Xun proxy http://www.xdaili.cn/ — the site no longer offers free proxies.
     :return: generator of 'ip:port' (usually empty)
     """
     url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=10'
     client = WebRequest()
     try:
         payload = client.get(url, timeout=10).json()
         for entry in payload['RESULT']['rows']:
             yield '{}:{}'.format(entry['ip'], entry['port'])
     except Exception:
         pass
Example #31
0
 def freeProxyCustumTwo(page_count=2):
     """
     Free proxy IP pool http://www.89ip.cn/
     :param page_count: number of index pages to scrape
     :return: generator of 'ip:port'
     """
     # fix: create the WebRequest once, not on every loop iteration
     request = WebRequest()
     for i in range(1, page_count + 1):
         url = 'http://www.89ip.cn/index_{}.html'.format(i)
         r = request.get(url, timeout=10)
         # strip tabs/newlines first so ip and port cells are adjacent
         proxies = re.findall(
             r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td><td>(\d+)</td>',
             re.sub(r'[\t\n]', '', r.text))
         for proxy in proxies:
             yield ":".join(proxy)
Example #32
0
 def freeProxySixth():
     """
     Scrape xdaili free proxies http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=100
     :return: generator of composed proxy entries
     """
     url = 'http://www.xdaili.cn/ipagent/freeip/getFreeIps?page=1&rows=100'
     request = WebRequest()
     try:
         res = request.get(url).json()
         for row in res['RESULT']['rows']:
             yield GetFreeProxy.compose(row['ip'], row['port'],
                                        row['anony'], row['type'])
     except Exception as e:
         # fix: this is a free function — `self` is undefined here, so the
         # original `self.log.warning(...)` raised NameError in the handler
         print("fetch proxy failed: " + str(e))
Example #33
0
 def freeProxyEight():
     """
     89ip http://www.89ip.cn/index.html
     """
     pattern = re.compile(
         r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d+)<br>')
     sources = [
         "http://www.89ip.cn/tqdl.html?api=1&num=30&port=&address=&isp="
     ]
     client = WebRequest()
     for src in sources:
         resp = client.get(src, timeout=10)
         yield from pattern.findall(resp.text)
Example #34
0
 def freeProxyEleven():
     """iphai.com free lists; yields 'ip:port' strings."""
     pattern = re.compile(
         r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>')
     pages = [
         'http://www.iphai.com/free/ng',
         'http://www.iphai.com/free/np',
         'http://www.iphai.com/free/wg',
         'http://www.iphai.com/free/wp'
     ]
     client = WebRequest()
     for page in pages:
         resp = client.get(page)
         yield from (":".join(pair) for pair in pattern.findall(resp.text))
Example #35
0
    def freeProxyEighteen():
        """
        xsdaili.com daily proxy posts: find the newest post id on the front
        page, then walk the last ~100 posts yielding 'ip:port' strings.
        """
        url = "http://www.xsdaili.com/"
        request = WebRequest()
        # fix: re.findall returns strings — convert before doing arithmetic
        # (the original raised TypeError on `max_ - 100`)
        max_ = int(re.findall("class=\"title\".*?<a href=\".*?(\d+).*?\"",
                              request.get(url).text, re.S)[0])
        for i in range(max_ - 100, max_ + 1):
            url = f"http://www.xsdaili.com/dayProxy/ip/{i}.html"
            text = request.get(url).text
            proxy_list = re.findall(r">.*?([\d.]+:\d+).*?<br", text, re.S)
            for proxy in proxy_list:
                # fix: yield each proxy, not the whole list once per item
                yield proxy
Example #36
0
    def freeProxyEight():
        """
        Mimi proxy http://www.mimiip.com (fetched through a proxy).
        """
        pages = (['http://www.mimiip.com/gngao/%s' % n for n in range(1, 10)]     # CN high-anonymity
                 + ['http://www.mimiip.com/gnpu/%s' % n for n in range(1, 10)]    # CN anonymous
                 + ['http://www.mimiip.com/gntou/%s' % n for n in range(1, 10)])  # CN transparent

        pattern = re.compile(r'<td>(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})</td>[\w\W].*<td>(\d+)</td>')
        client = WebRequest()
        for page in pages:
            resp = client.get(page, use_proxy=True)
            yield from (':'.join(pair) for pair in pattern.findall(resp.text))
Example #37
0
    def freeProxyThirteen(page_count=1):
        """
        Feiyi free proxy http://www.feiyiproxy.com/?page_id=1457
        :param page_count: accepted for signature compatibility; unused
        :return: generator of 'ip:port'
        """
        resp = WebRequest().get('http://www.feiyiproxy.com/?page_id=1457', timeout=10)
        pairs = re.findall(
            r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>',
            resp.text)
        yield from (":".join(pair) for pair in pairs)
Example #38
0
 def freeProxy16(max_page=34):
     """
     66 proxy http://www.66ip.cn (area index pages 1..max_page)
     :return: generator of 'ip:port'
     """
     pattern = re.compile(
         r'<td.*?>[\s\S]*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})[\s\S]*?</td>[\s\S]*?<td.*?>[\s\S]*?(\d+)[\s\S]*?</td>')
     client = WebRequest()
     for page in range(1, max_page + 1):
         resp = client.get('http://www.66ip.cn/areaindex_{}/1.html'.format(page), timeout=10)
         yield from (':'.join(pair) for pair in pattern.findall(resp.text))
Example #39
0
 def freeProxyEleven():
     """
     IP sea http://www.iphai.com/free/ng
     :return: generator of 'ip:port'
     """
     pattern = re.compile(
         r'<td>\s*?(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})\s*?</td>[\s\S]*?<td>\s*?(\d+)\s*?</td>')
     client = WebRequest()
     for page in ('http://www.iphai.com/free/ng',
                  'http://www.iphai.com/free/np',
                  'http://www.iphai.com/free/wg',
                  'http://www.iphai.com/free/wp'):
         resp = client.get(page)
         for pair in pattern.findall(resp.text):
             yield ":".join(pair)
def getHtmlTree(url, **kwargs):
    """
    Fetch *url* and return an lxml HTML tree.
    :param url: page to fetch
    :param kwargs: accepted for compatibility; currently unused
    :return: etree.HTML element
    """
    browser_headers = {'Connection': 'keep-alive',
                       'Cache-Control': 'max-age=0',
                       'Upgrade-Insecure-Requests': '1',
                       'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko)',
                       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                       'Accept-Encoding': 'gzip, deflate, sdch',
                       'Accept-Language': 'zh-CN,zh;q=0.8',
                       }
    # TODO: fetch via a proxy server
    wr = WebRequest()
    time.sleep(2)  # throttle: 2s delay before every request
    raw = wr.get(url=url, header=browser_headers).content
    return etree.HTML(raw)