Example #1
    async def proxy_site_66ip():
        try:
            # Iterate over proxies for every region nationwide
            for i in range(0, 35):
                if i == 0:
                    url = 'http://www.66ip.cn/'
                else:
                    url = 'http://www.66ip.cn/areaindex_{area}/'.format(area=i)
                # Fetch the first 5 pages of IPs
                for j in range(1, 6):
                    url_page = url + '{page}.html'.format(page=j)
                    async with aiohttp.ClientSession() as session:
                        async with session.get(url=url_page,
                                               headers=HEADERS) as response:
                            r = await response.text()

                    if r:
                        doc = PyQuery(r)
                        for each in doc('tr').items():
                            ip_address = each('td').eq(0).text()
                            pattern = re.compile(
                                r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                            match = pattern.match(ip_address)
                            if match:
                                ip_port = each('td').eq(1).text()
                                # Not async yet; to be resolved
                                await SpiderProxy.verify_and_save(
                                    ip_address + ':' + ip_port, '66ip.cn')
        except Exception as e:
            Util.log_error(e)
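The "Not async yet" comments above mark the verification step as the bottleneck: each proxy is verified before the next page is fetched. A minimal sketch of one way to run the verifications concurrently with asyncio.gather, assuming verify_and_save is the SpiderProxy coroutine shown in Example #9 (fetch_page and verify_many are illustrative names, not part of the original code):

    import asyncio

    import aiohttp

    async def fetch_page(url, headers):
        # One short-lived session per request, mirroring the examples here
        async with aiohttp.ClientSession() as session:
            async with session.get(url, headers=headers) as response:
                return await response.text()

    async def verify_many(proxies, source):
        # Schedule every verification at once instead of awaiting one by one
        tasks = [SpiderProxy.verify_and_save(p, source) for p in proxies]
        await asyncio.gather(*tasks, return_exceptions=True)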
Example #2
    async def proxy_site_kuaidaili():
        try:
            # Copy HEADERS so the shared dict is not mutated
            header = dict(HEADERS)
            header['Cookie'] = '_ydclearance=b673c31fc021172e1d425859-915a-4546-bd2d-d88b08d45818-1491886397'
            for i in range(1, 6):
                url = 'http://www.kuaidaili.com/proxylist/{page}/'.format(
                    page=i)
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url,
                                           headers=header) as response:
                        r = await response.text()

                if r:
                    doc = PyQuery(r)
                    for each in doc('#index_free_list > table > tbody')(
                            'tr').items():
                        ip_address = each('td[data-title=IP]').eq(0).text()
                        ip_port = each('td[data-title=PORT]').eq(0).text()
                        # Not async yet; to be resolved
                        await SpiderProxy.verify_and_save(
                            ip_address + ':' + ip_port, 'kuaidaili.com')
        except Exception as e:
            Util.log_error(e)
Example #3
    async def proxy_site_xici():
        try:
            # High-anonymity, anonymous, HTTPS, and HTTP lists
            forms = ['nn', 'nt', 'wn', 'wt']
            for form in forms:
                # First 10 pages of each type
                for page in range(1, 11):
                    url = 'http://www.xicidaili.com/' + form + '/' + str(page)
                    header = HEADERS
                    async with aiohttp.ClientSession() as session:
                        async with session.get(url=url,
                                               headers=header) as response:
                            r = await response.text()

                    if r:
                        doc = PyQuery(r)
                        for each in doc('tr').items():
                            ip_address = each('td').eq(1).text()
                            pattern = re.compile(
                                r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                            match = pattern.match(ip_address)
                            if match:
                                ip_port = each('td').eq(2).text()
                                # Not async yet; to be resolved
                                await SpiderProxy.verify_and_save(
                                    ip_address + ':' + ip_port,
                                    'xicidaili.com')
        except Exception as e:
            Util.log_error(e)
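A small note on the pattern above: the IP regex is recompiled for every table row. A sketch of hoisting it to module scope so all spiders share one compiled pattern (IP_RE and looks_like_ip are illustrative names):

    import re

    # Compiled once at import time and reused across spiders
    IP_RE = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')

    def looks_like_ip(text):
        # True when the text starts with a dotted-quad address
        return IP_RE.match(text) is not None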
    async def proxy_site_goubanjia():
        try:
            for index in range(1, 6):
                url = 'http://www.goubanjia.com/free/index{0}.shtml'.format(
                    index)
                header = HEADERS
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url,
                                           headers=header) as response:
                        r = await response.text()

                if r:
                    doc = PyQuery(r)
                    for each in doc('td.ip').items():
                        address = ''
                        for i, tag in enumerate(each('*').items()):
                            if i == 0:
                                continue
                            # Skip decoy tags hidden with display:none and empty tags
                            if (tag.attr.style and 'none;'
                                    in tag.attr.style) or '' == tag.text():
                                continue
                            if 'port' in str(tag):
                                address = address + ':' + tag.text()
                            else:
                                address += tag.text()
                        pattern = re.compile(
                            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5}')
                        match = pattern.match(address)
                        if match:
                            # Not async yet; to be resolved
                            await VerifySave.verify_and_save(
                                address, 'goubanjia.com')
        except Exception as e:
            Util.log_error('proxy_site_goubanjia: ' + str(e))
    def execute(self, sql):
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(sql)
                self.connection.commit()
        except Exception as e:
            Util.log_error(e)
        finally:
            self.connection.close()

    def query_one(self, sql):
        try:
            with self.connection.cursor() as cursor:
                cursor.execute(sql)
                return cursor.fetchone()
        except Exception as e:
            Util.log_error(e)
            return None
        finally:
            self.connection.close()
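Since the finally clause closes the connection, each MySql instance serves exactly one call; that is why checking below constructs a fresh instance for every query. A short usage sketch under that assumption, with the connection arguments from the source (the id value is illustrative):

    db = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET)
    row = db.query_one('SELECT * FROM httpbin LIMIT 1')  # connection closes here

    # A second call needs a new instance
    db = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET)
    db.execute("UPDATE httpbin SET alive='0' WHERE id=1")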
    def checking(self, sql):
        try:
            fetchall = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET).query(sql)
            if fetchall is None:
                # If nothing was found, re-verify the most recently
                # verified IPs first (keeps live IPs highly available)
                sql = 'SELECT * FROM httpbin WHERE anonymity=2 AND alive=1 ORDER BY update_time DESC'
                fetchall = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET).query(sql)
            if fetchall is None:
                # Sleep for 5 seconds, then return
                time.sleep(5)
                return
            for each in fetchall:
                each_id = each[0]
                proxy = each[1] + ':' + each[2]
                verify_count = int(each[12]) + 1
                r = VerifyProxy().validate_proxy(proxy, protocol='http', timeout=3)
                # Use the lock as a context manager so it is released
                # even if an exception is raised mid-update
                with threadLock:
                    if isinstance(r, str):
                        # Verification failed
                        sql = 'SELECT leave_count FROM httpbin WHERE id={0}'.format(each_id)
                        fetchone = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET).query_one(sql)
                        alive = '0'
                        leave_count = int(fetchone[0]) - 1
                        if leave_count <= 0:
                            # Delete the proxy after 20 failed attempts
                            sql = 'DELETE FROM httpbin WHERE id={0}'.format(each_id)
                            Util.log_error(sql)
                        else:
                            # Update the counters
                            sql = "UPDATE httpbin SET verify_count={0}, leave_count={1}, alive={2} WHERE id={3}" \
                                .format(verify_count, leave_count, alive, each_id)
                            # Util.log_info(sql)
                        MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET).execute(sql)
                    elif isinstance(r, dict):
                        if 'exception' not in r.keys():
                            alive = '1'
                            leave_count = 20
                            speed = r['timedelta']
                            # Skip re-verifying HTTPS support for now
                            # result_https = VerifyProxy().validate_proxy(proxy, 'https', timeout=3)
                            sql = "UPDATE httpbin SET verify_count={0}, speed={1}, alive={2}, leave_count={3} WHERE id={4}" \
                                .format(verify_count, speed, alive, leave_count, each_id)
                            # Util.log_info(sql)
                            MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE, DB_CHARSET).execute(sql)

        except Exception as e:
            Util.log_error(e)
Example #8
    async def proxy_site_66ip_api():
        try:
            url = 'http://m.66ip.cn/mo.php?tqsl=3000'
            async with aiohttp.ClientSession() as session:
                async with session.get(url=url, headers=HEADERS) as response:
                    r = await response.text()

            if r:
                pattern = re.compile(
                    r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{1,5})<br />')
                match = pattern.findall(r)
                if match:
                    for each in match:
                        # Not async yet; to be resolved
                        await SpiderProxy.verify_and_save(each, 'm.66ip.cn')
        except Exception as e:
            Util.log_error(e)
Example #9
    async def verify_and_save(proxy, source):
        try:
            r = VerifyProxy().validate_proxy(proxy, protocol='http', timeout=3)
            if isinstance(r, str):
                r = json.loads(r)
            if isinstance(r, dict):
                if 'exception' not in r.keys():
                    https = '0'
                    ip = proxy.split(':')[0]
                    port = proxy.split(':')[1]

                    speed = r['timedelta']
                    origin = r['origin']
                    if origin == ip:
                        # Elite (high anonymity)
                        anonymity = '2'
                    else:
                        # Transparent
                        anonymity = '0'
                    country = VerifyProxy().country_proxy(ip)
                    result_https = VerifyProxy().validate_proxy(proxy,
                                                                'https',
                                                                timeout=3)
                    if not isinstance(result_https, str):
                        https = '1'

                    # 1. Check whether this IP and port already exist
                    sql = "SELECT * FROM httpbin WHERE ip = '{0}' AND port = '{1}'".format(
                        ip, port)
                    fetchone = MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                                     DB_CHARSET).query_one(sql)
                    if fetchone is None:
                        # 2. Insert the IP if it is not already in the database
                        sql = "INSERT INTO httpbin(ip, port, https, anonymity, country, speed, source, " \
                              "insert_time) VALUES ('{0}', '{1}', '{2}', '{3}', '{4}', '{5}', '{6}', '{7}')" \
                            .format(ip, port, https, anonymity, country, speed, source, Util.get_current_time())
                        Util.log_info('Save Proxy ' + proxy + ' From ' +
                                      source)
                        MySql(DB_ADDRESS, DB_USER, DB_PASS, DB_DATABASE,
                              DB_CHARSET).execute(sql)

        except Exception as e:
            # Verification failures are common here, so errors are swallowed
            # Util.log_error(e)
            pass
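The INSERT above splices values in with str.format, which breaks on stray quotes and is open to SQL injection. A hedged sketch of the same statement with parameter placeholders, assuming the pymysql-style connection that the MySql helper's "with connection.cursor()" pattern suggests:

    sql = ('INSERT INTO httpbin(ip, port, https, anonymity, country, speed, '
           'source, insert_time) VALUES (%s, %s, %s, %s, %s, %s, %s, %s)')
    with connection.cursor() as cursor:
        # The driver escapes each value, so no manual quoting is needed
        cursor.execute(sql, (ip, port, https, anonymity, country, speed,
                             source, Util.get_current_time()))
        connection.commit()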
Example #10
    async def proxy_site_ip181():
        try:
            url = 'http://www.ip181.com'
            # Copy HEADERS so the shared dict is not mutated
            header = dict(HEADERS)
            header['Host'] = 'www.ip181.com'
            async with aiohttp.ClientSession() as session:
                async with session.get(url=url, headers=header) as response:
                    r = await response.text(encoding='gb2312')

            if r:
                doc = PyQuery(r)
                for each in doc('tr').items():
                    ip_address = each('td').eq(0).text()
                    pattern = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                    match = pattern.match(ip_address)
                    if match:
                        ip_port = each('td').eq(1).text()
                        # Not async yet; to be resolved
                        await SpiderProxy.verify_and_save(
                            ip_address + ':' + ip_port, 'ip181.com')
        except Exception as e:
            Util.log_error(e)
    async def proxy_site_mimvp():
        # Domestic transparent/elite and foreign transparent/elite lists
        start_urls = ['in_tp', 'in_hp', 'out_tp', 'out_hp']
        try:
            for url in start_urls:
                url = 'http://proxy.mimvp.com/free.php?proxy=' + url
                header = HEADERS
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url,
                                           headers=header) as response:
                        r = await response.text()

                if r:
                    r = r.replace('<tbody>',
                                  '<tbody><tr>').replace('</tr>', '</tr><tr>')
                    doc = PyQuery(r)
                    for each in doc('tbody')('tr').items():
                        address = each('td').eq(1).text()
                        pattern = re.compile(
                            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
                        match = pattern.match(address)
                        if match:
                            port_img_url = each('td').eq(2)('img').attr.src
                            if 'common' in port_img_url:
                                port_img_url = 'http://proxy.mimvp.com/' + port_img_url
                                r = requests.get(port_img_url, headers=header)
                                image = Image.open(BytesIO(r.content))
                                port = Util.image_to_str(image)
                                try:
                                    port = int(port)
                                    # Not async yet; to be resolved
                                    await VerifySave.verify_and_save(
                                        address + ':' + str(port), 'mimvp.com')
                                except ValueError:
                                    # OCR gave a non-numeric port; skip it
                                    continue

        except Exception as e:
            Util.log_error('proxy_site_mimvp: ' + str(e))
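The port image above is still fetched with blocking requests.get. A sketch of the same step over aiohttp, assuming Util.image_to_str is the OCR helper used in the original (fetch_port_from_image is an illustrative name):

    from io import BytesIO

    import aiohttp
    from PIL import Image

    async def fetch_port_from_image(port_img_url, headers):
        # Read the image bytes without blocking the event loop
        async with aiohttp.ClientSession() as session:
            async with session.get(port_img_url, headers=headers) as response:
                content = await response.read()
        image = Image.open(BytesIO(content))
        return Util.image_to_str(image)  # OCR helper from the original code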
    async def proxy_site_kuaidaili():
        try:
            # Copy HEADERS so the shared dict is not mutated
            header = dict(HEADERS)
            header['Cookie'] = 'channelid=0; sid=1492138570160333; _gat=1; _ga=GA1.2.486444152.1491549216'
            for i in range(1, 6):
                url = 'http://www.kuaidaili.com/proxylist/{page}/'.format(
                    page=i)
                async with aiohttp.ClientSession() as session:
                    async with session.get(url=url,
                                           headers=header) as response:
                        r = await response.text()

                if r:
                    doc = PyQuery(r)
                    for each in doc('#freelist > table > tbody')('tr').items():
                        ip_address = each('td[data-title=IP]').eq(0).text()
                        ip_port = each('td[data-title=PORT]').eq(0).text()
                        # Not async yet; to be resolved
                        await VerifySave.verify_and_save(
                            ip_address + ':' + ip_port, 'kuaidaili.com')
        except Exception as e:
            Util.log_error('proxy_site_kuaidaili: ' + str(e))
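All of these spiders are coroutines, so a driver can run them side by side. A minimal sketch, assuming they are static methods on SpiderProxy as the calls above suggest and Python 3.7+ for asyncio.run (run_spiders is an illustrative name):

    import asyncio

    async def run_spiders():
        # return_exceptions keeps one failing spider from cancelling the rest
        await asyncio.gather(
            SpiderProxy.proxy_site_66ip(),
            SpiderProxy.proxy_site_xici(),
            SpiderProxy.proxy_site_ip181(),
            return_exceptions=True)

    asyncio.run(run_spiders())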