Example #1
 def test_add_proxy(self):
     proxy = ProxyEntity('127.0.0.1',
                         '8080',
                         source='66ip网',
                         supplier='中国电信',
                         proxy_type=ProxyTypeEnum.HTTPS.value)
     assert self._opt.add_proxy(proxy) == 1, 'failed to insert into proxy table'
     proxy = ProxyEntity('127.0.0.2',
                         '8081',
                         source='66ip网',
                         supplier='中国电信',
                         proxy_type=ProxyTypeEnum.HTTPS.value)
     assert self._opt.add_proxy(proxy) == 1, 'failed to insert into proxy table'
Example #2
 def do_crawl(self, resp) -> List[ProxyEntity]:
     result = []
     soup = BeautifulSoup(resp, 'lxml')
     table = soup.find('table')
     if table is None:
         return []
     tbody = soup.find('tbody')
     if tbody is None:
         return []
     trs = tbody.find_all('tr')
     for i, tr in enumerate(trs):
         if i == 0:
             continue
         tds = tr.find_all('td')
         ip = tds[0].text
         port = tds[1].text
         proxy_cover = tds[2].text
         proxy_type = tds[3].text if tds[3].text != '' else 'http'
         region = tds[5].text
         supplier = tds[6].text
         result.append(
             ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
                         source=self._name,
                         supplier=supplier,
                         proxy_type=self._judge_proxy_type(proxy_type),
                         proxy_cover=self._judge_proxy_cover(proxy_cover),
                         region=region))
     return result
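
Every crawler on this page returns ProxyEntity objects, but the entity class itself is not shown. A minimal sketch whose field names are only inferred from the constructor calls above (not taken from the project) could look like this:

    from dataclasses import dataclass

    @dataclass
    class ProxyEntity:
        # First positional argument is either a full proxy URL (crawler examples)
        # or a bare IP with the port passed separately (test examples above)
        url: str
        port: str = ''
        source: str = ''
        supplier: str = ''
        proxy_type: int = 0    # value of a ProxyTypeEnum member
        proxy_cover: int = 0   # anonymity level, enum value assumed
        region: str = ''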
Example #3
 async def do_crawl(self) -> List[ProxyEntity]:
     result = []
     async with aiohttp.ClientSession() as session:
         async with session.get(self._base_url, headers=HEADERS) as resp:
             soup = BeautifulSoup(await resp.text(), 'lxml')
             tr_list = soup.find('tbody').find_all('tr')
             for i, tr in enumerate(tr_list):
                 tds = tr.find_all('td')
                 id_and_port = tds[0]
                 ip, port = self._parse_ip_and_port(id_and_port)
                 proxy_cover = tds[1].text
                 proxy_type = tds[2].text
                 region = tds[3].contents[1].text
                 supplier = tds[4].text
                 result.append(
                     ProxyEntity(
                         f'{proxy_type.lower()}://{ip}:{port}',
                         # ip, port,
                         # protocol=proxy_type,
                         source=self._name,
                         supplier=supplier,
                         proxy_type=self._judge_proxy_type(proxy_type),
                         proxy_cover=self._judge_proxy_cover(proxy_cover),
                         region=region))
     return result
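
Example #3 reads the IP and port out of a single cell via self._parse_ip_and_port, which is not shown here. A hypothetical version, assuming the cell's visible text is simply 'ip:port' (the real site may obfuscate it, which would need extra decoding):

    def _parse_ip_and_port(self, td) -> tuple:
        # 'td' is a BeautifulSoup Tag; assumes its text looks like '1.2.3.4:8080'
        text = td.get_text(strip=True)
        ip, _, port = text.partition(':')
        return ip, port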
Example #4
 async def do_crawl(self) -> List[ProxyEntity]:
     result = []
     for base_url in self._base_urls:
         async with aiohttp.ClientSession() as session:
             async with session.get(base_url, headers=HEADERS) as resp:
                 soup = BeautifulSoup(await resp.text(), 'lxml')
                 table = soup.find('table')
                 if table is None:
                     continue
                 tbody = soup.find('tbody')
                 if tbody is None:
                     continue
                 trs = tbody.find_all('tr')
                 for i, tr in enumerate(trs):
                     if i == 0:
                         continue
                     tds = tr.find_all('td')
                     ip = tds[0].text
                     port = tds[1].text
                     proxy_cover = tds[2].text
                     proxy_type = tds[3].text if tds[3].text != '' else 'http'
                     region = tds[4].text
                     result.append(
                         ProxyEntity(
                             f'{proxy_type.lower()}://{ip}:{port}',
                             # ip, port, protocol=proxy_type.lower(),
                             source=self._name,
                             proxy_type=self._judge_proxy_type(proxy_type),
                             proxy_cover=self._judge_proxy_cover(
                                 proxy_cover),
                             region=region))
         await asyncio.sleep(2)
     return result
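
All crawlers funnel the protocol and anonymity columns through self._judge_proxy_type and self._judge_proxy_cover, neither of which appears on this page. A rough sketch, assuming ProxyTypeEnum also has an HTTP member and using a hypothetical ProxyCoverEnum for the anonymity level:

    from enum import Enum

    class ProxyCoverEnum(Enum):
        # Hypothetical anonymity enum, not the project's actual definition
        TRANSPARENT = 0
        HIGH_ANONYMOUS = 1

    def _judge_proxy_type(self, type_text: str) -> int:
        # The HTTPS member is shown in the tests above; an HTTP member is assumed
        if 'HTTPS' in type_text.strip().upper():
            return ProxyTypeEnum.HTTPS.value
        return ProxyTypeEnum.HTTP.value

    def _judge_proxy_cover(self, cover_text: str) -> int:
        # These sites typically label high-anonymity proxies as '高匿'
        if '高匿' in cover_text:
            return ProxyCoverEnum.HIGH_ANONYMOUS.value
        return ProxyCoverEnum.TRANSPARENT.value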
Example #5
 async def do_crawl(self) -> List[ProxyEntity]:
     result = []
     for page in range(1, 6):
         # print(f'Page {page}...')
         async with aiohttp.ClientSession() as session:
             async with session.get(
                     f'{self._base_url}/{page}.html') as resp:
                 # aiohttp ignores an assigned .encoding attribute (that is the
                 # requests API), so pass the encoding to text() instead
                 soup = BeautifulSoup(await resp.text(encoding='gb2312'), 'lxml')
                 tr_list = soup.find('table',
                                     attrs={
                                         'width': '100%',
                                         'bordercolor': '#6699ff'
                                     }).find_all('tr')
                 for i, tr in enumerate(tr_list):
                     if i == 0:
                         continue
                     contents = tr.contents
                     ip = contents[0].text
                     port = contents[1].text
                     region = contents[2].text
                     proxy_cover = contents[3].text
                     result.append(
                         ProxyEntity(
                             f'http://{ip}:{port}',
                             # ip, port,
                             source=self._name,
                             proxy_cover=self._judge_proxy_cover(
                                 proxy_cover),
                             region=region))
     return result
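
The async variants are ordinary coroutines, so a script only needs an event loop to drive them; for example ('crawler' below is a placeholder for any instance exposing the async do_crawl shown above):

    import asyncio

    proxies = asyncio.run(crawler.do_crawl())  # 'crawler' is hypothetical here
    print(f'collected {len(proxies)} proxies')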
Example #6
    async def do_crawl(self) -> List[ProxyEntity]:
        result = []
        for base_url in self._base_urls:
            for page in range(1, 3):
                async with aiohttp.ClientSession() as session:
                    async with session.get(f'{base_url}&page={page}',
                                           headers=HEADERS) as resp:

                        # res = requests.get(f'{base_url}/{page}', headers=HEADERS)
                        soup = BeautifulSoup(await resp.text(), 'lxml')
                        trs = soup.find('table').find('tbody').find_all('tr')
                        for tr in trs:
                            tds = tr.find_all('td')
                            ip = tds[0].text
                            port = tds[1].text
                            proxy_cover = tds[2].text
                            proxy_type = tds[3].text
                            region = tds[4].text
                            result.append(
                                ProxyEntity(
                                    f'{proxy_type.lower()}://{ip}:{port}',
                                    # ip, port, protocol=proxy_type.lower(),
                                    source=self._name,
                                    proxy_type=self._judge_proxy_type(
                                        proxy_type),
                                    proxy_cover=self._judge_proxy_cover(
                                        proxy_cover),
                                    region=region))
        return result
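
Several examples pass a HEADERS constant that this page never defines; it is presumably just a browser-like User-Agent. An assumed value, not the project's actual constant:

    HEADERS = {
        # Assumed browser-style User-Agent; the project's real value may differ
        'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/96.0 Safari/537.36')
    }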
Example #7
 async def do_crawl(self) -> List[ProxyEntity]:
     result = []
     for base_url in self._base_urls:
         for page in range(1, 3):
             async with aiohttp.ClientSession() as session:
                 async with session.get(f'{base_url}/{page}',
                                        headers=HEADERS) as resp:
                     soup = BeautifulSoup(await resp.text(), 'lxml')
                     tab = soup.find('table', attrs={'id': 'ip_list'})
                     if tab is None:
                         continue
                     tr_list = tab.find_all('tr')[1:-1]
                     for tr in tr_list:
                         tds = tr.find_all('td')
                         # country = tds[0].find('img')['alt']
                         ip = tds[1].text
                         port = tds[2].text
                         # city = tds[3].text.replace('\n', '')
                         proxy_cover = tds[4].text
                         proxy_type = tds[5].text
                         result.append(
                             ProxyEntity(
                                 f'{proxy_type.lower()}://{ip}:{port}',
                                 # ip, port,
                                 # protocol=proxy_type.lower(),
                                 source=self._name,
                                 proxy_cover=self._judge_proxy_cover(
                                     proxy_cover),
                                 proxy_type=self._judge_proxy_type(
                                     proxy_type),
                             ))
     return result
Example #8
 def do_crawl(self) -> List[ProxyEntity]:
     result = []
     for page in range(1, 4):
         # print(f'Page {page}...')
         resp = requests.get(f'{self._base_url}/{page}.html')
         resp.encoding = 'gb2312'
         soup = BeautifulSoup(resp.text, 'lxml')
         tr_list = soup.find('table',
                             attrs={
                                 'width': '100%',
                                 'bordercolor': '#6699ff'
                             }).find_all('tr')
         for i, tr in enumerate(tr_list):
             if i == 0:
                 continue
             contents = tr.contents
             ip = contents[0].text
             port = contents[1].text
             region = contents[2].text
             proxy_cover = contents[3].text
             # check_time = contents[4].text
             # print(f'{ip}:{port}/{region}/{proxy_type}/{check_time}')
             result.append(
                 ProxyEntity(
                     f'http://{ip}:{port}',
                     # ip, port,
                     source=self._name,
                     proxy_cover=self._judge_proxy_cover(proxy_cover),
                     region=region))
     return result
Example #9
    def setUp(self) -> None:
        self._opt = sqlite_opt
        self._validator = validator
        self._opt.init_db()
        proxy = ProxyEntity('127.0.0.1',
                            '8080',
                            source='66ip网',
                            supplier='中国电信',
                            proxy_type=ProxyTypeEnum.HTTPS.value)
        assert self._opt.add_proxy(proxy) == 1, 'failed to insert into proxy table'
        proxy = ProxyEntity('127.0.0.2',
                            '8081',
                            source='66ip网',
                            supplier='中国电信',
                            proxy_type=ProxyTypeEnum.HTTPS.value)
        assert self._opt.add_proxy(proxy) == 1, 'failed to insert into proxy table'

        self._opt.clean()
Example #10
 def do_crawl(self, resp) -> List[ProxyEntity]:
     result = []
     soup = BeautifulSoup(resp, 'lxml')
     trs = soup.find('table').find('tbody').find_all('tr')
     for tr in trs:
         tds = tr.find_all('td')
         ip = tds[0].text
         port = tds[1].text
         proxy_cover = tds[2].text
         proxy_type = tds[3].text
         region = tds[4].text
         result.append(
             ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
                         source=self._name,
                         proxy_type=self._judge_proxy_type(proxy_type),
                         proxy_cover=self._judge_proxy_cover(proxy_cover),
                         region=region))
     return result
Example #11
 def do_crawl(self, resp) -> List[ProxyEntity]:
     result = []
     soup = BeautifulSoup(resp, 'lxml')
     tr_list = soup.find('tbody').find_all('tr')
     for i, tr in enumerate(tr_list):
         tds = tr.find_all('td')
         id_and_port = tds[0]
         ip, port = self._parse_ip_and_port(id_and_port)
         proxy_cover = tds[1].text
         proxy_type = tds[2].text
         region = tds[3].contents[1].text
         supplier = tds[4].text
         result.append(
             ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
                         source=self._name,
                         supplier=supplier,
                         proxy_type=self._judge_proxy_type(proxy_type),
                         proxy_cover=self._judge_proxy_cover(proxy_cover),
                         region=region))
     return result
Example #12
 def do_crawl(self, resp) -> List[ProxyEntity]:
     result = []
     soup = BeautifulSoup(resp, 'lxml')
     tab = soup.find('table', attrs={'id': 'ip_list'})
     if tab is None:
         return []
     tr_list = tab.find_all('tr')[1:-1]
     for tr in tr_list:
         tds = tr.find_all('td')
         ip = tds[1].text
         port = tds[2].text
         proxy_cover = tds[4].text
         proxy_type = tds[5].text
         result.append(
             ProxyEntity(
                 f'{proxy_type.lower()}://{ip}:{port}',
                 source=self._name,
                 proxy_cover=self._judge_proxy_cover(proxy_cover),
                 proxy_type=self._judge_proxy_type(proxy_type),
             ))
     return result
Example #13
 def do_crawl(self, resp) -> List[ProxyEntity]:
     result = []
     soup = BeautifulSoup(resp, 'lxml')
     tr_list = soup.find('table',
                         attrs={
                             'width': '100%',
                             'bordercolor': '#6699ff'
                         }).find_all('tr')
     for i, tr in enumerate(tr_list):
         if i == 0:
             continue
         contents = tr.contents
         ip = contents[0].text
         port = contents[1].text
         region = contents[2].text
         proxy_cover = contents[3].text
         result.append(
             ProxyEntity(f'http://{ip}:{port}',
                         source=self._name,
                         proxy_cover=self._judge_proxy_cover(proxy_cover),
                         region=region))
     return result
Example #14
 def do_crawl(self) -> List[ProxyEntity]:
     result = []
     for base_url in self._base_urls:
         for page in range(1, 4):
             res = requests.get(f'{base_url}/{page}', headers=HEADERS)
             soup = BeautifulSoup(res.text, 'lxml')
             trs = soup.find('table').find('tbody').find_all('tr')
             for tr in trs:
                 tds = tr.find_all('td')
                 ip = tds[0].text
                 port = tds[1].text
                 proxy_cover = tds[2].text
                 proxy_type = tds[3].text
                 region = tds[4].text
                 result.append(
                     ProxyEntity(
                         f'{proxy_type.lower()}://{ip}:{port}',
                         # ip, port, protocol=proxy_type.lower(),
                         source=self._name,
                         proxy_type=self._judge_proxy_type(proxy_type),
                         proxy_cover=self._judge_proxy_cover(proxy_cover),
                         region=region))
             time.sleep(3)
     return result
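
Tying the crawler examples to the sqlite_opt usage in the test examples, a collection pass would simply run each crawler and store what it returns. A hypothetical sketch (function and parameter names are illustrative only):

    def collect(crawlers, opt) -> int:
        # 'crawlers' expose the synchronous do_crawl() shown above;
        # 'opt' is a storage object like sqlite_opt with add_proxy()
        inserted = 0
        for crawler in crawlers:
            for proxy in crawler.do_crawl():
                inserted += opt.add_proxy(proxy)
        return inserted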