def test_add_proxy(self):
    proxy = ProxyEntity('127.0.0.1', '8080',
                        source='66ip网',
                        supplier='中国电信',
                        proxy_type=ProxyTypeEnum.HTTPS.value)
    assert self._opt.add_proxy(proxy) == 1, 'failed to insert into the proxy table'
    proxy = ProxyEntity('127.0.0.2', '8081',
                        source='66ip网',
                        supplier='中国电信',
                        proxy_type=ProxyTypeEnum.HTTPS.value)
    assert self._opt.add_proxy(proxy) == 1, 'failed to insert into the proxy table'
def do_crawl(self, resp) -> List[ProxyEntity]:
    result = []
    soup = BeautifulSoup(resp, 'lxml')
    table = soup.find('table')
    if table is None:
        return []
    tbody = soup.find('tbody')
    if tbody is None:
        return []
    trs = tbody.find_all('tr')
    for i, tr in enumerate(trs):
        if i == 0:
            continue
        tds = tr.find_all('td')
        ip = tds[0].text
        port = tds[1].text
        proxy_cover = tds[2].text
        proxy_type = tds[3].text if tds[3].text != '' else 'http'
        region = tds[5].text
        supplier = tds[6].text
        result.append(
            ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
                        source=self._name,
                        supplier=supplier,
                        proxy_type=self._judge_proxy_type(proxy_type),
                        proxy_cover=self._judge_proxy_cover(proxy_cover),
                        region=region))
    return result
async def do_crawl(self) -> List[ProxyEntity]:
    result = []
    async with aiohttp.ClientSession() as session:
        async with session.get(self._base_url, headers=HEADERS) as resp:
            soup = BeautifulSoup(await resp.text(), 'lxml')
            tr_list = soup.find('tbody').find_all('tr')
            for i, tr in enumerate(tr_list):
                tds = tr.find_all('td')
                id_and_port = tds[0]
                ip, port = self._parse_ip_and_port(id_and_port)
                proxy_cover = tds[1].text
                proxy_type = tds[2].text
                region = tds[3].contents[1].text
                supplier = tds[4].text
                result.append(
                    ProxyEntity(
                        f'{proxy_type.lower()}://{ip}:{port}',
                        # ip, port,
                        # protocol=proxy_type,
                        source=self._name,
                        supplier=supplier,
                        proxy_type=self._judge_proxy_type(proxy_type),
                        proxy_cover=self._judge_proxy_cover(proxy_cover),
                        region=region))
    return result
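# _parse_ip_and_port is called above (and in the synchronous crawler further
# down) but is not shown in this section. The helper below is only a hedged
# sketch of what it might look like, assuming the first <td> carries plain
# "ip:port" text; if the real page obfuscates the cell with decoy tags, extra
# filtering would be required.
def _parse_ip_and_port(self, td):
    # hypothetical helper: split the cell text into an (ip, port) pair
    text = td.get_text(strip=True)
    ip, _, port = text.partition(':')
    return ip, port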
async def do_crawl(self) -> List[ProxyEntity]:
    result = []
    for base_url in self._base_urls:
        async with aiohttp.ClientSession() as session:
            async with session.get(base_url, headers=HEADERS) as resp:
                soup = BeautifulSoup(await resp.text(), 'lxml')
                table = soup.find('table')
                if table is None:
                    continue
                tbody = soup.find('tbody')
                if tbody is None:
                    continue
                trs = tbody.find_all('tr')
                for i, tr in enumerate(trs):
                    if i == 0:
                        continue
                    tds = tr.find_all('td')
                    ip = tds[0].text
                    port = tds[1].text
                    proxy_cover = tds[2].text
                    proxy_type = tds[3].text if tds[3].text != '' else 'http'
                    region = tds[4].text
                    result.append(
                        ProxyEntity(
                            f'{proxy_type.lower()}://{ip}:{port}',
                            # ip, port,
                            protocol=proxy_type.lower(),
                            source=self._name,
                            proxy_type=self._judge_proxy_type(proxy_type),
                            proxy_cover=self._judge_proxy_cover(proxy_cover),
                            region=region))
        await asyncio.sleep(2)
    return result
async def do_crawl(self) -> List[ProxyEntity]:
    result = []
    for page in range(1, 6):
        # print(f'page {page}...')
        async with aiohttp.ClientSession() as session:
            async with session.get(f'{self._base_url}/{page}.html') as resp:
                # the page is served as gb2312, so decode it explicitly
                soup = BeautifulSoup(await resp.text(encoding='gb2312'), 'lxml')
                tr_list = soup.find('table', attrs={
                    'width': '100%',
                    'bordercolor': '#6699ff'
                }).find_all('tr')
                for i, tr in enumerate(tr_list):
                    if i == 0:
                        continue
                    contents = tr.contents
                    ip = contents[0].text
                    port = contents[1].text
                    region = contents[2].text
                    proxy_cover = contents[3].text
                    result.append(
                        ProxyEntity(
                            f'http://{ip}:{port}',
                            # ip, port,
                            source=self._name,
                            proxy_cover=self._judge_proxy_cover(proxy_cover),
                            region=region))
    return result
async def do_crawl(self) -> List[ProxyEntity]:
    result = []
    for base_url in self._base_urls:
        for page in range(1, 3):
            async with aiohttp.ClientSession() as session:
                async with session.get(f'{base_url}&page={page}',
                                       headers=HEADERS) as resp:
                    # res = requests.get(f'{base_url}/{page}', headers=HEADERS)
                    soup = BeautifulSoup(await resp.text(), 'lxml')
                    trs = soup.find('table').find('tbody').find_all('tr')
                    for tr in trs:
                        tds = tr.find_all('td')
                        ip = tds[0].text
                        port = tds[1].text
                        proxy_cover = tds[2].text
                        proxy_type = tds[3].text
                        region = tds[4].text
                        result.append(
                            ProxyEntity(
                                f'{proxy_type.lower()}://{ip}:{port}',
                                # ip, port,
                                protocol=proxy_type.lower(),
                                source=self._name,
                                proxy_type=self._judge_proxy_type(proxy_type),
                                proxy_cover=self._judge_proxy_cover(proxy_cover),
                                region=region))
    return result
async def do_crawl(self) -> List[ProxyEntity]:
    result = []
    for base_url in self._base_urls:
        for page in range(1, 3):
            async with aiohttp.ClientSession() as session:
                async with session.get(f'{base_url}/{page}',
                                       headers=HEADERS) as resp:
                    soup = BeautifulSoup(await resp.text(), 'lxml')
                    tab = soup.find('table', attrs={'id': 'ip_list'})
                    if tab is None:
                        continue
                    tr_list = tab.find_all('tr')[1:-1]
                    for tr in tr_list:
                        tds = tr.find_all('td')
                        # country = tds[0].find('img')['alt']
                        ip = tds[1].text
                        port = tds[2].text
                        # city = tds[3].text.replace('\n', '')
                        proxy_cover = tds[4].text
                        proxy_type = tds[5].text
                        result.append(
                            ProxyEntity(
                                f'{proxy_type.lower()}://{ip}:{port}',
                                # ip, port,
                                # protocol=proxy_type.lower(),
                                source=self._name,
                                proxy_cover=self._judge_proxy_cover(proxy_cover),
                                proxy_type=self._judge_proxy_type(proxy_type),
                            ))
    return result
def do_crawl(self) -> List[ProxyEntity]:
    result = []
    for page in range(1, 4):
        # print(f'page {page}...')
        resp = requests.get(f'{self._base_url}/{page}.html')
        resp.encoding = 'gb2312'
        soup = BeautifulSoup(resp.text, 'lxml')
        tr_list = soup.find('table', attrs={
            'width': '100%',
            'bordercolor': '#6699ff'
        }).find_all('tr')
        for i, tr in enumerate(tr_list):
            if i == 0:
                continue
            contents = tr.contents
            ip = contents[0].text
            port = contents[1].text
            region = contents[2].text
            proxy_cover = contents[3].text
            # check_time = contents[4].text
            # print(f'{ip}:{port}/{region}/{proxy_type}/{check_time}')
            result.append(
                ProxyEntity(
                    f'http://{ip}:{port}',
                    # ip, port,
                    source=self._name,
                    proxy_cover=self._judge_proxy_cover(proxy_cover),
                    region=region))
    return result
def setUp(self) -> None:
    self._opt = sqlite_opt
    self._validator = validator
    self._opt.init_db()
    proxy = ProxyEntity('127.0.0.1', '8080',
                        source='66ip网',
                        supplier='中国电信',
                        proxy_type=ProxyTypeEnum.HTTPS.value)
    assert self._opt.add_proxy(proxy) == 1, 'failed to insert into the proxy table'
    proxy = ProxyEntity('127.0.0.2', '8081',
                        source='66ip网',
                        supplier='中国电信',
                        proxy_type=ProxyTypeEnum.HTTPS.value)
    assert self._opt.add_proxy(proxy) == 1, 'failed to insert into the proxy table'
    self._opt.clean()
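# The setUp above verifies that inserts work and then empties the table. As a
# hedged addition (not part of the original tests), a matching tearDown could
# reuse the same clean() call so every test case also ends with an empty
# proxy table.
def tearDown(self) -> None:
    # hypothetical cleanup hook reusing the existing clean() operation
    self._opt.clean()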
def do_crawl(self, resp) -> List[ProxyEntity]:
    result = []
    soup = BeautifulSoup(resp, 'lxml')
    trs = soup.find('table').find('tbody').find_all('tr')
    for tr in trs:
        tds = tr.find_all('td')
        ip = tds[0].text
        port = tds[1].text
        proxy_cover = tds[2].text
        proxy_type = tds[3].text
        region = tds[4].text
        result.append(
            ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
                        source=self._name,
                        proxy_type=self._judge_proxy_type(proxy_type),
                        proxy_cover=self._judge_proxy_cover(proxy_cover),
                        region=region))
    return result
def do_crawl(self, resp) -> List[ProxyEntity]:
    result = []
    soup = BeautifulSoup(resp, 'lxml')
    tr_list = soup.find('tbody').find_all('tr')
    for i, tr in enumerate(tr_list):
        tds = tr.find_all('td')
        id_and_port = tds[0]
        ip, port = self._parse_ip_and_port(id_and_port)
        proxy_cover = tds[1].text
        proxy_type = tds[2].text
        region = tds[3].contents[1].text
        supplier = tds[4].text
        result.append(
            ProxyEntity(f'{proxy_type.lower()}://{ip}:{port}',
                        source=self._name,
                        supplier=supplier,
                        proxy_type=self._judge_proxy_type(proxy_type),
                        proxy_cover=self._judge_proxy_cover(proxy_cover),
                        region=region))
    return result
def do_crawl(self, resp) -> List[ProxyEntity]:
    result = []
    soup = BeautifulSoup(resp, 'lxml')
    tab = soup.find('table', attrs={'id': 'ip_list'})
    if tab is None:
        return []
    tr_list = tab.find_all('tr')[1:-1]
    for tr in tr_list:
        tds = tr.find_all('td')
        ip = tds[1].text
        port = tds[2].text
        proxy_cover = tds[4].text
        proxy_type = tds[5].text
        result.append(
            ProxyEntity(
                f'{proxy_type.lower()}://{ip}:{port}',
                source=self._name,
                proxy_cover=self._judge_proxy_cover(proxy_cover),
                proxy_type=self._judge_proxy_type(proxy_type),
            ))
    return result
def do_crawl(self, resp) -> List[ProxyEntity]:
    result = []
    soup = BeautifulSoup(resp, 'lxml')
    tr_list = soup.find('table', attrs={
        'width': '100%',
        'bordercolor': '#6699ff'
    }).find_all('tr')
    for i, tr in enumerate(tr_list):
        if i == 0:
            continue
        contents = tr.contents
        ip = contents[0].text
        port = contents[1].text
        region = contents[2].text
        proxy_cover = contents[3].text
        result.append(
            ProxyEntity(f'http://{ip}:{port}',
                        source=self._name,
                        proxy_cover=self._judge_proxy_cover(proxy_cover),
                        region=region))
    return result
def do_crawl(self) -> List[ProxyEntity]:
    result = []
    for base_url in self._base_urls:
        for page in range(1, 4):
            res = requests.get(f'{base_url}/{page}', headers=HEADERS)
            soup = BeautifulSoup(res.text, 'lxml')
            trs = soup.find('table').find('tbody').find_all('tr')
            for tr in trs:
                tds = tr.find_all('td')
                ip = tds[0].text
                port = tds[1].text
                proxy_cover = tds[2].text
                proxy_type = tds[3].text
                region = tds[4].text
                result.append(
                    ProxyEntity(
                        f'{proxy_type.lower()}://{ip}:{port}',
                        # ip, port,
                        protocol=proxy_type.lower(),
                        source=self._name,
                        proxy_type=self._judge_proxy_type(proxy_type),
                        proxy_cover=self._judge_proxy_cover(proxy_cover),
                        region=region))
            time.sleep(3)
    return result
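# Every crawler above delegates normalisation to _judge_proxy_type and
# _judge_proxy_cover, neither of which appears in this section. The sketch
# below is an assumption about their contract: map the raw protocol string to
# ProxyTypeEnum (only the HTTPS member is confirmed by the tests; HTTP and
# UNKNOWN are assumed), and map the anonymity column to a hypothetical
# ProxyCoverEnum whose member names are also assumed.
def _judge_proxy_type(self, type_str: str):
    # hypothetical mapping from the protocol column to ProxyTypeEnum values
    type_str = type_str.strip().upper()
    if type_str == 'HTTPS':
        return ProxyTypeEnum.HTTPS.value
    if type_str == 'HTTP':
        return ProxyTypeEnum.HTTP.value      # assumed member
    return ProxyTypeEnum.UNKNOWN.value       # assumed fallback member

def _judge_proxy_cover(self, cover_str: str):
    # hypothetical mapping from the anonymity column ('高匿' = high anonymity,
    # '透明' = transparent) to a ProxyCoverEnum
    if '高匿' in cover_str:
        return ProxyCoverEnum.HIGH_COVER.value    # assumed member
    return ProxyCoverEnum.TRANSPARENT.value       # assumed member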