# Shared imports for the spiders below (each crawl() lives in its own
# module in the project; logic_common, error_log and info_log are
# project-level helpers whose import paths are project-specific).
import re
import time

from bs4 import BeautifulSoup


# --- Spider: xicidaili (www.xicidaili.com) ---
def crawl():
    urls = [
        'http://www.xicidaili.com/nn/',
        'http://www.xicidaili.com/nn/2',
        'http://www.xicidaili.com/wn/',
    ]
    result = []
    for url in urls:
        try:
            req = logic_common.build_request(url)
            table = BeautifulSoup(req.text, 'lxml').find(
                'table', id='ip_list').find_all('tr')
        except Exception as e:
            error_log.error('Spider xicidaili error.[msg]={}'.format(e))
            continue
        for tr in table[1:]:  # skip the header row
            try:
                tds = tr.find_all('td')
                ip = tds[1].get_text() + ':' + tds[2].get_text()
                result.append(ip)
            except Exception:  # malformed row; skip it
                pass
    info_log.info('Spider xicidaili success. Crawled IP count: {}'.format(len(result)))
    return result
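
# All six spiders call logic_common.build_request(), which is defined
# elsewhere in the project. A minimal sketch of what such a helper might
# look like, assuming it wraps requests.get() with a browser User-Agent
# and a timeout (the header value and timeout below are illustrative,
# not the project's actual settings):
import requests

def build_request(url, timeout=10):
    # Hypothetical helper: fetch a page with a desktop UA so that
    # free-proxy sites are less likely to reject the request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/80.0 Safari/537.36',
    }
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp  # callers read resp.text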
# --- Spider: data5u (www.data5u.com) ---
def crawl():
    urls = [
        'http://www.data5u.com/free/gngn/index.shtml',
        'http://www.data5u.com/free/gwgn/index.shtml',
        'http://www.data5u.com/free/gnpt/index.shtml',
        'http://www.data5u.com/free/gwpt/index.shtml',
    ]
    result = []
    for url in urls:
        try:
            req = logic_common.build_request(url)
            table = BeautifulSoup(req.text, 'lxml').find_all('ul', {'class': 'l2'})
        except Exception as e:
            error_log.error('Spider data5u error.[msg]={}'.format(e))
            continue
        for item in table[1:]:  # skip the first (header) entry
            try:
                spans = item.find_all('span')
                ip = spans[0].get_text()
                port = spans[1].get_text()
            except Exception:  # malformed entry; skip it
                continue
            # strip whitespace that the page embeds in the cells
            line = ip + ':' + port
            line = line.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
            result.append(line)
    info_log.info('Spider data5u success. Crawled IP count: {}'.format(len(result)))
    return result
# --- Spider: CoderBusy (proxy.coderbusy.com) ---
def crawl():
    result = []
    for page in range(5):
        url = ('https://proxy.coderbusy.com/classical/anonymous-type/'
               'highanonymous.aspx?page=%s' % (page + 1))
        try:
            req = logic_common.build_request(url)
            table = BeautifulSoup(req.text, 'lxml').find(
                'div', {'class': 'table-responsive'}).find_all('tr')
        except Exception as e:
            error_log.error('Spider CoderBusy error.[msg]={}'.format(e))
            continue
        for item in table[1:]:  # skip the header row
            try:
                tds = item.find_all('td')
                ip = tds[0].get_text()
                port = tds[2].get_text()
            except Exception:  # malformed row; skip it
                continue
            # strip whitespace that the page embeds in the cells
            line = ip + ':' + port
            line = line.replace('\r', '').replace('\n', '').replace('\t', '').replace(' ', '')
            result.append(line)
    info_log.info('Spider CoderBusy success. Crawled IP count: {}'.format(len(result)))
    return result
# --- Spider: 89ip (www.89ip.cn) ---
def crawl():
    urls = [
        'http://www.89ip.cn/tiqv.php?sxb=&tqsl=300&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1',
    ]
    result = []
    for pageurl in urls:
        try:
            req = logic_common.build_request(pageurl)
            html = req.text
        except Exception as e:
            error_log.error('Spider 89ip error.[msg]={}'.format(e))
            continue
        # the endpoint returns plain text; pull out ip:port pairs with a regex
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)  # be polite between requests
    info_log.info('Spider 89ip success. Crawled IP count: {}'.format(len(result)))
    return result
# --- Spider: 66ip (www.66ip.cn) ---
def crawl():
    urls = [
        'http://www.66ip.cn/nmtq.php?getnum=600&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip',
    ]
    result = []
    for pageurl in urls:
        try:
            req = logic_common.build_request(pageurl)
            html = req.text
        except Exception as e:
            error_log.error('Spider 66ip error.[msg]={}'.format(e))
            continue
        # the API returns plain text; pull out ip:port pairs with a regex
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)  # be polite between requests
    info_log.info('Spider 66ip success. Crawled IP count: {}'.format(len(result)))
    return result
# --- Spider: kuaidaili (www.kuaidaili.com) ---
def crawl():
    result = []
    for page in range(1, 10):
        url = 'https://www.kuaidaili.com/ops/proxylist/{}/'.format(page)
        try:
            req = logic_common.build_request(url)
            table = BeautifulSoup(req.text, 'lxml').find(
                'div', {'id': 'freelist'}).find('table').find_all('tr')
        except Exception as e:
            error_log.error('Spider kuaidaili error.[msg]={}'.format(e))
            continue
        for tr in table[1:]:  # skip the header row
            try:
                ip = tr.find('td', {'data-title': 'IP'}).get_text()
                port = tr.find('td', {'data-title': 'PORT'}).get_text()
                result.append(ip + ':' + port)
            except Exception:  # malformed row; skip it
                pass
    info_log.info('Spider kuaidaili success. Crawled IP count: {}'.format(len(result)))
    return result
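
# A possible way to drive all six spiders together (assuming each crawl()
# lives in its own module under a spiders/ package; the module names below
# are illustrative, not the project's actual layout). Results are merged
# and de-duplicated while preserving first-seen order:
def crawl_all():
    from spiders import xicidaili, data5u, coderbusy, ip89, ip66, kuaidaili  # hypothetical paths
    seen, merged = set(), []
    for spider in (xicidaili, data5u, coderbusy, ip89, ip66, kuaidaili):
        for proxy in spider.crawl():
            if proxy not in seen:
                seen.add(proxy)
                merged.append(proxy)
    return merged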