Code Example #1
from bs4 import BeautifulSoup


def crawl():
    urls = [
        'http://www.xicidaili.com/nn/', 'http://www.xicidaili.com/nn/2',
        'http://www.xicidaili.com/wn/'
    ]
    result = []
    for url in urls:
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find('table', id='ip_list').find_all('tr')
        except Exception as e:
            error_log.error('Spider xicidaili error.[msg]={}'.format(e))
            continue
        for tr in table[1:]:  # skip the header row
            try:
                tds = tr.find_all('td')
                # column 1 holds the IP, column 2 the port
                ip = tds[1].get_text() + ':' + tds[2].get_text()
                result.append(ip)
            except Exception:
                pass
    info_log.info('Spider xicidaili success. Crawled IP count: {}'.format(len(result)))
    return result
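
Every example on this page calls logic_common.build_request and logs through error_log / info_log, none of which appear in these excerpts. As a rough guide only, here is a minimal sketch of what such helpers might look like; the User-Agent string, the 10-second timeout, and the logger names are assumptions, not the project's actual values.

import logging

import requests

# Hypothetical stand-ins for the project's logging helpers (assumed, not
# taken from Anning01/ProxyPool).
error_log = logging.getLogger('spider.error')
info_log = logging.getLogger('spider.info')


def build_request(url, timeout=10):
    # Fetch the page with a browser-like User-Agent; raise on HTTP errors
    # so the caller's try/except can log the failure.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    resp = requests.get(url, headers=headers, timeout=timeout)
    resp.raise_for_status()
    return resp
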
Code Example #2
from bs4 import BeautifulSoup


def crawl():
    urls = [
        'http://www.data5u.com/free/gngn/index.shtml',
        'http://www.data5u.com/free/gwgn/index.shtml',
        'http://www.data5u.com/free/gnpt/index.shtml',
        'http://www.data5u.com/free/gwpt/index.shtml'
    ]
    result = []
    for url in urls:
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find_all('ul', {'class': 'l2'})
        except Exception as e:
            error_log.error('Spider data5u error.[msg]={}'.format(e))
            continue
        for item in table[1:]:  # the first ul is the header row
            try:
                spans = item.find_all('span')
                ip = spans[0].get_text()
                port = spans[1].get_text()
            except Exception:
                continue
            line = ip + ':' + port
            # strip the whitespace the page embeds around the values
            result.append(line.replace('\r', '').replace('\n', '')
                          .replace('\t', '').replace(' ', ''))
    info_log.info('Spider data5u success. Crawled IP count: {}'.format(len(result)))
    return result
Code Example #3
from bs4 import BeautifulSoup


def crawl():
    result = []
    for page in range(1, 6):  # pages 1-5
        url = 'https://proxy.coderbusy.com/classical/anonymous-type/highanonymous.aspx?page={}'.format(page)
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find('div', {'class': 'table-responsive'}).find_all('tr')
        except Exception as e:
            error_log.error('Spider CoderBusy error.[msg]={}'.format(e))
            continue
        for item in table[1:]:  # skip the header row
            try:
                tds = item.find_all('td')
                ip = tds[0].get_text()
                port = tds[2].get_text()
            except Exception:
                continue
            line = ip + ':' + port
            # strip the whitespace the page embeds around the values
            result.append(line.replace('\r', '').replace('\n', '')
                          .replace('\t', '').replace(' ', ''))
    info_log.info('Spider CoderBusy success. Crawled IP count: {}'.format(len(result)))
    return result
Code Example #4
File: spider_89ip.py  Project: Anning01/ProxyPool
import re
import time


def crawl():
    urls = ['http://www.89ip.cn/tiqv.php?sxb=&tqsl=300&ports=&ktip=&xl=on&submit=%CC%E1++%C8%A1']
    result = []
    for pageurl in urls:
        try:
            req = logic_common.build_request(pageurl)
            html = req.text
        except Exception as e:
            error_log.error('Spider 89ip error.[msg]={}'.format(e))
            continue
        # extract every ip:port pair from the raw response text
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)  # pause between requests to avoid hammering the API
    info_log.info('Spider 89ip success. Crawled IP count: {}'.format(len(result)))
    return result
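
Unlike the table-scraping examples, this spider pulls ip:port pairs straight out of the response body with a regular expression, which works because the endpoint returns a plain list rather than an HTML table. A self-contained illustration of the same pattern (the sample text below is made up):

import re

sample = 'addr: 1.2.3.4:8080<br>addr: 10.0.0.1:3128<br>'
print(re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', sample))
# prints: ['1.2.3.4:8080', '10.0.0.1:3128']
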
Code Example #5
import re
import time


def crawl():
    urls = [
        'http://www.66ip.cn/nmtq.php?getnum=600&isp=0&anonymoustype=0&start=&ports=&export=&ipaddress=&area=0&proxytype=2&api=66ip'
    ]
    result = []
    for pageurl in urls:
        try:
            req = logic_common.build_request(pageurl)
            html = req.text
        except Exception as e:
            error_log.error('Spider 66ip error.[msg]={}'.format(e))
            continue
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        result += ips
        time.sleep(2)
    info_log.info('Spider 66ip success. Crawled IP count: {}'.format(len(result)))
    return result
Code Example #6
from bs4 import BeautifulSoup


def crawl():
    result = []
    for page in range(1, 10):
        url = 'https://www.kuaidaili.com/ops/proxylist/{}/'.format(page)
        try:
            req = logic_common.build_request(url)
            soup = BeautifulSoup(req.text, 'lxml')
            table = soup.find('div', {'id': 'freelist'}).find('table').find_all('tr')
        except Exception as e:
            error_log.error('Spider kuaidaili error.[msg]={}'.format(e))
            continue
        for tr in table[1:]:  # skip the header row
            try:
                ip = tr.find('td', {'data-title': 'IP'}).get_text()
                port = tr.find('td', {'data-title': 'PORT'}).get_text()
                ip = ip + ':' + port
                result.append(ip)
            except Exception:
                pass
    info_log.info('Spider kuaidaili success. Crawled IP count: {}'.format(len(result)))
    return result
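
All six crawl() functions return plain ip:port strings, so their results can be pooled, de-duplicated, and probed before use. Below is a minimal sketch of how that might look; the gather_proxies / is_alive names and the httpbin.org check URL are illustrative, not part of the project.

import requests


def gather_proxies(spiders):
    # Run each spider, pool the ip:port strings, and drop duplicates.
    pool = set()
    for crawl in spiders:
        pool.update(crawl())
    return sorted(pool)


def is_alive(proxy, timeout=5):
    # Crude liveness probe: can we fetch a page through this proxy?
    proxies = {'http': 'http://' + proxy, 'https': 'http://' + proxy}
    try:
        requests.get('http://httpbin.org/ip', proxies=proxies, timeout=timeout)
        return True
    except requests.RequestException:
        return False
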