Esempio n. 1
0
def get_ip():
    """Yield ``(ip, port)`` pairs scraped from the kuaidaili proxy list.

    Pages 1 through 10 under ``http://www.kuaidaili.com/proxylist/`` are
    fetched one at a time with ``getHtml`` and parsed with BeautifulSoup
    using the ``lxml`` parser.  Every ``<tr>`` contributes one pair, except
    rows whose first value is the literal ``'IP'`` (presumably the table
    header) and rows with too few child nodes to hold both values.

    This is a generator: a page is only downloaded once the caller has
    consumed the rows of the previous one, and a one-second pause follows
    each page.

    Yields:
        tuple: ``(ip, port)``, each being the ``.string`` of the row's
        child node at index 1 and index 3 respectively.
    """
    base_url = 'http://www.kuaidaili.com/proxylist/'
    for page in range(1, 11):
        data = getHtml(base_url + str(page))
        soup = BeautifulSoup(data, 'lxml')
        for tr in soup.find_all('tr'):
            cells = tr.contents
            # Indices 1 and 3 are read below; a shorter row (for example an
            # empty <tr>) would raise IndexError and abort the whole crawl.
            if len(cells) < 4:
                continue
            # NOTE(review): odd indices presumably skip whitespace text
            # nodes between the cells - confirm against the live markup.
            ip = cells[1].string
            if ip == 'IP':
                continue
            yield ip, cells[3].string
        # Pause after every page before requesting the next one.
        sleep(1)
Esempio n. 2
0
def get_ip():
    """Yield ``(ip, port)`` pairs scraped from the kuaidaili proxy list.

    Pages 1 through 10 under ``http://www.kuaidaili.com/proxylist/`` are
    fetched one at a time with ``getHtml`` and parsed with BeautifulSoup
    using the ``lxml`` parser.  Every ``<tr>`` contributes one pair, except
    rows whose first value is the literal ``'IP'`` (presumably the table
    header) and rows with too few child nodes to hold both values.

    This is a generator: a page is only downloaded once the caller has
    consumed the rows of the previous one, and a one-second pause follows
    each page.

    Yields:
        tuple: ``(ip, port)``, each being the ``.string`` of the row's
        child node at index 1 and index 3 respectively.
    """
    base_url = 'http://www.kuaidaili.com/proxylist/'
    for page in range(1, 11):
        data = getHtml(base_url + str(page))
        soup = BeautifulSoup(data, 'lxml')
        for tr in soup.find_all('tr'):
            cells = tr.contents
            # Indices 1 and 3 are read below; a shorter row (for example an
            # empty <tr>) would raise IndexError and abort the whole crawl.
            if len(cells) < 4:
                continue
            # NOTE(review): odd indices presumably skip whitespace text
            # nodes between the cells - confirm against the live markup.
            ip = cells[1].string
            if ip == 'IP':
                continue
            yield ip, cells[3].string
        # Pause after every page before requesting the next one.
        sleep(1)
Esempio n. 3
0
__author__ = 'wangqi'
import re
from commen import getHtml
# -*- coding:utf-8 -*-

# Fetch one page of the qiushibaike "hot" listing and print every matched
# entry whose trailing markup does not mention "img".
page = 1
initail_url = 'http://www.qiushibaike.com/hot/page/' + str(page)

html = getHtml(initail_url)

# Five capture groups, in order:
#   1. text between the <img ...> tag and </a> inside the "author" div
#   2. text of the "content" div up to the HTML comment
#   3. body of that HTML comment
#   4. everything between the closing </div> and the "stats" div
#   5. text of the element carrying class="number"
# re.S lets '.' cross line breaks, so one entry may span many lines.
pattern = re.compile(
    '<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?<div.*?'
    'content">(.*?)<!--(.*?)-->.*?</div>(.*?)<div class="stats.*?class="number">(.*?)</i>',
    re.S,
)
items = pattern.findall(html)
for item in items:
    author, content, comment, trailer, number = item
    # Entries whose fourth group mentions "img" are skipped (presumably
    # posts with an attached picture).
    if "img" not in trailer:
        print(author, content, comment, number)


Esempio n. 4
0
__author__ = 'wangqi'
import re
from commen import getHtml
# -*- coding:utf-8 -*-

# Download the first "hot" page from qiushibaike and print the captured
# fields of each entry, leaving out entries that reference "img".
page = 1
initail_url = 'http://www.qiushibaike.com/hot/page/' + str(page)

html = getHtml(initail_url)

# The pattern yields 5-tuples: (text after the author <img>, content text,
# HTML-comment body, markup between </div> and the stats div, the
# class="number" text).  re.S makes '.' match newlines as well.
pattern = re.compile(
    '<div.*?author">.*?<a.*?<img.*?>(.*?)</a>.*?<div.*?'
    'content">(.*?)<!--(.*?)-->.*?</div>(.*?)<div class="stats.*?class="number">(.*?)</i>',
    re.S,
)
items = pattern.findall(html)
for item in items:
    # Index 3 is only inspected, never printed: any mention of "img" there
    # drops the entry (presumably a post containing a picture).
    if "img" in item[3]:
        continue
    print(*item[:3], item[4])