def book(shopId, pageNo):
    """Scrape one listing page of a kongfz.com shop.

    Returns a list of 'itemid:isbn' strings, capped at the first 50
    entries on the page (one page's worth of listings).
    """
    page_url = 'http://shop.kongfz.com/' + str(shopId) + '/all/0_50_0_0_' + str(pageNo) + '_sort_desc_0_0/'
    page_soup = soup(page_url)
    entries = page_soup.select('.list-content > div')
    return [str(entry['itemid']) + ':' + str(entry['isbn']) for entry in entries[0:50]]
def contents(web_url):
    """Fetch a page and collect the non-empty text of every tag link.

    Looks up anchors under the '.tagCol' table and skips any whose
    text is empty.
    """
    page = soup(web_url)
    anchors = page.select('.tagCol > tbody > tr > td > a')
    labels = []
    for anchor in anchors:
        text = anchor.get_text()
        if text:
            labels.append(text)
    return labels
def get_book(bookUrl):
    """Scrape a single book page (douban-style markup), assemble a flat
    params list, print it, and persist it via save().

    params layout (by index):
      0 Id(bookUrl), 1 title, 2 cover image URL, 3 score, 4 vote count,
      5 author country, 6 author name, 7 publisher, 8 translator,
      9 publication year, 10 price, 11 page count, 12 ISBN,
      13/14 local() timestamps — presumably created/updated; TODO confirm.
    """
    # Fetch the page (plain crawl, nothing special).
    book_soup = soup(bookUrl)
    # Raw info block containing the key:value metadata lines.
    content = book_soup.find_all(id='info')[0]
    # Title.
    title = book_soup.find_all(property='v:itemreviewed')[0].get_text()
    # Rating — defaults to "0" when absent or when replaces() strips it empty.
    score = "0"
    scores = book_soup.find_all(property='v:average')
    if len(scores) > 0:
        score = replaces(str(scores[0].get_text()))
        if len(score) == 0:
            score = "0"
    # Vote count — the empty-string check must stay inside this branch:
    # outside it, comments would be int 0 and len() would raise.
    comments = 0
    comment = book_soup.find_all(property='v:votes')
    if len(comment) > 0:
        comments = comment[0].get_text()
        if len(comments) == 0:
            comments = 0
    # Cover image URL.
    cover_image = book_soup.find_all(rel='v:photo')[0]['src']
    # Strip whitespace via replaces(), then split into lines.
    temps = replaces(content.get_text()).split('\n')
    # Re-merge lines: a line with no full-width colon '：' is a
    # continuation of the previous key:value line, so append it there.
    info = []
    for temps_index in range(len(temps)):
        if temps_index > 0:
            if temps[temps_index].find('：') == -1:
                info[-1] = info[-1] + temps[temps_index]
            else:
                info.append(temps[temps_index])
        else:
            info.append(temps[temps_index])
    params = [
        Id(bookUrl), title, cover_image, score, comments,
        '', '', '', '', '', 0, 0, '', local(), local()
    ]
    # Dispatch each 'key：value' line into its params slot. All splits
    # use the full-width colon '：'; single quotes are swapped for
    # backticks — presumably to avoid breaking SQL quoting in save().
    for i in info:
        if i.find('作者') > -1:  # author
            author = replaces(i.split('：')[1])
            # country() apparently returns (country, name); TODO confirm.
            params[5] = country(author)[0]
            params[6] = country(author)[1].replace("'", "`")
        if i.find('出版社') > -1:  # publisher
            params[7] = replaces(i.split('：')[1]).replace("'", "`")
        if i.find('译者') > -1:  # translator
            params[8] = replaces(i.split('：')[1])
        if i.find('出版年') > -1:  # publication year
            params[9] = replaces(i.split('：')[1])
        if i.find('定价') > -1:  # price: keep only digits and '.'
            params[10] = re.sub('[^0-9.]', '', i.split('：')[1])
        if i.find('页数') > -1:  # page count: digits only, '0' fallback
            pageSize = re.sub('[^0-9]', '', i.split('：')[1])
            if pageSize == '':
                pageSize = '0'
            params[11] = pageSize
        if i.find('ISBN') > -1:
            params[12] = replaces(i.split('：')[1])
    print(params)
    save(params)
def books(web_url):
    """Return the href attribute of every '.nbg' element on the page."""
    page = soup(web_url)
    return [node['href'] for node in page.select('.nbg')]