Python getSoup Exemples, parse.Base.getSoup Python Exemples

Exemple #1

0

Afficher le fichier

Fichier : Jilinwula.py Projet : BaoXuebin/BlogSpider

def parseJilinwula(content=""):
    """Extract the blog listing from a Jilinwula page.

    Args:
        content: Raw page markup handed to ``getSoup``.

    Returns:
        An empty dict when ``getSoup`` yields a falsy value; otherwise a dict
        with the keys ``origin``, ``title``, ``blogs`` (one dict per entry
        that has a title) and ``blogCount``.
    """
    page = {}
    soup = getSoup(content)
    if not soup:
        return page

    # Site address and page title come from names defined outside this block.
    page['origin'] = origin
    page['title'] = title

    # Collect one record per listed blog entry.
    blogs = []
    for entry in soup.find_all('div', 'item-label'):
        link = entry.div.a
        href = link['href']
        heading = link.string
        # Entries without a usable title string are skipped.
        if not heading:
            continue
        date_node = entry.find_all('div', 'item-meta-date')[0]
        blogs.append({
            'id': href,
            'href': href,
            'title': heading,
            # Strip the leading "published on" label from the date text.
            'publishTime': date_node.string.replace('发布于 ', ''),
        })

    page['blogs'] = blogs
    page['blogCount'] = len(blogs)
    return page

Exemple #2

0

Afficher le fichier

def parseJmtaobao(content=""):
    """Extract the blog listing from a Jmtaobao page.

    Args:
        content: Raw page markup handed to ``getSoup``.

    Returns:
        An empty dict when ``getSoup`` yields a falsy value; otherwise a dict
        with the keys ``origin``, ``title``, ``blogs`` (one dict per
        ``<article>`` element) and ``blogCount``.
    """
    page = {}
    soup = getSoup(content)
    if not soup:
        return page

    # Site address and page title come from names defined outside this block.
    page['origin'] = origin
    page['title'] = title

    def _to_blog(article):
        # Title link and timestamp both live inside the article's <header>.
        header = article.find('header')
        link = header.h1.a
        return {
            'id': link['href'],
            # The href is concatenated onto the site origin.
            'href': origin + link['href'],
            'title': link.string.strip(),
            'publishTime': header.time.string.strip(),
        }

    # Recent posts: one record per <article> element.
    blogs = [_to_blog(article) for article in soup.find_all('article')]

    page['blogs'] = blogs
    page['blogCount'] = len(blogs)
    return page

Exemple #3

0

Afficher le fichier

def parseHchstudio(content=""):
    """Extract the blog listing from an Hchstudio page.

    Args:
        content: Raw page markup handed to ``getSoup``.

    Returns:
        An empty dict when ``getSoup`` yields a falsy value; otherwise a dict
        with the keys ``origin``, ``title``, ``blogs`` (one dict per
        ``div.post`` element) and ``blogCount``.
    """
    page = {}
    soup = getSoup(content)
    if not soup:
        return page

    # Site address and page title come from names defined outside this block.
    page['origin'] = origin
    page['title'] = title

    # Recent posts: one record per <div class="post"> element.
    blogs = []
    for post in soup.find_all('div', 'post'):
        link = post.h1.a
        # The stripped link text serves as both the id and the title.
        heading = link.string.strip()
        meta_node = post.find('span', 'post-meta')
        author_node = post.find('a', id='authorH')
        blogs.append({
            'id': heading,
            'title': heading,
            # The href is concatenated onto the site origin.
            'href': origin + link['href'],
            'publishTime': meta_node.string.strip(),
            'author': author_node.string.strip(),
        })

    page['blogs'] = blogs
    page['blogCount'] = len(blogs)
    return page