def parseJilinwula(content=""):
    """Parse the jilinwula blog index HTML into a structured dict.

    Args:
        content: Raw HTML of the listing page (empty string yields an
            empty result).

    Returns:
        dict with keys 'origin', 'title', 'blogs' (list of per-post dicts
        with 'id', 'href', 'title', 'publishTime') and 'blogCount' when
        parsing succeeds; an empty dict when the soup cannot be built.
    """
    page = dict()
    soup = getSoup(content)  # helper defined elsewhere in this module
    if soup:
        # NOTE(review): 'origin' and 'title' are module-level names defined
        # outside this block -- confirm they are populated before calling.
        page['origin'] = origin
        page['title'] = title
        blogs = list()
        for item in soup.find_all('div', 'item-label'):
            href = item.div.a['href']
            _title = item.div.a.string
            # Skip entries whose anchor carries no plain-text title.
            if _title:
                blog = dict()
                blog['id'] = href
                # Unlike the sibling parsers, href is NOT prefixed with
                # origin here -- presumably the links are already absolute;
                # TODO confirm against the live page markup.
                blog['href'] = href
                blog['title'] = _title
                # Fix: find() + guard instead of find_all(...)[0], which
                # raised IndexError whenever the date element was absent.
                date_div = item.find('div', 'item-meta-date')
                if date_div and date_div.string:
                    blog['publishTime'] = date_div.string.replace('发布于 ', '')
                else:
                    blog['publishTime'] = ''
                blogs.append(blog)
        page['blogs'] = blogs
        page['blogCount'] = len(blogs)
    return page
def parseJmtaobao(content=""):
    """Parse the jmtaobao blog home page HTML into a structured dict.

    Returns a dict with 'origin', 'title', 'blogs' and 'blogCount' on
    success, or an empty dict when the soup cannot be built.
    """
    page = {}
    soup = getSoup(content)
    if soup:
        page['origin'] = origin  # site base URL (module-level name)
        page['title'] = title    # page title (module-level name)
        entries = []
        # Each recent post is wrapped in an <article> element.
        for node in soup.find_all('article'):
            head = node.find('header')
            link = head.h1.a
            entries.append({
                'id': link['href'],
                'href': origin + link['href'],
                'title': link.string.strip(),
                'publishTime': head.time.string.strip(),
            })
        page['blogs'] = entries
        page['blogCount'] = len(entries)
    return page
def parseHchstudio(content=""):
    """Parse the hchstudio blog listing HTML into a structured dict.

    Returns a dict with 'origin', 'title', 'blogs' and 'blogCount' on
    success, or an empty dict when the soup cannot be built.
    """
    page = {}
    soup = getSoup(content)
    if not soup:
        return page
    page['origin'] = origin  # site base URL (module-level name)
    page['title'] = title    # page title (module-level name)
    posts = []
    # Recent posts live in <div class="post"> containers.
    for post in soup.find_all('div', 'post'):
        anchor = post.h1.a
        name = anchor.string.strip()
        posts.append({
            'id': name,
            'title': name,
            'href': origin + anchor['href'],
            'publishTime': post.find('span', 'post-meta').string.strip(),
            'author': post.find('a', id='authorH').string.strip(),
        })
    page['blogs'] = posts
    page['blogCount'] = len(posts)
    return page