def fetcharticle2(self, url, decoder):
    #url = self.http_daili % url[7:]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    print result.realurl
    status_code, content = result.code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None
    soup = BeautifulSoup(content, 'lxml')
    cont = soup.findAll(attrs={"align": "right"})
    url = cont[0].a['href']
    url = self.trueURL_zzh(url)  # article url
    result = opener.open(url)
    status_code, content = result.code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None
    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def ParseFeedUrls(self):
    urls = []
    urladded = set()
    url = self.feeds[0][1]
    section = self.feeds[0][0]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.code == 200 and result.content:
        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")
        tag_a = soup.find_all('a')
        href = tag_a[1]['href']
        temp_url = href[0:6]  # directory prefix taken from the href
        url = 'http://www.housebook.com.cn/' + href
        result = opener.open(url)
        if result.code != 200:
            self.log.warn('fetch rss failed:%s' % url)
            return []
        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")
        tag_a = soup.find_all('a')
        for art in tag_a:
            if art['href'] == '../main.htm':
                continue
            urlfeed = 'http://www.housebook.com.cn/' + temp_url + '/' + art['href']
            title = art.text
            urls.append((section, title, urlfeed, None))
            urladded.add(urlfeed)
    else:
        self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
def ParseFeedUrls(self):  # parse the feed XML and return the article info
    """ return list like [(section,title,url,desc),..] """
    urls = []
    tnow = datetime.datetime.utcnow()
    urladded = set()
    for feed in self.feeds:
        section, url = feed[0], feed[1]
        isfulltext = feed[2] if len(feed) > 2 else False
        timeout = self.timeout + 10 if isfulltext else self.timeout
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        if result.code == 200 and result.content:
            if self.feed_encoding:
                content = result.content.decode(self.feed_encoding)
            else:
                content = AutoDecoder(True).decode(result.content, url)
            feed = feedparser.parse(content)  # parse the feed content

            # walk the parsed entries
            for e in feed['entries'][:self.max_articles_per_feed]:  # honor max_articles_per_feed
                if self.oldest_article > 0 and hasattr(e, 'updated_parsed'):  # entry has an update time?
                    updated = e.updated_parsed
                    if updated:
                        delta = tnow - datetime.datetime(*(updated[0:6]))
                        # decide by age which articles to keep
                        if delta.days * 86400 + delta.seconds > 86400 * self.oldest_article:
                            self.log.info("Skip old article: %s" % e.link)
                            continue
                # keep HTTPS if the feed url itself is served over HTTPS
                urlfeed = e.link.replace('http://', 'https://') if url.startswith('https://') else e.link
                if urlfeed in urladded:
                    continue

                desc = None
                if isfulltext:
                    if hasattr(e, 'content') and e.content[0]['value']:
                        desc = e.content[0]['value']
                    elif hasattr(e, 'description'):
                        desc = e.description
                    else:
                        self.log.warn('fulltext feed item has no desc, link to webpage for article.(%s)' % e.title)
                urls.append((section, e.title, urlfeed, desc))
                urladded.add(urlfeed)
        else:
            self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
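# A minimal, standalone sketch of the age filter used above, assuming a plain
# feedparser entry and an `oldest_article` threshold in days. The function
# name is illustrative, not part of the project API.
import datetime
import feedparser

def is_too_old(entry, oldest_article):
    """Return True if the entry's update time is older than oldest_article days."""
    updated = getattr(entry, 'updated_parsed', None)
    if not updated or oldest_article <= 0:
        return False
    delta = datetime.datetime.utcnow() - datetime.datetime(*updated[:6])
    return delta.days * 86400 + delta.seconds > 86400 * oldest_article

# usage sketch:
# feed = feedparser.parse(xml_text)
# fresh = [e for e in feed.entries if not is_too_old(e, 7)]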
def fetcharticle(self, url, decoder):
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None
    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ mainurl = 'http://www.economist.com/printedition' urls = [] urladded = set() opener = URLOpener(self.host, timeout=30) result = opener.open(mainurl) if result.code != 200: self.log.warn('fetch rss failed:%s' % mainurl) return [] content = result.content.decode(self.feed_encoding) soup = BeautifulSoup(content, "lxml") #href=re.compile("elsie") for section in soup.find_all(id=re.compile("section-")): h4 = section.find('h4') if h4 is None: self.log.warn('h4 is empty') continue sectitle = string_of_tag(h4).strip() if not sectitle: self.log.warn('h4 string is empty') continue #self.log.info('Found section: %s' % section_title) articles = [] subsection = '' for node in section.find_all(class_='article'): subsec = node.find('h5') if subsec is not None: subsection = string_of_tag(subsec) prefix = (subsection + ': ') if subsection else '' a = node.find('a', attrs={"href": True}, recursive=False) if a is not None: url = a['href'] if url.startswith(r'/'): url = 'http://www.economist.com' + url url += '/print' title = string_of_tag(a) if title: title = prefix + title #self.log.info('\tFound article:%s' % title) if url not in urladded: urls.append((sectitle, title, url, None)) urladded.add(url) if len(urls) == 0: self.log.warn('len of urls is zero.') return urls
def ParseFeedUrls(self):
    urls = []
    urladded = set()
    url = self.feeds[0][1]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    section = self.feeds[0][0]
    if result.code == 200 and result.content:
        soup = BeautifulSoup(result.content, 'lxml')
        cont = soup.findAll('item')
        for con in cont:
            title = con.title.get_text()
            href = con.contents[2]
            urls.append((section, title, href, None))
    else:
        self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
def fetcharticle(self, url, decoder):
    """Fetch a single article from its webpage."""
    if self.fulltext_by_instapaper and not self.fulltext_by_readability:
        # use Instapaper's mobilizer for the initial content extraction
        url = "http://www.instapaper.com/m?u=%s" % self.url_unescape(url)
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    code, content = result.code, result.content
    if code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (code, url))
        return None
    if self.page_encoding:
        return content.decode(self.page_encoding)
    else:
        return decoder.decode(content, url)
def ParseFeedUrls(self):
    urls = []
    urladded = set()
    url = self.feeds[0][1]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    section = self.feeds[0][0]
    if result.code == 200 and result.content:
        soup = BeautifulSoup(result.content, 'lxml')
        cont = soup.findAll(attrs={"class": "feed_item_question"})
        for con in cont:
            title = con.a.get_text()
            href = "http://chuansongme.com%s" % con.a['href']
            urls.append((section, title, href, None))
    else:
        self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] url = self.feeds[0][1] opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.code != 200 or not result.content: self.log.warn('fetch webpage failed(%d):%s.' % (result.code, url)) return [] if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(False).decode(result.content, opener.realurl) else: content = AutoDecoder(False).decode(result.content, opener.realurl) soup = BeautifulSoup(content, 'lxml') for article in soup.find_all('div', attrs={'class': 'post'}): title = article.find('a', attrs={'class': 'title'}) if not title or not title.string.startswith(u'安邦'): continue #获取发布时间 pubdate = article.find('span', attrs={'class': 'date'}) if not pubdate: continue mt = re.match(ur'(\d{4})年(\d{1,2})月(\d{1,2})日', pubdate.string) if not mt: continue pubdate = datetime.datetime(int(mt.group(1)), int(mt.group(2)), int(mt.group(3))) #确定文章是否需要推送,时区固定为北京时间 tnow = datetime.datetime.utcnow() + datetime.timedelta(hours=8) delta = tnow - pubdate if self.oldest_article > 0 and delta.days > self.oldest_article: continue urls.append((u'安邦咨询', title.string, title['href'], None)) return urls
def ParseFeedUrls(self):
    urls = []
    urladded = set()
    url = self.feeds[0][1]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    if result.code == 200 and result.content:
        feed = json.loads(result.content.decode(self.feed_encoding))
        for partition, section in self.partitions:
            for item in feed[partition]:
                urlfeed = item['share_url']
                if urlfeed in urladded:
                    self.log.info('skipped %s' % urlfeed)
                    continue
                urls.append((section, item['title'], urlfeed, None))
                urladded.add(urlfeed)
    else:
        self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
def ParseFeedUrls(self):
    urls = []
    urladded = set()
    url = self.feeds[0][1]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    section = self.feeds[0][0]
    if result.code == 200 and result.content:
        soup = BeautifulSoup(result.content, 'lxml')
        cont1 = soup.findAll("title")
        cont2 = soup.findAll("guid")
        nums = len(cont2)
        for i in range(nums):
            title = cont1[i + 2].string
            href = cont2[i].string
            url = self.trueURL_zzh(href)
            urls.append((section, title, url, None))
    else:
        self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
def ParseFeedUrls(self):
    urls = []
    urladded = set()
    url = self.feeds[0][1]
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    section = self.feeds[0][0]
    if result.code == 200 and result.content:
        soup = BeautifulSoup(result.content, 'lxml')
        cont = soup.findAll(attrs={
            "class": "field field-name-title field-type-ds field-label-hidden"
        })
        root_url = 'https://s3.amazonaws.com/pao-pao/%s'
        for con in cont:
            title = con.a.get_text()
            href = root_url % con.a['href']
            urls.append((section, title, href, None))
    else:
        self.log.warn('fetch rss failed(%d):%s' % (result.code, url))
    return urls
def GET(self):
    code = web.input().get('code')
    client = Client(
        KEY_Q,
        SECRET_Q,
        site='https://graph.qq.com',
        authorize_url='https://graph.qq.com/oauth2.0/authorize',
        token_url='https://graph.qq.com/oauth2.0/token')
    if not code:
        try:
            authorize_url = client.auth_code.authorize_url(
                redirect_uri=CALLBACK_Q, scope='get_user_info')
            web.seeother(authorize_url)
        except:
            raise web.seeother(r'/')
    else:
        try:
            access_token = client.auth_code.get_token(
                code, redirect_uri=CALLBACK_Q, parse='query')
            url = "https://graph.qq.com/oauth2.0/me?access_token=%s" % access_token.token
            opener = URLOpener()
            result = opener.open(url)
            r_code, content = result.code, result.content
        except:
            raise web.seeother(r'/')
        if content.find('error') == 0:
            raise web.seeother(r'/')
        if content.find("callback") == 0:
            lp = content.find('(')
            rp = content.find(')')
            con = content[lp + 1:rp - 1]
            try:
                data = json.loads(con)
                openid = data['openid']
                clientid = data['client_id']
                url2 = "https://graph.qq.com/user/get_user_info?oauth_consumer_key=%s&access_token=%s&openid=%s&format=json" % (
                    KEY_Q, access_token.token, openid)
                r2 = opener.open(url2)
                content2 = r2.content
                data2 = json.loads(content2)
                ret = data2['ret']
            except:
                raise web.seeother(r'/')
            if ret == 0:
                #name = data2['nickname']+'('+openid[2:6]+')'
                name = openid[2:6]
                # the user already exists: log in
                if model.isuser(name, 'qq') == 1:
                    session.login = 1
                    session.username = name
                    model.update_logintime(local_time(), name)
                    raise web.seeother(r'/')
                else:
                    # not registered yet: register, log in and return
                    # register
                    model.input_user(name, 'qq')
                    if model.isuser(name, 'qq') == 1:
                        session.login = 1
                        session.username = name
                        raise web.seeother(r'/my')
                    else:
                        return jjenv.get_template("register.html").render(
                            nickname='', title='Register', tips="")
            else:
                raise web.seeother(r'/')
        else:
            raise web.seeother(r'/')
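# The /oauth2.0/me endpoint wraps its JSON in a callback(...) envelope, so the
# handler above slices out the part between the parentheses before json.loads().
# A standalone sketch of that step; the sample response string is illustrative,
# not captured from the real API.
import json

def parse_jsonp(body):
    """Extract the JSON object from a 'callback( {...} );' style response."""
    lp = body.find('(')
    rp = body.rfind(')')
    if lp == -1 or rp == -1:
        return None
    return json.loads(body[lp + 1:rp].strip())

# usage sketch:
# data = parse_jsonp('callback( {"client_id":"1234","openid":"ABCD"} );')
# openid = data['openid']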
encoding = chardet.detect(content)['encoding']
print encoding
result = content.decode(encoding)
netloc = urlparse.urlsplit(url)[1]
print netloc
r.set(netloc, encoding)
print r.get(netloc)
'''
#url='http://tech.sina.com.cn/internet/'
#url='http://tech.sina.com.cn/i/2014-01-08/08039077686.shtml'
#url='http://blog.knownsec.com/2012/04/about-content-encoding-gzip/'
url = 'http://book.douban.com/review/6549990/'
zzh = URLOpener()
re = zzh.open(url)
#print re.info()
#print re.content.decode('GBK').encode('utf-8')
#print re.content
fout = open('zhang_test', 'wb')
fout.write(re.content)
fout.close()
'''
encoding = chardet.detect(re.content)['encoding']
print encoding
print re.headers
print isinstance(re.content, unicode)
print re.content.decode(encoding, 'ignore').encode('utf-8')
'''
doc = readability.Document(re.content)
from lib.img import rescale_image
from lib.url_req import URLOpener
import os

#rescale_image(data, maxsizeb=4000000, dimen=None, png2jpg=False, graying=True, reduceto=(600,800)):
test = URLOpener().open('http://img.xinjunshi.com/uploads/allimg/140224/11-140224101225.jpg')
#test=URLOpener().open('http://www.sucaitianxia.com/d/file/20131222/28caa29d1ddad3c085035e024a9f0b02.png')
con = test.content
con = rescale_image(con, reduceto=(400, 600), graying=False)
fout = open('zzh.jpg', "wb")
fout.write(con)
fout.close()
def Items(self, opts=None):
    decoder = AutoDecoder(False)
    timeout = self.timeout
    for section, url in self.feeds:
        opener = URLOpener(self.host, timeout=timeout)
        result = opener.open(url)
        code, content = result.code, result.content
        if code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (code, url))
            continue

        if self.page_encoding:
            try:
                content = content.decode(self.page_encoding)
            except UnicodeDecodeError:
                content = decoder.decode(content, opener.realurl)
        else:
            content = decoder.decode(content, opener.realurl)

        content = self.preprocess(content)
        soup = BeautifulSoup(content, "lxml")

        h = soup.find('head')
        if not h:
            h = soup.new_tag('head')
            t = soup.new_tag('title')
            t.string = section
            h.append(t)
            soup.html.insert(0, h)

        try:
            title = soup.html.head.title.string
        except AttributeError:
            title = section
        title = self.processtitle(title)

        if self.keep_only_tags:
            body = soup.new_tag('body')
            try:
                if isinstance(self.keep_only_tags, dict):
                    keep_only_tags = [self.keep_only_tags]
                else:
                    keep_only_tags = self.keep_only_tags
                for spec in keep_only_tags:
                    for tag in soup.find('body').find_all(**spec):
                        body.insert(len(body.contents), tag)
                soup.find('body').replace_with(body)
            except AttributeError:  # soup has no body element
                pass

        for spec in self.remove_tags_after:
            tag = soup.find(**spec)
            remove_beyond(tag, 'next_sibling')

        for spec in self.remove_tags_before:
            tag = soup.find(**spec)
            remove_beyond(tag, 'previous_sibling')

        remove_tags = self.insta_remove_tags + self.remove_tags
        remove_ids = self.insta_remove_ids + self.remove_ids
        remove_classes = self.insta_remove_classes + self.remove_classes
        remove_attrs = self.insta_remove_attrs + self.remove_attrs
        for tag in soup.find_all(remove_tags):
            tag.decompose()
        for id in remove_ids:
            for tag in soup.find_all(attrs={"id": id}):
                tag.decompose()
        for cls in remove_classes:
            for tag in soup.find_all(attrs={"class": cls}):
                tag.decompose()
        for attr in remove_attrs:
            for tag in soup.find_all(attrs={attr: True}):
                del tag[attr]
        for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
            cmt.extract()

        if self.extra_css:
            sty = soup.new_tag('style', type="text/css")
            sty.string = self.extra_css
            soup.html.head.append(sty)

        if self.keep_image:
            self.soupbeforeimage(soup)
            for img in soup.find_all('img', attrs={'src': True}):
                imgurl = img['src']
                if img.get('height') in ('1', '2', '3', '4', '5') \
                        or img.get('width') in ('1', '2', '3', '4', '5'):
                    self.log.warn('img size too small, removing it:%s' % imgurl)
                    img.decompose()
                    continue
                if not imgurl.startswith('http'):
                    imgurl = self.urljoin(url, imgurl)
                if self.fetch_img_via_ssl and url.startswith('https://'):
                    imgurl = imgurl.replace('http://', 'https://')
                if self.isfiltered(imgurl):
                    self.log.warn('img filtered:%s' % imgurl)
                    img.decompose()
                    continue
                imgresult = opener.open(imgurl)
                imgcontent = self.process_image(imgresult.content, opts) if imgresult.code == 200 else None
                if imgcontent:
                    imgtype = imghdr.what(None, imgcontent)
                    if imgtype:
                        imgmime = r"image/" + imgtype
                        fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                        img['src'] = fnimg
                        yield (imgmime, imgurl, fnimg, imgcontent, None)
                    else:
                        img.decompose()
                else:
                    self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code, imgurl))
                    img.decompose()

            for img in soup.find_all('img'):
                if img.parent and img.parent.parent and img.parent.name == 'a':
                    img.parent.replace_with(img)
        else:
            for img in soup.find_all('img'):
                img.decompose()

        self.soupprocessex(soup)
        content = unicode(soup)

        brief = u''
        if GENERATE_TOC_DESC:
            body = soup.find('body')
            for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
                h.decompose()
            for s in body.stripped_strings:
                brief += unicode(s) + u' '
                if len(brief) >= TOC_DESC_WORD_LIMIT:
                    brief = brief[:TOC_DESC_WORD_LIMIT]
                    break
        soup = None
        content = self.postprocess(content)
        yield (section, url, title, content, brief)
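# A condensed sketch of the image step in Items() above: the downloaded bytes
# are sniffed with imghdr, given a sequential filename, and emitted alongside
# their mime type. The names (next_image_name, imgindex) are illustrative,
# not part of the project API.
import imghdr

def next_image_name(imgcontent, imgindex):
    """Return (mimetype, filename) for raw image bytes, or (None, None) if unrecognized."""
    imgtype = imghdr.what(None, imgcontent)  # sniff the format from the bytes themselves
    if not imgtype:
        return None, None
    ext = 'jpg' if imgtype == 'jpeg' else imgtype
    return "image/" + imgtype, "img%d.%s" % (imgindex, ext)

# usage sketch:
# mime, fn = next_image_name(open('zzh.jpg', 'rb').read(), 0)
# -> ('image/jpeg', 'img0.jpg') for a JPEG file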
def readability_by_soup(self, article, url, opts=None):
    content = self.preprocess(article)
    soup = BeautifulSoup(content, "lxml")

    try:
        title = soup.html.head.title.string
    except AttributeError:
        self.log.warn('object soup invalid!(%s)' % url)
        return
    title = self.processtitle(title)
    soup.html.head.title.string = title

    if self.keep_only_tags:
        body = soup.new_tag('body')
        try:
            if isinstance(self.keep_only_tags, dict):
                keep_only_tags = [self.keep_only_tags]
            else:
                keep_only_tags = self.keep_only_tags
            for spec in keep_only_tags:
                for tag in soup.find('body').find_all(**spec):
                    body.insert(len(body.contents), tag)
            soup.find('body').replace_with(body)
        except AttributeError:
            pass

    for spec in self.remove_tags_after:
        tag = soup.find(**spec)
        remove_beyond(tag, 'next_sibling')

    for spec in self.remove_tags_before:
        tag = soup.find(**spec)
        remove_beyond(tag, 'previous_sibling')

    remove_tags = self.insta_remove_tags + self.remove_tags
    remove_ids = self.insta_remove_ids + self.remove_ids
    remove_classes = self.insta_remove_classes + self.remove_classes
    remove_attrs = self.insta_remove_attrs + self.remove_attrs
    for tag in soup.find_all(remove_tags):
        tag.decompose()
    for id in remove_ids:
        for tag in soup.find_all(attrs={"id": id}):
            tag.decompose()
    for cls in remove_classes:
        for tag in soup.find_all(attrs={"class": cls}):
            tag.decompose()
    for attr in remove_attrs:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.extra_css:
        sty = soup.new_tag('style', type="text/css")
        sty.string = self.extra_css
        soup.html.head.append(sty)

    if self.keep_image:
        opener = URLOpener(self.host, timeout=self.timeout)
        self.soupbeforeimage(soup)
        for img in soup.find_all('img', attrs={'src': True}):
            imgurl = img['src']
            if img.get('height') in ('1', '2', '3', '4', '5') \
                    or img.get('width') in ('1', '2', '3', '4', '5'):
                self.log.warn('img size too small, removing it:%s' % imgurl)
                img.decompose()
                continue
            if not imgurl.startswith('http'):
                imgurl = self.urljoin(url, imgurl)
            if self.fetch_img_via_ssl and url.startswith('https://'):
                imgurl = imgurl.replace('http://', 'https://')
            if self.isfiltered(imgurl):
                self.log.warn('img filtered:%s' % imgurl)
                img.decompose()
                continue
            imgresult = opener.open(imgurl)
            imgcontent = self.process_image(imgresult.content, opts) if imgresult.code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent, None)
                else:
                    img.decompose()
            else:
                self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code, imgurl))
                img.decompose()

        for img in soup.find_all('img'):  # strip links wrapping images
            if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                img.parent.replace_with(img)
    else:
        for img in soup.find_all('img'):
            img.decompose()

    # add a content heading if the article has none
    t = soup.html.body.find(['h1', 'h2'])
    if not t:
        t = soup.new_tag('h1')
        t.string = title
        soup.html.body.insert(0, t)
    else:
        totallen = 0
        for ps in t.previous_siblings:
            totallen += len(string_of_tag(ps))
            if totallen > 40:  # this h1/h2 sits in the middle of the article, so it is not the title
                t = soup.new_tag('h1')
                t.string = title
                soup.html.body.insert(0, t)
                break

    self.soupprocessex(soup)
    content = unicode(soup)

    # take the leading part of the article text as the summary
    brief = u''
    if GENERATE_TOC_DESC:
        body = soup.find('body')
        for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
            h.decompose()
        for s in body.stripped_strings:
            brief += unicode(s) + u' '
            if len(brief) >= TOC_DESC_WORD_LIMIT:
                brief = brief[:TOC_DESC_WORD_LIMIT]
                break
    soup = None
    yield (title, None, None, content, brief)
def readability(self, article, url, opts=None):
    """ Process the full article text with readability-lxml. """
    content = self.preprocess(article)
    # print '--------------'
    # print content
    # print '---------------'

    # extract the main content
    try:
        doc = readability.Document(content)
        summary = doc.summary(html_partial=True)
    except:
        self.log.warn('article is invalid.[%s]' % url)
        return

    title = doc.short_title()
    title = self.processtitle(title)
    # print '=================='
    # print summary
    # print '==================='

    soup = BeautifulSoup(summary, 'lxml')
    # soup = BeautifulSoup(content,'lxml')
    '''
    # no head
    h = soup.find('head')
    if not h:
        h = soup.new_tag('head')
        t = soup.new_tag('title')
        t.string = title
        h.append(t)
        soup.html.insert(0, h)
    # no heading
    t = soup.html.body.find(['h1', 'h2'])
    if not t:
        t = soup.new_tag('h1')
        t.string = title
        soup.html.body.insert(0, t)
    else:
        totallen = 0
        for ps in t.previous_siblings:
            totallen += len(string_of_tag(ps))
            if totallen > 40:
                t = soup.new_tag('h1')
                t.string = title
                soup.html.body.insert(0, t)
                break
    '''
    self.soupbeforeimage(soup)

    if self.remove_tags:
        for tag in soup.find_all(self.remove_tags):
            tag.decompose()
    for id in self.remove_ids:
        for tag in soup.find_all(attrs={"id": id}):
            tag.decompose()
    for cls in self.remove_classes:
        for tag in soup.find_all(attrs={"class": cls}):
            tag.decompose()
    for attr in self.remove_attrs:
        for tag in soup.find_all(attrs={attr: True}):
            del tag[attr]
    for cmt in soup.find_all(text=lambda text: isinstance(text, Comment)):
        cmt.extract()

    if self.extra_css:
        sty = soup.new_tag('style', type="text/css")
        sty.string = self.extra_css
        soup.html.head.append(sty)

    if self.keep_image:
        opener = URLOpener(self.host, timeout=self.timeout)
        for img in soup.find_all('img', attrs={'src': True}):
            imgurl = img['src']
            if img.get('height') in ('1', '2', '3', '4', '5') \
                    or img.get('width') in ('1', '2', '3', '4', '5'):
                self.log.warn('img size too small, removing it : %s' % imgurl)
                img.decompose()
                continue
            if not imgurl.startswith('http'):
                imgurl = self.urljoin(url, imgurl)
            if self.fetch_img_via_ssl and url.startswith('https://'):
                imgurl = imgurl.replace('http://', 'https://')
            if self.isfiltered(imgurl):
                self.log.warn('img filtered : %s' % imgurl)
                img.decompose()
                continue
            imgresult = opener.open(imgurl)
            imgcontent = self.process_image(imgresult.content, opts) if imgresult.code == 200 else None
            if imgcontent:
                imgtype = imghdr.what(None, imgcontent)
                if imgtype:
                    imgmime = r"image/" + imgtype
                    fnimg = "img%d.%s" % (self.imgindex, 'jpg' if imgtype == 'jpeg' else imgtype)
                    img['src'] = fnimg
                    yield (imgmime, imgurl, fnimg, imgcontent, None)
                else:
                    img.decompose()
            else:
                self.log.warn('fetch img failed(err:%d):%s' % (imgresult.code, imgurl))
                img.decompose()

        # strip links wrapping images
        for img in soup.find_all('img'):
            if img.parent and img.parent.parent and \
                    img.parent.name == 'a':
                img.parent.replace_with(img)
    else:
        for img in soup.find_all('img'):
            img.decompose()

    self.soupprocessex(soup)
    # print '====-=-=-=-=-=-=-='
    # print soup
    # print '-=-=-=-=-=-=-=-=-=-=-'

    cc = soup.body.contents[0]
    # cc.name = "articleblock"
    # print cc
    # print soup.body.renderContents()
    #content = unicode(soup)
    content = unicode(cc)
    #print soup.find('body').contents
    #print soup.body.contents

    # take the leading part of the article text as the summary
    brief = u''
    if GENERATE_TOC_DESC:
        body = soup.find('body')
        for h in body.find_all(['h1', 'h2']):  # remove h1/h2 to avoid duplicating the title
            h.decompose()
        for s in body.stripped_strings:
            brief += unicode(s) + u' '
            if len(brief) >= TOC_DESC_WORD_LIMIT:
                brief = brief[:TOC_DESC_WORD_LIMIT]
                break
    soup = None
    yield (title, None, None, content, brief)
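# A minimal usage sketch of the readability-lxml calls the method above relies
# on: Document() wraps the raw HTML, summary(html_partial=True) returns the
# extracted article fragment, and short_title() gives a cleaned-up title.
# The sample HTML string is illustrative only.
import readability

html = '<html><head><title>Example - Site</title></head><body><p>' + 'text ' * 200 + '</p></body></html>'
doc = readability.Document(html)
print doc.short_title()               # cleaned-up title
print doc.summary(html_partial=True)  # main-content HTML without the <html>/<body> wrapper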