def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'http://www.reuters.com/places/north-korea'
    urls = []
    isEST = False  #tracks whether timestamps are EST or EDT
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    section = soup.find('div', attrs={'class': 'topStory'})
    toparticle = section.find('a', href=True)
    if toparticle is None:
        self.log.warn('Top news not found')
    else:
        toptitle = string_of_tag(toparticle).strip()
        if not toptitle:
            self.log.warn('No top story title')
        url = toparticle['href']
        if url.startswith(r'/'):
            url = 'http://www.reuters.com' + url
        urls.append(('Reuters North Korea', toptitle, url, None))

    sect = soup.find('div', id='moreSectionNews')
    for feature in sect.find_all('div', attrs={'class': 'feature'}):
        article = feature.find('a', href=True)
        title = string_of_tag(article).strip()
        url = article['href']
        timestamp = feature.find('span', attrs={'class': 'timestamp'})
        if not timestamp:
            continue
        timestamp = string_of_tag(timestamp).strip()
        #today's articles carry a time-of-day stamp (EDT/EST); older ones carry a date
        if 'EDT' in timestamp or 'EST' in timestamp:
            delta = 0
            if 'EST' in timestamp:
                isEST = True
        else:
            pubtime = datetime.datetime.strptime(timestamp, '%b %d %Y').date()
            #assume EDT (UTC-4) by default; use EST (UTC-5) in winter months or if an EST stamp was seen
            tnow = datetime.datetime.utcnow() - datetime.timedelta(hours=4)
            currentmonth = tnow.month
            if currentmonth in [1, 2, 12] or isEST:
                tnow = datetime.datetime.utcnow() - datetime.timedelta(hours=5)
            tnow = tnow.date()
            delta = (tnow - pubtime).days
        if self.oldest_article > 0 and delta > self.oldest_article:
            continue
        if url.startswith(r'/'):
            url = 'http://www.reuters.com' + url
        #self.log.info('\tFound article:%s' % title)
        urls.append(('Reuters North Korea', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'http://www.thepaper.cn/list_masonry.jsp?nodeid=26878'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    for article in soup.find_all('div', class_='news_li', limit=6):
        inter = article.find('div', class_='pdtt_trbs')
        timestamp = inter.find('span')
        timestamp = string_of_tag(timestamp).strip()
        #skip items older than today (stamps like "2天前" or a dated "07-30")
        if u'天' in timestamp or u'-' in timestamp:
            continue
        h2 = article.find('h2')
        a = h2.find('a', href=True)
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        if url.startswith(r'news'):
            url = 'http://www.thepaper.cn/' + url
        urls.append((u'上海书评', title, url, None))
    if len(urls) == 0:
        self.log.warn('No article found for Shanghai Book Review.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'http://www.jintiankansha.me/column/nwClF5ZmDJ'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    section = soup.find('div', class_='entries')
    for article in section.find_all('div', class_='cell item', limit=10):
        timestamp = article.find('span', class_='small fade')
        timestamp = string_of_tag(timestamp).strip()
        #keep only items posted within the last day (stamps like "N小时")
        #if u'小时' not in timestamp and u'昨天' not in timestamp:
        if u'小时' not in timestamp:
            continue
        span = article.find('span', class_='item_title')
        a = span.find('a', href=True)
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        urls.append((u'聊聊架构', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "lxml")

    for section in soup.find_all('dl'):
        dt = section.find('dt')
        span = dt.find('span')
        if span:
            sectitle = string_of_tag(span).strip()
        for dd in section.find_all('dd'):
            a = dd.find('a', href=True)
            title = string_of_tag(a).strip()
            url = a['href']
            if url.startswith('Article'):
                url = 'http://bbstsg.vip.qikan.com/text/' + url
            urls.append((sectitle, title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'http://mp.sohu.com/profile?xpt=bWhtaW5nMUBzb2h1LmNvbQ==&_f=index_pagemp_1'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    for article in soup.find_all('div', class_='content_wrap', limit=6):
        timestamp = article.find('div', class_='wrap_mark')
        span = timestamp.find('span')
        timestamp = string_of_tag(span).strip()
        #keep only items posted today ("今天") or yesterday ("昨天")
        if u'今天' not in timestamp and u'昨天' not in timestamp:
            continue
        div = article.find('div', class_='wrap_title')
        a = div.find('a', href=True)  #the article link lives in the title block, not the timestamp span
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        if url.startswith('/'):
            url = 'http:' + url
        urls.append((u'古代小说网sohu', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'https://www.nknews.org/'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    def is_cls_wanted(css_class):
        listwanted = ['col-md-7', 'post-prinicap-row', 'col-md-12', 'col-md-6 smallboxclass']
        return css_class in listwanted

    # def not_has_class(tag):
    #     return not tag.has_attr('class')

    for section in soup.find_all(class_=is_cls_wanted, limit=8):
        article = section.find('a', string=True)
        title = string_of_tag(article).strip()
        url = article['href']
        if '/pro/' in url:
            continue
        span = article.find('span')
        strong = span.find('strong')
        if not strong:
            timestamp = span
        else:
            timestamp = strong
        timestamp = string_of_tag(timestamp).strip()
        tnow = datetime.datetime.utcnow().date()
        m = re.search(r'\d{4}$', timestamp)
        if m:
            pubtime = datetime.datetime.strptime(timestamp, '%d %B %Y').date()
        else:
            m2 = re.search(r'^\d', timestamp)
            if m2:
                pubtime = datetime.datetime.strptime(timestamp, '%d %B').date()
            else:
                pubtime = datetime.datetime.strptime(timestamp, '%B %d').date()
            #strptime defaults to year 1900 when the stamp omits the year; assume the current year
            pubtime = pubtime.replace(year=tnow.year)
        delta = (tnow - pubtime).days
        if self.oldest_article > 0 and delta > self.oldest_article:
            continue
        #self.log.info('\tFound article:%s' % title)
        urls.append(('NK News', title, url, None))
    if len(urls) == 0:
        self.log.warn('NK News has no article.')
    return urls
def ParseFeedUrls(self):
    #return list like [(section,title,url,desc),..]
    main = 'https://www.economist.com/printedition'
    # Did you block me?
    main = self.url4forwarder(main)
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    for section in soup.find_all('li', attrs={'class': 'list__item'}):
        div = section.find('div')
        if div is None:
            self.log.warn('This part skipped.')
            continue
        sectitle = string_of_tag(div).strip()
        if not sectitle:
            self.log.warn('No section title')
            continue
        if sectitle == 'Economic and financial indicators':
            continue
        #self.log.info('Found section: %s' % section_title)
        articles = []
        for node in section.find_all('a', href=True):
            spans = node.findAll('span')
            if len(spans) == 2:
                fly = node.find('span', attrs={'class': 'print-edition__link-flytitle'})
                pre = string_of_tag(fly).strip()
                ti = node.find('span', attrs={'class': 'print-edition__link-title'})
                post = string_of_tag(ti).strip()
                title = pre + ': ' + post
            else:
                title = string_of_tag(node).strip()
            url = node['href']
            if url.startswith(r'/'):
                url = 'http://www.economist.com' + url
            # Did you block me?
            url = self.url4forwarder(url)
            #self.log.info('\tFound article:%s' % title)
            if url not in urladded:
                urls.append((sectitle, title, url, None))
                urladded.add(url)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    mainhead = 'https://www.yna.co.kr/international/china/'
    num = 1
    urls = []
    callitaday = False
    koreanow = datetime.datetime.utcnow() + datetime.timedelta(hours=9)
    # koreadate = koreanow.date()
    year = koreanow.year
    mydelta = datetime.timedelta(hours=24, minutes=10)
    while not callitaday:
        main = mainhead + str(num)
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch mainnews failed:%s' % main)
            break  #stop paging if a listing page cannot be fetched
        content = result.content.decode(self.page_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        section = soup.find('div', class_='list-type038')
        for article in section.find_all('div', class_='item-box01'):
            if article is None:
                self.log.warn('This article not found')
                continue
            ptime = article.find('span', class_='txt-time')
            if ptime:
                ptime = string_of_tag(ptime).strip()
                # pdate = ptime[0:5]  #keep only a date like 07-30
                ptime = str(year) + '-' + ptime  #prepend the year, otherwise strptime defaults to 1900
                ptime = datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M')
                delta = koreanow - ptime
                # if self.oldest_article > 0 and delta >= self.oldest_article:
                if delta > mydelta:
                    callitaday = True
                    break  #articles are listed in chronological order
            newscon = article.find('div', class_='news-con')
            a = newscon.find('a', href=True)
            atitle = string_of_tag(a).strip()
            atitle = atitle + ' ' + str(ptime)[5:-3]
            url = a['href']
            if url.startswith('/'):
                url = 'https:' + url
            urls.append((u'중국 뉴스', atitle, url, None))
        num = num + 1
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ mainurl = 'http://www.economist.com/printedition' urls = [] urladded = set() opener = URLOpener(self.host, timeout=30) result = opener.open(mainurl) if result.status_code != 200: self.log.warn('fetch rss failed:%s' % mainurl) return [] content = result.content.decode(self.feed_encoding) soup = BeautifulSoup(content, "lxml") #GAE获取到的是移动端网页,和PC获取到的网页有些不一样 for section in soup.find_all( 'section', attrs={'id': lambda x: x and 'section' in x}): h4 = section.find('h4') if h4 is None: self.log.warn('h4 is empty') continue sectitle = string_of_tag(h4).strip() if not sectitle: self.log.warn('h4 string is empty') continue #self.log.info('Found section: %s' % section_title) articles = [] subsection = '' for node in section.find_all('article'): subsec = node.find('h5') if subsec is not None: subsection = string_of_tag(subsec) prefix = (subsection + ': ') if subsection else '' a = node.find('a', attrs={"href": True}, recursive=False) if a is not None: url = a['href'] if url.startswith(r'/'): url = 'http://www.economist.com' + url url += '/print' title = string_of_tag(a) if title: title = prefix + title #self.log.info('\tFound article:%s' % title) if url not in urladded: urls.append((sectitle, title, url, None)) urladded.add(url) if len(urls) == 0: self.log.warn('len of urls is zero.') return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ mainurl = 'http://www.economist.com/printedition' urls = [] urladded = set() opener = URLOpener(self.host, timeout=30) result = opener.open(mainurl) if result.status_code != 200: self.log.warn('fetch rss failed:%s'%mainurl) return [] content = result.content.decode(self.feed_encoding) soup = BeautifulSoup(content, "lxml") #GAE获取到的是移动端网页,和PC获取到的网页有些不一样 for section in soup.find_all('section', attrs={'id':lambda x: x and 'section' in x}): h4 = section.find('h4') if h4 is None: self.log.warn('h4 is empty') continue sectitle = string_of_tag(h4).strip() if not sectitle: self.log.warn('h4 string is empty') continue #self.log.info('Found section: %s' % section_title) articles = [] subsection = '' for node in section.find_all('article'): subsec = node.find('h5') if subsec is not None: subsection = string_of_tag(subsec) prefix = (subsection + ': ') if subsection else '' a = node.find('a', attrs={"href":True}, recursive=False) if a is not None: url = a['href'] if url.startswith(r'/'): url = 'http://www.economist.com' + url url += '/print' title = string_of_tag(a) if title: title = prefix + title #self.log.info('\tFound article:%s' % title) if url not in urladded: urls.append((sectitle,title,url,None)) urladded.add(url) if len(urls) == 0: self.log.warn('len of urls is zero.') return urls
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ mainurl = "http://www.economist.com/printedition" urls = [] urladded = set() opener = URLOpener(self.host, timeout=30) result = opener.open(mainurl) if result.status_code != 200: self.log.warn("fetch rss failed:%s" % mainurl) return [] content = result.content.decode(self.feed_encoding) soup = BeautifulSoup(content, "lxml") # GAE获取到的是移动端网页,和PC获取到的网页有些不一样 for section in soup.find_all("section", attrs={"id": lambda x: x and "section" in x}): h4 = section.find("h4") if h4 is None: self.log.warn("h4 is empty") continue sectitle = string_of_tag(h4).strip() if not sectitle: self.log.warn("h4 string is empty") continue # self.log.info('Found section: %s' % section_title) articles = [] subsection = "" for node in section.find_all("article"): subsec = node.find("h5") if subsec is not None: subsection = string_of_tag(subsec) prefix = (subsection + ": ") if subsection else "" a = node.find("a", attrs={"href": True}, recursive=False) if a is not None: url = a["href"] if url.startswith(r"/"): url = "http://www.economist.com" + url url += "/print" title = string_of_tag(a) if title: title = prefix + title # self.log.info('\tFound article:%s' % title) if url not in urladded: urls.append((sectitle, title, url, None)) urladded.add(url) if len(urls) == 0: self.log.warn("len of urls is zero.") return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'http://news.joins.com/Issue/10061'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing; keep only the four most recent pieces (roughly the last month)
    for article in soup.find_all('strong', class_='headline mg', limit=4):
        a = article.find('a', href=True)
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        if url.startswith('/'):
            url = 'http://news.joins.com' + url
        urls.append((u'사설 속으로', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    #split today's date into a list like ['2017', '10', '09']
    datetime_t = str(datetime.date.today()).split('-')
    # main = 'http://csr.mos.gov.cn/content/1/'
    #url prefix carrying the date
    mainurl = 'http://csr.mos.gov.cn/content/' + datetime_t[0] + '-' + datetime_t[1] + '/' + datetime_t[2] + '/'
    #mainurl = 'http://csr.mos.gov.cn/content/' + datetime_t[0] + '-' + datetime_t[1] + '/' + datetime_t[2] + '/' + 'node_2.htm'  #full url of the front page
    ans = []
    #urladded = set()
    # opener = URLOpener(self.host, timeout=90)
    # result = opener.open(mainurl + 'node_2.htm')
    soup1 = self.page_to_soup(mainurl + 'node_2.htm')
    #if result.status_code != 200:
    #    self.log.warn('fetch mainnews failed:%s' % mainurl)
    # content = result.content.decode(self.page_encoding)
    # soup = BeautifulSoup(content, "lxml")

    #start parsing
    mulu = soup1.find('td', {'class': 'mulu04'})
    for banmian in mulu.find_all('a'):
        articles = []
        if 'pdf' in banmian['href']:
            continue
        wenzhangliebiao = self.page_to_soup(mainurl + banmian['href'])
        vol_title = banmian.contents[0].strip()
        ul = wenzhangliebiao.find('ul', {'class': 'list01'})  #the block that holds the article links
        for link in ul.find_all('a'):
            til = string_of_tag(link)
            url = mainurl + link['href']
            desc = ''
            #r = .find({'class':'title01'})
            #if r is not None:
            #    desc = self.tag_to_string(r)
            # wz = {'fTitle':til, 'url': url}
            #self.log.warn('href为:%s' % url)
            #articles.append(wz)
            # ans0 = (vol_title, wz)
            ans.append((vol_title, til, url, None))
            #urladded.add(url)
    if len(ans) == 0:
        self.log.warn('len of urls is zero.')
    return ans
def ParseFeedUrls(self): """ return list like [(section,title,url,desc),..] """ urls = [] for feed in self.feeds: feedtitle,url = feed[0],feed[1] opener = URLOpener(self.host, timeout=self.timeout) result = opener.open(url) if result.status_code != 200 or not result.content: self.log.warn('fetch webpage failed(%d):%s.' % (result.status_code, url)) continue if self.feed_encoding: try: content = result.content.decode(self.feed_encoding) except UnicodeDecodeError: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) else: content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers) soup = BeautifulSoup(content, 'lxml') for article in soup.find_all('div', attrs={'class':'feed_item_question'}): title = article.find('a', attrs={'class':'question_link'}) if not title: continue #获取发布时间 pubdate = article.find('span',attrs={'class':'timestamp'}) if not pubdate: continue try: pubdate = datetime.datetime.strptime(pubdate.string, '%Y-%m-%d') except Exception as e: self.log.warn('parse pubdate failed for [%s] : %s'%(url,str(e))) continue #确定文章是否需要推送,时区固定为北京时间 tnow = datetime.datetime.utcnow()+datetime.timedelta(hours=8) delta = tnow - pubdate if self.oldest_article > 0 and delta.days > self.oldest_article: continue href = title['href'] if title['href'].startswith('http') else self.urljoin(url,title['href']) urls.append((feedtitle,string_of_tag(title),href,None)) return urls
def ParseFeedUrls(self):
    #return lists like [(section,title,url,desc),..]
    main = 'https://www.yna.co.kr/nk/index'
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch mainnews failed:%s' % main)
    content = result.content.decode(self.page_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    section = soup.find('section', attrs={'class': 'column-type01 column-newslist'})
    for article in section.find_all('article'):
        if article is None:
            self.log.warn('This article not found')
            continue
        h2 = article.find('h2')
        a = h2.find('a', href=True)
        atitle = string_of_tag(a).strip()
        url = a['href']
        if url.startswith('/'):
            url = 'https:' + url
        elif url.startswith('HTTP'):
            url = url.replace('HTTP', 'http')
        if url not in urladded:
            urls.append((u'韩联社朝鲜要闻', atitle, url, None))
            urladded.add(url)
        related = article.find('div', attrs={'class': 'v-related'})
        if related:
            span = related.find('span')
            if span:
                relateda = span.find('a', href=True)
                rtitle = string_of_tag(relateda).strip()
                rtitle = 'Related: ' + rtitle  #mark related stories in the title
                rurl = relateda['href']
                if rurl.startswith('/'):
                    rurl = 'https:' + rurl
                elif rurl.startswith('HTTP'):
                    rurl = rurl.replace('HTTP', 'http')
                if rurl not in urladded:
                    urls.append((u'韩联社朝鲜要闻', rtitle, rurl, None))
                    urladded.add(rurl)

    part2 = 'https://www.yna.co.kr/nk/news/all'
    opener2 = URLOpener(self.host, timeout=90)
    result2 = opener2.open(part2)
    if result2.status_code != 200:
        self.log.warn('fetch latest news failed:%s' % part2)
    content2 = result2.content.decode(self.page_encoding)
    soup2 = BeautifulSoup(content2, "lxml")
    sect = soup2.find('ul', attrs={'class': 'list-type01 yna-more'})
    for arti in sect.find_all('article'):
        h = arti.find('h2')
        a2 = h.find('a', href=True)
        title = string_of_tag(a2).strip()
        if u'[북한날씨]' in title:
            continue
        aurl = a2['href']
        if aurl.startswith('/'):
            aurl = 'https:' + aurl
        elif aurl.startswith('HTTP'):
            aurl = aurl.replace('HTTP', 'http')
        if aurl not in urladded:
            urls.append((u'朝鲜最新消息', title, aurl, None))
            urladded.add(aurl)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    #return list like [(section,title,url,desc),..]
    def FindHo():
        hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(hopage)
        content = result.content.decode('euc-kr')
        if result.status_code != 200:
            self.log.warn('fetching hopage failed:%s' % hopage)
        soup = BeautifulSoup(content, "lxml")
        location = soup.find('div', id='Location')
        edition = location.find('div', class_='edition')
        ho = string_of_tag(edition).strip()
        if ho.startswith('['):
            ho = ho[1:5]
        else:
            self.log.warn('Fetching ho failed.')
        return ho

    mainhead = 'http://weekly.chosun.com/client/news/alllst.asp?nHo='
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    ho = FindHo()
    main = mainhead + ho
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('Fetching TOC failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    def tr_has_a_tag(tag):
        return tag.name == 'tr' and tag.find('a')

    listarea = soup.find('div', class_='List_area')
    for section in listarea.find_all('table'):
        h4 = section.find_previous_sibling('h4')
        sectitle = string_of_tag(h4).strip()
        if not sectitle:
            self.log.warn('No section title')
            continue
        # if sectitle == 'Economic and financial indicators':
        #     continue
        #self.log.info('Found section: %s' % section_title)
        articles = []
        for tr in section.find_all(tr_has_a_tag):
            article = tr.find('a', href=True)
            title = string_of_tag(article).strip()
            url = article['href']
            if url.startswith('viw'):
                url = 'http://weekly.chosun.com/client/news/' + url
                url = url.replace('viw', 'print', 1)
            #self.log.info('\tFound article:%s' % title)
            if url not in urladded:
                urls.append((sectitle, title, url, None))
                urladded.add(url)
    if len(urls) == 0:
        self.log.warn('No articles found for WeeklyChosun.')
    return urls
def ParseFeedUrls(self):
    #return list like [(section,title,url,desc),..]
    login_url = 'https://my.economist.com/'
    main = 'https://www.economist.com/weeklyedition'
    # login_form = {"css-1gytnsx":self.account, "password":self.password}
    # login_response = opener.open(login_url, data=login_form)
    # main = 'https://www.economist.com/'
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    # content = result.content.decode(self.feed_encoding)
    # soup = BeautifulSoup(content, "lxml")
    # a = soup.find('a', attrs={'class':'latest-printed__cta'})
    # current = a['href']
    # if current.startswith(r'/'):
    #     current = 'https://www.economist.com' + url
    # opener = URLOpener(self.host, timeout=90)
    # result = opener.open(current)
    # if result.status_code != 200:
    #     self.log.warn('fetch latest edition failed:%s' % main)
    #     return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    #start parsing
    # for section in soup.find_all('li', attrs={'class':'list__item'}):
    #     div = section.find('div')
    #     if div is None:
    #         self.log.warn('This part skipped.')
    #         continue
    thisweek = soup.find('div', class_='layout-weekly-edition-wtw')
    if thisweek:
        h2 = thisweek.find('h2')
        sectitle = string_of_tag(h2).strip()
        if not sectitle:
            self.log.warn('No section title for the world this week')
        for week in thisweek.find_all('a', href=True):
            title = string_of_tag(week).strip()
            url = week['href']
            if url.startswith(r'/'):
                url = 'https://www.economist.com' + url
            urls.append((sectitle, title, url, None))
    else:
        self.log.warn('The world this week not found.')

    for section in soup.find_all(class_=lambda value: value and value.startswith('layout-weekly-edition-section')):
        h2 = section.find('h2')
        sectitle = string_of_tag(h2).strip()
        if not sectitle:
            self.log.warn('No section title')
            continue
        if 'financial indicators' in sectitle:
            continue
        #self.log.info('Found section: %s' % section_title)
        # articles = []
        for node in section.find_all('a', href=True, class_=lambda value: value and value.startswith('headline-link')):
            spans = node.find_all('span')
            if len(spans) == 2:
                title = u'{}: {}'.format(*map(string_of_tag, spans))
                # for node in section.find_all('a', href=True):
                #     spans = node.findAll('span')
                #     if len(spans) == 2:
                #         fly = node.find('span', attrs={'class':'print-edition__link-flytitle'})
                #         pre = string_of_tag(fly).strip()
                #         ti = node.find('span', attrs={'class':'print-edition__link-title'})
                #         post = string_of_tag(ti).strip()
                #         title = pre + ': ' + post
            else:
                title = string_of_tag(node).strip()
            url = node['href']
            if url.startswith(r'/'):
                url = 'https://www.economist.com' + url
            #self.log.info('\tFound article:%s' % title)
            if url not in urladded:
                urls.append((sectitle, title, url, None))
                urladded.add(url)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls