def FetchDesc(self, url):
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(url)
    if result.status_code != 200:
        self.log.warn('fetch article failed(%d):%s.' % (result.status_code, url))
        return None
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, 'lxml')
    abstract = unicode(soup.find('div', attrs={'class': 'zhaiyao'}))
    article = unicode(soup.find(id='contents'))
    pagelist = soup.find('ul', attrs={'class': 'pagelist'})
    if pagelist and pagelist.find('li'):
        page_count_context = pagelist.a.text
        page_count = int(page_count_context[1:page_count_context.index(u'页')])
        for i in range(2, page_count + 1):
            page_url = url[:-5] + "_%d.html" % i
            result = opener.open(page_url)
            if result.status_code != 200:
                self.log.warn('fetch page failed(%d):%s.' % (result.status_code, page_url))
                return None
            content = result.content.decode(self.feed_encoding)
            pagesoup = BeautifulSoup(content, 'lxml')
            article += unicode(pagesoup.find(id='contents'))
    return abstract + article
def fetch_cover(self):
    # mainurl = 'https://www.economist.com'
    mainurl = SHARE_FUCK_GFW_SRV % urllib.quote('https://www.economist.com')
    opener = URLOpener(None, timeout=180)
    # opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('utf-8')
    # content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # wrapper = soup.find('div', attrs={'class':'print-edition__cover-wrapper'})
    div = soup.find('div', class_='current-edition__cover')
    if div is not None:
        self.log.warn('Div found.')
    img = div.find('img', src=True)
    cover = img.get('src')
    if cover:
        self.log.warn('Cover: ' + cover)
    else:
        self.log.warn('No cover.')
    cover = SHARE_FUCK_GFW_SRV % urllib.quote(cover)
    opener = URLOpener()
    result = opener.open(cover)
    if result.status_code == 200 and result.content:
        return result.content
    else:
        raise Exception('Failed to fetch cover for TE.')
def ParseFeedUrls(self):
    login_url = "http://passport.infzm.com/passport/login"
    content_url = "http://www.infzm.com/enews/infzm"
    urls = []
    opener = URLOpener(self.host, timeout=60)
    login_form = {"loginname": self.account, "password": self.password}
    login_response = opener.open(login_url, data=login_form)
    opener.SaveCookies(login_response.header_msg.getheaders('Set-Cookie'))
    result = opener.open(content_url)
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    sec_titles = []
    for sec_name in soup.find_all('h2'):
        sec_titles.append(sec_name.get_text())
    for top_news in soup.find_all('dl', {'class': 'topnews'}):
        url = top_news.a['href']
        feed_content = opener.open(url).content.decode(self.feed_encoding)
        feed_soup = BeautifulSoup(feed_content, "lxml")
        urls.append((sec_titles[0], top_news.a['title'], url,
                     feed_soup.find(id="articleContent")))
    sec_count = 0
    for sec_content in soup.find_all('ul', {'class': 'relnews'}):
        for a in sec_content.find_all('a'):
            url = a['href']
            feed_content = opener.open(url).content.decode(self.feed_encoding)
            feed_soup = BeautifulSoup(feed_content, "lxml")
            urls.append((sec_titles[sec_count], a['title'], url,
                         feed_soup.find(id="articleContent")))
        sec_count += 1
    return urls
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'http://www.reuters.com/places/north-korea'
    urls = []
    isEST = False  # distinguish EST from EDT
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    section = soup.find('div', attrs={'class': 'topStory'})
    toparticle = section.find('a', href=True)
    if toparticle is None:
        self.log.warn('Top news not found')
    toptitle = string_of_tag(toparticle).strip()
    if not toptitle:
        self.log.warn('No top story title')
    url = toparticle['href']
    if url.startswith(r'/'):
        url = 'http://www.reuters.com' + url
    urls.append(('Reuters North Korea', toptitle, url, None))
    sect = soup.find('div', id='moreSectionNews')
    for feature in sect.find_all('div', attrs={'class': 'feature'}):
        article = feature.find('a', href=True)
        title = string_of_tag(article).strip()
        url = article['href']
        timestamp = feature.find('span', attrs={'class': 'timestamp'})
        if not timestamp:
            continue
        timestamp = string_of_tag(timestamp).strip()
        # published today
        if 'EDT' in timestamp or 'EST' in timestamp:
            delta = 0
            if 'EST' in timestamp:
                isEST = True
        else:
            pubtime = datetime.datetime.strptime(timestamp, '%b %d %Y').date()
            # assume EDT by default
            tnow = datetime.datetime.utcnow() - datetime.timedelta(hours=4)
            currentmonth = tnow.month
            if currentmonth in [1, 2, 12] or isEST:
                tnow = datetime.datetime.utcnow() - datetime.timedelta(hours=5)
            tnow = tnow.date()
            delta = (tnow - pubtime).days
        if self.oldest_article > 0 and delta > self.oldest_article:
            continue
        if url.startswith(r'/'):
            url = 'http://www.reuters.com' + url
        # self.log.info('\tFound article:%s' % title)
        urls.append(('Reuters North Korea', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
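# string_of_tag() is used throughout these recipes to flatten a BeautifulSoup
# tag into plain text, but it is not defined in this section. Below is a
# minimal sketch of what such a helper presumably does; the name is taken from
# the calls above, while the exact behaviour is an assumption, not the
# definitive implementation used by the recipe library.
def string_of_tag(tag):
    if tag is None:
        return u''
    # join every NavigableString found under the tag
    return u''.join(tag.strings)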
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    timeout = self.timeout
    opener = URLOpener(self.host, timeout=timeout)
    urladded = set()
    for sec, url in self.feeds:
        result = opener.open(url)
        if result.status_code == 200:
            page = result.content.decode('utf-8')
            soup = BeautifulSoup(page)
            tbnews = soup.find(name='div', attrs={'class': ['box2']})
            if tbnews:
                for news in tbnews.find_all('a'):
                    if not news.string or news.string == u'繼續閱讀':
                        continue
                    urlnews = news['href']
                    if not urlnews.startswith('http'):
                        urlnews = urlparse.urljoin(url, urlnews)
                    if urlnews not in urladded:
                        urls.append((sec, news.string, urlnews, None))
                        urladded.add(urlnews)
            soup = None
        else:
            self.log.warn('fetch url failed:%s' % url)
    return urls
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'http://www.thepaper.cn/list_masonry.jsp?nodeid=26878'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    for article in soup.find_all('div', class_='news_li', limit=6):
        inter = article.find('div', class_='pdtt_trbs')
        timestamp = inter.find('span')
        timestamp = string_of_tag(timestamp).strip()
        if u'天' in timestamp or u'-' in timestamp:
            continue
        h2 = article.find('h2')
        a = h2.find('a', href=True)
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        if url.startswith(r'news'):
            url = 'http://www.thepaper.cn/' + url
        urls.append((u'上海书评', title, url, None))
    if len(urls) == 0:
        self.log.warn('No article found for Shanghai Book Review.')
    return urls
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    urls = []
    timeout = self.timeout
    opener = URLOpener(self.host, timeout=timeout)
    urladded = set()
    for sec, url in self.feeds:
        result = opener.open(url)
        if result.status_code == 200:
            page = result.content.decode('utf-8')
            soup = BeautifulSoup(page)
            tbnews = soup.find(name='div', attrs={'class': ['box2']})
            if tbnews:
                for news in tbnews.find_all('a'):
                    if not news.string or news.string == u'繼續閱讀':
                        continue
                    urlnews = news['href']
                    if not urlnews.startswith('http'):
                        urlnews = urlparse.urljoin(url, urlnews)
                    if urlnews not in urladded:
                        urls.append((sec, news.string, urlnews, None))
                        urladded.add(urlnews)
            soup = None
        else:
            self.log.warn('fetch url failed:%s' % url)
    return urls
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'http://mp.sohu.com/profile?xpt=bWhtaW5nMUBzb2h1LmNvbQ==&_f=index_pagemp_1'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    for article in soup.find_all('div', class_='content_wrap', limit=6):
        timestamp = article.find('div', class_='wrap_mark')
        span = timestamp.find('span')
        timestamp = string_of_tag(span).strip()
        if u'今天' not in timestamp and u'昨天' not in timestamp:
            continue
        div = article.find('div', class_='wrap_title')
        a = div.find('a', href=True)  # the link lives in the title div, not the timestamp span
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        if url.startswith('/'):
            url = 'http:' + url
        urls.append((u'古代小说网sohu', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    if self.feed_encoding:
        try:
            content = result.content.decode(self.feed_encoding)
        except UnicodeDecodeError:
            content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    else:
        content = AutoDecoder(False).decode(result.content, opener.realurl, result.headers)
    soup = BeautifulSoup(content, "lxml")
    for section in soup.find_all('dl'):
        dt = section.find('dt')
        span = dt.find('span')
        if span:
            sectitle = string_of_tag(span).strip()
            for dd in section.find_all('dd'):
                a = dd.find('a', href=True)
                title = string_of_tag(a).strip()
                url = a['href']
                if url.startswith('Article'):
                    url = 'http://bbstsg.vip.qikan.com/text/' + url
                urls.append((sectitle, title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'http://news.joins.com/Issue/10061'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    for article in soup.find_all('strong', class_='headline mg', limit=4):
        # keep only the four most recent articles (roughly the past month)
        a = article.find('a', href=True)
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        if url.startswith('/'):
            url = 'http://news.joins.com' + url
        urls.append((u'사설 속으로', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'http://www.jintiankansha.me/column/nwClF5ZmDJ'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    section = soup.find('div', class_='entries')
    for article in section.find_all('div', class_='cell item', limit=10):
        timestamp = article.find('span', class_='small fade')
        timestamp = string_of_tag(timestamp).strip()
        # if u'小时' not in timestamp and u'昨天' not in timestamp:
        if u'小时' not in timestamp:
            continue
        span = article.find('span', class_='item_title')
        a = span.find('a', href=True)
        title = string_of_tag(a).strip()
        if not title:
            self.log.warn('This title not found.')
            continue
        url = a['href']
        urls.append((u'聊聊架构', title, url, None))
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'https://www.nknews.org/'
    urls = []
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    # start parsing
    def is_cls_wanted(css_class):
        listwanted = ['col-md-7', 'post-prinicap-row', 'col-md-12',
                      'col-md-6 smallboxclass']
        return css_class in listwanted

    # def not_has_class(tag):
    #     return not tag.has_attr('class')
    for section in soup.find_all(class_=is_cls_wanted, limit=8):
        article = section.find('a', string=True)
        title = string_of_tag(article).strip()
        url = article['href']
        if '/pro/' in url:
            continue
        span = article.find('span')
        strong = span.find('strong')
        if not strong:
            timestamp = span
        else:
            timestamp = strong
        timestamp = string_of_tag(timestamp).strip()
        m = re.search(r'\d{4}$', timestamp)
        if m:
            pubtime = datetime.datetime.strptime(timestamp, '%d %B %Y').date()
        else:
            m2 = re.search(r'^\d', timestamp)
            if m2:
                pubtime = datetime.datetime.strptime(timestamp, '%d %B').date()
            else:
                pubtime = datetime.datetime.strptime(timestamp, '%B %d').date()
            # strptime defaults to the year 1900 when no year is given; assume
            # the current year so the age check below works
            pubtime = pubtime.replace(year=datetime.datetime.utcnow().year)
        tnow = datetime.datetime.utcnow()
        tnow = tnow.date()
        delta = (tnow - pubtime).days
        if self.oldest_article > 0 and delta > self.oldest_article:
            continue
        # self.log.info('\tFound article:%s' % title)
        urls.append(('NK News', title, url, None))
    if len(urls) == 0:
        self.log.warn('NK News has no article.')
    return urls
def page_to_soup(self, indexurl):
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(indexurl)
    if result.status_code != 200:
        self.log.warn('fetch mainnews failed:%s' % indexurl)
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    return soup
def ParseFeedUrls(self):
    # return list like [(section,title,url,desc),..]
    main = 'https://www.economist.com/printedition'
    # Did you block me?
    main = self.url4forwarder(main)
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    for section in soup.find_all('li', attrs={'class': 'list__item'}):
        div = section.find('div')
        if div is None:
            self.log.warn('This part skipped.')
            continue
        sectitle = string_of_tag(div).strip()
        if not sectitle:
            self.log.warn('No section title')
            continue
        if sectitle == 'Economic and financial indicators':
            continue
        # self.log.info('Found section: %s' % section_title)
        articles = []
        for node in section.find_all('a', href=True):
            spans = node.findAll('span')
            if len(spans) == 2:
                fly = node.find('span', attrs={'class': 'print-edition__link-flytitle'})
                pre = string_of_tag(fly).strip()
                ti = node.find('span', attrs={'class': 'print-edition__link-title'})
                post = string_of_tag(ti).strip()
                title = pre + ': ' + post
            else:
                title = string_of_tag(node).strip()
            url = node['href']
            if url.startswith(r'/'):
                url = 'http://www.economist.com' + url
            # Did you block me?
            url = self.url4forwarder(url)
            # self.log.info('\tFound article:%s' % title)
            if url not in urladded:
                urls.append((sectitle, title, url, None))
                urladded.add(url)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
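# url4forwarder() is not shown in this section. Judging from the
# SHARE_FUCK_GFW_SRV pattern used in fetch_cover above, it presumably just
# routes the target URL through the forwarding template. A minimal sketch
# under that assumption (not the definitive implementation):
def url4forwarder(self, url):
    # wrap the request in the forwarding service to work around blocking
    return SHARE_FUCK_GFW_SRV % urllib.quote(url)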
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    mainhead = 'https://www.yna.co.kr/international/china/'
    num = 1
    urls = []
    callitaday = False
    koreanow = datetime.datetime.utcnow() + datetime.timedelta(hours=9)
    # koreadate = koreanow.date()
    year = koreanow.year
    mydelta = datetime.timedelta(hours=24, minutes=10)
    while not callitaday:
        main = mainhead + str(num)
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch mainnews failed:%s' % main)
        content = result.content.decode(self.page_encoding)
        soup = BeautifulSoup(content, "lxml")
        # start parsing
        section = soup.find('div', class_='list-type038')
        for article in section.find_all('div', class_='item-box01'):
            if article is None:
                self.log.warn('This article not found')
                continue
            ptime = article.find('span', class_='txt-time')
            if ptime:
                ptime = string_of_tag(ptime).strip()
                # pdate = ptime[0:5]  # keep only dates like 07-30
                ptime = str(year) + '-' + ptime  # prepend the year, otherwise strptime defaults to 1900
                ptime = datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M')
                delta = koreanow - ptime
                # if self.oldest_article > 0 and delta >= self.oldest_article:
                if delta > mydelta:
                    callitaday = True
                    break  # entries are in chronological order
            newscon = article.find('div', class_='news-con')
            a = newscon.find('a', href=True)
            atitle = string_of_tag(a).strip()
            atitle = atitle + ' ' + str(ptime)[5:-3]
            url = a['href']
            if url.startswith('/'):
                url = 'https:' + url
            urls.append((u'중국 뉴스', atitle, url, None))
        num = num + 1
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    mainurl = 'http://www.economist.com/printedition'
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=30)
    result = opener.open(mainurl)
    if result.status_code != 200:
        self.log.warn('fetch rss failed:%s' % mainurl)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # GAE receives the mobile version of the page, which differs from the desktop version
    for section in soup.find_all('section', attrs={'id': lambda x: x and 'section' in x}):
        h4 = section.find('h4')
        if h4 is None:
            self.log.warn('h4 is empty')
            continue
        sectitle = string_of_tag(h4).strip()
        if not sectitle:
            self.log.warn('h4 string is empty')
            continue
        # self.log.info('Found section: %s' % section_title)
        articles = []
        subsection = ''
        for node in section.find_all('article'):
            subsec = node.find('h5')
            if subsec is not None:
                subsection = string_of_tag(subsec)
            prefix = (subsection + ': ') if subsection else ''
            a = node.find('a', attrs={"href": True}, recursive=False)
            if a is not None:
                url = a['href']
                if url.startswith(r'/'):
                    url = 'http://www.economist.com' + url
                url += '/print'
                title = string_of_tag(a)
                if title:
                    title = prefix + title
                    # self.log.info('\tFound article:%s' % title)
                    if url not in urladded:
                        urls.append((sectitle, title, url, None))
                        urladded.add(url)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    mainurl = 'http://www.economist.com/printedition'
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=30)
    result = opener.open(mainurl)
    if result.status_code != 200:
        self.log.warn('fetch rss failed:%s' % mainurl)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # GAE receives the mobile version of the page, which differs from the desktop version
    for section in soup.find_all(
            'section', attrs={'id': lambda x: x and 'section' in x}):
        h4 = section.find('h4')
        if h4 is None:
            self.log.warn('h4 is empty')
            continue
        sectitle = string_of_tag(h4).strip()
        if not sectitle:
            self.log.warn('h4 string is empty')
            continue
        # self.log.info('Found section: %s' % section_title)
        articles = []
        subsection = ''
        for node in section.find_all('article'):
            subsec = node.find('h5')
            if subsec is not None:
                subsection = string_of_tag(subsec)
            prefix = (subsection + ': ') if subsection else ''
            a = node.find('a', attrs={"href": True}, recursive=False)
            if a is not None:
                url = a['href']
                if url.startswith(r'/'):
                    url = 'http://www.economist.com' + url
                url += '/print'
                title = string_of_tag(a)
                if title:
                    title = prefix + title
                    # self.log.info('\tFound article:%s' % title)
                    if url not in urladded:
                        urls.append((sectitle, title, url, None))
                        urladded.add(url)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def fetch_cover(self):
    mainurl = 'http://www.economist.com/printedition'
    opener = URLOpener(None, timeout=90)
    # opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('utf-8')
    # content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    div = soup.find('div', attrs={'class': 'print-edition__cover-widget'})
    img = div.find('img', src=True)
    cover = img.get('src')
    if cover.startswith('/'):
        cover = 'http://www.economist.com' + cover
    data = urllib.urlopen(cover).read()
    return data
def ParseFeedUrls(self):
    """ return list like [(section,title,url,desc),..] """
    mainurl = "http://www.economist.com/printedition"
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=30)
    result = opener.open(mainurl)
    if result.status_code != 200:
        self.log.warn("fetch rss failed:%s" % mainurl)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # GAE receives the mobile version of the page, which differs from the desktop version
    for section in soup.find_all("section", attrs={"id": lambda x: x and "section" in x}):
        h4 = section.find("h4")
        if h4 is None:
            self.log.warn("h4 is empty")
            continue
        sectitle = string_of_tag(h4).strip()
        if not sectitle:
            self.log.warn("h4 string is empty")
            continue
        # self.log.info('Found section: %s' % section_title)
        articles = []
        subsection = ""
        for node in section.find_all("article"):
            subsec = node.find("h5")
            if subsec is not None:
                subsection = string_of_tag(subsec)
            prefix = (subsection + ": ") if subsection else ""
            a = node.find("a", attrs={"href": True}, recursive=False)
            if a is not None:
                url = a["href"]
                if url.startswith(r"/"):
                    url = "http://www.economist.com" + url
                url += "/print"
                title = string_of_tag(a)
                if title:
                    title = prefix + title
                    # self.log.info('\tFound article:%s' % title)
                    if url not in urladded:
                        urls.append((sectitle, title, url, None))
                        urladded.add(url)
    if len(urls) == 0:
        self.log.warn("len of urls is zero.")
    return urls
def ParseFeedUrls(self):
    mainurl = "http://www.21ccom.net/articles/china/"
    urls = []
    opener = URLOpener(self.host, timeout=60)
    result = opener.open(mainurl)
    if result.status_code != 200:
        self.log.warn('fetch rss failed:%s' % mainurl)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # Get the 2nd block
    ul = soup.find_all('ul', attrs={'class': ['m-list', 'list-tweet']})[1]
    for li in ul.find_all('li'):
        urls.append((u'共识网一周排行', li.a.text, li.a['href'],
                     self.FetchDesc(li.a['href'])))
    return urls
def FindHo(self):  # takes self so the standalone helper can use self.host and self.log
    hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(hopage)
    content = result.content.decode('euc-kr')
    if result.status_code != 200:
        self.log.warn('fetching hopage failed:%s' % hopage)
    soup = BeautifulSoup(content, "lxml")
    location = soup.find('div', id='Location')
    edition = location.find('div', class_='edition')
    ho = string_of_tag(edition).strip()
    if ho.startswith('['):
        ho = ho[1:5]
    else:
        self.log.warn('Fetching ho failed.')
    return ho
def fetch_cover(self):
    mainurl = 'http://weekly.chosun.com'
    opener = URLOpener(None, timeout=180)
    # opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('euc-kr')
    soup = BeautifulSoup(content, "lxml")
    div = soup.find('div', class_='box_cover_new')
    img = div.find('img', src=True)
    cover = img.get('src')
    if cover.startswith('/'):
        cover = mainurl + cover
    else:
        cover = 'http://weekly.chosun.com/' + cover
    data = urllib.urlopen(cover).read()
    return data
def fetch_cover(self):
    mainurl = 'https://www.economist.com/weeklyedition'
    opener = URLOpener(None, timeout=180)
    # opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('utf-8')
    # content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # wrapper = soup.find('div', attrs={'class':'print-edition__cover-wrapper'})
    header = soup.find('div', class_='weekly-edition-header__image')
    # div = wrapper.find('div', class_='component-image print-edition__cover-widget__image')
    img = header.find('img', src=True)
    cover = img.get('src')
    # if cover.startswith('/'):
    #     cover = 'http://www.economist.com' + cover
    data = urllib.urlopen(cover).read()
    return data
def fetcharticle(self, url, decoder):
    """ Aisixiang articles are paginated: download every page here and merge
    them into a single HTML document before returning it. """
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    # inner function used to handle pagination info
    def not_is_thispage(tag):
        return not tag.has_attr('class')

    if self.page_encoding:
        try:
            firstpart = content.decode(self.page_encoding)
        except UnicodeDecodeError:
            firstpart = decoder.decode(content, url)
    else:
        firstpart = decoder.decode(content, url)
    otherparts = []
    soup = BeautifulSoup(firstpart, "lxml")
    listpage = soup.find('div', attrs={'class': 'list_page'})
    if listpage:  # the article is paginated
        for page in listpage.find_all('li'):
            parturl = page.find(not_is_thispage)
            if parturl:
                parturl = self.urljoin(url, parturl['href'])
                result = opener.open(parturl)
                status_code, content = result.status_code, result.content
                if status_code != 200 or not content:
                    self.log.warn('fetch article failed(%d):%s.' % (status_code, parturl))
                else:
                    if self.page_encoding:
                        try:
                            thispart = content.decode(self.page_encoding)
                        except UnicodeDecodeError:
                            thispart = decoder.decode(content, parturl)
                    else:
                        thispart = decoder.decode(content, parturl)
                    otherparts.append(thispart)
        # the pagination marker is no longer needed after merging
        listpage.decompose()
    # process each page and merge them into a single file
    article1 = soup.find('div', attrs={'id': 'content'})
    if not article1:
        return None
    for foot in article1.contents[-2:]:
        if isinstance(foot, NavigableString):
            if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                foot.extract()  # NavigableString has no decompose(), so extract it
        else:
            for s in foot.strings:
                if u'本文责编:' in s or u'进入专题:' in s:
                    foot.decompose()
                    break
    # append the other pages' content after the first page's content
    for page in otherparts[::-1]:
        souppage = BeautifulSoup(page, "lxml")
        article = souppage.find('div', attrs={'id': 'content'})
        if not article:
            continue
        for foot in article.contents[-2:]:
            if isinstance(foot, NavigableString):
                if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                    foot.extract()  # NavigableString has no decompose(), so extract it
            else:
                for s in foot.strings:
                    if u'本文责编:' in s or u'进入专题:' in s:
                        foot.decompose()
                        break
        article1.insert_after(article)
    for a in soup.find_all('a', attrs={'href': True}):
        if a.string == u'点击此处阅读下一页':
            a.decompose()
    return unicode(soup)
def ParseFeedUrls(self):
    # return lists like [(section,title,url,desc),..]
    main = 'https://www.yna.co.kr/nk/index'
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch mainnews failed:%s' % main)
    content = result.content.decode(self.page_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    section = soup.find('section', attrs={'class': 'column-type01 column-newslist'})
    for article in section.find_all('article'):
        if article is None:
            self.log.warn('This article not found')
            continue
        h2 = article.find('h2')
        a = h2.find('a', href=True)
        atitle = string_of_tag(a).strip()
        url = a['href']
        if url.startswith('/'):
            url = 'https:' + url
        elif url.startswith('HTTP'):
            url = url.replace('HTTP', 'http')
        if url not in urladded:
            urls.append((u'韩联社朝鲜要闻', atitle, url, None))
            urladded.add(url)
        related = article.find('div', attrs={'class': 'v-related'})
        if related:
            span = related.find('span')
            if span:
                relateda = span.find('a', href=True)
                rtitle = string_of_tag(relateda).strip()
                rtitle = 'Related: ' + rtitle  # mark related-article titles
                rurl = relateda['href']
                if rurl.startswith('/'):
                    rurl = 'https:' + rurl
                elif rurl.startswith('HTTP'):
                    rurl = rurl.replace('HTTP', 'http')
                if rurl not in urladded:
                    urls.append((u'韩联社朝鲜要闻', rtitle, rurl, None))
                    urladded.add(rurl)
    part2 = 'https://www.yna.co.kr/nk/news/all'
    opener2 = URLOpener(self.host, timeout=90)
    result2 = opener2.open(part2)
    if result2.status_code != 200:
        self.log.warn('fetch latest news failed:%s' % part2)
    content2 = result2.content.decode(self.page_encoding)
    soup2 = BeautifulSoup(content2, "lxml")
    sect = soup2.find('ul', attrs={'class': 'list-type01 yna-more'})
    for arti in sect.find_all('article'):
        h = arti.find('h2')
        a2 = h.find('a', href=True)
        title = string_of_tag(a2).strip()
        if u'[북한날씨]' in title:
            continue
        aurl = a2['href']
        if aurl.startswith('/'):
            aurl = 'https:' + aurl
        elif aurl.startswith('HTTP'):
            aurl = aurl.replace('HTTP', 'http')
        if aurl not in urladded:
            urls.append((u'朝鲜最新消息', title, aurl, None))
            urladded.add(aurl)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls
def fetcharticle(self, url, decoder):
    """ Aisixiang articles are paginated: download every page here and merge
    them into a single HTML document before returning it. """
    opener = URLOpener(self.host, timeout=self.timeout)
    result = opener.open(url)
    status_code, content = result.status_code, result.content
    if status_code != 200 or not content:
        self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
        return None

    # inner function used to handle pagination info
    def not_is_thispage(tag):
        return not tag.has_attr('class')

    if self.page_encoding:
        try:
            firstpart = content.decode(self.page_encoding)
        except UnicodeDecodeError:
            firstpart = decoder.decode(content, opener.realurl)
    else:
        firstpart = decoder.decode(content, opener.realurl)
    otherparts = []
    soup = BeautifulSoup(firstpart, "lxml")
    listpage = soup.find('div', attrs={'class': 'list_page'})
    if listpage:  # the article is paginated
        for page in listpage.find_all('li'):
            parturl = page.find(not_is_thispage)
            if parturl:
                parturl = self.urljoin(url, parturl['href'])
                result = opener.open(parturl)
                status_code, content = result.status_code, result.content
                if status_code != 200 or not content:
                    self.log.warn('fetch article failed(%d):%s.' % (status_code, parturl))
                else:
                    if self.page_encoding:
                        try:
                            thispart = content.decode(self.page_encoding)
                        except UnicodeDecodeError:
                            thispart = decoder.decode(content, parturl)
                    else:
                        thispart = decoder.decode(content, parturl)
                    otherparts.append(thispart)
        # the pagination marker is no longer needed after merging
        listpage.decompose()
    # process each page and merge them into a single file
    article1 = soup.find('div', attrs={'id': 'content'})
    if not article1:
        return None
    for foot in article1.contents[-2:]:
        if isinstance(foot, NavigableString):
            if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                foot.extract()  # NavigableString has no decompose(), so extract it
        else:
            for s in foot.strings:
                if u'本文责编:' in s or u'进入专题:' in s:
                    foot.decompose()
                    break
    # append the other pages' content after the first page's content
    for page in otherparts[::-1]:
        souppage = BeautifulSoup(page, "lxml")
        article = souppage.find('div', attrs={'id': 'content'})
        if not article:
            continue
        for foot in article.contents[-2:]:
            if isinstance(foot, NavigableString):
                if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                    foot.extract()  # NavigableString has no decompose(), so extract it
            else:
                for s in foot.strings:
                    if u'本文责编:' in s or u'进入专题:' in s:
                        foot.decompose()
                        break
        article1.insert_after(article)
    for a in soup.find_all('a', attrs={'href': True}):
        if a.string == u'点击此处阅读下一页':
            a.decompose()
    return unicode(soup)
def ParseFeedUrls(self):
    # return list like [(section,title,url,desc),..]
    def FindHo():
        hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(hopage)
        content = result.content.decode('euc-kr')
        if result.status_code != 200:
            self.log.warn('fetching hopage failed:%s' % hopage)
        soup = BeautifulSoup(content, "lxml")
        location = soup.find('div', id='Location')
        edition = location.find('div', class_='edition')
        ho = string_of_tag(edition).strip()
        if ho.startswith('['):
            ho = ho[1:5]
        else:
            self.log.warn('Fetching ho failed.')
        return ho

    mainhead = 'http://weekly.chosun.com/client/news/alllst.asp?nHo='
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    ho = FindHo()
    main = mainhead + ho
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('Fetching TOC failed:%s' % main)
        return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")

    # start parsing
    def tr_has_a_tag(tag):
        return tag.name == 'tr' and tag.find('a')

    listarea = soup.find('div', class_='List_area')
    for section in listarea.find_all('table'):
        h4 = section.find_previous_sibling('h4')
        sectitle = string_of_tag(h4).strip()
        if not sectitle:
            self.log.warn('No section title')
            continue
        # if sectitle == 'Economic and financial indicators':
        #     continue
        # self.log.info('Found section: %s' % section_title)
        articles = []
        for tr in section.find_all(tr_has_a_tag):
            article = tr.find('a', href=True)
            title = string_of_tag(article).strip()
            url = article['href']
            if url.startswith('viw'):
                url = 'http://weekly.chosun.com/client/news/' + url
                url = url.replace('viw', 'print', 1)
            # self.log.info('\tFound article:%s' % title)
            if url not in urladded:
                urls.append((sectitle, title, url, None))
                urladded.add(url)
    if len(urls) == 0:
        self.log.warn('No articles found for WeeklyChosun.')
    return urls
def ParseFeedUrls(self):
    # return list like [(section,title,url,desc),..]
    login_url = 'https://my.economist.com/'
    main = 'https://www.economist.com/weeklyedition'
    # login_form = {"css-1gytnsx":self.account, "password":self.password}
    # login_response = opener.open(login_url, data=login_form)
    # main = 'https://www.economist.com/'
    urls = []
    urladded = set()
    opener = URLOpener(self.host, timeout=90)
    result = opener.open(main)
    if result.status_code != 200:
        self.log.warn('fetch webpage failed:%s' % main)
        return []
    # content = result.content.decode(self.feed_encoding)
    # soup = BeautifulSoup(content, "lxml")
    # a = soup.find('a', attrs={'class':'latest-printed__cta'})
    # current = a['href']
    # if current.startswith(r'/'):
    #     current = 'https://www.economist.com' + url
    # opener = URLOpener(self.host, timeout=90)
    # result = opener.open(current)
    # if result.status_code != 200:
    #     self.log.warn('fetch latest edition failed:%s' % main)
    #     return []
    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    # start parsing
    # for section in soup.find_all('li', attrs={'class':'list__item'}):
    #     div = section.find('div')
    #     if div is None:
    #         self.log.warn('This part skipped.')
    #         continue
    thisweek = soup.find('div', class_='layout-weekly-edition-wtw')
    if thisweek:
        h2 = thisweek.find('h2')
        sectitle = string_of_tag(h2).strip()
        if not sectitle:
            self.log.warn('No section title for the world this week')
        for week in thisweek.find_all('a', href=True):
            title = string_of_tag(week).strip()
            url = week['href']
            if url.startswith(r'/'):
                url = 'https://www.economist.com' + url
            urls.append((sectitle, title, url, None))
    else:
        self.log.warn('The world this week not found.')
    for section in soup.find_all(
            class_=lambda value: value and value.startswith('layout-weekly-edition-section')):
        h2 = section.find('h2')
        sectitle = string_of_tag(h2).strip()
        if not sectitle:
            self.log.warn('No section title')
            continue
        if 'financial indicators' in sectitle:
            continue
        # self.log.info('Found section: %s' % section_title)
        # articles = []
        for node in section.find_all(
                'a', href=True,
                class_=lambda value: value and value.startswith('headline-link')):
            spans = node.find_all('span')
            if len(spans) == 2:
                title = u'{}: {}'.format(*map(string_of_tag, spans))
            # for node in section.find_all('a', href=True):
            #     spans = node.findAll('span')
            #     if len(spans) == 2:
            #         fly = node.find('span', attrs={'class':'print-edition__link-flytitle'})
            #         pre = string_of_tag(fly).strip()
            #         ti = node.find('span', attrs={'class':'print-edition__link-title'})
            #         post = string_of_tag(ti).strip()
            #         title = pre + ': ' + post
            else:
                title = string_of_tag(node).strip()
            url = node['href']
            if url.startswith(r'/'):
                url = 'https://www.economist.com' + url
            # self.log.info('\tFound article:%s' % title)
            if url not in urladded:
                urls.append((sectitle, title, url, None))
                urladded.add(url)
    if len(urls) == 0:
        self.log.warn('len of urls is zero.')
    return urls