Example #1
 def FetchDesc(self, url):
     opener = URLOpener(self.host, timeout=60)
     result = opener.open(url)
     if result.status_code != 200:
         self.log.warn('fetch article failed(%d):%s.' % (result.status_code, url))
         return None
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, 'lxml')
     abstract = unicode(soup.find('div', attrs={'class': 'zhaiyao'}))
     article = unicode(soup.find(id='contents'))
     pagelist = soup.find('ul', attrs={'class': 'pagelist'})
     if pagelist and pagelist.find('li'):
         page_count_context = pagelist.a.text
         page_count = int(
             page_count_context[1:page_count_context.index(u'页')])
         for i in range(2, page_count + 1):
             page_url = url[:-5] + "_%d.html" % i
             result = opener.open(page_url)
             if result.status_code != 200:
                 self.log.warn(
                     'fetch page failed(%d):%s.' % (result.status_code, page_url))
                 return None
             content = result.content.decode(self.feed_encoding)
             pagesoup = BeautifulSoup(content, 'lxml')
             article += unicode(pagesoup.find(id='contents'))
     return abstract + article
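
Example #1 derives the follow-up page URLs by stripping the ".html" suffix and appending "_2.html", "_3.html", and so on, with the page count scraped from the pager text. A minimal, self-contained sketch of that URL scheme (the URL below is hypothetical):

    def page_urls(first_url, page_count):
        # 'http://example.com/article/123.html' -> '.../123_2.html', '.../123_3.html', ...
        urls = [first_url]
        for i in range(2, page_count + 1):
            urls.append(first_url[:-5] + '_%d.html' % i)
        return urls

    print(page_urls('http://example.com/article/123.html', 3))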
Example #2
def fetch_cover(self):
    #    mainurl = 'https://www.economist.com'
    mainurl = SHARE_FUCK_GFW_SRV % urllib.quote('https://www.economist.com')
    opener = URLOpener(None, timeout=180)
    #    opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('utf-8')
    #    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    #    wrapper = soup.find('div', attrs={'class':'print-edition__cover-wrapper'})
    div = soup.find('div', class_='current-edition__cover')
    if div is not None:
        self.log.warn('Div found.')
        img = div.find('img', src=True)
        cover = img.get('src')
        if cover:
            self.log.warn('Cover: ' + cover)
        else:
            self.log.warn('No cover.')
        cover = SHARE_FUCK_GFW_SRV % urllib.quote(cover)
        opener = URLOpener()
        result = opener.open(cover)
        if result.status_code == 200 and result.content:
            return result.content
        else:
            raise Exception('Failed to fetch cover for TE.')
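
SHARE_FUCK_GFW_SRV is used as a format string that wraps the percent-encoded target URL into a forwarding service. A minimal sketch of that pattern, with a hypothetical forwarder endpoint standing in for the real constant:

    import urllib

    # Hypothetical forwarder endpoint; the real SHARE_FUCK_GFW_SRV is defined in the recipe.
    SHARE_FUCK_GFW_SRV = 'https://my-forwarder.example.com/?url=%s'

    def forward(url):
        # Percent-encode the target URL and substitute it into the forwarder template.
        return SHARE_FUCK_GFW_SRV % urllib.quote(url)

    print(forward('https://www.economist.com'))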
Example #3
 def ParseFeedUrls(self):
     login_url = "http://passport.infzm.com/passport/login"
     content_url = "http://www.infzm.com/enews/infzm"
     urls = []
     opener = URLOpener(self.host, timeout=60)
     login_form = {"loginname":self.account, "password":self.password}
     login_response = opener.open(login_url, data=login_form)
     opener.SaveCookies(login_response.header_msg.getheaders('Set-Cookie'))
     result = opener.open(content_url)
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     sec_titles = []
     for sec_name in soup.find_all('h2'):
         sec_titles.append(sec_name.get_text())
     for top_news in soup.find_all('dl', {'class': 'topnews'}):
         url = top_news.a['href']
         feed_content = opener.open(url).content.decode(self.feed_encoding)
         feed_soup = BeautifulSoup(feed_content, "lxml")
         urls.append(
             (sec_titles[0], top_news.a['title'], url, feed_soup.find(id="articleContent")))
     sec_count = 0
     for sec_content in soup.find_all('ul', {'class': 'relnews'}):
         for a in sec_content.find_all('a'):
             url = a['href']
             feed_content = opener.open(
                 url).content.decode(self.feed_encoding)
             feed_soup = BeautifulSoup(feed_content, "lxml")
             urls.append(
                 (sec_titles[sec_count], a['title'], url, feed_soup.find(id="articleContent")))
         sec_count += 1
     return urls
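
Example #3 logs in with a form POST, stores the session cookies, and reuses them for the content page. A standard-library sketch of the same login-then-fetch pattern; the form field names mirror the example, while URLOpener and SaveCookies belong to the KindleEar framework and are replaced here by urllib2 plus a cookie jar:

    import urllib, urllib2, cookielib

    def login_and_fetch(login_url, content_url, account, password):
        # The cookie jar plays the role of SaveCookies: the session cookie set
        # by the login response is sent automatically with the second request.
        jar = cookielib.CookieJar()
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
        form = urllib.urlencode({'loginname': account, 'password': password})
        opener.open(login_url, data=form)
        return opener.open(content_url).read()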
Example #4
 def ParseFeedUrls(self):
     #return lists like [(section,title,url,desc),..]
     main = 'http://www.reuters.com/places/north-korea'
     urls = []
     isEST = False #distinguish EST from EDT
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
         
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     
     #start parsing
     section=soup.find('div', attrs={'class':'topStory'})
     toparticle = section.find('a', href=True)
     if toparticle is None:
         self.log.warn('Top news not found')
     else:
         toptitle = string_of_tag(toparticle).strip()
         if not toptitle:
             self.log.warn('No top story title')
         url = toparticle['href']
         if url.startswith(r'/'):
             url = 'http://www.reuters.com' + url
         urls.append(('Reuters North Korea',toptitle,url,None))
         
     sect=soup.find('div', id='moreSectionNews')
     for feature in sect.find_all('div', attrs={'class':'feature'}):
         article = feature.find('a', href=True)
         title = string_of_tag(article).strip()
         url = article['href']
         timestamp = feature.find('span', attrs={'class':'timestamp'})
         if not timestamp:
             continue
         timestamp = string_of_tag(timestamp).strip()
         #today's articles
         if 'EDT' in timestamp or 'EST' in timestamp:
             delta=0
             if 'EST' in timestamp:
                 isEST=True
         else:
             pubtime = datetime.datetime.strptime(timestamp, '%b %d %Y').date()
             #default to EDT
             tnow = datetime.datetime.utcnow()-datetime.timedelta(hours=4)
             currentmonth= tnow.month
             if currentmonth in [1, 2, 12] or isEST:
                 tnow = datetime.datetime.utcnow()-datetime.timedelta(hours=5)
             tnow = tnow.date()
             delta=(tnow-pubtime).days
         if self.oldest_article > 0 and delta > self.oldest_article:
             continue
         if url.startswith(r'/'):
             url = 'http://www.reuters.com' + url
             #self.log.info('\tFound article:%s' % title)
         urls.append(('Reuters North Korea',title,url,None))
                             
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
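
The timestamp handling in Example #4 converts UTC to US Eastern time with a fixed offset: UTC-4 for EDT, UTC-5 for EST, treating December through February (or an explicit EST marker) as EST. The same offset logic, restated as a small helper for clarity:

    import datetime

    def eastern_now(is_est=False):
        # Rough conversion used in Example #4: EDT is UTC-4, EST is UTC-5;
        # December-February (or an explicit EST flag) is treated as EST.
        utcnow = datetime.datetime.utcnow()
        offset = 5 if (is_est or utcnow.month in (1, 2, 12)) else 4
        return utcnow - datetime.timedelta(hours=offset)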
Example #5
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        urls = []
        timeout = self.timeout
        opener = URLOpener(self.host, timeout=timeout)

        urladded = set()
        for sec, url in self.feeds:
            result = opener.open(url)
            if result.status_code == 200:
                page = result.content.decode('utf-8')
                soup = BeautifulSoup(page)
                tbnews = soup.find(name='div', attrs={'class': ['box2']})
                if tbnews:
                    for news in tbnews.find_all('a'):
                        if not news.string or news.string == u'繼續閱讀':
                            continue
                        urlnews = news['href']
                        if not urlnews.startswith('http'):
                            urlnews = urlparse.urljoin(url, urlnews)
                        if urlnews not in urladded:
                            urls.append((sec, news.string, urlnews, None))
                            urladded.add(urlnews)
                soup = None
            else:
                self.log.warn('fetch url failed:%s' % url)

        return urls
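
Relative links found on the feed page are resolved against the feed URL with urlparse.urljoin, as in the loop above. A one-line illustration (hypothetical URLs):

    import urlparse

    print(urlparse.urljoin('http://example.com/news/index.html', 'item/123.html'))
    # -> http://example.com/news/item/123.html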
Example #6
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://www.thepaper.cn/list_masonry.jsp?nodeid=26878'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        for article in soup.find_all('div', class_='news_li', limit=6):
            inter = article.find('div', class_='pdtt_trbs')
            timestamp = inter.find('span')
            timestamp = string_of_tag(timestamp).strip()
            if u'天' in timestamp or u'-' in timestamp:
                continue
            h2 = article.find('h2')
            a = h2.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('This title not found.')
                continue
            url = a['href']
            if url.startswith(r'news'):
                url = 'http://www.thepaper.cn/' + url
            urls.append((u'上海书评', title, url, None))
        if len(urls) == 0:
            self.log.warn('No article found for Shanghai Book Review.')
        return urls
Example #7
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     urls = []
     timeout = self.timeout
     opener = URLOpener(self.host, timeout=timeout)
     
     urladded = set()
     for sec,url in self.feeds:
         result = opener.open(url)
         if result.status_code == 200:
             page = result.content.decode('utf-8')
             soup = BeautifulSoup(page)
             tbnews = soup.find(name='div',attrs={'class':['box2']})
             if tbnews:
                 for news in tbnews.find_all('a'):
                     if not news.string or news.string == u'繼續閱讀':
                         continue
                     urlnews = news['href']
                     if not urlnews.startswith('http'):
                         urlnews = urlparse.urljoin(url, urlnews)
                     if urlnews not in urladded:
                         urls.append((sec,news.string,urlnews,None))
                         urladded.add(urlnews)
             soup = None
         else:
             self.log.warn('fetch url failed:%s'%url)
         
     return urls
Example #8
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://mp.sohu.com/profile?xpt=bWhtaW5nMUBzb2h1LmNvbQ==&_f=index_pagemp_1'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        for article in soup.find_all('div', class_='content_wrap', limit=6):
            timestamp = article.find('div', class_='wrap_mark')
            span = timestamp.find('span')
            timestamp = string_of_tag(span).strip()
            if u'今天' not in timestamp and u'昨天' not in timestamp:
                continue
            div = article.find('div', class_='wrap_title')
            a = div.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('This title not found.')
                continue
            url = a['href']
            if url.startswith('/'):
                url = 'http:' + url
            urls.append((u'古代小说网sohu', title, url, None))
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #9
 def ParseFeedUrls(self):
     main = 'http://bbstsg.vip.qikan.com/text/Mag.aspx?issn=ACB37AEA-8FB7-4855-B7CA-D228E972162F'
     urls = []
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
     if self.feed_encoding:
         try:
             content = result.content.decode(self.feed_encoding)
         except UnicodeDecodeError:
             content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     else:
         content = AutoDecoder(False).decode(result.content,opener.realurl,result.headers)
     soup = BeautifulSoup(content, "lxml")
     for section in soup.find_all('dl'):
         dt=section.find('dt')
         span=dt.find('span')
         if span:
             sectitle = string_of_tag(span).strip()
         for dd in section.find_all('dd'):
             a=dd.find('a', href=True)
             title = string_of_tag(a).strip()
             url = a['href']
             if url.startswith('Article'):
                 url = 'http://bbstsg.vip.qikan.com/text/'+url
             urls.append((sectitle,title,url,None))
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
Example #10
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://news.joins.com/Issue/10061'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        for article in soup.find_all('strong', class_='headline mg',
                                     limit=4):  #keep only the four articles from the most recent month
            a = article.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('This title not found.')
                continue
            url = a['href']
            if url.startswith('/'):
                url = 'http://news.joins.com' + url
            urls.append((u'사설 속으로', title, url, None))
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #11
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'http://www.jintiankansha.me/column/nwClF5ZmDJ'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        section = soup.find('div', class_='entries')
        for article in section.find_all('div', class_='cell item', limit=10):
            timestamp = article.find('span', class_='small fade')
            timestamp = string_of_tag(timestamp).strip()
            #if u'小时' not in timestamp and u'昨天' not in timestamp:
            if u'小时' not in timestamp:
                continue
            span = article.find('span', class_='item_title')
            a = span.find('a', href=True)
            title = string_of_tag(a).strip()
            if not title:
                self.log.warn('This title not found.')
                continue
            url = a['href']
            urls.append((u'聊聊架构', title, url, None))
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #12
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'https://www.nknews.org/'
        urls = []
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        def is_cls_wanted(css_class):
            listwanted = [
                'col-md-7', 'post-prinicap-row', 'col-md-12',
                'col-md-6 smallboxclass'
            ]
            return css_class in listwanted
#        def not_has_class(tag):
#            return not tag.has_attr('class')

        for section in soup.find_all(class_=is_cls_wanted, limit=8):
            article = section.find('a', string=True)
            title = string_of_tag(article).strip()
            url = article['href']
            if '/pro/' in url:
                continue
            span = article.find('span')
            strong = span.find('strong')
            if not strong:
                timestamp = span
            else:
                timestamp = strong
            timestamp = string_of_tag(timestamp).strip()
            m = re.search(r'\d{4}$', timestamp)
            if m:
                pubtime = datetime.datetime.strptime(timestamp,
                                                     '%d %B %Y').date()
            else:
                m2 = re.search(r'^\d', timestamp)
                if m2:
                    pubtime = datetime.datetime.strptime(timestamp,
                                                         '%d %B').date()
                else:
                    pubtime = datetime.datetime.strptime(timestamp,
                                                         '%B %d').date()
                #strptime defaults to the year 1900 when no year is given
                pubtime = pubtime.replace(year=datetime.datetime.utcnow().year)

            tnow = datetime.datetime.utcnow()
            tnow = tnow.date()
            delta = (tnow - pubtime).days
            if self.oldest_article > 0 and delta > self.oldest_article:
                continue
                #self.log.info('\tFound article:%s' % title)
            urls.append(('NK News', title, url, None))
        if len(urls) == 0:
            self.log.warn('NK News has no article.')
        return urls
Example #13
    def page_to_soup(self, indexurl):
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(indexurl)
        if result.status_code != 200:
            self.log.warn('fetch mainnews failed:%s' % indexurl)

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")
        return soup
Example #14
 def ParseFeedUrls(self):
     #return list like [(section,title,url,desc),..]
     main = 'https://www.economist.com/printedition'
     # Did you block me?
     main = self.url4forwarder(main)
     urls = []
     urladded = set()
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(main)
     if result.status_code != 200:
         self.log.warn('fetch webpage failed:%s'%main)
         return []
         
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     
     #start parsing
     for section in soup.find_all('li', attrs={'class':'list__item'}):
         div = section.find('div')
         if div is None:
             self.log.warn('This part skipped.')
             continue
         sectitle = string_of_tag(div).strip()
         if not sectitle:
             self.log.warn('No section title')
             continue
         if sectitle == 'Economic and financial indicators':
             continue
         #self.log.info('Found section: %s' % section_title)
         articles = []
         for node in section.find_all('a', href=True):
             spans = node.findAll('span')
             if len(spans) == 2:
                 fly= node.find('span', attrs={'class':'print-edition__link-flytitle'})
                 pre= string_of_tag(fly).strip()
                 ti= node.find('span', attrs={'class':'print-edition__link-title'})
                 post= string_of_tag(ti).strip()
                 title = pre +': '+ post
             else:
                 title = string_of_tag(node).strip()
             url = node['href']
             if url.startswith(r'/'):
                 url = 'http://www.economist.com' + url
                 # Did you block me?
                 url = self.url4forwarder(url)
                 #self.log.info('\tFound article:%s' % title)
                 if url not in urladded:
                     urls.append((sectitle,title,url,None))
                     urladded.add(url)
                             
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
Example #15
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        mainhead = 'https://www.yna.co.kr/international/china/'
        num = 1
        urls = []
        callitaday = False
        koreanow = datetime.datetime.utcnow() + datetime.timedelta(hours=9)
        #        koreadate = koreanow.date()
        year = koreanow.year
        mydelta = datetime.timedelta(hours=24, minutes=10)

        while not callitaday:
            main = mainhead + str(num)
            opener = URLOpener(self.host, timeout=90)
            result = opener.open(main)
            if result.status_code != 200:
                self.log.warn('fetch mainnews failed:%s' % main)

            content = result.content.decode(self.page_encoding)
            soup = BeautifulSoup(content, "lxml")
            #start parsing

            section = soup.find('div', class_='list-type038')
            for article in section.find_all('div', class_='item-box01'):
                if article is None:
                    self.log.warn('This article not found')
                    continue
                ptime = article.find('span', class_='txt-time')
                if ptime:
                    ptime = string_of_tag(ptime).strip()
                    #                    pdate=ptime[0:5] #keep only the date part like 07-30
                    ptime = str(year) + '-' + ptime  #prepend the year; strptime defaults to 1900 otherwise
                    ptime = datetime.datetime.strptime(ptime, '%Y-%m-%d %H:%M')
                    delta = koreanow - ptime
                    #                    if self.oldest_article > 0 and delta >= self.oldest_article:
                    if delta > mydelta:
                        callitaday = True
                        break  #articles are listed in chronological order
                newscon = article.find('div', class_='news-con')
                a = newscon.find('a', href=True)
                atitle = string_of_tag(a).strip()
                atitle = atitle + ' ' + str(ptime)[5:-3]
                url = a['href']
                if url.startswith('/'):
                    url = 'https:' + url
                urls.append((u'중국 뉴스', atitle, url, None))
            num = num + 1
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
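
Example #15 prepends the current year before parsing the timestamp because strptime fills in 1900 when the year is absent, which would make every computed age enormous. A quick demonstration of that default:

    import datetime

    # Without a year in the format, strptime defaults to 1900.
    print(datetime.datetime.strptime('07-30 14:05', '%m-%d %H:%M').year)          # 1900
    print(datetime.datetime.strptime('2024-07-30 14:05', '%Y-%m-%d %H:%M').year)  # 2024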
Example #16
 def ParseFeedUrls(self):
     """ return list like [(section,title,url,desc),..] """
     mainurl = 'http://www.economist.com/printedition'
     urls = []
     urladded = set()
     opener = URLOpener(self.host, timeout=30)
     result = opener.open(mainurl)
     if result.status_code != 200:
         self.log.warn('fetch rss failed:%s'%mainurl)
         return []
         
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     
     #GAE gets the mobile version of the page, which differs a bit from the desktop version
     for section in soup.find_all('section', attrs={'id':lambda x: x and 'section' in x}):
         h4 = section.find('h4')
         if h4 is None:
             self.log.warn('h4 is empty')
             continue
         sectitle = string_of_tag(h4).strip()
         if not sectitle:
             self.log.warn('h4 string is empty')
             continue
         #self.log.info('Found section: %s' % section_title)
         articles = []
         subsection = ''
         for node in section.find_all('article'):
             subsec = node.find('h5')
             if subsec is not None:
                 subsection = string_of_tag(subsec)
             prefix = (subsection + ': ') if subsection else ''
             a = node.find('a', attrs={"href":True}, recursive=False)
             if a is not None:
                 url = a['href']
                 if url.startswith(r'/'):
                     url = 'http://www.economist.com' + url
                 url += '/print'
                 title = string_of_tag(a)
                 if title:
                     title = prefix + title
                     #self.log.info('\tFound article:%s' % title)
                     if url not in urladded:
                         urls.append((sectitle,title,url,None))
                         urladded.add(url)
     if len(urls) == 0:
         self.log.warn('len of urls is zero.')
     return urls
     
Example #17
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        mainurl = 'http://www.economist.com/printedition'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.status_code != 200:
            self.log.warn('fetch rss failed:%s' % mainurl)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #GAE gets the mobile version of the page, which differs a bit from the desktop version
        for section in soup.find_all(
                'section', attrs={'id': lambda x: x and 'section' in x}):
            h4 = section.find('h4')
            if h4 is None:
                self.log.warn('h4 is empty')
                continue
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn('h4 string is empty')
                continue
            #self.log.info('Found section: %s' % section_title)
            articles = []
            subsection = ''
            for node in section.find_all('article'):
                subsec = node.find('h5')
                if subsec is not None:
                    subsection = string_of_tag(subsec)
                prefix = (subsection + ': ') if subsection else ''
                a = node.find('a', attrs={"href": True}, recursive=False)
                if a is not None:
                    url = a['href']
                    if url.startswith(r'/'):
                        url = 'http://www.economist.com' + url
                    url += '/print'
                    title = string_of_tag(a)
                    if title:
                        title = prefix + title
                        #self.log.info('\tFound article:%s' % title)
                        if url not in urladded:
                            urls.append((sectitle, title, url, None))
                            urladded.add(url)
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
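
Examples #16, #17 and #19 select sections with attrs={'id': lambda x: x and 'section' in x}. BeautifulSoup accepts a callable as an attribute filter: it receives the attribute value (or None when the attribute is missing) and the tag matches when the callable returns a true value. A self-contained illustration:

    from bs4 import BeautifulSoup

    html = '<section id="section-1"></section><section id="nav"></section><section></section>'
    soup = BeautifulSoup(html, 'lxml')
    # The lambda is called with 'section-1', 'nav' and None; only the first matches.
    hits = soup.find_all('section', attrs={'id': lambda x: x and 'section' in x})
    print([tag['id'] for tag in hits])   # ['section-1']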
Example #18
def fetch_cover(self):
    mainurl = 'http://www.economist.com/printedition'
    opener = URLOpener(None, timeout=90)
    #    opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('utf-8')
    #    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    div = soup.find('div', attrs={'class': 'print-edition__cover-widget'})
    img = div.find('img', src=True)
    cover = img.get('src')
    if cover.startswith('/'):
        cover = 'http://www.economist.com' + cover
    data = urllib.urlopen(cover).read()
    return data
Example #19
    def ParseFeedUrls(self):
        """ return list like [(section,title,url,desc),..] """
        mainurl = "http://www.economist.com/printedition"
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=30)
        result = opener.open(mainurl)
        if result.status_code != 200:
            self.log.warn("fetch rss failed:%s" % mainurl)
            return []

        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        # GAE gets the mobile version of the page, which differs a bit from the desktop version
        for section in soup.find_all("section", attrs={"id": lambda x: x and "section" in x}):
            h4 = section.find("h4")
            if h4 is None:
                self.log.warn("h4 is empty")
                continue
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn("h4 string is empty")
                continue
            # self.log.info('Found section: %s' % section_title)
            articles = []
            subsection = ""
            for node in section.find_all("article"):
                subsec = node.find("h5")
                if subsec is not None:
                    subsection = string_of_tag(subsec)
                prefix = (subsection + ": ") if subsection else ""
                a = node.find("a", attrs={"href": True}, recursive=False)
                if a is not None:
                    url = a["href"]
                    if url.startswith(r"/"):
                        url = "http://www.economist.com" + url
                    url += "/print"
                    title = string_of_tag(a)
                    if title:
                        title = prefix + title
                        # self.log.info('\tFound article:%s' % title)
                        if url not in urladded:
                            urls.append((sectitle, title, url, None))
                            urladded.add(url)
        if len(urls) == 0:
            self.log.warn("len of urls is zero.")
        return urls
Example #20
 def ParseFeedUrls(self):
     mainurl = "http://www.21ccom.net/articles/china/"
     urls = []
     opener = URLOpener(self.host, timeout=60)
     result = opener.open(mainurl)
     if result.status_code != 200:
         self.log.warn('fetch rss failed:%s' % mainurl)
         return []
     content = result.content.decode(self.feed_encoding)
     soup = BeautifulSoup(content, "lxml")
     # Get the 2nd block
     ul = soup.find_all('ul', attrs={'class': ['m-list', 'list-tweet']})[1]
     for li in ul.find_all('li'):
         urls.append(
             (u'共识网一周排行', li.a.text, li.a['href'], self.FetchDesc(li.a['href'])))
     return urls
Example #21
 def FindHo(self):
     hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
     opener = URLOpener(self.host, timeout=90)
     result = opener.open(hopage)
     content = result.content.decode('euc-kr')
     if result.status_code != 200:
         self.log.warn('fetching hopage failed:%s' % hopage)
     soup = BeautifulSoup(content, "lxml")
     location = soup.find('div', id='Location')
     edition = location.find('div', class_='edition')
     ho = string_of_tag(edition).strip()
     if ho.startswith('['):
         ho = ho[1:5]
     else:
         self.log.warn('Fetching ho failed.')
     return ho
Example #22
def fetch_cover(self):
    mainurl = 'http://weekly.chosun.com'
    opener = URLOpener(None, timeout=180)
    #   opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('euc-kr')
    soup = BeautifulSoup(content, "lxml")
    div = soup.find('div', class_='box_cover_new')
    img = div.find('img', src=True)
    cover = img.get('src')
    if cover.startswith('/'):
        cover = mainurl + cover
    else:
        cover = 'http://weekly.chosun.com/' + cover
    data = urllib.urlopen(cover).read()
    return data
Example #23
def fetch_cover(self):
    mainurl = 'https://www.economist.com/weeklyedition'
    opener = URLOpener(None, timeout=180)
    #    opener = URLOpener(self.host, timeout=90)
    result = opener.open(mainurl)
    content = result.content.decode('utf-8')
    #    content = result.content.decode(self.feed_encoding)
    soup = BeautifulSoup(content, "lxml")
    #    wrapper = soup.find('div', attrs={'class':'print-edition__cover-wrapper'})
    header = soup.find('div', class_='weekly-edition-header__image')
    #    div=wrapper.find('div', class_='component-image print-edition__cover-widget__image')
    img = header.find('img', src=True)
    cover = img.get('src')
    #    if cover.startswith('/'):
    #        cover = 'http://www.economist.com' + cover
    data = urllib.urlopen(cover).read()
    return data
Example #24
 def fetcharticle(self, url, decoder):
     """ 爱思想的文章有分页,在此函数内下载全部分页,合并成一个单独的HTML返回。"""
     opener = URLOpener(self.host, timeout=self.timeout)
     result = opener.open(url)
     status_code, content = result.status_code, result.content
     if status_code != 200 or not content:
         self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
         return None
     
     #nested helper for handling the pagination links
     def not_is_thispage(tag):
         return not tag.has_attr('class')
     
     if self.page_encoding:
         try:
             firstpart = content.decode(self.page_encoding)
         except UnicodeDecodeError:
             firstpart = decoder.decode(content,url)
     else:
         firstpart = decoder.decode(content,url)
     
     otherparts = []
     soup = BeautifulSoup(firstpart, "lxml")
     listpage = soup.find('div', attrs={'class':'list_page'})
     if listpage: #the article is paginated
         for page in listpage.find_all('li'):
             parturl = page.find(not_is_thispage)
             if parturl:
                 parturl = self.urljoin(url, parturl['href'])
                 result = opener.open(parturl)
                 status_code, content = result.status_code, result.content
                 if status_code != 200 or not content:
                     self.log.warn('fetch article failed(%d):%s.' % (status_code,url))
                 else:
                     if self.page_encoding:
                         try:
                             thispart = content.decode(self.page_encoding)
                         except UnicodeDecodeError:
                             thispart = decoder.decode(content,parturl)
                     else:
                         thispart = decoder.decode(content,parturl)
                     otherparts.append(thispart)
                     
         #the pagination widget is no longer needed after merging
         listpage.decompose()
         
     #process each page and merge them into a single document
     article1 = soup.find('div', attrs={'id':'content'})
     if not article1:
         return None
     
     for foot in article1.contents[-2:]:
         if isinstance(foot, NavigableString):
             if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                 foot.decompose()
         else:
             for s in foot.strings:
                 if u'本文责编:' in s or u'进入专题:' in s:
                     foot.decompose()
                     break
     
     #append the other pages' article content after the first page's content
     for page in otherparts[::-1]:
         souppage = BeautifulSoup(page, "lxml")
         article = souppage.find('div', attrs={'id':'content'})
         if not article:
             continue
         
         for foot in article.contents[-2:]:
             if isinstance(foot, NavigableString):
                 if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                     foot.decompose()
             else:
                 for s in foot.strings:
                     if u'本文责编:' in s or u'进入专题:' in s:
                         foot.decompose()
                         break
                         
         article1.insert_after(article)
     
     for a in soup.find_all('a',attrs={'href':True}):
         if a.string == u'点击此处阅读下一页':
             a.decompose()
     
     return unicode(soup)
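
The core of Examples #24 and #26 is the merge step: each following page's content div is inserted after the first page's content div, iterating the pages in reverse so the final document keeps reading order. A condensed sketch of just that step (content_id and the input HTML strings are placeholders; Python 2, matching the recipes):

    from bs4 import BeautifulSoup

    def merge_pages(first_html, other_htmls, content_id='content'):
        soup = BeautifulSoup(first_html, 'lxml')
        main = soup.find('div', attrs={'id': content_id})
        if main is None:
            return None
        # Insert in reverse order so page 2 ends up immediately after page 1, etc.
        for html in other_htmls[::-1]:
            part = BeautifulSoup(html, 'lxml').find('div', attrs={'id': content_id})
            if part is not None:
                main.insert_after(part)
        return unicode(soup)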
Example #25
    def ParseFeedUrls(self):
        #return lists like [(section,title,url,desc),..]
        main = 'https://www.yna.co.kr/nk/index'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch mainnews failed:%s' % main)

        content = result.content.decode(self.page_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        section = soup.find('section',
                            attrs={'class': 'column-type01 column-newslist'})
        for article in section.find_all('article'):
            if article is None:
                self.log.warn('This article not found')
                continue
            h2 = article.find('h2')
            a = h2.find('a', href=True)
            atitle = string_of_tag(a).strip()
            url = a['href']
            if url.startswith('/'):
                url = 'https:' + url
            elif url.startswith('HTTP'):
                url = url.replace('HTTP', 'http')
            if url not in urladded:
                urls.append((u'韩联社朝鲜要闻', atitle, url, None))
                urladded.add(url)
            related = article.find('div', attrs={'class': 'v-related'})
            if related:
                span = related.find('span')
                if span:
                    relateda = span.find('a', href=True)
                    rtitle = string_of_tag(relateda).strip()
                    rtitle = 'Related: ' + rtitle  #prefix related-article titles with a marker
                    rurl = relateda['href']
                    if rurl.startswith('/'):
                        rurl = 'https:' + rurl
                    elif rurl.startswith('HTTP'):
                        rurl = rurl.replace('HTTP', 'http')
                    if rurl not in urladded:
                        urls.append((u'韩联社朝鲜要闻', rtitle, rurl, None))
                        urladded.add(rurl)

        part2 = 'https://www.yna.co.kr/nk/news/all'
        opener2 = URLOpener(self.host, timeout=90)
        result2 = opener2.open(part2)
        if result2.status_code != 200:
            self.log.warn('fetch latest news failed:%s' % part2)
        content2 = result2.content.decode(self.page_encoding)
        soup2 = BeautifulSoup(content2, "lxml")
        sect = soup2.find('ul', attrs={'class': 'list-type01 yna-more'})
        for arti in sect.find_all('article'):
            h = arti.find('h2')
            a2 = h.find('a', href=True)
            title = string_of_tag(a2).strip()
            if u'[북한날씨]' in title:
                continue
            aurl = a2['href']
            if aurl.startswith('/'):
                aurl = 'https:' + aurl
            elif aurl.startswith('HTTP'):
                aurl = aurl.replace('HTTP', 'http')
            if aurl not in urladded:
                urls.append((u'朝鲜最新消息', title, aurl, None))
                urladded.add(aurl)
        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls
Example #26
    def fetcharticle(self, url, decoder):
        """ 爱思想的文章有分页,在此函数内下载全部分页,合并成一个单独的HTML返回。"""
        opener = URLOpener(self.host, timeout=self.timeout)
        result = opener.open(url)
        status_code, content = result.status_code, result.content
        if status_code != 200 or not content:
            self.log.warn('fetch article failed(%d):%s.' % (status_code, url))
            return None

        #nested helper for handling the pagination links
        def not_is_thispage(tag):
            return not tag.has_attr('class')

        if self.page_encoding:
            try:
                firstpart = content.decode(self.page_encoding)
            except UnicodeDecodeError:
                firstpart = decoder.decode(content, opener.realurl)
        else:
            firstpart = decoder.decode(content, opener.realurl)

        otherparts = []
        soup = BeautifulSoup(firstpart, "lxml")
        listpage = soup.find('div', attrs={'class': 'list_page'})
        if listpage:  #the article is paginated
            for page in listpage.find_all('li'):
                parturl = page.find(not_is_thispage)
                if parturl:
                    parturl = self.urljoin(url, parturl['href'])
                    result = opener.open(parturl)
                    status_code, content = result.status_code, result.content
                    if status_code != 200 or not content:
                        self.log.warn('fetch article failed(%d):%s.' %
                                      (status_code, url))
                    else:
                        if self.page_encoding:
                            try:
                                thispart = content.decode(self.page_encoding)
                            except UnicodeDecodeError:
                                thispart = decoder.decode(content, parturl)
                        else:
                            thispart = decoder.decode(content, parturl)
                        otherparts.append(thispart)

            #the pagination widget is no longer needed after merging
            listpage.decompose()

        #process each page and merge them into a single document
        article1 = soup.find('div', attrs={'id': 'content'})
        if not article1:
            return None

        for foot in article1.contents[-2:]:
            if isinstance(foot, NavigableString):
                if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                    foot.decompose()
            else:
                for s in foot.strings:
                    if u'本文责编:' in s or u'进入专题:' in s:
                        foot.decompose()
                        break

        #append the other pages' article content after the first page's content
        for page in otherparts[::-1]:
            souppage = BeautifulSoup(page, "lxml")
            article = souppage.find('div', attrs={'id': 'content'})
            if not article:
                continue

            for foot in article.contents[-2:]:
                if isinstance(foot, NavigableString):
                    if u'本文责编:' in unicode(foot) or u'进入专题:' in unicode(foot):
                        foot.decompose()
                else:
                    for s in foot.strings:
                        if u'本文责编:' in s or u'进入专题:' in s:
                            foot.decompose()
                            break

            article1.insert_after(article)

        for a in soup.find_all('a', attrs={'href': True}):
            if a.string == u'点击此处阅读下一页':
                a.decompose()

        return unicode(soup)
Example #27
    def ParseFeedUrls(self):
        #return list like [(section,title,url,desc),..]
        def FindHo():
            hopage = 'http://weekly.chosun.com/client/contents/lst.asp'
            opener = URLOpener(self.host, timeout=90)
            result = opener.open(hopage)
            content = result.content.decode('euc-kr')
            if result.status_code != 200:
                self.log.warn('fetching hopage failed:%s' % hopage)
            soup = BeautifulSoup(content, "lxml")
            location = soup.find('div', id='Location')
            edition = location.find('div', class_='edition')
            ho = string_of_tag(edition).strip()
            if ho.startswith('['):
                ho = ho[1:5]
            else:
                self.log.warn('Fetching ho failed.')
            return ho

        mainhead = 'http://weekly.chosun.com/client/news/alllst.asp?nHo='
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=90)
        ho = FindHo()
        main = mainhead + ho
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('Fetching TOC failed:%s' % main)
            return []
        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        def tr_has_a_tag(tag):
            return tag.name == 'tr' and tag.find('a')

        listarea = soup.find('div', class_='List_area')
        for section in listarea.find_all('table'):
            h4 = section.find_previous_sibling('h4')
            sectitle = string_of_tag(h4).strip()
            if not sectitle:
                self.log.warn('No section title')
                continue
#            if sectitle == 'Economic and financial indicators':
#                continue
#self.log.info('Found section: %s' % section_title)
            articles = []
            for tr in section.find_all(tr_has_a_tag):
                article = tr.find('a', href=True)
                title = string_of_tag(article).strip()
                url = article['href']
                if url.startswith('viw'):
                    url = 'http://weekly.chosun.com/client/news/' + url
                    url = url.replace('viw', 'print', 1)
                    #self.log.info('\tFound article:%s' % title)
                    if url not in urladded:
                        urls.append((sectitle, title, url, None))
                        urladded.add(url)

        if len(urls) == 0:
            self.log.warn('No articles found for WeeklyChosun.')
        return urls
Example #28
    def ParseFeedUrls(self):
        #return list like [(section,title,url,desc),..]
        login_url = 'https://my.economist.com/'
        main = 'https://www.economist.com/weeklyedition'
        #        login_form = {"css-1gytnsx":self.account, "password":self.password}
        #        login_response = opener.open(login_url, data=login_form)
        #        main = 'https://www.economist.com/'
        urls = []
        urladded = set()
        opener = URLOpener(self.host, timeout=90)
        result = opener.open(main)
        if result.status_code != 200:
            self.log.warn('fetch webpage failed:%s' % main)
            return []
#        content = result.content.decode(self.feed_encoding)
#        soup = BeautifulSoup(content, "lxml")
#        a = soup.find('a', attrs={'class':'latest-printed__cta'})
#        current = a['href']
#        if current.startswith(r'/'):
#            current = 'https://www.economist.com' + url
#        opener = URLOpener(self.host, timeout=90)
#        result = opener.open(current)
#        if result.status_code != 200:
#            self.log.warn('fetch latest edition failed:%s'%main)
#            return []
        content = result.content.decode(self.feed_encoding)
        soup = BeautifulSoup(content, "lxml")

        #start parsing
        #        for section in soup.find_all('li', attrs={'class':'list__item'}):
        #            div = section.find('div')
        #            if div is None:
        #                self.log.warn('This part skipped.')
        #                continue
        thisweek = soup.find('div', class_='layout-weekly-edition-wtw')
        if thisweek:
            h2 = thisweek.find('h2')
            sectitle = string_of_tag(h2).strip()
            if not sectitle:
                self.log.warn('No section title for the world this week')
            for week in thisweek.find_all('a', href=True):
                title = string_of_tag(week).strip()
                url = week['href']
                if url.startswith(r'/'):
                    url = 'https://www.economist.com' + url
                urls.append((sectitle, title, url, None))
        else:
            self.log.warn('The world this week not found.')

        for section in soup.find_all(
                class_=lambda value: value and value.startswith(
                    'layout-weekly-edition-section')):
            h2 = section.find('h2')
            sectitle = string_of_tag(h2).strip()
            if not sectitle:
                self.log.warn('No section title')
                continue
            if 'financial indicators' in sectitle:
                continue
            #self.log.info('Found section: %s' % section_title)
#            articles = []
            for node in section.find_all('a',
                                         href=True,
                                         class_=lambda value: value and value.
                                         startswith('headline-link')):
                spans = node.find_all('span')
                if len(spans) == 2:
                    title = u'{}: {}'.format(*map(string_of_tag, spans))
#            for node in section.find_all('a', href=True):
#                spans = node.findAll('span')
#                if len(spans) == 2:
#                    fly= node.find('span', attrs={'class':'print-edition__link-flytitle'})
#                    pre= string_of_tag(fly).strip()
#                    ti= node.find('span', attrs={'class':'print-edition__link-title'})
#                    post= string_of_tag(ti).strip()
#                    title = pre +': '+ post
                else:
                    title = string_of_tag(node).strip()
                url = node['href']
                if url.startswith(r'/'):
                    url = 'https://www.economist.com' + url
                    #self.log.info('\tFound article:%s' % title)
                if url not in urladded:
                    urls.append((sectitle, title, url, None))
                    urladded.add(url)

        if len(urls) == 0:
            self.log.warn('len of urls is zero.')
        return urls