Example #1
    def get_articles(self):
        resultList = []
        sections = [('Vancouver', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|metro%20vancouver'),
                    ('Fraser Valley', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|fraser%20valley'),
                    ('B.C.', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|b.c.'),]
        relSections = [('Canada', 'http://www.theprovince.com/7588609.atom'),
                    ('World', 'http://www.theprovince.com/7589147.atom'), ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = etree.fromstring(read_http_page(url))
                for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}):
                    title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text
                    link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href')
                    abstract = entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract')
                    resultList.append(self.create_article(title.strip(), link, abstract))

            for (title, url) in relSections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = etree.fromstring(read_http_page(url))
                for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}):
                    title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text
                    link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href')
                    abstract = entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract')
                    resultList.append(self.create_article(title.strip(), link, abstract))

        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
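
All of the examples in this listing share a few helpers that the snippets themselves never define: read_http_page, create_section, create_article, and a module-level logger. A minimal sketch of what they might look like, assuming urllib3 for fetching and plain dicts for result items; on the scraper classes create_section/create_article are methods, and the real project's signatures and return types may differ:

import logging

import urllib3

logger = logging.getLogger(__name__)
_http = urllib3.PoolManager()

def read_http_page(url, cookies=None, headers=None):
    # Fetch a page and return the raw bytes; callers decode as needed.
    # The dict passed positionally in some examples (e.g. {'edition': 'calgary'})
    # is assumed here to be a cookie dict -- an assumption, not confirmed.
    if cookies:
        headers = dict(headers or {})
        headers['Cookie'] = '; '.join('{}={}'.format(k, v) for k, v in cookies.items())
    return _http.request('GET', url, headers=headers).data

def create_section(title):
    # a section heading entry in the flat result list
    return {'type': 'section', 'title': title}

def create_article(title, url, abstract=None):
    # an article entry; the abstract is optional
    return {'type': 'article', 'title': title, 'url': url, 'abstract': abstract}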
Example #2
    def get_articles(self):
        resultList = []
        sections = [('要聞','https://www.singtao.ca/category/52-%E5%8D%A1%E5%8A%A0%E5%88%A9%E8%A6%81%E8%81%9E/?variant=zh-hk'),
                    ('加國新聞','https://www.singtao.ca/category/54-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%8A%A0%E5%9C%8B/?variant=zh-hk'),
                    ('省市', 'https://www.singtao.ca/category/65-%E5%8D%A1%E5%8A%A0%E5%88%A9%E7%9C%81%E5%B8%82/?variant=zh-hk'),
                    ('港聞','https://www.singtao.ca/category/57-%E5%8D%A1%E5%8A%A0%E5%88%A9%E6%B8%AF%E8%81%9E/?variant=zh-hk'),
                    ('國際','https://www.singtao.ca/category/56-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%9C%8B%E9%9A%9B/?variant=zh-hk'),
                    ('中國','https://www.singtao.ca/category/58-%E5%8D%A1%E5%8A%A0%E5%88%A9%E4%B8%AD%E5%9C%8B/?variant=zh-hk'),
                    ('台灣','https://www.singtao.ca/category/59-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%8F%B0%E7%81%A3/?variant=zh-hk'),
                    ('財經','https://www.singtao.ca/category/61-%E5%8D%A1%E5%8A%A0%E5%88%A9%E8%B2%A1%E7%B6%93/?variant=zh-hk'),
                    ('體育','https://www.singtao.ca/category/60-%E5%8D%A1%E5%8A%A0%E5%88%A9%E9%AB%94%E8%82%B2/?variant=zh-hk'),
                    ('娛樂','https://www.singtao.ca/category/62-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%A8%9B%E6%A8%82/?variant=zh-hk'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url, {'edition': 'calgary'}).decode('utf-8'))

                # top story
                top_story_link = doc.xpath('(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a')
                top_story_text = doc.xpath('(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a/div/h3')
                if top_story_link and top_story_text:
                    resultList.append(self.create_article(top_story_text[0].text.strip(), top_story_link[0].get('href')))

                for topic in doc.xpath('(//div[@class="td-ss-main-content"])[1]/div[contains(@class, "td-animation-stack")]/div[@class="item-details"]/h3/a'):
                    if topic.text and topic.get('href'):
                        resultList.append(self.create_article(topic.text.strip(), topic.get('href')))

        except Exception as e:
            # logger.exception() already records the stack trace, so the extra
            # traceback.format_exception call (whose etype keyword was removed in
            # Python 3.10) is dropped here and in the examples below
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #3
 def get_articles(self):
     resultList = []
     for (name, url) in self.get_rss_links():
         try:
             # for each section, insert a title...
             resultList.append(self.create_section(name))
             # ... then parse the page and extract article links
             doc = etree.fromstring(read_http_page(url),
                                    parser=etree.XMLParser(recover=True))
             if doc is not None:
                 for entry in doc.xpath(
                         '//*[local-name()="RDF"]/*[local-name()="item"]'):
                     titles = entry.xpath('*[local-name()="title"]')
                     links = entry.xpath('*[local-name()="link"]')
                     abstracts = entry.xpath(
                         '*[local-name()="description"]')
                     if titles and links:
                         title = titles[0].text
                         link = links[0].text
                         abstract = abstracts[0].text if abstracts else ""
                         resultList.append(
                             self.create_article(title.strip(), link,
                                                 abstract))
         except Exception as e:
              logger.exception("Problem processing rdf: %s", e)
     return resultList
Example #4
    def get_articles(self):
        # Although the source is in RSS, the daily items are consolidated as CDATA.
        # Parse and break them down instead of using RSSBase
        rss_url = "http://www.daemonology.net/hn-daily/index.rss"
        resultList = []
        try:
            doc = html.document_fromstring(read_http_page(rss_url))
            for item in doc.xpath("//rss/channel/item"):
                title = (item.xpath("title")[0].text if
                         len(item.xpath("title")) > 0 else "Daily Hacker News")
                resultList.append(self.create_section(title))

                description = (item.xpath("description")[0]
                               if len(item.xpath("description")) > 0 else None)
                if description is not None:
                    for article in description.xpath(
                            'ul/li/span[@class="storylink"]/a'):
                        if article.text and article.get("href"):
                            resultList.append(
                                self.create_article(article.text.strip(),
                                                    article.get("href")))

        except Exception as e:
            logger.exception("Problem processing Hacker News: %s", e)

        return resultList
Example #5
    def get_articles(self):
        resultList = []
        sections = [('要聞港聞', '/daily/local', self._base_url + '/daily/local/'),
                    ('兩岸', '/daily/china', self._base_url + '/daily/china/'),
                    ('國際', '/daily/international', self._base_url + '/daily/international/'),
                    ('財經', '/daily/finance', self._base_url + '/daily/finance/'),
                    ('娛樂', '/daily/entertainment', self._base_url + '/daily/entertainment/'),
                    ('體育', '/daily/sports', self._base_url + '/daily/sports/'),
                    ]

        try:
            for (title, section_id, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then retrieve the json content
                raw_page = read_http_page(url)
                date_id, d = self._find_date_id(raw_page)
                if date_id and d:
                    raw_result = self._get_collection(section_id, date_id, d)
                    result = json.loads(raw_result)
                    for article in result['content_elements']:
                        desc = article['headlines']['basic']
                        href = article['website_url']
                        abstract = None
                        if 'content_elements' in article and len(article['content_elements']) > 1 and 'content' in article['content_elements'][0]:
                            abstract = article['content_elements'][0]['content']
                        if desc and href:
                            resultList.append(self.create_article(desc.strip(), self._base_url + href, abstract))

        except Exception as e:
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #6
    def get_articles(self):
        resultList = []
        sections = [('要聞', 'http://toronto.singtao.ca/category/%e8%a6%81%e8%81%9e/?variant=zh-hk'),
                    ('城市', 'http://toronto.singtao.ca/category/%e5%9f%8e%e5%b8%82/?variant=zh-hk'),
                    ('加國', 'http://toronto.singtao.ca/category/%e5%8a%a0%e5%9c%8b/?variant=zh-hk'),
                    ('國際', 'http://toronto.singtao.ca/category/%e5%9c%8b%e9%9a%9b/?variant=zh-hk'),
                    ('港聞', 'http://toronto.singtao.ca/category/%e6%b8%af%e8%81%9e/?variant=zh-hk'),
                    ('中國', 'http://toronto.singtao.ca/category/%e4%b8%ad%e5%9c%8b/?variant=zh-hk'),
                    ('台灣', 'http://toronto.singtao.ca/category/%e5%8f%b0%e7%81%a3/?variant=zh-hk'),
                    ('體育', 'http://toronto.singtao.ca/category/%e9%ab%94%e8%82%b2/?variant=zh-hk'),
                    ('財經', 'http://toronto.singtao.ca/category/%e8%b2%a1%e7%b6%93/?variant=zh-hk'),
                    ('娛樂', 'http://toronto.singtao.ca/category/%e5%a8%9b%e6%a8%82/?variant=zh-hk'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for option in doc.get_element_by_id('news').xpath('option'):
                    if option.text and option.get('value'):
                        resultList.append(self.create_article(option.text.strip(), option.get('value')))


        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
Example #7
    def get_articles(self):
        num_pages = 2
        baseUrl = 'https://news.ltn.com.tw'

        resultList = []
        sections = [('熱門', baseUrl + '/ajax/breakingnews/popular/'),
                    ('政治', baseUrl + '/ajax/breakingnews/politics/'),
                    ('社會', baseUrl + '/ajax/breakingnews/society/'),
                    ('地方', baseUrl + '/ajax/breakingnews/local/'),
                    ('生活', baseUrl + '/ajax/breakingnews/life/'),
                    ('國際', baseUrl + '/ajax/breakingnews/world/'),]

        try:
            for page in range(1, num_pages):
                for (title, url) in sections:
                    # for each section, insert a title...
                    resultList.append(self.create_section(title))
                    # ... then parse the page and extract article links
                    result = json.loads(read_http_page(url + str(page)).decode('UTF-8'))
                    if result.get('code', 0) == 200:
                        data = result.get('data', [])
                        for key in data.keys():
                            title = data[key].get('title', None)
                            url = data[key].get('url', None)
                            abstract = data[key].get('summary', None)
                            if title and url:
                                resultList.append(self.create_article(title, url, abstract))

        except Exception as e:
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #8
    def get_articles(self):
        # get date first
        dateUrl = 'http://www.mingpaocanada.com/Van/'
        theDate = datetime.datetime.today().strftime('%Y%m%d')
        try:
            doc = html.document_fromstring(read_http_page(dateUrl))
            for aLink in doc.get_element_by_id('mp-menu').xpath('//div/ul/li/a'):
                if aLink.text_content() == u'明報首頁':
                    href = aLink.attrib['href']
                    match = re.match(r'htm/News/([0-9]{8})/main_r\.htm', href)
                    if match and match.lastindex == 1:
                        theDate = match.group(1)
                    else:
                        logger.info('no date found. using system date: ' + theDate)
        except Exception as e:
            logger.exception('Problem getting date: %s', e)

        resultList = []
        sections = [('要聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VAindex_r.htm'),
                    ('加國新聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VBindex_r.htm'),
                    ('社區新聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VDindex_r.htm'),
                    ('港聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/HK-VGindex_r.htm'),
                    ('國際','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VTindex_r.htm'),
                    ('中國','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VCindex_r.htm'),
                    ('經濟','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VEindex_r.htm'),
                    ('體育','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VSindex_r.htm'),
                    ('影視','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/HK-MAindex_r.htm'),
                    ('副刊','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/WWindex_r.htm'),]

        baseUrl = 'http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/'
        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url).decode('big5', errors='ignore'))
                for topic in doc.xpath('//h4[contains(@class, "listing-link")]/a'):
                    if topic.text and topic.get('href'):
                        resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href')))

        except Exception as e:
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #9
    def get_articles(self):
        # get date first
        baseUrl = 'http://news.ltn.com.tw'
        theDate = datetime.datetime.today().strftime('%Y%m%d')
        try:
            doc = html.document_fromstring(read_http_page(baseUrl + '/newspaper/'))
            cal = doc.get_element_by_id('box300B')
            theDate = cal.attrib['title']
        except Exception as e:
            logger.exception('Problem getting date')

        resultList = []
        sections = [('焦點', baseUrl + '/newspaper/focus/' + theDate),
                    ('政治', baseUrl + '/newspaper/politics/' + theDate),
                    ('社會', baseUrl + '/newspaper/society/' + theDate),
                    ('地方', baseUrl + '/newspaper/local/' + theDate),
                    ('生活', baseUrl + '/newspaper/life/' + theDate),
                    ('言論', baseUrl + '/newspaper/opinion/' + theDate),
                    ('國際', baseUrl + '/newspaper/world/' + theDate),
                    ('財經', baseUrl + '/newspaper/business/' + theDate),
                    ('體育', baseUrl + '/newspaper/sports/' + theDate),
                    ('娛樂', baseUrl + '/newspaper/entertainment/' + theDate),
                    ('消費', baseUrl + '/newspaper/consumer/' + theDate),
                    ('副刊', baseUrl + '/newspaper/supplement/' + theDate),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                curPage = 1
                maxPage = 1
                while curPage <= maxPage:
                    # ... then parse the page and extract article links
                    doc = html.document_fromstring(read_http_page(url + '?page=' + str(curPage)))
                    for link in doc.get_element_by_id('newslistul').xpath('//a[contains(@class, "picword")]'):
                        if link.text and link.get('href'):
                            resultList.append(self.create_article(link.text.strip(), baseUrl + link.get('href')))
                    curPage += 1
                    for pageNum in doc.get_element_by_id('page').xpath('//*[contains(@class, "p_num")]'):
                        maxPage = int(pageNum.text.strip())


        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
Example #10
    def get_articles(self):
        # get date first
        dateUrl = 'http://www.mingpaocanada.com/TOR/'
        theDate = datetime.datetime.today().strftime('%Y%m%d')
        try:
            doc = html.document_fromstring(read_http_page(dateUrl))
            for aLink in doc.get_element_by_id('mp-menu').xpath('//div/ul/li/a'):
                # compare text to text; the original bytes comparison
                # (.encode('utf-8') == str) never matched on Python 3
                if aLink.text_content() == u'明報首頁':
                    href = aLink.attrib['href']
                    match = re.match(r'htm/News/([0-9]{8})/main_r\.htm', href)
                    if match and match.lastindex == 1:
                        theDate = match.group(1)
                    else:
                        logger.info('no date found. using system date: ' + theDate)
        except Exception as e:
            logger.exception('Problem getting date')

        resultList = []
        sections = [('要聞','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TAindex_r.htm'),
                    ('加國新聞','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TDindex_r.htm'),
                    ('地產','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TRindex_r.htm'),
                    ('中國','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TCAindex_r.htm'),
                    ('國際','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TTAindex_r.htm'),
                    ('港聞','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/HK-GAindex_r.htm'),
                    ('經濟','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/THindex_r.htm'),
                    ('體育','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TSindex_r.htm'),
                    ('影視','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/HK-MAindex_r.htm'),
                    ('副刊','http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/WWindex_r.htm'),]


        baseUrl = 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/'
        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                # Python 3: decode explicitly (the Python 2 unicode() built-in is gone)
                doc = html.document_fromstring(read_http_page(url).decode('big5', errors='ignore'))
                for topic in doc.xpath('//h4[contains(@class, "listing-link")]/a'):
                    if topic.text and topic.get('href'):
                        resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href')))

        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
Example #11
    def get_articles(self):
        resultList = []
        sections = [
            ("眾聞", "https://www.hkcnews.com",
             "https://www.hkcnews.com/data/newsposts", 3),
        ]

        try:
            for (title, base_url, data_url, pages) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then get page and parse
                for page in range(1, pages + 1):
                    raw_result = read_http_page(data_url +
                                                "?page={}".format(page))
                    result = json.loads(raw_result)
                    for item in result['items']:
                        doc = html.document_fromstring(item)
                        for article in doc.xpath(
                                '//div[contains(@class, "article-block")]'):
                            article_link = article.xpath(
                                'div[contains(@class, "article-block-body")]/a'
                            )
                            article_text = article.xpath(
                                'div[contains(@class, "article-block-body")]/a/p'
                            )
                            if article_link and article_text:
                                url = base_url + article_link[0].get("href")
                                text = article_text[0].text
                                if url and text:
                                    footer = article.xpath(
                                        'a[contains(@class, "article-block-footer")]'
                                    )
                                    date_str = ''
                                    if footer:
                                        divs = footer[0].xpath(
                                            'div/div[contains(@class, "text-box")]/div'
                                        )
                                        for div in divs:
                                            if div.text and re.match(
                                                    r"[0-9]{2}\.[0-9]{2}\.[0-9]{2}",
                                                    div.text.strip()):
                                                date_str = div.text.strip()
                                    resultList.append(
                                        self.create_article(
                                            text.strip() +
                                            ' - {}'.format(date_str), url))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #12
    def get_articles(self):
        # get date first
        dateUrl = 'http://orientaldaily.on.cc/'
        theDate = datetime.datetime.today().strftime('%Y%m%d')
        try:
            doc = html.document_fromstring(read_http_page(dateUrl))
            for aLink in doc.get_element_by_id('topMenu').xpath('ul[contains(@class, "menuList clear")]/li/a[contains(@class, "news")]'):
                href = aLink.attrib['href']
                match = re.match(r'/cnt/news/([0-9]{8})/index\.html', href)
                if match and match.lastindex == 1:
                    theDate = match.group(1)
                else:
                    logger.info('no date found. using system date: ' + theDate)
        except Exception as e:
            logger.exception('Problem getting date: %s', e)

        resultList = []
        baseUrl = dateUrl

        sections = [('要聞港聞','http://orientaldaily.on.cc/cnt/news/' + theDate + '/index.html'),
                    ('兩岸國際','http://orientaldaily.on.cc/cnt/china_world/' + theDate + '/index.html'),
                    ('財經','http://orientaldaily.on.cc/cnt/finance/' + theDate + '/index.html'),
                    ('娛樂','http://orientaldaily.on.cc/cnt/entertainment/' + theDate + '/index.html'),
                    ('副刊','http://orientaldaily.on.cc/cnt/lifestyle/' + theDate + '/index.html'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for topic in doc.get_element_by_id('articleList').xpath('ul[contains(@class, "commonBigList")]/li/a'):
                    if topic.text and topic.get('href'):
                        resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href')))


        except Exception as e:
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #13
    def _get_collection(self, section_id, date_id, d):
        payload_query = {
            "feedOffset": 0,
            "feedQuery": 'taxonomy.primary_section._id:"{}" AND type:story AND editor_note:"{}"'.format(section_id, date_id),
            "feedSize": 100,
            "sort": "location:asc",
        }
        payload_query = urllib.parse.quote(json.dumps(payload_query))

        query_url = self._base_url + \
            '/pf/api/v3/content/fetch/query-feed?query={}&d={}&_website=hk-appledaily'.format(payload_query, d)
        return read_http_page(query_url)
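
Examples #5 and #13 also call self._find_date_id, which is not shown in this listing. A speculative method sketch, assuming the Arc/Fusion-powered page embeds an eight-digit editor-note date and a numeric Fusion.deployment value in its inline scripts; both markers are assumptions that would need checking against the live page:

import re

def _find_date_id(self, raw_page):
    # hypothetical: extract the editor_note date id (e.g. '20210501') and the
    # Fusion deployment number d from the raw page source
    page = raw_page.decode('utf-8') if isinstance(raw_page, bytes) else raw_page
    date_match = re.search(r'"editor_note"\s*:\s*"(\d{8})"', page)
    d_match = re.search(r'Fusion\.deployment\s*=\s*"?(\d+)"?', page)
    if date_match and d_match:
        return date_match.group(1), d_match.group(1)
    return None, None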
Example #14
    def get_articles(self):
        # get date first
        dateUrl = 'http://orientaldaily.on.cc/'
        theDate = datetime.datetime.today().strftime('%Y%m%d')
        try:
            doc = html.document_fromstring(read_http_page(dateUrl))
            for aLink in doc.get_element_by_id('topMenu').xpath('ul[contains(@class, "menuList clear")]/li/a[contains(@class, "news")]'):
                href = aLink.attrib['href']
                match = re.match(r'/cnt/news/([0-9]{8})/index\.html', href)
                if match and match.lastindex == 1:
                    theDate = match.group(1)
                else:
                    logger.info('no date found. using system date: ' + theDate)
        except Exception as e:
            logger.exception('Problem getting date')

        resultList = []
        baseUrl = dateUrl

        sections = [('要聞港聞','http://orientaldaily.on.cc/cnt/news/' + theDate + '/index.html'),
                    ('兩岸國際','http://orientaldaily.on.cc/cnt/china_world/' + theDate + '/index.html'),
                    ('財經','http://orientaldaily.on.cc/cnt/finance/' + theDate + '/index.html'),
                    ('娛樂','http://orientaldaily.on.cc/cnt/entertainment/' + theDate + '/index.html'),
                    ('副刊','http://orientaldaily.on.cc/cnt/lifestyle/' + theDate + '/index.html'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for topic in doc.get_element_by_id('articleList').xpath('ul[contains(@class, "commonBigList")]/li/a'):
                    if topic.text and topic.get('href'):
                        resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href')))


        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
Example #15
    def get_rss_links(self):
        resultList = []
        try:
            rss_list_url = 'https://money.udn.com/rssfeed/lists/1001'
            doc = html.document_fromstring(read_http_page(rss_list_url))
            for aLink in doc.get_element_by_id("rss_list").xpath('div/div/dl/dt/a'):
                if aLink.xpath('text()') and MoneyUnitedDailyNewsRSS.is_url(aLink.get('href')):
                    # xpath('text()') returns a list; take the first text node
                    resultList.append((aLink.xpath('text()')[0], aLink.get('href')))
        except Exception as e:
            logger.exception('Problem fetching rss links: %s', e)

        return resultList
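
MoneyUnitedDailyNewsRSS.is_url is referenced above but not shown. A minimal sketch, assuming it only checks that the href parses as an absolute http(s) URL (on the class it would be a @staticmethod):

from urllib.parse import urlparse

def is_url(url):
    # hypothetical: accept only absolute http(s) links
    if not url:
        return False
    parts = urlparse(url)
    return parts.scheme in ('http', 'https') and bool(parts.netloc)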
Example #16
    def get_articles(self):
        # get article lists
        summary_url = 'https://tw.appledaily.com/daily'
        doc = html.document_fromstring(read_http_page(summary_url))

        resultList = []
        # every section uses the same XPath pattern, parameterized by its heading text
        xpath_template = (u'//article[contains(@class, "nclns")]'
                          u'//h2[contains(text(), "{}")]/following-sibling::ul/li/a')
        section_names = [u'頭條', u'要聞', u'政治', u'社會', u'蘋果爆破社', u'蘋論陣線', u'暖流',
                         u'娛樂名人', u'木瓜霞吐槽', u'直擊好萊塢', u'亞洲哈燒星', u'名人時尚',
                         u'國際頭條', u'國際新聞', u'雙語天下', u'體育焦點', u'大運動場', u'籃球瘋',
                         u'投打對決', u'足球新聞', u'運彩分析', u'財經焦點', u'頭家人生', u'投資理財',
                         u'卡該這樣刷', u'地產焦點', u'副刊焦點', u'美食天地', u'車市3C', u'家庭與健康']
        sections = [(name, xpath_template.format(name)) for name in section_names]

        try:
            for (title, path) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                for link in doc.xpath(path):
                    if link.get('title') and link.get('href'):
                        resultList.append(self.create_article(link.get('title').strip(), link.get('href')))

        except Exception as e:
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #17
    def get_articles(self):
        maxPagePerSection = 10
        resultList = []

        sections = [
            ("要聞港聞", "http://www.singpao.com.hk/index.php?fi=news1"),
            ("兩岸國際", "http://www.singpao.com.hk/index.php?fi=news8"),
            ("財經", "http://www.singpao.com.hk/index.php?fi=news3"),
            ("娛樂", "http://www.singpao.com.hk/index.php?fi=news4"),
            ("體育", "http://www.singpao.com.hk/index.php?fi=news5"),
            ("副刊", "http://www.singpao.com.hk/index.php?fi=news7"),
        ]
        baseUrl = "http://www.singpao.com.hk/"

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                page = 1
                maxPage = 1
                while page <= maxPage and page <= maxPagePerSection:
                    doc = html.document_fromstring(
                        read_http_page(url + "&page=" + str(page)))
                    page += 1

                    for topic in doc.xpath(
                            '//td/a[contains(@class, "list_title")]'):
                        if topic.text and topic.get("href"):
                            resultList.append(
                                self.create_article(
                                    topic.text.strip(),
                                    baseUrl + topic.get("href")))

                    for pageIndex in doc.xpath(
                            '//a[contains(@class, "fpagelist_css")]'):
                        if pageIndex.text is not None:
                            match = re.match(r"^([0-9]+)$",
                                             pageIndex.text.strip())
                            if (match and match.lastindex == 1
                                    and int(match.group(1)) > maxPage):
                                maxPage = int(match.group(1))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #18
    def get_articles(self):
        resultList = []

        sections = [
            ("港聞", "http://www.takungpao.com.hk/hongkong/"),
            ("內地", "http://www.takungpao.com.hk/mainland/"),
            ("台灣", "http://www.takungpao.com.hk/taiwan/"),
            ("國際", "http://www.takungpao.com.hk/international/"),
            ("評論", "http://www.takungpao.com.hk/opinion/"),
            ("經濟", "http://www.takungpao.com.hk/finance/"),
            ("文化", "http://www.takungpao.com.hk/culture/"),
            ("體育", "http://www.takungpao.com.hk/sports/"),
            ("娛樂", "http://www.takungpao.com.hk/ent/"),
        ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))

                for topic in doc.xpath(
                        '//div[contains(@class, "list_tuwen")]/div[contains(@class, "content")]'
                ):
                    title = topic.xpath(
                        'ul[contains(@class, "txt")]/li[contains(@class, "title")]/a'
                    )
                    intro = topic.xpath(
                        'ul[contains(@class, "txt")]/li[contains(@class, "intro")]/a'
                    )

                    if title and title[0].text and title[0].get("href"):
                        resultList.append(
                            self.create_article(
                                title[0].text.strip(),
                                title[0].get("href"),
                                intro[0].text.strip()
                                if intro and intro[0].text else None,
                            ))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #19
    def get_articles(self):
        resultList = []
        sections = [
            ("國際", "/realtime/international",
             self._base_url + "/realtime/international/"),
            ("娛樂時尚", "/realtime/entertainment",
             self._base_url + "/realtime/entertainment/"),
            ("社會", "/realtime/local", self._base_url + "/realtime/local"),
            ("生活", "/realtime/life", self._base_url + "/realtime/life"),
            ("財經地產", "/realtime/property",
             self._base_url + "/realtime/property/"),
            ("吃喝玩樂", "/realtime/supplement",
             self._base_url + "/realtime/supplement/"),
            ("體育", "/realtime/sports", self._base_url + "/realtime/sports/"),
        ]

        try:
            for (title, section_id, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then retrieve the json content
                raw_page = read_http_page(url)
                d = self._find_date_id(raw_page)
                if d:
                    raw_result = self._get_collection(section_id, d)
                    result = json.loads(raw_result)
                    for article in result["content_elements"]:
                        desc = article["headlines"]["basic"]
                        href = article["website_url"]
                        abstract = None
                        if ("content_elements" in article
                                and len(article["content_elements"]) > 1 and
                                "content" in article["content_elements"][0]):
                            abstract = article["content_elements"][0][
                                "content"]
                        if desc and href:
                            resultList.append(
                                self.create_article(desc.strip(),
                                                    self._base_url + href,
                                                    abstract))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #20
    def get_articles(self):
        resultList = []
        sections = [('Vancouver', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|metro%20vancouver'),
                    ('Fraser Valley', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|fraser%20valley'),
                    ('B.C.', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|b.c.'),]
        relSections = [('Canada', 'http://www.theprovince.com/7588609.atom'),
                    ('World', 'http://www.theprovince.com/7589147.atom'), ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = etree.fromstring(read_http_page(url))
                for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}):
                    title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text
                    link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href')
                    abstract = entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract')
                    resultList.append(self.create_article(title.strip(), link, abstract))

            for (title, url) in relSections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = etree.fromstring(read_http_page(url))
                for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}):
                    title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text
                    link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href')
                    abstract = entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract')
                    resultList.append(self.create_article(title.strip(), link, abstract))

        except Exception as e:
            logger.exception('Problem processing url: %s', e)

        return resultList
Example #21
 def get_articles(self):
     resultList = []
     for (name, url) in self.get_rss_links():
         try:
             # for each section, insert a title...
             resultList.append(self.create_section(name))
             # ... then parse the page and extract article links
             doc = etree.fromstring(read_http_page(url), parser=etree.XMLParser(recover=True))
             for entry in doc.xpath('//*[local-name()="RDF"]/*[local-name()="item"]'):
                 title = entry.xpath('*[local-name()="title"]')[0].text
                 link = entry.xpath('*[local-name()="link"]')[0].text
                 abstract = entry.xpath('*[local-name()="description"]')[0].text
                 resultList.append(self.create_article(title.strip(), link, abstract))
         except Exception as e:
             logger.exception('Problem processing rdf')
     return resultList
Example #22
    def get_articles(self):
        resultList = []
        # build the list
        listUrl = 'http://www.am730.com.hk/home'
        baseUrl = 'http://www.am730.com.hk/'
        try:
            doc = html.document_fromstring(read_http_page(listUrl))
            for optGroup in doc.get_element_by_id('listnews').xpath('optgroup'):
                if optGroup.get('label'):
                    resultList.append(self.create_section(optGroup.get('label')))
                for opt in optGroup.xpath('option'):
                    if opt.get('value') and opt.text:
                        resultList.append(self.create_article(opt.text.strip(), baseUrl+opt.get('value')))
        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
Example #23
    def get_articles(self):
        resultList = []
        baseUrl = "https://www.chinatimes.com"

        sections = [
            ("政治", baseUrl + "/politic/?chdtv"),
            ("言論", baseUrl + "/opinion/?chdtv"),
            ("生活", baseUrl + "/life/?chdtv"),
            ("娛樂", baseUrl + "/star/?chdtv"),
            ("財經", baseUrl + "/money/?chdtv"),
            ("社會", baseUrl + "/society/?chdtv"),
            ("話題", baseUrl + "/hottopic/?chdtv"),
            ("國際", baseUrl + "/world/?chdtv"),
            ("軍事", baseUrl + "/armament/?chdtv"),
            ("兩岸", baseUrl + "/chinese/?chdtv"),
            ("時尚", baseUrl + "/fashion/?chdtv"),
            ("體育", baseUrl + "/sports/?chdtv"),
            ("科技", baseUrl + "/technologynews/?chdtv"),
            ("玩食", baseUrl + "/travel/?chdtv"),
            ("新聞專輯", baseUrl + "/album/?chdtv"),
        ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for topic in doc.xpath(
                        '//section[contains(@class, "article-list")]/ul//li//h3[contains(@class, "title")]//a'
                ):
                    if topic.text and topic.get("href"):
                        resultList.append(
                            self.create_article(topic.text.strip(),
                                                topic.get("href")))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #24
 def get_articles(self):
     resultList = []
     for (name, url) in self.get_rss_links():
         try:
             # for each section, insert a title...
             resultList.append(self.create_section(name))
             # ... then parse the page and extract article links
             data = read_http_page(url)
             if data:
                 doc = etree.fromstring(data, parser=etree.XMLParser(recover=True))
                 for entry in doc.xpath('//rss/channel/item'):
                     title = entry.xpath('title')[0].text
                     link = entry.xpath('link')[0].text
                     abstract = entry.xpath('description')[0].text
                     resultList.append(self.create_article(title.strip(), link, abstract))
         except Exception as e:
              logger.exception('Problem processing rss: %s', e)
     return resultList
Example #25
    def _get_collection(self, section_id, d):
        payload_query = {
            "feedOffset": 0,
            "feedQuery": 'taxonomy.primary_section._id:"{}" AND type:story AND display_date:[now-24h/h TO now] AND NOT taxonomy.tags.text.raw:_no_show_for_web AND NOT taxonomy.tags.text.raw:_nohkad'.format(section_id),
            "feedSize": 100,
            "sort": "display_date:desc",
        }
        payload_query = urllib.parse.quote(json.dumps(payload_query))

        query_url = (
            self._base_url +
            "/pf/api/v3/content/fetch/query-feed?query={}&d={}&_website=tw-appledaily"
            .format(payload_query, d))
        return read_http_page(query_url)
Example #26
    def get_articles(self):
        resultList = []
        baseUrl = "https://api.theinitium.com"
        apiUrl = "https://api.theinitium.com/api/v2/channel/articles"

        sections = [
            ("最新", apiUrl + "/?language=zh-hant&slug=latest"),
            ("香港", apiUrl + "/?language=zh-hant&slug=hongkong"),
            ("國際", apiUrl + "/?language=zh-hant&slug=international"),
            ("大陸", apiUrl + "/?language=zh-hant&slug=mainland"),
            ("台灣", apiUrl + "/?language=zh-hant&slug=taiwan"),
            ("評論", apiUrl + "/?language=zh-hant&slug=opinion"),
            ("科技", apiUrl + "/?language=zh-hant&slug=technology"),
            ("風物", apiUrl + "/?language=zh-hant&slug=culture"),
            ("廣場", apiUrl + "/?language=zh-hant&slug=notes-and-letters"),
        ]

        headers = urllib3.make_headers(
            basic_auth="anonymous:GiCeLEjxnqBcVpnp6cLsUvJievvRQcAXLv")
        headers['Accept'] = "application/json"

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                contents = json.loads(read_http_page(url, headers=headers))
                for digest in contents["digests"]:
                    article = digest["article"]
                    if article and article["headline"] and article["url"]:
                        resultList.append(
                            self.create_article(article["headline"].strip(),
                                                baseUrl + article["url"],
                                                article["lead"]))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #27
    def get_articles(self):
        resultList = []
        baseUrl = "https://www.rfa.org/cantonese"

        sections = [
            ("新聞", baseUrl + "/news"),
            ("港澳台新聞", baseUrl + "/news/htm"),
            ("評論", baseUrl + "/commentaries"),
            ("聚言堂", baseUrl + "/talkshows"),
            ("專題", baseUrl + "/features/hottopic"),
            ("多媒體", baseUrl + "/multimedia"),
        ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for topic in doc.xpath(
                        '//div[contains(@id, "topstorywidefulltease")]|//div[contains(@class, "sectionteaser")]'
                ):
                    title = topic.xpath('h2/a')
                    intro = topic.xpath('p')

                    if title:
                        title_text = title[0].xpath('span')
                        # guard against a missing or empty <span> before dereferencing
                        if title_text and title_text[0].text:
                            resultList.append(
                                self.create_article(
                                    title_text[0].text.strip(),
                                    title[0].get("href"),
                                    intro[0].text.strip()
                                    if intro and intro[0].text else None))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
Example #28
    def get_articles(self):
        resultList = []
        sections = [
            ("金融經濟", "https://inews.hket.com", "/sran009/金融經濟", 3),
            ("理財", "https://wealth.hket.com", "/", 1),
            ("科技", "https://inews.hket.com", "/sran010/科技", 2),
            ("中國", "https://china.hket.com", "/", 1),
            ("國際", "https://inews.hket.com", "/sran011/國際", 2),
            ("商業", "https://inews.hket.com", "/sran012/商業", 2),
        ]
        seen_url = {}

        try:
            for (title, base_url, url, pages) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then get page and parse
                for page in range(1, pages + 1):
                    doc = html.document_fromstring(
                        read_http_page(base_url + url + "?p={}".format(page)))
                    for topic in doc.xpath(
                            '//div[contains(@class, "listing-widget-33") or contains(@class, "listing-widget-4") or contains(@class, "listing-widget-9")]/a[contains(@class, "listing-overlay")]'
                    ):
                        if topic.text and topic.get("href"):
                            topic_url = (topic.get("href") if
                                         self._is_absolute(topic.get("href"))
                                         else base_url + topic.get("href"))
                            if topic_url not in seen_url:
                                seen_url[topic_url] = None
                                resultList.append(
                                    self.create_article(
                                        topic.text.strip(), topic_url))

        except Exception as e:
            logger.exception("Problem processing url: %s", e)

        return resultList
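
self._is_absolute in Example #28 is likewise not shown; a plausible one-liner, assuming it simply tests whether the href already carries a scheme and host:

from urllib.parse import urlparse

def _is_absolute(self, url):
    # hypothetical: an href that already includes a network location is absolute
    return bool(urlparse(url).netloc)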
Example #29
    def get_articles(self):
        num_pages = 2
        baseUrl = "https://news.ltn.com.tw"

        resultList = []
        sections = [
            ("熱門", baseUrl + "/ajax/breakingnews/popular/"),
            ("政治", baseUrl + "/ajax/breakingnews/politics/"),
            ("社會", baseUrl + "/ajax/breakingnews/society/"),
            ("地方", baseUrl + "/ajax/breakingnews/local/"),
            ("生活", baseUrl + "/ajax/breakingnews/life/"),
            ("國際", baseUrl + "/ajax/breakingnews/world/"),
        ]

        try:
            for page in range(1, num_pages):
                for (title, url) in sections:
                    # for each section, insert a title...
                    resultList.append(self.create_section(title))
                    # ... then parse the page and extract article links
                    result = json.loads(
                        read_http_page(url + str(page)).decode("UTF-8"))
                    if result.get("code", 0) == 200:
                        data = result.get("data", [])
                        for key in data.keys():
                            title = data[key].get("title", None)
                            url = data[key].get("url", None)
                            abstract = data[key].get("summary", None)
                            if title and url:
                                resultList.append(
                                    self.create_article(title, url, abstract))

        except Exception as e:
            logger.exception("Problem processing url: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
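The LTN endpoint wraps its payload in a code/data envelope, and the loop above assumes data is a JSON object keyed by index. If the API can also return a plain array (an assumption worth guarding against, not documented behaviour), a small normaliser keeps the loop body uniform:

    def iter_items(data):
        # Yield article dicts whether "data" is a JSON object
        # ({"1": {...}, "2": {...}}) or a JSON array ([{...}, {...}]).
        if isinstance(data, dict):
            for key in data:
                yield data[key]
        else:
            yield from data

The loop body then becomes: for item in iter_items(result.get("data", [])), followed by item.get("title") and friends.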
Example #30
    def get_articles(self):
        maxPagePerSection = 10
        resultList = []

        sections = [('要聞港聞', 'http://www.singpao.com.hk/index.php?fi=news1'),
                    ('兩岸國際', 'http://www.singpao.com.hk/index.php?fi=news8'),
                    ('財經', 'http://www.singpao.com.hk/index.php?fi=news3'),
                    ('娛樂', 'http://www.singpao.com.hk/index.php?fi=news4'),
                    ('體育', 'http://www.singpao.com.hk/index.php?fi=news5'),
                    ('副刊', 'http://www.singpao.com.hk/index.php?fi=news7'),]
        baseUrl = 'http://www.singpao.com.hk/'

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                page = 1
                maxPage = 1
                while page <= maxPage and page <= maxPagePerSection:
                    doc = html.document_fromstring(read_http_page(url+'&page='+str(page)))
                    page += 1

                    for topic in doc.xpath('//td/a[contains(@class, "list_title")]'):
                        if topic.text and topic.get('href'):
                            resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href')))

                    for pageIndex in doc.xpath('//a[contains(@class, "fpagelist_css")]'):
                        if pageIndex.text is not None:
                            match = re.match('^([0-9]+)$', pageIndex.text.strip())
                            if match and match.lastindex == 1 and int(match.group(1)) > maxPage:
                                maxPage = int(match.group(1))
        except Exception as e:
            logger.exception('Problem processing url: ' + str(e))
            logger.exception(''.join(traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
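The paging logic in Example #30 (grow maxPage as new pager links are discovered, capped by maxPagePerSection) is the part most worth testing in isolation. A sketch of the same idea as a pure function, with a usage check:

    import re


    def max_page_from_links(link_texts, current_max=1):
        # Scan pager link texts such as ["1", "2", "10", "下一頁"] and
        # return the largest page number seen so far.
        for text in link_texts:
            match = re.match(r"^([0-9]+)$", text.strip())
            if match and int(match.group(1)) > current_max:
                current_max = int(match.group(1))
        return current_max


    assert max_page_from_links(["1", "2", "10", "下一頁"]) == 10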
Example #31
    def get_articles(self):
        resultList = []
        sections = [('要聞港聞', 'http://hk.apple.nextmedia.com/news/index/'),
                    ('兩岸國際', 'http://hk.apple.nextmedia.com/international/index/'),
                    ('財經地產', 'http://hk.apple.nextmedia.com/financeestate/index/'),
                    ('娛樂名人', 'http://hk.apple.nextmedia.com/entertainment/index/'),
                    ('果籽', 'http://hk.apple.nextmedia.com/supplement/index/'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for option in doc.get_element_by_id('article_ddl').xpath('.//option'):
                    if option.text and option.get('value'):
                        resultList.append(self.create_article(option.text.strip(), option.get('value')))

        except Exception as e:
            logger.exception('Problem processing url')

        return resultList
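Two fragilities in Example #31 are worth noting: lxml's get_element_by_id raises KeyError when the id is absent (the bare logger.exception call then hides why), and an xpath starting with // searches the whole document even when called on an element, which is why the loop above selects .//option. lxml accepts a default argument that makes the missing-id case explicit; a sketch:

    from lxml import html


    def extract_options(page_bytes):
        # Return (text, value) pairs from the article dropdown, or an
        # empty list when the element is missing, instead of raising.
        doc = html.document_fromstring(page_bytes)
        dropdown = doc.get_element_by_id("article_ddl", None)
        if dropdown is None:
            return []
        return [(o.text.strip(), o.get("value"))
                for o in dropdown.xpath(".//option")
                if o.text and o.get("value")]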
Example #32
    def get_articles(self):
        resultList = []

        sections = [('港聞', 'http://www.takungpao.com.hk/hongkong/'),
                    ('內地', 'http://www.takungpao.com.hk/mainland/'),
                    ('台灣', 'http://www.takungpao.com.hk/taiwan/'),
                    ('國際', 'http://www.takungpao.com.hk/international/'),
                    ('評論', 'http://www.takungpao.com.hk/opinion/'),
                    ('經濟', 'http://www.takungpao.com.hk/finance/'),
                    ('文化', 'http://www.takungpao.com.hk/culture/'),
                    ('體育', 'http://www.takungpao.com.hk/sports/'),
                    ('娛樂', 'http://www.takungpao.com.hk/ent/'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))

                for topic in doc.xpath('//div[contains(@class, "list_tuwen")]/div[contains(@class, "content")]'):
                    title = topic.xpath('ul/li[contains(@class, "title")]/a')
                    intro = topic.xpath('ul/li[contains(@class, "intro")]/a')

                    if title and title[0].text and title[0].get('href'):
                        resultList.append(
                            self.create_article(
                                title[0].text.strip(),
                                title[0].get('href'),
                                intro[0].text.strip() if intro and intro[0].text else None))

        except Exception as e:
            logger.exception('Problem processing url: ' + str(e))
            logger.exception(''.join(traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
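Example #32 (and several that follow) repeats the same guard: take the first node of an xpath result, then use its text only if present. A tiny helper (the name is mine, not the original author's) keeps those call sites flat:

    def first_text(nodes):
        # Stripped text of the first matched node, or None when the
        # xpath matched nothing or the node has no text.
        if nodes and nodes[0].text:
            return nodes[0].text.strip()
        return None

With it, the intro argument above collapses to first_text(topic.xpath('ul/li[contains(@class, "intro")]/a')).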
Example #33
    def get_articles(self):
        resultList = []
        baseUrl = 'https://www.chinatimes.com'

        sections = [('政治', baseUrl + '/politic/?chdtv'),
                    ('言論', baseUrl + '/opinion/?chdtv'),
                    ('生活', baseUrl + '/life/?chdtv'),
                    ('娛樂', baseUrl + '/star/?chdtv'),
                    ('財經', baseUrl + '/money/?chdtv'),
                    ('社會', baseUrl + '/society/?chdtv'),
                    ('話題', baseUrl + '/hottopic/?chdtv'),
                    ('國際', baseUrl + '/world/?chdtv'),
                    ('軍事', baseUrl + '/armament/?chdtv'),
                    ('兩岸', baseUrl + '/chinese/?chdtv'),
                    ('時尚', baseUrl + '/fashion/?chdtv'),
                    ('體育', baseUrl + '/sports/?chdtv'),
                    ('科技', baseUrl + '/technologynews/?chdtv'),
                    ('玩食', baseUrl + '/travel/?chdtv'),
                    ('新聞專輯', baseUrl + '/album/?chdtv'),]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for topic in doc.xpath('//section[contains(@class, "article-list")]/ul//li//h3[contains(@class, "title")]//a'):
                    if topic.text and topic.get('href'):
                        resultList.append(self.create_article(topic.text.strip(), topic.get('href')))
        except Exception as e:
            logger.exception('Problem processing url: ' + str(e))
            logger.exception(''.join(traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
Example #34
    def get_articles(self):
        resultList = []

        pages = 3
        sections = [
            ("新聞", "https://www.storm.mg/articles"),
            ("評論", "https://www.storm.mg/all-comment"),
            ("財經", "https://www.storm.mg/category/23083"),
            ("生活", "https://www.storm.mg/category/104"),
            ("人物", "https://www.storm.mg/category/171151"),
            ("華爾街日報", "https://www.storm.mg/category/173479"),
            ("新新聞", "https://www.storm.mg/category/87726"),
        ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                for page in range(1, pages + 1):
                    # ... then parse each page and extract article links
                    doc = html.document_fromstring(
                        read_http_page(url + "/" + str(page)))

                    # get the first featured article
                    topic = doc.xpath(
                        '//div[contains(@class, "category_top_card")]/div[contains(@class, "card_img_wrapper")]'
                    )
                    if topic:
                        title = topic[0].xpath(
                            'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "link_title")]'
                        )
                        intro = topic[0].xpath(
                            'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "card_substance")]'
                        )
                        title_text = title[0].xpath(
                            "h2/text()") if title else None
                        if title and title_text and title[0].get("href"):
                            resultList.append(
                                self.create_article(
                                    title_text[0].strip(),
                                    title[0].get("href"),
                                    intro[0].text.strip()
                                    if intro and intro[0].text else None,
                                ))

                    for topic in doc.xpath(
                            '//div[contains(@class, "category_cards_wrapper")]/div[contains(@class, "category_card")]'
                    ):
                        title = topic.xpath(
                            'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "link_title")]'
                        )
                        intro = topic.xpath(
                            'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "card_substance")]'
                        )
                        title_text = title[0].xpath(
                            "h3/text()") if title else None

                        if title and title_text and title[0].get("href"):
                            resultList.append(
                                self.create_article(
                                    title_text[0].strip(),
                                    title[0].get("href"),
                                    intro[0].text.strip()
                                    if intro and intro[0].text else None,
                                ))

        except Exception as e:
            logger.exception("Problem processing url: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
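The featured-card block and the regular-card loop in Example #34 differ only in the heading tag (h2 versus h3). Assuming the markup stays that regular, both can share one extractor; a sketch:

    def extract_card(topic, heading_tag):
        # Pull (title, href, intro) from one storm.mg card;
        # heading_tag is "h2" for the featured card, "h3" otherwise.
        title = topic.xpath('div[contains(@class, "card_inner_wrapper")]'
                            '/a[contains(@class, "link_title")]')
        intro = topic.xpath('div[contains(@class, "card_inner_wrapper")]'
                            '/a[contains(@class, "card_substance")]')
        title_text = title[0].xpath(heading_tag + "/text()") if title else None
        if title and title_text and title[0].get("href"):
            return (title_text[0].strip(),
                    title[0].get("href"),
                    intro[0].text.strip() if intro and intro[0].text else None)
        return None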
Example #35
    def get_articles(self):
        siteBaseUrl = "https://money.udn.com"
        baseUrl = siteBaseUrl + "/money/cate/"

        resultList = []
        sections = [
            ("要聞", baseUrl + "10846"),
            ("國際", baseUrl + "5588"),
            ("兩岸", baseUrl + "5589"),
            ("產業", baseUrl + "5591"),
            ("證券", baseUrl + "5590"),
            ("金融", baseUrl + "12017"),
            ("期貨", baseUrl + "11111"),
            ("理財", baseUrl + "5592"),
            ("房市", baseUrl + "5593"),
            ("專欄", baseUrl + "5595"),
            ("商情", baseUrl + "5597"),
        ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url))
                for topic in doc.xpath(
                        '//section[contains(@class, "cate-main__section")]/div[contains(@class, "story-headline-wrapper")]'
                ):
                    # main stories first...
                    link = topic.xpath(
                        'div[contains(@class, "story__content")]/a')
                    title = topic.xpath(
                        'div[contains(@class, "story__content")]/a/h3')
                    intro = topic.xpath(
                        'div[contains(@class, "story__content")]/a/p')
                    title_text = title[0].text if title else None

                    if title and title_text and link:
                        resultList.append(
                            self.create_article(
                                title_text.strip(),
                                siteBaseUrl + link[0].get("href"),
                                intro[0].text.strip()
                                if intro and intro[0].text else None,
                            ))

                for topic in doc.xpath(
                        '//section[contains(@class, "cate-main__section")]/ul[contains(@class, "story-flex-bt-wrapper")]'
                ):
                    # ... then other stories
                    titles = topic.xpath(
                        'li[contains(@class, "story__item")]/a')
                    for title in titles:
                        title_text = title.text
                        if title_text:
                            resultList.append(
                                self.create_article(
                                    title_text.strip(),
                                    siteBaseUrl + title.get("href"),
                                    None,
                                ))

        except Exception as e:
            logger.exception("Problem processing url: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
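Both loops in Example #35 build absolute links by string concatenation (siteBaseUrl + href), which silently produces garbage if the site ever emits an absolute href. urllib.parse.urljoin handles both cases; a quick check:

    from urllib.parse import urljoin

    site_base = "https://money.udn.com"

    # urljoin resolves relative hrefs and leaves absolute ones intact.
    assert (urljoin(site_base, "/money/cate/5588")
            == "https://money.udn.com/money/cate/5588")
    assert (urljoin(site_base, "https://udn.com/news/story/1")
            == "https://udn.com/news/story/1")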
Example #36
    def get_articles(self):
        # get date first
        dateUrl = "http://www.mingpaocanada.com/TOR/"
        tor_time = datetime.now(pytz.timezone("America/Toronto"))
        if tor_time.hour < 4:
            tor_time = tor_time - timedelta(days=1)
        theDate = tor_time.strftime("%Y%m%d")

        try:
            doc = html.document_fromstring(read_http_page(dateUrl))
            for aLink in doc.get_element_by_id("mp-menu").xpath(
                    ".//div/ul/li/a"):
                if aLink.text_content() == u"明報首頁":
                    href = aLink.attrib["href"]
                    match = re.match(r"htm\/News\/([0-9]{8})\/main_r\.htm",
                                     href)
                    if match and match.lastindex == 1:
                        theDate = match.group(1)
                    else:
                        logger.info("no date found. using system date: " +
                                    theDate)
        except Exception as e:
            logger.exception("Problem getting date: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        resultList = []
        sections = [
            (
                "要聞",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/TAindex_r.htm",
            ),
            (
                "加國新聞",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/TDindex_r.htm",
            ),
            (
                "中國",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/TCAindex_r.htm",
            ),
            (
                "國際",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/TTAindex_r.htm",
            ),
            (
                "港聞",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/HK-GAindex_r.htm",
            ),
            (
                "經濟",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/THindex_r.htm",
            ),
            (
                "體育",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/TSindex_r.htm",
            ),
            (
                "影視",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/HK-MAindex_r.htm",
            ),
            (
                "副刊",
                "http://www.mingpaocanada.com/TOR/htm/News/" + theDate +
                "/WWindex_r.htm",
            ),
        ]

        baseUrl = "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/"
        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(
                    read_http_page(url).decode("big5-hkscs", errors="ignore"))
                for topic in doc.xpath(
                        '//h4[contains(@class, "listing-link")]/a'):
                    if topic.text and topic.get("href"):
                        resultList.append(
                            self.create_article(topic.text.strip(),
                                                baseUrl + topic.get("href")))

        except Exception as e:
            logger.exception("Problem processing url: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
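The date logic at the top of Example #36 (Toronto editions roll over at 4 a.m., so earlier hours fall back to yesterday's paper) deserves to live in its own function where it can be unit-tested; a sketch under the same pytz assumption:

    from datetime import datetime, timedelta

    import pytz


    def edition_date(now=None):
        # Before 04:00 America/Toronto the current paper is still
        # yesterday's, so step back one day before formatting.
        tz = pytz.timezone("America/Toronto")
        tor_time = now.astimezone(tz) if now else datetime.now(tz)
        if tor_time.hour < 4:
            tor_time -= timedelta(days=1)
        return tor_time.strftime("%Y%m%d")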
Example #37
    def get_articles(self):
        resultList = []
        sections = [
            (
                "要聞",
                "https://www.singtao.ca/category/52-%E5%A4%9A%E5%80%AB%E5%A4%9A%E8%A6%81%E8%81%9E/?variant=zh-hk",
            ),
            (
                "加國新聞",
                "https://www.singtao.ca/category/54-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%8A%A0%E5%9C%8B/?variant=zh-hk",
            ),
            (
                "城市",
                "https://www.singtao.ca/category/53-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%9F%8E%E5%B8%82/?variant=zh-hk",
            ),
            (
                "港聞",
                "https://www.singtao.ca/category/57-%E5%A4%9A%E5%80%AB%E5%A4%9A%E6%B8%AF%E8%81%9E/?variant=zh-hk",
            ),
            (
                "國際",
                "https://www.singtao.ca/category/56-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%9C%8B%E9%9A%9B/?variant=zh-hk",
            ),
            (
                "中國",
                "https://www.singtao.ca/category/58-%E5%A4%9A%E5%80%AB%E5%A4%9A%E4%B8%AD%E5%9C%8B/?variant=zh-hk",
            ),
            (
                "台灣",
                "https://www.singtao.ca/category/59-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%8F%B0%E7%81%A3/?variant=zh-hk",
            ),
            (
                "財經",
                "https://www.singtao.ca/category/61-%E5%A4%9A%E5%80%AB%E5%A4%9A%E8%B2%A1%E7%B6%93/?variant=zh-hk",
            ),
            (
                "體育",
                "https://www.singtao.ca/category/60-%E5%A4%9A%E5%80%AB%E5%A4%9A%E9%AB%94%E8%82%B2/?variant=zh-hk",
            ),
            (
                "娛樂",
                "https://www.singtao.ca/category/62-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%A8%9B%E6%A8%82/?variant=zh-hk",
            ),
        ]

        try:
            for (title, url) in sections:
                # for each section, insert a title...
                resultList.append(self.create_section(title))
                # ... then parse the page and extract article links
                doc = html.document_fromstring(
                    read_http_page(url, {
                        "edition": "toronto"
                    }).decode("utf-8"))

                # top story
                top_story_link = doc.xpath(
                    '(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a'
                )
                top_story_text = doc.xpath(
                    '(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a/div/h3'
                )
                if top_story_link and top_story_text:
                    resultList.append(
                        self.create_article(
                            top_story_text[0].text.strip(),
                            top_story_link[0].get("href"),
                        ))

                for topic in doc.xpath(
                        '(//div[@class="td-ss-main-content"])[1]/div[contains(@class, "td-animation-stack")]/div[@class="item-details"]/h3/a'
                ):
                    if topic.text and topic.get("href"):
                        resultList.append(
                            self.create_article(topic.text.strip(),
                                                topic.get("href")))

        except Exception as e:
            logger.exception("Problem processing url: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
Example #38
    def get_articles(self):
        topUrl = "http://orientaldaily.on.cc"
        sections = {
            'news': {
                'title': '要聞港聞',
                'url': ''
            },
            'china_world': {
                'title': '兩岸國際',
                'url': ''
            },
            'finance': {
                'title': '產經',
                'url': ''
            },
            'entertainment': {
                'title': '娛樂',
                'url': ''
            },
            'lifestyle': {
                'title': '副刊',
                'url': ''
            },
            'sport': {
                'title': '體育',
                'url': ''
            }
        }

        try:
            doc = html.document_fromstring(read_http_page(topUrl))
            if doc is not None:
                menu = doc.xpath(
                    '//*[@id="pageCTN"]/header/div[contains(@class, "middle")]/ul[contains(@class, "menuList")]'
                )
                if menu:
                    for theLink in menu[0].xpath('li/a'):
                        theClass = theLink.xpath('@class')
                        theHref = theLink.xpath('@href')
                        if theHref and theClass and theClass[0] in sections:
                            sections[theClass[0]]['url'] = topUrl + theHref[0]
        except Exception as e:
            logger.exception("Problem getting sections: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        resultList = []
        baseUrl = topUrl

        try:
            for _, section in sections.items():
                title = section['title']
                sectionUrl = section['url']
                if sectionUrl:
                    # for each section, insert a title...
                    resultList.append(self.create_section(title))
                    # ... then parse the page and extract article links
                    doc = html.document_fromstring(read_http_page(sectionUrl))
                    if doc is not None:
                        articles = doc.xpath(
                            '//div[contains(@class, "sectionList")]/div[contains(@class, "subsection")]/ul[contains(@class, "items")]/li[@articleid]'
                        )
                        for article in articles:
                            articleUrls = article.xpath('a/@href')
                            articleTexts = article.xpath(
                                'a/div[contains(@class, "text")]/text()')
                            if articleUrls and articleTexts:
                                resultList.append(
                                    self.create_article(
                                        articleTexts[0].strip(),
                                        baseUrl + articleUrls[0]))

        except Exception as e:
            logger.exception("Problem processing url: " + str(e))
            logger.exception("".join(
                traceback.format_exception(type(e), e, e.__traceback__)))

        return resultList
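Taken together, the examples share one contract: get_articles() returns a flat list that interleaves section markers with article entries. A minimal consumer, assuming the tagged-dict shape sketched after Example #27 (the real create_section/create_article may differ):

    def render_plain(result_list):
        # Flatten the mixed section/article list into readable text.
        lines = []
        for item in result_list:
            if item["type"] == "section":
                lines.append("== " + item["title"] + " ==")
            else:
                lines.append("- " + item["title"] + " (" + item["url"] + ")")
        return "\n".join(lines)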