def get_articles(self):
    resultList = []
    sections = [
        ('Vancouver', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|metro%20vancouver'),
        ('Fraser Valley', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|fraser%20valley'),
        ('B.C.', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|b.c.'),
    ]
    relSections = [
        ('Canada', 'http://www.theprovince.com/7588609.atom'),
        ('World', 'http://www.theprovince.com/7589147.atom'),
    ]

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then parse the page and extract article links
            doc = etree.fromstring(read_http_page(url))
            for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}):
                title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text
                link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href')
                abstract = entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract')
                resultList.append(self.create_article(title.strip(), link, abstract))

        for (title, url) in relSections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then parse the page and extract article links
            doc = etree.fromstring(read_http_page(url))
            for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}):
                title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text
                link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href')
                abstract = entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract')
                resultList.append(self.create_article(title.strip(), link, abstract))
    except Exception as e:
        logger.exception('Problem processing url')

    return resultList
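# Note: the scrapers in this module rely on a few helpers that are defined
# elsewhere and not shown in this section: read_http_page() to fetch a URL,
# and create_section()/create_article() to build result entries. The sketch
# below only illustrates assumed behaviour; the field names, the exact
# signature of read_http_page (some call sites pass an extra options dict or
# headers), and the base-class name are guesses, not the project's actual
# definitions.

import urllib3

_http = urllib3.PoolManager()

def read_http_page(url, headers=None):
    # Fetch a URL and return the raw response body as bytes; callers decode it
    # themselves when a specific charset (e.g. big5) is needed.
    return _http.request('GET', url, headers=headers).data

class BaseSource(object):
    def create_section(self, title):
        # A section marker entry; the field names are assumptions.
        return {'type': 'section', 'title': title}

    def create_article(self, title, url, abstract=None):
        # An article entry; the field names are assumptions.
        return {'type': 'article', 'title': title, 'url': url, 'abstract': abstract}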
def get_articles(self): resultList = [] sections = [('要聞','https://www.singtao.ca/category/52-%E5%8D%A1%E5%8A%A0%E5%88%A9%E8%A6%81%E8%81%9E/?variant=zh-hk'), ('加國新聞','https://www.singtao.ca/category/54-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%8A%A0%E5%9C%8B/?variant=zh-hk'), ('省市', 'https://www.singtao.ca/category/65-%E5%8D%A1%E5%8A%A0%E5%88%A9%E7%9C%81%E5%B8%82/?variant=zh-hk'), ('港聞','https://www.singtao.ca/category/57-%E5%8D%A1%E5%8A%A0%E5%88%A9%E6%B8%AF%E8%81%9E/?variant=zh-hk'), ('國際','https://www.singtao.ca/category/56-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%9C%8B%E9%9A%9B/?variant=zh-hk'), ('中國','https://www.singtao.ca/category/58-%E5%8D%A1%E5%8A%A0%E5%88%A9%E4%B8%AD%E5%9C%8B/?variant=zh-hk'), ('台灣','https://www.singtao.ca/category/59-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%8F%B0%E7%81%A3/?variant=zh-hk'), ('財經','https://www.singtao.ca/category/61-%E5%8D%A1%E5%8A%A0%E5%88%A9%E8%B2%A1%E7%B6%93/?variant=zh-hk'), ('體育','https://www.singtao.ca/category/60-%E5%8D%A1%E5%8A%A0%E5%88%A9%E9%AB%94%E8%82%B2/?variant=zh-hk'), ('娛樂','https://www.singtao.ca/category/62-%E5%8D%A1%E5%8A%A0%E5%88%A9%E5%A8%9B%E6%A8%82/?variant=zh-hk'),] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url, {'edition': 'calgary'}).decode('utf-8')) # top story top_story_link = doc.xpath('(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a') top_story_text = doc.xpath('(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a/div/h3') if top_story_link and top_story_text: resultList.append(self.create_article(top_story_text[0].text.strip(), top_story_link[0].get('href'))) for topic in doc.xpath('(//div[@class="td-ss-main-content"])[1]/div[contains(@class, "td-animation-stack")]/div[@class="item-details"]/h3/a'): if topic.text and topic.get('href'): resultList.append(self.create_article(topic.text.strip(), topic.get('href'))) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
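# Compatibility note: several handlers in this module call
# traceback.format_exception(etype=..., value=..., tb=...). The etype keyword
# was removed in Python 3.10, so that call itself raises TypeError on newer
# interpreters. Below is a minimal sketch of an equivalent that works on both
# old and new versions; log_scrape_error is a hypothetical helper, not part of
# the original code. Note that logger.exception() already records the active
# traceback, so the explicit format_exception() call is largely redundant.

import logging
import traceback

logger = logging.getLogger(__name__)

def log_scrape_error(message, e):
    # logger.exception() appends the current exception's traceback on its own.
    logger.exception(message)
    # Positional arguments work the same on Python 3.9 and 3.10+.
    logger.debug("".join(traceback.format_exception(type(e), e, e.__traceback__)))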
def get_articles(self):
    resultList = []
    for (name, url) in self.get_rss_links():
        try:
            # for each section, insert a title...
            resultList.append(self.create_section(name))
            # ... then parse the page and extract article links
            doc = etree.fromstring(read_http_page(url), parser=etree.XMLParser(recover=True))
            if doc is not None:
                for entry in doc.xpath('//*[local-name()="RDF"]/*[local-name()="item"]'):
                    titles = entry.xpath('*[local-name()="title"]')
                    links = entry.xpath('*[local-name()="link"]')
                    abstracts = entry.xpath('*[local-name()="description"]')
                    if titles and links:
                        title = titles[0].text
                        link = links[0].text
                        abstract = abstracts[0].text if abstracts else ""
                        resultList.append(self.create_article(title.strip(), link, abstract))
        except Exception as e:
            logger.exception("Problem processing rdf: " + str(e))
            logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
def get_articles(self):
    # Although the source is in RSS, the daily items are consolidated as CDATA.
    # Parse and break them down instead of using RSSBase
    rss_url = "http://www.daemonology.net/hn-daily/index.rss"
    resultList = []
    try:
        doc = html.document_fromstring(read_http_page(rss_url))
        for item in doc.xpath("//rss/channel/item"):
            title = (item.xpath("title")[0].text
                     if len(item.xpath("title")) > 0
                     else "Daily Hacker News")
            resultList.append(self.create_section(title))

            description = (item.xpath("description")[0]
                           if len(item.xpath("description")) > 0
                           else None)
            if description is not None:
                for article in description.xpath('ul/li/span[@class="storylink"]/a'):
                    if article.text and article.get("href"):
                        resultList.append(self.create_article(article.text.strip(), article.get("href")))
    except Exception as e:
        logger.exception("Problem processing Hacker News: " + str(e))
        logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
def get_articles(self):
    resultList = []
    sections = [
        ('要聞港聞', '/daily/local', self._base_url + '/daily/local/'),
        ('兩岸', '/daily/china', self._base_url + '/daily/china/'),
        ('國際', '/daily/international', self._base_url + '/daily/international/'),
        ('財經', '/daily/finance', self._base_url + '/daily/finance/'),
        ('娛樂', '/daily/entertainment', self._base_url + '/daily/entertainment/'),
        ('體育', '/daily/sports', self._base_url + '/daily/sports/'),
    ]

    try:
        for (title, section_id, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then retrieve the json content
            raw_page = read_http_page(url)
            date_id, d = self._find_date_id(raw_page)
            if date_id and d:
                raw_result = self._get_collection(section_id, date_id, d)
                result = json.loads(raw_result)
                for article in result['content_elements']:
                    desc = article['headlines']['basic']
                    href = article['website_url']
                    abstract = None
                    if 'content_elements' in article \
                            and len(article['content_elements']) > 1 \
                            and 'content' in article['content_elements'][0]:
                        abstract = article['content_elements'][0]['content']
                    if desc and href:
                        resultList.append(self.create_article(desc.strip(), self._base_url + href, abstract))
    except Exception as e:
        logger.exception('Problem processing url: ' + str(e))
        logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
def get_articles(self):
    resultList = []
    sections = [
        ('要聞', 'http://toronto.singtao.ca/category/%e8%a6%81%e8%81%9e/?variant=zh-hk'),
        ('城市', 'http://toronto.singtao.ca/category/%e5%9f%8e%e5%b8%82/?variant=zh-hk'),
        ('加國', 'http://toronto.singtao.ca/category/%e5%8a%a0%e5%9c%8b/?variant=zh-hk'),
        ('國際', 'http://toronto.singtao.ca/category/%e5%9c%8b%e9%9a%9b/?variant=zh-hk'),
        ('港聞', 'http://toronto.singtao.ca/category/%e6%b8%af%e8%81%9e/?variant=zh-hk'),
        ('中國', 'http://toronto.singtao.ca/category/%e4%b8%ad%e5%9c%8b/?variant=zh-hk'),
        ('台灣', 'http://toronto.singtao.ca/category/%e5%8f%b0%e7%81%a3/?variant=zh-hk'),
        ('體育', 'http://toronto.singtao.ca/category/%e9%ab%94%e8%82%b2/?variant=zh-hk'),
        ('財經', 'http://toronto.singtao.ca/category/%e8%b2%a1%e7%b6%93/?variant=zh-hk'),
        ('娛樂', 'http://toronto.singtao.ca/category/%e5%a8%9b%e6%a8%82/?variant=zh-hk'),
    ]

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then parse the page and extract article links
            doc = html.document_fromstring(read_http_page(url))
            for option in doc.get_element_by_id('news').xpath('option'):
                if option.text and option.get('value'):
                    resultList.append(self.create_article(option.text.strip(), option.get('value')))
    except Exception as e:
        logger.exception('Problem processing url')

    return resultList
def get_articles(self):
    num_pages = 2
    baseUrl = 'https://news.ltn.com.tw'
    resultList = []
    sections = [
        ('熱門', baseUrl + '/ajax/breakingnews/popular/'),
        ('政治', baseUrl + '/ajax/breakingnews/politics/'),
        ('社會', baseUrl + '/ajax/breakingnews/society/'),
        ('地方', baseUrl + '/ajax/breakingnews/local/'),
        ('生活', baseUrl + '/ajax/breakingnews/life/'),
        ('國際', baseUrl + '/ajax/breakingnews/world/'),
    ]

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then fetch each page of the ajax feed and extract article links
            for page in range(1, num_pages + 1):
                result = json.loads(read_http_page(url + str(page)).decode('UTF-8'))
                if result.get('code', 0) == 200:
                    data = result.get('data', [])
                    for key in data.keys():
                        article_title = data[key].get('title', None)
                        article_url = data[key].get('url', None)
                        abstract = data[key].get('summary', None)
                        if article_title and article_url:
                            resultList.append(self.create_article(article_title, article_url, abstract))
    except Exception as e:
        logger.exception('Problem processing url: ' + str(e))
        logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
def get_articles(self): # get date first dateUrl = 'http://www.mingpaocanada.com/Van/' theDate = datetime.datetime.today().strftime('%Y%m%d') try: doc = html.document_fromstring(read_http_page(dateUrl)) for aLink in doc.get_element_by_id('mp-menu').xpath('//div/ul/li/a'): if aLink.text_content() == u'明報首頁': href = aLink.attrib['href'] match = re.match('htm\/News\/([0-9]{8})\/main_r\.htm', href) if match and match.lastindex == 1: theDate = match.group(1) else: logger.info('no date found. using system date: ' + theDate) except Exception as e: logger.exception('Problem getting date: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) resultList = [] sections = [('要聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VAindex_r.htm'), ('加國新聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VBindex_r.htm'), ('社區新聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VDindex_r.htm'), ('港聞','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/HK-VGindex_r.htm'), ('國際','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VTindex_r.htm'), ('中國','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VCindex_r.htm'), ('經濟','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VEindex_r.htm'), ('體育','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/VSindex_r.htm'), ('影視','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/HK-MAindex_r.htm'), ('副刊','http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/WWindex_r.htm'),] baseUrl = 'http://www.mingpaocanada.com/Van/htm/News/' + theDate + '/' try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url).decode('big5', errors='ignore')) for topic in doc.xpath('//h4[contains(@class, "listing-link")]/a'): if topic.text and topic.get('href'): resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href'))) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self):
    # get date first
    baseUrl = 'http://news.ltn.com.tw'
    theDate = datetime.datetime.today().strftime('%Y%m%d')
    try:
        doc = html.document_fromstring(read_http_page(baseUrl + '/newspaper/'))
        cal = doc.get_element_by_id('box300B')
        theDate = cal.attrib['title']
    except Exception as e:
        logger.exception('Problem getting date')

    resultList = []
    sections = [
        ('焦點', baseUrl + '/newspaper/focus/' + theDate),
        ('政治', baseUrl + '/newspaper/politics/' + theDate),
        ('社會', baseUrl + '/newspaper/society/' + theDate),
        ('地方', baseUrl + '/newspaper/local/' + theDate),
        ('生活', baseUrl + '/newspaper/life/' + theDate),
        ('言論', baseUrl + '/newspaper/opinion/' + theDate),
        ('國際', baseUrl + '/newspaper/world/' + theDate),
        ('財經', baseUrl + '/newspaper/business/' + theDate),
        ('體育', baseUrl + '/newspaper/sports/' + theDate),
        ('娛樂', baseUrl + '/newspaper/entertainment/' + theDate),
        ('消費', baseUrl + '/newspaper/consumer/' + theDate),
        ('副刊', baseUrl + '/newspaper/supplement/' + theDate),
    ]

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            curPage = 1
            maxPage = 1
            while curPage <= maxPage:
                # ... then parse the page and extract article links
                doc = html.document_fromstring(read_http_page(url + '?page=' + str(curPage)))
                for link in doc.get_element_by_id('newslistul').xpath('//a[contains(@class, "picword")]'):
                    if link.text and link.get('href'):
                        resultList.append(self.create_article(link.text.strip(), baseUrl + link.get('href')))
                curPage += 1
                for pageNum in doc.get_element_by_id('page').xpath('//*[contains(@class, "p_num")]'):
                    maxPage = int(pageNum.text.strip())
    except Exception as e:
        logger.exception('Problem processing url')

    return resultList
def get_articles(self):
    # get date first
    dateUrl = 'http://www.mingpaocanada.com/TOR/'
    theDate = datetime.datetime.today().strftime('%Y%m%d')
    try:
        doc = html.document_fromstring(read_http_page(dateUrl))
        for aLink in doc.get_element_by_id('mp-menu').xpath('//div/ul/li/a'):
            if aLink.text_content() == u'明報首頁':
                href = aLink.attrib['href']
                match = re.match(r'htm\/News\/([0-9]{8})\/main_r\.htm', href)
                if match and match.lastindex == 1:
                    theDate = match.group(1)
                else:
                    logger.info('no date found. using system date: ' + theDate)
    except Exception as e:
        logger.exception('Problem getting date')

    resultList = []
    sections = [
        ('要聞', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TAindex_r.htm'),
        ('加國新聞', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TDindex_r.htm'),
        ('地產', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TRindex_r.htm'),
        ('中國', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TCAindex_r.htm'),
        ('國際', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TTAindex_r.htm'),
        ('港聞', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/HK-GAindex_r.htm'),
        ('經濟', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/THindex_r.htm'),
        ('體育', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/TSindex_r.htm'),
        ('影視', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/HK-MAindex_r.htm'),
        ('副刊', 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/WWindex_r.htm'),
    ]
    baseUrl = 'http://www.mingpaocanada.com/TOR/htm/News/' + theDate + '/'

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then parse the page and extract article links
            doc = html.document_fromstring(read_http_page(url).decode('big5', errors='ignore'))
            for topic in doc.xpath('//h4[contains(@class, "listing-link")]/a'):
                if topic.text and topic.get('href'):
                    resultList.append(self.create_article(topic.text.strip(), baseUrl + topic.get('href')))
    except Exception as e:
        logger.exception('Problem processing url')

    return resultList
def get_articles(self): resultList = [] sections = [ ("眾聞", "https://www.hkcnews.com", "https://www.hkcnews.com/data/newsposts", 3), ] try: for (title, base_url, data_url, pages) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then get page and parse for page in range(1, pages + 1): raw_result = read_http_page(data_url + "?page={}".format(page)) result = json.loads(raw_result) for item in result['items']: doc = html.document_fromstring(item) for article in doc.xpath( '//div[contains(@class, "article-block")]'): article_link = article.xpath( 'div[contains(@class, "article-block-body")]/a' ) article_text = article.xpath( 'div[contains(@class, "article-block-body")]/a/p' ) if article_link and article_text: url = base_url + article_link[0].get("href") text = article_text[0].text if url and text: footer = article.xpath( 'a[contains(@class, "article-block-footer")]' ) date_str = '' if footer: divs = footer[0].xpath( 'div/div[contains(@class, "text-box")]/div' ) for div in divs: if div.text and re.match( r"[0-9]{2}\.[0-9]{2}\.[0-9]{2}", div.text.strip()): date_str = div.text.strip() resultList.append( self.create_article( text.strip() + ' - {}'.format(date_str), url)) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self):
    # get date first
    dateUrl = 'http://orientaldaily.on.cc/'
    theDate = datetime.datetime.today().strftime('%Y%m%d')
    try:
        doc = html.document_fromstring(read_http_page(dateUrl))
        for aLink in doc.get_element_by_id('topMenu').xpath('ul[contains(@class, "menuList clear")]/li/a[contains(@class, "news")]'):
            href = aLink.attrib['href']
            match = re.match(r'\/cnt\/news\/([0-9]{8})\/index\.html', href)
            if match and match.lastindex == 1:
                theDate = match.group(1)
            else:
                logger.info('no date found. using system date: ' + theDate)
    except Exception as e:
        logger.exception('Problem getting date: ' + str(e))
        logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    resultList = []
    baseUrl = dateUrl
    sections = [
        ('要聞港聞', 'http://orientaldaily.on.cc/cnt/news/' + theDate + '/index.html'),
        ('兩岸國際', 'http://orientaldaily.on.cc/cnt/china_world/' + theDate + '/index.html'),
        ('財經', 'http://orientaldaily.on.cc/cnt/finance/' + theDate + '/index.html'),
        ('娛樂', 'http://orientaldaily.on.cc/cnt/entertainment/' + theDate + '/index.html'),
        ('副刊', 'http://orientaldaily.on.cc/cnt/lifestyle/' + theDate + '/index.html'),
    ]

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then parse the page and extract article links
            doc = html.document_fromstring(read_http_page(url))
            for topic in doc.get_element_by_id('articleList').xpath('ul[contains(@class, "commonBigList")]/li/a'):
                if topic.text and topic.get('href'):
                    resultList.append(self.create_article(topic.text.strip(), baseUrl + topic.get('href')))
    except Exception as e:
        logger.exception('Problem processing url: ' + str(e))
        logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
def _get_collection(self, section_id, date_id, d):
    payload_query = {
        "feedOffset": 0,
        "feedQuery": "taxonomy.primary_section._id:\"{}\" AND type:story AND editor_note:\"{}\"".format(section_id, date_id),
        "feedSize": 100,
        "sort": "location:asc",
    }
    payload_query = urllib.parse.quote(json.dumps(payload_query))
    query_url = self._base_url + \
        '/pf/api/v3/content/fetch/query-feed?query={}&d={}&_website=hk-appledaily'.format(payload_query, d)
    return read_http_page(query_url)
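# For reference, this is roughly the request _get_collection() builds against
# the content API: a JSON query serialized into the query string. The section
# id, date id and d value below are made-up placeholders, and the printed URL
# is only illustrative.

import json
import urllib.parse

payload_query = {
    "feedOffset": 0,
    "feedQuery": 'taxonomy.primary_section._id:"/daily/local" AND type:story AND editor_note:"20210501"',
    "feedSize": 100,
    "sort": "location:asc",
}
encoded = urllib.parse.quote(json.dumps(payload_query))
print("/pf/api/v3/content/fetch/query-feed?query={}&d={}&_website=hk-appledaily".format(encoded, "123"))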
def get_articles(self): # get date first dateUrl = 'http://orientaldaily.on.cc/' theDate = datetime.datetime.today().strftime('%Y%m%d') try: doc = html.document_fromstring(read_http_page(dateUrl)) for aLink in doc.get_element_by_id('topMenu').xpath('ul[contains(@class, "menuList clear")]/li/a[contains(@class, "news")]'): href = aLink.attrib['href'] match = re.match('\/cnt\/news\/([0-9]{8})\/index\.html', href) if match and match.lastindex == 1: theDate = match.group(1) else: logger.info('no date found. using system date: ' + theDate) except Exception as e: logger.exception('Problem getting date') resultList = [] baseUrl = dateUrl sections = [('要聞港聞','http://orientaldaily.on.cc/cnt/news/' + theDate + '/index.html'), ('兩岸國際','http://orientaldaily.on.cc/cnt/china_world/' + theDate + '/index.html'), ('財經','http://orientaldaily.on.cc/cnt/finance/' + theDate + '/index.html'), ('娛樂','http://orientaldaily.on.cc/cnt/entertainment/' + theDate + '/index.html'), ('副刊','http://orientaldaily.on.cc/cnt/lifestyle/' + theDate + '/index.html'),] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.get_element_by_id('articleList').xpath('ul[contains(@class, "commonBigList")]/li/a'): if topic.text and topic.get('href'): resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href'))) except Exception as e: logger.exception('Problem processing url') return resultList
def get_rss_links(self):
    resultList = []
    try:
        rss_list_url = 'https://money.udn.com/rssfeed/lists/1001'
        doc = html.document_fromstring(read_http_page(rss_list_url))
        for aLink in doc.get_element_by_id("rss_list").xpath('div/div/dl/dt/a'):
            if aLink.xpath('text()') and MoneyUnitedDailyNewsRSS.is_url(aLink.get('href')):
                resultList.append((aLink.xpath('text()')[0], aLink.get('href')))
    except Exception as e:
        logger.exception('Problem fetching rss links: ' + str(e))
        logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
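# MoneyUnitedDailyNewsRSS.is_url() is referenced above but not defined in this
# section. A plausible implementation (an assumption, not the project's actual
# code) just checks that the href parses as an absolute http/https URL:

from urllib.parse import urlparse

def is_url(url):
    # Accept only absolute http/https links; relative hrefs are skipped.
    try:
        parts = urlparse(url)
        return parts.scheme in ('http', 'https') and bool(parts.netloc)
    except ValueError:
        return False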
def get_articles(self): # get article lists summary_url = 'https://tw.appledaily.com/daily' doc = html.document_fromstring(read_http_page(summary_url)) resultList = [] sections = [('頭條', u'//article[contains(@class, "nclns")]//h2[contains(text(), "頭條")]/following-sibling::ul/li/a'), ('要聞', u'//article[contains(@class, "nclns")]//h2[contains(text(), "要聞")]/following-sibling::ul/li/a'), ('政治', u'//article[contains(@class, "nclns")]//h2[contains(text(), "政治")]/following-sibling::ul/li/a'), ('社會', u'//article[contains(@class, "nclns")]//h2[contains(text(), "社會")]/following-sibling::ul/li/a'), ('蘋果爆破社', u'//article[contains(@class, "nclns")]//h2[contains(text(), "蘋果爆破社")]/following-sibling::ul/li/a'), ('蘋論陣線', u'//article[contains(@class, "nclns")]//h2[contains(text(), "蘋論陣線")]/following-sibling::ul/li/a'), ('暖流', u'//article[contains(@class, "nclns")]//h2[contains(text(), "暖流")]/following-sibling::ul/li/a'), ('娛樂名人', u'//article[contains(@class, "nclns")]//h2[contains(text(), "娛樂名人")]/following-sibling::ul/li/a'), ('木瓜霞吐槽', u'//article[contains(@class, "nclns")]//h2[contains(text(), "木瓜霞吐槽")]/following-sibling::ul/li/a'), ('直擊好萊塢', u'//article[contains(@class, "nclns")]//h2[contains(text(), "直擊好萊塢")]/following-sibling::ul/li/a'), ('亞洲哈燒星', u'//article[contains(@class, "nclns")]//h2[contains(text(), "亞洲哈燒星")]/following-sibling::ul/li/a'), ('名人時尚', u'//article[contains(@class, "nclns")]//h2[contains(text(), "名人時尚")]/following-sibling::ul/li/a'), ('國際頭條', u'//article[contains(@class, "nclns")]//h2[contains(text(), "國際頭條")]/following-sibling::ul/li/a'), ('國際新聞', u'//article[contains(@class, "nclns")]//h2[contains(text(), "國際新聞")]/following-sibling::ul/li/a'), ('雙語天下', u'//article[contains(@class, "nclns")]//h2[contains(text(), "雙語天下")]/following-sibling::ul/li/a'), ('體育焦點', u'//article[contains(@class, "nclns")]//h2[contains(text(), "體育焦點")]/following-sibling::ul/li/a'), ('大運動場', u'//article[contains(@class, "nclns")]//h2[contains(text(), "大運動場")]/following-sibling::ul/li/a'), ('籃球瘋', u'//article[contains(@class, "nclns")]//h2[contains(text(), "籃球瘋")]/following-sibling::ul/li/a'), ('投打對決', u'//article[contains(@class, "nclns")]//h2[contains(text(), "投打對決")]/following-sibling::ul/li/a'), ('足球新聞', u'//article[contains(@class, "nclns")]//h2[contains(text(), "足球新聞")]/following-sibling::ul/li/a'), ('運彩分析', u'//article[contains(@class, "nclns")]//h2[contains(text(), "運彩分析")]/following-sibling::ul/li/a'), ('財經焦點', u'//article[contains(@class, "nclns")]//h2[contains(text(), "財經焦點")]/following-sibling::ul/li/a'), ('頭家人生', u'//article[contains(@class, "nclns")]//h2[contains(text(), "頭家人生")]/following-sibling::ul/li/a'), ('投資理財', u'//article[contains(@class, "nclns")]//h2[contains(text(), "投資理財")]/following-sibling::ul/li/a'), ('卡該這樣刷', u'//article[contains(@class, "nclns")]//h2[contains(text(), "卡該這樣刷")]/following-sibling::ul/li/a'), ('地產焦點', u'//article[contains(@class, "nclns")]//h2[contains(text(), "地產焦點")]/following-sibling::ul/li/a'), ('副刊焦點', u'//article[contains(@class, "nclns")]//h2[contains(text(), "副刊焦點")]/following-sibling::ul/li/a'), ('美食天地', u'//article[contains(@class, "nclns")]//h2[contains(text(), "美食天地")]/following-sibling::ul/li/a'), ('車市3C', u'//article[contains(@class, "nclns")]//h2[contains(text(), "車市3C")]/following-sibling::ul/li/a'), ('家庭與健康', u'//article[contains(@class, "nclns")]//h2[contains(text(), "家庭與健康")]/following-sibling::ul/li/a'), ] try: for (title, path) in sections: # for each section, insert a title... 
resultList.append(self.create_section(title)) for link in doc.xpath(path): if link.get('title') and link.get('href'): resultList.append(self.create_article(link.get('title').strip(), link.get('href'))) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): maxPagePerSection = 10 resultList = [] sections = [ ("要聞港聞", "http://www.singpao.com.hk/index.php?fi=news1"), ("兩岸國際", "http://www.singpao.com.hk/index.php?fi=news8"), ("財經", "http://www.singpao.com.hk/index.php?fi=news3"), ("娛樂", "http://www.singpao.com.hk/index.php?fi=news4"), ("體育", "http://www.singpao.com.hk/index.php?fi=news5"), ("副刊", "http://www.singpao.com.hk/index.php?fi=news7"), ] baseUrl = "http://www.singpao.com.hk/" try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links page = 1 maxPage = 1 while page <= maxPage and page <= maxPagePerSection: doc = html.document_fromstring( read_http_page(url + "&page=" + str(page))) page += 1 for topic in doc.xpath( '//td/a[contains(@class, "list_title")]'): if topic.text and topic.get("href"): resultList.append( self.create_article( topic.text.strip(), baseUrl + topic.get("href"))) for pageIndex in doc.xpath( '//a[contains(@class, "fpagelist_css")]'): if pageIndex.text is not None: match = re.match(r"^([0-9]+)$", pageIndex.text.strip()) if (match and match.lastindex == 1 and int(match.group(1)) > maxPage): maxPage = int(match.group(1)) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] sections = [ ("港聞", "http://www.takungpao.com.hk/hongkong/"), ("內地", "http://www.takungpao.com.hk/mainland/"), ("台灣", "http://www.takungpao.com.hk/taiwan/"), ("國際", "http://www.takungpao.com.hk/international/"), ("評論", "http://www.takungpao.com.hk/opinion/"), ("經濟", "http://www.takungpao.com.hk/finance/"), ("文化", "http://www.takungpao.com.hk/culture/"), ("體育", "http://www.takungpao.com.hk/sports/"), ("娛樂", "http://www.takungpao.com.hk/ent/"), ] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.xpath( '//div[contains(@class, "list_tuwen")]/div[contains(@class, "content")]' ): title = topic.xpath( 'ul[contains(@class, "txt")]/li[contains(@class, "title")]/a' ) intro = topic.xpath( 'ul[contains(@class, "txt")]/li[contains(@class, "intro")]/a' ) if title and title[0].text and title[0].get("href"): resultList.append( self.create_article( title[0].text.strip(), title[0].get("href"), intro[0].text.strip() if intro and intro[0].text else None, )) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] sections = [ ("國際", "/realtime/international", self._base_url + "/realtime/international/"), ("娛樂時尚", "/realtime/entertainment", self._base_url + "/realtime/entertainment/"), ("社會", "/realtime/local", self._base_url + "/realtime/local"), ("生活", "/realtime/life", self._base_url + "/realtime/life"), ("財經地產", "/realtime/property", self._base_url + "/realtime/property/"), ("吃喝玩樂", "/realtime/supplement", self._base_url + "/realtime/supplement/"), ("體育", "/realtime/sports", self._base_url + "/realtime/sports/"), ] try: for (title, section_id, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then retrieve the json content raw_page = read_http_page(url) d = self._find_date_id(raw_page) if d: raw_result = self._get_collection(section_id, d) result = json.loads(raw_result) for article in result["content_elements"]: desc = article["headlines"]["basic"] href = article["website_url"] abstract = None if ("content_elements" in article and len(article["content_elements"]) > 1 and "content" in article["content_elements"][0]): abstract = article["content_elements"][0][ "content"] if desc and href: resultList.append( self.create_article(desc.strip(), self._base_url + href, abstract)) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] sections = [('Vancouver', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|metro%20vancouver'), ('Fraser Valley', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|fraser%20valley'), ('B.C.', 'http://www.theprovince.com/scripts/Sp6Query.aspx?catalog=VAPR&tags=category|news|subcategory|b.c.'),] relSections = [('Canada', 'http://www.theprovince.com/7588609.atom'), ('World', 'http://www.theprovince.com/7589147.atom'), ] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = etree.fromstring(read_http_page(url)) for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}): title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href') abstract = entry.xpath('ns:link[@type="text/html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract') resultList.append(self.create_article(title.strip(), link, abstract)) for (title, url) in relSections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = etree.fromstring(read_http_page(url)) for entry in doc.xpath('//ns:entry[@Status="FREE"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'}): title = entry.xpath('ns:title[@type="html"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].text link = 'http://www.theprovince.com' + entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('href') abstract = entry.xpath('ns:link[@type="text/xml"]', namespaces={'ns': 'http://www.w3.org/2005/Atom'})[0].get('Abstract') resultList.append(self.create_article(title.strip(), link, abstract)) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self):
    resultList = []
    for (name, url) in self.get_rss_links():
        try:
            # for each section, insert a title...
            resultList.append(self.create_section(name))
            # ... then parse the page and extract article links
            doc = etree.fromstring(read_http_page(url), parser=etree.XMLParser(recover=True))
            for entry in doc.xpath('//*[local-name()="RDF"]/*[local-name()="item"]'):
                title = entry.xpath('*[local-name()="title"]')[0].text
                link = entry.xpath('*[local-name()="link"]')[0].text
                abstract = entry.xpath('*[local-name()="description"]')[0].text
                resultList.append(self.create_article(title.strip(), link, abstract))
        except Exception as e:
            logger.exception('Problem processing rdf')

    return resultList
def get_articles(self):
    resultList = []
    # build the list
    listUrl = 'http://www.am730.com.hk/home'
    baseUrl = 'http://www.am730.com.hk/'
    try:
        doc = html.document_fromstring(read_http_page(listUrl))
        for optGroup in doc.get_element_by_id('listnews').xpath('optgroup'):
            if optGroup.get('label'):
                resultList.append(self.create_section(optGroup.get('label')))
            for opt in optGroup.xpath('option'):
                if opt.get('value') and opt.text:
                    resultList.append(self.create_article(opt.text.strip(), baseUrl + opt.get('value')))
    except Exception as e:
        logger.exception('Problem processing url')

    return resultList
def get_articles(self): resultList = [] baseUrl = "https://www.chinatimes.com" sections = [ ("政治", baseUrl + "/politic/?chdtv"), ("言論", baseUrl + "/opinion/?chdtv"), ("生活", baseUrl + "/life/?chdtv"), ("娛樂", baseUrl + "/star/?chdtv"), ("財經", baseUrl + "/money/?chdtv"), ("社會", baseUrl + "/society/?chdtv"), ("話題", baseUrl + "/hottopic/?chdtv"), ("國際", baseUrl + "/world/?chdtv"), ("軍事", baseUrl + "/armament/?chdtv"), ("兩岸", baseUrl + "/chinese/?chdtv"), ("時尚", baseUrl + "/fashion/?chdtv"), ("體育", baseUrl + "/sports/?chdtv"), ("科技", baseUrl + "/technologynews/?chdtv"), ("玩食", baseUrl + "/travel/?chdtv"), ("新聞專輯", baseUrl + "/album/?chdtv"), ] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.xpath( '//section[contains(@class, "article-list")]/ul//li//h3[contains(@class, "title")]//a' ): if topic.text and topic.get("href"): resultList.append( self.create_article(topic.text.strip(), topic.get("href"))) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self):
    resultList = []
    for (name, url) in self.get_rss_links():
        try:
            # for each section, insert a title...
            resultList.append(self.create_section(name))
            # ... then parse the page and extract article links
            data = read_http_page(url)
            if data:
                doc = etree.fromstring(data, parser=etree.XMLParser(recover=True))
                for entry in doc.xpath('//rss/channel/item'):
                    title = entry.xpath('title')[0].text
                    link = entry.xpath('link')[0].text
                    abstract = entry.xpath('description')[0].text
                    resultList.append(self.create_article(title.strip(), link, abstract))
        except Exception as e:
            logger.exception('Problem processing rss: ' + str(e))
            logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__))

    return resultList
def _get_collection(self, section_id, d):
    payload_query = {
        "feedOffset": 0,
        "feedQuery": 'taxonomy.primary_section._id:"{}" AND type:story AND display_date:[now-24h/h TO now] AND NOT taxonomy.tags.text.raw:_no_show_for_web AND NOT taxonomy.tags.text.raw:_nohkad'.format(section_id),
        "feedSize": 100,
        "sort": "display_date:desc",
    }
    payload_query = urllib.parse.quote(json.dumps(payload_query))
    query_url = (
        self._base_url +
        "/pf/api/v3/content/fetch/query-feed?query={}&d={}&_website=tw-appledaily".format(payload_query, d))
    return read_http_page(query_url)
def get_articles(self): resultList = [] baseUrl = "https://api.theinitium.com" apiUrl = "https://api.theinitium.com/api/v2/channel/articles" sections = [ ("最新", apiUrl + "/?language=zh-hant&slug=latest"), ("香港", apiUrl + "/?language=zh-hant&slug=hongkong"), ("國際", apiUrl + "/?language=zh-hant&slug=international"), ("大陸", apiUrl + "/?language=zh-hant&slug=mainland"), ("台灣", apiUrl + "/?language=zh-hant&slug=taiwan"), ("評論", apiUrl + "/?language=zh-hant&slug=opinion"), ("科技", apiUrl + "/?language=zh-hant&slug=technology"), ("風物", apiUrl + "/?language=zh-hant&slug=culture"), ("廣場", apiUrl + "/?language=zh-hant&slug=notes-and-letters"), ] headers = urllib3.make_headers( basic_auth="anonymous:GiCeLEjxnqBcVpnp6cLsUvJievvRQcAXLv") headers['Accept'] = "application/json" try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links contents = json.loads(read_http_page(url, headers=headers)) for digest in contents["digests"]: article = digest["article"] if article and article["headline"] and article["url"]: resultList.append( self.create_article(article["headline"].strip(), baseUrl + article["url"], article["lead"])) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] baseUrl = "https://www.rfa.org/cantonese" sections = [ ("新聞", baseUrl + "/news"), ("港澳台新聞", baseUrl + "/news/htm"), ("評論", baseUrl + "/commentaries"), ("聚言堂", baseUrl + "/talkshows"), ("專題", baseUrl + "/features/hottopic"), ("多媒體", baseUrl + "/multimedia"), ] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.xpath( '//div[contains(@id, "topstorywidefulltease")]|//div[contains(@class, "sectionteaser")]' ): title = topic.xpath('h2/a') intro = topic.xpath('p') if title: title_text = title[0].xpath('span') resultList.append( self.create_article( title_text[0].text.strip(), title[0].get("href"), intro[0].text.strip() if intro and intro[0].text else None)) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] sections = [ ("金融經濟", "https://inews.hket.com", "/sran009/金融經濟", 3), ("理財", "https://wealth.hket.com", "/", 1), ("科技", "https://inews.hket.com", "/sran010/科技", 2), ("中國", "https://china.hket.com", "/", 1), ("國際", "https://inews.hket.com", "/sran011/國際", 2), ("商業", "https://inews.hket.com", "/sran012/商業", 2), ] seen_url = {} try: for (title, base_url, url, pages) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then get page and parse for page in range(1, pages + 1): doc = html.document_fromstring( read_http_page(base_url + url + "?p={}".format(page))) for topic in doc.xpath( '//div[contains(@class, "listing-widget-33") or contains(@class, "listing-widget-4") or contains(@class, "listing-widget-9")]/a[contains(@class, "listing-overlay")]' ): if topic.text and topic.get("href"): topic_url = (topic.get("href") if self._is_absolute(topic.get("href")) else base_url + topic.get("href")) if topic_url not in seen_url: seen_url[topic_url] = None resultList.append( self.create_article( topic.text.strip(), topic_url)) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): num_pages = 2 baseUrl = "https://news.ltn.com.tw" resultList = [] sections = [ ("熱門", baseUrl + "/ajax/breakingnews/popular/"), ("政治", baseUrl + "/ajax/breakingnews/politics/"), ("社會", baseUrl + "/ajax/breakingnews/society/"), ("地方", baseUrl + "/ajax/breakingnews/local/"), ("生活", baseUrl + "/ajax/breakingnews/life/"), ("國際", baseUrl + "/ajax/breakingnews/world/"), ] try: for page in range(1, num_pages): for (title, url) in sections: url = url + str(page) # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links result = json.loads( read_http_page(url + str(page)).decode("UTF-8")) if result.get("code", 0) == 200: data = result.get("data", []) for key in data.keys(): title = data[key].get("title", None) url = data[key].get("url", None) abstract = data[key].get("summary", None) if title and url: resultList.append( self.create_article(title, url, abstract)) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): maxPagePerSection = 10 resultList = [] sections = [('要聞港聞', 'http://www.singpao.com.hk/index.php?fi=news1'), ('兩岸國際', 'http://www.singpao.com.hk/index.php?fi=news8'), ('財經', 'http://www.singpao.com.hk/index.php?fi=news3'), ('娛樂', 'http://www.singpao.com.hk/index.php?fi=news4'), ('體育', 'http://www.singpao.com.hk/index.php?fi=news5'), ('副刊', 'http://www.singpao.com.hk/index.php?fi=news7'),] baseUrl = 'http://www.singpao.com.hk/' try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links page = 1 maxPage = 1 while page <= maxPage and page <= maxPagePerSection: doc = html.document_fromstring(read_http_page(url+'&page='+str(page))) page += 1 for topic in doc.xpath('//td/a[contains(@class, "list_title")]'): if topic.text and topic.get('href'): resultList.append(self.create_article(topic.text.strip(), baseUrl+topic.get('href'))) for pageIndex in doc.xpath('//a[contains(@class, "fpagelist_css")]'): if pageIndex.text is not None: match = re.match('^([0-9]+)$', pageIndex.text.strip()) if match and match.lastindex == 1 and int(match.group(1)) > maxPage: maxPage = int(match.group(1)) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self):
    resultList = []
    sections = [
        ('要聞港聞', 'http://hk.apple.nextmedia.com/news/index/'),
        ('兩岸國際', 'http://hk.apple.nextmedia.com/international/index/'),
        ('財經地產', 'http://hk.apple.nextmedia.com/financeestate/index/'),
        ('娛樂名人', 'http://hk.apple.nextmedia.com/entertainment/index/'),
        ('果籽', 'http://hk.apple.nextmedia.com/supplement/index/'),
    ]

    try:
        for (title, url) in sections:
            # for each section, insert a title...
            resultList.append(self.create_section(title))
            # ... then parse the page and extract article links
            doc = html.document_fromstring(read_http_page(url))
            for option in doc.get_element_by_id('article_ddl').xpath('//option'):
                if option.text and option.get('value'):
                    resultList.append(self.create_article(option.text.strip(), option.get('value')))
    except Exception as e:
        logger.exception('Problem processing url')

    return resultList
def get_articles(self): resultList = [] sections = [('港聞', 'http://www.takungpao.com.hk/hongkong/'), ('內地', 'http://www.takungpao.com.hk/mainland/'), ('台灣', 'http://www.takungpao.com.hk/taiwan/'), ('國際', 'http://www.takungpao.com.hk/international/'), ('評論', 'http://www.takungpao.com.hk/opinion/'), ('經濟', 'http://www.takungpao.com.hk/finance/'), ('文化', 'http://www.takungpao.com.hk/culture/'), ('體育', 'http://www.takungpao.com.hk/sports/'), ('娛樂', 'http://www.takungpao.com.hk/ent/'),] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.xpath('//div[contains(@class, "list_tuwen")]/div[contains(@class, "content")]'): title = topic.xpath('ul/li[contains(@class, "title")]/a') intro = topic.xpath('ul/li[contains(@class, "intro")]/a') if title and title[0].text and title[0].get('href'): resultList.append( self.create_article( \ title[0].text.strip(),\ title[0].get('href'), \ intro[0].text.strip() if intro and intro[0].text else None)) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] baseUrl = 'https://www.chinatimes.com' sections = [('政治', baseUrl + '/politic/?chdtv'), ('言論', baseUrl + '/opinion/?chdtv'), ('生活', baseUrl + '/life/?chdtv'), ('娛樂', baseUrl + '/star/?chdtv'), ('財經', baseUrl + '/money/?chdtv'), ('社會', baseUrl + '/society/?chdtv'), ('話題', baseUrl + '/hottopic/?chdtv'), ('國際', baseUrl + '/world/?chdtv'), ('軍事', baseUrl + '/armament/?chdtv'), ('兩岸', baseUrl + '/chinese/?chdtv'), ('時尚', baseUrl + '/fashion/?chdtv'), ('體育', baseUrl + '/sports/?chdtv'), ('科技', baseUrl + '/technologynews/?chdtv'), ('玩食', baseUrl + '/travel/?chdtv'), ('新聞專輯', baseUrl + '/album/?chdtv'),] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.xpath('//section[contains(@class, "article-list")]/ul//li//h3[contains(@class, "title")]//a'): if topic.text and topic.get('href'): resultList.append(self.create_article(topic.text.strip(), topic.get('href'))) except Exception as e: logger.exception('Problem processing url: ' + str(e)) logger.exception(traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] pages = 3 sections = [ ("新聞", "https://www.storm.mg/articles"), ("評論", "https://www.storm.mg/all-comment"), ("財經", "https://www.storm.mg/category/23083"), ("生活", "https://www.storm.mg/category/104"), ("人物", "https://www.storm.mg/category/171151"), ("華爾街日報", "https://www.storm.mg/category/173479"), ("新新聞", "https://www.storm.mg/category/87726"), ] try: for (title, url) in sections: resultList.append(self.create_section(title)) for page in range(1, pages + 1): # for each section, insert a title... # ... then parse the page and extract article links doc = html.document_fromstring( read_http_page(url + "/" + str(page))) # get the first featured article topic = doc.xpath( '//div[contains(@class, "category_top_card")]/div[contains(@class, "card_img_wrapper")]' ) if topic: title = topic[0].xpath( 'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "link_title")]' ) intro = topic[0].xpath( 'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "card_substance")]' ) title_text = title[0].xpath( "h2/text()") if title else None if title and title_text and title[0].get("href"): resultList.append( self.create_article( title_text[0].strip(), title[0].get("href"), intro[0].text.strip() if intro and intro[0].text else None, )) for topic in doc.xpath( '//div[contains(@class, "category_cards_wrapper")]/div[contains(@class, "category_card")]' ): title = topic.xpath( 'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "link_title")]' ) intro = topic.xpath( 'div[contains(@class, "card_inner_wrapper")]/a[contains(@class, "card_substance")]' ) title_text = title[0].xpath( "h3/text()") if title else None if title and title_text and title[0].get("href"): resultList.append( self.create_article( title_text[0].strip(), title[0].get("href"), intro[0].text.strip() if intro and intro[0].text else None, )) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): siteBaseUrl = "https://money.udn.com" baseUrl = siteBaseUrl + "/money/cate/" resultList = [] sections = [ ("要聞", baseUrl + "10846"), ("國際", baseUrl + "5588"), ("兩岸", baseUrl + "5589"), ("產業", baseUrl + "5591"), ("證券", baseUrl + "5590"), ("金融", baseUrl + "12017"), ("期貨", baseUrl + "11111"), ("理財", baseUrl + "5592"), ("房市", baseUrl + "5593"), ("專欄", baseUrl + "5595"), ("商情", baseUrl + "5597"), ] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(url)) for topic in doc.xpath( '//section[contains(@class, "cate-main__section")]/div[contains(@class, "story-headline-wrapper")]' ): # main stories first... link = topic.xpath( 'div[contains(@class, "story__content")]/a') title = topic.xpath( 'div[contains(@class, "story__content")]/a/h3') intro = topic.xpath( 'div[contains(@class, "story__content")]/a/p') title_text = title[0].text if title else None if title and title_text and link: resultList.append( self.create_article( title_text.strip(), siteBaseUrl + link[0].get("href"), intro[0].text.strip() if intro and intro[0].text else None, )) for topic in doc.xpath( '//section[contains(@class, "cate-main__section")]/ul[contains(@class, "story-flex-bt-wrapper")]' ): # ... then other stories titles = topic.xpath( 'li[contains(@class, "story__item")]/a') for title in titles: title_text = title.text if title_text: resultList.append( self.create_article( title_text.strip(), siteBaseUrl + title.get("href"), None, )) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): # get date first dateUrl = "http://www.mingpaocanada.com/TOR/" tor_time = datetime.now(pytz.timezone("America/Toronto")) if tor_time.hour < 4: tor_time = tor_time - timedelta(days=1) theDate = tor_time.strftime("%Y%m%d") try: doc = html.document_fromstring(read_http_page(dateUrl)) for aLink in doc.get_element_by_id("mp-menu").xpath( "//div/ul/li/a"): if aLink.text_content() == u"明報首頁": href = aLink.attrib["href"] match = re.match(r"htm\/News\/([0-9]{8})\/main_r\.htm", href) if match and match.lastindex == 1: theDate = match.group(1) else: logger.info("no date found. using system date: " + theDate) except Exception as e: logger.exception("Problem getting date: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) resultList = [] sections = [ ( "要聞", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/TAindex_r.htm", ), ( "加國新聞", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/TDindex_r.htm", ), ( "中國", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/TCAindex_r.htm", ), ( "國際", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/TTAindex_r.htm", ), ( "港聞", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/HK-GAindex_r.htm", ), ( "經濟", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/THindex_r.htm", ), ( "體育", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/TSindex_r.htm", ), ( "影視", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/HK-MAindex_r.htm", ), ( "副刊", "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/WWindex_r.htm", ), ] baseUrl = "http://www.mingpaocanada.com/TOR/htm/News/" + theDate + "/" try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring( read_http_page(url).decode("big5-hkscs", errors="ignore")) for topic in doc.xpath( '//h4[contains(@class, "listing-link")]/a'): if topic.text and topic.get("href"): resultList.append( self.create_article(topic.text.strip(), baseUrl + topic.get("href"))) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): resultList = [] sections = [ ( "要聞", "https://www.singtao.ca/category/52-%E5%A4%9A%E5%80%AB%E5%A4%9A%E8%A6%81%E8%81%9E/?variant=zh-hk", ), ( "加國新聞", "https://www.singtao.ca/category/54-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%8A%A0%E5%9C%8B/?variant=zh-hk", ), ( "城市", "https://www.singtao.ca/category/53-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%9F%8E%E5%B8%82/?variant=zh-hk", ), ( "港聞", "https://www.singtao.ca/category/57-%E5%A4%9A%E5%80%AB%E5%A4%9A%E6%B8%AF%E8%81%9E/?variant=zh-hk", ), ( "國際", "https://www.singtao.ca/category/56-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%9C%8B%E9%9A%9B/?variant=zh-hk", ), ( "中國", "https://www.singtao.ca/category/58-%E5%A4%9A%E5%80%AB%E5%A4%9A%E4%B8%AD%E5%9C%8B/?variant=zh-hk", ), ( "台灣", "https://www.singtao.ca/category/59-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%8F%B0%E7%81%A3/?variant=zh-hk", ), ( "財經", "https://www.singtao.ca/category/61-%E5%A4%9A%E5%80%AB%E5%A4%9A%E8%B2%A1%E7%B6%93/?variant=zh-hk", ), ( "體育", "https://www.singtao.ca/category/60-%E5%A4%9A%E5%80%AB%E5%A4%9A%E9%AB%94%E8%82%B2/?variant=zh-hk", ), ( "娛樂", "https://www.singtao.ca/category/62-%E5%A4%9A%E5%80%AB%E5%A4%9A%E5%A8%9B%E6%A8%82/?variant=zh-hk", ), ] try: for (title, url) in sections: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring( read_http_page(url, { "edition": "toronto" }).decode("utf-8")) # top story top_story_link = doc.xpath( '(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a' ) top_story_text = doc.xpath( '(//div[@class="td-ss-main-content"])[1]/div[@class="cat-header-image"]/a/div/h3' ) if top_story_link and top_story_text: resultList.append( self.create_article( top_story_text[0].text.strip(), top_story_link[0].get("href"), )) for topic in doc.xpath( '(//div[@class="td-ss-main-content"])[1]/div[contains(@class, "td-animation-stack")]/div[@class="item-details"]/h3/a' ): if topic.text and topic.get("href"): resultList.append( self.create_article(topic.text.strip(), topic.get("href"))) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList
def get_articles(self): topUrl = "http://orientaldaily.on.cc" sections = { 'news': { 'title': '要聞港聞', 'url': '' }, 'china_world': { 'title': '兩岸國際', 'url': '' }, 'finance': { 'title': '產經', 'url': '' }, 'entertainment': { 'title': '娛樂', 'url': '' }, 'lifestyle': { 'title': '副刊', 'url': '' }, 'sport': { 'title': '體育', 'url': '' } } try: doc = html.document_fromstring(read_http_page(topUrl)) if doc is not None: menu = doc.xpath( '//*[@id="pageCTN"]/header/div[contains(@class, "middle")]/ul[contains(@class, "menuList")]' ) if menu: for theLink in menu[0].xpath('li/a'): theClass = theLink.xpath('@class') if theLink.xpath('@href') and theClass and theClass[ 0] in sections: sections[theClass[0]][ 'url'] = topUrl + theLink.xpath('@href')[0] except Exception as e: logger.exception("Problem getting sections: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) resultList = [] baseUrl = topUrl try: for _, section in sections.items(): title = section['title'] sectionUrl = section['url'] if sectionUrl: # for each section, insert a title... resultList.append(self.create_section(title)) # ... then parse the page and extract article links doc = html.document_fromstring(read_http_page(sectionUrl)) if doc is not None: articles = doc.xpath( '//div[contains(@class, "sectionList")]/div[contains(@class, "subsection")]/ul[contains(@class, "items")]/li[@articleid]' ) for article in articles: articleUrls = article.xpath('a/@href') articleTexts = article.xpath( 'a/div[contains(@class, "text")]/text()') if articleUrls and articleTexts: resultList.append( self.create_article( articleTexts[0].strip(), baseUrl + articleUrls[0])) except Exception as e: logger.exception("Problem processing url: " + str(e)) logger.exception( traceback.format_exception(etype=type(e), value=e, tb=e.__traceback__)) return resultList