def get_contents_key(self, div):
    """Return the key text for an article block.

    For the 'read-time' key this is the reading-time link's text;
    otherwise the linked page is fetched and its <title> text is used.
    """
    if self.contents_key == 'read-time':
        key_tag = div.find('a', attrs={'class': ['reading-time']})
    else:
        link = div.find('a', attrs={'class': ['tool link']})
        url = 'http://' + BasicNewsRecipe.tag_to_string(link)
        key_tag = self.browser.index_to_soup(url).find('title')
    return BasicNewsRecipe.tag_to_string(key_tag)
def extract_info(self, div):
    """Build an article dict from one listing <div>; None if it has no link."""
    anchor = div.find('a', href=True)
    if not anchor:
        return None
    url = self.base_url + anchor['href']
    title = BasicNewsRecipe.tag_to_string(anchor, use_alt=False)
    pubdate = strftime('%a, %d %b')
    # Fall back to the URL when the listing carries no summary paragraph.
    description = url
    summary = div.find('p')
    if summary:
        description = BasicNewsRecipe.tag_to_string(summary, use_alt=False)
    return dict(title=title, url=url, date=pubdate,
                description=description, content='')
def get_contents_key(self, div):
    """Return the key text for an article block.

    'read-time' uses the reading-time link directly; any other key
    resolves the tool link's URL and takes the target page's <title>.
    """
    if self.contents_key == 'read-time':
        return BasicNewsRecipe.tag_to_string(
            div.find('a', attrs={'class': ['reading-time']}))
    tool_text = BasicNewsRecipe.tag_to_string(
        div.find('a', attrs={'class': ['tool link']}))
    soup = self.browser.index_to_soup('http://' + tool_text)
    return BasicNewsRecipe.tag_to_string(soup.find('title'))
def get_contents_key(self, div):
    """Return the key text for an article block.

    Supports 'read-time', 'title-and-read-time' (title plus reading
    time in parentheses), and falls back to the tool link's host name.
    """
    if self.contents_key == 'read-time':
        key_tag = div.find('a', attrs={'class': ['reading-time']})
    elif self.contents_key == 'title-and-read-time':
        raw = str(div.find('a', attrs={'class': ['reading-time']}).contents[0])
        # Strip the literal span markup left by stringifying the node.
        reading_time = ' (' + raw.replace('<span>', '').replace('</span>', '') + ')'
        key_tag = div.find('a').contents[0].rstrip('\n') + reading_time
    else:
        url = 'http://' + BasicNewsRecipe.tag_to_string(
            div.find('a', attrs={'class': ['tool link']}))
        key_tag = '{uri.netloc}'.format(uri=urlparse(url))
    return BasicNewsRecipe.tag_to_string(key_tag)
def _fetch_article(self, url, dir_, f, a, num_of_feeds):
    """Download one article into *dir_* with a RichRecursiveFetcher.

    Returns (index_path, downloaded_paths, failed_links); raises when
    the fetch yields no result file on disk.
    """
    br = self.browser
    if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
        # Recipe uses the stock get_browser, so a fresh default browser
        # is enough — no need to clone the existing one.
        br = BasicNewsRecipe.get_browser(self)
    else:
        br = self.clone_browser(self.browser)
    self.web2disk_options.browser = br
    # ============== Here is the only change =================
    fetcher = RichRecursiveFetcher(
        self.web2disk_options, self.log, self.image_map, self.css_map,
        (url, f, a, num_of_feeds),
        image_formats=[PngFormat(), GifFormat(), JpegFormat()])
    # ========================================================
    fetcher.browser = br
    fetcher.base_dir = dir_
    fetcher.current_dir = dir_
    fetcher.show_progress = False
    fetcher.image_url_processor = self.image_url_processor
    res = fetcher.start_fetch(url)
    path, failures = fetcher.downloaded_paths, fetcher.failed_links
    if not res or not os.path.exists(res):
        msg = _('Could not fetch article.') + ' '
        if self.debug:
            msg += _('The debug traceback is available earlier in this log')
        else:
            msg += _('Run with -vv to see the reason')
        raise Exception(msg)
    return res, path, failures
def _fetch_article(self, url, dir_, f, a, num_of_feeds):
    """Fetch a single article page into *dir_*.

    Returns the fetched index path together with all downloaded paths
    and any failed links; raises if nothing usable was written.
    """
    br = self.browser
    if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
        # We are using the default get_browser, which means no need to clone.
        br = BasicNewsRecipe.get_browser(self)
    else:
        br = self.clone_browser(self.browser)
    self.web2disk_options.browser = br
    # ============== Here is the only change =================
    fetcher = RichRecursiveFetcher(
        self.web2disk_options, self.log, self.image_map, self.css_map,
        (url, f, a, num_of_feeds),
        image_formats=[PngFormat(), GifFormat(), JpegFormat()])
    # ========================================================
    fetcher.browser = br
    fetcher.base_dir = fetcher.current_dir = dir_
    fetcher.show_progress = False
    fetcher.image_url_processor = self.image_url_processor
    res = fetcher.start_fetch(url)
    if not res or not os.path.exists(res):
        reason = (_('The debug traceback is available earlier in this log')
                  if self.debug else _('Run with -vv to see the reason'))
        raise Exception(_('Could not fetch article.') + ' ' + reason)
    return res, fetcher.downloaded_paths, fetcher.failed_links
def extract_info(self, div):
    """Turn one listing <div> into an article dict, or None without a link."""
    a = div.find('a', href=True)
    if a is None:
        return None
    link_url = self.base_url + a['href']
    summary = div.find('p')
    # The URL stands in as the description when no summary exists.
    desc = (BasicNewsRecipe.tag_to_string(summary, use_alt=False)
            if summary else link_url)
    return {
        'title': BasicNewsRecipe.tag_to_string(a, use_alt=False),
        'url': link_url,
        'date': strftime('%a, %d %b'),
        'description': desc,
        'content': '',
    }
def get_contents_key(self, div):
    """Return the key text identifying an article's contents.

    Handles 'read-time' and 'title-and-read-time'; any other key falls
    back to the network location of the article's tool link.
    """
    key = self.contents_key
    if key == 'read-time':
        key_tag = div.find('a', attrs={'class': ['reading-time']})
    elif key == 'title-and-read-time':
        span = div.find('a', attrs={'class': ['reading-time']}).contents[0]
        # str() leaves literal span tags around the value; peel them off.
        minutes = str(span).replace('<span>', '').replace('</span>', '')
        title = div.find('a').contents[0].rstrip('\n')
        key_tag = title + ' (' + minutes + ')'
    else:
        tool = div.find('a', attrs={'class': ['tool link']})
        url = 'http://' + BasicNewsRecipe.tag_to_string(tool)
        key_tag = '{uri.netloc}'.format(uri=urlparse(url))
    return BasicNewsRecipe.tag_to_string(key_tag)
def get_browser(self):
    """Return a browser, logged in to nytimes.com when credentials exist.

    Raises:
        Exception: if the login form reports 'Please try again',
            i.e. the configured username/password were rejected.
    """
    # Bug fix: the unbound call BasicNewsRecipe.get_browser() omitted the
    # instance argument and would raise a TypeError; pass self, matching
    # how the other recipes in this file call it.
    br = BasicNewsRecipe.get_browser(self)
    if self.username is not None and self.password is not None:
        br.open('http://www.nytimes.com/auth/login')
        # next(iterator) instead of iterator.next(): works on both
        # Python 2 and Python 3 iterators.
        br.form = next(br.forms())
        br['userid'] = self.username
        br['password'] = self.password
        raw = br.submit().read()
        # NOTE(review): raw may be bytes under Python 3 — confirm the
        # substring test against the mechanize response type.
        if 'Please try again' in raw:
            raise Exception('Your username and password are incorrect')
    return br
def get_browser(self):
    """Log in to the configured Poche instance and return the browser."""
    br = BasicNewsRecipe.get_browser(self)
    if not (self.username and self.password):
        return br
    # The hosted service needs the per-user URL; self-hosted apps do not.
    login_url = self.appURL
    if self.appURL == 'http://app.inthepoche.com':
        login_url = self.appURL + '/u/' + self.username
    br.open(login_url)
    br.select_form(name='loginform')
    br['login'] = self.username
    br['password'] = self.password
    br.submit()
    return br
def get_cover_url(self):
    """Return today's NYT front-page scan URL, or None if unreachable."""
    st = time.localtime()
    # Build .../YYYY/MM/DD/nytfrontpage/scan.jpg in one format pass.
    cover = ('http://graphics8.nytimes.com/images/%04d/%02d/%02d'
             '/nytfrontpage/scan.jpg' % (st.tm_year, st.tm_mon, st.tm_mday))
    # Bug fix: BasicNewsRecipe.get_browser() was called unbound without
    # the instance argument; pass self as the other recipes do.
    br = BasicNewsRecipe.get_browser(self)
    try:
        br.open(cover)
    except Exception:
        # Narrowed from a bare except:, which would also have swallowed
        # KeyboardInterrupt/SystemExit.
        self.log("\nCover unavailable")
        cover = None
    return cover
def get_browser(self):
    """Return a browser already authenticated against the Poche server."""
    browser = BasicNewsRecipe.get_browser(self)
    self.authentify_to_poche(browser)
    return browser
def parse_feeds(self):
    """Filter the parent's feeds, dropping BBC non-article entries.

    Removes video/audio player pages, photo slideshows, live sports
    commentary, news quizzes and cricket scorecard pages — none of
    which convert usefully to e-book form.
    """
    feeds = BasicNewsRecipe.parse_feeds(self)
    for feed in feeds:
        # Iterate over a copy so removal mid-loop is safe.
        for article in feed.articles[:]:
            upper = article.title.upper()
            if (
                # BBC marks player-only pages with upper-case 'VIDEO' /
                # 'AUDIO' title prefixes; matching case-sensitively spares
                # real articles like 'Video game banned' or 'Hi-Def audio...'.
                'VIDEO' in article.title or
                'AUDIO' in article.title or
                # Photo slideshows (staff and user-contributed), any case.
                'IN PICTURES' in upper or
                'YOUR PICTURES' in upper or
                # Live running-commentary sports pages; 'Sportsday Live'
                # sometimes appears as 'Live - <sport>'.  'Live - ' is
                # matched (not plain 'live') to avoid false positives.
                'SPORTSDAY LIVE' in upper or
                'LIVE - ' in upper or
                # Flash-player quizzes; 'Quiz of the' also catches monthly
                # and yearly variants.
                'QUIZ OF THE' in upper or
                # Cricket scorecard pages — heavy table/css formatting that
                # this recipe does not attempt to render.
                'scorecards' in article.url
            ):
                feed.articles.remove(article)
    return feeds
def __init__(self, indexPage):
    """Read the result count displayed on *indexPage*.

    Sets self.articles_number to the first integer found inside the
    'nb-results' div, defaulting to 1 when the text holds no digits.
    """
    nb_results = BasicNewsRecipe.tag_to_string(
        indexPage.find('div', attrs={'class': 'nb-results'}))
    # Idiom fix: compare against None with 'is not', never '!='.
    if nb_results is not None:
        numbers = re.findall(r'\d+', nb_results)
        self.articles_number = int(numbers[0]) if numbers else 1