Example #1
    def get_contents_key(self, div):
        """Gets key tag from article. """
    
        if self.contents_key == 'read-time':
            key_tag = div.find('a', attrs={'class': ['reading-time']})
        else:
            url = 'http://' + BasicNewsRecipe.tag_to_string(div.find('a', attrs={'class': ['tool link']}))
            soup = self.browser.index_to_soup(url)
            key_tag = soup.find('title')

        return BasicNewsRecipe.tag_to_string(key_tag)
Example #2
 def extract_info(self, div):
     a = div.find('a', href=True)
     if a:
         url = self.base_url + a['href']
         title = BasicNewsRecipe.tag_to_string(a, use_alt=False)
         description = url
         pubdate = strftime('%a, %d %b')
         summary = div.find('p')
         if summary:
             description = BasicNewsRecipe.tag_to_string(summary, use_alt=False)
         return dict(title=title, url=url, date=pubdate, description=description, content='')
Example #3
    def get_contents_key(self, div):
        """Gets key tag from article. """

        if self.contents_key == 'read-time':
            key_tag = div.find('a', attrs={'class': ['reading-time']})
        else:
            url = 'http://' + BasicNewsRecipe.tag_to_string(
                div.find('a', attrs={'class': ['tool link']}))
            soup = self.browser.index_to_soup(url)
            key_tag = soup.find('title')

        return BasicNewsRecipe.tag_to_string(key_tag)
Example #4
    def get_contents_key(self, div):
        """Gets key tag from article. """
    
        if self.contents_key == 'read-time':
            key_tag = div.find('a', attrs={'class': ['reading-time']})
        elif self.contents_key == 'title-and-read-time':
            reading_time = ' (' + str(div.find('a', attrs={'class': ['reading-time']}).contents[0]).replace('<span>', '').replace('</span>', '') + ')'
            key_tag = div.find('a').contents[0].rstrip('\n') + reading_time
        else:
            url = 'http://' + BasicNewsRecipe.tag_to_string(div.find('a', attrs={'class': ['tool link']}))
            key_tag = '{uri.netloc}'.format(uri=urlparse(url))

        return BasicNewsRecipe.tag_to_string(key_tag)
Example #5
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        # ============== Here is the only change =================
        fetcher = RichRecursiveFetcher(self.web2disk_options, self.log,
                self.image_map, self.css_map,
                (url, f, a, num_of_feeds),
                image_formats=[PngFormat(), GifFormat(), JpegFormat()])
        # ========================================================
        fetcher.browser = br
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        res, path, failures = fetcher.start_fetch(url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _('The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures
Example #6
    def _fetch_article(self, url, dir_, f, a, num_of_feeds):
        br = self.browser
        if self.get_browser.im_func is BasicNewsRecipe.get_browser.im_func:
            # We are using the default get_browser, which means no need to
            # clone
            br = BasicNewsRecipe.get_browser(self)
        else:
            br = self.clone_browser(self.browser)
        self.web2disk_options.browser = br
        # ============== Here is the only change =================
        fetcher = RichRecursiveFetcher(
            self.web2disk_options,
            self.log,
            self.image_map,
            self.css_map, (url, f, a, num_of_feeds),
            image_formats=[PngFormat(), GifFormat(),
                           JpegFormat()])
        # ========================================================
        fetcher.browser = br
        fetcher.base_dir = dir_
        fetcher.current_dir = dir_
        fetcher.show_progress = False
        fetcher.image_url_processor = self.image_url_processor
        res, path, failures = fetcher.start_fetch(
            url), fetcher.downloaded_paths, fetcher.failed_links
        if not res or not os.path.exists(res):
            msg = _('Could not fetch article.') + ' '
            if self.debug:
                msg += _(
                    'The debug traceback is available earlier in this log')
            else:
                msg += _('Run with -vv to see the reason')
            raise Exception(msg)

        return res, path, failures
Example #7
 def extract_info(self, div):
     a = div.find('a', href=True)
     if a:
         url = self.base_url + a['href']
         title = BasicNewsRecipe.tag_to_string(a, use_alt=False)
         description = url
         pubdate = strftime('%a, %d %b')
         summary = div.find('p')
         if summary:
             description = BasicNewsRecipe.tag_to_string(summary,
                                                         use_alt=False)
         return dict(title=title,
                     url=url,
                     date=pubdate,
                     description=description,
                     content='')
Example #8
    def get_contents_key(self, div):
        """Gets key tag from article. """

        if self.contents_key == 'read-time':
            key_tag = div.find('a', attrs={'class': ['reading-time']})
        elif self.contents_key == 'title-and-read-time':
            reading_time = ' (' + str(
                div.find('a', attrs={
                    'class': ['reading-time']
                }).contents[0]).replace('<span>', '').replace('</span>',
                                                              '') + ')'
            key_tag = div.find('a').contents[0].rstrip('\n') + reading_time
        else:
            url = 'http://' + BasicNewsRecipe.tag_to_string(
                div.find('a', attrs={'class': ['tool link']}))
            key_tag = '{uri.netloc}'.format(uri=urlparse(url))

        return BasicNewsRecipe.tag_to_string(key_tag)
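For reference, a small standalone illustration (not taken from any recipe above) of the netloc extraction used in the fallback branch of Examples #4 and #8; the sample URL is invented:

    from urllib.parse import urlparse  # on Python 2: from urlparse import urlparse

    # '{uri.netloc}' keeps only the host part of the parsed URL.
    url = 'http://www.example.com/some/article'
    netloc = '{uri.netloc}'.format(uri=urlparse(url))  # -> 'www.example.com'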
Example #9
 def get_browser(self):
     br = BasicNewsRecipe.get_browser(self)
     if self.username is not None and self.password is not None:
         br.open('http://www.nytimes.com/auth/login')
         br.form = next(br.forms())
         br['userid']   = self.username
         br['password'] = self.password
         raw = br.submit().read()
         if 'Please try again' in raw:
             raise Exception('Your username and password are incorrect')
     return br
Example #10
 def get_browser(self):
     br = BasicNewsRecipe.get_browser(self)
     if self.username and self.password:
         if self.appURL == 'http://app.inthepoche.com':
             br.open(self.appURL + '/u/' + self.username)
         else:
             br.open(self.appURL)
         br.select_form(name='loginform')
         br['login'] = self.username
         br['password'] = self.password
         br.submit()
     return br
Example #11
 def get_cover_url(self):
     cover = None
     st = time.localtime()
     year = str(st.tm_year)
     month = "%.2d" % st.tm_mon
     day = "%.2d" % st.tm_mday
     cover = 'http://graphics8.nytimes.com/images/' + year + '/' +  month +'/' + day +'/nytfrontpage/scan.jpg'
     br = BasicNewsRecipe.get_browser(self)
     try:
         br.open(cover)
     except Exception:
         self.log("\nCover unavailable")
         cover = None
     return cover
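The year/month/day concatenation in Example #11 can also be expressed as a single strftime call. The snippet below only illustrates that formatting step, reusing the URL pattern from the example; it is not part of the original recipe:

    import time

    # Same dated front-page URL as Example #11, built in one call.
    cover = time.strftime(
        'http://graphics8.nytimes.com/images/%Y/%m/%d/nytfrontpage/scan.jpg',
        time.localtime())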
Example #12
 def get_browser(self):
     br = BasicNewsRecipe.get_browser(self)
     self.authentify_to_poche(br)
     return br
Example #13
    def parse_feeds(self):

        # Call parent's method.
        feeds = BasicNewsRecipe.parse_feeds(self)

        # Loop through all feeds.
        for feed in feeds:

            # Loop through all articles in feed.
            for article in feed.articles[:]:

                # Match key words and remove article if there's a match.

                # Most BBC rss feed video only 'articles' use upper case 'VIDEO'
                # as a title prefix. Just match upper case 'VIDEO', so that
                # articles like 'Video game banned' won't be matched and removed.
                if 'VIDEO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed audio only 'articles' use upper case 'AUDIO'
                # as a title prefix. Just match upper case 'AUDIO', so that
                # articles like 'Hi-Def audio...' won't be matched and removed.
                elif 'AUDIO' in article.title:
                    feed.articles.remove(article)

                # Most BBC rss feed photo slideshow 'articles' use 'In Pictures',
                # 'In pictures', and 'in pictures', somewhere in their title.
                # Match any case of that phrase.
                elif 'IN PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # As above, but user contributed pictures. Match any case.
                elif 'YOUR PICTURES' in article.title.upper():
                    feed.articles.remove(article)

                # 'Sportsday Live' are articles which contain a constantly and
                # dynamically updated 'running commentary' during a live sporting
                # event. Match any case.
                elif 'SPORTSDAY LIVE' in article.title.upper():
                    feed.articles.remove(article)

                # Sometimes 'Sportsday Live' (above) becomes 'Live - Sport Name'.
                # These are being matched below using 'Live - ' because removing all
                # articles with 'live' in their titles would remove some articles
                # that are in fact not live sports pages. Match any case.
                elif 'LIVE - ' in article.title.upper():
                    feed.articles.remove(article)

                # 'Quiz of the week' is a Flash player weekly news quiz. Match only
                # the 'Quiz of the' part in anticipation of monthly and yearly
                # variants. Match any case.
                elif 'QUIZ OF THE' in article.title.upper():
                    feed.articles.remove(article)

                # Remove articles with 'scorecards' in the url. These are BBC sports
                # pages which just display a cricket scorecard. The pages have a mass
                # of table and css entries to display the scorecards nicely. Probably
                # could make them work with this recipe, but might take a whole day
                # of work to sort out all the css - basically a formatting nightmare.
                elif 'scorecards' in article.url:
                    feed.articles.remove(article)

        return feeds

# End of class and file.
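The same title and URL filtering can be written more compactly with keyword lists. The sketch below is not from any of the recipes shown here; it assumes the same calibre BasicNewsRecipe context as Example #13 and reproduces its matching rules (case-sensitive for VIDEO/AUDIO, case-insensitive for the rest):

    def parse_feeds(self):
        # Sketch only: same behaviour as Example #13, expressed with keyword lists.
        feeds = BasicNewsRecipe.parse_feeds(self)
        exact_keywords = ['VIDEO', 'AUDIO']  # deliberately case-sensitive
        any_case_keywords = ['IN PICTURES', 'YOUR PICTURES', 'SPORTSDAY LIVE',
                             'LIVE - ', 'QUIZ OF THE']
        for feed in feeds:
            for article in feed.articles[:]:  # iterate over a copy while removing
                title_upper = article.title.upper()
                if (any(kw in article.title for kw in exact_keywords)
                        or any(kw in title_upper for kw in any_case_keywords)
                        or 'scorecards' in article.url):
                    feed.articles.remove(article)
        return feeds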
Example #14
 def get_browser(self):
     br = BasicNewsRecipe.get_browser(self)
     self.authentify_to_poche(br)
     return br
Example #15
 def __init__(self, indexPage):
     nb_results = BasicNewsRecipe.tag_to_string(indexPage.find('div', attrs={'class': 'nb-results'}))
     if nb_results is not None:
         numbersOnResult = re.findall(r'\d+', nb_results)
         self.articles_number = int(numbersOnResult[0]) if numbersOnResult else 1
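Finally, a small standalone illustration (with an invented sample string) of what the re.findall call in Example #15 extracts:

    import re

    # 'Showing 42 results' stands in for the text of the 'nb-results' div.
    nb_results = 'Showing 42 results'
    numbers_on_result = re.findall(r'\d+', nb_results)  # -> ['42']
    articles_number = int(numbers_on_result[0]) if numbers_on_result else 1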