Example #1
    def extract_search_query(self):
        search_query = None
        address = qurl(self.address, remove=['_'])
        query = urlparse(address).query
        query_dict = parse_qs(query)
        if 'q' in query_dict:
            search_query = query_dict['q'][0]

        return search_query

    def paginate_objects(self, request, objects):
        paginator = Paginator(objects, self.paginate_by)
        page = request.GET.get('page', 1)

        try:
            object_list = paginator.page(page)
        except PageNotAnInteger:
            object_list = paginator.page(1)
        except EmptyPage:
            object_list = paginator.page(paginator.num_pages)

        pagination = {
            'count': object_list.paginator.count,
            'num_pages': object_list.paginator.num_pages,
            'previous_page_number': None,
            'previous_url': None,
            'next_page_number': None,
            'next_url': None
        }
        url = request.get_full_path()

        if object_list.has_next():
            next_page_number = object_list.next_page_number()
            pagination['next_page_number'] = next_page_number
            pagination['next_url'] = 'http://{0}{1}'.format(
                request.get_host(),
                qurl(
                    url,
                    add={'page': object_list.next_page_number()}
                )
            )

        if object_list.has_previous():
            previous_page_number = object_list.previous_page_number()
            pagination['previous_page_number'] = previous_page_number
            pagination['previous_url'] = 'http://{0}{1}'.format(
                request.get_host(),
                qurl(
                    url,
                    add={'page': object_list.previous_page_number()}
                )
            )

        return object_list, pagination
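The pagination helper above leans on qurl to rewrite one query parameter while leaving the rest of the URL intact. A minimal sketch of that behavior, assuming the PyPI qurl package's add/remove keyword API used throughout these examples:

from qurl import qurl

url = '/articles/?q=django&page=2'
# Replace (or add) the 'page' parameter, keeping 'q' untouched,
# as paginate_objects does when building next/previous URLs:
print(qurl(url, add={'page': 3}))    # e.g. /articles/?q=django&page=3
# Drop a parameter entirely, as extract_search_query does with '_':
print(qurl(url, remove=['page']))    # e.g. /articles/?q=django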
Example #3
    def extract_username(self):
        username = None
        try:
            address = qurl(self.address, remove=['_'])
            username_groups = re.search(r'twitter.com/(\w+)/?$', address)
            if not username_groups:
                return
            username = username_groups.group(1)
        except IndexError:
            return

        return username
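A quick illustration of how extract_username combines qurl and the regex: stripping the '_' cache-buster first is what lets the end-anchored pattern match. The address below is made up, and the expected result assumes qurl drops an emptied query string:

import re
from qurl import qurl

address = qurl('https://twitter.com/newsblur?_=1234', remove=['_'])
match = re.search(r'twitter.com/(\w+)/?$', address)
print(match.group(1) if match else None)  # newsblur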
Example #4
 def fetch(self):
     """ 
     Uses requests to download the feed, parsing it in feedparser. Will be storified later.
     """
     start = time.time()
     identity = self.get_identity()
     log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                         self.feed.title[:30],
                                                         self.feed.id,
                                                         datetime.datetime.now() - self.feed.last_update)
     logging.debug(log_msg)
     
     etag = self.feed.etag
     modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
     address = self.feed.feed_address
     
     if (self.options.get('force') or random.random() <= .01):
         self.options['force'] = True
         modified = None
         etag = None
         address = qurl(address, add={"_": random.randint(0, 10000)})
         logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                       self.feed.title[:30], address))
     elif (not self.feed.fetched_once or not self.feed.known_good):
         modified = None
         etag = None
     
     USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                   '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                   'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                   'Safari/534.48.3)' % (
                       self.feed.num_subscribers,
                       's' if self.feed.num_subscribers != 1 else '',
                       self.feed.permalink,
                  ))
     if self.options.get('feed_xml'):
         logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                       self.feed.title[:30], len(self.options.get('feed_xml'))))
     
     if self.options.get('fpf'):
         self.fpf = self.options.get('fpf')
         logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                       self.feed.title[:30]))
         return FEED_OK, self.fpf
     
     if 'youtube.com' in address:
         try:
             youtube_feed = self.fetch_youtube(address)
         except requests.adapters.ConnectionError:
             youtube_feed = None
         if not youtube_feed:
             logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' % 
                           (self.feed.title[:30], address))
             return FEED_ERRHTTP, None
         self.fpf = feedparser.parse(youtube_feed)
     elif re.match(r'(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])):
         twitter_feed = self.fetch_twitter(address)
         if not twitter_feed:
             logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s' % 
                           (self.feed.title[:30], address))
             return FEED_ERRHTTP, None
         self.fpf = feedparser.parse(twitter_feed)
     
     if not self.fpf:
         try:
             headers = {
                 'User-Agent': USER_AGENT,
                 'Accept-encoding': 'gzip, deflate',
                 'A-IM': 'feed',
             }
             if etag:
                 headers['If-None-Match'] = etag
             if modified:
                 # format into an RFC 1123-compliant timestamp. We can't use
                 # time.strftime() since the %a and %b directives can be affected
                 # by the current locale, but RFC 2616 states that dates must be
                 # in English.
                 short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
                 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                 modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
                 headers['If-Modified-Since'] = modified_header
             raw_feed = requests.get(address, headers=headers)
             if raw_feed.content:
                 response_headers = raw_feed.headers
                 response_headers['Content-Location'] = raw_feed.url
                 self.fpf = feedparser.parse(smart_unicode(raw_feed.content),
                                             response_headers=response_headers)
         except Exception as e:
             logging.debug(" ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.title[:30], unicode(e)[:100]))
         
         if not self.fpf:
             try:
                 self.fpf = feedparser.parse(address,
                                             agent=USER_AGENT,
                                             etag=etag,
                                             modified=modified)
             except (TypeError, ValueError, KeyError, EOFError) as e:
                 logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                               (self.feed.title[:30], e))
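The If-Modified-Since value built inline above avoids time.strftime() because the %a and %b directives are locale-dependent, while RFC 2616 requires English day and month names. The same logic as a standalone helper (a sketch, not part of the original code), taking the utctimetuple()[:7] slice the fetcher stores:

import datetime

def rfc1123_from_timetuple(modified):
    # modified is (year, month, day, hour, minute, second, weekday)
    short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    return '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
        short_weekdays[modified[6]], modified[2], months[modified[1] - 1],
        modified[0], modified[3], modified[4], modified[5])

print(rfc1123_from_timetuple(
    datetime.datetime(2011, 8, 1, 12, 0, 0).utctimetuple()[:7]))
# Mon, 01 Aug 2011 12:00:00 GMT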
Example #5
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()
        
        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                                  self.feed.title[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
                    
            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it (after 10 tries)
            if self.fpf.status == 301:
                if self.fpf.href.endswith('feedburner.com/atom.xml'):
                    return FEED_ERRHTTP, ret_values
                redirects, non_redirects = self.feed.count_redirects_in_history('feed')
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10-len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    address = self.fpf.href
                    if self.options['force'] and address:
                        address = qurl(address, remove=['_'])
                    self.feed.feed_address = address
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values
        
        if not self.fpf:
            logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.title[:30]))
            self.feed.save_feed_history(551, "Broken feed")
            return FEED_ERRHTTP, ret_values
            
        if self.fpf and not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        original_etag = self.feed.etag
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # sometimes this is None (though it never should be) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''
        if self.feed.etag != original_etag:
            self.feed.save(update_fields=['etag'])
            
        original_last_modified = self.feed.last_modified
        if hasattr(self.fpf, 'modified') and self.fpf.modified:
            try:
                self.feed.last_modified = datetime.datetime.strptime(self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
            except Exception as e:
                logging.debug("Broken mtime %s: %s" % (self.fpf.modified, e))
                self.feed.last_modified = None
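The strptime format here is the inverse of the RFC 1123 header that fetch() builds: the server echoes Last-Modified back as an English-named timestamp, which round-trips through the same layout. A quick check (note that %a/%b parsing is itself locale-sensitive, one more reason the call sits inside a try/except):

import datetime

print(datetime.datetime.strptime('Mon, 01 Aug 2011 12:00:00 GMT',
                                 '%a, %d %b %Y %H:%M:%S %Z'))
# 2011-08-01 12:00:00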
Example #6
    def fetch(self):
        """ 
        Uses requests to download the feed, parsing it in feedparser. Will be storified later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' %
                          (self.feed.title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            modified = None
            etag = None

        USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                      '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                      'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                      'Safari/534.48.3)' % (
                          self.feed.num_subscribers,
                          's' if self.feed.num_subscribers != 1 else '',
                          self.feed.permalink,
                      ))
        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                % (self.feed.title[:30], len(self.options.get('feed_xml'))))

        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.title[:30]))
            return FEED_OK, self.fpf

        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except requests.adapters.ConnectionError:
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                              (self.feed.title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)

        if not self.fpf:
            try:
                headers = {
                    'User-Agent': USER_AGENT,
                    'Accept-encoding': 'gzip, deflate',
                    'A-IM': 'feed',
                }
                if etag:
                    headers['If-None-Match'] = etag
                if modified:
                    # format into an RFC 1123-compliant timestamp. We can't use
                    # time.strftime() since the %a and %b directives can be affected
                    # by the current locale, but RFC 2616 states that dates must be
                    # in English.
                    short_weekdays = [
                        'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
                    ]
                    months = [
                        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                        'Sep', 'Oct', 'Nov', 'Dec'
                    ]
                    modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
                        short_weekdays[modified[6]], modified[2],
                        months[modified[1] - 1], modified[0], modified[3],
                        modified[4], modified[5])
                    headers['If-Modified-Since'] = modified_header
                raw_feed = requests.get(address, headers=headers)
                if raw_feed.content:
                    response_headers = raw_feed.headers
                    response_headers['Content-Location'] = raw_feed.url
                    self.fpf = feedparser.parse(
                        smart_unicode(raw_feed.content),
                        response_headers=response_headers)
            except Exception as e:
                logging.debug(
                    " ---> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
                    % (self.feed.title[:30], unicode(e)[:100]))

            if not self.fpf:
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=USER_AGENT,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError) as e:
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.title[:30], e))
Example #7
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)'
                        % (self.feed.title[:30], self.fpf.bozo_exception,
                           len(self.fpf.entries)))

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it (after 10 tries)
            if self.fpf.status == 301:
                if self.fpf.href.endswith('feedburner.com/atom.xml'):
                    return FEED_ERRHTTP, ret_values
                redirects, non_redirects = self.feed.count_redirects_in_history(
                    'feed')
                self.feed.save_feed_history(
                    self.fpf.status,
                    "HTTP Redirect (%d to go)" % (10 - len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    address = self.fpf.href
                    if self.options['force'] and address:
                        address = qurl(address, remove=['_'])
                    self.feed.feed_address = address
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status,
                                                "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf:
            logging.debug(
                "   ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!"
                % (self.feed.title[:30]))
            self.feed.save_feed_history(551, "Broken feed")
            return FEED_ERRHTTP, ret_values

        if self.fpf and not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception,
                                            feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed',
                                                self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(
                    self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception',
                                                self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        original_etag = self.feed.etag
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # sometimes this is None (though it never should be) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''
        if self.feed.etag != original_etag:
            self.feed.save(update_fields=['etag'])

        original_last_modified = self.feed.last_modified
        if hasattr(self.fpf, 'modified') and self.fpf.modified:
            try:
                self.feed.last_modified = datetime.datetime.strptime(
                    self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
            except Exception as e:
                logging.debug("Broken mtime %s: %s" % (self.fpf.modified, e))
                self.feed.last_modified = None
Example #8
class ProcessFeed:
    def __init__(self, feed_id, fpf, options):
        self.feed_id = feed_id
        self.options = options
        self.fpf = fpf

    def refresh_feed(self):
        self.feed = Feed.get_by_id(self.feed_id)
        if self.feed_id != self.feed.pk:
            logging.debug(" ***> Feed has changed: from %s to %s" %
                          (self.feed_id, self.feed.pk))
            self.feed_id = self.feed.pk

    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)'
                        % (self.feed.title[:30], self.fpf.bozo_exception,
                           len(self.fpf.entries)))

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it (after 10 tries)
            if self.fpf.status == 301:
                if self.fpf.href.endswith('feedburner.com/atom.xml'):
                    return FEED_ERRHTTP, ret_values
                redirects, non_redirects = self.feed.count_redirects_in_history(
                    'feed')
                self.feed.save_feed_history(
                    self.fpf.status,
                    "HTTP Redirect (%d to go)" % (10 - len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    address = self.fpf.href
                    if self.options['force'] and address:
                        address = qurl(address, remove=['_'])
                    self.feed.feed_address = address
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status,
                                                "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf:
            logging.debug(
                "   ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!"
                % (self.feed.title[:30]))
            self.feed.save_feed_history(551, "Broken feed")
            return FEED_ERRHTTP, ret_values

        if self.fpf and not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception,
                                            feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed',
                                                self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(
                    self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception',
                                                self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        original_etag = self.feed.etag
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # sometimes this is None (though it never should be) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''
        if self.feed.etag != original_etag:
            self.feed.save(update_fields=['etag'])

        original_last_modified = self.feed.last_modified
        if hasattr(self.fpf, 'modified') and self.fpf.modified:
            try:
                self.feed.last_modified = datetime.datetime.strptime(
                    self.fpf.modified, '%a, %d %b %Y %H:%M:%S %Z')
            except Exception as e:
                logging.debug("Broken mtime %s: %s" % (self.fpf.modified, e))
                self.feed.last_modified = None
        if self.feed.last_modified != original_last_modified:
            self.feed.save(update_fields=['last_modified'])

        self.fpf.entries = self.fpf.entries[:100]

        original_title = self.feed.feed_title
        if self.fpf.feed.get('title'):
            self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
        if self.feed.feed_title != original_title:
            self.feed.save(update_fields=['feed_title'])

        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            original_tagline = self.feed.data.feed_tagline
            self.feed.data.feed_tagline = smart_unicode(tagline)
            if self.feed.data.feed_tagline != original_tagline:
                self.feed.data.save(update_fields=['feed_tagline'])

        if not self.feed.feed_link_locked:
            new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get(
                'id') or self.feed.feed_link
            if self.options['force'] and new_feed_link:
                new_feed_link = qurl(new_feed_link, remove=['_'])
            if new_feed_link != self.feed.feed_link:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed's page is different: %s to %s"
                    %
                    (self.feed.title[:30], self.feed.feed_link, new_feed_link))
                redirects, non_redirects = self.feed.count_redirects_in_history(
                    'page')
                self.feed.save_page_history(
                    301, "HTTP Redirect (%s to go)" % (10 - len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    self.feed.feed_link = new_feed_link
                    self.feed.save(update_fields=['feed_link'])

        # Determine if stories aren't valid and replace broken guids
        guids_seen = set()
        permalinks_seen = set()
        for entry in self.fpf.entries:
            guids_seen.add(entry.get('guid'))
            permalinks_seen.add(Feed.get_permalink(entry))
        guid_difference = len(guids_seen) != len(self.fpf.entries)
        single_guid = len(guids_seen) == 1
        replace_guids = single_guid and guid_difference
        permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
        single_permalink = len(permalinks_seen) == 1
        replace_permalinks = single_permalink and permalink_difference

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_hashes = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            if replace_guids:
                if replace_permalinks:
                    new_story_guid = unicode(story.get('published'))
                    if self.options['verbose']:
                        logging.debug(
                            u'   ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s'
                            % (self.feed.title[:30], story.get('guid'),
                               new_story_guid))
                    story['guid'] = new_story_guid
                else:
                    new_story_guid = Feed.get_permalink(story)
                    if self.options['verbose']:
                        logging.debug(
                            u'   ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s'
                            % (self.feed.title[:30], story.get('guid'),
                               new_story_guid))
                    story['guid'] = new_story_guid
            story['story_hash'] = MStory.feed_guid_hash_unsaved(
                self.feed.pk, story.get('guid'))
            stories.append(story)
            story_hashes.append(story.get('story_hash'))

        existing_stories = dict((s.story_hash, s) for s in MStory.objects(
            story_hash__in=story_hashes,
            # story_date__gte=start_date,
            # story_feed_id=self.feed.pk
        ))

        ret_values = self.feed.add_update_stories(
            stories,
            existing_stories,
            verbose=self.options['verbose'],
            updates_off=self.options['updates_off'])

        if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links')
                and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub' and not hub_url:
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            push_expired = False
            if self.feed.is_push:
                try:
                    push_expired = self.feed.push.lease_expires < datetime.datetime.now()
                except PushSubscription.DoesNotExist:
                    self.feed.is_push = False
            if (hub_url and self_url and not settings.DEBUG
                    and self.feed.active_subscribers > 0
                    and (push_expired or not self.feed.is_push
                         or self.options.get('force'))):
                logging.debug(
                    u'   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' %
                    (self.feed.title[:30], "~SKRe-~SN" if push_expired else "",
                     hub_url))
                try:
                    PushSubscription.objects.subscribe(self_url,
                                                       feed=self.feed,
                                                       hub=hub_url)
                except TimeoutError:
                    logging.debug(
                        u'   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s'
                        % (self.feed.title[:30], hub_url))
            elif (self.feed.is_push
                  and (self.feed.active_subscribers <= 0 or not hub_url)):
                logging.debug(
                    u'   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' %
                    (self.feed.title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()

        logging.debug(
            u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s'
            % (self.feed.title[:30], '~FG~SB' if ret_values['new'] else '',
               ret_values['new'], '~FY~SB' if ret_values['updated'] else '',
               ret_values['updated'], '~SB' if ret_values['same'] else '',
               ret_values['same'], '~FR~SB' if ret_values['error'] else '',
               ret_values['error'], len(self.fpf.entries)))
        self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']),
                                        force=self.options['force'])
        if ret_values['new']:
            self.feed.trim_feed()
            self.feed.expire_redis()
        self.feed.save_feed_history(200, "OK")

        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' %
                          (self.feed.title[:30], time.time() - start))

        return FEED_OK, ret_values
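process() guards against feeds whose entries all share one guid: when the guids collapse to a single value, permalinks stand in for them, and when the permalinks collapse too, the published timestamp becomes the guid. A toy run of that heuristic with made-up entries:

entries = [
    {'guid': 'dup', 'link': '/a', 'published': '2020-01-01'},
    {'guid': 'dup', 'link': '/b', 'published': '2020-01-02'},
]
guids = set(e['guid'] for e in entries)
links = set(e['link'] for e in entries)
replace_guids = len(guids) == 1 and len(guids) != len(entries)
replace_permalinks = len(links) == 1 and len(links) != len(entries)
for e in entries:
    if replace_guids:
        # Fall back to the permalink, or to the timestamp if the
        # permalinks are broken as well.
        e['guid'] = e['published'] if replace_permalinks else e['link']
print([e['guid'] for e in entries])  # ['/a', '/b']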
Example #9
    def fetch(self):
        """ 
        Uses requests to download the feed, parsing it in feedparser. Will be storified later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.log_title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            if address.startswith('http'):
                address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' %
                          (self.feed.log_title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            modified = None
            etag = None

        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                %
                (self.feed.log_title[:30], len(self.options.get('feed_xml'))))

        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.log_title[:30]))
            return FEED_OK, self.fpf

        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except requests.adapters.ConnectionError:
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)
        elif re.match(r'(https?)?://twitter.com/\w+/?',
                      qurl(address, remove=['_'])):
            twitter_feed = self.fetch_twitter(address)
            if not twitter_feed:
                logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s' %
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(twitter_feed)
        elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address,
                                                        remove=['_'])):
            facebook_feed = self.fetch_facebook()
            if not facebook_feed:
                logging.debug(u'   ***> [%-30s] ~FRFacebook fetch failed: %s' %
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(facebook_feed)

        if not self.fpf:
            try:
                headers = self.feed.fetch_headers()
                if etag:
                    headers['If-None-Match'] = etag
                if modified:
                    # format into an RFC 1123-compliant timestamp. We can't use
                    # time.strftime() since the %a and %b directives can be affected
                    # by the current locale, but RFC 2616 states that dates must be
                    # in English.
                    short_weekdays = [
                        'Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'
                    ]
                    months = [
                        'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                        'Sep', 'Oct', 'Nov', 'Dec'
                    ]
                    modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (
                        short_weekdays[modified[6]], modified[2],
                        months[modified[1] - 1], modified[0], modified[3],
                        modified[4], modified[5])
                    headers['If-Modified-Since'] = modified_header
                if etag or modified:
                    headers['A-IM'] = 'feed'
                raw_feed = requests.get(address, headers=headers)
                if raw_feed.status_code >= 400:
                    logging.debug(
                        "   ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s"
                        % (self.feed.log_title[:30], raw_feed.status_code,
                           raw_feed.headers))
                    raw_feed = requests.get(
                        self.feed.feed_address,
                        headers=self.feed.fetch_headers(fake=True))

                if raw_feed.content and 'application/json' in raw_feed.headers.get(
                        'Content-Type', ""):
                    # JSON Feed
                    json_feed = self.fetch_json_feed(address, raw_feed)
                    if not json_feed:
                        logging.debug(
                            u'   ***> [%-30s] ~FRJSON fetch failed: %s' %
                            (self.feed.log_title[:30], address))
                        return FEED_ERRHTTP, None
                    self.fpf = feedparser.parse(json_feed)
                elif raw_feed.content and raw_feed.status_code < 400:
                    response_headers = raw_feed.headers
                    response_headers['Content-Location'] = raw_feed.url
                    self.raw_feed = smart_unicode(raw_feed.content)
                    self.fpf = feedparser.parse(
                        self.raw_feed, response_headers=response_headers)
                    if self.options.get('debug', False):
                        logging.debug(
                            " ---> [%-30s] ~FBFeed fetch status %s: %s length / %s"
                            % (self.feed.log_title[:30], raw_feed.status_code,
                               len(smart_unicode(
                                   raw_feed.content)), raw_feed.headers))
            except Exception as e:
                logging.debug(
                    "   ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s"
                    % (self.feed.log_title[:30], unicode(e)[:100]))

            if not self.fpf or self.options.get('force_fp', False):
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=self.feed.user_agent,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError,
                        MemoryError) as e:
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.log_title[:30], e))
Example #10
    def fetch(self):
        """ 
        Uses requests to download the feed, parsing it in feedparser. Will be storified later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (identity,
                                                            self.feed.log_title[:30],
                                                            self.feed.id,
                                                            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)
        
        etag = self.feed.etag
        modified = self.feed.last_modified.utctimetuple()[:7] if self.feed.last_modified else None
        address = self.feed.feed_address
        
        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' % (
                          self.feed.log_title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            modified = None
            etag = None
        
        if self.options.get('feed_xml'):
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s' % (
                          self.feed.log_title[:30], len(self.options.get('feed_xml'))))
        
        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.' % (
                          self.feed.log_title[:30]))
            return FEED_OK, self.fpf

        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except requests.adapters.ConnectionError:
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' % 
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)
        elif re.match(r'(https?)?://twitter.com/\w+/?$', qurl(address, remove=['_'])):
            twitter_feed = self.fetch_twitter(address)
            if not twitter_feed:
                logging.debug(u'   ***> [%-30s] ~FRTwitter fetch failed: %s' % 
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(twitter_feed)
        elif re.match(r'(.*?)facebook.com/\w+/?$', qurl(address, remove=['_'])):
            facebook_feed = self.fetch_facebook()
            if not facebook_feed:
                logging.debug(u'   ***> [%-30s] ~FRFacebook fetch failed: %s' % 
                              (self.feed.log_title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(facebook_feed)
        
        if not self.fpf:
            try:
                headers = self.feed.fetch_headers()
                if etag:
                    headers['If-None-Match'] = etag
                if modified:
                    # format into an RFC 1123-compliant timestamp. We can't use
                    # time.strftime() since the %a and %b directives can be affected
                    # by the current locale, but RFC 2616 states that dates must be
                    # in English.
                    short_weekdays = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun']
                    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                    modified_header = '%s, %02d %s %04d %02d:%02d:%02d GMT' % (short_weekdays[modified[6]], modified[2], months[modified[1] - 1], modified[0], modified[3], modified[4], modified[5])
                    headers['If-Modified-Since'] = modified_header
                if etag or modified:
                    headers['A-IM'] = 'feed'
                raw_feed = requests.get(address, headers=headers)
                if raw_feed.status_code >= 400:
                    logging.debug("   ***> [%-30s] ~FRFeed fetch was %s status code, trying fake user agent: %s" % (self.feed.log_title[:30], raw_feed.status_code, raw_feed.headers))
                    raw_feed = requests.get(self.feed.feed_address, headers=self.feed.fetch_headers(fake=True))
                
                if raw_feed.content and 'application/json' in raw_feed.headers.get('Content-Type', ""):
                    # JSON Feed
                    json_feed = self.fetch_json_feed(address, raw_feed)
                    if not json_feed:
                        logging.debug(u'   ***> [%-30s] ~FRJSON fetch failed: %s' % 
                                      (self.feed.log_title[:30], address))
                        return FEED_ERRHTTP, None
                    self.fpf = feedparser.parse(json_feed)
                elif raw_feed.content and raw_feed.status_code < 400:
                    response_headers = raw_feed.headers
                    response_headers['Content-Location'] = raw_feed.url
                    self.raw_feed = smart_unicode(raw_feed.content)
                    self.fpf = feedparser.parse(self.raw_feed,
                                                response_headers=response_headers)
                    if self.options.get('debug', False):
                        logging.debug(" ---> [%-30s] ~FBFeed fetch status %s: %s length / %s" % (self.feed.log_title[:30], raw_feed.status_code, len(smart_unicode(raw_feed.content)), raw_feed.headers))
            except Exception as e:
                logging.debug("   ***> [%-30s] ~FRFeed failed to fetch with request, trying feedparser: %s" % (self.feed.log_title[:30], unicode(e)[:100]))
            
            if not self.fpf or self.options.get('force_fp', False):
                try:
                    self.fpf = feedparser.parse(address,
                                                agent=self.feed.user_agent,
                                                etag=etag,
                                                modified=modified)
                except (TypeError, ValueError, KeyError, EOFError, MemoryError) as e:
                    logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                                  (self.feed.log_title[:30], e))
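The ETag and Last-Modified headers in fetch() implement an HTTP conditional GET: the stored validators go back to the server, and a 304 response means the feed is unchanged and parsing can be skipped. A self-contained sketch against a hypothetical feed URL (all values below are placeholders):

import requests

address = 'https://example.com/feed.xml'  # hypothetical
headers = {
    'If-None-Match': '"abc123"',                           # stored etag
    'If-Modified-Since': 'Mon, 01 Aug 2011 12:00:00 GMT',  # stored mtime
}
resp = requests.get(address, headers=headers)
if resp.status_code == 304:
    print('Feed not modified; skipping parse')
else:
    print('Fetched %d bytes' % len(resp.content))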
Example #11
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()
        
        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                                  self.feed.title[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
                    
            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it (after 10 tries)
            if self.fpf.status == 301:
                if self.fpf.href.endswith('feedburner.com/atom.xml'):
                    return FEED_ERRHTTP, ret_values
                redirects, non_redirects = self.feed.count_redirects_in_history('feed')
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (10-len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    address = self.fpf.href
                    if self.options['force'] and address:
                        address = qurl(address, remove=['_'])
                    self.feed.feed_address = address
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values
        
        if not self.fpf:
            logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. No feedparser feed either!" % (self.feed.title[:30]))
            self.feed.save_feed_history(551, "Broken feed")
            return FEED_ERRHTTP, ret_values
            
        if self.fpf and not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        original_etag = self.feed.etag
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # sometimes this is None (though it never should be) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''
        if self.feed.etag != original_etag:
            self.feed.save(update_fields=['etag'])
            
        original_last_modified = self.feed.last_modified
        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except Exception:
            self.feed.last_modified = None
        if self.feed.last_modified != original_last_modified:
            self.feed.save(update_fields=['last_modified'])
        
        self.fpf.entries = self.fpf.entries[:100]
        
        original_title = self.feed.feed_title
        if self.fpf.feed.get('title'):
            self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
        if self.feed.feed_title != original_title:
            self.feed.save(update_fields=['feed_title'])
        
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            original_tagline = self.feed.data.feed_tagline
            self.feed.data.feed_tagline = smart_unicode(tagline)
            if self.feed.data.feed_tagline != original_tagline:
                self.feed.data.save(update_fields=['feed_tagline'])

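        # A changed feed link is treated as a page redirect; the new link is
        # only adopted after 10 recorded redirects (or when the page history
        # holds nothing but redirects).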
        if not self.feed.feed_link_locked:
            new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
            if self.options['force'] and new_feed_link:
                new_feed_link = qurl(new_feed_link, remove=['_'])
            if new_feed_link != self.feed.feed_link:
                logging.debug("   ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.title[:30], self.feed.feed_link, new_feed_link))               
                redirects, non_redirects = self.feed.count_redirects_in_history('page')
                self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (10-len(redirects)))
                if len(redirects) >= 10 or len(non_redirects) == 0:
                    self.feed.feed_link = new_feed_link
                    self.feed.save(update_fields=['feed_link'])
        
        # Determine whether the stories' guids are valid and replace broken ones
        guids_seen = set()
        permalinks_seen = set()
        for entry in self.fpf.entries:
            guids_seen.add(entry.get('guid'))
            permalinks_seen.add(Feed.get_permalink(entry))
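        # If every entry shares a single guid (or a single permalink), the
        # publisher's ids are broken and get replaced in the loop below.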
        guid_difference = len(guids_seen) != len(self.fpf.entries)
        single_guid = len(guids_seen) == 1
        replace_guids = single_guid and guid_difference
        permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
        single_permalink = len(permalinks_seen) == 1
        replace_permalinks = single_permalink and permalink_difference
        
        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_hashes = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            if replace_guids:
                if replace_permalinks:
                    new_story_guid = unicode(story.get('published'))
                    if self.options['verbose']:
                        logging.debug(u'   ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % (
                                      self.feed.title[:30],
                                      story.get('guid'), new_story_guid))
                    story['guid'] = new_story_guid
                else:
                    new_story_guid = Feed.get_permalink(story)
                    if self.options['verbose']:
                        logging.debug(u'   ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % (
                                      self.feed.title[:30],
                                      story.get('guid'), new_story_guid))
                    story['guid'] = new_story_guid
            story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
            stories.append(story)
            story_hashes.append(story.get('story_hash'))

        existing_stories = dict((s.story_hash, s) for s in MStory.objects(
            story_hash__in=story_hashes,
            # story_date__gte=start_date,
            # story_feed_id=self.feed.pk
        ))

        ret_values = self.feed.add_update_stories(stories, existing_stories,
                                                  verbose=self.options['verbose'],
                                                  updates_off=self.options['updates_off'])

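        # Look for PubSubHubbub (PuSH) 'hub' and 'self' links advertised by
        # the feed so a real-time push subscription can be (re)established.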
        if (hasattr(self.fpf, 'feed') and 
            hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub' and not hub_url:
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            push_expired = False
            if self.feed.is_push:
                try:
                    push_expired = self.feed.push.lease_expires < datetime.datetime.now()
                except PushSubscription.DoesNotExist:
                    self.feed.is_push = False
            if (hub_url and self_url and not settings.DEBUG and
                self.feed.active_subscribers > 0 and
                (push_expired or not self.feed.is_push or self.options.get('force'))):
                logging.debug(u'   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
                              self.feed.title[:30],
                              "~SKRe-~SN" if push_expired else "", hub_url))
                try:
                    PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
                except TimeoutError:
                    logging.debug(u'   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
                                  self.feed.title[:30], hub_url))                    
            elif (self.feed.is_push and 
                  (self.feed.active_subscribers <= 0 or not hub_url)):
                logging.debug(u'   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
                              self.feed.title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()

        logging.debug(u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
                      self.feed.title[:30], 
                      '~FG~SB' if ret_values['new'] else '', ret_values['new'],
                      '~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
                      '~SB' if ret_values['same'] else '', ret_values['same'],
                      '~FR~SB' if ret_values['error'] else '', ret_values['error'],
                      len(self.fpf.entries)))
        self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force'])
        if ret_values['new']:
            self.feed.trim_feed()
            self.feed.expire_redis()
        self.feed.save_feed_history(200, "OK")

        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
                          self.feed.title[:30], time.time() - start))
        
        return FEED_OK, ret_values
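The guid-repair heuristic above is compact but easy to miss. Below is a
minimal, standalone sketch of the same idea, using a hypothetical
repair_guids helper over plain dict entries (the real code goes through
Feed.get_permalink and hashes the result into story_hash):

def repair_guids(entries):
    """Replace guids when a publisher reuses a single guid (or a single
    permalink) for every entry in the feed."""
    guids = set(e.get('guid') for e in entries)
    permalinks = set(e.get('link') for e in entries)
    if len(guids) != 1 or len(entries) <= 1:
        return entries  # guids look distinct; nothing to repair
    for e in entries:
        if len(permalinks) == 1:
            # Permalinks are broken too: fall back to the publish timestamp.
            e['guid'] = unicode(e.get('published'))
        else:
            e['guid'] = e.get('link')
    return entries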
Example #12
    def fetch(self):
        """ 
        Uses feedparser to download the feed. Will be parsed later.
        """
        start = time.time()
        identity = self.get_identity()
        log_msg = u'%2s ---> [%-30s] ~FYFetching feed (~FB%d~FY), last update: %s' % (
            identity, self.feed.title[:30], self.feed.id,
            datetime.datetime.now() - self.feed.last_update)
        logging.debug(log_msg)

        etag = self.feed.etag
        modified = self.feed.last_modified.utctimetuple(
        )[:7] if self.feed.last_modified else None
        address = self.feed.feed_address

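        # Roughly 1% of fetches are randomly forced: the etag/modified
        # validators are dropped and a cache-busting "_" query parameter is
        # appended to punch through stale caches and CDNs.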
        if (self.options.get('force') or random.random() <= .01):
            self.options['force'] = True
            modified = None
            etag = None
            address = qurl(address, add={"_": random.randint(0, 10000)})
            logging.debug(u'   ---> [%-30s] ~FBForcing fetch: %s' %
                          (self.feed.title[:30], address))
        elif (not self.feed.fetched_once or not self.feed.known_good):
            modified = None
            etag = None

        USER_AGENT = ('NewsBlur Feed Fetcher - %s subscriber%s - %s '
                      '(Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_1) '
                      'AppleWebKit/534.48.3 (KHTML, like Gecko) Version/5.1 '
                      'Safari/534.48.3)' % (
                          self.feed.num_subscribers,
                          's' if self.feed.num_subscribers != 1 else '',
                          self.feed.permalink,
                      ))
        if self.options.get('feed_xml'):
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed has been fat pinged. Ignoring fat: %s'
                % (self.feed.title[:30], len(self.options.get('feed_xml'))))

        if self.options.get('fpf'):
            self.fpf = self.options.get('fpf')
            logging.debug(
                u'   ---> [%-30s] ~FM~BKFeed fetched in real-time with fat ping.'
                % (self.feed.title[:30]))
            return FEED_OK, self.fpf

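        # YouTube addresses don't serve a regular feed; fetch them through a
        # dedicated helper and run the result through feedparser.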
        if 'youtube.com' in address:
            try:
                youtube_feed = self.fetch_youtube(address)
            except (requests.adapters.ConnectionError):
                youtube_feed = None
            if not youtube_feed:
                logging.debug(u'   ***> [%-30s] ~FRYouTube fetch failed: %s.' %
                              (self.feed.title[:30], address))
                return FEED_ERRHTTP, None
            self.fpf = feedparser.parse(youtube_feed)

        if not self.fpf:
            try:
                self.fpf = feedparser.parse(address,
                                            agent=USER_AGENT,
                                            etag=etag,
                                            modified=modified)
            except (TypeError, ValueError, KeyError, EOFError), e:
                logging.debug(u'   ***> [%-30s] ~FRFeed fetch error: %s' %
                              (self.feed.title[:30], e))
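For reference, the etag/last-modified handshake this fetcher leans on can be
exercised on its own. A minimal sketch, assuming feedparser 5.x behavior
(conditional_fetch and the URL are placeholders; an HTTP 304 status means the
feed is unchanged and the cached copy should be reused):

import feedparser

def conditional_fetch(url, etag=None, modified=None):
    d = feedparser.parse(url, etag=etag, modified=modified)
    if getattr(d, 'status', None) == 304:
        # Nothing changed server-side; keep the old etag/modified pair.
        return None, etag, modified
    # New content: remember the validators the server handed back.
    return d, d.get('etag'), d.get('modified')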