def getFeed(self, request, queryset, *arg1, **arg2):
    """Fetch and ingest every feed in *queryset* (admin-action entry point).

    Feeds retrieved within the last 1200 minutes are considered fresh and
    skipped.  A successful (HTTP 200) fetch is parsed via ``__parse_feed``
    and the feed's ``last_retrieved`` timestamp is persisted.

    NOTE(review): *request*, *arg1* and *arg2* are unused here; they are
    kept for the admin-action calling convention -- confirm against caller.
    """
    logging.info(u'开始采集Feed')  # "starting feed collection"
    # Anything fetched after this cutoff is fresh enough to skip.
    feed_retrieval_deadline = datetime.now() - timedelta(minutes=1200)
    for feed in queryset:
        if feed.last_retrieved > feed_retrieval_deadline:
            logging.info('Skipping feed %s.', feed.feedurl)
            continue
        logging.info('Getting feed %s.', feed.feedurl)
        try:
            # presumably getpage(url, timeout_seconds) -- TODO confirm
            result = getpage(feed.feedurl, 30)
        except Exception:
            logging.warning(
                'Could not get feed %s ,and the fetch is restart now',
                feed.feedurl)
            feed.last_retrieved = datetime.now()
            #feed.save()
            # BUG FIX: was ``break`` -- one unreachable feed aborted the
            # whole run.  Skip only this feed and keep collecting.
            continue
        if result.code == 200:
            self.__parse_feed(result.read(), feed.feedurl, feed.stop_target,
                              feed.category, feed.latest, feed.start_target,
                              feed.mid_target, feed.end_target,
                              feed.allow_target)
            feed.last_retrieved = datetime.now()
            feed.save()
        elif result.code == 500:
            logging.error('Feed %s returned with status code 500.',
                          feed.feedurl)
        elif result.code == 404:
            logging.error('Error 404: Nothing found at %s.', feed.feedurl)
def getFeed(self, request, queryset, *arg1, **arg2):
    """Fetch and ingest every feed in *queryset* (admin-action entry point).

    Skips feeds retrieved within the last 1200 minutes; on HTTP 200 the
    body is handed to ``__parse_feed`` and ``last_retrieved`` is saved.

    NOTE(review): *request*, *arg1*, *arg2* are unused -- retained only
    for the admin-action signature; verify against the caller.
    """
    logging.info(u'开始采集Feed')  # "starting feed collection"
    # Cutoff below which a feed counts as freshly retrieved.
    feed_retrieval_deadline = datetime.now() - timedelta(minutes=1200)
    for feed in queryset:
        if feed.last_retrieved > feed_retrieval_deadline:
            logging.info('Skipping feed %s.', feed.feedurl)
            continue
        logging.info('Getting feed %s.', feed.feedurl)
        try:
            # presumably getpage(url, timeout_seconds) -- TODO confirm
            result = getpage(feed.feedurl, 30)
        except Exception:
            logging.warning(
                'Could not get feed %s ,and the fetch is restart now',
                feed.feedurl)
            feed.last_retrieved = datetime.now()
            #feed.save()
            # BUG FIX: was ``break`` -- a single failed fetch stopped the
            # entire collection run.  Move on to the next feed instead.
            continue
        if result.code == 200:
            self.__parse_feed(result.read(), feed.feedurl, feed.stop_target,
                              feed.category, feed.latest, feed.start_target,
                              feed.mid_target, feed.end_target,
                              feed.allow_target)
            feed.last_retrieved = datetime.now()
            feed.save()
        elif result.code == 500:
            logging.error('Feed %s returned with status code 500.',
                          feed.feedurl)
        elif result.code == 404:
            logging.error('Error 404: Nothing found at %s.', feed.feedurl)
def __parse_feed(self, feed_content, feed_url, stop_target, category,
                 feed_latest, start_target, mid_target, end_target,
                 allow_target):
    """Parse a fetched feed document and store each new entry as an article.

    Entries already known to ``__feedslist_check`` are skipped.  For new
    entries the original link, content, author and publication date are
    extracted, the content cleaned via ``htmllib``, and the result passed
    to ``__store_article``.

    NOTE(review): *feed_latest*, *start_target*, *mid_target*,
    *end_target* and *allow_target* are accepted but never read in this
    body -- kept for interface compatibility; confirm with callers.
    """
    feed = feedparser.parse(feed_content)
    i = 0        # number of new entries attempted
    dead_i = 0   # NOTE(review): logged each iteration but never incremented
    for entry in feed.entries:
        logging.info('start parse feed,the dead_i is %s', dead_i)
        title = htmllib.decoding(entry.title)
        content = ''
        date_published = datetime.now()
        author_name = ''
        # ROBUSTNESS FIX: define url before the try; previously a failure
        # early in the try left it unbound for __store_article below.
        url = ''
        Mystat = True  # NOTE(review): set but never read in visible code
        if not self.__feedslist_check(title):
            try:
                i += 1
                logging.info('beging to add new article No. %s', i)
                # Prefer the publisher's original link over the
                # FeedBurner proxy URL when present.
                if 'feedburner_origlink' in entry:
                    url = entry.feedburner_origlink
                else:
                    url = entry.link
                if 'content' in entry:
                    content = entry.content[0].value
                else:
                    content = entry.description
                if 'author' in entry:
                    author_name = entry.author
                else:
                    author_name = "转载"
                # Strip any HTML markup embedded in the title.
                stripper = HTMLStripper()
                stripper.feed(title)
                title = stripper.get_data()
                content = htmllib.decoding(content)
                content = htmllib.GetFeedclean(url, content, stop_target)
                if 'updated_parsed' in entry:
                    date_published = datetime(*entry.updated_parsed[:6])
                else:
                    date_published = datetime.now()
            except Exception as data:
                logging.warning(
                    'this like something happened,the error is %s', data)
            try:
                feedresult = self.__store_article(title, url, category,
                                                  content, date_published,
                                                  author_name, feed_url,
                                                  feed)
                if feedresult == True:
                    logging.info('The No.%s is fetched to the db', i)
                else:
                    logging.error('The No.%s is fetched Fail', i)
                    Mystat = False
            except Exception as data:
                logging.warning('the error is %s', data)
                Mystat = False
def __parse_feed(self, feed_content, feed_url, stop_target, category,
                 feed_latest, start_target, mid_target, end_target,
                 allow_target):
    """Parse a feed document and persist each previously-unseen entry.

    Titles recognised by ``__feedslist_check`` are skipped.  Otherwise
    the link, body, author and timestamp are extracted, cleaned through
    ``htmllib``, and stored with ``__store_article``.

    NOTE(review): *feed_latest*, *start_target*, *mid_target*,
    *end_target*, *allow_target* are unused here -- retained for
    interface compatibility; verify with callers.
    """
    feed = feedparser.parse(feed_content)
    i = 0        # count of entries attempted
    dead_i = 0   # NOTE(review): logged but never changed in this body
    for entry in feed.entries:
        logging.info('start parse feed,the dead_i is %s', dead_i)
        title = htmllib.decoding(entry.title)
        content = ''
        date_published = datetime.now()
        author_name = ''
        # ROBUSTNESS FIX: initialise url up front; an exception early in
        # the try previously left it unbound for the store step below.
        url = ''
        Mystat = True  # NOTE(review): written but never read here
        if not self.__feedslist_check(title):
            try:
                i += 1
                logging.info('beging to add new article No. %s', i)
                # Use the publisher's original link rather than the
                # FeedBurner redirect when available.
                if 'feedburner_origlink' in entry:
                    url = entry.feedburner_origlink
                else:
                    url = entry.link
                if 'content' in entry:
                    content = entry.content[0].value
                else:
                    content = entry.description
                if 'author' in entry:
                    author_name = entry.author
                else:
                    author_name = "转载"
                # Remove HTML tags from the title text.
                stripper = HTMLStripper()
                stripper.feed(title)
                title = stripper.get_data()
                content = htmllib.decoding(content)
                content = htmllib.GetFeedclean(url, content, stop_target)
                if 'updated_parsed' in entry:
                    date_published = datetime(*entry.updated_parsed[:6])
                else:
                    date_published = datetime.now()
            except Exception as data:
                logging.warning(
                    'this like something happened,the error is %s', data)
            try:
                feedresult = self.__store_article(title, url, category,
                                                  content, date_published,
                                                  author_name, feed_url,
                                                  feed)
                if feedresult == True:
                    logging.info('The No.%s is fetched to the db', i)
                else:
                    logging.error('The No.%s is fetched Fail', i)
                    Mystat = False
            except Exception as data:
                logging.warning('the error is %s', data)
                Mystat = False