Beispiel #1
0
    def getFeed(self, request, queryset, *arg1, **arg2):
        """Fetch and parse every feed in *queryset* that is due for retrieval.

        A feed is considered fresh (and skipped) when it was last retrieved
        less than 1200 minutes ago.  For each due feed the page is fetched
        with a 30-second timeout; an HTTP 200 body is parsed and the feed's
        ``last_retrieved`` timestamp saved, while 500/404 responses are only
        logged.

        BUGFIX: a fetch exception previously executed ``break``, so one
        unreachable feed aborted collection of every remaining feed.  It now
        logs the failure and ``continue``s with the next feed.
        """
        logging.info(u'开始采集Feed')  # "start collecting feeds"
        # Feeds retrieved after this moment are fresh and are skipped below.
        feed_retrieval_deadline = datetime.now() - timedelta(minutes=1200)

        for feed in queryset:

            if feed.last_retrieved > feed_retrieval_deadline:
                logging.info('Skipping feed %s.', feed.feedurl)
                continue

            logging.info('Getting feed %s.', feed.feedurl)
            try:
                result = getpage(feed.feedurl, 30)
            except Exception:
                # Skip only the failing feed instead of aborting the run.
                logging.warning('Could not get feed %s, skipping it.',
                                feed.feedurl)
                feed.last_retrieved = datetime.now()
                # NOTE(review): the timestamp is deliberately not persisted
                # on failure, so the feed is retried on the next run.
                #feed.save()
                continue

            if result.code == 200:
                self.__parse_feed(result.read(), feed.feedurl,
                                  feed.stop_target, feed.category,
                                  feed.latest, feed.start_target,
                                  feed.mid_target, feed.end_target,
                                  feed.allow_target)

                feed.last_retrieved = datetime.now()
                feed.save()

            elif result.code == 500:
                logging.error('Feed %s returned with status code 500.' %
                              feed.feedurl)
            elif result.code == 404:
                logging.error('Error 404: Nothing found at %s.' % feed.feedurl)
Beispiel #2
0
    def getFeed(self, request, queryset, *arg1, **arg2):
        """Collect every feed in *queryset* whose last retrieval is older
        than the 1200-minute freshness window.

        Recently-retrieved feeds are skipped.  A fetch failure stops the
        whole run (``break``).  An HTTP 200 response is parsed via
        ``__parse_feed`` and the feed's ``last_retrieved`` timestamp is
        saved; 500 and 404 responses are only logged.
        """
        logging.info(u'开始采集Feed')
        deadline = datetime.now() - timedelta(minutes=1200)

        for current in queryset:
            # Fresh enough -- nothing to do for this feed.
            if current.last_retrieved > deadline:
                logging.info('Skipping feed %s.', current.feedurl)
                continue

            logging.info('Getting feed %s.', current.feedurl)
            try:
                page = getpage(current.feedurl, 30)
            except Exception:
                logging.warning(
                    'Could not get feed %s ,and the fetch is restart now' %
                    current.feedurl)
                current.last_retrieved = datetime.now()
                # NOTE(review): persisting the timestamp is disabled here,
                # and the whole run stops on the first failure.
                break

            status = page.code
            if status == 200:
                self.__parse_feed(page.read(), current.feedurl,
                                  current.stop_target, current.category,
                                  current.latest, current.start_target,
                                  current.mid_target, current.end_target,
                                  current.allow_target)
                current.last_retrieved = datetime.now()
                current.save()
            elif status == 500:
                logging.error('Feed %s returned with status code 500.' %
                              current.feedurl)
            elif status == 404:
                logging.error('Error 404: Nothing found at %s.' %
                              current.feedurl)
Beispiel #3
0
    def __parse_feed(self, feed_content, feed_url, stop_target, category,
                     feed_latest, start_target, mid_target, end_target,
                     allow_target):
        """Parse raw feed markup and store each entry not already known.

        For every entry whose title fails ``__feedslist_check``, the URL
        (preferring the FeedBurner original link), content, author and
        publication date are extracted with fallbacks for missing fields,
        the title/content are cleaned, and the result is handed to
        ``__store_article``.  Extraction and storage errors are logged and
        do not abort the loop.

        Fixes applied: ``entry.has_key(k)`` replaced by ``k in entry``
        (``has_key`` is deprecated on Python 2 and removed on Python 3;
        feedparser's entry dict supports ``in`` on both) and the
        Python-2-only ``except Exception, data`` replaced by
        ``except Exception as data`` (valid on 2.6+ and 3.x).
        """
        feed = feedparser.parse(feed_content)
        i = 0        # count of entries handed to storage
        dead_i = 0   # NOTE(review): logged below but never incremented in the visible code
        for entry in feed.entries:
            logging.info('start parse feed,the dead_i is %s', dead_i)
            title = htmllib.decoding(entry.title)
            categorie_keys = []
            content = ''
            date_published = datetime.now()
            author_name = ''
            Mystat = True  # NOTE(review): set on failure but never read in the visible code
            if self.__feedslist_check(title) == False:
                try:
                    i += 1
                    url = ''
                    logging.info('beging to add new article No. %s', i)
                    # Prefer the FeedBurner original link when present.
                    if 'feedburner_origlink' in entry:
                        url = entry.feedburner_origlink
                    else:
                        url = entry.link
                    if 'content' in entry:
                        content = entry.content[0].value
                    else:
                        content = entry.description
                    if 'author' in entry:
                        author_name = entry.author
                    else:
                        author_name = "转载"
                    # Strip HTML tags from the title.
                    stripper = HTMLStripper()
                    stripper.feed(title)
                    title = stripper.get_data()
                    content = htmllib.decoding(content)
                    content = htmllib.GetFeedclean(url, content, stop_target)
                    if 'updated_parsed' in entry:
                        date_published = datetime(*entry.updated_parsed[:6])
                    else:
                        date_published = datetime.now()
                except Exception as data:
                    logging.warning(
                        'this like something happened,the error is %s', data)

                try:
                    feedresult = self.__store_article(title, url, category,
                                                      content, date_published,
                                                      author_name, feed_url,
                                                      feed)
                    if feedresult == True:
                        logging.info('The No.%s  is fetched to the db', i)
                    else:
                        logging.error('The No.%s is fetched Fail', i)
                        Mystat = False
                except Exception as data:
                    logging.warning('the error is %s', data)
                    Mystat = False
Beispiel #4
0
    def __parse_feed(self, feed_content, feed_url, stop_target, category, feed_latest, start_target, mid_target, end_target, allow_target):
                """Parse raw feed markup and store each entry not already known.

                For every entry whose title fails ``__feedslist_check``, the
                URL (preferring the FeedBurner original link), content,
                author and publication date are extracted with fallbacks for
                missing fields, then handed to ``__store_article``.
                Extraction and storage errors are logged and do not abort
                the loop.

                NOTE(review): this body is Python-2-only (``entry.has_key``
                and ``except Exception, data`` syntax).  ``feed_latest`` and
                the ``*_target`` parameters other than ``stop_target`` are
                unused in the visible code.
                """
                feed = feedparser.parse(feed_content)
                i = 0
                # NOTE(review): dead_i is logged but never incremented in the visible code.
                dead_i = 0
                for entry in feed.entries:
                        logging.info('start parse feed,the dead_i is %s', dead_i)
                        title = htmllib.decoding(entry.title)
                        categorie_keys = []
                        content = ''
                        date_published = datetime.now()
                        author_name = ''
                        # NOTE(review): Mystat is set on failure but never read in the visible code.
                        Mystat = True
                        if self.__feedslist_check(title) == False:
                            try:
                                    i += 1
                                    url = ''
                                    logging.info('beging to add new article No. %s', i)
                                    # Prefer the FeedBurner original link when present.
                                    if(entry.has_key('feedburner_origlink')):
                                            url = entry.feedburner_origlink
                                    else:
                                            url = entry.link
                                    if entry.has_key('content'):
                                            content = entry.content[0].value
                                    else:
                                            content = entry.description
                                    if entry.has_key('author'):
                                            author_name = entry.author
                                    else:
                                            author_name = "转载"
                                    # Strip HTML tags from the title.
                                    stripper = HTMLStripper()
                                    stripper.feed(title)
                                    title = stripper.get_data()
                                    content = htmllib.decoding(content)
                                    content = htmllib.GetFeedclean(url, content, stop_target)
                                    if(entry.has_key('updated_parsed')):
                                            date_published = datetime(*entry.updated_parsed[:6])
                                    else:
                                            date_published = datetime.now()
                            except Exception, data:
                                    logging.warn('this like something happened,the error is %s', data)

                            try:
                                    feedresult = self.__store_article(title, url, category, content, date_published, author_name, feed_url, feed)
                                    if feedresult == True:
                                            logging.info('The No.%s  is fetched to the db', i)
                                    else:
                                            logging.error('The No.%s is fetched Fail', i)
                                            Mystat = False
                            except Exception, data:
                                    logging.warning('the error is %s', data)
                                    Mystat = False