# -*- coding: utf-8 -*-
# Fragment of a Django admin class with feed-collection actions (Python 2).
# Project-local helpers used below (htmllib, gbtools, getpage) and the models
# (FeedsResult, TempImages, Entry, Feed) are imported elsewhere in the original
# module; only the stdlib/third-party imports are restated here.
import logging
from datetime import datetime, timedelta

import feedparser


def __store_article(self, contenthtml, feed):
    """Attach fetched HTML to an existing FeedsResult row and queue its images."""
    entry = FeedsResult.objects.get(pk=feed.pk)
    try:
        entry.content = htmllib.decoding(contenthtml)
        entry.fetch_stat = 1  # 1 = fetched successfully
        images = htmllib.Parse_images_url(contenthtml)
        for image in images:
            # Queue every image URL so getImages can mirror it later.
            obj, created = TempImages.objects.get_or_create(oldurl=image,
                                                            entry=entry)
    except Exception, data:
        entry.fetch_stat = 2  # 2 = fetch/store failed
        logging.info('the db saved error is: %s', data)
    entry.save()
    logging.info('adding the article, the name is %s', feed.title)
def __Parse_image(self, content):
    """Rewrite remote image URLs in `content` to their locally mirrored copies."""
    images = htmllib.Parse_images_url(content)
    if images:
        try:
            for image in images:
                tmpimage = TempImages.objects.get(oldurl=image)
                if tmpimage is not None:
                    content = gbtools.stringQ2B(content)
                    content = htmllib.decoding(content).replace(image,
                                                                tmpimage.newurl)
        except Exception, data:
            # TempImages.DoesNotExist also lands here and aborts the loop.
            logging.info(data)
    # Return the rewritten HTML so callers (see __store_entry) can use it.
    return content
def getArticle(self, request, queryset, *arg1, **arg2):
    """Admin action: download and store the full body of each selected entry."""
    for feed in queryset:
        logging.info('start to fetch article, the title is %s', feed.title)
        if feed.feed.start_target != 'nohtml':
            logging.info('fetch new article %s, at %s' % (feed.link,
                                                          datetime.now()))
            contenthtml = ''
            try:
                result = getpage(feed.link, 30)
                if result.code == 200:
                    if len(feed.feed.start_target) != 0 and \
                            feed.feed.start_target != 'nohtml':
                        contenthtml = htmllib.parsehtml(result.read(),
                                                        feed.feed,
                                                        feed.link,
                                                        feed.feed.feedurl)
                    else:
                        contenthtml = feed.excerpt
                    self.__store_article(contenthtml, feed)
            except Exception, data:
                logging.info('DownloadError in get %s. the error is %s',
                             feed.link, data)
        else:
            # 'nohtml' feeds keep their excerpt as the article body.
            self.__store_article(feed.excerpt, feed)
getArticle.short_description = u'采集正文内容'  # "collect article bodies"
def getFeed(self, request, queryset, *arg1, **arg2):
    """Admin action: fetch and parse every selected feed that is due again."""
    logging.info(u'开始采集Feed')  # "starting feed collection"
    feed_retrieval_deadline = datetime.now() - timedelta(minutes=1200)
    for feed in queryset:
        if feed.last_retrieved > feed_retrieval_deadline:
            logging.info('Skipping feed %s.', feed.feedurl)
            continue
        logging.info('Getting feed %s.', feed.feedurl)
        try:
            result = getpage(feed.feedurl, 30)
        except Exception:
            logging.warning('Could not get feed %s; it will be retried on '
                            'the next run.', feed.feedurl)
            feed.last_retrieved = datetime.now()
            #feed.save()
            continue
        if result.code == 200:
            self.__parse_feed(result.read(), feed.feedurl, feed.stop_target,
                              feed.category, feed.latest, feed.start_target,
                              feed.mid_target, feed.end_target,
                              feed.allow_target)
            feed.last_retrieved = datetime.now()
            feed.save()
        elif result.code == 500:
            logging.error('Feed %s returned with status code 500.',
                          feed.feedurl)
        elif result.code == 404:
            logging.error('Error 404: Nothing found at %s.', feed.feedurl)
def __parse_feed(self, feed_content, feed_url, stop_target, category,
                 feed_latest, start_target, mid_target, end_target,
                 allow_target):
    """Walk the parsed feed and store every entry whose title is new."""
    feed = feedparser.parse(feed_content)
    i = 0
    dead_i = 0
    for entry in feed.entries:
        logging.info('start parse feed, the dead_i is %s', dead_i)
        title = htmllib.decoding(entry.title)
        categorie_keys = []
        content = ''
        date_published = datetime.now()
        author_name = ''
        Mystat = True
        if not self.__feedslist_check(title):
            try:
                i += 1
                url = ''
                logging.info('beginning to add new article No. %s', i)
                # Prefer the original link when the feed is proxied by
                # FeedBurner.
                if 'feedburner_origlink' in entry:
                    url = entry.feedburner_origlink
                else:
                    url = entry.link
                if 'content' in entry:
                    content = entry.content[0].value
                else:
                    content = entry.description
                if 'author' in entry:
                    author_name = entry.author
                else:
                    author_name = "转载"  # "reposted"
                stripper = HTMLStripper()
                stripper.feed(title)
                title = stripper.get_data()
                content = htmllib.decoding(content)
                content = htmllib.GetFeedclean(url, content, stop_target)
                if 'updated_parsed' in entry:
                    date_published = datetime(*entry.updated_parsed[:6])
                else:
                    date_published = datetime.now()
            except Exception, data:
                logging.warning('something unexpected happened, the error '
                                'is %s', data)
            try:
                # Note: this eight-argument __store_article variant is not
                # part of this fragment; only the two-argument one is shown.
                feedresult = self.__store_article(title, url, category,
                                                  content, date_published,
                                                  author_name, feed_url, feed)
                if feedresult:
                    logging.info('The No.%s is fetched to the db', i)
                else:
                    logging.error('The No.%s fetch failed', i)
                    Mystat = False
            except Exception, data:
                logging.warning('the error is %s', data)
                Mystat = False
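
# __feedslist_check is called above but defined outside this fragment. A
# minimal sketch, assuming it deduplicates entries by title against the
# already-collected FeedsResult rows:
def __feedslist_check(self, title):
    """Return True when an entry with this title was already collected."""
    return FeedsResult.objects.filter(title=title).exists()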
def getImages(self, request, queryset, *arg1, **arg2):
    """Admin action: download each queued remote image and mirror it."""
    for image in queryset:
        logging.info('start to fetch images, the url is %s', image.oldurl)
        try:
            name = htmllib.sid() + '.jpg'
            result = getpage(htmllib.encoding(image.oldurl), 30)
            if result.code == 200:
                result = self.__store_images(result.read(), name, image)
            else:
                result = False
            if result:
                logging.info('Success!')
            else:
                logging.info('this one failed!')
        except Exception, data:
            logging.info(data)
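
# __store_images is called above but not defined in this fragment. A minimal
# sketch, assuming images are written under MEDIA_ROOT and the mirrored URL
# is recorded on the TempImages row so __Parse_image can rewrite the HTML:
def __store_images(self, data, name, image):
    """Persist the downloaded bytes and remember the new local URL (assumed)."""
    import os
    from django.conf import settings
    path = os.path.join(settings.MEDIA_ROOT, name)
    f = open(path, 'wb')
    try:
        f.write(data)
    finally:
        f.close()
    image.newurl = settings.MEDIA_URL + name
    image.save()
    return True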
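
# HTMLStripper is used by __parse_feed to strip markup from titles, but its
# definition is not part of this fragment. A minimal sketch built on the
# Python 2 stdlib HTMLParser:
from HTMLParser import HTMLParser


class HTMLStripper(HTMLParser):
    """Accumulate only the text nodes of an HTML fragment."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.reset()
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)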
def saveArticle(self, request, queryset, *arg1, **arg2):
    """Admin action: publish each collected result as a blog Entry."""
    for entry in queryset:
        result = self.__store_entry(entry)
saveArticle.short_description = u'发布采集'  # "publish collected items"


def __store_entry(self, feed):
    """Copy a collected FeedsResult into a published Entry."""
    try:
        entry, result = Entry.published.get_or_create(title=feed.title)
        entry.excerpt = feed.excerpt
        entry.status = 2  # assumed: 2 marks the entry as published
        entry.author_name = feed.author_name
        entry.date = feed.date
        entry.slug = htmllib.sid()
        entry.content = self.__Parse_image(feed.content)
        # Assumed completion: persist the entry and report the outcome.
        entry.save()
        return True
    except Exception, data:
        logging.warning('the save error is %s', data)
        return False