def process_rss_item(self, item):
    """Store a previously unseen RSS item and bump the daily RSS counter.

    Nothing happens when a document with the same ``md5`` already exists in
    the ``rss`` collection, or when the item's title or content is empty.
    Otherwise CDATA wrappers and markup tags are stripped from ``title`` and
    ``content`` (the item is modified in place), a missing ``date`` is filled
    from ``self.date``, and the item is inserted. If the cleaned title is
    still non-empty, the ``rss`` entry of the ``moniter`` collection is
    incremented for ``self.date`` and the title is passed to
    ``self.datacore``.

    :param item: mapping-like object providing ``md5``, ``title`` and
        ``content`` keys, ``.get()`` and item assignment.
    :raises KeyError: if ``md5``, ``title`` or ``content`` is missing.
    """
    db = mongo.getdb()

    # Compare by value: identity checks against literals (``is 0``,
    # ``is ''``) depend on interpreter caching and are never true for an
    # empty unicode string.
    if db.rss.find({"md5": item['md5']}).count() != 0:
        return
    if not item['title'] or not item['content']:
        # Empty (or None) fields carry nothing worth storing.
        return

    # Remove the CDATA opener, then the CDATA closer, then any remaining tags.
    for field in ('title', 'content'):
        text = item[field]
        for pattern in (r'<!\[CDATA\[', r'\]\]>', r'<[^>]*>'):
            text = re.sub(pattern, '', text)
        item[field] = text

    if item.get('date') is None:
        item['date'] = self.date
    db.rss.insert(dict(item))

    # The title may have consisted of markup only; count and forward the
    # item only when something readable is left.
    if item['title']:
        # Third positional argument: upsert flag of the legacy pymongo
        # ``update`` signature (assumes getdb() returns a pymongo database).
        db.moniter.update({"name": "rss"}, {"$inc": {"day." + self.date: 1}}, True)
        self.datacore("RSS", item['title'])
def process_status_item(self, item):
    """Store a previously unseen status (weibo post) and update counters.

    When no document with the same ``statusid`` exists in the ``status``
    collection, the item is inserted, the author's per-day ``statistic``
    counter in the ``user`` collection is incremented, the ``weibo`` entry of
    the ``moniter`` collection is incremented for ``self.date``, and a
    ``"#<statusuname># <content>"`` line is passed to ``self.datacore``.

    :param item: mapping-like object providing ``statusid``, ``statusuid``,
        ``statusuname`` and ``content`` keys.
    :raises KeyError: if one of those keys is missing.
    """
    db = mongo.getdb()

    # Compare by value; ``is 0`` only works through small-int caching.
    if db.status.find({"statusid": item['statusid']}).count() != 0:
        return

    db.status.insert(dict(item))
    # No upsert here: only an already stored user gets the counter bumped.
    db.user.update(
        {"userid": item['statusuid']},
        {"$inc": {'statistic.' + self.date: 1}},
    )
    # Third positional argument: upsert flag of the legacy pymongo
    # ``update`` signature (assumes getdb() returns a pymongo database).
    db.moniter.update({"name": "weibo"}, {"$inc": {"day." + self.date: 1}}, True)
    self.datacore("WB", "#" + item['statusuname'] + "# " + item['content'])
def stats_spider_closed(self, spider, spider_stats):
    """Save a summary of a finished spider run in the ``statsinfo`` collection.

    The stored document holds the spider name, the start and finish times
    (truncated to whole seconds and converted to text) and a fixed set of
    counters copied from ``spider_stats``. A counter that is absent from
    ``spider_stats`` is stored as ``0``.

    :param spider: object exposing a ``name`` attribute.
    :param spider_stats: mapping of collected stats; must contain
        ``start_time`` and ``finish_time`` values that support
        ``.replace(microsecond=0)``.
    :raises KeyError: if ``start_time`` or ``finish_time`` is missing.
    """
    statsinfo = {
        'name': spider.name,
        'start_time': unicode(spider_stats['start_time'].replace(microsecond=0)),
        'finish_time': unicode(spider_stats['finish_time'].replace(microsecond=0)),
    }

    # (stored field, key in spider_stats). Probing and reading the same key
    # keeps every counter consistent; ``request_count`` must be looked up
    # under its ``downloader/`` key like ``response_count``.
    counters = (
        # NOTE(review): 'time_scraped_count' looks like a typo for
        # 'item_scraped_count', but it is the persisted field name, so it is
        # kept for compatibility with existing documents and readers.
        ('time_scraped_count', 'item_scraped_count'),
        ('images_count', 'images_count'),
        ('images_uptodate', 'images_uptodate'),
        ('images_downloaded', 'images_downloaded'),
        ('request_count', 'downloader/request_count'),
        ('response_count', 'downloader/response_count'),
        ('response_status_count_200', 'downloader/response_status_count/200'),
        ('response_status_count_301', 'downloader/response_status_count/301'),
        ('response_status_count_302', 'downloader/response_status_count/302'),
        ('response_status_count_500', 'downloader/response_status_count/500'),
    )
    for field, stat_key in counters:
        statsinfo[field] = spider_stats[stat_key] if stat_key in spider_stats else 0

    mongo.getdb().statsinfo.insert(statsinfo)
def process_blog_item(self, item):
    """Store a previously unseen blog item and bump the daily blog counter.

    When no document with the same ``md5`` exists in the ``blog``
    collection, the item is inserted, the ``blog`` entry of the ``moniter``
    collection is incremented for ``self.date``, and ``"<title> <url>"`` is
    passed to ``self.datacore``.

    :param item: mapping-like object providing ``md5``, ``title`` and ``url``.
    :raises KeyError: if one of those keys is missing.
    """
    db = mongo.getdb()

    # Compare by value; ``is 0`` only works through small-int caching.
    if db.blog.find({"md5": item['md5']}).count() != 0:
        return

    db.blog.insert(dict(item))
    # Third positional argument: upsert flag of the legacy pymongo
    # ``update`` signature (assumes getdb() returns a pymongo database).
    db.moniter.update({"name": "blog"}, {"$inc": {"day." + self.date: 1}}, True)
    self.datacore("Blog", item['title'] + " " + item['url'])
def process_news_item(self, item):
    """Store a previously unseen news item and bump the daily news counter.

    When no document with the same ``md5`` exists in the ``news``
    collection, the item is inserted, ``"<title> <url>"`` is passed to
    ``self.datacore``, and the ``news`` entry of the ``moniter`` collection
    is incremented for ``self.date``.

    :param item: mapping-like object providing ``md5``, ``title`` and ``url``.
    :raises KeyError: if one of those keys is missing.
    """
    db = mongo.getdb()

    # Compare by value; ``is 0`` only works through small-int caching.
    if db.news.find({"md5": item['md5']}).count() != 0:
        return

    db.news.insert(dict(item))
    self.datacore("News", item['title'] + " " + item['url'])
    # Third positional argument: upsert flag of the legacy pymongo
    # ``update`` signature (assumes getdb() returns a pymongo database).
    db.moniter.update({"name": "news"}, {"$inc": {"day." + self.date: 1}}, True)
def process_user_item(self, item):
    """Insert ``item`` into the ``user`` collection unless its ``userid`` exists.

    :param item: mapping-like object providing a ``userid`` key.
    :raises KeyError: if ``userid`` is missing.
    """
    users = mongo.getdb().user

    # Compare by value; ``is 0`` only works through small-int caching.
    if users.find({"userid": item['userid']}).count() == 0:
        users.insert(dict(item))
def process_sns_item(self, item):
    """Insert ``item`` into the ``sns`` collection.

    No duplicate check is performed; every call stores a new document.

    :param item: object convertible with ``dict(item)``.
    """
    sns_collection = mongo.getdb().sns
    sns_collection.insert(dict(item))
def process_long_item(self, item):
    """Insert ``item`` into the ``data`` collection.

    :param item: object convertible with ``dict(item)``.
    """
    # insert() requires a document; store the item the same way the sibling
    # handlers do instead of calling it with no arguments.
    # NOTE(review): confirm that ``data`` is the intended target collection.
    mongo.getdb().data.insert(dict(item))