Esempio n. 1
0
 def process_rss_item(self, item):
     """Store a new RSS item after stripping CDATA markers and HTML tags.

     The item is skipped when a document with the same ``md5`` is already
     in the ``rss`` collection, or when its title or content is empty or
     ``None``. Otherwise title and content are cleaned in place, a missing
     ``date`` defaults to ``self.date``, and the item is inserted. When the
     cleaned title is still non-empty, the per-day "rss" counter in the
     ``moniter`` collection is incremented and the title is forwarded to
     ``self.datacore``.
     """
     # Compare by value: ``is 0`` / ``is ''`` test object identity, which is
     # unreliable for numbers and strings (a unicode u'' is never the same
     # object as the str literal '').
     if mongo.getdb().rss.find({"md5": item['md5']}).count() != 0:
         return
     if not item['title'] or not item['content']:
         return

     # Order matters: remove the CDATA wrappers first, then any remaining
     # tags. The patterns are pure ASCII, so plain raw strings match the
     # same text as the former Python-2-only ``ur''`` literals.
     markup_patterns = (r'<!\[CDATA\[', r'\]\]>', r'<[^>]*>')
     for field in ('title', 'content'):
         text = item[field]
         for pattern in markup_patterns:
             text = re.sub(pattern, '', text)
         item[field] = text

     if item.get('date') is None:
         item['date'] = self.date
     mongo.getdb().rss.insert(dict(item))

     # Stripping the markup may have left the title empty; only count and
     # forward items that still have one.
     if item['title']:
         # NOTE(review): the trailing True is presumably the positional
         # upsert flag of the collection's update() - confirm.
         mongo.getdb().moniter.update(
             {"name": "rss"}, {"$inc": {"day." + self.date: 1}}, True)
         self.datacore("RSS", item['title'])
Esempio n. 2
0
    def process_status_item(self, item):
        """Store a status item unless its ``statusid`` is already stored.

        For a new status this inserts the item into ``status``, increments
        the author's ``statistic.<date>`` counter in ``user``, increments
        the per-day "weibo" counter in ``moniter``, and forwards a
        "#<statusuname># <content>" line to ``self.datacore``.
        """
        # Compare the count by value; ``is 0`` tests object identity and
        # only works by interpreter accident.
        if mongo.getdb().status.find({"statusid": item['statusid']}).count() != 0:
            return

        mongo.getdb().status.insert(dict(item))
        mongo.getdb().user.update(
            {"userid": item['statusuid']},
            {"$inc": {'statistic.' + self.date: 1}})
        # NOTE(review): the trailing True is presumably the positional
        # upsert flag of the collection's update() - confirm.
        mongo.getdb().moniter.update(
            {"name": "weibo"}, {"$inc": {"day." + self.date: 1}}, True)
        self.datacore("WB", "#" + item['statusuname'] + "# " + item['content'])
Esempio n. 3
0
 def stats_spider_closed(self, spider, spider_stats):
     """Insert a summary of a closed spider's stats into ``statsinfo``.

     ``spider_stats`` must contain ``start_time`` and ``finish_time``
     (objects with a ``replace(microsecond=...)`` method); every counter
     listed below is optional and is stored as 0 when absent.
     """
     statsinfo = {
         'name': spider.name,
         # Drop sub-second precision before converting to text.
         'start_time': unicode(spider_stats['start_time'].replace(microsecond=0)),
         'finish_time': unicode(spider_stats['finish_time'].replace(microsecond=0)),
     }

     # (stored field, key in spider_stats). Each key is both tested and
     # read, so the check can no longer drift from the lookup: previously
     # 'request_count' was tested while 'downloader/request_count' was read.
     # NOTE(review): 'time_scraped_count' looks like a typo for
     # 'item_scraped_count', but it is the stored field name, so it is
     # kept as-is for existing readers.
     counters = (
         ('time_scraped_count', 'item_scraped_count'),
         ('images_count', 'images_count'),
         ('images_uptodate', 'images_uptodate'),
         ('images_downloaded', 'images_downloaded'),
         ('request_count', 'downloader/request_count'),
         ('response_count', 'downloader/response_count'),
         ('response_status_count_200', 'downloader/response_status_count/200'),
         ('response_status_count_301', 'downloader/response_status_count/301'),
         ('response_status_count_302', 'downloader/response_status_count/302'),
         ('response_status_count_500', 'downloader/response_status_count/500'),
     )
     for field, stat_key in counters:
         statsinfo[field] = spider_stats.get(stat_key, 0)

     mongo.getdb().statsinfo.insert(statsinfo)
Esempio n. 4
0
 def process_blog_item(self, item):
     """Store a blog item unless one with the same ``md5`` already exists.

     For a new item this inserts it into ``blog``, increments the per-day
     "blog" counter in ``moniter``, and forwards "<title> <url>" to
     ``self.datacore``.
     """
     # Compare the count by value; ``is 0`` tests object identity and only
     # works by interpreter accident.
     if mongo.getdb().blog.find({"md5": item['md5']}).count() != 0:
         return

     mongo.getdb().blog.insert(dict(item))
     # NOTE(review): the trailing True is presumably the positional upsert
     # flag of the collection's update() - confirm.
     mongo.getdb().moniter.update(
         {"name": "blog"}, {"$inc": {"day." + self.date: 1}}, True)
     self.datacore("Blog", item['title'] + " " + item['url'])
Esempio n. 5
0
 def process_news_item(self, item):
     """Store a news item unless one with the same ``md5`` already exists.

     For a new item this inserts it into ``news``, forwards
     "<title> <url>" to ``self.datacore``, and then increments the per-day
     "news" counter in ``moniter``.
     """
     # Compare the count by value; ``is 0`` tests object identity and only
     # works by interpreter accident.
     if mongo.getdb().news.find({"md5": item['md5']}).count() != 0:
         return

     mongo.getdb().news.insert(dict(item))
     self.datacore("News", item['title'] + " " + item['url'])
     # NOTE(review): the trailing True is presumably the positional upsert
     # flag of the collection's update() - confirm.
     mongo.getdb().moniter.update(
         {"name": "news"}, {"$inc": {"day." + self.date: 1}}, True)
Esempio n. 6
0
 def process_user_item(self, item):
     """Insert a user item unless its ``userid`` is already in ``user``."""
     # Compare the count by value; ``is 0`` tests object identity and only
     # works by interpreter accident.
     if mongo.getdb().user.find({"userid": item['userid']}).count() == 0:
         mongo.getdb().user.insert(dict(item))
Esempio n. 7
0
 def process_sns_item(self, item):
     """Insert the item into the ``sns`` collection as a plain dict.

     No duplicate check is performed: an earlier lookup on
     ``uid``/``flatform`` is disabled, so every item is inserted.
     """
     sns_collection = mongo.getdb().sns
     sns_collection.insert(dict(item))
Esempio n. 8
0
 def process_long_item(self, item):
     """Placeholder handler: touches the ``link`` and ``data`` collections.

     ``item`` is not used. The two calls below carry no real payload.
     """
     # NOTE(review): an empty filter with an empty replacement document
     # presumably overwrites whichever ``link`` document matches first -
     # confirm this is intended before enabling this handler.
     link_collection = mongo.getdb().link
     link_collection.update({}, {})

     # NOTE(review): insert() is called without a document, which
     # presumably raises TypeError at runtime - confirm and supply the
     # intended document.
     data_collection = mongo.getdb().data
     data_collection.insert()