class QtAudioSpider(scrapy.Spider):
    """Spider that turns QT audio records without a download path into AudioItems.

    The Mongo client, database and collection handles, as well as the
    ``QT_SETTINGS`` project settings, are resolved once when the class body
    is executed.
    """
    name = "qt_audio"
    start_urls = ()

    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getDBName()]
    c_audio = db[ConfUtil.getQTAudioCollectionName()]

    custom_settings = get_project_settings().getdict('QT_SETTINGS')
    FILES_STORE_BASE = custom_settings['FILES_STORE']

    def start_requests(self):
        """Yield one request whose callback is ``parse``; the page itself is not read."""
        bootstrap = scrapy.Request('http://www.baidu.com', callback=self.parse)
        yield bootstrap

    def parse(self, response):
        """Yield an AudioItem for every record queried with ``audioDownloadDir: None``.

        :param response: response of the bootstrap request; unused.
        """
        for record in self.c_audio.find({'audioDownloadDir': None}):
            item = AudioItem()
            item['_id'] = record['_id']
            item['collection'] = ConfUtil.getQTAudioCollectionName()
            item['url'] = record['playUrl']
            item['audio_base'] = self.FILES_STORE_BASE
            yield item
def testGetXmlContent(self):
    """Print the XML the sender builds for one kl and one xmly audio document.

    One document is read from each collection through the module-level
    ``db`` handle; nothing is asserted, the rendered XML is only printed.
    """
    xmlyColl = db[ConfUtil.getXMLYAudioCollectionName()]
    klColl = db[ConfUtil.getKLAudioCollectionName()]
    xmlySample = xmlyColl.find_one()
    klSample = klColl.find_one()
    # The kl document is rendered and printed first, then the xmly one.
    for webStr, sample in (('kl', klSample), ('xmly', xmlySample)):
        print(self.sender.getXMLContent(webStr, sample))
class AudioDownloader(FilesPipeline):
    """Files pipeline that downloads audio items and stores the saved path in MongoDB."""
    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getDBName()]

    def get_media_requests(self, item, info):
        """Yield a download request for ``item['url']`` when the item type is 'audio'."""
        if item.type == 'audio':
            yield scrapy.Request(item['url'])

    def item_completed(self, results, item, info):
        """Record the downloaded file's path and checksum on the item's Mongo document.

        ``results`` is a list of ``(success, info)`` tuples, for example::

            [(True, {'url': 'http://audio.xmcdn.com/group8/M04/37/6C/wKgDYVb_iwyzAqwqABd42Fth8jI393.m4a',
                     'path': 'full/9668aa9324060a7d8e193b46b96257.m4a',
                     'checksum': '79b63c45bef51ebac3aa5ab69018d0d9'})]

        :param results: download results for the requests of this item.
        :param item: the item being processed; must carry ``audio_base``,
            ``collection`` and ``_id`` when its type is 'audio'.
        :param info: pipeline bookkeeping object; unused.
        :return: the item, unchanged.
        """
        if item.type != 'audio':
            return item
        if not results:
            # No request was completed for this item; nothing to record.
            return item
        succeeded, fileInfo = results[0]
        if not succeeded:
            # On failure the second element is not a dict, so indexing it with
            # 'path' would raise. Leave the document untouched instead.
            return item
        path = os.path.join(item['audio_base'], fileInfo['path'])
        checksum = fileInfo['checksum']
        self.db[item['collection']].update(
            {'_id': item['_id']},
            {'$set': {'audioDownloadDir': path, 'checksum': checksum}})
        return item
def __init__(self):
    """Create the Motor database handle and initialise the tornado application.

    The handle is stored on ``self.db``; routes come from the module-level
    ``urlpatterns`` and the application is started with ``debug`` enabled.
    """
    routes = urlpatterns
    mongo = MotorClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    self.db = mongo[ConfUtil.getDBName()]
    options = {
        'template_path': ConfUtil.getTemplatePath(),
        'static_path': ConfUtil.getStaticPath(),
        'debug': True,
    }
    tornado.web.Application.__init__(self, handlers=routes, **options)
def __init__(self):
    """Bind ``self.db`` to the configured Motor database and set up the application.

    Handlers are taken from the module-level ``urlpatterns``; template and
    static paths come from ``ConfUtil`` and ``debug`` is switched on.
    """
    handlerTable = urlpatterns
    motor = MotorClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    self.db = motor[ConfUtil.getDBName()]
    appSettings = {}
    appSettings['template_path'] = ConfUtil.getTemplatePath()
    appSettings['static_path'] = ConfUtil.getStaticPath()
    appSettings['debug'] = True
    tornado.web.Application.__init__(self, handlers=handlerTable, **appSettings)
def getSummary():
    """Collect album/audio statistics for qt, xmly and kl and store a summary.

    For each source the album count, total audio count and per-category
    counts are read from its util object. The per-source results plus the
    overall totals are inserted as one document with ``type == 'summary'``.

    :return: None
    """
    xmlyUtil = XMLYUtil()
    klUtil = KLUtil()
    qtUtil = QTUtil()

    def describe(util):
        # Every util exposes the same three getters.
        return {
            'albumCount': util.getAlbumCount(),
            'audioCount': util.getTotalAudioCount(),
            'categoryWithCount': util.getAllCategoryWithCount(),
        }

    qtRes = describe(qtUtil)
    xmlyRes = describe(xmlyUtil)
    klRes = describe(klUtil)

    res = dict(
        totalAlbumCount=qtRes['albumCount'] + xmlyRes['albumCount'] + klRes['albumCount'],
        # The total must include each source exactly once (qt + xmly + kl).
        totalAudioCount=qtRes['audioCount'] + xmlyRes['audioCount'] + klRes['audioCount'],
        qtRes=qtRes,
        klRes=klRes,
        xmlyRes=xmlyRes,
        type='summary',
    )
    db[ConfUtil.getCrontabDbCollectionName()].insert(res)
def getSummary():
    """Build and store an overall description of the current database state.

    Album counts, audio counts and per-category counts are gathered for the
    qt, xmly and kl sources, combined with the grand totals, and inserted
    as a single ``type == 'summary'`` document.

    :return: None
    """
    xmlyUtil = XMLYUtil()
    klUtil = KLUtil()
    qtUtil = QTUtil()

    qtRes = {}
    klRes = {}
    xmlyRes = {}
    # Same three getters for every source, queried in the order qt, xmly, kl.
    for util, target in ((qtUtil, qtRes), (xmlyUtil, xmlyRes), (klUtil, klRes)):
        target['albumCount'] = util.getAlbumCount()
        target['audioCount'] = util.getTotalAudioCount()
        target['categoryWithCount'] = util.getAllCategoryWithCount()

    totalAlbumCount = qtRes['albumCount'] + xmlyRes['albumCount'] + klRes['albumCount']
    # Each source contributes once; the xmly count belongs in this sum.
    totalAudioCount = qtRes['audioCount'] + xmlyRes['audioCount'] + klRes['audioCount']

    res = dict(totalAlbumCount=totalAlbumCount,
               totalAudioCount=totalAudioCount,
               qtRes=qtRes,
               klRes=klRes,
               xmlyRes=xmlyRes,
               type='summary')
    db[ConfUtil.getCrontabDbCollectionName()].insert(res)
def runForQt(self):
    """Run the qt statistics task.

    :return: mapping with the audio and image directory sizes and the
        album/audio counts (total and per category) reported by ``self.qtSt``.
    """
    audioPath = ConfUtil.getQTAudioDir()
    imagePath = ConfUtil.getQTImageDir()
    stats = defaultdict()
    stats.update([
        ('totalAudioSize(bytes)', getDirSize(audioPath)),
        ('totalImageSize(bytes)', getDirSize(imagePath)),
        ('totalAlbumCount', self.qtSt.getAlbumCount()),
        ('totalAudioCount', self.qtSt.getAudioCount()),
        ('audioCountPerCategory', self.qtSt.getAudioCountPerCategory()),
        ('albumCountPerCategory', self.qtSt.getAlbumCountPerCategory()),
    ])
    return stats
def runForXMLY(self):
    """Run the xmly (Ximalaya) statistics task.

    :return: mapping with the audio and image directory sizes and the
        album/audio counts (total and per category) reported by ``self.xmlySt``.
    """
    audioPath = ConfUtil.getXMLYAudioDir()
    imagePath = ConfUtil.getXmlyImageDir()
    source = self.xmlySt
    # Each entry is (result key, zero-argument producer), evaluated in order.
    producers = (
        ('totalAudioSize(bytes)', lambda: getDirSize(audioPath)),
        ('totalImageSize(bytes)', lambda: getDirSize(imagePath)),
        ('totalAlbumCount', lambda: source.getAlbumCount()),
        ('totalAudioCount', lambda: source.getAudioCount()),
        ('audioCountPerCategory', lambda: source.getAudioCountPerCategory()),
        ('albumCountPerCategory', lambda: source.getAlbumCountPerCategory()),
    )
    stats = defaultdict()
    for key, produce in producers:
        stats[key] = produce()
    return stats
def putVisitedAlbumToMongoSet(cls):
    """Open the configured Mongo database (placeholder for storing visited albums).

    :return: None
    """
    conf = ConfUtil.getMongoConf()
    mongoUri, mongoDatabase = conf
    database = pymongo.MongoClient(mongoUri)[mongoDatabase]
    # NOTE(review): get_collection() is invoked without a collection name and
    # its result is discarded -- this looks unfinished; confirm the intent.
    database.get_collection()
def get(self, uuid):
    """Look up one QT audio document by ``uuid`` and write it out as JSON.

    The ``yield`` means this handler is meant to run as a coroutine
    (decorator not visible here -- confirm at the definition site).

    :param uuid: value matched against the ``uuid`` field of the collection.
    :return: None; responds 404 with a small HTML body when nothing matches.
    """
    coll = self.application.db[ConfUtil.getQTAudioCollectionName()]
    audio = yield coll.find_one({"uuid": uuid})
    if audio is None:
        self.set_status(404, u'audioNotFound')
        self.finish(u"<html><body> Not Found </body></html>")
        return
    self.set_header('Content-Type', 'text/javascript')
    # json_util.default serialises the BSON-specific values in the document.
    self.write(json.dumps(audio, default=json_util.default))
def sendXMLToCNR(self, xml):
    """POST the XML content to the configured CNR endpoint.

    :param xml: text to send; it is UTF-8 encoded before posting.
    :return: the response object returned by ``requests.post``.
    """
    contentType = {'Content-Type': 'application/xml'}
    return requests.post(
        ConfUtil.getCnrUri(),
        data=xml.encode('utf-8'),
        headers=contentType,
    )
def runForKL(self):
    """Run the kl (Kaola) statistics task.

    :return: mapping with the audio and image directory sizes and the
        album/audio counts (total and per category) reported by ``self.klSt``.
    """
    # Sizes are measured on the file system.
    audioPath = ConfUtil.getKLAudioDir()
    imagePath = ConfUtil.getKLImageDir()
    summary = defaultdict()
    summary['totalAudioSize(bytes)'] = getDirSize(audioPath)
    summary['totalImageSize(bytes)'] = getDirSize(imagePath)

    # Album and media counts come from the database statistics object.
    klStats = self.klSt
    summary['totalAlbumCount'] = klStats.getAlbumCount()
    summary['totalAudioCount'] = klStats.getAudioCount()
    summary['audioCountPerCategory'] = klStats.getAudioCountPerCategory()
    summary['albumCountPerCategory'] = klStats.getAlbumCountPerCategory()
    return summary
def putVisitedAlbumToMongoSet(cls):
    """Connect to the configured Mongo database (stub for saving visited albums).

    :return: None
    """
    mongoUri, mongoDatabase = ConfUtil.getMongoConf()
    connection = pymongo.MongoClient(mongoUri)
    target = connection[mongoDatabase]
    # NOTE(review): no collection name is passed to get_collection() and the
    # returned value is unused -- the function appears incomplete; verify.
    target.get_collection()
class LiveImageDownloader(FilesPipeline):
    """Files pipeline that downloads live-item images and stores the saved path in MongoDB."""
    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getLiveDbName()]

    def get_media_requests(self, item, info):
        """Yield a download request for ``item['url']`` when the item type is 'live'."""
        print(item.type)
        if item.type == 'live':
            yield scrapy.Request(item['url'])

    def item_completed(self, results, item, info):
        """Write the downloaded image path and checksum to the item's Mongo document.

        :param results: list of ``(success, info)`` tuples for this item's requests.
        :param item: the item being processed; must carry ``image_base``,
            ``collection`` and ``_id`` when its type is 'live'.
        :param info: pipeline bookkeeping object; unused.
        :return: the item, unchanged.
        """
        if item.type != 'live':
            return item
        if not results:
            # No request was completed for this item; nothing to record.
            return item
        succeeded, fileInfo = results[0]
        if not succeeded:
            # On failure the second element is not a dict, so indexing it with
            # 'path' would raise. Leave the document untouched instead.
            return item
        path = os.path.join(item['image_base'], fileInfo['path'])
        checksum = fileInfo['checksum']
        self.db[item['collection']].update(
            {'_id': item['_id']},
            {'$set': {'img': path, 'imgCheckSum': checksum}})
        return item
def parse(self, response):
    """Yield an AudioItem for every record queried with ``audioDownloadDir: None``.

    :param response: the triggering response; unused.
    """
    for record in self.c_audio.find({'audioDownloadDir': None}):
        item = AudioItem()
        item['_id'] = record['_id']
        item['collection'] = ConfUtil.getQTAudioCollectionName()
        item['url'] = record['playUrl']
        item['audio_base'] = self.FILES_STORE_BASE
        yield item
class LiveImageSpider(scrapy.Spider):
    """Spider that turns live records lacking an ``img`` field into LiveImageItems.

    Mongo handles and the ``LIVE_SETTINGS`` project settings are resolved
    once when the class body is executed.
    """
    name = "live_image"

    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getLiveDbName()]
    live_coll = db[ConfUtil.getLiveCollectionName()]

    custom_settings = get_project_settings().getdict('LIVE_SETTINGS')
    FILES_STORE_BASE = custom_settings['FILES_STORE']
    live_coll_name = ConfUtil.getLiveCollectionName()

    start_urls = ()

    def start_requests(self):
        """Yield one request whose callback is ``parse``; the page itself is not read."""
        bootstrap = scrapy.Request('http://www.baidu.com', callback=self.parse)
        yield bootstrap

    def parse(self, response):
        """Yield a LiveImageItem for every live record that has no ``img`` field.

        :param response: response of the bootstrap request; unused.
        """
        missingImage = {'img': {"$exists": False}}
        for record in self.live_coll.find(missingImage):
            item = LiveImageItem()
            item['_id'] = record['_id']
            item['collection'] = self.live_coll_name
            item['url'] = record['imgSrc']
            item['image_base'] = self.FILES_STORE_BASE
            yield item
def get(self, foo):
    """Write the most recent stored summary document as the response.

    The ``yield`` means this handler is meant to run as a coroutine
    (decorator not visible here -- confirm at the definition site).

    :param foo: captured URL argument; unused.
    :return: None; responds 404 when no summary document exists.
    """
    coll = self.application.db[ConfUtil.getCrontabResultCollectionName()]
    # Descending _id puts the newest document first, assuming the default
    # ObjectId _id that Mongo assigns on insert. Ascending order with
    # limit(1) would always return the oldest summary.
    latest = coll.find({"type": "summary"}).sort([("_id", -1)]).limit(1)
    doc = None
    if (yield latest.fetch_next):
        doc = latest.next_object()
    if doc is None:
        # Without this guard an empty collection left ``doc`` unbound.
        self.set_status(404, u'summaryNotFound')
        self.finish({"status": "fail", "reason": "summary not found"})
        return
    del doc['_id']
    self.write(doc)
def get(self, uuid):
    """Look up one KL audio document by ``uuid`` and write it out as JSON.

    The ``yield`` means this handler is meant to run as a coroutine
    (decorator not visible here -- confirm at the definition site).

    :param uuid: value matched against the ``uuid`` field of the collection.
    :return: None; responds 404 with a small HTML body when nothing matches.
    """
    coll = self.application.db[ConfUtil.getKLAudioCollectionName()]
    audio = yield coll.find_one({"uuid": uuid})
    if audio is None:
        self.set_status(404, u'audioNotFound')
        self.finish(u"<html><body> Not Found </body></html>")
        return
    self.set_header('Content-Type', 'text/javascript')
    # json_util.default serialises the BSON-specific values in the document.
    self.write(json.dumps(audio, default=json_util.default))
def get(self, foo):
    """Write the most recent stored summary document as the response.

    The ``yield`` means this handler is meant to run as a coroutine
    (decorator not visible here -- confirm at the definition site).

    :param foo: captured URL argument; unused.
    :return: None; responds 404 when no summary document exists.
    """
    coll = self.application.db[ConfUtil.getCrontabResultCollectionName()]
    # Newest first: descending _id, assuming the default ObjectId _id that
    # Mongo assigns on insert. Ascending order with limit(1) would pin the
    # response to the oldest summary.
    latest = coll.find({
        "type": "summary"
    }).sort([
        ("_id", -1),
    ]).limit(1)
    doc = None
    if (yield latest.fetch_next):
        doc = latest.next_object()
    if doc is None:
        # Without this guard an empty collection left ``doc`` unbound.
        self.set_status(404, u'summaryNotFound')
        self.finish({"status": "fail", "reason": "summary not found"})
        return
    del doc['_id']
    self.write(doc)
from m_interact.feedBack import FeedBack,HandleQTRe,HandleKLRe,HandleXMLYRe,ViewSummary from m_interact.sender import XXXSender,AllSender from m_interact.crawlerManager import XXXManager,QtTopnManager,XMLYTopnManager from conf_util import ConfUtil #在urlpatterns 中添加新的路由 urlpatterns = [ (r'/infoCrawler',FeedBack), (r'/toCNR/xmly/(\w+)',HandleXMLYRe), (r'/toCNR/kl/(\w+)',HandleKLRe), (r'/toCNR/qt/(\w+)',HandleQTRe), #对内的统计页面,资源整体情况描述 (r'/toStatistic/summary/',ViewSummary), # xmly 数据推送 url(r'/api/sender/vod/xmly',XXXSender,dict(collection = ConfUtil.getXMLYAudioCollectionName(), web_str='xmly' )), url(r'/api/sender/vod/qt',XXXSender,dict(collection = ConfUtil.getQTAudioCollectionName(), web_str = 'qt' )), url(r'/api/sender/vod/kl',XXXSender,dict( collection = ConfUtil.getKLAudioCollectionName(), web_str = 'kl' )), url(r'/api/xmly/full', XXXManager, dict( process_name = ConfUtil.xmlyFullProcessName() )), url(r'/api/qt/full', XXXManager, dict(
class XMLGenerator:
    """Renders the ``sendTemp.xml`` template for a single audio document."""
    template = Environment(loader=PackageLoader(
        'm_interact', 'templates')).get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    # Source sites this generator knows how to describe.
    _KNOWN_SOURCES = ('kl', 'xmly', 'qt')

    def __init__(self):
        pass

    def getXMLContentFromAudio(self, sourceWeb, audio):
        """Build the XML content for one audio document.

        :param sourceWeb: source site, one of 'kl', 'xmly' or 'qt'.
        :param audio: the audio document as read from the database (dict).
        :return: the rendered XML text.
        :raises ValueError: if ``sourceWeb`` is not a known source. Without
            this check the template values would be unbound and rendering
            would fail with a NameError.
        """
        if sourceWeb not in self._KNOWN_SOURCES:
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))

        # Request time and put-in time are the same formatted timestamp.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid', None)
        TaskName = audio.get('album_title', None)
        SoapTargetUri = self.soapTargetUri.format(sourceWeb=sourceWeb,
                                                  uuid=uuid)
        FileName = audio.get('audioDownloadDir', None)

        # TODO: sourceWeb is hard-coded below; this should be refactored.
        # Backslashes are doubled so the literals hold the same text without
        # relying on invalid escape sequences.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知'))
            CreatorName = audio.get('uploaderName', u'Crawler').strip()
            PgmNote = audio.get('fullDescs', u'描述未知')
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知'))
            CreatorName = audio.get('uploadUserName', u'Crawler').strip()
            PgmNote = audio.get('intro', u'描述未知')
        else:  # 'qt'
            CATALOGNAME = u'蜻蜓fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知类别'),
                album=audio.get('album_title', u'未知专辑'))
            CreatorName = u'蜻蜓fm,作者未知'
            PgmNote = audio.get('audioName', u'描述未知')

        return self.template.render(
            RequestID=uuid,
            RequestTime=timestamp,
            TaskGUID=uuid,
            PutinTime=timestamp,
            uuid=uuid,
            SoapTargetUri=SoapTargetUri,
            PGMNAME=TaskName,
            PGMGUID=uuid,
            Title=TaskName,
            CATALOGNAME=CATALOGNAME,
            CreatorName=CreatorName,
            PgmNote=PgmNote,
            FileName=FileName,
            TaskName=TaskName,
            firstplaytime=None,
            broadstarttime=None,
            broadendtime=None)
#coding=utf-8 __author__ = 'xiyuanbupt' import argparse from pymongo import MongoClient from statistics.fromLog import getScrapyStatusFromScrapyLog from conf_util import ConfUtil tClient = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort()) tDb = tClient[ConfUtil.getStatisticsDBName()] ''' 本脚本为在爬虫爬取相关数据之后通过日志统计相关信息 信息形式如下 {'downloader/request_bytes': 227847, 'downloader/request_count': 427, 'downloader/request_method_count/GET': 427, 'downloader/response_bytes': 799168, 'downloader/response_count': 427, 'downloader/response_status_count/200': 427, 'finish_reason': 'finished', 'finish_time': datetime.datetime(2016, 5, 3, 9, 7, 24, 34782), 'item_scraped_count': 6882, 'log_count/DEBUG': 7310, 'log_count/INFO': 16, 'request_depth_max': 3, 'response_received_count': 427, 'scheduler/dequeued': 427, 'scheduler/dequeued/memory': 427, 'scheduler/enqueued': 427, 'scheduler/enqueued/memory': 427,
#coding=utf-8 from __future__ import absolute_import ''' 对于删除文件的操作仅仅在服务器端执行有效 ''' from itertools import chain from os import path import os from os.path import getsize from pymongo import MongoClient from conf_util import ConfUtil from m_spider.settings import XMLY_SETTINGS,KL_SETTINGS client = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort()) #喜马拉雅中的一些查询函数 class XMLYUtil(): ''' 基于数据库统计xmly 目前资源的情况 ''' imagesDir = XMLY_SETTINGS['IMAGES_STORE'] filesDir = XMLY_SETTINGS['FILES_STORE'] def __init__(self): db = client[ConfUtil.getDBName()] self.album = db[ConfUtil.getXMLYAlbumCollectionName()] self.category = db[ConfUtil.getXMLYCategoryCollectionName()] def getAllAudioIdFormAlbum(self,album): albumId = album['album_id'] audios = album['audios']
#coding=utf-8 from __future__ import absolute_import ''' 对于删除文件的操作仅仅在服务器端执行有效 ''' from itertools import chain from os import path import os from os.path import getsize from pymongo import MongoClient from conf_util import ConfUtil from m_spider.settings import XMLY_SETTINGS, KL_SETTINGS client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort()) #喜马拉雅中的一些查询函数 class XMLYUtil(): ''' 基于数据库统计xmly 目前资源的情况 ''' imagesDir = XMLY_SETTINGS['IMAGES_STORE'] filesDir = XMLY_SETTINGS['FILES_STORE'] def __init__(self): db = client[ConfUtil.getDBName()] self.album = db[ConfUtil.getXMLYAlbumCollectionName()] self.category = db[ConfUtil.getXMLYCategoryCollectionName()]
#coding=utf-8 __author__ = 'xiyuanbupt' # e-mail : [email protected] import redis from conf_util import ConfUtil redis_pool = redis.ConnectionPool(host=ConfUtil.getRedisHost(), port=ConfUtil.getRedisPort(), db=ConfUtil.getRedisDb())
def testConfUtil(self):
    """Check the QT and KL audio directories reported by ConfUtil."""
    expectations = (
        ('getQTAudioDir', '/var/crawler/qt/audios/full'),
        ('getKLAudioDir', '/var/crawler/kl/audios/full'),
    )
    for getterName, expectedDir in expectations:
        self.assertEqual(getattr(ConfUtil, getterName)(), expectedDir)
def __init__(self):
    """Bind the KL album, category and audio collections of the module-level ``db``."""
    albumCollName = ConfUtil.getKLAlbumCollectionName()
    self.album = db[albumCollName]
    categoryCollName = ConfUtil.getKLCategoryCollectionName()
    self.category = db[categoryCollName]
    audioCollName = ConfUtil.getKLAudioCollectionName()
    self.audio = db[audioCollName]
def __init__(self):
    """Bind the QT album collection of the module-level ``db``."""
    albumCollName = ConfUtil.getQTAlbumCollectionName()
    self.album = db[albumCollName]
def __init__(self):
    """Bind the XMLY album, category and audio collections of the module-level ``db``."""
    albumCollName = ConfUtil.getXMLYAlbumCollectionName()
    self.album = db[albumCollName]
    categoryCollName = ConfUtil.getXMLYCategoryCollectionName()
    self.category = db[categoryCollName]
    audioCollName = ConfUtil.getXMLYAudioCollectionName()
    self.audio = db[audioCollName]
#coding=utf-8 __author__ = 'xiyuanbupt' # e-mail : [email protected] import redis from conf_util import ConfUtil redis_pool = redis.ConnectionPool( host = ConfUtil.getRedisHost(), port = ConfUtil.getRedisPort(), db = ConfUtil.getRedisDb() )
#coding=utf-8 from __future__ import absolute_import __author__ = 'xiyuanbupt' import datetime from xml.etree.ElementTree import Element from jinja2 import Environment, PackageLoader from pymongo import MongoClient import requests from conf_util import ConfUtil env = Environment(loader=PackageLoader('m_interact', 'templates')) ''' 用于向接口中推送数据,每天会在固定的时间启动一个 sender 进程,用来推送当前的数据 ''' from m_spider.settings import XMLY_SETTINGS, KL_SETTINGS, QT_SETTINGS client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort()) db = client[ConfUtil.getDBName()] from dbTool.tool import XMLYUtil, KLUtil, QTUtil class Sender: klAudio = db[ConfUtil.getKLAudioCollectionName()] xmlyAudio = db[ConfUtil.getKLAudioCollectionName()] qtAudio = db[ConfUtil.getQTAudioCollectionName()] template = env.get_template('sendTemp.xml') soapTargetUri = ConfUtil.getSoapTargetUri() def __init__(self): self.xmlyUtil = XMLYUtil() self.qtUtil = QTUtil()
class Sender:
    """Pushes audio documents whose media file is downloaded but not yet sent to CNR."""
    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    # Source sites getXMLContentFromAudio knows how to describe.
    _KNOWN_SOURCES = ('kl', 'xmly', 'qt')

    def __init__(self):
        pass

    def useJinja(self):
        """Render the ``sendTemp.xml`` template with a fixed sample PGMGUID."""
        template = env.get_template('sendTemp.xml')
        return template.render(PGMGUID='wwww')

    def _iterPending(self, coll):
        """Yield documents of ``coll`` with a media file that were not yet sent to CNR."""
        query = {
            "sendToCNRTime": None,
            "audioDownloadDir": {"$ne": None},
        }
        with coll.find(query) as cursor:
            for audio in cursor:
                yield audio

    def getXMLYAudioNotInCNRWithFile(self):
        """Iterate xmly audios with a downloaded media file that are not yet in CNR."""
        return self._iterPending(self.xmlyAudio)

    def getKLAudioNotInCNRWithFile(self):
        """Iterate kl audios with a downloaded media file that are not yet in CNR."""
        return self._iterPending(self.klAudio)

    def getQTAudioNotInCNRWithFile(self):
        """Iterate qt audios with a downloaded media file that are not yet in CNR."""
        return self._iterPending(self.qtAudio)

    def getXMLContentFromAudio(self, sourceWeb, audio):
        """Build the XML content for one audio document.

        :param sourceWeb: source site, one of 'kl', 'xmly' or 'qt'.
        :param audio: the audio document as read from the database (dict).
        :return: the rendered XML text.
        :raises ValueError: if ``sourceWeb`` is not a known source. Without
            this check the template values would be unbound and rendering
            would fail with a NameError.
        """
        if sourceWeb not in self._KNOWN_SOURCES:
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))

        # Request time and put-in time are the same formatted timestamp.
        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid', None)
        TaskName = audio.get('album_title', None)
        SoapTargetUri = self.soapTargetUri.format(
            sourceWeb=sourceWeb, uuid=uuid
        )
        FileName = audio.get('audioDownloadDir', None)

        # TODO: sourceWeb is hard-coded below; this should be refactored.
        # Backslashes are doubled so the literals hold the same text without
        # relying on invalid escape sequences.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知')
            )
            CreatorName = audio.get('uploaderName', u'北邮爬虫').strip()
            PgmNote = audio.get('fullDescs', u'描述未知')
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知')
            )
            CreatorName = audio.get('uploadUserName', u'北邮爬虫').strip()
            PgmNote = audio.get('intro', u'描述未知')
        else:  # 'qt'
            CATALOGNAME = u'蜻蜓fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知类别'),
                album=audio.get('album_title', u'未知专辑')
            )
            CreatorName = u'蜻蜓fm,作者未知'
            PgmNote = audio.get('audioName', u'描述未知')

        return self.template.render(
            RequestID=uuid,
            RequestTime=timestamp,
            TaskGUID=uuid,
            PutinTime=timestamp,
            uuid=uuid,
            SoapTargetUri=SoapTargetUri,
            PGMNAME=TaskName,
            PGMGUID=uuid,
            Title=TaskName,
            CATALOGNAME=CATALOGNAME,
            CreatorName=CreatorName,
            PgmNote=PgmNote,
            FileName=FileName,
            TaskName=TaskName,
            firstplaytime=None,
            broadstarttime=None,
            broadendtime=None
        )

    def sendXMLToCNR(self, xml):
        """POST the XML content to the configured CNR endpoint.

        :param xml: text to send; it is UTF-8 encoded before posting.
        :return: the response object returned by ``requests.post``.
        """
        headers = {'Content-Type': 'application/xml'}
        res = requests.post(ConfUtil.getCnrUri(), data=xml.encode('utf-8'),
                            headers=headers)
        return res

    def _pushPending(self, sourceWeb, label, coll, audios, limit,
                     funAfterPush, logger):
        """Push at most ``limit`` documents of one site and stamp each as sent."""
        for pushed, audio in enumerate(audios):
            if pushed >= limit:
                break
            xmlContent = self.getXMLContentFromAudio(sourceWeb, audio)
            # TODO: the response is not inspected yet; every push is treated
            # as successful.
            self.sendXMLToCNR(xmlContent)
            logger.info(u'send {0} uuid - {1}'.format(label, audio['uuid']))
            funAfterPush()
            # Record when the document was pushed to CNR.
            coll.update(
                {"_id": audio['_id']},
                {"$set": {"sendToCNRTime": datetime.datetime.now()}}
            )

    def getAudioPutToCNR(self, count=10000, funAfterPush=lambda: time.sleep(0)):
        """Push documents that have a media file but were not yet sent to CNR.

        Usable for a cold start or for routine runs. The budget is split
        evenly between xmly, kl and qt, which are processed in that order.

        :param count: maximum number of audios to push in this run
            (default 10000); each site gets ``count // 3``.
        :param funAfterPush: callable invoked after every push.
        """
        logger = logging.getLogger('sender')
        # Same amount of data for every site; floor division keeps an integer.
        perSite = count // 3
        self._pushPending('xmly', u'xmlyAudio', self.xmlyAudio,
                          self.getXMLYAudioNotInCNRWithFile(),
                          perSite, funAfterPush, logger)
        self._pushPending('kl', u'klAudio', self.klAudio,
                          self.getKLAudioNotInCNRWithFile(),
                          perSite, funAfterPush, logger)
        self._pushPending('qt', u'qtAudio', self.qtAudio,
                          self.getQTAudioNotInCNRWithFile(),
                          perSite, funAfterPush, logger)
def run(self):
    """Push one configured batch of audios to CNR, sleeping after each push."""
    pusher = Sender()
    pauseSeconds = ConfUtil.getSleepSecAgterPush()

    def pause():
        return time.sleep(pauseSeconds)

    pusher.getAudioPutToCNR(ConfUtil.getCnrSendCountOnce(), pause)
from m_interact.feedBack import FeedBack, HandleQTRe, HandleKLRe, HandleXMLYRe, ViewSummary from m_interact.sender import XXXSender, AllSender from m_interact.crawlerManager import XXXManager, QtTopnManager, XMLYTopnManager from conf_util import ConfUtil #在urlpatterns 中添加新的路由 urlpatterns = [ (r'/infoCrawler', FeedBack), (r'/toCNR/xmly/(\w+)', HandleXMLYRe), (r'/toCNR/kl/(\w+)', HandleKLRe), (r'/toCNR/qt/(\w+)', HandleQTRe), #对内的统计页面,资源整体情况描述 (r'/toStatistic/summary/', ViewSummary), # xmly 数据推送 url(r'/api/sender/vod/xmly', XXXSender, dict(collection=ConfUtil.getXMLYAudioCollectionName(), web_str='xmly')), url(r'/api/sender/vod/qt', XXXSender, dict(collection=ConfUtil.getQTAudioCollectionName(), web_str='qt')), url(r'/api/sender/vod/kl', XXXSender, dict(collection=ConfUtil.getKLAudioCollectionName(), web_str='kl')), url(r'/api/xmly/full', XXXManager, dict(process_name=ConfUtil.xmlyFullProcessName())), url(r'/api/qt/full', XXXManager, dict(process_name=ConfUtil.qtFullProcessName())), url(r'/api/xmly/increment', XXXManager, dict(process_name=ConfUtil.xmlyIncreProcessName())), url(r'/api/kl/increment', XXXManager, dict(process_name=ConfUtil.klIncreProcessName())), url(r'/api/qt/increment', XXXManager, dict(process_name=ConfUtil.qtIncreProcessName())),
def testGetSaveScrapyStatusFromLog(self):
    """Run getSaveScrapyStatusFromLog against the configured test log and crawler."""
    logPath = ConfUtil.getTestLogDir()
    crawlerName = ConfUtil.getTestCrawler()
    getSaveScrapyStatusFromLog(logPath, crawlerName)
class XXXManager(tornado.web.RequestHandler):
    """Handler that reports on, starts and stops one supervisor-managed crawler process."""
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = xmlrpclib.Server(ConfUtil.getSupervisorUri())
    supervisor = server.supervisor
    # Keys stripped from the supervisor process info before it is returned.
    status_del_tags = [
        'group', 'pid', 'stderr_logfile', 'stdout_logfile', 'logfile',
        'spawnerr'
    ]
    # Supervisor state code of a running process.
    RUNNING_STATE = 20

    def initialize(self, process_name):
        """Remember which supervisor process this handler instance controls."""
        self.process_name = process_name

    @run_on_executor
    def get_process_info(self, process_name):
        """Return the supervisor process info without the ``status_del_tags`` keys.

        :param process_name: supervisor name of the process.
        :return: the info mapping returned by ``getProcessInfo``.
        """
        status = self.supervisor.getProcessInfo(process_name)
        # pop() removes every listed key even when an earlier one is absent.
        for key in self.status_del_tags:
            status.pop(key, None)
        return status

    @run_on_executor
    def stop_process(self, process_name):
        """Send signal '2' (the Ctrl-C signal) to the process.

        :param process_name: supervisor name of the process.
        """
        self.supervisor.signalProcess(process_name, '2')

    @run_on_executor
    def force_stop_process(self, process_name):
        """Stop the process through supervisor's ``stopProcess``.

        :param process_name: supervisor name of the process.
        """
        self.supervisor.stopProcess(process_name)

    @run_on_executor
    def direct_start_process(self, process_name):
        """Start the process; intended for processes that need no interaction.

        :param process_name: supervisor name of the process.
        """
        self.supervisor.startProcess(process_name)

    @gen.coroutine
    def get(self, *args, **kwargs):
        """Write the current status of the crawler process."""
        status = yield self.get_process_info(self.process_name)
        self.write(status)

    @gen.coroutine
    def post(self, *args, **kwargs):
        """Start the crawler process unless it is already running.

        Writes ``{'status': 'fail', 'reason': 'stillrunning'}`` when the
        process is running, otherwise the refreshed process info with
        ``status == 'success'``.
        """
        status = yield self.get_process_info(self.process_name)
        if status['state'] == self.RUNNING_STATE:
            self.write({
                'status': 'fail',
                'reason': 'stillrunning'
            })
            return
        yield self.direct_start_process(self.process_name)
        status = yield self.get_process_info(self.process_name)
        status['status'] = 'success'
        self.write(status)

    @gen.coroutine
    def delete(self, *args, **kwargs):
        """Stop the crawler process if it is running.

        The optional JSON body may carry ``sigint`` (default true): when
        true the process receives the Ctrl-C signal, otherwise it is
        stopped through supervisor. A missing or malformed body is treated
        as empty.
        """
        status = yield self.get_process_info(self.process_name)
        if status['state'] != self.RUNNING_STATE:
            # 'fail' matches the status value that post() reports.
            status['status'] = 'fail'
            status['reason'] = 'Not running'
            self.write(status)
            return

        try:
            body = json.loads(self.request.body.decode('utf-8'))
        except ValueError:
            # Empty, undecodable or non-JSON body: fall back to the defaults.
            body = {}
        if not isinstance(body, dict):
            body = {}

        if body.get('sigint', True):
            yield self.stop_process(self.process_name)
        else:
            yield self.force_stop_process(self.process_name)
        self.write({"status": "success"})
#coding=utf-8 __author__ = 'xiyuanbupt' from collections import defaultdict,Counter from pymongo import MongoClient from conf_util import ConfUtil from statistics.fromDB import KaoLaStatistics,XmlyStatistics,QtStatistics from statistics.fromSys import getDirSize client = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort()) db = client[ConfUtil.getStatisticsDBName()] class Main: coll = db[ConfUtil.getStatisticCronPerHourCollection()] def __init__(self): self.klSt = KaoLaStatistics() self.qtSt = QtStatistics() self.xmlySt = XmlyStatistics() def runOnce(self): ''' 执行一次统计任务 :return: ''' kl = self.runForKL() qt = self.runForQt() xmly = self.runForXMLY() forInsert = dict(
def set_topn_n_and_topn_table(self, topn_n):
    """Store the xmly top-n size and a freshly timestamped table name via ``self.r``.

    :param topn_n: the n of the top-n run; also embedded in the table name.
    """
    store = self.r
    store.set(ConfUtil.xmly_topn_n_key(), topn_n)
    stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    tableName = 'xmly_top%d_%s' % (topn_n, stamp)
    store.set(ConfUtil.xmly_topn_table_key(), tableName)
def __init__(self):
    """Bind the KL album and category collections of the configured database."""
    database = client[ConfUtil.getDBName()]
    albumCollName = ConfUtil.getKLAlbumCollectionName()
    self.album = database[albumCollName]
    categoryCollName = ConfUtil.getKLCategoryCollectionName()
    self.category = database[categoryCollName]
#coding=utf-8 __author__ = 'xiyuanbupt' from collections import defaultdict from pymongo import MongoClient from conf_util import ConfUtil tmpClient = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort()) sDb = tmpClient[ConfUtil.getSpiderDBName()] class KaoLaStatistics: klAudio = sDb[ConfUtil.getKLAudioCollectionName()] klAlbum = sDb[ConfUtil.getKLAlbumCollectionName()] def __init__(self): self.perCategoryRes = None #获得每个类别下的 album 总数以及 audio 总数 def _getAlbum_AudioCountPerCategory(self): res = defaultdict() cursor = self.klAlbum.aggregate( [ { "$group":{ "_id":"$categoryName", "totalAudio":{"$sum":"$audioCounts"}, "totalAlbum":{"$sum":1}, } }
class XXXSender(tornado.web.RequestHandler):
    """Handler that downloads media for selected audios and pushes them to CNR."""
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    xmlyImgDownloader = FilesDownloader(ConfUtil.getXmlyImgDir())
    xmlyAudioDownloader = FilesDownloader(ConfUtil.getXmlyAudioDir())
    qtImgDownloader = FilesDownloader(ConfUtil.getQtImgDir())
    qtAudioDownloader = FilesDownloader(ConfUtil.getQtAudioDir())
    klImgDownloader = FilesDownloader(ConfUtil.getKlImgDir())
    klAudioDownloader = FilesDownloader(ConfUtil.getKlAudioDir())
    namespace = ("m", "urn:mpc")
    proxy = SOAPProxy(ConfUtil.getCnrUri(), namespace)
    redis = redis.Redis(connection_pool=redis_pool)

    def initialize(self, collection, web_str):
        """Remember the collection name and the site tag this route serves."""
        self.collection = collection
        self.web_str = web_str

    @gen.coroutine
    def get(self, *args, **kwargs):
        """Store a fixed sample hash under the redis key "foiiio" and echo it back."""
        user = {
            "Name": "Pradeep",
            "Company": "SCTL",
            "Address": "Mumbai",
            "Location": "RCP"
        }
        self.redis.hmset("foiiio", user)
        cache = self.redis.hgetall("foiiio")
        self.write(cache)

    @gen.coroutine
    def post(self, *args, **kwargs):
        """Push the audios named in the request body to the CNR interface.

        The JSON body carries ``_ids`` (required) and ``force_push``
        (default false). Without ``force_push`` only documents whose
        ``sendToCNRTime`` is None are selected. Ids that select no document
        are skipped; they show up as the difference between
        ``request_push_count`` and ``real_push_count`` in the response.

        :raises MissingArgumentError: if ``_ids`` is missing or empty.
        :raises UnSupportWebError: if the site is 'kl' or unknown.
        """
        data = json.loads(self.request.body.decode('utf-8'))
        _ids = data.get('_ids', None)
        force_push = data.get('force_push', False)
        if not _ids:
            raise MissingArgumentError("_ids")
        coll = self.application.db[self.collection]
        if force_push:
            found = yield [
                coll.find_one({"_id": ObjectId(_id)}) for _id in _ids
            ]
        else:
            found = yield [
                coll.find_one({
                    "_id": ObjectId(_id),
                    "sendToCNRTime": None
                }) for _id in _ids
            ]
        # find_one yields None for ids that match nothing. Drop those here so
        # the steps below never call .get() or index on None.
        audios = [audio for audio in found if audio is not None]

        # Media files are downloaded here when they are not present yet.
        if self.web_str == 'kl':
            # The kl site is down, so downloading from it is not offered.
            raise UnSupportWebError(self.web_str)
        elif self.web_str == 'xmly':
            # The separate media download process was removed, so all media
            # downloads happen in this handler.
            audios_url = [audio.get("play_path", None) for audio in audios]
            imgs_url = [audio.get("cover_url_142", None) for audio in audios]
            audiosInfo = yield [
                self.xmlyAudioDownloader.download_file(url)
                for url in audios_url
            ]
            imgsInfo = yield [
                self.xmlyImgDownloader.download_file(url) for url in imgs_url
            ]
        elif self.web_str == 'qt':
            audios_url = [audio.get("playUrl") for audio in audios]
            audiosInfo = yield [
                self.qtAudioDownloader.download_file(url)
                for url in audios_url
            ]
            # The crawler has no image url for qt, so every image is None.
            imgsInfo = [None] * len(audios)
        else:
            raise UnSupportWebError(self.web_str)

        # One (audio, audio file info, image file info) triple per document.
        triples = list(zip(audios, audiosInfo, imgsInfo))
        xmls = [
            xmlGenerator.getXMLContentFromAudio(self.web_str, triple)
            for triple in triples
        ]
        resps = yield [self.sendXMLToCNR(xml) for xml in xmls]
        # Record the time of the push on every pushed document.
        yield [
            coll.update({"_id": audio["_id"]},
                        {"$set": {
                            "sendToCNRTime": datetime.datetime.now()
                        }}) for audio in audios
        ]
        self.write({
            "audios": [audio.get('album_title') for audio in audios],
            "resps": ['success' if resp else 'fault' for resp in resps],
            "request_push_count": len(_ids),
            "real_push_count": len(xmls),
            "force_push": force_push,
        })

    @run_on_executor
    def sendXMLToCNR(self, xml):
        """Submit the XML content to CNR through the SOAP proxy.

        :param xml: the XML text to submit.
        :return: the value returned by the proxy's ``mpccommit`` call.
        """
        resp = self.proxy.mpccommit(strInput=xml)
        return resp
__author__ = 'xiyuanbupt'
import datetime
import logging
import logging.config
# Logging is configured from ./logger.ini as soon as this module is imported.
logging.config.fileConfig('./logger.ini')
from jinja2 import Environment,PackageLoader
from pymongo import MongoClient
import requests
import time,threading
from conf_util import ConfUtil
# Jinja environment that loads templates from the m_interact package.
env = Environment(loader=PackageLoader('m_interact','templates'))
# The string below says: pushes data to the interface; a sender process is
# started at a fixed time every day to push the current data.
''' 用于向接口中推送数据,每天会在固定的时间启动一个 sender 进程,用来推送当前的数据 '''
# Module-level Mongo client and database handle shared by the class below.
client = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort())
db = client[ConfUtil.getDBName()]
class Sender:
    """Holds the per-site audio collections and the XML template used for pushing."""
    # Collection handles for the kl, xmly and qt audio documents.
    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    # Template and target-URI pattern are resolved once, at class creation.
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()
    def __init__(self):
        pass
    def useJinja(self):
        """Render the ``sendTemp.xml`` template with a fixed sample PGMGUID."""
        template = env.get_template('sendTemp.xml')
        return template.render(PGMGUID = 'wwww')
def getSaveScrapyStatusFromLog(logfile, crawler):
    """Parse the scrapy status from a log file and store it in the crawl history.

    :param logfile: log file handed to ``getScrapyStatusFromScrapyLog``.
    :param crawler: value stored under the ``crawler`` key of the result.
    """
    status = getScrapyStatusFromScrapyLog(logfile)
    status['crawler'] = crawler
    historyName = ConfUtil.getCrawlHistoryCollectionName()
    tDb[historyName].insert(status)
class XMLGenerator:
    """Renders the ``sendTemp.xml`` push document for one crawled audio."""

    # Path at which the transcoding server mounts the shared disk.
    transcoding_mount = 'H:'
    # Path at which the crawler server mounts the shared disk.
    my_mount = '/var/crawler/cnr_shares'
    template = Environment(
        loader=PackageLoader('m_interact', 'templates')
    ).get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        pass

    def getXMLContentFromAudio(self, sourceWeb, audioInfo):
        '''
        Build the xml content for one audio.

        The catalog name, creator and description fields depend on sourceWeb.

        :param sourceWeb: 'kl', 'xmly' or 'qt'
        :param audioInfo: sequence ``(audio, audioFile, imgFile)``; ``audio``
            is the document taken from the site's collection, ``audioFile``
            and ``imgFile`` are mappings with a ``'path'`` entry, or None
        :return: the rendered xml
        :raises ValueError: when sourceWeb is not one of the supported sites
        '''
        # Fail early with a clear message; previously an unknown site only
        # printed a notice and then died on an unbound local variable.
        if sourceWeb not in ('kl', 'xmly', 'qt'):
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))

        audio = audioInfo[0]
        audioFile = audioInfo[1]
        imgFile = audioInfo[2]

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid', None)
        TaskName = audio.get('album_title', None)
        SoapTargetUri = self.soapTargetUri.format(
            sourceWeb=sourceWeb, uuid=uuid
        )

        # The downloaded file paths are read the same way for every site.
        AudioFileName = audioFile.get('path', None) if audioFile else None
        ImgFileName = imgFile.get('path', None) if imgFile else None

        # The site names are hard-coded below, so this part is not generic
        # and still needs refactoring.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知').strip(),
                album=audio.get('album_title', u'未知').strip()
            )
            CreatorName = audio.get('uploaderName', u'Crawler').strip()
            PgmNote = audio.get('fullDescs', u'描述未知').strip()
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知').strip(),
                album=audio.get('album_title', u'未知').strip()
            )
            CreatorName = audio.get('uploadUserName', u'Crawler').strip()
            PgmNote = audio.get('intro', u'描述未知')
        else:  # 'qt'
            CATALOGNAME = u'蜻蜓fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知类别').strip(),
                album=audio.get('album_title', u'未知专辑').strip()
            )
            CreatorName = u'蜻蜓fm,作者未知'
            PgmNote = audio.get('audioName', u'描述未知')

        # NOTE(review): presumably rewrites the path for the transcoding
        # server's mount -- confirm against changePathStyle.
        AudioFileName = changePathStyle(AudioFileName)
        ImgFileName = changePathStyle(ImgFileName)
        CATALOGNAME = u'网络爬取数据\\' + CATALOGNAME

        if ImgFileName:
            imgPath = self.transcoding_mount + ImgFileName
        else:
            # Fixed fallback image used when the audio has no cover.
            imgPath = "H:\\jpg\\2013127111320.png"

        return self.template.render(
            RequestID=uuid,
            RequestTime=timestamp,
            TaskGUID=uuid,
            PutinTime=timestamp,
            uuid=uuid,
            SoapTargetUri=SoapTargetUri,
            PGMNAME=TaskName,
            PGMGUID=uuid,
            Title=TaskName,
            CATALOGNAME=CATALOGNAME,
            CreatorName=CreatorName,
            PgmNote=PgmNote,
            AudioFileName=self.transcoding_mount + AudioFileName,
            ImgFileName=imgPath,
            TaskName=TaskName,
            firstplaytime=None,
            broadstarttime=None,
            broadendtime=None
        )
class Sender:
    """Reads crawled audios from Mongo and renders their CNR push xml."""

    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    # Fixed: this used to be bound to the KL collection by mistake.
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        self.xmlyUtil = XMLYUtil()
        self.qtUtil = QTUtil()
        self.klUtil = KLUtil()

    def useJinja(self):
        """Render ``sendTemp.xml`` with a fixed sample ``PGMGUID``."""
        template = env.get_template('sendTemp.xml')
        return template.render(PGMGUID='wwww')

    @staticmethod
    def _iterNotInCNRWithFile(collection):
        """Yield documents not yet pushed to CNR whose audio is downloaded."""
        cursor = collection.find({
            "sendToCNRTime": None,
            "audioDownloadDir": {
                "$ne": None
            }
        })
        for audio in cursor:
            yield audio

    # Read every item that has not been pushed to CNR and whose media file
    # has already been downloaded.
    def getXMLYAudioNotInCNRWithFile(self):
        '''
        Iterate over the xmly audios not yet pushed to CNR whose media file
        has already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.xmlyAudio)

    def getKLAudioNotInCNRWithFile(self):
        '''
        Iterate over the kl audios not yet pushed to CNR whose media file
        has already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.klAudio)

    def getXMLContent(self, sourceWeb, audio):
        '''
        Build the xml content for one audio document.

        :param sourceWeb: 'kl' or 'xmly' (no other site is handled here)
        :param audio: the audio document read from the site's collection
        :return: the rendered xml
        :raises ValueError: when sourceWeb is not 'kl' or 'xmly'
        '''
        # The site names are hard-coded below, so this part is not generic
        # and still needs refactoring.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category=audio.get('categoryName', u'未知'),
                album=audio.get('albumName', u'未知'))
            CreatorName = audio.get('uploaderName', u'北邮爬虫')
            PgmNote = audio.get('fullDescs', u'描述未知')
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知'))
            CreatorName = audio.get('uploadUserName', u'北邮爬虫')
            PgmNote = audio.get('intro', u'描述未知')
        else:
            # Previously this branch only printed a notice and the method
            # then failed on an unbound local variable.
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))

        timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid', None)
        TaskName = audio.get('album_title', None)

        return self.template.render(
            RequestID=uuid,
            RequestTime=timestamp,
            TaskGUID=uuid,
            PutinTime=timestamp,
            uuid=uuid,
            SoapTargetUri=self.soapTargetUri.format(sourceWeb=sourceWeb,
                                                    uuid=uuid),
            PGMNAME=TaskName,
            PGMGUID=uuid,
            Title=TaskName,
            CATALOGNAME=CATALOGNAME,
            CreatorName=CreatorName,
            PgmNote=PgmNote,
            FileName=audio.get('audioDownloadDir', None),
            TaskName=TaskName,
            firstplaytime=None,
            broadstarttime=None,
            broadendtime=None)

    def getAudioPutToCNR(self):
        '''
        Meant to fetch every audio not yet pushed to CNR whose file has been
        downloaded locally, push it to CNR and update its flag.
        Not implemented yet.
        '''
        pass