Example #1
0
class QtAudioSpider(scrapy.Spider):
    """Emit an AudioItem for every QT audio document without a download path.

    The spider sends one bootstrap request; when its response arrives,
    parse() queries MongoDB for documents whose 'audioDownloadDir' is None
    and yields one item per document.
    """
    name = "qt_audio"
    start_urls = ()
    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getDBName()]
    c_audio = db[ConfUtil.getQTAudioCollectionName()]
    custom_settings = get_project_settings().getdict('QT_SETTINGS')
    FILES_STORE_BASE = custom_settings['FILES_STORE']

    def start_requests(self):
        """Yield the single bootstrap request whose callback is parse()."""
        bootstrap = scrapy.Request('http://www.baidu.com', callback=self.parse)
        yield bootstrap

    def parse(self, response):
        """Yield an AudioItem for each document whose 'audioDownloadDir' is None.

        :param response: response of the bootstrap request; not inspected.
        """
        pending = self.c_audio.find({'audioDownloadDir': None})
        for doc in pending:
            item = AudioItem()
            item['_id'] = doc['_id']
            item['collection'] = ConfUtil.getQTAudioCollectionName()
            item['url'] = doc['playUrl']
            item['audio_base'] = self.FILES_STORE_BASE
            yield item
Example #2
0
 def testGetXmlContent(self):
     '''Print the XML produced for one KL and one XMLY audio document.'''
     xmly_coll = db[ConfUtil.getXMLYAudioCollectionName()]
     kl_coll = db[ConfUtil.getKLAudioCollectionName()]
     xmly_doc = xmly_coll.find_one()
     kl_doc = kl_coll.find_one()
     # KL is printed first, then XMLY.
     for web, doc in (('kl', kl_doc), ('xmly', xmly_doc)):
         print(self.sender.getXMLContent(web, doc))
Example #3
0
class AudioDownloader(FilesPipeline):
    '''
    Files pipeline that downloads audio items and records the stored file
    path and checksum on the item's MongoDB document.
    '''
    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getDBName()]

    def get_media_requests(self, item, info):
        '''
        Yield a download request for item['url'] when item.type is 'audio'.
        :param item: the scraped item.
        :param info: pipeline info object; not used here.
        '''
        if item.type == 'audio':
            yield scrapy.Request(item['url'])

    def item_completed(self, results, item, info):
        '''
        Called once the download has finished; stores the file path and
        checksum in MongoDB.

        results is a list of (success, detail) tuples, for example
        [(True, {'url': 'http://audio.xmcdn.com/group8/M04/37/6C/wKgDYVb_iwyzAqwqABd42Fth8jI393.m4a',
        'path': 'full/9668aa9324060a7d8e193b46b96257.m4a',
        'checksum': '79b63c45bef51ebac3aa5ab69018d0d9'})]

        :param results: download results for the requests of this item.
        :param item: the scraped item; must carry 'audio_base', 'collection'
            and '_id' when item.type is 'audio'.
        :param info: pipeline info object; not used here.
        :return: the item, unchanged.
        '''
        if item.type != 'audio':
            return item
        if not results:
            # No request was issued, so there is nothing to record.
            return item
        ok, detail = results[0]
        if not ok:
            # On failure the second element is not a dict with 'path' and
            # 'checksum'; leave the MongoDB document untouched.
            return item
        path = os.path.join(item['audio_base'], detail['path'])
        self.db[item['collection']].update(
            {'_id': item['_id']},
            {'$set': {
                'audioDownloadDir': path,
                'checksum': detail['checksum']
            }})
        return item
Example #4
0
 def __init__(self):
     '''Open the Motor database handle and initialise the Tornado application.'''
     motor_client = MotorClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
     self.db = motor_client[ConfUtil.getDBName()]
     app_settings = {
         'template_path': ConfUtil.getTemplatePath(),
         'static_path': ConfUtil.getStaticPath(),
         'debug': True,
     }
     tornado.web.Application.__init__(self, handlers=urlpatterns, **app_settings)
Example #5
0
 def __init__(self):
     '''Open the Motor database handle and initialise the Tornado application.'''
     mongo = MotorClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
     self.db = mongo[ConfUtil.getDBName()]
     options = {}
     options['template_path'] = ConfUtil.getTemplatePath()
     options['static_path'] = ConfUtil.getStaticPath()
     options['debug'] = True
     tornado.web.Application.__init__(self, handlers=urlpatterns, **options)
Example #6
0
def getSummary():
    '''
    Collect an overall description of the current database state and store it.

    Album count, total audio count and per-category counts are read for QT,
    XMLY and KL. The grand totals and the three per-site results are inserted
    as one document with type 'summary' into the collection named by
    ConfUtil.getCrontabDbCollectionName().
    :return: None
    '''
    xmlyUtil = XMLYUtil()
    klUtil = KLUtil()
    qtUtil = QTUtil()

    def describe(util):
        # Per-site figures, queried in the same order for every site.
        siteRes = {}
        siteRes['albumCount'] = util.getAlbumCount()
        siteRes['audioCount'] = util.getTotalAudioCount()
        siteRes['categoryWithCount'] = util.getAllCategoryWithCount()
        return siteRes

    qtRes = describe(qtUtil)
    xmlyRes = describe(xmlyUtil)
    klRes = describe(klUtil)
    res = dict(
        totalAlbumCount = qtRes['albumCount'] + xmlyRes['albumCount'] + klRes['albumCount'],
        # Fixed: the QT count used to be added twice and the XMLY count omitted.
        totalAudioCount = qtRes['audioCount'] + xmlyRes['audioCount'] + klRes['audioCount'],
        qtRes = qtRes,
        klRes = klRes,
        xmlyRes = xmlyRes,
        type = 'summary'
    )
    db[ConfUtil.getCrontabDbCollectionName()].insert(res)
Example #7
0
def getSummary():
    '''
    Collect an overall description of the current database state and store it.

    Album count, total audio count and per-category counts are read for QT,
    XMLY and KL. The grand totals and the three per-site results are inserted
    as one document with type 'summary' into the collection named by
    ConfUtil.getCrontabDbCollectionName().
    :return: None
    '''
    xmlyUtil = XMLYUtil()
    klUtil = KLUtil()
    qtUtil = QTUtil()

    qtRes = {}
    klRes = {}
    xmlyRes = {}
    # Same three figures for every site, queried QT -> XMLY -> KL.
    for siteRes, util in ((qtRes, qtUtil), (xmlyRes, xmlyUtil), (klRes, klUtil)):
        siteRes['albumCount'] = util.getAlbumCount()
        siteRes['audioCount'] = util.getTotalAudioCount()
        siteRes['categoryWithCount'] = util.getAllCategoryWithCount()

    totalAlbumCount = (qtRes['albumCount'] + xmlyRes['albumCount'] +
                       klRes['albumCount'])
    # Fixed: the QT count used to be added twice and the XMLY count omitted.
    totalAudioCount = (qtRes['audioCount'] + xmlyRes['audioCount'] +
                       klRes['audioCount'])
    res = dict(totalAlbumCount=totalAlbumCount,
               totalAudioCount=totalAudioCount,
               qtRes=qtRes,
               klRes=klRes,
               xmlyRes=xmlyRes,
               type='summary')
    db[ConfUtil.getCrontabDbCollectionName()].insert(res)
Example #8
0
    def runForQt(self):
        '''
        Run the QT statistics task.
        :return: mapping with the audio/image directory sizes, the album and
            audio totals and the per-category audio and album counts.
        '''
        audio_path, image_path = ConfUtil.getQTAudioDir(), ConfUtil.getQTImageDir()
        # defaultdict() without a factory behaves like a plain dict on lookup.
        stats = defaultdict()
        stats['totalAudioSize(bytes)'] = getDirSize(audio_path)
        stats['totalImageSize(bytes)'] = getDirSize(image_path)

        source = self.qtSt
        stats['totalAlbumCount'] = source.getAlbumCount()
        stats['totalAudioCount'] = source.getAudioCount()
        stats['audioCountPerCategory'] = source.getAudioCountPerCategory()
        stats['albumCountPerCategory'] = source.getAlbumCountPerCategory()
        return stats
Example #9
0
    def runForXMLY(self):
        '''
        Run the XMLY (Ximalaya) statistics task.
        :return: mapping with the audio/image directory sizes, the album and
            audio totals and the per-category audio and album counts.
        '''
        audio_path, image_path = ConfUtil.getXMLYAudioDir(), ConfUtil.getXmlyImageDir()
        # defaultdict() without a factory behaves like a plain dict on lookup.
        stats = defaultdict()
        stats['totalAudioSize(bytes)'] = getDirSize(audio_path)
        stats['totalImageSize(bytes)'] = getDirSize(image_path)

        source = self.xmlySt
        stats['totalAlbumCount'] = source.getAlbumCount()
        stats['totalAudioCount'] = source.getAudioCount()
        stats['audioCountPerCategory'] = source.getAudioCountPerCategory()
        stats['albumCountPerCategory'] = source.getAlbumCountPerCategory()
        return stats
Example #10
0
 def putVisitedAlbumToMongoSet(cls):
     '''
     Store the visited album information in the mongo database.

     NOTE(review): get_collection() is called without a collection name and
     its result is discarded; this looks like an unfinished stub - confirm
     the intended collection.
     :return: None
     '''
     conf = ConfUtil.getMongoConf()
     uri, database_name = conf
     database = pymongo.MongoClient(uri)[database_name]
     database.get_collection()
Example #11
0
 def get(self, uuid):
     '''
     Look up a QT audio document by uuid and write it as JSON.

     Responds with status 404 and a small HTML body when no document matches.
     :param uuid: value matched against the document's "uuid" field.
     '''
     coll = self.application.db[ConfUtil.getQTAudioCollectionName()]
     audio = yield coll.find_one({"uuid": uuid})
     # Identity test: find_one yields None when nothing matches.
     if audio is None:
         self.set_status(404, u'audioNotFound')
         self.finish(u"<html><body> Not Found </body></html>")
         return
     self.set_header('Content-Type', 'text/javascript')
     self.write(json.dumps(audio, default=json_util.default))
Example #12
0
 def sendXMLToCNR(self, xml):
     '''
     Push XML content to CNR.
     :param xml: XML text; it is UTF-8 encoded before being posted.
     :return: the response object returned by requests.post.
     '''
     target = ConfUtil.getCnrUri()
     payload = xml.encode('utf-8')
     return requests.post(target, data=payload,
                          headers={'Content-Type': 'application/xml'})
Example #13
0
    def runForKL(self):
        '''
        Run the KL (Kaola) statistics task.
        :return: mapping with the audio/image directory sizes, the album and
            audio totals and the per-category audio and album counts.
        '''
        # File sizes are measured on the file system.
        audio_path, image_path = ConfUtil.getKLAudioDir(), ConfUtil.getKLImageDir()
        # defaultdict() without a factory behaves like a plain dict on lookup.
        stats = defaultdict()
        stats['totalAudioSize(bytes)'] = getDirSize(audio_path)
        stats['totalImageSize(bytes)'] = getDirSize(image_path)

        # Album and media-file counts come from the database.
        source = self.klSt
        stats['totalAlbumCount'] = source.getAlbumCount()
        stats['totalAudioCount'] = source.getAudioCount()
        stats['audioCountPerCategory'] = source.getAudioCountPerCategory()
        stats['albumCountPerCategory'] = source.getAlbumCountPerCategory()
        return stats
Example #14
0
 def putVisitedAlbumToMongoSet(cls):
     '''
     Store the visited album information in the mongo database.

     NOTE(review): get_collection() is called without a collection name and
     its result is discarded; this looks like an unfinished stub - confirm
     the intended collection.
     :return: None
     '''
     uri, database_name = ConfUtil.getMongoConf()
     mongo = pymongo.MongoClient(uri)
     mongo[database_name].get_collection()
Example #15
0
class LiveImageDownloader(FilesPipeline):
    '''
    Files pipeline that downloads images for live items and records the
    stored file path and checksum on the item's MongoDB document.
    '''
    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())

    db = client[ConfUtil.getLiveDbName()]

    def get_media_requests(self, item, info):
        '''
        Yield a download request for item['url'] when item.type is 'live'.
        :param item: the scraped item.
        :param info: pipeline info object; not used here.
        '''
        print(item.type)
        if item.type == 'live':
            yield scrapy.Request(item['url'])

    def item_completed(self, results, item, info):
        '''
        Store the downloaded image path ('img') and checksum ('imgCheckSum')
        on the document identified by item['_id'].
        :param results: list of (success, detail) tuples for this item.
        :param item: the scraped item; must carry 'image_base', 'collection'
            and '_id' when item.type is 'live'.
        :param info: pipeline info object; not used here.
        :return: the item, unchanged.
        '''
        if item.type != 'live':
            return item
        if not results:
            # No request was issued, so there is nothing to record.
            return item
        ok, detail = results[0]
        if not ok:
            # On failure the second element is not a dict with 'path' and
            # 'checksum'; leave the MongoDB document untouched.
            return item
        path = os.path.join(item['image_base'], detail['path'])
        self.db[item['collection']].update(
            {'_id': item['_id']},
            {'$set': {
                'img': path,
                'imgCheckSum': detail['checksum']
            }})
        return item
Example #16
0
 def parse(self, response):
     '''
     Yield an AudioItem for each document whose 'audioDownloadDir' is None.
     :param response: the triggering response; not inspected.
     '''
     pending = self.c_audio.find({'audioDownloadDir': None})
     for doc in pending:
         item = AudioItem()
         item['_id'] = doc['_id']
         item['collection'] = ConfUtil.getQTAudioCollectionName()
         item['url'] = doc['playUrl']
         item['audio_base'] = self.FILES_STORE_BASE
         yield item
Example #17
0
class LiveImageSpider(scrapy.Spider):
    """Emit a LiveImageItem for every live document that has no 'img' field.

    The spider sends one bootstrap request; when its response arrives,
    parse() queries MongoDB and yields one item per matching document.
    """
    name = "live_image"
    client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
    db = client[ConfUtil.getLiveDbName()]
    live_coll = db[ConfUtil.getLiveCollectionName()]
    custom_settings = get_project_settings().getdict('LIVE_SETTINGS')
    FILES_STORE_BASE = custom_settings['FILES_STORE']

    live_coll_name = ConfUtil.getLiveCollectionName()

    start_urls = ()

    def start_requests(self):
        """Yield the single bootstrap request whose callback is parse()."""
        bootstrap = scrapy.Request('http://www.baidu.com', callback=self.parse)
        yield bootstrap

    def parse(self, response):
        """Yield a LiveImageItem for each document lacking an 'img' field.

        :param response: response of the bootstrap request; not inspected.
        """
        for record in self.live_coll.find({'img': {"$exists": False}}):
            item = LiveImageItem()
            item['_id'] = record['_id']
            item['collection'] = self.live_coll_name
            item['url'] = record['imgSrc']
            item['image_base'] = self.FILES_STORE_BASE
            yield item
Example #18
0
 def get(self, foo):
     '''
     Write one stored 'summary' document, without its '_id', to the response.

     Responds with status 404 when the collection holds no summary document
     (previously this case raised UnboundLocalError).

     NOTE(review): the cursor is sorted ascending by _id with limit(1), which
     selects the oldest summary although the variable is named `latest`;
     confirm whether a descending sort was intended.
     :param foo: captured URL argument; not used.
     '''
     coll = self.application.db[ConfUtil.getCrontabResultCollectionName()]
     latest = coll.find(
         {"type":"summary"}
     ).sort([("_id",1),]).limit(1)
     doc = None
     # limit(1) means at most one document, so a single fetch is enough.
     if (yield latest.fetch_next):
         doc = latest.next_object()
     if doc is None:
         self.set_status(404)
         self.finish(u"<html><body> Not Found </body></html>")
         return
     del doc['_id']
     self.write(doc)
Example #19
0
 def get(self, uuid):
     '''
     Look up a KL audio document by uuid and write it as JSON.

     Responds with status 404 and a small HTML body when no document matches.
     :param uuid: value matched against the document's "uuid" field.
     '''
     coll = self.application.db[ConfUtil.getKLAudioCollectionName()]
     audio = yield coll.find_one({"uuid": uuid})
     # Identity test: find_one yields None when nothing matches.
     if audio is None:
         self.set_status(404, u'audioNotFound')
         self.finish(u"<html><body> Not Found </body></html>")
         return
     self.set_header('Content-Type', 'text/javascript')
     self.write(json.dumps(audio, default=json_util.default))
Example #20
0
 def get(self, foo):
     '''
     Write one stored 'summary' document, without its '_id', to the response.

     Responds with status 404 when the collection holds no summary document
     (previously this case raised UnboundLocalError).

     NOTE(review): the cursor is sorted ascending by _id with limit(1), which
     selects the oldest summary although the variable is named `latest`;
     confirm whether a descending sort was intended.
     :param foo: captured URL argument; not used.
     '''
     coll = self.application.db[ConfUtil.getCrontabResultCollectionName()]
     latest = coll.find({
         "type": "summary"
     }).sort([
         ("_id", 1),
     ]).limit(1)
     doc = None
     # limit(1) means at most one document, so a single fetch is enough.
     if (yield latest.fetch_next):
         doc = latest.next_object()
     if doc is None:
         self.set_status(404)
         self.finish(u"<html><body> Not Found </body></html>")
         return
     del doc['_id']
     self.write(doc)
Example #21
0
from m_interact.feedBack import FeedBack,HandleQTRe,HandleKLRe,HandleXMLYRe,ViewSummary
from m_interact.sender import XXXSender,AllSender
from m_interact.crawlerManager import XXXManager,QtTopnManager,XMLYTopnManager
from conf_util import ConfUtil

#在urlpatterns 中添加新的路由
urlpatterns = [
    (r'/infoCrawler',FeedBack),
    (r'/toCNR/xmly/(\w+)',HandleXMLYRe),
    (r'/toCNR/kl/(\w+)',HandleKLRe),
    (r'/toCNR/qt/(\w+)',HandleQTRe),
    #对内的统计页面,资源整体情况描述
    (r'/toStatistic/summary/',ViewSummary),
    # xmly 数据推送
    url(r'/api/sender/vod/xmly',XXXSender,dict(collection = ConfUtil.getXMLYAudioCollectionName(),
                                               web_str='xmly'
                                               )),
    url(r'/api/sender/vod/qt',XXXSender,dict(collection = ConfUtil.getQTAudioCollectionName(),
                                             web_str = 'qt'
                                             )),
    url(r'/api/sender/vod/kl',XXXSender,dict(
        collection = ConfUtil.getKLAudioCollectionName(),
        web_str = 'kl'
    )),

    url(r'/api/xmly/full', XXXManager, dict(
        process_name = ConfUtil.xmlyFullProcessName()
    )),

    url(r'/api/qt/full', XXXManager, dict(
Example #22
0
class XMLGenerator:
    '''
    Renders the 'sendTemp.xml' template for an audio document.
    '''

    template = Environment(loader=PackageLoader(
        'm_interact', 'templates')).get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        pass

    def getXMLContentFromAudio(self, sourceWeb, audio):
        '''
        Build the XML content for one audio document.

        The catalog name, creator name, note and file name depend on
        sourceWeb.
        :param sourceWeb: one of 'kl', 'xmly' or 'qt'.
        :param audio: the audio document of that site as a dict, as read
            from the database.
        :return: the rendered template.
        :raises ValueError: when sourceWeb is not one of the known sites
            (previously this case failed with UnboundLocalError).
        '''

        now = datetime.datetime.now()
        timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid', None)
        TaskName = audio.get('album_title', None)
        SoapTargetUri = self.soapTargetUri.format(sourceWeb=sourceWeb,
                                                  uuid=uuid)
        # TODO: the per-site branches are hard-coded and should be refactored.
        # Backslashes are written as '\\' so the literals hold no invalid
        # escape sequences; the resulting strings are unchanged.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知'))
            CreatorName = audio.get('uploaderName', u'Crawler').strip()
            PgmNote = audio.get('fullDescs', u'描述未知')
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知'))
            CreatorName = audio.get('uploadUserName', u'Crawler').strip()
            PgmNote = audio.get('intro', u'描述未知')
        elif sourceWeb == 'qt':
            CATALOGNAME = u'蜻蜓fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知类别'),
                album=audio.get('album_title', u'未知专辑'))
            CreatorName = u'蜻蜓fm,作者未知'
            PgmNote = audio.get('audioName', u'描述未知')
        else:
            print(u'未知sourceWeb')
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))
        FileName = audio.get('audioDownloadDir', None)
        return self.template.render(RequestID=uuid,
                                    RequestTime=timestamp,
                                    TaskGUID=uuid,
                                    PutinTime=timestamp,
                                    uuid=uuid,
                                    SoapTargetUri=SoapTargetUri,
                                    PGMNAME=TaskName,
                                    PGMGUID=uuid,
                                    Title=TaskName,
                                    CATALOGNAME=CATALOGNAME,
                                    CreatorName=CreatorName,
                                    PgmNote=PgmNote,
                                    FileName=FileName,
                                    TaskName=TaskName,
                                    firstplaytime=None,
                                    broadstarttime=None,
                                    broadendtime=None)
Example #23
0
#coding=utf-8
__author__ = 'xiyuanbupt'
import argparse
from pymongo import MongoClient

from statistics.fromLog import getScrapyStatusFromScrapyLog
from conf_util import ConfUtil

# Module-level MongoDB client and handle on the statistics database,
# both configured through ConfUtil.
tClient = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort())
tDb = tClient[ConfUtil.getStatisticsDBName()]

'''
本脚本为在爬虫爬取相关数据之后通过日志统计相关信息
信息形式如下
    {'downloader/request_bytes': 227847,
     'downloader/request_count': 427,
     'downloader/request_method_count/GET': 427,
     'downloader/response_bytes': 799168,
     'downloader/response_count': 427,
     'downloader/response_status_count/200': 427,
     'finish_reason': 'finished',
     'finish_time': datetime.datetime(2016, 5, 3, 9, 7, 24, 34782),
     'item_scraped_count': 6882,
     'log_count/DEBUG': 7310,
     'log_count/INFO': 16,
     'request_depth_max': 3,
     'response_received_count': 427,
     'scheduler/dequeued': 427,
     'scheduler/dequeued/memory': 427,
     'scheduler/enqueued': 427,
     'scheduler/enqueued/memory': 427,
Example #24
0
#coding=utf-8
from __future__ import absolute_import
'''
对于删除文件的操作仅仅在服务器端执行有效
'''
from itertools import chain
from os import path
import os
from os.path import getsize

from pymongo import MongoClient
from conf_util import ConfUtil

from m_spider.settings import XMLY_SETTINGS,KL_SETTINGS
# Module-level MongoDB client configured through ConfUtil.
client = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort())

#喜马拉雅中的一些查询函数
class XMLYUtil():
    '''
    基于数据库统计xmly 目前资源的情况
    '''
    imagesDir = XMLY_SETTINGS['IMAGES_STORE']
    filesDir = XMLY_SETTINGS['FILES_STORE']
    def __init__(self):
        '''Bind the XMLY album and category collections of the configured database.'''
        database = client[ConfUtil.getDBName()]
        album_name = ConfUtil.getXMLYAlbumCollectionName()
        self.album = database[album_name]
        category_name = ConfUtil.getXMLYCategoryCollectionName()
        self.category = database[category_name]

    def getAllAudioIdFormAlbum(self,album):
        albumId = album['album_id']
        audios = album['audios']
Example #25
0
#coding=utf-8
from __future__ import absolute_import
'''
对于删除文件的操作仅仅在服务器端执行有效
'''
from itertools import chain
from os import path
import os
from os.path import getsize

from pymongo import MongoClient
from conf_util import ConfUtil

from m_spider.settings import XMLY_SETTINGS, KL_SETTINGS

# Module-level MongoDB client configured through ConfUtil.
client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())


#喜马拉雅中的一些查询函数
class XMLYUtil():
    '''
    Database-backed helper describing the XMLY resources currently stored.
    '''
    imagesDir = XMLY_SETTINGS['IMAGES_STORE']
    filesDir = XMLY_SETTINGS['FILES_STORE']

    def __init__(self):
        '''Bind the XMLY album and category collections of the configured database.'''
        database = client[ConfUtil.getDBName()]
        album_name = ConfUtil.getXMLYAlbumCollectionName()
        self.album = database[album_name]
        category_name = ConfUtil.getXMLYCategoryCollectionName()
        self.category = database[category_name]
Example #26
0
#coding=utf-8
__author__ = 'xiyuanbupt'
# e-mail : [email protected]

import redis

from conf_util import ConfUtil

# Module-level Redis connection pool; host, port and db number come from ConfUtil.
redis_pool = redis.ConnectionPool(host=ConfUtil.getRedisHost(),
                                  port=ConfUtil.getRedisPort(),
                                  db=ConfUtil.getRedisDb())
Example #27
0
 def testConfUtil(self):
     '''Check the configured QT and KL audio directories.'''
     qt_dir = ConfUtil.getQTAudioDir()
     self.assertEqual(qt_dir, '/var/crawler/qt/audios/full')
     kl_dir = ConfUtil.getKLAudioDir()
     self.assertEqual(kl_dir, '/var/crawler/kl/audios/full')
Example #28
0
 def __init__(self):
     '''Bind the KL album, category and audio collections.'''
     album_name = ConfUtil.getKLAlbumCollectionName()
     self.album = db[album_name]
     category_name = ConfUtil.getKLCategoryCollectionName()
     self.category = db[category_name]
     audio_name = ConfUtil.getKLAudioCollectionName()
     self.audio = db[audio_name]
Example #29
0
 def __init__(self):
     '''Bind the QT album collection.'''
     album_name = ConfUtil.getQTAlbumCollectionName()
     self.album = db[album_name]
Example #30
0
 def __init__(self):
     '''Bind the QT album collection.'''
     collection_name = ConfUtil.getQTAlbumCollectionName()
     self.album = db[collection_name]
Example #31
0
 def __init__(self):
     '''Bind the XMLY album, category and audio collections.'''
     album_name = ConfUtil.getXMLYAlbumCollectionName()
     self.album = db[album_name]
     category_name = ConfUtil.getXMLYCategoryCollectionName()
     self.category = db[category_name]
     audio_name = ConfUtil.getXMLYAudioCollectionName()
     self.audio = db[audio_name]
Example #32
0
#coding=utf-8
__author__ = 'xiyuanbupt'
# e-mail : [email protected]

import redis

from conf_util import ConfUtil

# Module-level Redis connection pool; host, port and db number come from ConfUtil.
redis_pool = redis.ConnectionPool(
    host = ConfUtil.getRedisHost(),
    port = ConfUtil.getRedisPort(),
    db = ConfUtil.getRedisDb()
)
Example #33
0
#coding=utf-8
from __future__ import absolute_import
__author__ = 'xiyuanbupt'
import datetime
from xml.etree.ElementTree import Element
from jinja2 import Environment, PackageLoader
from pymongo import MongoClient
import requests

from conf_util import ConfUtil
# Jinja2 environment loading templates from the 'templates' directory of
# the m_interact package.
env = Environment(loader=PackageLoader('m_interact', 'templates'))
'''
用于向接口中推送数据,每天会在固定的时间启动一个 sender 进程,用来推送当前的数据
'''
from m_spider.settings import XMLY_SETTINGS, KL_SETTINGS, QT_SETTINGS
# Module-level MongoDB client and database handle configured through ConfUtil.
client = MongoClient(ConfUtil.getMongoIP(), ConfUtil.getMongoPort())
db = client[ConfUtil.getDBName()]

from dbTool.tool import XMLYUtil, KLUtil, QTUtil


class Sender:
    '''
    Holds the per-site audio collections, the 'sendTemp.xml' template and
    the SOAP target URI used when sending audio data.
    '''
    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    # Fixed: this used to be bound to the KL audio collection name.
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        self.xmlyUtil = XMLYUtil()
        self.qtUtil = QTUtil()
Example #34
0
class Sender:
    '''
    Pushes audio documents whose media file has been downloaded to CNR as
    XML and stamps each pushed document with the push time.
    '''
    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        pass

    def useJinja(self):
        '''Render 'sendTemp.xml' with only PGMGUID set (to 'wwww').'''
        template = env.get_template('sendTemp.xml')
        return template.render(PGMGUID = 'wwww')

    def _iterNotInCNRWithFile(self, collection):
        '''
        Yield the documents of collection whose 'sendToCNRTime' is None and
        whose 'audioDownloadDir' is not None.
        '''
        with collection.find(
            {
                "sendToCNRTime":None,
                "audioDownloadDir":{"$ne":None}
            }
        ) as cursor:
            for audio in cursor:
                yield audio

    def getXMLYAudioNotInCNRWithFile(self):
        '''
        Yield every XMLY audio not yet pushed to CNR whose media file has
        already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.xmlyAudio)

    def getKLAudioNotInCNRWithFile(self):
        '''
        Yield every KL audio not yet pushed to CNR whose media file has
        already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.klAudio)

    def getQTAudioNotInCNRWithFile(self):
        '''
        Yield every QT audio not yet pushed to CNR whose media file has
        already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.qtAudio)

    def getXMLContentFromAudio(self,sourceWeb,audio):
        '''
        Build the XML content for one audio document.

        The catalog name, creator name, note and file name depend on
        sourceWeb.
        :param sourceWeb: one of 'kl', 'xmly' or 'qt'.
        :param audio: the audio document of that site as a dict, as read
            from the database.
        :return: the rendered template.
        :raises ValueError: when sourceWeb is not one of the known sites
            (previously this case failed with UnboundLocalError).
        '''
        now = datetime.datetime.now()
        timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid',None)
        TaskName = audio.get('album_title',None)
        SoapTargetUri = self.soapTargetUri.format(
            sourceWeb = sourceWeb,uuid = uuid
        )
        # TODO: the per-site branches are hard-coded and should be refactored.
        # Backslashes are written as '\\' so the literals hold no invalid
        # escape sequences; the resulting strings are unchanged.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category = audio.get('category_title',u'未知'),
                album = audio.get('album_title',u'未知')
            )
            CreatorName = audio.get('uploaderName',u'北邮爬虫').strip()
            PgmNote = audio.get('fullDescs',u'描述未知')
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category = audio.get('category_title',u'未知'),
                album = audio.get('album_title',u'未知')
            )
            CreatorName = audio.get('uploadUserName',u'北邮爬虫').strip()
            PgmNote = audio.get('intro',u'描述未知')
        elif sourceWeb == 'qt':
            CATALOGNAME = u'蜻蜓fm\\点播\\{category}\\{album}'.format(
                category = audio.get('category_title',u'未知类别'),
                album = audio.get('album_title',u'未知专辑')
            )
            CreatorName = u'蜻蜓fm,作者未知'
            PgmNote = audio.get('audioName',u'描述未知')
        else:
            print(u'未知sourceWeb')
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))
        FileName = audio.get('audioDownloadDir',None)
        return self.template.render(
            RequestID = uuid,
            RequestTime = timestamp,
            TaskGUID = uuid,
            PutinTime = timestamp,
            uuid = uuid,
            SoapTargetUri = SoapTargetUri,
            PGMNAME = TaskName,
            PGMGUID = uuid,
            Title = TaskName,
            CATALOGNAME = CATALOGNAME,
            CreatorName = CreatorName,
            PgmNote = PgmNote,
            FileName = FileName,
            TaskName = TaskName,
            firstplaytime = None,
            broadstarttime = None,
            broadendtime = None
        )

    def sendXMLToCNR(self,xml):
        '''
        Push XML content to CNR.
        :param xml: XML text; it is UTF-8 encoded before being posted.
        :return: the response object returned by requests.post.
        '''
        headers = {'Content-Type':'application/xml'}
        res = requests.post(ConfUtil.getCnrUri(),data=xml.encode('utf-8'),
                            headers = headers)
        return res

    def _pushAudios(self, sourceWeb, label, collection, audios, limit,
                    funAfterPush, logger):
        '''
        Push at most limit documents from audios to CNR and stamp each pushed
        document in collection with the push time.
        '''
        remaining = limit
        for audio in audios:
            remaining -= 1
            if remaining < 0:
                break
            xmlContent = self.getXMLContentFromAudio(sourceWeb, audio)
            # TODO: inspect the response status before marking the document.
            self.sendXMLToCNR(xmlContent)
            logger.info(
                u'send {0} uuid - {1}'.format(label, audio['uuid'])
            )
            funAfterPush()
            # Record when the document was pushed to CNR.
            collection.update(
                {
                    "_id":audio['_id']
                },
                {
                    "$set":{
                        "sendToCNRTime":datetime.datetime.now()
                    }
                }
            )

    def getAudioPutToCNR(self,count = 10000,funAfterPush = lambda:time.sleep(0)):
        '''
        Push audio that has a downloaded media file but has not been sent to
        CNR yet, and mark each pushed document with the push time. Usable for
        a cold start as well as for regular runs.
        :param count: upper bound on the number of audios pushed in this run
            (default 10000); each of the three sites gets a third of it.
        :param funAfterPush: callable invoked after every push.
        '''
        logger = logging.getLogger('sender')
        # Every site pushes the same amount; floor division keeps the quota
        # a whole number.
        perSite = count // 3
        sites = (
            ('xmly', u'xmlyAudio', self.xmlyAudio,
             self.getXMLYAudioNotInCNRWithFile),
            ('kl', u'klAudio', self.klAudio,
             self.getKLAudioNotInCNRWithFile),
            ('qt', u'qtAudio', self.qtAudio,
             self.getQTAudioNotInCNRWithFile),
        )
        for sourceWeb, label, collection, fetch in sites:
            self._pushAudios(sourceWeb, label, collection, fetch(), perSite,
                             funAfterPush, logger)
Example #35
0
 def run(self):
     '''Push pending audio to CNR, sleeping the configured time after each push.'''
     pusher = Sender()
     pause = ConfUtil.getSleepSecAgterPush()

     def sleepAfterPush():
         return time.sleep(pause)

     pusher.getAudioPutToCNR(ConfUtil.getCnrSendCountOnce(), sleepAfterPush)
Example #36
0
from m_interact.feedBack import FeedBack, HandleQTRe, HandleKLRe, HandleXMLYRe, ViewSummary
from m_interact.sender import XXXSender, AllSender
from m_interact.crawlerManager import XXXManager, QtTopnManager, XMLYTopnManager
from conf_util import ConfUtil

#在urlpatterns 中添加新的路由
urlpatterns = [
    (r'/infoCrawler', FeedBack),
    (r'/toCNR/xmly/(\w+)', HandleXMLYRe),
    (r'/toCNR/kl/(\w+)', HandleKLRe),
    (r'/toCNR/qt/(\w+)', HandleQTRe),
    #对内的统计页面,资源整体情况描述
    (r'/toStatistic/summary/', ViewSummary),
    # xmly 数据推送
    url(r'/api/sender/vod/xmly', XXXSender,
        dict(collection=ConfUtil.getXMLYAudioCollectionName(),
             web_str='xmly')),
    url(r'/api/sender/vod/qt', XXXSender,
        dict(collection=ConfUtil.getQTAudioCollectionName(), web_str='qt')),
    url(r'/api/sender/vod/kl', XXXSender,
        dict(collection=ConfUtil.getKLAudioCollectionName(), web_str='kl')),
    url(r'/api/xmly/full', XXXManager,
        dict(process_name=ConfUtil.xmlyFullProcessName())),
    url(r'/api/qt/full', XXXManager,
        dict(process_name=ConfUtil.qtFullProcessName())),
    url(r'/api/xmly/increment', XXXManager,
        dict(process_name=ConfUtil.xmlyIncreProcessName())),
    url(r'/api/kl/increment', XXXManager,
        dict(process_name=ConfUtil.klIncreProcessName())),
    url(r'/api/qt/increment', XXXManager,
        dict(process_name=ConfUtil.qtIncreProcessName())),
Example #37
0
 def testGetSaveScrapyStatusFromLog(self):
     '''Run getSaveScrapyStatusFromLog against the configured test log dir and crawler.'''
     log_dir = ConfUtil.getTestLogDir()
     crawler = ConfUtil.getTestCrawler()
     getSaveScrapyStatusFromLog(log_dir, crawler)
Example #38
0
class XXXManager(tornado.web.RequestHandler):
    '''
    Start/stop a crawler process and report its status.

    The supervisor process name this handler controls is supplied through
    initialize(). Supervisor is reached over XML-RPC; the blocking calls are
    wrapped with run_on_executor and use the class-level executor.
    '''
    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)
    # NOTE(review): a single proxy is shared by every executor thread --
    # confirm xmlrpclib.Server is safe for concurrent use.
    server = xmlrpclib.Server(ConfUtil.getSupervisorUri())
    supervisor = server.supervisor
    # Keys removed from the process info before it is returned to the client.
    status_del_tags = [
        'group', 'pid', 'stderr_logfile', 'stdout_logfile',
        'logfile', 'spawnerr'
    ]
    # status['state'] value the handlers treat as "process is running".
    STATE_RUNNING = 20


    def initialize(self, process_name):
        '''
        Remember which supervisor process this handler instance manages.
        :param process_name: supervisor process name
        '''
        self.process_name = process_name

    @run_on_executor
    def get_process_info(self,process_name):
        '''
        Fetch the process info from supervisor with the keys listed in
        status_del_tags removed.
        :param process_name: supervisor process name
        :return: the trimmed process info
        '''
        status = self.supervisor.getProcessInfo(process_name)
        for key in self.status_del_tags:
            # pop() with a default: a missing key must not stop the removal
            # of the remaining ones (the old try/del loop aborted on the
            # first missing key and swallowed every other error as well).
            status.pop(key, None)
        return status

    @run_on_executor
    def stop_process(self, process_name):
        '''
        Send the Ctrl-C signal (signal '2') to the process.
        :param process_name: supervisor process name
        :return:
        '''
        self.supervisor.signalProcess(process_name, '2')

    @run_on_executor
    def force_stop_process(self, process_name):
        '''
        Force the process to stop through supervisor.stopProcess.
        :param process_name: supervisor process name
        :return:
        '''
        self.supervisor.stopProcess(process_name)

    @run_on_executor
    def direct_start_process(self, process_name):
        '''
        Start the process; meant for processes that do not interact with
        each other.
        :param process_name: supervisor process name
        :return:
        '''
        self.supervisor.startProcess(process_name)

    @gen.coroutine
    def get(self, *args, **kwargs):
        '''
        Return the crawler status.
        :param args:
        :param kwargs:
        :return:
        '''
        status = yield self.get_process_info(self.process_name)
        self.write(status)

    @gen.coroutine
    def post(self, *args, **kwargs):
        '''
        Start the crawler; only for crawler processes without interaction.
        Answers with status 'fail' when the process is already running.
        :param args:
        :param kwargs:
        :return:
        '''
        status = yield self.get_process_info(self.process_name)
        if status['state'] == self.STATE_RUNNING:
            self.write(
                {
                    'status':'fail',
                    'reason':'stillrunning'
                }
            )
        else:
            yield self.direct_start_process(self.process_name)
            status = yield self.get_process_info(self.process_name)
            status['status'] = 'success'
            self.write(status)

    @gen.coroutine
    def delete(self, *args, **kwargs):
        '''
        Stop the crawler.

        The optional JSON body may carry a boolean "sigint" (default True):
        when true the process receives Ctrl-C, otherwise it is stopped
        through supervisor.stopProcess. A missing, malformed or non-object
        body is treated as an empty object.
        :param args:
        :param kwargs:
        :return:
        '''
        status = yield self.get_process_info(self.process_name)
        if status['state'] != self.STATE_RUNNING:
            # NOTE(review): 'faile' looks like a typo of 'fail' (post() uses
            # 'fail'); kept unchanged because clients may match on it.
            status['status'] = 'faile'
            status['reason'] = 'Not running'
            self.write(status)
            return

        try:
            body = json.loads(self.request.body.decode('utf-8'))
        except ValueError:
            # Covers both undecodable bytes and invalid JSON.
            body = {}
        if not isinstance(body, dict):
            # e.g. a JSON list or null: there are no options to read.
            body = {}

        if body.get('sigint', True):
            yield self.stop_process(self.process_name)
        else:
            yield self.force_stop_process(self.process_name)
        self.write({"status":"success"})
Example #39
0
#coding=utf-8
__author__ = 'xiyuanbupt'

from collections import defaultdict,Counter

from pymongo import MongoClient

from conf_util import ConfUtil
from statistics.fromDB import KaoLaStatistics,XmlyStatistics,QtStatistics
from statistics.fromSys import getDirSize

# Module-level MongoDB client and handle to the statistics database,
# both configured through ConfUtil.
client = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort())
db = client[ConfUtil.getStatisticsDBName()]

class Main:
    coll = db[ConfUtil.getStatisticCronPerHourCollection()]

    def __init__(self):
        '''
        Create one statistics helper per source: KaoLa, Qt and Xmly.
        '''
        self.klSt = KaoLaStatistics()
        self.qtSt = QtStatistics()
        self.xmlySt = XmlyStatistics()

    def runOnce(self):
        '''
        执行一次统计任务
        :return:
        '''
        kl = self.runForKL()
        qt = self.runForQt()
        xmly = self.runForXMLY()
        forInsert = dict(
Example #40
0
 def set_topn_n_and_topn_table(self,topn_n):
     '''
     Store topn_n under the xmly top-n key, then store a table name built
     from topn_n and the current timestamp under the xmly top-n table key.
     :param topn_n: the N of "top N"; formatted with %d into the table name
     '''
     store = self.r
     store.set(ConfUtil.xmly_topn_n_key(), topn_n)
     stamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
     store.set(
         ConfUtil.xmly_topn_table_key(),
         'xmly_top%d_%s' % (topn_n, stamp),
     )
Example #41
0
 def __init__(self):
     '''
     Bind the KL album and KL category collections of the configured database.
     '''
     database = client[ConfUtil.getDBName()]
     albumName = ConfUtil.getKLAlbumCollectionName()
     self.album = database[albumName]
     categoryName = ConfUtil.getKLCategoryCollectionName()
     self.category = database[categoryName]
Example #42
0
#coding=utf-8
__author__ = 'xiyuanbupt'

from collections import defaultdict

from pymongo import MongoClient

from conf_util import ConfUtil

# Module-level MongoDB client and handle to the spider database,
# both configured through ConfUtil.
tmpClient = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort())
sDb = tmpClient[ConfUtil.getSpiderDBName()]

class KaoLaStatistics:
    klAudio = sDb[ConfUtil.getKLAudioCollectionName()]
    klAlbum = sDb[ConfUtil.getKLAlbumCollectionName()]

    def __init__(self):
        '''
        Start with no per-category result.
        '''
        # presumably filled later with the per-category aggregation -- confirm
        self.perCategoryRes = None

    #获得每个类别下的 album 总数以及 audio 总数
    def _getAlbum_AudioCountPerCategory(self):
        res = defaultdict()
        cursor = self.klAlbum.aggregate(
            [
                {
                    "$group":{
                        "_id":"$categoryName",
                        "totalAudio":{"$sum":"$audioCounts"},
                        "totalAlbum":{"$sum":1},
                    }
                }
Example #43
0
class XXXSender(tornado.web.RequestHandler):
    '''
    Push audio records of one source site to the CNR SOAP interface.

    The Mongo collection name and the site tag ('xmly', 'qt' or 'kl') are
    supplied through initialize().
    '''

    executor = ThreadPoolExecutor(max_workers=MAX_WORKERS)

    # One downloader per site and media kind, each bound to its configured dir.
    xmlyImgDownloader = FilesDownloader(ConfUtil.getXmlyImgDir())
    xmlyAudioDownloader = FilesDownloader(ConfUtil.getXmlyAudioDir())
    qtImgDownloader = FilesDownloader(ConfUtil.getQtImgDir())
    qtAudioDownloader = FilesDownloader(ConfUtil.getQtAudioDir())
    klImgDownloader = FilesDownloader(ConfUtil.getKlImgDir())
    klAudioDownloader = FilesDownloader(ConfUtil.getKlAudioDir())

    namespace = ("m", "urn:mpc")
    proxy = SOAPProxy(ConfUtil.getCnrUri(), namespace)

    redis = redis.Redis(connection_pool=redis_pool)

    def initialize(self, collection, web_str):
        '''
        :param collection: name of the Mongo collection holding the audios
        :param web_str: site tag, one of 'xmly', 'qt', 'kl'
        '''
        self.collection = collection
        self.web_str = web_str

    @gen.coroutine
    def get(self, *args, **kwargs):
        '''
        Write a fixed sample hash to redis under "foiiio" and echo it back.
        NOTE(review): looks like a redis smoke test rather than a real
        endpoint -- confirm before relying on it.
        '''
        user = {
            "Name": "Pradeep",
            "Company": "SCTL",
            "Address": "Mumbai",
            "Location": "RCP"
        }
        self.redis.hmset("foiiio", user)
        cache = self.redis.hgetall("foiiio")
        self.write(cache)

    # Push data to the CNR (Sobey) interface
    @gen.coroutine
    def post(self, *args, **kwargs):
        '''
        Push the audios named in the JSON body to CNR.

        Body fields:
            _ids        list of ObjectId strings (required)
            force_push  when true, also push audios that already carry a
                        sendToCNRTime (default False)

        Ids that match no document (unknown id, or already pushed while
        force_push is false) are skipped; the response reports how many
        audios were requested and how many were really pushed.

        :raises MissingArgumentError: when _ids is missing or empty
        :raises UnSupportWebError: for the 'kl' site and any unknown site
        '''
        data = json.loads(self.request.body.decode('utf-8'))
        _ids = data.get('_ids', None)
        force_push = data.get('force_push', False)
        if not _ids:
            raise MissingArgumentError("_ids")
        coll = self.application.db[self.collection]
        # Without force_push only audios never sent to CNR are selected.
        extra_filter = {} if force_push else {"sendToCNRTime": None}
        found = yield [
            coll.find_one(dict(extra_filter, _id=ObjectId(_id)))
            for _id in _ids
        ]
        # find_one yields None for a miss. Drop misses here so that every
        # later step (download, XML, DB update, response) sees real audios
        # only; previously a single miss crashed the request.
        audios = [audio for audio in found if audio]

        # Media files are downloaded here because the separate download
        # process was removed.
        if self.web_str == 'kl':
            # The kl site is down, so kl downloads are not offered for now.
            raise UnSupportWebError(self.web_str)
        elif self.web_str == 'xmly':
            audios_url = [audio.get("play_path", None) for audio in audios]
            imgs_url = [audio.get("cover_url_142", None) for audio in audios]
            audiosInfo = yield [
                self.xmlyAudioDownloader.download_file(url)
                for url in audios_url
            ]
            imgsInfo = yield [
                self.xmlyImgDownloader.download_file(url) for url in imgs_url
            ]
        elif self.web_str == 'qt':
            audios_url = [audio.get("playUrl") for audio in audios]
            audiosInfo = yield [
                self.qtAudioDownloader.download_file(url) for url in audios_url
            ]
            # The crawler does not collect image urls for qt, so no images.
            imgsInfo = [None for audio in audios]
        else:
            raise UnSupportWebError(self.web_str)

        # One (audio, audio file info, image file info) triple per audio.
        bundles = zip(audios, audiosInfo, imgsInfo)

        xmls = [
            xmlGenerator.getXMLContentFromAudio(self.web_str, bundle)
            for bundle in bundles if bundle[0]
        ]
        resps = yield [self.sendXMLToCNR(xml) for xml in xmls]
        # Record the push time in the database.
        # NOTE(review): the time is stored even when the matching response is
        # falsy ('fault' in the reply) -- confirm this is intended.
        yield [
            coll.update({"_id": audio["_id"]},
                        {"$set": {
                            "sendToCNRTime": datetime.datetime.now()
                        }}) for audio in audios
        ]
        self.write({
            "audios": [audio.get('album_title') for audio in audios],
            "resps": ['success' if resp else 'fault' for resp in resps],
            "request_push_count": len(_ids),
            "real_push_count": len(xmls),
            "force_push": force_push,
        })

    @run_on_executor
    def sendXMLToCNR(self, xml):
        '''
        Push the xml content to CNR through the SOAP proxy.
        :param xml: rendered XML document
        :return: whatever proxy.mpccommit returns
        '''
        resp = self.proxy.mpccommit(strInput=xml)
        return resp
Example #44
0
__author__ = 'xiyuanbupt'
import datetime
import logging
import logging.config
logging.config.fileConfig('./logger.ini')
from jinja2 import Environment,PackageLoader
from pymongo import MongoClient
import requests
import time,threading

from conf_util import ConfUtil
env = Environment(loader=PackageLoader('m_interact','templates'))
'''
用于向接口中推送数据,每天会在固定的时间启动一个 sender 进程,用来推送当前的数据
'''
# Module-level MongoDB client and database handle, configured through ConfUtil.
client = MongoClient(ConfUtil.getMongoIP(),ConfUtil.getMongoPort())
db = client[ConfUtil.getDBName()]

class Sender:
    '''
    Holds the per-site audio collections, the send template and the SOAP
    target URI used when pushing data.
    '''
    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        '''Nothing to set up per instance.'''

    def useJinja(self):
        '''
        Render sendTemp.xml with PGMGUID set to a fixed sample value.
        :return: the rendered template text
        '''
        return env.get_template('sendTemp.xml').render(PGMGUID = 'wwww')
Example #45
0
def getSaveScrapyStatusFromLog(logfile,crawler):
    '''
    Read the scrapy status out of a log file, tag it with the crawler name
    and insert it into the crawl-history collection.
    :param logfile: passed to getScrapyStatusFromScrapyLog
    :param crawler: value stored under the 'crawler' key
    '''
    record = getScrapyStatusFromScrapyLog(logfile)
    record['crawler'] = crawler
    tDb[ConfUtil.getCrawlHistoryCollectionName()].insert(record)
Example #46
0
 def __init__(self):
     '''
     Bind the KL album and KL category collections of the configured database.
     '''
     database = client[ConfUtil.getDBName()]
     albumName = ConfUtil.getKLAlbumCollectionName()
     self.album = database[albumName]
     categoryName = ConfUtil.getKLCategoryCollectionName()
     self.category = database[categoryName]
Example #47
0
class XMLGenerator:
    '''
    Render the XML document pushed to CNR for one audio record.
    '''

    # Path where the shared disk is mounted on the transcoding server
    transcoding_mount = 'H:'
    # Path where the shared disk is mounted on the crawler server
    my_mount = '/var/crawler/cnr_shares'
    # Image path sent when the audio has no image file of its own
    default_img_file = "H:\\jpg\\2013127111320.png"

    template = Environment(loader=PackageLoader('m_interact','templates')).get_template(
        'sendTemp.xml'
    )
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        pass

    def _field(self, audio, key, default):
        '''Return audio[key], or default when the key is missing or None.'''
        value = audio.get(key)
        return default if value is None else value

    def getXMLContentFromAudio(self,sourceWeb,audioInfo):
        '''
        Build the xml content for one audio.

        :param sourceWeb: 'kl', 'xmly' or 'qt'; selects the site-specific
            catalog name, creator and description fields
        :param audioInfo: sequence of (audio, audioFile, imgFile) where audio
            is the site's audio dict as read from the database and the two
            file entries are dicts with a 'path' key, or a falsy value when
            there is no such file
        :return: the rendered xml text
        :raises ValueError: when sourceWeb is not one of the known sites
        '''
        audio = audioInfo[0]
        audioFile = audioInfo[1]
        imgFile = audioInfo[2]

        now = datetime.datetime.now()
        timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid',None)
        TaskName = audio.get('album_title',None)
        SoapTargetUri = self.soapTargetUri.format(
            sourceWeb = sourceWeb,uuid = uuid
        )

        # The site names are still hard-coded here, so this is not generic;
        # it is due for a refactor.
        if sourceWeb == 'kl':
            siteName = u'考拉fm'
            unknownCategory = unknownAlbum = u'未知'
            CreatorName = self._field(audio, 'uploaderName', u'Crawler').strip()
            PgmNote = self._field(audio, 'fullDescs', u'描述未知').strip()
        elif sourceWeb == 'xmly':
            siteName = u'喜马拉雅fm'
            unknownCategory = unknownAlbum = u'未知'
            CreatorName = self._field(audio, 'uploadUserName', u'Crawler').strip()
            PgmNote = self._field(audio, 'intro', u'描述未知')
        elif sourceWeb == 'qt':
            siteName = u'蜻蜓fm'
            unknownCategory = u'未知类别'
            unknownAlbum = u'未知专辑'
            CreatorName = u'蜻蜓fm,作者未知'
            PgmNote = self._field(audio, 'audioName', u'描述未知')
        else:
            # Previously this only printed a message and then failed with an
            # UnboundLocalError a few lines later; fail clearly instead.
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))

        CATALOGNAME = u'网络爬取数据\\{site}\\点播\\{category}\\{album}'.format(
            site = siteName,
            category = self._field(audio, 'category_title', unknownCategory).strip(),
            album = self._field(audio, 'album_title', unknownAlbum).strip()
        )

        # The file paths are read the same way for every site.
        AudioFileName = changePathStyle(
            audioFile.get('path',None) if audioFile else None
        )
        ImgFileName = changePathStyle(
            imgFile.get('path',None) if imgFile else None
        )

        # NOTE(review): unlike the image there is no fallback for a missing
        # audio path; the concatenation below raises if AudioFileName is
        # None -- presumably callers guarantee a downloaded file, confirm.
        xmlContent = self.template.render(
            RequestID = uuid,
            RequestTime = timestamp,
            TaskGUID = uuid,
            PutinTime = timestamp,
            uuid = uuid,
            SoapTargetUri = SoapTargetUri,
            PGMNAME = TaskName,
            PGMGUID = uuid,
            Title = TaskName,
            CATALOGNAME = CATALOGNAME,
            CreatorName = CreatorName,
            PgmNote = PgmNote,
            AudioFileName = self.transcoding_mount + AudioFileName,
            ImgFileName = (self.transcoding_mount + ImgFileName) if ImgFileName else self.default_img_file,
            TaskName = TaskName,
            firstplaytime = None,
            broadstarttime = None,
            broadendtime = None
        )

        return xmlContent
Example #48
0
class Sender:
    '''
    Reads audios that still have to be pushed to CNR and renders the xml
    sent for each of them.
    '''
    klAudio = db[ConfUtil.getKLAudioCollectionName()]
    # Bound to the XMLY collection; it used to be bound to the KL collection
    # by mistake, so the xmly query actually read kl data.
    xmlyAudio = db[ConfUtil.getXMLYAudioCollectionName()]
    qtAudio = db[ConfUtil.getQTAudioCollectionName()]
    template = env.get_template('sendTemp.xml')
    soapTargetUri = ConfUtil.getSoapTargetUri()

    def __init__(self):
        self.xmlyUtil = XMLYUtil()
        self.qtUtil = QTUtil()
        self.klUtil = KLUtil()

    def useJinja(self):
        '''
        Render sendTemp.xml with PGMGUID set to a fixed sample value.
        '''
        template = env.get_template('sendTemp.xml')
        return template.render(PGMGUID='wwww')

    def _iterNotInCNRWithFile(self, collection):
        '''
        Yield every audio of collection that has no sendToCNRTime but does
        have an audioDownloadDir.
        '''
        cursor = collection.find({
            "sendToCNRTime": None,
            "audioDownloadDir": {
                "$ne": None
            }
        })
        for audio in cursor:
            yield audio

    # Read from the database every item that has not been pushed to CNR yet
    # and whose media file has already been downloaded
    def getXMLYAudioNotInCNRWithFile(self):
        '''
        Yield all xmly audios not yet pushed to CNR whose media file has
        already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.xmlyAudio)

    def getKLAudioNotInCNRWithFile(self):
        '''
        Yield all kl audios not yet pushed to CNR whose media file has
        already been downloaded.
        '''
        return self._iterNotInCNRWithFile(self.klAudio)

    def getXMLContent(self, sourceWeb, audio):
        '''
        Build the xml content for one audio.

        :param sourceWeb: 'kl' or 'xmly'; selects the site-specific fields
        :param audio: the site's audio dict as read from the database
        :return: the rendered xml text
        :raises ValueError: when sourceWeb is not a supported site
        '''
        now = datetime.datetime.now()
        timestamp = now.strftime("%Y-%m-%d %H:%M:%S")
        uuid = audio.get('uuid', None)
        TaskName = audio.get('album_title', None)
        SoapTargetUri = self.soapTargetUri.format(sourceWeb=sourceWeb,
                                                  uuid=uuid)
        # The site names are still hard-coded here, so this is not generic;
        # it is due for a refactor.
        if sourceWeb == 'kl':
            CATALOGNAME = u'考拉fm\\点播\\{category}\\{album}'.format(
                category=audio.get('categoryName', u'未知'),
                album=audio.get('albumName', u'未知'))
            CreatorName = audio.get('uploaderName', u'北邮爬虫')
            PgmNote = audio.get('fullDescs', u'描述未知')
        elif sourceWeb == 'xmly':
            CATALOGNAME = u'喜马拉雅fm\\点播\\{category}\\{album}'.format(
                category=audio.get('category_title', u'未知'),
                album=audio.get('album_title', u'未知'))
            CreatorName = audio.get('uploadUserName', u'北邮爬虫')
            PgmNote = audio.get('intro', u'描述未知')
        else:
            # Previously this only printed a message and then failed with an
            # UnboundLocalError in the render call; fail clearly instead.
            raise ValueError('unknown sourceWeb: %r' % (sourceWeb,))
        # Both supported sites keep the downloaded file under the same key.
        FileName = audio.get('audioDownloadDir', None)
        xmlContent = self.template.render(RequestID=uuid,
                                          RequestTime=timestamp,
                                          TaskGUID=uuid,
                                          PutinTime=timestamp,
                                          uuid=uuid,
                                          SoapTargetUri=SoapTargetUri,
                                          PGMNAME=TaskName,
                                          PGMGUID=uuid,
                                          Title=TaskName,
                                          CATALOGNAME=CATALOGNAME,
                                          CreatorName=CreatorName,
                                          PgmNote=PgmNote,
                                          FileName=FileName,
                                          TaskName=TaskName,
                                          firstplaytime=None,
                                          broadstarttime=None,
                                          broadendtime=None)

        return xmlContent

    def getAudioPutToCNR(self):
        '''
        Meant to fetch every audio not yet pushed to CNR whose file is
        already downloaded locally, push it to CNR and update its flag.
        Not implemented yet.
        '''
        pass