Example #1
0
def checkKeyDir(f):
    """Check that one key directory contains every file a complete item needs.

    Args:
        f: an ``os.walk``-style triple ``(dirPath, dirNames, fileNames)``.
            Only ``f[0]`` (directory path) and ``f[2]`` (file names) are read.

    Returns:
        A ``(status, error)`` tuple. ``status`` is ``'IGNORE'`` when
        ``meta.json`` carries a true ``isIgnore`` flag (``error`` is then
        empty), ``'ERROR'`` when at least one required file is missing, and
        ``'OK'`` otherwise. ``error`` holds one message per missing file.
    """
    dirPath, fileNames = f[0], f[2]
    error = []
    meta = None

    if 'meta.json' not in fileNames:
        error.append("no meta file !")
    else:
        # Load the meta file once and reuse it for the embPics check below;
        # never try to load it when the directory listing says it is absent.
        meta = util.loadJsonFile(os.path.join(dirPath, 'meta.json'))
        # NOTE(review): loadJsonFile presumably returns a falsy value when
        # the file cannot be parsed - hence the ``meta and`` guard; confirm.
        if meta and 'isIgnore' in meta and meta['isIgnore'] == True:
            return 'IGNORE', []

    if 'content.html' not in fileNames:
        error.append("no content file !")
    if 'content.mp3' not in fileNames:
        error.append("no mp3 file !")
    if '__zipPic__.jpg' not in fileNames and '__zipPic__.png' not in fileNames:
        error.append("no zipPic file !")

    if meta:
        # Every embedded picture listed in the meta file must be on disk.
        for picName in meta['embPics']:
            picFile = util.getUrlFileName(picName)
            if picFile not in fileNames:
                error.append("no embPic %s !" % picFile)

    status = 'ERROR' if error else 'OK'
    return status, error
Example #2
0
    def doOneTask(self, task):
        """Fetch the resource for ``task`` and put the task on ``self.outQueue``.

        ``'page'`` tasks are fetched with ``self.fetcher.keepFetchRead``;
        ``'media'`` tasks are downloaded with ``self.fetcher.download`` to a
        path below ``iPapa.iTsOutputPath``. A ``None`` result marks the task
        ``'failed'``; any other result is stored in ``task['__data']`` and the
        task is marked ``'downed'``. The task is always queued afterwards.

        Args:
            task: task object exposing ``taskType``, ``url``, ``postdata``,
                ``timeout``, ``tryTimes``, ``id`` and ``dest`` attributes and
                supporting item assignment.
        """
        self.signTask(task)
        task.status = 'down'

        # Stays None for an unrecognised taskType, so such a task is reported
        # as failed and still queued instead of raising UnboundLocalError.
        data = None
        if task.taskType == 'page':
            data = self.fetcher.keepFetchRead(task.url, task.postdata,
                                              task.timeout, task.tryTimes,
                                              False)
        elif task.taskType == 'media':
            if task.dest == '':
                # No explicit destination: derive one from the URL.
                # todo mk it more random
                fileName = util.getUrlFileName(task.url)
                if fileName == '':
                    fileName = "unameFile"
                task.dest = fileName
            to = os.path.join(iPapa.iTsOutputPath, task.dest)
            data = self.fetcher.download(task.url, to, task.postdata,
                                         task.timeout, task.tryTimes, False)

        if data is None:  # download error
            task.status = 'failed'
            task.msg = 'download failed'
            myLogger.error('[wThread_%s]Fetcher data failed in task.id[%d]' %
                           (self.name, task.id))
        else:  # OK
            task['__data'] = data
            task.status = 'downed'

        self.outQueue.put(task)
Example #3
0
 def parse(self, task):
     """Turn a downloaded page into a follow-up audio download task.

     The page data in ``task['__data']`` is passed to ``self.parseContent``.
     When parsing reports ``'OK'`` and the result has a ``'contentMp3'`` URL,
     one ``'media'`` task for ``AudioHandler`` is created; otherwise
     ``task.status`` is set to ``'failed'``.

     Args:
         task: the downloaded page task; ``task['__data']``, ``task['key']``
             and ``task.url`` are read.

     Returns:
         ``{'newTasks': [audioTask]}`` on success, ``{}`` otherwise.
     """
     ret, status = self.parseContent(task['__data'])
     if status != 'OK' or 'contentMp3' not in ret:
         task.status = 'failed'
         return {}

     key = task['key']
     url = ret['contentMp3']
     # The audio file is stored under the item's key directory.
     dest = os.path.join(key, util.getUrlFileName(url))
     newT = Task(-1,
                 url=url,
                 handler='AudioHandler',
                 taskType='media',
                 ref=task.url,
                 dest=dest)
     newT['key'] = key
     # Upper-cased extension exactly as os.path.splitext returns it
     # (leading dot included).
     newT['audioType'] = os.path.splitext(dest)[1].upper()
     return {'newTasks': [newT]}
Example #4
0
def check(dirName):
    """Check every ``_content_*`` leaf directory below ``outputPath/dirName``.

    Args:
        dirName: directory, relative to the module-level ``outputPath``, whose
            tree is walked.

    Returns:
        A ``(report, okKeys)`` tuple. ``report`` maps a key (the directory's
        base name) to its list of problem messages, or to a single
        ``"ignored:..."`` entry when ``meta.json`` flags the item as ignored.
        ``okKeys`` lists the keys that are complete or ignored.
    """
    report = {}
    okKeys = []
    for f in os.walk(os.path.join(outputPath, dirName)):
        dirPath, subDirs, fileNames = f
        key = os.path.basename(dirPath)
        # Only leaf directories named "_content_*" are key directories.
        if not (key.startswith('_content_') and subDirs == []):
            continue
        # Parenthesised single argument prints the same on Python 2 and 3.
        print(f)

        error = []
        meta = None
        if 'meta.json' not in fileNames:
            error.append("no meta file !")
        else:
            # Load the meta file once and reuse it for the embPics check;
            # never try to load it when the listing says it is absent.
            meta = util.loadJsonFile(os.path.join(dirPath, 'meta.json'))
            # NOTE(review): loadJsonFile presumably returns a falsy value on
            # failure - hence the ``meta and`` guard; confirm.
            if meta and 'isIgnore' in meta and meta['isIgnore'] == True:
                report[key] = ["ignored:%s" % meta['ignoreMsg']]
                okKeys.append(key)
                continue

        if 'content.html' not in fileNames:
            error.append("no content file !")
        if 'content.mp3' not in fileNames:
            error.append("no mp3 file !")
        if ('__zipPic__.jpg' not in fileNames
                and '__zipPic__.png' not in fileNames):
            error.append("no zipPic file !")

        if meta:
            # Every embedded picture listed in the meta file must be on disk.
            for picName in meta['embPics']:
                picFile = util.getUrlFileName(picName)
                if picFile not in fileNames:
                    error.append("no embPic %s !" % picFile)

        if error:
            report[key] = error
        else:
            okKeys.append(key)

    return (report, okKeys)
Example #5
0
def check(dirName):
    """Check every ``_content_*`` leaf directory below ``outputPath/dirName``.

    Args:
        dirName: directory, relative to the module-level ``outputPath``, whose
            tree is walked.

    Returns:
        A ``(report, okKeys)`` tuple. ``report`` maps a key (the directory's
        base name) to its list of problem messages, or to a single
        ``"ignored:..."`` entry when ``meta.json`` flags the item as ignored.
        ``okKeys`` lists the keys that are complete or ignored.
    """
    report = {}
    okKeys = []
    for f in os.walk(os.path.join(outputPath, dirName)):
        dirPath, subDirs, fileNames = f
        key = os.path.basename(dirPath)
        # Only leaf directories named "_content_*" are key directories.
        if not (key.startswith('_content_') and subDirs == []):
            continue
        # Parenthesised single argument prints the same on Python 2 and 3.
        print(f)

        error = []
        meta = None
        if 'meta.json' not in fileNames:
            error.append("no meta file !")
        else:
            # Load the meta file once and reuse it for the embPics check;
            # never try to load it when the listing says it is absent.
            meta = util.loadJsonFile(os.path.join(dirPath, 'meta.json'))
            # NOTE(review): loadJsonFile presumably returns a falsy value on
            # failure - hence the ``meta and`` guard; confirm.
            if meta and 'isIgnore' in meta and meta['isIgnore'] == True:
                report[key] = ["ignored:%s" % meta['ignoreMsg']]
                okKeys.append(key)
                continue

        if 'content.html' not in fileNames:
            error.append("no content file !")
        if 'content.mp3' not in fileNames:
            error.append("no mp3 file !")
        if ('__zipPic__.jpg' not in fileNames
                and '__zipPic__.png' not in fileNames):
            error.append("no zipPic file !")

        if meta:
            # Every embedded picture listed in the meta file must be on disk.
            for picName in meta['embPics']:
                picFile = util.getUrlFileName(picName)
                if picFile not in fileNames:
                    error.append("no embPic %s !" % picFile)

        if error:
            report[key] = error
        else:
            okKeys.append(key)

    return (report, okKeys)
 def parse(self, task):
     """Turn a downloaded page into a follow-up audio download task.

     The page data in ``task['__data']`` is passed to ``self.parseContent``.
     When parsing reports ``'OK'`` and the result has a ``'contentMp3'`` URL,
     one ``'media'`` task for ``AudioHandler`` is created; otherwise
     ``task.status`` is set to ``'failed'``.

     Args:
         task: the downloaded page task; ``task['__data']``, ``task['key']``
             and ``task.url`` are read.

     Returns:
         ``{'newTasks': [audioTask]}`` on success, ``{}`` otherwise.
     """
     ret, status = self.parseContent(task['__data'])
     if status != 'OK' or 'contentMp3' not in ret:
         task.status = 'failed'
         return {}

     key = task['key']
     url = ret['contentMp3']
     # The audio file is stored under the item's key directory.
     dest = os.path.join(key, util.getUrlFileName(url))
     newT = Task(-1, url=url, handler='AudioHandler', taskType='media',
                 ref=task.url, dest=dest)
     newT['key'] = key
     # Upper-cased extension exactly as os.path.splitext returns it
     # (leading dot included).
     newT['audioType'] = os.path.splitext(dest)[1].upper()
     return {'newTasks': [newT]}
Example #7
0
    def doOneTask(self, task):
        """Fetch the resource for ``task`` and put the task on ``self.outQueue``.

        ``'page'`` tasks are fetched with ``self.fetcher.keepFetchRead``;
        ``'media'`` tasks are downloaded with ``self.fetcher.download`` to a
        path below ``iPapa.iTsOutputPath``. A ``None`` result marks the task
        ``'failed'``; any other result is stored in ``task['__data']`` and the
        task is marked ``'downed'``. The task is always queued afterwards.

        Args:
            task: task object exposing ``taskType``, ``url``, ``postdata``,
                ``timeout``, ``tryTimes``, ``id`` and ``dest`` attributes and
                supporting item assignment.
        """
        self.signTask(task)
        task.status = 'down'

        # Stays None for an unrecognised taskType, so such a task is reported
        # as failed and still queued instead of raising UnboundLocalError.
        data = None
        if task.taskType == 'page':
            data = self.fetcher.keepFetchRead(task.url, task.postdata,
                                              task.timeout, task.tryTimes,
                                              False)
        elif task.taskType == 'media':
            if task.dest == '':
                # No explicit destination: derive one from the URL.
                # todo mk it more random
                fileName = util.getUrlFileName(task.url)
                if fileName == '':
                    fileName = "unameFile"
                task.dest = fileName
            to = os.path.join(iPapa.iTsOutputPath, task.dest)
            data = self.fetcher.download(task.url, to, task.postdata,
                                         task.timeout, task.tryTimes, False)

        if data is None:  # download error
            task.status = 'failed'
            task.msg = 'download failed'
            myLogger.error('[wThread_%s]Fetcher data failed in task.id[%d]' %
                           (self.name, task.id))
        else:  # OK
            task['__data'] = data
            task.status = 'downed'

        self.outQueue.put(task)
    def parse(self, task):
        """Parse a downloaded content page, store it, and spawn media tasks.

        On a successful ``self.parseContent`` the method writes
        ``content.html`` and ``meta.json`` into the item's directory below
        ``iPapa.iTsOutputPath`` and creates download tasks for the first
        content picture, the audio file and every embedded picture.

        ``task.status`` is set to ``'failed'`` when parsing fails, when no
        audio reference is found, or when the meta file cannot be written,
        and to ``'ignore'`` when only a ``'contentMp3Page'`` reference exists.

        Args:
            task: the downloaded page task; ``task['__data']``,
                ``task['key']`` and ``task.url`` are read.

        Returns:
            ``{'newTasks': [...]}`` when follow-up tasks were created and the
            task is not ignored, ``{}`` otherwise.
        """
        # Single pre-formatted argument prints the same line on Python 2 and 3.
        print("ContentPageHandler parse %s %s" % (task.url, task['key']))
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        key = task['key']
        keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)

        def newMediaTask(url, handler):
            # Build a media download task stored under this item's key dir.
            dest = os.path.join(key, util.getUrlFileName(url))
            newT = Task(-1, url=url, handler=handler, taskType='media',
                        ref=task.url, dest=dest)
            newT['key'] = key
            return newT, dest

        newTasks = []
        meta = {
            'siteTitle': ret['siteTitle'],
            'title': ret['title'],
            'url': task.url,
            'date': ret['date'],
            'contentPics': ret['contentPics'],
            'contentPicCaptions': ret['contentPicCaptions'],
            'embPics': ret['embPics'],
        }

        # Content picture: only the first one is downloaded, others are
        # ignored for now.
        if ret['contentPics'] and ret['contentPicCaptions']:
            picTask, _ = newMediaTask(ret['contentPics'][0], 'PicHandler')
            picTask['picType'] = 'contentPic'
            newTasks.append(picTask)

        # The page content itself is stored right away.
        util.writeFile(os.path.join(keyOutputPath, 'content.html'),
                       ret['content'])

        if 'contentMp3' in ret:
            audioTask, audioDest = newMediaTask(ret['contentMp3'],
                                                'AudioHandler')
            # Upper-cased extension exactly as os.path.splitext returns it
            # (leading dot included).
            audioTask['audioType'] = os.path.splitext(audioDest)[1].upper()
            newTasks.append(audioTask)
        elif 'contentMp3Page' in ret:
            # A separate mp3 page always comes with a big file, so the item
            # is flagged as ignored instead of following that page.
            task.status = 'ignore'
            meta['isIgnore'] = True
            meta['ignoreMsg'] = "Audio file is too big, we should ignore this now."
            task.msg = 'Audio file is too big, we should ignore this now.'
        else:
            task.status = 'failed'
            task.msg = 'failed in Findding a Audio'

        # Embedded pictures: every one of them gets its own download task.
        for embPic in ret['embPics']:
            embTask, _ = newMediaTask(embPic, 'PicHandler')
            embTask['picType'] = 'embPic'
            newTasks.append(embTask)

        # Store the meta file; anything but an explicit True is a failure.
        metaLoc = os.path.join(keyOutputPath, 'meta.json')
        if util.dump2JsonFile(meta, metaLoc) != True:
            task.status = 'failed'

        if task.status == 'ignore' or not newTasks:
            return {}
        return {'newTasks': newTasks}
Example #9
0
    def parse(self, task):
        """Parse a downloaded content page, store it, and spawn media tasks.

        On a successful ``self.parseContent`` the method writes
        ``content.html`` and ``meta.json`` into the item's directory below
        ``iPapa.iTsOutputPath`` and creates download tasks for the first
        content picture, the audio file and every embedded picture.

        ``task.status`` is set to ``'failed'`` when parsing fails, when no
        audio reference is found, or when the meta file cannot be written,
        and to ``'ignore'`` when only a ``'contentMp3Page'`` reference exists.

        Args:
            task: the downloaded page task; ``task['__data']``,
                ``task['key']`` and ``task.url`` are read.

        Returns:
            ``{'newTasks': [...]}`` when follow-up tasks were created and the
            task is not ignored, ``{}`` otherwise.
        """
        # Single pre-formatted argument prints the same line on Python 2 and 3.
        print("ContentPageHandler parse %s %s" % (task.url, task['key']))
        ret, status = self.parseContent(task['__data'])
        if status != 'OK':
            task.status = 'failed'
            return {}

        key = task['key']
        keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)

        def newMediaTask(url, handler):
            # Build a media download task stored under this item's key dir.
            dest = os.path.join(key, util.getUrlFileName(url))
            newT = Task(-1,
                        url=url,
                        handler=handler,
                        taskType='media',
                        ref=task.url,
                        dest=dest)
            newT['key'] = key
            return newT, dest

        newTasks = []
        meta = {
            'siteTitle': ret['siteTitle'],
            'title': ret['title'],
            'url': task.url,
            'date': ret['date'],
            'contentPics': ret['contentPics'],
            'contentPicCaptions': ret['contentPicCaptions'],
            'embPics': ret['embPics'],
        }

        # Content picture: only the first one is downloaded, others are
        # ignored for now.
        if ret['contentPics'] and ret['contentPicCaptions']:
            picTask, _ = newMediaTask(ret['contentPics'][0], 'PicHandler')
            picTask['picType'] = 'contentPic'
            newTasks.append(picTask)

        # The page content itself is stored right away.
        util.writeFile(os.path.join(keyOutputPath, 'content.html'),
                       ret['content'])

        if 'contentMp3' in ret:
            audioTask, audioDest = newMediaTask(ret['contentMp3'],
                                                'AudioHandler')
            # Upper-cased extension exactly as os.path.splitext returns it
            # (leading dot included).
            audioTask['audioType'] = os.path.splitext(audioDest)[1].upper()
            newTasks.append(audioTask)
        elif 'contentMp3Page' in ret:
            # A separate mp3 page always comes with a big file, so the item
            # is flagged as ignored instead of following that page.
            task.status = 'ignore'
            meta['isIgnore'] = True
            meta['ignoreMsg'] = "Audio file is too big, we should ignore this now."
            task.msg = 'Audio file is too big, we should ignore this now.'
        else:
            task.status = 'failed'
            task.msg = 'failed in Findding a Audio'

        # Embedded pictures: every one of them gets its own download task.
        for embPic in ret['embPics']:
            embTask, _ = newMediaTask(embPic, 'PicHandler')
            embTask['picType'] = 'embPic'
            newTasks.append(embTask)

        # Store the meta file; anything but an explicit True is a failure.
        metaLoc = os.path.join(keyOutputPath, 'meta.json')
        if util.dump2JsonFile(meta, metaLoc) != True:
            task.status = 'failed'

        if task.status == 'ignore' or not newTasks:
            return {}
        return {'newTasks': newTasks}