def checkKeyDir(f):
    """Check that one content directory holds every file it should.

    Args:
        f: A ``(dirPath, dirNames, fileNames)`` tuple, presumably one item
            yielded by ``os.walk``. Only ``f[0]`` (the directory path) and
            ``f[2]`` (the names of the files in it) are read.

    Returns:
        A ``(status, error)`` tuple. ``status`` is ``'IGNORE'`` when the
        meta file flags the directory as ignored, ``'ERROR'`` when at least
        one expected file is missing, and ``'OK'`` otherwise. ``error`` is
        the list of messages naming what is missing; it is empty unless
        ``status`` is ``'ERROR'``.
    """
    dirPath, fileNames = f[0], f[2]
    error = []
    meta = None

    if 'meta.json' not in fileNames:
        error.append("no meta file !")
    else:
        # Load the meta file exactly once; it is reused for the embPic check
        # below. Nothing is loaded when the file is known to be absent.
        meta = util.loadJsonFile(os.path.join(dirPath, 'meta.json'))
        # `meta` may be falsy (the embPic check below already allowed for
        # that), so guard before looking inside it. `== True` is kept so a
        # JSON value of 1 still counts as ignored.
        if meta and meta.get('isIgnore') == True:
            return 'IGNORE', []

    if 'content.html' not in fileNames:
        error.append("no content file !")
    if 'content.mp3' not in fileNames:
        error.append("no mp3 file !")
    if '__zipPic__.jpg' not in fileNames and '__zipPic__.png' not in fileNames:
        error.append("no zipPic file !")

    if meta:
        # Every embedded picture listed in the meta file must be on disk.
        for picName in meta.get('embPics', []):
            picFile = util.getUrlFileName(picName)
            if picFile not in fileNames:
                error.append("no embPic %s !" % picFile)

    status = 'ERROR' if error else 'OK'
    return status, error
def doOneTask(self, task):
    """Fetch or download one task and pass it on to the output queue.

    The task is first handed to ``self.signTask`` and marked ``'down'``.
    A ``'page'`` task is read through ``self.fetcher.keepFetchRead``; a
    ``'media'`` task is downloaded through ``self.fetcher.download`` into
    ``iPapa.iTsOutputPath``, using ``task.dest`` as the relative path (a
    name derived from the URL is stored in ``task.dest`` when it is empty).

    On success the fetched data is stored in ``task['__data']`` and the
    status becomes ``'downed'``. When the fetcher returns ``None`` the
    status becomes ``'failed'`` and the failure is logged. In both cases
    the task is put on ``self.outQueue``.

    Args:
        task: The task to process. Its ``taskType``, ``url``, ``postdata``,
            ``timeout``, ``tryTimes``, ``dest`` and ``id`` attributes are
            read; ``status``, ``msg``, ``dest`` and ``task['__data']`` may
            be written.
    """
    self.signTask(task)
    task.status = 'down'

    # Stays None for an unrecognised taskType, so such a task is reported
    # as failed and still queued instead of raising UnboundLocalError.
    data = None
    if task.taskType == 'page':
        data = self.fetcher.keepFetchRead(task.url, task.postdata,
                                          task.timeout, task.tryTimes, False)
    elif task.taskType == 'media':
        if task.dest == '':
            # No destination given: derive a file name from the URL.
            # todo mk it more random
            fileName = util.getUrlFileName(task.url)
            if fileName == '':
                fileName = "unameFile"
            task.dest = fileName
        target = os.path.join(iPapa.iTsOutputPath, task.dest)
        data = self.fetcher.download(task.url, target, task.postdata,
                                     task.timeout, task.tryTimes, False)

    if data is None:
        # Download error: flag the task so the consumer can treat it as such.
        task.status = 'failed'
        task.msg = 'download failed'
        myLogger.error('[wThread_%s]Fetcher data failed in task.id[%d]' % (self.name, task.id))
    else:
        task['__data'] = data
        task.status = 'downed'
    self.outQueue.put(task)
def parse(self, task):
    """Turn a fetched page into a download task for its audio file.

    ``task['__data']`` is handed to ``self.parseContent``. When parsing
    succeeds and the result has a ``'contentMp3'`` URL, one ``AudioHandler``
    media task is created for it, stored under the page's key.

    Args:
        task: The page task. ``task['__data']``, ``task['key']`` and
            ``task.url`` are read; ``task.status`` is set to ``'failed'``
            when parsing fails or no audio URL is present.

    Returns:
        ``{'newTasks': [audioTask]}`` when an audio task was created,
        otherwise ``{}``.
    """
    parsed, status = self.parseContent(task['__data'])
    if status != 'OK':
        task.status = 'failed'
        return {}

    key = task['key']
    # Not used further down; evaluated as in the original so that the
    # lookup of iPapa.iTsOutputPath and the join still happen here.
    keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)

    if 'contentMp3' not in parsed:
        task.status = 'failed'
        return {}

    audioUrl = parsed['contentMp3']
    dest = os.path.join(key, util.getUrlFileName(audioUrl))
    audioTask = Task(-1, url=audioUrl, handler='AudioHandler',
                     taskType='media', ref=task.url, dest=dest)
    audioTask['key'] = task['key']
    # The upper-cased file extension (dot included) names the audio type.
    audioTask['audioType'] = os.path.splitext(dest)[1].upper()
    return {'newTasks': [audioTask]}
def check(dirName):
    """Check every content directory below ``outputPath/dirName``.

    A content directory is a directory whose name starts with
    ``_content_`` and which has no sub-directories. Each one must hold
    ``meta.json``, ``content.html``, ``content.mp3``, a ``__zipPic__``
    image (``.jpg`` or ``.png``) and every picture listed under
    ``embPics`` in its meta file. Directories whose meta file sets
    ``isIgnore`` are not checked any further.

    Args:
        dirName: Directory to scan, relative to the module-level
            ``outputPath``.

    Returns:
        A ``(report, okKeys)`` tuple. ``report`` maps a directory name to
        its list of messages: the missing files, or a single
        ``"ignored:..."`` entry. ``okKeys`` lists the directory names that
        are complete or ignored (ignored directories appear in both).
    """
    report = {}
    okKeys = []
    for f in os.walk(os.path.join(outputPath, dirName)):
        dirPath, subDirs, fileNames = f
        key = os.path.basename(dirPath)
        if not key.startswith('_content_') or subDirs != []:
            continue
        # Parenthesised so it parses on Python 2 and 3; prints the same tuple.
        print(f)

        error = []
        meta = None
        if 'meta.json' not in fileNames:
            error.append("no meta file !")
        else:
            # Load the meta file exactly once; it is reused for the embPic
            # check below. Nothing is loaded when the file is absent.
            meta = util.loadJsonFile(os.path.join(dirPath, 'meta.json'))
            # `meta` may be falsy (the embPic check already allowed for
            # that), so guard before looking inside it. `== True` is kept
            # so a JSON value of 1 still counts as ignored.
            if meta and meta.get('isIgnore') == True:
                report[key] = ["ignored:%s" % meta.get('ignoreMsg', '')]
                okKeys.append(key)
                continue

        if 'content.html' not in fileNames:
            error.append("no content file !")
        if 'content.mp3' not in fileNames:
            error.append("no mp3 file !")
        if '__zipPic__.jpg' not in fileNames and '__zipPic__.png' not in fileNames:
            error.append("no zipPic file !")

        if meta:
            # Every embedded picture listed in the meta file must be on disk.
            for picName in meta.get('embPics', []):
                picFile = util.getUrlFileName(picName)
                if picFile not in fileNames:
                    error.append("no embPic %s !" % picFile)

        if error:
            report[key] = error
        else:
            okKeys.append(key)
    return (report, okKeys)
def check(dirName):
    """Check every content directory below ``outputPath/dirName``.

    A content directory is a directory whose name starts with
    ``_content_`` and which has no sub-directories. Each one must hold
    ``meta.json``, ``content.html``, ``content.mp3``, a ``__zipPic__``
    image (``.jpg`` or ``.png``) and every picture listed under
    ``embPics`` in its meta file. Directories whose meta file sets
    ``isIgnore`` are not checked any further.

    Args:
        dirName: Directory to scan, relative to the module-level
            ``outputPath``.

    Returns:
        A ``(report, okKeys)`` tuple. ``report`` maps a directory name to
        its list of messages: the missing files, or a single
        ``"ignored:..."`` entry. ``okKeys`` lists the directory names that
        are complete or ignored (ignored directories appear in both).
    """
    report = {}
    okKeys = []
    for f in os.walk(os.path.join(outputPath, dirName)):
        dirPath, subDirs, fileNames = f
        key = os.path.basename(dirPath)
        if not key.startswith('_content_') or subDirs != []:
            continue
        # Parenthesised so it parses on Python 2 and 3; prints the same tuple.
        print(f)

        error = []
        meta = None
        if 'meta.json' not in fileNames:
            error.append("no meta file !")
        else:
            # Load the meta file exactly once; it is reused for the embPic
            # check below. Nothing is loaded when the file is absent.
            meta = util.loadJsonFile(os.path.join(dirPath, 'meta.json'))
            # `meta` may be falsy (the embPic check already allowed for
            # that), so guard before looking inside it. `== True` is kept
            # so a JSON value of 1 still counts as ignored.
            if meta and meta.get('isIgnore') == True:
                report[key] = ["ignored:%s" % meta.get('ignoreMsg', '')]
                okKeys.append(key)
                continue

        if 'content.html' not in fileNames:
            error.append("no content file !")
        if 'content.mp3' not in fileNames:
            error.append("no mp3 file !")
        if '__zipPic__.jpg' not in fileNames and '__zipPic__.png' not in fileNames:
            error.append("no zipPic file !")

        if meta:
            # Every embedded picture listed in the meta file must be on disk.
            for picName in meta.get('embPics', []):
                picFile = util.getUrlFileName(picName)
                if picFile not in fileNames:
                    error.append("no embPic %s !" % picFile)

        if error:
            report[key] = error
        else:
            okKeys.append(key)
    return (report, okKeys)
def parse(self, task):
    """Parse a fetched content page, store it, and queue its media downloads.

    ``task['__data']`` is handed to ``self.parseContent``. On success the
    page content is written to ``content.html`` and the collected metadata
    to ``meta.json`` inside ``iPapa.iTsOutputPath/<key>``, and media tasks
    are created for the first content picture, the audio file and every
    embedded picture.

    Args:
        task: The page task. ``task['__data']``, ``task['key']`` and
            ``task.url`` are read. ``task.status`` becomes ``'failed'``
            when parsing fails, no audio is found, or the meta file cannot
            be written, and ``'ignore'`` when the page only links to a
            separate audio page; ``task.msg`` is set in the audio cases.

    Returns:
        ``{'newTasks': [...]}`` when tasks were created and the final
        status is not ``'ignore'``; otherwise ``{}``.
    """
    # Single parenthesised argument: parses on Python 2 and 3 and prints the
    # same space-separated line as the original print statement.
    print("%s %s %s" % ("ContentPageHandler parse", task.url, task['key']))

    ret, status = self.parseContent(task['__data'])
    if status != 'OK':
        task.status = 'failed'
        return {}

    newTasks = []
    key = task['key']
    keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)

    def destFor(url):
        # Relative destination: the URL's file name inside the key directory.
        return os.path.join(key, util.getUrlFileName(url))

    def addMediaTask(url, dest, handler, tagName, tagValue):
        # Queue one media download that belongs to this page.
        newT = Task(-1, url=url, handler=handler, taskType='media',
                    ref=task.url, dest=dest)
        newT['key'] = task['key']
        newT[tagName] = tagValue
        newTasks.append(newT)

    # Metadata recorded for this page.
    meta = {
        'siteTitle': ret['siteTitle'],
        'title': ret['title'],
        'url': task.url,
        'date': ret['date'],
        'contentPics': ret['contentPics'],
        'contentPicCaptions': ret['contentPicCaptions'],
        'embPics': ret['embPics'],
    }

    # Only the first content picture is downloaded; the others are ignored.
    if len(ret['contentPics']) > 0 and len(ret['contentPicCaptions']) > 0:
        firstPic = ret['contentPics'][0]
        addMediaTask(firstPic, destFor(firstPic), 'PicHandler',
                     'picType', 'contentPic')

    # The page content itself is stored right away.
    util.writeFile(os.path.join(keyOutputPath, 'content.html'), ret['content'])

    if 'contentMp3' in ret:
        audioUrl = ret['contentMp3']
        audioDest = destFor(audioUrl)
        addMediaTask(audioUrl, audioDest, 'AudioHandler',
                     'audioType', os.path.splitext(audioDest)[1].upper())
    elif 'contentMp3Page' in ret:
        # A separate audio page is not followed: per the original author's
        # note it always carries a big file, so the page is marked ignored.
        task.status = 'ignore'
        meta['isIgnore'] = True
        meta['ignoreMsg'] = "Audio file is too big, we should ignore this now."
        task.msg = 'Audio file is too big, we should ignore this now.'
    else:
        task.status = 'failed'
        task.msg = 'failed in Findding a Audio'

    # Every embedded picture gets its own download task.
    for embPic in ret['embPics']:
        addMediaTask(embPic, destFor(embPic), 'PicHandler', 'picType', 'embPic')

    # Store the meta file; a failed write marks the whole task as failed.
    metaLoc = os.path.join(keyOutputPath, 'meta.json')
    if util.dump2JsonFile(meta, metaLoc) != True:
        task.status = 'failed'

    if task.status == 'ignore' or newTasks == []:
        return {}
    return {'newTasks': newTasks}
def parse(self, task):
    """Parse a fetched content page, store it, and queue its media downloads.

    ``task['__data']`` is handed to ``self.parseContent``. On success the
    page content is written to ``content.html`` and the collected metadata
    to ``meta.json`` inside ``iPapa.iTsOutputPath/<key>``, and media tasks
    are created for the first content picture, the audio file and every
    embedded picture.

    Args:
        task: The page task. ``task['__data']``, ``task['key']`` and
            ``task.url`` are read. ``task.status`` becomes ``'failed'``
            when parsing fails, no audio is found, or the meta file cannot
            be written, and ``'ignore'`` when the page only links to a
            separate audio page; ``task.msg`` is set in the audio cases.

    Returns:
        ``{'newTasks': [...]}`` when tasks were created and the final
        status is not ``'ignore'``; otherwise ``{}``.
    """
    # Single parenthesised argument: parses on Python 2 and 3 and prints the
    # same space-separated line as the original print statement.
    print("%s %s %s" % ("ContentPageHandler parse", task.url, task['key']))

    ret, status = self.parseContent(task['__data'])
    if status != 'OK':
        task.status = 'failed'
        return {}

    newTasks = []
    key = task['key']
    keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)

    def destFor(url):
        # Relative destination: the URL's file name inside the key directory.
        return os.path.join(key, util.getUrlFileName(url))

    def addMediaTask(url, dest, handler, tagName, tagValue):
        # Queue one media download that belongs to this page.
        newT = Task(-1, url=url, handler=handler, taskType='media',
                    ref=task.url, dest=dest)
        newT['key'] = task['key']
        newT[tagName] = tagValue
        newTasks.append(newT)

    # Metadata recorded for this page.
    meta = {
        'siteTitle': ret['siteTitle'],
        'title': ret['title'],
        'url': task.url,
        'date': ret['date'],
        'contentPics': ret['contentPics'],
        'contentPicCaptions': ret['contentPicCaptions'],
        'embPics': ret['embPics'],
    }

    # Only the first content picture is downloaded; the others are ignored.
    if len(ret['contentPics']) > 0 and len(ret['contentPicCaptions']) > 0:
        firstPic = ret['contentPics'][0]
        addMediaTask(firstPic, destFor(firstPic), 'PicHandler',
                     'picType', 'contentPic')

    # The page content itself is stored right away.
    util.writeFile(os.path.join(keyOutputPath, 'content.html'), ret['content'])

    if 'contentMp3' in ret:
        audioUrl = ret['contentMp3']
        audioDest = destFor(audioUrl)
        addMediaTask(audioUrl, audioDest, 'AudioHandler',
                     'audioType', os.path.splitext(audioDest)[1].upper())
    elif 'contentMp3Page' in ret:
        # A separate audio page is not followed: per the original author's
        # note it always carries a big file, so the page is marked ignored.
        task.status = 'ignore'
        meta['isIgnore'] = True
        meta['ignoreMsg'] = "Audio file is too big, we should ignore this now."
        task.msg = 'Audio file is too big, we should ignore this now.'
    else:
        task.status = 'failed'
        task.msg = 'failed in Findding a Audio'

    # Every embedded picture gets its own download task.
    for embPic in ret['embPics']:
        addMediaTask(embPic, destFor(embPic), 'PicHandler', 'picType', 'embPic')

    # Store the meta file; a failed write marks the whole task as failed.
    metaLoc = os.path.join(keyOutputPath, 'meta.json')
    if util.dump2JsonFile(meta, metaLoc) != True:
        task.status = 'failed'

    if task.status == 'ignore' or newTasks == []:
        return {}
    return {'newTasks': newTasks}