def parse(self, task):
    """Parse one SH stock-list page.

    Absolutizes each stock entry's href, spawns one ShStockBasicInfoHandler
    task per stock, dumps the page's (rewritten) list to JSON and txt files,
    and chains an InitShHandler task for the next list page when the parser
    reported one.

    Returns {'newTasks': [...]} when follow-up tasks were created, else {}.
    Sets task.status = 'failed' on parse or dump errors.
    """
    newTasks = []
    ret, status = self.parseContent(task['__data'])
    if status == 'OK':
        # Hrefs are page-relative; resolve them against this page's URL
        # before queueing the per-stock detail tasks. Iterate the dicts
        # directly (mutating in place) instead of range(len(...)).
        for stock in ret['stockList']:
            stock['href'] = urlparse.urljoin(task.url, stock['href'])
            newT = Task(-1, url=stock['href'],
                        handler='ShStockBasicInfoHandler', ref=task.url)
            newTasks.append(newT)
        # Dump this page's (now absolutized) list in both formats.
        page = task['key']
        keyOutputPath = iPapa.iTsOutputPath
        outputJsonLoc = os.path.join(keyOutputPath,
                                     'sh_stock_list_page_%d.json' % page)
        outputTxtLoc = os.path.join(keyOutputPath,
                                    'sh_stock_list_page_%d.txt' % page)
        if util.dump2JsonFile(ret['stockList'], outputJsonLoc) != True:
            task.status = 'failed'
        if util.dumpDictList2TxtFile(ret['stockList'], outputTxtLoc) != True:
            task.status = 'failed'
        # Chain the next list page when one is advertised; its key is the
        # next page number.
        if ret['nextPage'] != '':
            newT = Task(-1, url=urlparse.urljoin(task.url, ret['nextPage']),
                        handler='InitShHandler', ref=task.url)
            newT['key'] = page + 1
            newTasks.append(newT)
    else:
        task.status = 'failed'
    if newTasks != []:
        return {'newTasks': newTasks}
    return {}
def parse(self, task):
    """Handle the HK GEM (Chinese) stock-list page: dump the parsed list to
    JSON and txt files, then queue one HkStockInfoCnPageHandler task for
    every per-stock info page not already present in self.historyKeys.

    Returns {'newTasks': [...]} if any tasks were queued, otherwise {}.
    """
    spawned = []
    ret, status = self.parseContent(task['__data'])
    if status != 'OK':
        task.status = 'failed'
        return {}
    outDir = iPapa.iTsOutputPath
    jsonPath = os.path.join(outDir, 'hk_gem_stock_list_cn.json')
    txtPath = os.path.join(outDir, 'hk_gem_stock_list_cn.txt')
    if util.dump2JsonFile(ret['stockList'], jsonPath) != True:
        task.status = 'failed'
    if util.dump2TxtFile(ret['stockList'], txtPath) != True:
        task.status = 'failed'
    # historyKeys lets a rerun skip stocks whose info pages were already
    # fetched previously.
    for code, pageUrl in ret['stockInfoPage'].items():
        if code in self.historyKeys:
            continue
        newT = Task(-1, url=pageUrl,
                    handler='HkStockInfoCnPageHandler', ref=task.url)
        newT['key'] = "hk_stock_info_" + code
        spawned.append(newT)
    if spawned:
        return {'newTasks': spawned}
    return {}
def dealWithParserOutput(self, task):
    """Finalize a parsed task: bump the matching counter, deregister it, log
    it, and archive a JSON record under iPapa.iTsOutputPath with a
    status-specific filename prefix (doneTask/ignoreTask/failedTask).

    Any status other than 'done'/'ignore' is treated as failure, which also
    clears self.returnVal so the whole run is reported as failed.
    """
    if task.status == "done":
        self.taskDoneCounter.inc()
        self._archiveTask(task, "Done", "doneTask")
    elif task.status == "ignore":
        self.taskIgnoreCounter.inc()
        self._archiveTask(task, "Ignored", "ignoreTask")
    else:
        # Failure path differs from done/ignore: the *whole* task (minus
        # its raw page data) is dumped for post-mortem inspection.
        self.returnVal = False
        self.taskFailedCounter.inc()
        self.rmTask(task)
        myLogger.error("task [%d] Failed [%s], dump it" % (task.id, task.exprMe()))
        task["__expr__"] = task.exprMe()
        if "__data" in task:
            del task["__data"]
        fileName = os.path.join(iPapa.iTsOutputPath,
                                "failedTask.%d.json" % task.id)
        util.dump2JsonFile(task, fileName)
    # todo: if we want to support the repeat argument
    # take care of the function flush

def _archiveTask(self, task, label, prefix):
    """Shared tail of the done/ignore branches: drop the task from the live
    set, log it with *label*, and write its printable form ('__expr__') to
    '<prefix>.<id>.json' in the output directory."""
    self.rmTask(task)
    myLogger.info("task [%d] %s [%s]" % (task.id, label, task.exprMe()))
    task["__expr__"] = task.exprMe()
    fileName = os.path.join(iPapa.iTsOutputPath,
                            "%s.%d.json" % (prefix, task.id))
    util.dump2JsonFile(task["__expr__"], fileName)
def dealWithParserOutput(self, task):
    """Post-parse bookkeeping: count, deregister and archive the task as a
    JSON file whose filename prefix reflects its final status; any status
    other than 'done'/'ignore' counts as failure for the run."""
    finalStatus = task.status
    if finalStatus == 'done':
        self.taskDoneCounter.inc()
        self.rmTask(task)
        myLogger.info("task [%d] Done [%s]" % (task.id, task.exprMe()))
        task['__expr__'] = task.exprMe()
        target = os.path.join(iPapa.iTsOutputPath,
                              "doneTask.%d.json" % task.id)
        util.dump2JsonFile(task['__expr__'], target)
    elif finalStatus == 'ignore':
        self.taskIgnoreCounter.inc()
        self.rmTask(task)
        myLogger.info("task [%d] Ignored [%s]" % (task.id, task.exprMe()))
        task['__expr__'] = task.exprMe()
        target = os.path.join(iPapa.iTsOutputPath,
                              "ignoreTask.%d.json" % task.id)
        util.dump2JsonFile(task['__expr__'], target)
    else:
        # Remember the run failed, then dump the whole task (without its
        # raw page payload) for later inspection.
        self.returnVal = False
        self.taskFailedCounter.inc()
        self.rmTask(task)
        myLogger.error("task [%d] Failed [%s], dump it" % (task.id, task.exprMe()))
        task['__expr__'] = task.exprMe()
        if '__data' in task:
            del task['__data']
        target = os.path.join(iPapa.iTsOutputPath,
                              "failedTask.%d.json" % task.id)
        util.dump2JsonFile(task, target)
    # todo: if we want to support the repeat argument
    # take care of the function flush
def parse(self, task):
    """Parse one SH per-stock basic-info page and dump the parsed record to
    'sh_stock_basic_info_<code>.json' / '.txt' under the output directory.

    Leaf handler: never spawns follow-up tasks, always returns {}.
    Sets task.status = 'failed' on parse or dump errors.
    """
    ret, status = self.parseContent(task['__data'])
    if status == 'OK':
        info = ret['stockBasicInfo']
        code = info['code']
        keyOutputPath = iPapa.iTsOutputPath
        outputJsonLoc = os.path.join(keyOutputPath,
                                     'sh_stock_basic_info_%s.json' % code)
        outputTxtLoc = os.path.join(keyOutputPath,
                                    'sh_stock_basic_info_%s.txt' % code)
        if util.dump2JsonFile(info, outputJsonLoc) != True:
            task.status = 'failed'
        if util.dumpDict2TxtFile(info, outputTxtLoc) != True:
            task.status = 'failed'
    else:
        task.status = 'failed'
    # The original kept an always-empty newTasks list and a dead
    # "newTasks != []" check; this handler never chains tasks.
    return {}
def parse(self, task):
    """SH stock-list page handler: rewrite each entry's href to an absolute
    URL, queue a basic-info task per stock, persist the page list to JSON
    and txt, and queue the next list page if the parser reported one."""
    queued = []
    ret, status = self.parseContent(task['__data'])
    if status == 'OK':
        stocks = ret['stockList']
        # Resolve relative hrefs in place; the dumped list below therefore
        # contains absolute URLs too.
        for entry in stocks:
            entry['href'] = urlparse.urljoin(task.url, entry['href'])
            queued.append(Task(-1, url=entry['href'],
                               handler='ShStockBasicInfoHandler',
                               ref=task.url))
        pageNo = task['key']
        base = iPapa.iTsOutputPath
        jsonLoc = os.path.join(base, 'sh_stock_list_page_%d.json' % pageNo)
        txtLoc = os.path.join(base, 'sh_stock_list_page_%d.txt' % pageNo)
        if util.dump2JsonFile(stocks, jsonLoc) != True:
            task.status = 'failed'
        if util.dumpDictList2TxtFile(stocks, txtLoc) != True:
            task.status = 'failed'
        # Chain the follow-up list page, numbered one past this one.
        if ret['nextPage'] != '':
            follower = Task(-1,
                            url=urlparse.urljoin(task.url, ret['nextPage']),
                            handler='InitShHandler', ref=task.url)
            follower['key'] = pageNo + 1
            queued.append(follower)
    else:
        task.status = 'failed'
    return {'newTasks': queued} if queued else {}
def parse(self, task): print "HkStockInfoCnPageHandler parse", task.url, task['key'] ret, status = self.parseContent(task['__data']) if status == 'OK': key = task['key'] keyOutputPath = iPapa.iTsOutputPath outputJsonLoc = os.path.join(keyOutputPath, 'hk_stock_info_cn_'+key+'.json') outputTxtLoc = os.path.join(keyOutputPath, 'hk_stock_info_cn_'+key+'.txt') if util.dump2JsonFile(ret, outputJsonLoc) != True: task.status = 'failed' if util.dump2TxtFile(zip(ret.keys(), ret.values()), outputTxtLoc) != True: task.status = 'failed' else: task.status = 'failed' if task.status == 'ignore': return {} return {}
def parse(self, task):
    """Persist the basic-info record parsed from an SH stock detail page to
    per-code JSON and txt files; flags the task as failed on any error."""
    pending = []
    ret, status = self.parseContent(task['__data'])
    if status != 'OK':
        task.status = 'failed'
    else:
        record = ret['stockBasicInfo']
        stockCode = record['code']
        outDir = iPapa.iTsOutputPath
        jsonFile = os.path.join(outDir,
                                'sh_stock_basic_info_%s.json' % stockCode)
        txtFile = os.path.join(outDir,
                               'sh_stock_basic_info_%s.txt' % stockCode)
        if util.dump2JsonFile(record, jsonFile) != True:
            task.status = 'failed'
        if util.dumpDict2TxtFile(record, txtFile) != True:
            task.status = 'failed'
    # No follow-up tasks are ever produced by this handler.
    if pending != []:
        return {'newTasks': pending}
    return {}
def parse(self, task): print "HkStockInfoCnPageHandler parse", task.url, task['key'] ret, status = self.parseContent(task['__data']) if status == 'OK': key = task['key'] keyOutputPath = iPapa.iTsOutputPath outputJsonLoc = os.path.join(keyOutputPath, 'hk_stock_info_cn_' + key + '.json') outputTxtLoc = os.path.join(keyOutputPath, 'hk_stock_info_cn_' + key + '.txt') if util.dump2JsonFile(ret, outputJsonLoc) != True: task.status = 'failed' if util.dump2TxtFile(zip(ret.keys(), ret.values()), outputTxtLoc) != True: task.status = 'failed' else: task.status = 'failed' if task.status == 'ignore': return {} return {}
def parse(self, task):
    """HK main-board (Chinese) list handler: save the parsed stock list and
    enqueue an info-page task for every stock not in self.historyKeys."""
    newWork = []
    ret, status = self.parseContent(task['__data'])
    if status == 'OK':
        base = iPapa.iTsOutputPath
        listJson = os.path.join(base, 'hk_main_stock_list_cn.json')
        listTxt = os.path.join(base, 'hk_main_stock_list_cn.txt')
        if util.dump2JsonFile(ret['stockList'], listJson) != True:
            task.status = 'failed'
        if util.dump2TxtFile(ret['stockList'], listTxt) != True:
            task.status = 'failed'
        infoPages = ret['stockInfoPage']
        # historyKeys lets a rerun skip stocks already crawled.
        for stockKey in [k for k in infoPages if k not in self.historyKeys]:
            t = Task(-1, url=infoPages[stockKey],
                     handler='HkStockInfoCnPageHandler', ref=task.url)
            t['key'] = "hk_stock_info_" + stockKey
            newWork.append(t)
    else:
        task.status = 'failed'
    return {'newTasks': newWork} if newWork else {}
def parse(self, task):
    """Parse a content (article) page: collect metadata, save the article
    HTML, and spawn media-download tasks for the lead picture, the audio
    file, and any embedded pictures.

    Returns {'newTasks': [...]} when media tasks were created, {} otherwise
    (including when the page is ignored because its audio sits behind a
    separate download page). Sets task.status to 'ignore' or 'failed' as
    appropriate.
    """
    print "ContentPageHandler parse", task.url, task['key']
    newTasks = []
    ret, status = self.parseContent(task['__data'])
    meta = {}
    if status == 'OK':
        key = task['key']
        # Every artifact of this article lives under <output>/<key>/.
        keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)
        # Page metadata gathered from the parse result.
        meta['siteTitle'] = ret['siteTitle']
        meta['title'] = ret['title']
        meta['url'] = task.url
        meta['date'] = ret['date']
        # Record picture URLs; the actual downloads happen via the media
        # tasks created below.
        meta['contentPics'] = ret['contentPics']
        meta['contentPicCaptions'] = ret['contentPicCaptions']
        meta['embPics'] = ret['embPics']
        # Lead picture: only the first captioned content picture is
        # fetched, the rest are deliberately skipped for now.
        if len(ret['contentPics']) and len(ret['contentPicCaptions']):
            picUrl = ret['contentPics'][0]
            dest = os.path.join(key, util.getUrlFileName(picUrl))
            newT = Task(-1, url=picUrl, handler='PicHandler',
                        taskType='media', ref=task.url, dest=dest)
            newT['key'] = task['key']
            newT['picType'] = 'contentPic'
            newTasks.append(newT)
        # The article body HTML is stored directly; no extra task needed.
        contentLoc = os.path.join(keyOutputPath, 'content.html')
        util.writeFile(contentLoc, ret['content'])
        # Audio handling: a direct mp3 link becomes a download task; an
        # mp3 reachable only through a separate page is ignored (the file
        # is too big); no audio at all marks the task failed.
        if 'contentMp3' in ret:
            url = ret['contentMp3']
            dest = os.path.join(key, util.getUrlFileName(url))
            newT = Task(-1, url=url, handler='AudioHandler',
                        taskType='media', ref=task.url, dest=dest)
            newT['key'] = task['key']
            # Audio format tag derived from the file extension.
            newT['audioType'] = os.path.splitext(dest)[1].upper()
            newTasks.append(newT)
        elif 'contentMp3Page' in ret:
            # Always comes with a big file, so we ignore it. Original
            # (disabled) follow-up task kept for reference:
            #url = ret['contentMp3Page']
            #newT = Task(-1, url=urlparse.urljoin(task.url, url), handler='ContentMp3PageHandler', ref=task.url,)
            #newT['key'] = task['key']
            #newTasks.append(newT)
            task.status = 'ignore'
            meta['isIgnore'] = True
            meta['ignoreMsg'] = "Audio file is too big, we should ignore this now."
            task.msg = 'Audio file is too big, we should ignore this now.'
        else:
            # No audio found on the page at all.
            task.status = 'failed'
            task.msg = 'failed in Findding a Audio'
        # Embedded pictures: one download task each.
        for embPic in ret['embPics']:
            picUrl = embPic
            dest = os.path.join(key, util.getUrlFileName(picUrl))
            newT = Task(-1, url=picUrl, handler='PicHandler',
                        taskType='media', ref=task.url, dest=dest)
            newT['key'] = task['key']
            newT['picType'] = 'embPic'
            newTasks.append(newT)
        # Persist the collected metadata next to the content.
        metaLoc = os.path.join(keyOutputPath, 'meta.json')
        if util.dump2JsonFile(meta, metaLoc) != True:
            task.status = 'failed'
    else:
        task.status = 'failed'
    # An ignored page spawns no tasks even if some were queued above.
    if task.status == 'ignore':
        return {}
    if newTasks != []:
        return {'newTasks': newTasks}
    return {}
def parse(self, task):
    """Content-page handler: extract article metadata, write the article
    HTML to disk, and create media-download tasks (lead picture, audio,
    embedded pictures).

    Returns {'newTasks': [...]} if any media tasks were queued, otherwise
    {}. May set task.status to 'ignore' (audio only behind a big separate
    page) or 'failed' (parse/dump error, or no audio found).
    """
    print "ContentPageHandler parse", task.url, task['key']
    newTasks = []
    ret, status = self.parseContent(task['__data'])
    meta = {}
    if status == 'OK':
        key = task['key']
        # Per-article output directory: <output>/<key>/.
        keyOutputPath = os.path.join(iPapa.iTsOutputPath, key)
        # Copy the parsed metadata fields into the meta record.
        meta['siteTitle'] = ret['siteTitle']
        meta['title'] = ret['title']
        meta['url'] = task.url
        meta['date'] = ret['date']
        # Picture URLs are recorded here; downloading is delegated to the
        # media tasks built below.
        meta['contentPics'] = ret['contentPics']
        meta['contentPicCaptions'] = ret['contentPicCaptions']
        meta['embPics'] = ret['embPics']
        # Queue a download for the first captioned content picture only;
        # any further pictures are intentionally skipped for now.
        if len(ret['contentPics']) and len(ret['contentPicCaptions']):
            picUrl = ret['contentPics'][0]
            dest = os.path.join(key, util.getUrlFileName(picUrl))
            newT = Task(-1, url=picUrl, handler='PicHandler',
                        taskType='media', ref=task.url, dest=dest)
            newT['key'] = task['key']
            newT['picType'] = 'contentPic'
            newTasks.append(newT)
        # Store the article body straight away.
        contentLoc = os.path.join(keyOutputPath, 'content.html')
        util.writeFile(contentLoc, ret['content'])
        # Audio: direct mp3 URL -> AudioHandler task; mp3 behind its own
        # page -> ignore (too-big file); neither -> failure.
        if 'contentMp3' in ret:
            url = ret['contentMp3']
            dest = os.path.join(key, util.getUrlFileName(url))
            newT = Task(-1, url=url, handler='AudioHandler',
                        taskType='media', ref=task.url, dest=dest)
            newT['key'] = task['key']
            # Uppercased extension serves as the audio format label.
            newT['audioType'] = os.path.splitext(dest)[1].upper()
            newTasks.append(newT)
        elif 'contentMp3Page' in ret:
            # Such pages always carry a big file, so we ignore them.
            # Disabled follow-up task left in place for reference:
            #url = ret['contentMp3Page']
            #newT = Task(-1, url=urlparse.urljoin(task.url, url), handler='ContentMp3PageHandler', ref=task.url,)
            #newT['key'] = task['key']
            #newTasks.append(newT)
            task.status = 'ignore'
            meta['isIgnore'] = True
            meta[
                'ignoreMsg'] = "Audio file is too big, we should ignore this now."
            task.msg = 'Audio file is too big, we should ignore this now.'
        else:
            # Page offered no audio at all.
            task.status = 'failed'
            task.msg = 'failed in Findding a Audio'
        # One PicHandler task per embedded picture.
        for embPic in ret['embPics']:
            picUrl = embPic
            dest = os.path.join(key, util.getUrlFileName(picUrl))
            newT = Task(-1, url=picUrl, handler='PicHandler',
                        taskType='media', ref=task.url, dest=dest)
            newT['key'] = task['key']
            newT['picType'] = 'embPic'
            newTasks.append(newT)
        # Write the meta record alongside the article content.
        metaLoc = os.path.join(keyOutputPath, 'meta.json')
        if util.dump2JsonFile(meta, metaLoc) != True:
            task.status = 'failed'
    else:
        task.status = 'failed'
    # Ignored pages return no new tasks regardless of what was queued.
    if task.status == 'ignore':
        return {}
    if newTasks != []:
        return {'newTasks': newTasks}
    return {}