import datetime
import math
import os

# Models (TempLz, Company, Website, TaskInfo, TaskResult) and helpers
# (parserCompanyAndWeb, buildCompany, buildWebsite, impCompanyInfo, impWebsite,
# parseXMLFile, executeSingleTaskInfo, executeMultiTaskInfo) are assumed to be
# imported from the project's own modules.


def parseWebsite():
    log = open('../logs/parseweb.log', 'a')
    # Fetch 5000 records per batch
    count = TempLz.select().count()
    pagesize = 5000
    pagecount = int(math.ceil(float(count) / float(pagesize)))
    for i in range(pagecount):
        datas = (TempLz.select()
                 .where(TempLz.id > 18879)
                 .order_by(TempLz.id)
                 .paginate(i + 1, pagesize))
        if datas is not None:
            for d in datas:
                data = d.lzpage
                if data is not None:
                    parseData = parserCompanyAndWeb(data)
                    com = buildCompany(parseData['company'])
                    web = buildWebsite(parseData['web'])
                    if com is not None and web is not None:
                        # Reuse the company record if it already exists,
                        # otherwise insert it first and look it up again
                        c = Company.getOne(Company.coname == com.coname)
                        if c is not None:
                            web.regID = c
                            impWebsite(web)
                        else:
                            impCompanyInfo(com)
                            tempCom = Company.getOne(Company.regID == com.regID)
                            web.regID = tempCom
                            impWebsite(web)
                # Log each processed record id so the run can be resumed
                log.write(str(d.id) + "\n")
                print(d.id)
                log.flush()
    log.close()
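
# --- Hypothetical sketch (not part of the pipeline) ---
# parseWebsite() walks TempLz in fixed-size pages via peewee's
# paginate(page, paginate_by), which is 1-indexed. Below is a minimal sketch
# of the same page-count arithmetic using integer ceiling division instead of
# float math; the helper name is illustrative only.
def _page_count(total, pagesize=5000):
    # Equivalent to int(math.ceil(float(total) / float(pagesize)))
    return (total + pagesize - 1) // pagesize

assert _page_count(0) == 0
assert _page_count(5000) == 1
assert _page_count(5001) == 2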
def judgeLzResult(com, web, shortUrl, subTask):
    # Update the company record (and, previously, the website record)
    impCompanyInfo(com)
    # impWebsite(web)
    # nWeb = Website.getOne(Website.domain == web.domain)
    # judgeWeb = Website.getOne(Website.domain ** str.format("%{}%", shortUrl))
    # Only proceed when the scraper returned a URL to compare against
    if shortUrl != '':
        shortUrl = shortUrl.replace('http://', '').replace(' ', '')
        print('shortUrl:', shortUrl)
        print('web.domain:', web.domain)
        com = Company.getOne(Company.coname == com.coname)
        # The queried URL does not match the URL scraped from the
        # licence-display (Lz) page, i.e. the site redirected
        if shortUrl != web.domain:
            # If a redirect record already exists, do not insert another one
            existsJumpWeb = Website.getOne(Website.domain == web.domain)
            if existsJumpWeb is None:
                dt = format(datetime.datetime.now(), '%Y-%m-%d %H:%M:%S')
                web.updateDate = dt
                web.regID = com
                web.save(force_insert=True)
                # Point the original website record at the redirect target
                q = Website.update(jump=web).where(Website.webId == subTask.webId.webId)
                q.execute()
            else:
                impWebsite(web)
                # Point the original website record at the existing redirect target
                q = Website.update(jump=existsJumpWeb).where(Website.webId == subTask.webId.webId)
                q.execute()
        else:
            # Update the website record in place
            impWebsite(web)
        # Classify the task outcome by which licence attributes matched
        count = Website.select().where(
            (Website.licID != '') &
            (Website.regID == com) &
            (Website.domain == shortUrl)).count()
        if count == 0:
            onlyCname = Website.select().where(
                (Website.licID != '') & (Website.regID == com)).count()
            onlyDomain = Website.select().where(
                (Website.licID != '') & (Website.domain == shortUrl)).count()
            if onlyCname > 0:
                q = TaskInfo.update(state='9').where(TaskInfo.id == subTask.id)
                q.execute()
            elif onlyDomain > 0:
                q = TaskInfo.update(state='8').where(TaskInfo.id == subTask.id)
                q.execute()
            else:
                q = TaskInfo.update(state='4').where(TaskInfo.id == subTask.id)
                q.execute()
        else:
            q = TaskInfo.update(state='2').where(TaskInfo.id == subTask.id)
            q.execute()
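
# --- Hypothetical sketch (not part of the pipeline) ---
# judgeLzResult() and checkAllLz() both inline the same URL normalization:
# drop the scheme, strip spaces, and trim a trailing slash. A shared helper
# capturing that logic could look like this; the name is an assumption.
def _normalize_short_url(url):
    url = url.replace('http://', '').replace(' ', '')
    if url.endswith('/'):
        url = url[:-1]
    return url

assert _normalize_short_url('http://www.example.com/') == 'www.example.com'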
def checkAllLz(filePath, taskResultId):
    taskResult = TaskResult.getOne(TaskResult.taskResultId == taskResultId)
    delayDay = taskResult.taskId.intervalDay
    # Collect every XML file under the directory for the current task
    print('directory path:', filePath)
    fnames = os.listdir(filePath)
    for name in fnames:
        xmlName = filePath + '/' + name
        print('xmlName:', xmlName)
        data = parseXMLFile(xmlName, 'CheckItem')
        print('data:', data)
        # Store each item that needs to be checked
        for d in data:
            cname = d.get('cname')
            url = d.get('url')
            # Normalize: drop the scheme and spaces, trim a trailing slash
            if url != '':
                url = url.replace('http://', '').replace(' ', '')
                if url.endswith('/'):
                    url = url[:-1]
            if cname != '':
                cname = cname.replace(' ', '')
            webArea = d.get('area')
            webtype = d.get('WebType')
            # Insert the company if it is not on record yet
            if Company.getOne(Company.coname == cname) is None:
                c = Company()
                c.coname = cname
                c.save(force_insert=True)
            # Insert or update the website record
            if Website.getOne(Website.domain == url) is not None:
                q = Website.update(domain=url, type=webtype, area=webArea).where(Website.domain == url)
                q.execute()
            else:
                com = Company.getOne(Company.coname == cname)
                w = Website()
                w.regID = com
                w.domain = url
                w.area = webArea
                w.type = webtype
                w.save(force_insert=True)
            # Create a pending sub-task for the website
            updateWeb = Website.getOne(Website.domain == url)
            subTask = TaskInfo()
            subTask.taskResultId = taskResult
            subTask.webId = updateWeb
            subTask.state = '1'
            subTask.save(force_insert=True)
    taskResultId = str(taskResultId)
    if delayDay > 0:
        # Task that runs on a recurring schedule
        executeMultiTaskInfo(taskResultId)
    else:
        # logger.debug("starting single-run task")
        # One-shot task
        executeSingleTaskInfo(taskResultId)
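
# --- Hypothetical sketch (not part of the pipeline) ---
# checkAllLz() expects parseXMLFile(xmlName, 'CheckItem') to yield dicts with
# the keys read above. The XML layout and sample values below are assumptions
# inferred from the d.get(...) calls, not taken from the source:
#
# <CheckItem>
#     <cname>Example Co., Ltd.</cname>
#     <url>http://www.example.com/</url>
#     <area>110000</area>
#     <WebType>1</WebType>
# </CheckItem>
sample_check_item = {
    'cname': 'Example Co., Ltd.',
    'url': 'http://www.example.com/',
    'area': '110000',
    'WebType': '1',
}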