def _saveEvaluationBlocks(url, html):
    """Store every user-evaluation entry found on one parsed evaluation page.

    Each entry is keyed by ``url + '#' + <position on the page>`` and saved
    through ``MySQL().saveDoctorEvaluationText``.  A missing field makes
    ``findtext`` return ``None`` and the ``.strip()`` call raise; the caller
    is expected to handle that.
    """
    evaluateBlock = html.findall(
        './/div[@class="User_eval lh180 btn-a f14 fwei mt10"]')
    for index, block in enumerate(evaluateBlock):
        uName = block.findtext('.//span[@class="mr10 fl"]').strip()
        evalAtti = block.findtext('.//span[@class="fl colbd mr10"]').strip()
        evalScore = block.findtext('.//span[@class="colClass01 fl"]').strip()
        evalText = block.findtext('.//div[@class="pt5"]').strip()
        evalTime = block.findtext(
            './/span[@class="colbd f12 db pt10"]').strip()
        dbInfo = (url + '#' + str(index), uName, evalAtti, evalScore,
                  evalText, datetime.strptime(evalTime, '%Y-%m-%d %H:%M:%S'))
        MySQL().saveDoctorEvaluationText(dbInfo)


def getInfo3(url):
    """Scrape the detailed user evaluations of a doctor, across all pages.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    The page named by ``url`` is fetched and stored first; the total page
    count is read from that same page, and every later page is then fetched
    and stored as well.

    Any ordinary exception is handed to ``doExpt`` together with the URL of
    the page being processed when it occurred.  Nothing is returned.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveEvaluationBlocks(url, html)

        # Total number of evaluation pages; a missing pager means one page.
        # The slice drops one leading and three trailing characters around
        # the number (presumably fixed label text - verify against the site).
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        totalPages = (1 if totalPageInfo is None
                      else int(totalPageInfo.text.strip()[1:-3]))

        # Index of the current page: the text between 'page=' and the
        # fragment, read without assuming the fragment's length.
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:].split('#', 1)[0])
        urlPrefix = url[:tmpIndex]

        # Fetch the evaluations on every page after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # Rebinding ``url`` keeps the failing page visible to doExpt.
            url = urlPrefix + str(pageIndex) + '#name2'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveEvaluationBlocks(url, html)
    except Exception:
        doExpt('url3', url, 'url3')
def getQInfo(url, elem):
    """Extract one question's metadata and body from ``elem`` and store it.

    Saves the tuple ``(url, qTitle, qBody, qDatetime, keshi1, keshi2, uName,
    uSex, uAge)`` through ``MySQL().saveQInfo``.  When the asker-info row has
    fewer than seven ``span`` elements, name/sex/age stay ``None`` and the
    timestamp falls back to 2000-01-01 00:00:00.

    Raises whatever the element lookups raise when an expected node is
    missing (for example ``IndexError`` when fewer than three department
    links are present).
    """
    sectionBlock = elem.findall(
        './p[@class="pt10 pb10 lh180 znblue normal-a"]/a')
    # First-level department.
    keshi1 = getPureText(sectionBlock[2].text)
    # Second-level department, only present when there is a fourth link.
    keshi2 = None
    if len(sectionBlock) >= 4:
        keshi2 = getPureText(sectionBlock[3].text)

    qInfoBlock = elem.find('./div/div[@class="User_askcon clearfix pr"]')
    # Question title.
    qTitle = getPureText(qInfoBlock.find('.//p[@class="fl dib fb"]').text)

    userInfoBlock = qInfoBlock.findall(
        './div[@class="f12 graydeep Userinfo clearfix pl29"]/span')
    # Asker name, sex, age and the time the question was posted.
    uName = uSex = uAge = qDatetime = None
    if len(userInfoBlock) >= 7:
        uName = getPureText(userInfoBlock[0].text)
        uSex = getPureText(userInfoBlock[2].text)
        uAge = getPureText(userInfoBlock[4].text)
        qDatetime = getPureText(userInfoBlock[6].text)
    if qDatetime is not None:
        qDatetime = datetime.strptime(qDatetime, '%Y-%m-%d %H:%M:%S')
    else:
        # Placeholder timestamp.  Built directly: the earlier literal
        # '2000-01-01 00:00' had no seconds and could never be parsed with
        # the '%S' format, so the fallback itself raised ValueError.
        qDatetime = datetime(2000, 1, 1)

    # Question body: concatenate every non-empty text fragment.
    qBodyBlock = qInfoBlock.find('./div/div[@id="qdetailc"]')
    bodyParts = []
    for tmpText in qBodyBlock.itertext():
        subText = getPureText(tmpText)
        # Skip empty fragments instead of resetting the accumulated body.
        if subText is not None:
            bodyParts.append(subText)
    qBody = ''.join(bodyParts)

    MySQL().saveQInfo((url, qTitle, qBody, qDatetime, keshi1, keshi2,
                       uName, uSex, uAge))
def getInfo2(url):
    """Scrape the overall evaluation statistics of a doctor and store them.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    Saves ``(url, score, stat0 .. stat7)`` through
    ``MySQL().saveDoctorEvaluation``; each stat is the integer found between
    the parentheses of the corresponding ``span``, or 0 when that span has
    no text.

    Any ordinary exception is handed to ``doExpt``.  Nothing is returned.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        evaluateScore = html.findtext(
            './/h4[@class="f30 colClass01 fWei tc"]').strip()

        # Eight counters, defaulting to 0 when the page lists fewer spans.
        evaluateStat = {0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0}
        evaluateStatBlock = html.findall(
            './/div[@class="HomSptop_Ri fWei f14 mt20 fl"]/span')
        for index, item in enumerate(evaluateStatBlock):
            tmptext = item.text
            if not tmptext:
                # An element without text yields None, not '', so test
                # truthiness rather than len().
                evaluateStat[index] = 0
            else:
                evaluateStat[index] = int(
                    tmptext[tmptext.find('(') + 1:tmptext.find(')')])

        dbInfo = (url, evaluateScore, evaluateStat[0], evaluateStat[1],
                  evaluateStat[2], evaluateStat[3], evaluateStat[4],
                  evaluateStat[5], evaluateStat[6], evaluateStat[7])
        MySQL().saveDoctorEvaluation(dbInfo)
    except Exception:
        doExpt('url2', url, 'url2')
def getReply2Info(reply1Id, elems):
    """Store every second-level reply found under one first-level reply.

    For each element of ``elems`` the tuple ``(reply2Id, reply2Body,
    whoReply, reply2Datetime)`` is saved through ``MySQL().saveReply2Info``.
    ``reply2Id`` is ``reply1Id + '_' + <1-based position>``.  ``whoReply`` is
    0 for a follow-up question from the asker, 1 for a doctor's reply, and
    ``None`` when the speaker label is absent.  A missing body is stored as
    ``None`` and a missing timestamp as 2000-01-01 00:00:00.
    """
    for reply2Index, reply2InfoBlock in enumerate(elems, 1):
        # Which side is speaking: 0 = asker follow-up, 1 = doctor reply.
        whoReply = reply2InfoBlock.find('.//span[@class="fl dib fb Doc_bla"]')
        whoReply = getPureText(whoReply.text if whoReply is not None else None)
        if whoReply is not None:
            whoReply = 0 if '追问' in whoReply else 1

        # Content of the second-level reply.
        bodyElem = reply2InfoBlock.find('.//*[@class="fl w390"]')
        reply2Body = (getPureText(bodyElem.text)
                      if bodyElem is not None else None)

        # Time of the second-level reply.
        timeElem = reply2InfoBlock.find('./p[@class="tr col99 f12"]/span')
        timeText = getPureText(timeElem.text) if timeElem is not None else None
        if timeText is not None:
            reply2Datetime = datetime.strptime(timeText, '%Y-%m-%d %H:%M:%S')
        else:
            # Placeholder timestamp.  Built directly: the earlier literal
            # '2000-01-01 00:00' could never match the '%S' format.
            reply2Datetime = datetime(2000, 1, 1)

        reply2Id = reply1Id + '_' + str(reply2Index)
        MySQL().saveReply2Info(
            (reply2Id, reply2Body, whoReply, reply2Datetime))
def getReplyInfo(url, elem):
    """Store the first-level (doctor) replies of a question and their threads.

    ``accepted`` is 1 when the marker element's text equals '最佳答案', and
    0 otherwise; it selects which reply container and which rating container
    class is searched.  For every reply block the tuple ``(reply1Id,
    doctorUrl, reply1Body, reply1Datetime, puIndex, accepted)`` is saved
    through ``MySQL().saveReply1Info`` with ``reply1Id = url + '#' +
    <1-based position>``, after which the nested second-level replies are
    passed to ``getReply2Info``.  A missing timestamp is stored as
    2000-01-01 00:00:00.
    """
    # Whether the doctor's reply was accepted: 0 = not accepted, 1 = accepted.
    accepted = elem.find('./div[@class="t9999 questnew_icon Quest_askh2 pa"]')
    accepted = getPureText(accepted.text) if accepted is not None else None
    accepted = 1 if accepted == '最佳答案' else 0

    if accepted:
        reply1Block = elem.findall('./div[@class="docall clearfix Bestbg"]')
    else:
        reply1Block = elem.findall('./div[@class="docall clearfix "]')

    for reply1Index, block in enumerate(reply1Block, 1):
        # Personal URL of the replying doctor.
        doctorUrl = block.find('.//a[@class="f14 fb Doc_bla"]')
        doctorUrl = doctorUrl.get('href') if doctorUrl is not None else None

        # Full content of the doctor's reply.
        reply1Body = block.find('.//div[@class="pt15 f14 graydeep pl20 pr20"]')
        reply1Body = getAllText(reply1Body) if reply1Body is not None else None

        # Time of the reply.
        timeElem = block.find('.//p[@class="col99 tr clearfix pr20"]/span')
        timeText = getPureText(timeElem.text) if timeElem is not None else None
        if timeText is not None:
            reply1Datetime = datetime.strptime(timeText, '%Y-%m-%d %H:%M:%S')
        else:
            # Placeholder timestamp.  Built directly: the earlier literal
            # '2000-01-01 00:00' could never match the '%S' format.
            reply1Datetime = datetime(2000, 1, 1)

        # Text of the b.gratenum element; its container class differs
        # between accepted and non-accepted replies.
        if accepted:
            puIndex = block.find(
                './/div[@class="clearfix pb10 pl20 pr20 ballc"]'
                '//b[@class="gratenum"]')
        else:
            puIndex = block.find(
                './/div[@class="clearfix pb10 pl20 pr20 ballc pr"]'
                '//b[@class="gratenum"]')
        puIndex = getPureText(puIndex.text) if puIndex is not None else None

        reply1Id = url + '#' + str(reply1Index)
        MySQL().saveReply1Info((reply1Id, doctorUrl, reply1Body,
                                reply1Datetime, puIndex, accepted))

        # Fetch the second-level replies under this reply.
        reply2InfoBlock = block.findall(
            './/div[@class="appdoc appxian ml20 mr20 mt15 pb10 clearfix f14"]'
            '/div[@class="usezw pt10 clearfix"]')
        getReply2Info(reply1Id, reply2InfoBlock)
# NOTE(review): the next line is the tail of a definition whose header lies
# outside this view; it is left exactly as found.
if rawText == '': rawText = None return rawText


def doExpt(password, tb, url, logIdentifier):
    """Record a URL whose processing failed and log the active exception.

    When ``password`` is not None the URL is saved with
    ``UrlClient.saveUrl(password, tb, url)``; otherwise it is saved with
    ``Redis().saveUrl(tb, url)``.  The current traceback text, the URL and
    ``logIdentifier`` are then passed to ``FileIO.handleExpt``.
    """
    if password is not None:
        UrlClient.saveUrl(password, tb, url)
    else:
        Redis().saveUrl(tb, url)  # when on the same host as redis
    FileIO.handleExpt(traceback.format_exc(), url, logIdentifier)


if __name__ == '__main__':
    # First prompt asks for the authentication password, second for the key
    # name of the queue that holds the data.
    tmpPwd = input('请输入用于认证的密码:').strip()
    tmpYear = input('请输入数据所在队列键名:').strip()
    MySQL().createTables()
    # Message: the corresponding database tables are ready.
    print('数据库中相应数据表已准备完成...')
    # Earlier thread-based variant, kept commented out:
    # threadList = []
    # for i in range(5):
    #     tmpThread = threading.Thread(target=getQPageInfo, args=(tmpYear, None if tmpPwd == '' else tmpPwd))
    #     threadList.append(tmpThread)
    # for tmpThread in threadList:
    #     tmpThread.start()
    # for tmpThread in threadList:
    #     tmpThread.join()
    # Spawn five gevent jobs running getQPageInfo; an empty password is
    # passed on as None.
    jobs = []
    for i in range(5):
        jobs.append(gevent.spawn(getQPageInfo, tmpYear, (None if tmpPwd == '' else tmpPwd)))
    gevent.joinall(jobs)
def _appendStrippedTexts(target, elems):
    """Append the stripped text fragments of ``elems[i]`` to ``target[i]``."""
    for index, item in enumerate(elems):
        for text in item.itertext():
            target[index] += text.strip()


def getInfo(url):
    """Scrape a doctor's profile page and store it.

    ``url`` looks like ``http://club.xywy.com/familyDoctor/pay/43983196``.
    Saves ``(url, name, rank, hospital, medals, message, serviceType0,
    serviceType1, oldServiceType0, oldServiceType1, helped0, helped1,
    specialty, introduction, honours)`` through ``MySQL().saveDoctorInfo``.

    Any ordinary exception (including a missing page element) is handed to
    ``doExpt``.  Nothing is returned.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return

        # Doctor's name: the element text minus its last six characters.
        doctorName = html.findtext('.//i[@class="fwei fl"]')
        if doctorName is not None and len(doctorName) > 6:
            doctorName = doctorName[:-6]
        else:
            doctorName = None

        # Title, then hospital/department (the text following the <br>).
        doctorRankAndHosp = html.find('.//div[@class=" lh200 pt10 f14"]')
        doctorRank = doctorRankAndHosp.text
        doctorHosp = doctorRankAndHosp.find('./br')

        # Medals, each followed by a comma.
        medalsBlock = html.findall('.//div[@class="HomePth"]/span')
        medals = ''.join(medal.get('data-th') + ',' for medal in medalsBlock)

        # The doctor's message to patients.
        sendWord = html.find(
            './/div[@class="f12 graydeep club_home_icon HomePj"]/span').tail

        # Service types: two alternative page layouts are tried in turn; only
        # the second one carries the original prices as well.
        serviceTypes = {0: '', 1: ''}
        oldServiceTypes = {0: '', 1: ''}
        primaryBlock = html.find('.//div[@class="fl pr"]')
        fallbackBlock = None
        if primaryBlock is None:
            fallbackBlock = html.find('.//div[@class="fl f14"]')
        if primaryBlock is not None:
            _appendStrippedTexts(serviceTypes,
                                 primaryBlock.findall('.//a[@cate]'))
        elif fallbackBlock is not None:
            _appendStrippedTexts(serviceTypes,
                                 fallbackBlock.findall('.//a[@cate]'))
            # Original price of each service.
            _appendStrippedTexts(
                oldServiceTypes,
                fallbackBlock.findall('.//span[@class="f14 col99 ml10"]'))

        # The user score is scraped from the evaluation page instead.

        # Contracted families and helped users.
        helpedInfo = {0: None, 1: None}
        helpedInfoBlock = html.findall('.//span[@class="fb f16 ml5"]')
        for index, item in enumerate(helpedInfoBlock):
            helpedInfo[index] = item.text

        # Specialty, introduction and honours, told apart by their heading.
        infos = {0: '', 1: '', 2: ''}
        infoBlock = html.findall('.//div[@class="HomeJie f14 fwei pt20"]')
        for item in infoBlock:
            tmp = item.findtext('./h4')
            textblock = item.find('./div')
            tmptext = ''.join(text.strip() for text in textblock.itertext())
            if '擅长' in tmp:
                infos[0] = tmptext
            elif '简介' in tmp:
                infos[1] = tmptext
            elif '荣誉' in tmp:
                infos[2] = tmptext

        dbInfo = (url, doctorName, doctorRank, doctorHosp.tail, medals,
                  sendWord, serviceTypes[0], serviceTypes[1],
                  oldServiceTypes[0], oldServiceTypes[1], helpedInfo[0],
                  helpedInfo[1], infos[0], infos[1], infos[2])
        MySQL().saveDoctorInfo(dbInfo)
    except Exception:
        doExpt('url1', url, 'url1')
def _saveServiceBlocks(url, html):
    """Store every service-purchase record found on one parsed page.

    Each record is keyed by ``url + '#' + <position on the page>`` and saved
    through ``MySQL().saveServiceInfo``.  ``serviceType`` is 1 when the
    service description contains '包月' and 0 otherwise.  A missing field
    makes ``findtext`` return ``None`` and the ``.strip()`` call raise; the
    caller is expected to handle that.
    """
    serviceBuyBlock = html.findall('.//div[@class="HomBone fwei f14"]')
    for index, block in enumerate(serviceBuyBlock):
        uName = block.findtext('.//span[@class="w100"]').strip()
        serviceName = block.findtext('.//span[@class="w200 tl"]').strip()
        serviceType = 1 if '包月' in serviceName else 0
        serviceCount = block.findtext('.//span[@class="w60 tc"]').strip()
        servicePrice = block.findtext(
            './/span[@class="colClass01 fb w80 tc"]').strip()
        serviceStatus = block.findtext(
            './/span[@class="club_home_icon HomBsuc"]').strip()
        serviceTime = block.findtext(
            './/span[@class="col99 ml20 tc"]').strip()
        dbInfo = (url + '#' + str(index), uName, serviceType, serviceCount,
                  servicePrice, serviceStatus, serviceTime)
        MySQL().saveServiceInfo(dbInfo)


def getInfo4(url):
    """Scrape the service-purchase records of a doctor, across all pages.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=2&page=2#name3``.
    The page named by ``url`` is fetched and stored first; the total page
    count is read from that same page, and every later page is then fetched
    and stored as well.

    Any ordinary exception is handed to ``doExpt`` together with the URL of
    the page being processed when it occurred.  Nothing is returned.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveServiceBlocks(url, html)

        # Total number of pages; a missing pager means a single page.
        # The slice drops one leading and three trailing characters around
        # the number (presumably fixed label text - verify against the site).
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        totalPages = (1 if totalPageInfo is None
                      else int(totalPageInfo.text.strip()[1:-3]))

        # Index of the current page: the text between 'page=' and the
        # fragment, read without assuming the fragment's length.
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:].split('#', 1)[0])
        urlPrefix = url[:tmpIndex]

        # Fetch the records on every page after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # Rebinding ``url`` keeps the failing page visible to doExpt.
            url = urlPrefix + str(pageIndex) + '#name3'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveServiceBlocks(url, html)
    except Exception:
        doExpt('url4', url, 'url4')