def __getAllDUrl(thisTask):
    """Harvest doctor URLs from every listing page reachable from each task URL.

    Each URL in ``thisTask`` must end in ``page=<n>``.  Page ``n`` is fetched
    first; the last pager label on that page gives the total page count, and
    pages ``n + 1`` .. ``count`` are then fetched in turn.  The doctor URLs
    found on every page are stored under the ``'family-doctor-url'`` key.

    Any ordinary error while handling a URL is reported through
    ``DUrlProducer.__doExpt`` together with the URL being processed at that
    moment, and the loop moves on to the next task URL.
    """
    for url in thisTask:
        try:
            # Split "...page=<n>" into the reusable prefix and the start page.
            splitIndex = url.rfind('page=') + 5
            urlPrefix = url[:splitIndex]
            basePageIndex = int(url[splitIndex:])

            html = NetworkIO().requestHtml(url)
            if html is None:
                continue

            doctorUrls = DUrlProducer.__getDoctorUrl(html)
            pageLabels = html.xpath(
                '//div[@class="mt20 HomeFen f14"]/a[@class="page"]/text()')
            # No pager links means the listing has a single page.
            pageCount = int(pageLabels[-1]) if pageLabels else 1
            Redis().saveUrl('family-doctor-url', *doctorUrls, backup=True)

            # The range is empty when the start page is already the last one.
            for pageIndex in range(basePageIndex + 1, pageCount + 1):
                # ``url`` is rebound on purpose: if this page fails, the
                # handler below receives the page that actually failed.
                url = urlPrefix + str(pageIndex)
                html = NetworkIO().requestHtml(url)
                if html is not None:
                    doctorUrls = DUrlProducer.__getDoctorUrl(html)
                    Redis().saveUrl(
                        'family-doctor-url', *doctorUrls, backup=True)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop the crawler.
            # print('>>>Exception: ' + traceback.format_exc())
            DUrlProducer.__doExpt('sickness-url', url, 'doctor_0')
def getQPageInfo(year, password):
    """Crawl question pages batch by batch until no URLs are left.

    URLs are pulled in batches: through ``UrlClient.getUrls(password)`` when a
    password is given (per the original note, Redis is on another host), or
    straight from ``Redis().listUrls(year, 300)`` otherwise (Redis on the same
    host).  For every URL the page is fetched; the first question-info block
    and the first reply block found on it are handed to ``getQInfo`` and
    ``getReplyInfo`` respectively.  The function returns once a fetched batch
    is empty.

    Any ordinary error while handling a URL is reported through ``doExpt`` and
    processing continues with the next URL.
    """

    def fetchBatch():
        # Single place for the "remote vs. local Redis" decision, used for
        # both the first batch and every refill.
        if password is not None:
            return UrlClient.getUrls(password)
        return Redis().listUrls(year, 300)

    urlPool = fetchBatch()
    while len(urlPool) > 0:
        for url in urlPool:
            try:
                html = NetworkIO().requestHtml(url)
                if html is None:
                    continue

                # Question information.
                qInfoBlock = html.xpath(
                    '//div[@class="w980 clearfix bc f12 btn-a pr"]')
                if len(qInfoBlock) > 0:
                    getQInfo(url, qInfoBlock[0])

                # Replies to the question.
                replyInfoBlock = html.xpath(
                    '//div[@class="Doc_con clearfix pr mt5 "]')
                if len(replyInfoBlock) > 0:
                    getReplyInfo(url, replyInfoBlock[0])
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # KeyboardInterrupt / SystemExit can still stop the crawler.
                # print('>>>Exception: ' + traceback.format_exc())
                doExpt(password, year, url, '1')
        urlPool = fetchBatch()
def __getAllSicknessUrl(self):
    """Queue the first listing page of every sickness linked from the start page.

    Fetches ``self.__initUrl``, takes the ``href`` of each ``<a class="mr5">``
    link, expands it into an absolute first-page listing URL and stores the
    whole set under the ``'sickness-url'`` key.  Does nothing when the start
    page cannot be fetched.
    """
    startPage = NetworkIO().requestHtml(self.__initUrl)
    if startPage is None:
        return

    listingUrls = [
        'http://club.xywy.com' + href + '&keyword=&page=1'
        for href in startPage.xpath('//a[@class="mr5"]/@href')
    ]
    Redis().saveUrl('sickness-url', *listingUrls)
def __getDayUrl(self):
    """Walk the index pages and queue every per-day URL that is not filtered.

    Index pages ``self.__startPage`` .. ``self.__endPage`` (inclusive) are
    fetched from ``http://club.xywy.com/keshi/<page>.html``.  For each day
    link whose name passes ``self.__isFiltered``, the link's ``href`` is stored
    under the key ``'-' + year``.  The year is the first four characters of a
    10-character day name, or ``'2000'`` for any other length.  Every year
    seen is also recorded in ``self.__years``.
    """
    urlPrefix = 'http://club.xywy.com/keshi/'
    for pageIndex in range(self.__startPage, self.__endPage + 1):
        tmpUrl = urlPrefix + str(pageIndex) + '.html'
        html = NetworkIO().requestHtml(tmpUrl)
        if html is None:
            continue

        for dayLink in html.xpath('//ul[@class="club_Date clearfix"]//a'):
            # An anchor without leading text has ``text is None``; skip it
            # instead of letting ``.strip`` abort the whole walk.
            if dayLink.text is None:
                continue
            dayName = dayLink.text.strip('[] ')
            if self.__isFiltered(dayName):
                continue

            dayUrl = dayLink.get('href')
            # print((dayName, dayUrl))
            year = dayName[0:4] if len(dayName) == 10 else '2000'
            if year not in self.__years:
                self.__years.add(year)
            Redis().saveUrl('-' + year, dayUrl)

        # Reported only for pages that were actually fetched and processed.
        print(tmpUrl + ' --> completed...')
def _saveEvaluationRows(html, url):
    """Store one row per user-evaluation block found on a single page.

    Each row is keyed by ``url + '#' + <block index>`` and carries the user
    name, attitude, score, text and the evaluation time parsed with the
    ``'%Y-%m-%d %H:%M:%S'`` format.
    """
    evaluateBlock = html.findall(
        './/div[@class="User_eval lh180 btn-a f14 fwei mt10"]')
    for index, block in enumerate(evaluateBlock):
        uName = block.findtext('.//span[@class="mr10 fl"]').strip()
        evalAtti = block.findtext('.//span[@class="fl colbd mr10"]').strip()
        evalScore = block.findtext('.//span[@class="colClass01 fl"]').strip()
        evalText = block.findtext('.//div[@class="pt5"]').strip()
        evalTime = block.findtext(
            './/span[@class="colbd f12 db pt10"]').strip()
        dbInfo = (url + '#' + str(index), uName, evalAtti, evalScore,
                  evalText, datetime.strptime(evalTime, '%Y-%m-%d %H:%M:%S'))
        MySQL().saveDoctorEvaluationText(dbInfo)


def getInfo3(url):
    """Scrape the individual user evaluations of a doctor, across all pages.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    The given page is scraped first; the pager on that page also yields the
    total number of evaluation pages, and every later page is then fetched and
    scraped the same way.

    Any ordinary error is reported through ``doExpt`` together with the URL
    that was being processed when it happened.
    """
    try:
        # The first request serves two purposes: it provides this page's
        # evaluations and the total number of evaluation pages.
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveEvaluationRows(html, url)

        # Total number of evaluation pages; a missing pager means one page.
        # The slice drops the first character and the last three of the label.
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        totalPages = 1 if totalPageInfo is None else int(
            totalPageInfo.text.strip()[1:-3])

        # Index of the page we were handed.  Cutting at '#' instead of
        # slicing off a fixed six characters keeps this correct whatever the
        # fragment length is (or when there is no fragment at all).
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:].split('#', 1)[0])
        urlPrefix = url[:tmpIndex]

        # Scrape every page after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # ``url`` is rebound on purpose: if this page fails, the handler
            # below receives the page that actually failed.
            url = urlPrefix + str(pageIndex) + '#name2'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveEvaluationRows(html, url)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop the crawler.
        doExpt('url3', url, 'url3')
def getInfo2(url):
    """Scrape the aggregate user-evaluation statistics of a doctor.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    The overall score and up to eight per-category counters are stored as one
    row.  A counter is the integer between ``(`` and ``)`` in the span's text;
    a span with no text counts as 0, as do categories missing from the page.

    Any ordinary error is reported through ``doExpt``.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return

        evaluateScore = html.findtext(
            './/h4[@class="f30 colClass01 fWei tc"]').strip()

        # Eight counters, all defaulting to 0.
        evaluateStat = dict.fromkeys(range(8), 0)
        evaluateStatBlock = html.findall(
            './/div[@class="HomSptop_Ri fWei f14 mt20 fl"]/span')
        for index, item in enumerate(evaluateStatBlock):
            tmptext = item.text
            # ``item.text`` is None (not '') for a span without text; treat
            # both as "no value" instead of failing on len(None).
            if tmptext:
                evaluateStat[index] = int(
                    tmptext[tmptext.find('(') + 1:tmptext.find(')')])
            else:
                evaluateStat[index] = 0

        dbInfo = (url, evaluateScore) + tuple(
            evaluateStat[slot] for slot in range(8))
        MySQL().saveDoctorEvaluation(dbInfo)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop the crawler.
        doExpt('url2', url, 'url2')
def __getAllQUrl(task, year):
    """Harvest question URLs from every page of each per-day listing in ``task``.

    Each URL is sliced positionally as
    ``http://club.xywy.com/keshi/<10-char date>/<page>.html``.  The given page
    is fetched first; it supplies the day's page count, and pages
    ``page + 1`` .. ``count`` are then fetched in turn.  The question URLs
    found on every page are stored under the ``year`` key.

    Any ordinary error while handling a URL is reported through
    ``QUrlProducer.__doExpt`` with the key ``'-' + year`` and the URL being
    processed at that moment; the loop then moves on to the next task URL.
    """
    for url in task:
        try:
            # Date segment, reused below to build the later page URLs.
            date = url[27:37]
            # Page number sits between the date's trailing '/' and '.html'.
            pageBase = int(url[38:-5])

            html = NetworkIO().requestHtml(url)
            if html is None:
                continue

            pageCount = QUrlProducer.__getDayPageCount(html)
            qUrls = QUrlProducer.__getPageQUrl(html)
            Redis().saveUrl(year, *qUrls, backup=True)

            # The range is empty when the start page is already the last one.
            for pageIndex in range(pageBase + 1, pageCount + 1):
                # ``url`` is rebound on purpose: if this page fails, the
                # handler below receives the page that actually failed.
                url = ('http://club.xywy.com/keshi/' + date + '/' +
                       str(pageIndex) + '.html')
                html = NetworkIO().requestHtml(url)
                if html is not None:
                    qUrls = QUrlProducer.__getPageQUrl(html)
                    Redis().saveUrl(year, *qUrls, backup=True)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt / SystemExit can still stop the crawler.
            # print('>>>Exception: ' + traceback.format_exc())
            QUrlProducer.__doExpt('-' + year, url, '0')
def _appendElementTexts(slots, elements):
    """Append the stripped text fragments of the i-th element to ``slots[i]``."""
    for index, element in enumerate(elements):
        for text in element.itertext():
            slots[index] += text.strip()


def getInfo(url):
    """Scrape a doctor's profile page and store it as one row.

    ``url`` looks like ``http://club.xywy.com/familyDoctor/pay/43983196``.
    The stored row holds, in order: url, name, rank, hospital/department,
    medals, the doctor's message, two service descriptions, two original-price
    descriptions, two "helped" counters, and the speciality / introduction /
    honours texts.

    Any ordinary error (including a missing mandatory element) is reported
    through ``doExpt`` and nothing is stored for the page.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return

        # Doctor name: the last six characters of the label are dropped;
        # labels of six characters or fewer yield no name.
        doctorName = html.findtext('.//i[@class="fwei fl"]')
        if doctorName is not None and len(doctorName) > 6:
            doctorName = doctorName[:-6]
        else:
            doctorName = None

        # Rank is the text before the <br>; the hospital/department is the
        # text after it (read from the <br>'s tail when the row is built).
        doctorRankAndHosp = html.find('.//div[@class=" lh200 pt10 f14"]')
        doctorRank = doctorRankAndHosp.text
        doctorHosp = doctorRankAndHosp.find('./br')

        # Medals, each followed by a comma (so a trailing comma remains).
        medals = ''.join(
            medal.get('data-th') + ','
            for medal in html.findall('.//div[@class="HomePth"]/span'))

        # The doctor's message.
        sendWord = html.find(
            './/div[@class="f12 graydeep club_home_icon HomePj"]/span').tail

        # Service types.  The page uses one of two layouts; only the second
        # one also carries the original prices.
        serviceTypes = {0: '', 1: ''}
        oldServiceTypes = {0: '', 1: ''}
        serviceBlock = html.find('.//div[@class="fl pr"]')
        if serviceBlock is not None:
            _appendElementTexts(serviceTypes,
                                serviceBlock.findall('.//a[@cate]'))
        else:
            serviceBlock = html.find('.//div[@class="fl f14"]')
            if serviceBlock is not None:
                _appendElementTexts(serviceTypes,
                                    serviceBlock.findall('.//a[@cate]'))
                # Original price of each service.
                _appendElementTexts(
                    oldServiceTypes,
                    serviceBlock.findall('.//span[@class="f14 col99 ml10"]'))

        # The user score is scraped from the evaluation page instead.
        # evaluateScore = html.findtext('.//span[@class="fl colClass01 fwei"]')

        # Contracted families and helped users.
        helpedInfo = {0: None, 1: None}
        for index, item in enumerate(
                html.findall('.//span[@class="fb f16 ml5"]')):
            helpedInfo[index] = item.text

        # Speciality, introduction and honours, told apart by their heading.
        infos = {0: '', 1: '', 2: ''}
        for item in html.findall('.//div[@class="HomeJie f14 fwei pt20"]'):
            heading = item.findtext('./h4')
            textblock = item.find('./div')
            tmptext = ''.join(
                text.strip() for text in textblock.itertext())
            if '擅长' in heading:
                infos[0] = tmptext
            elif '简介' in heading:
                infos[1] = tmptext
            elif '荣誉' in heading:
                infos[2] = tmptext

        dbInfo = (url, doctorName, doctorRank, doctorHosp.tail, medals,
                  sendWord, serviceTypes[0], serviceTypes[1],
                  oldServiceTypes[0], oldServiceTypes[1], helpedInfo[0],
                  helpedInfo[1], infos[0], infos[1], infos[2])
        MySQL().saveDoctorInfo(dbInfo)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop the crawler.
        doExpt('url1', url, 'url1')
def _saveServiceRows(html, url):
    """Store one row per service-purchase block found on a single page.

    Each row is keyed by ``url + '#' + <block index>``.  The service type is 1
    when the type label contains ``'包月'`` and 0 otherwise.
    """
    serviceBuyBlock = html.findall('.//div[@class="HomBone fwei f14"]')
    for index, block in enumerate(serviceBuyBlock):
        uName = block.findtext('.//span[@class="w100"]').strip()
        typeLabel = block.findtext('.//span[@class="w200 tl"]').strip()
        serviceType = 1 if '包月' in typeLabel else 0
        serviceCount = block.findtext('.//span[@class="w60 tc"]').strip()
        servicePrice = block.findtext(
            './/span[@class="colClass01 fb w80 tc"]').strip()
        serviceStatus = block.findtext(
            './/span[@class="club_home_icon HomBsuc"]').strip()
        serviceTime = block.findtext(
            './/span[@class="col99 ml20 tc"]').strip()
        dbInfo = (url + '#' + str(index), uName, serviceType, serviceCount,
                  servicePrice, serviceStatus, serviceTime)
        MySQL().saveServiceInfo(dbInfo)


def getInfo4(url):
    """Scrape the service-purchase records of a doctor, across all pages.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=2&page=2#name3``.
    The given page is scraped first; the pager on that page also yields the
    total number of pages, and every later page is then fetched and scraped
    the same way.

    Any ordinary error is reported through ``doExpt`` together with the URL
    that was being processed when it happened.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveServiceRows(html, url)

        # Total number of pages; a missing pager means a single page.
        # The slice drops the first character and the last three of the label.
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        totalPages = 1 if totalPageInfo is None else int(
            totalPageInfo.text.strip()[1:-3])

        # Index of the page we were handed.  Cutting at '#' instead of
        # slicing off a fixed six characters keeps this correct whatever the
        # fragment length is (or when there is no fragment at all).
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:].split('#', 1)[0])
        urlPrefix = url[:tmpIndex]

        # Scrape every page after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # ``url`` is rebound on purpose: if this page fails, the handler
            # below receives the page that actually failed.
            url = urlPrefix + str(pageIndex) + '#name3'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveServiceRows(html, url)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit can still stop the crawler.
        doExpt('url4', url, 'url4')