Python NetworkIOの例、common.IOHandler.NetworkIO Pythonの例

コード例 #1

0

ファイルを表示

    def __getAllDUrl(thisTask):
        """Collect doctor URLs from every paginated listing URL in ``thisTask``.

        Each URL is expected to end in ``page=<n>``. Page ``n`` is fetched
        first: the doctor URLs found on it are saved under the
        ``'family-doctor-url'`` key, and the text of the last ``a.page`` pager
        link is read as the total page count (1 when there is no pager).
        Every later page up to that count is then fetched and saved the same
        way.

        Any ``Exception`` raised while handling a task URL is reported through
        ``DUrlProducer.__doExpt`` together with the URL being processed at
        that moment (which may be a follow-up page), and processing moves on
        to the next task URL. ``KeyboardInterrupt`` and ``SystemExit`` are
        deliberately not intercepted.

        Args:
            thisTask: iterable of listing URLs.
        """
        for url in thisTask:
            try:
                splitIndex = url.rfind('page=') + 5
                urlPrefix = url[:splitIndex]
                basePageIndex = int(url[splitIndex:])
                html = NetworkIO().requestHtml(url)
                if html is None:
                    continue

                doctorUrls = DUrlProducer.__getDoctorUrl(html)
                pageLinks = html.xpath(
                    '//div[@class="mt20 HomeFen f14"]/a[@class="page"]/text()'
                )
                # No pager links means the listing consists of a single page.
                pageCount = int(pageLinks[-1]) if pageLinks else 1
                Redis().saveUrl('family-doctor-url',
                                *doctorUrls,
                                backup=True)

                # The range is empty when pageCount <= basePageIndex, so no
                # separate comparison is needed.
                for pageIndex in range(basePageIndex + 1, pageCount + 1):
                    # ``url`` is rebound on purpose: if this page fails, the
                    # handler below reports the page that actually failed.
                    url = urlPrefix + str(pageIndex)
                    html = NetworkIO().requestHtml(url)
                    if html is not None:
                        doctorUrls = DUrlProducer.__getDoctorUrl(html)
                        Redis().saveUrl('family-doctor-url',
                                        *doctorUrls,
                                        backup=True)
            except Exception:
                DUrlProducer.__doExpt('sickness-url', url, 'doctor_0')

コード例 #2

0

ファイルを表示

ファイル: QUrlConsumer.py プロジェクト: liuming-dev/XYWYCrawler

def getQPageInfo(year, password):
    """Consume question-page URLs in batches until the source runs dry.

    For every URL the page is fetched; when the question block and/or the
    reply block are present they are handed to ``getQInfo`` and
    ``getReplyInfo`` respectively. A URL whose processing raises an
    ``Exception`` is reported through ``doExpt`` and the batch continues.
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not intercepted.

    Args:
        year: key passed to ``Redis().listUrls`` (batches of 300) and to
            ``doExpt``.
        password: when not ``None``, URLs are pulled through
            ``UrlClient.getUrls(password)`` instead (per the original notes,
            this is the mode used when redis lives on a different host).
    """

    def nextBatch():
        # Remote mode (redis on another host) vs. local redis access.
        if password is not None:
            return UrlClient.getUrls(password)
        return Redis().listUrls(year, 300)

    urlPool = nextBatch()
    # An empty (or missing) batch means there is nothing left to consume.
    while urlPool:
        for url in urlPool:
            try:
                html = NetworkIO().requestHtml(url)
                if html is None:
                    continue
                # Question details.
                qInfoBlock = html.xpath('//div[@class="w980 clearfix bc f12 btn-a pr"]')
                if qInfoBlock:
                    getQInfo(url, qInfoBlock[0])
                # Replies to the question.
                replyInfoBlock = html.xpath('//div[@class="Doc_con clearfix pr mt5 "]')
                if replyInfoBlock:
                    getReplyInfo(url, replyInfoBlock[0])
            except Exception:
                doExpt(password, year, url, '1')
        urlPool = nextBatch()

コード例 #3

0

ファイルを表示

 def __getAllSicknessUrl(self):
     """Queue one first-page listing URL per sickness link on the start page.

     The page at ``self.__initUrl`` is fetched; every ``href`` of an
     ``a.mr5`` anchor is turned into an absolute club.xywy.com URL with the
     ``&keyword=&page=1`` suffix and the whole batch is saved under the
     ``'sickness-url'`` key. Nothing is saved when the page cannot be
     fetched.
     """
     html = NetworkIO().requestHtml(self.__initUrl)
     if html is None:
         return
     hrefs = html.xpath('//a[@class="mr5"]/@href')
     listingUrls = [
         'http://club.xywy.com' + href + '&keyword=&page=1'
         for href in hrefs
     ]
     Redis().saveUrl('sickness-url', *listingUrls)

コード例 #4

0

ファイルを表示

 def __getDayUrl(self):
     """Walk the index pages and queue every non-filtered day link.

     Pages ``self.__startPage`` .. ``self.__endPage`` (inclusive) under
     ``http://club.xywy.com/keshi/`` are fetched. For each anchor in the
     ``club_Date`` list whose label passes ``self.__isFiltered``, the year
     is taken from the first four characters of a 10-character label
     (otherwise ``'2000'``), remembered in ``self.__years`` and the link is
     saved under the key ``'-' + year``. A completion line is printed for
     every page, fetched or not.
     """
     base = 'http://club.xywy.com/keshi/'
     for pageNo in range(self.__startPage, self.__endPage + 1):
         pageUrl = base + str(pageNo) + '.html'
         html = NetworkIO().requestHtml(pageUrl)
         links = []
         if html is not None:
             links = html.xpath('//ul[@class="club_Date clearfix"]//a')
         for link in links:
             label = link.text.strip('[] ')
             if self.__isFiltered(label):
                 continue
             href = link.get('href')
             # Labels that are exactly 10 characters long start with the year.
             if len(label) == 10:
                 year = label[:4]
             else:
                 year = '2000'
             if year not in self.__years:
                 self.__years.add(year)
             Redis().saveUrl('-' + year, href)
         print(pageUrl + ' --> completed...')

コード例 #5

0

ファイルを表示

ファイル: DUrlConsumer.py プロジェクト: liuming-dev/XYWYCrawler

def _saveEvaluationEntries(url, html):
    """Store every user-evaluation entry found in ``html``.

    Each row is keyed by ``url + '#' + <position of the entry on the page>``.
    """
    evaluateBlock = html.findall(
        './/div[@class="User_eval lh180 btn-a f14 fwei mt10"]')
    for index, block in enumerate(evaluateBlock):
        uName = block.findtext('.//span[@class="mr10 fl"]').strip()
        evalAtti = block.findtext('.//span[@class="fl colbd mr10"]').strip()
        evalScore = block.findtext('.//span[@class="colClass01 fl"]').strip()
        evalText = block.findtext('.//div[@class="pt5"]').strip()
        evalTime = block.findtext(
            './/span[@class="colbd f12 db pt10"]').strip()
        dbInfo = (url + '#' + str(index), uName, evalAtti, evalScore,
                  evalText,
                  datetime.strptime(evalTime, '%Y-%m-%d %H:%M:%S'))
        MySQL().saveDoctorEvaluationText(dbInfo)


def getInfo3(url):
    """Scrape the detailed user evaluations starting at ``url``.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``:
    the page number sits between ``page=`` and the trailing 6-character
    ``#name2`` anchor. The given page is stored first; its pager then tells
    how many pages exist, and every later page is fetched and stored too.

    Any ``Exception`` is reported through ``doExpt`` together with the URL
    being processed at that moment (possibly a follow-up page).
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not intercepted.

    Args:
        url: evaluation-page URL to start from.
    """
    try:
        # The first visit yields both the evaluations of this page and the
        # total number of evaluation pages.
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveEvaluationEntries(url, html)

        # Total number of evaluation pages (1 when there is no pager).
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        if totalPageInfo is None:
            totalPages = 1
        else:
            # The number is what remains after dropping the first character
            # and the last three of the pager text (exact wording of that
            # text is not visible here -- TODO confirm).
            totalPages = int(totalPageInfo.text.strip()[1:-3])

        # Index of the page we were asked for.
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:-6])
        urlPrefix = url[:tmpIndex]

        # Evaluations on the pages after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # ``url`` is rebound on purpose so that a failure is reported
            # against the page that actually failed.
            url = urlPrefix + str(pageIndex) + '#name2'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveEvaluationEntries(url, html)
    except Exception:
        doExpt('url3', url, 'url3')

コード例 #6

0

ファイルを表示

ファイル: DUrlConsumer.py プロジェクト: liuming-dev/XYWYCrawler

def getInfo2(url):
    """Scrape the overall user-evaluation summary of the page at ``url``.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=1&page=2#name2``.
    The overall score and up to eight per-category counters are stored via
    ``MySQL().saveDoctorEvaluation``. Each counter is the number written
    between full-width parentheses in its ``span``; a span without text
    counts as 0, and counters that are absent from the page stay 0.

    Any ``Exception`` is reported through ``doExpt``. ``KeyboardInterrupt``
    and ``SystemExit`` are deliberately not intercepted.

    Args:
        url: evaluation-page URL.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        evaluateScore = html.findtext(
            './/h4[@class="f30 colClass01 fWei tc"]').strip()
        # A dict (not a list) so that unexpected extra spans are tolerated.
        evaluateStat = dict.fromkeys(range(8), 0)
        evaluateStatBlock = html.findall(
            './/div[@class="HomSptop_Ri fWei f14 mt20 fl"]/span')
        for index, item in enumerate(evaluateStatBlock):
            tmptext = item.text
            # ElementTree-style elements report missing text as None rather
            # than '', so both are treated as "no counter".
            if tmptext:
                evaluateStat[index] = int(
                    tmptext[tmptext.find('（') + 1:tmptext.find('）')])
            else:
                evaluateStat[index] = 0
        dbInfo = (url, evaluateScore) + tuple(
            evaluateStat[i] for i in range(8))
        MySQL().saveDoctorEvaluation(dbInfo)
    except Exception:
        doExpt('url2', url, 'url2')

コード例 #7

0

ファイルを表示

    def __getAllQUrl(task, year):
        """Collect question URLs from every day-listing URL in ``task``.

        Each URL is sliced positionally: characters 27..36 are the date
        segment and the page number sits between position 38 and the trailing
        ``.html``. The given page is fetched first: its question URLs are
        saved under ``year`` and its total page count is read. Every later
        page of that day is then fetched and saved the same way.

        Any ``Exception`` raised while handling a task URL is reported through
        ``QUrlProducer.__doExpt`` (key ``'-' + year``) together with the URL
        being processed at that moment, and processing moves on to the next
        task URL. ``KeyboardInterrupt`` and ``SystemExit`` are deliberately
        not intercepted.

        Args:
            task: iterable of day-listing URLs.
            year: key the question URLs are saved under.
        """
        for url in task:
            try:
                date = url[27:37]  # reused to build the follow-up page URLs
                pageBase = int(url[38:-5])
                html = NetworkIO().requestHtml(url)
                if html is None:
                    continue

                pageCount = QUrlProducer.__getDayPageCount(html)
                qUrls = QUrlProducer.__getPageQUrl(html)
                Redis().saveUrl(year, *qUrls, backup=True)

                # The range is empty when pageCount <= pageBase, so no
                # separate comparison is needed.
                for pageIndex in range(pageBase + 1, pageCount + 1):
                    # ``url`` is rebound on purpose: if this page fails, the
                    # handler below reports the page that actually failed.
                    url = ('http://club.xywy.com/keshi/' + date + '/' +
                           str(pageIndex) + '.html')
                    html = NetworkIO().requestHtml(url)
                    if html is not None:
                        qUrls = QUrlProducer.__getPageQUrl(html)
                        Redis().saveUrl(year, *qUrls, backup=True)
            except Exception:
                QUrlProducer.__doExpt('-' + year, url, '0')

コード例 #8

0

ファイルを表示

ファイル: DUrlConsumer.py プロジェクト: liuming-dev/XYWYCrawler

def getInfo(url):
    """Scrape a doctor's profile page and store it via ``MySQL().saveDoctorInfo``.

    ``url`` looks like ``http://club.xywy.com/familyDoctor/pay/43983196``.
    The stored tuple holds, in order: url, name, rank, hospital/department,
    medals (comma-terminated list), doctor's message, two service
    descriptions, two original-price descriptions, two "helped" counters and
    the three free-text sections (speciality, introduction, honours).

    Nothing is stored when the page cannot be fetched. Any ``Exception``
    raised while parsing or saving (for example a mandatory element missing
    from the page) is reported through ``doExpt``. ``KeyboardInterrupt`` and
    ``SystemExit`` are deliberately not intercepted.

    Args:
        url: profile-page URL.
    """

    def appendTexts(elements, target):
        # Append the stripped text fragments of the i-th element to target[i].
        for index, element in enumerate(elements):
            for text in element.itertext():
                target[index] += text.strip()

    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return

        # Doctor's name: the element text minus its trailing 6 characters.
        doctorName = html.findtext('.//i[@class="fwei fl"]')
        if doctorName is not None and len(doctorName) > 6:
            doctorName = doctorName[:-6]
        else:
            doctorName = None

        # Rank is the text before the <br>, hospital/department the text
        # after it.
        doctorRankAndHosp = html.find('.//div[@class=" lh200 pt10 f14"]')
        doctorRank = doctorRankAndHosp.text
        doctorHosp = doctorRankAndHosp.find('./br')

        # Doctor's medals.
        medalsBlock = html.findall('.//div[@class="HomePth"]/span')
        medals = ''.join(medal.get('data-th') + ',' for medal in medalsBlock)

        # Doctor's message.
        sendWord = html.find(
            './/div[@class="f12 graydeep club_home_icon HomePj"]/span'
        ).tail

        # Service types: the page uses one of two layouts; the second is
        # only looked up when the first is absent.
        serviceTypes = {0: '', 1: ''}
        oldServiceTypes = {0: '', 1: ''}
        primaryBlock = html.find('.//div[@class="fl pr"]')
        if primaryBlock is not None:
            appendTexts(primaryBlock.findall('.//a[@cate]'), serviceTypes)
        else:
            fallbackBlock = html.find('.//div[@class="fl f14"]')
            if fallbackBlock is not None:
                appendTexts(fallbackBlock.findall('.//a[@cate]'),
                            serviceTypes)
                # Original price of each service.
                appendTexts(
                    fallbackBlock.findall(
                        './/span[@class="f14 col99 ml10"]'),
                    oldServiceTypes)

        # The user score is scraped from the evaluation page instead.

        # Signed families and helped users.
        helpedInfo = {0: None, 1: None}
        helpedInfoBlock = html.findall('.//span[@class="fb f16 ml5"]')
        for index, item in enumerate(helpedInfoBlock):
            helpedInfo[index] = item.text

        # Speciality, introduction and honours, told apart by their heading.
        infos = {0: '', 1: '', 2: ''}
        infoBlock = html.findall('.//div[@class="HomeJie f14 fwei pt20"]')
        for item in infoBlock:
            heading = item.findtext('./h4')
            textblock = item.find('./div')
            tmptext = ''.join(text.strip() for text in textblock.itertext())
            if '擅长' in heading:
                infos[0] = tmptext
            elif '简介' in heading:
                infos[1] = tmptext
            elif '荣誉' in heading:
                infos[2] = tmptext

        dbInfo = (url, doctorName, doctorRank, doctorHosp.tail, medals,
                  sendWord, serviceTypes[0], serviceTypes[1],
                  oldServiceTypes[0], oldServiceTypes[1], helpedInfo[0],
                  helpedInfo[1], infos[0], infos[1], infos[2])
        MySQL().saveDoctorInfo(dbInfo)
    except Exception:
        doExpt('url1', url, 'url1')

コード例 #9

0

ファイルを表示

ファイル: DUrlConsumer.py プロジェクト: liuming-dev/XYWYCrawler

def _saveServicePurchases(url, html):
    """Store every service-purchase entry found in ``html``.

    Each row is keyed by ``url + '#' + <position of the entry on the page>``.
    """
    serviceBuyBlock = html.findall('.//div[@class="HomBone fwei f14"]')
    for index, block in enumerate(serviceBuyBlock):
        uName = block.findtext('.//span[@class="w100"]').strip()
        # 1 when the service description contains '包月', otherwise 0.
        serviceType = 1 if '包月' in block.findtext(
            './/span[@class="w200 tl"]').strip() else 0
        serviceCount = block.findtext('.//span[@class="w60 tc"]').strip()
        servicePrice = block.findtext(
            './/span[@class="colClass01 fb w80 tc"]').strip()
        serviceStatus = block.findtext(
            './/span[@class="club_home_icon HomBsuc"]').strip()
        serviceTime = block.findtext(
            './/span[@class="col99 ml20 tc"]').strip()
        dbInfo = (url + '#' + str(index), uName, serviceType, serviceCount,
                  servicePrice, serviceStatus, serviceTime)
        MySQL().saveServiceInfo(dbInfo)


def getInfo4(url):
    """Scrape the service-purchase records starting at ``url``.

    ``url`` looks like
    ``http://club.xywy.com/familyDoctor/pay/43983196?info=2&page=2#name3``:
    the page number sits between ``page=`` and the trailing 6-character
    ``#name3`` anchor. The given page is stored first; its pager then tells
    how many pages exist, and every later page is fetched and stored too.

    Any ``Exception`` is reported through ``doExpt`` together with the URL
    being processed at that moment (possibly a follow-up page).
    ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not intercepted.

    Args:
        url: purchase-page URL to start from.
    """
    try:
        html = NetworkIO().requestHtml(url)
        if html is None:
            return
        _saveServicePurchases(url, html)

        # Total number of pages (1 when there is no pager).
        totalPageInfo = html.find(
            './/div[@class="mt20 HomeFen f14"]/span[@class="mr5"]')
        if totalPageInfo is None:
            totalPages = 1
        else:
            # The number is what remains after dropping the first character
            # and the last three of the pager text (exact wording of that
            # text is not visible here -- TODO confirm).
            totalPages = int(totalPageInfo.text.strip()[1:-3])

        # Index of the page we were asked for.
        tmpIndex = url.find('page=') + 5
        currentPage = int(url[tmpIndex:-6])
        urlPrefix = url[:tmpIndex]

        # Records on the pages after the current one.
        for pageIndex in range(currentPage + 1, totalPages + 1):
            # ``url`` is rebound on purpose so that a failure is reported
            # against the page that actually failed.
            url = urlPrefix + str(pageIndex) + '#name3'
            html = NetworkIO().requestHtml(url)
            if html is not None:
                _saveServicePurchases(url, html)
    except Exception:
        doExpt('url4', url, 'url4')