Example #1
def get_page(viewHref):
    #"http://www.shandong.gov.cn/sdxxgk/nest/xxgk/list.do?pagenos={}".format(page)
    time.sleep(1.5)
    print(viewHref)
    # fetch the page content
    html = getHTMLText(viewHref, headers)
    if html == -1:  # connection error
        return 1

    try:
        soup = BeautifulSoup(html, 'lxml')

        table = soup.find('table')
        itemlist = table.findAll('tr')
        result = []

        for item in itemlist:
            url = item.find(attrs={'class': 'list-tit'}).a.get('href')
            department = item.find(attrs={'class': 'list-dw'}).getText()
            pubdate = item.find(attrs={'class': 'list-st'}).getText()
            nowtime = time.strftime('%Y-%m-%d %H:%M:%S',
                                    time.localtime(time.time()))
            result.append([url, 0, department, pubdate, nowtime])
        return result
    except:
        # malformed page structure
        traceback.print_exc()
        return 2
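Every example in this file calls getHTMLText(url, headers), which is defined elsewhere in the project. Judging from how it is used, it returns the page HTML on success and -1 when the request fails. A minimal sketch of such a helper, assuming the requests library is used (the project's real implementation may differ):

import requests

def getHTMLText(url, headers, timeout=10):
    # Fetch a page and return its HTML text, or -1 on any request error,
    # matching the "html == -1" checks in the examples below.
    try:
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        r.encoding = r.apparent_encoding  # pick up the page's own encoding
        return r.text
    except requests.RequestException:
        return -1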
Example #2
def get_page(viewHref):
    time.sleep(1.5)
    html = getHTMLText(viewHref, headers)

    if html == -1:
        return 1  # network connection error
    try:
        soup = BeautifulSoup(html, 'lxml')
        questions = soup.find('div', attrs={'class': 'questions_col'})
        questions = questions.ul.findAll('li')
    except:
        return 2  # page content error

    # question type
    typeid = int(re.findall(r'type=(\d+)', viewHref)[0])
    result = []
    for ques in questions:
        answerhref = ""
        flag = 0
        nowtime = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))

        # extract the link to the question's answer page
        try:
            answertag = ques.find(attrs={'class': 'view_all'})
            answerhref = str(answertag.get('href'))

            print(answerhref)
            urllist = [typeid, answerhref, flag, nowtime]
            result.append(urllist)
        except:
            continue
    print(len(result))
    #print(result)
    return result
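The question type here is read out of a type= query parameter in the listing URL. A quick illustration of that extraction with a made-up URL (the real URL layout is an assumption):

import re

# hypothetical listing URL; only the type= parameter matters for this demo
viewHref = "https://tiku.21cnjy.com/tiku.php?mod=quest&type=3&page=1"
typeid = int(re.findall(r'type=(\d+)', viewHref)[0])
print(typeid)  # 3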
Example #3
def findPageNum(targetDate, totalPageNum):
    left = 0
    right = int(totalPageNum)
    while left < right:
        mid = left + (right - left) // 2
        viewHref = spiderUrl + str(mid + 1)
        print('Page', mid + 1)
        # get the newest and the oldest dates on the current page
        try:
            html = getHTMLText(viewHref, headers)
            if html == -1:
                return -1

            soup = BeautifulSoup(html, 'lxml')
            firstDate = soup.find(attrs={'class': 'list-st'}).getText()
            lastDate = soup.findAll(attrs={'class': 'list-st'})[-1].getText()
            time.sleep(1)
        except:
            return -1

        if targetDate > firstDate:  # (firstDate, MAX)
            right = mid
        elif targetDate > lastDate:  # (lastDate, firstDate]
            return mid + 1
        else:  # (MIN, lastDate]
            left = mid + 1

    return left
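The binary search above assumes the listing is ordered newest-first and that 'YYYY-MM-DD' strings compare the same way as the dates they represent, so targetDate can be compared directly against the first (newest) and last (oldest) dates on a page. A small self-contained check of that logic with invented dates:

targetDate = "2018-09-13"
firstDate, lastDate = "2018-09-20", "2018-09-10"  # newest and oldest entries on one page

# zero-padded ISO-style date strings sort exactly like the dates themselves
assert "2018-09-13" > "2018-09-05" and "2018-09-13" < "2019-01-01"

if targetDate > firstDate:      # newer than anything on this page: search earlier pages
    print("go left")
elif targetDate > lastDate:     # falls in (lastDate, firstDate]: this is the page
    print("this page")
else:                           # older than everything here: search later pages
    print("go right")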
Example #4
def getSingleChoice(fromid, typeid, viewHref):
    print(fromid, typeid, viewHref)
    time.sleep(1)
    html = getHTMLText(viewHref, headers)
    if html == -1:
        return 1  # network connection error

    soup = BeautifulSoup(html, 'lxml')
    try:
        answertag = soup.find('div', attrs={'class': 'answer_detail'})

        title = str(answertag.dl.dt.p)

        answer_and_parsing = answertag.dl.dd.findAll('p')
        answer = answer_and_parsing[0].i.getText()
        analysis = str(answer_and_parsing[1].i)

        options = answertag.find('table', attrs={'name': 'optionsTable'}).findAll('td')
        choiceA_pat = re.compile('(?<=>A.).*?(?=<)')
        choiceB_pat = re.compile('(?<=>B.).*?(?=<)')
        choiceC_pat = re.compile('(?<=>C.).*?(?=<)')
        choiceD_pat = re.compile('(?<=>D.).*?(?=<)')

        if options is not None:
            # pull each option's text out of the raw <td> HTML with the
            # lookbehind patterns above
            choiceA = choiceA_pat.findall(str(options))[0]
            choiceB = choiceB_pat.findall(str(options))[0]
            choiceC = choiceC_pat.findall(str(options))[0]
            choiceD = choiceD_pat.findall(str(options))[0]

    except:
        return 2  # failed to extract the question content

    result = [fromid, title, choiceA, choiceB, choiceC, choiceD, answer, analysis, typeid]
    return result
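The option texts are recovered from the raw <td> HTML with fixed-width lookbehind patterns; the dot after each letter matches whatever single separator character follows the label. A self-contained demonstration on an invented table fragment shaped like str(options):

import re

choiceA_pat = re.compile('(?<=>A.).*?(?=<)')
choiceB_pat = re.compile('(?<=>B.).*?(?=<)')

# invented HTML in the shape produced by str(options)
options_html = '[<td>A.apple</td>, <td>B、banana</td>]'
print(choiceA_pat.findall(options_html)[0])  # apple
print(choiceB_pat.findall(options_html)[0])  # banana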
Example #5
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# reading comprehension

href = "https://tiku.21cnjy.com/quest/xOTMz0AY.html"  # reading passage where some stems lack a question number; hopefully dropped because the question and option counts do not match
href = "https://tiku.21cnjy.com/quest/1OTMz0AM.html"  # passage, stems and options separated only by <br> tags, and some stems and options lack question numbers; should be dropped
href = "https://tiku.21cnjy.com/quest/0MDMz0AM.html"  # passage, stems and options separated only by <br> tags
href = "https://tiku.21cnjy.com/quest/zNDMz0AQ.html"  # question numbers followed by a full-width period
href = "https://tiku.21cnjy.com/quest/yMjMzzAE.html"  # some lines hold all 4 options, others 2 options per line plus images; extract or drop?
href = "https://tiku.21cnjy.com/quest/yODMj5cQ.html"  # cloze passage, stems and options separated only by <br> tags

html = getHTMLText(href, headers)
soup = BeautifulSoup(html, 'lxml')

answertag = soup.find('div', attrs={'class': 'answer_detail'})

# get the reading passage text
readingTag = answertag.dl.dt.p
textpat = re.compile(r'(<p>.*?>)\s*\d+[\..]')
readingText = textpat.findall(str(readingTag))[0]
readingText = readingText + '</p>'
print(readingText)

textList = []

fromid = 1111
typeid = 7
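textpat captures everything from the opening <p> up to the last '>' before the first numbered sub-question, and appending '</p>' re-closes the passage fragment. A self-contained illustration with invented markup shaped like str(readingTag):

import re

textpat = re.compile(r'(<p>.*?>)\s*\d+[\..]')

readingTag = '<p>Long ago, in a small village...<br/>1.What is the passage mainly about?</p>'
readingText = textpat.findall(readingTag)[0] + '</p>'
print(readingText)  # <p>Long ago, in a small village...<br/></p>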
Example #6
def getReadingOrCloze(fromid,typeid,viewHref,textid):
    print(fromid, typeid, viewHref,textid)
    time.sleep(1)

    html = getHTMLText(viewHref, headers)
    if html==-1:
        return 1  # connection error

    try:
        soup = BeautifulSoup(html, 'lxml')
        answertag = soup.find('div', attrs={'class': 'answer_detail'})

        textList = []  # will hold the passage-level record

        # get the reading passage text
        readingTag = answertag.dl.dt.p
        quesTitlepat = re.compile(r'(?<=>)【小题\d+】.*?(?=<)')
        readingText = quesTitlepat.sub('', str(readingTag))
        # print(readingText)

        # get all question stems
        allQuesTag = answertag.dl.dt
        allQuesTitles = quesTitlepat.findall(str(allQuesTag))
        # print(allQuesTitles)

        # number of questions
        quesNum = len(allQuesTitles)
        if quesNum == 0:
            raise Exception("question num error!")
        # list of question records, one dict per question
        quesList = [{'fromid': fromid, 'typeid': typeid, 'textid': textid} for _ in range(quesNum)]

        # add the question stem field
        for i in range(0, len(allQuesTitles)):
            # allQuesTitles[i] = "<p>{}</p>".format(allQuesTitles[i])  # wrap in <p> tags
            quesList[i].update(title=allQuesTitles[i])

        # print(allQuesTitles)
        # get the options
        choiceA_pat = re.compile('(?<=>A.).*?(?=<)')
        choiceB_pat = re.compile('(?<=>B.).*?(?=<)')
        choiceC_pat = re.compile('(?<=>C.).*?(?=<)')
        choiceD_pat = re.compile('(?<=>D.).*?(?=<)')

        optionsTags = answertag.findAll('table', attrs={'name': 'optionsTable'})

        # check that the question and option counts match
        # print(len(optionsTags), quesNum)
        if quesNum != len(optionsTags):
            raise Exception("option num error!")
        # add the option fields
        for i, tag in enumerate(optionsTags):
            options = tag.findAll('td')
            # assume four options per question
            choiceA, choiceB, choiceC, choiceD = options
            choiceA = choiceA_pat.findall(str(options))[0]
            choiceB = choiceB_pat.findall(str(options))[0]
            choiceC = choiceC_pat.findall(str(options))[0]
            choiceD = choiceD_pat.findall(str(options))[0]
            # print('A.{} B.{} C.{} D.{}'.format(choiceA,choiceB,choiceC,choiceD))
            quesList[i].update(choiceA=choiceA, choiceB=choiceB, choiceC=choiceC, choiceD=choiceD)

        # answers
        answer_and_parsing = answertag.dl.dd.findAll('p')
        answers = answer_and_parsing[0].i.getText()
        answerpat = re.compile('(?<=】)[ABCD]')
        answersList = answerpat.findall(str(answers))

        # check that the question and answer counts match
        if quesNum != len(answersList):
            raise Exception("answer num error!")
        # add the answer field
        for i in range(0, len(answersList)):
            quesList[i].update(answer=answersList[i])


        # explanations
        try:
            analyses = answer_and_parsing[1].i
            analysispat = re.compile(r'(?<=>)【小题\d+】.*?(?=<)')
            analysesList = analysispat.findall(str(analyses))
            # print(analysesList)
            # no explanation found
            if analysesList == []:
                raise Exception("no analysis")
            # add the explanation field
            for i in range(0, len(analysesList)):
                quesList[i].update(analysis=analysesList[i])

        except:  # no explanation: fill with an empty string
            for i in range(0, len(quesList)):
                quesList[i].update(analysis='')
    except:
        traceback.print_exc()
        print(viewHref)
        return 2

    textList = [textid, readingText, quesNum, typeid]
    result = [textList, quesList]
    return result
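The answers for all sub-questions arrive packed in one string, and answerpat splits them back out with a lookbehind on the closing 】 bracket. A self-contained illustration with an invented answer string in the shape the regex expects:

import re

answerpat = re.compile('(?<=】)[ABCD]')

answers = '【小题1】B【小题2】D【小题3】A'  # invented example
print(answerpat.findall(answers))  # ['B', 'D', 'A']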
Example #7
def get_page(viewHref):
    time.sleep(1.5)
    print(viewHref)

    html = getHTMLText(viewHref, headers)
    if html == -1:
        return 1  # connection error

    try:
        soup = BeautifulSoup(html, 'lxml')

        # scrape the basic metadata of the disclosure page
        table = soup.find('table')
        tds = table.findAll('td')

        # indexNo=tds[1].getText().strip()
        themeclass = tds[3].getText()  # subject category
        if type(themeclass) == str:
            themeclass = themeclass.strip()

        department = tds[5].getText()  # issuing department
        if type(department) == str:
            department = department.strip()

        serveclass = tds[7].getText()  # target-audience category
        if type(serveclass) == str:
            serveclass = serveclass.strip()

        # docNo=tds[9].getText().strip()
        dispatchtime = tds[11].getText()  # dispatch date
        if type(dispatchtime) == str:
            dispatchtime = dispatchtime.strip()

        title = tds[13].getText()  # article title
        if type(title) == str:
            title = title.strip()

        pubtime = tds[15].getText()  # publication date
        if type(pubtime) == str:
            pubtime = pubtime.strip()

        # extract the body text from its tags

        def getInfoByBody(soup, infobody, infotext):
            if infobody != None:
                # textpat = re.compile(r'p|span|ul')  # text in a span inside a p tag would be extracted twice
                textpat = re.compile(r'p|ul')
                texts = infobody.findAll(textpat)
                # most pages wrap the body text in p tags
                for item in texts:
                    if type(item.getText()) == str:
                        # strip non-breaking and full-width spaces
                        infotext = infotext + item.getText().replace(
                            u'\u3000', u' ').replace(u'\xa0',
                                                     u' ').strip() + '\n'
                # print(infotext)
                # handle pages without p tags, where the text sits directly in infobody
                if infotext == "":
                    if type(infobody.getText()) == str:
                        infotext = infotext + infobody.getText().replace(
                            u'\u3000', u' ').replace(u'\xa0', u' ').strip()
            return infotext

        '''
        # avoid extracting span tags more than once
        def getInfoByBody(soup, infobody, infotext):
            if infobody != None:
                if type(infobody.getText()) == str:
                    infotext = infotext + infobody.getText().replace(u'\u3000', u' ').replace(u'\xa0', u' ').strip()
            return infotext

        '''
        # scrape the body text
        infotext = ""
        infobody = soup.find('div', attrs={'style': 'align-content: center;'})
        # the most common container tag
        infotext = getInfoByBody(soup, infobody, infotext)
        # if nothing was extracted, try another container tag
        if infotext == "":
            infobody = soup.find('div', attrs={'ergodic': 'article'})
            infotext = getInfoByBody(soup, infobody, infotext)

        # scrape attachment info
        filesurl = []
        filesname = []

        filebody = soup.find('div', attrs={'class': 'chancolor'})
        if filebody != None:
            fileAtags = filebody.findAll('a')
            for item in fileAtags:
                fileurl = item.get('href')
                # note: some links may point to html pages rather than downloads
                if fileurl.find("down.do") != -1:
                    filename = item.getText()
                    filesurl.append(fileurl)
                    filesname.append(filename)
                    # print(filename)
        nowtime = time.strftime('%Y-%m-%d %H:%M:%S',
                                time.localtime(time.time()))
        result = [
            title, department, themeclass, serveclass, dispatchtime, pubtime,
            infotext,
            str(filesurl),
            str(filesname), nowtime
        ]

        #print(result)
        print(len(infotext))
        return result

    except:
        traceback.print_exc()
        return 2
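The body-text cleanup replaces U+3000 (ideographic space) and U+00A0 (no-break space) with plain spaces before stripping, so they cannot leak into the stored text from the middle of a line. A tiny self-contained check of that chain on an invented sample string:

raw = '\u3000\u3000第一条\xa0为了规范政府信息公开\u3000'  # invented sample text
clean = raw.replace(u'\u3000', u' ').replace(u'\xa0', u' ').strip()
print(repr(clean))  # '第一条 为了规范政府信息公开'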
Example #8
            #print("The num is:",findPageNum("2018-09-13",8598))

            if dbLatestDate == None:
                # localLatestDate = "1970-01-01"
                localLatestDate = "2017-01-01"
            elif dbLatestDate == 1:
                print("failed to read the latest date from the database")
                raise Exception("failed to read the latest date from the database")
            else:
                localLatestDate = str(dbLatestDate[0])
                print(localLatestDate)

            firstHref = spiderUrl + str(1)
            # get the site's latest publication date and total page count

            html = getHTMLText(firstHref, headers)
            if html == -1:
                raise Exception("failed to get the site's latest publication date and total page count")
            time.sleep(2)

            soup = BeautifulSoup(html, 'lxml')
            webLatestDate = soup.find(attrs={'class': 'list-st'}).getText()
            #lastdate = soup.findAll(attrs={'class': 'list-st'})[-1].getText()
            # <span class="simple_pgTotalPage">8598</span>
            totalPageNum = int(
                soup.find('span', attrs={
                    'class': 'simple_pgTotalPage'
                }).getText())

            print(webLatestDate, totalPageNum)
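The fragment above only collects localLatestDate, webLatestDate and totalPageNum; the commented-out call at its top suggests they are then handed to findPageNum from Example #3 to find where the unscraped records end. A hypothetical continuation under that assumption (the surrounding loop and persistence code are not shown in the source):

            # hypothetical continuation: nothing to do when the site has no
            # records newer than the local database; otherwise locate the last
            # page that still contains unscraped records
            if webLatestDate <= localLatestDate:
                print("local data is already up to date")
            else:
                stopPage = findPageNum(localLatestDate, totalPageNum)
                if stopPage == -1:
                    raise Exception("failed to locate the target page")
                print("crawl pages 1 to", stopPage)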