def get_page(viewHref): #"http://www.shandong.gov.cn/sdxxgk/nest/xxgk/list.do?pagenos={}".format(page) time.sleep(1.5) print(viewHref) #获取页面内容 html = getHTMLText(viewHref, headers) if html == -1: #连接错误 return 1 try: soup = BeautifulSoup(html, 'lxml') table = soup.find('table') itemlist = table.findAll('tr') result = [] for item in itemlist: url = item.find(attrs={'class': 'list-tit'}).a.get('href') department = item.find(attrs={'class': 'list-dw'}).getText() pubdate = item.find(attrs={'class': 'list-st'}).getText() nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())) result.append([url, 0, department, pubdate, nowtime]) return result except: #格式错误 traceback.print_exc() return 2
def get_page(viewHref):
    time.sleep(1.5)
    html = getHTMLText(viewHref, headers)
    if html == -1:
        return 1  # network connection error
    try:
        soup = BeautifulSoup(html, 'lxml')
        questions = soup.find('div', attrs={'class': 'questions_col'})
        questions = questions.ul.findAll('li')
    except:
        return 2  # page content error
    # question type
    typeid = int(re.findall(r'type=(\d+)', viewHref)[0])
    result = []
    for ques in questions:
        answerhref = ""
        flag = 0
        nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        # extract the link to the question detail page
        try:
            answertag = ques.find(attrs={'class': 'view_all'})
            answerhref = str(answertag.get('href'))
            print(answerhref)
            urllist = [typeid, answerhref, flag, nowtime]
            result.append(urllist)
        except:
            continue
    print(len(result))
    # print(result)
    return result
def findPageNum(targetDate, totalPageNum):
    left = 0
    right = int(totalPageNum)
    while left < right:
        mid = left + (right - left) // 2
        viewHref = spiderUrl + str(mid + 1)
        print('page', mid + 1)
        # get the newest and oldest dates on the current page
        try:
            html = getHTMLText(viewHref, headers)
            if html == -1:
                return -1
            soup = BeautifulSoup(html, 'lxml')
            firstDate = soup.find(attrs={'class': 'list-st'}).getText()
            lastDate = soup.findAll(attrs={'class': 'list-st'})[-1].getText()
            time.sleep(1)
        except:
            return -1
        if targetDate > firstDate:    # (firstDate, MAX)
            right = mid
        elif targetDate > lastDate:   # (lastDate, firstDate]
            return mid + 1
        else:                         # (MIN, lastDate]
            left = mid + 1
    return left
def getSingleChoice(fromid, typeid, viewHref):
    print(fromid, typeid, viewHref)
    time.sleep(1)
    html = getHTMLText(viewHref, headers)
    if html == -1:
        return 1  # network connection error
    soup = BeautifulSoup(html, 'lxml')
    try:
        answertag = soup.find('div', attrs={'class': 'answer_detail'})
        title = str(answertag.dl.dt.p)
        answer_and_parsing = answertag.dl.dd.findAll('p')
        answer = answer_and_parsing[0].i.getText()
        analysis = str(answer_and_parsing[1].i)
        options = answertag.find('table', attrs={'name': 'optionsTable'}).findAll('td')
        choiceA_pat = re.compile('(?<=>A.).*?(?=<)')
        choiceB_pat = re.compile('(?<=>B.).*?(?=<)')
        choiceC_pat = re.compile('(?<=>C.).*?(?=<)')
        choiceD_pat = re.compile('(?<=>D.).*?(?=<)')
        # anything other than exactly four options is treated as an extraction error,
        # otherwise the choices below would be undefined when building the result
        if options is None or len(options) != 4:
            raise Exception("option num error!")
        choiceA = choiceA_pat.findall(str(options))[0]
        choiceB = choiceB_pat.findall(str(options))[0]
        choiceC = choiceC_pat.findall(str(options))[0]
        choiceD = choiceD_pat.findall(str(options))[0]
    except:
        return 2  # failed to extract the question content
    result = [fromid, title, choiceA, choiceB, choiceC, choiceD, answer, analysis, typeid]
    return result
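
# A minimal usage sketch, not part of the original source: the fromid/typeid values and the
# URL below are placeholders. getSingleChoice returns 1 on a network error, 2 on an
# extraction error, and otherwise a list in the order
# [fromid, title, choiceA, choiceB, choiceC, choiceD, answer, analysis, typeid].
row = getSingleChoice(1111, 1, "https://tiku.21cnjy.com/quest/placeholder.html")  # hypothetical URL
if row in (1, 2):
    print("skipped question, error code:", row)
else:
    print(row[1], row[6])  # question title and its answer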
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
}

# Reading comprehension test cases
href = "https://tiku.21cnjy.com/quest/xOTMz0AY.html"  # some stems lack question numbers; should be dropped because question and option counts do not match
href = "https://tiku.21cnjy.com/quest/1OTMz0AM.html"  # passage, questions and options separated only by <br> tags, and some stems/options lack numbers; must be dropped
href = "https://tiku.21cnjy.com/quest/0MDMz0AM.html"  # passage, questions and options separated only by <br> tags
href = "https://tiku.21cnjy.com/quest/zNDMz0AQ.html"  # question numbers followed by a full-width period
href = "https://tiku.21cnjy.com/quest/yMjMzzAE.html"  # mixes four options on one line with two per line, and contains images; extract or drop?
href = "https://tiku.21cnjy.com/quest/yODMj5cQ.html"  # cloze whose passage, questions and options are separated only by <br> tags
html = getHTMLText(href, headers)
soup = BeautifulSoup(html, 'lxml')
answertag = soup.find('div', attrs={'class': 'answer_detail'})
# extract the reading passage
readingTag = answertag.dl.dt.p
textpat = re.compile(r'(<p>.*?>)\s*\d+[\..]')
readingText = textpat.findall(str(readingTag))[0]
readingText = readingText + '</p>'
print(readingText)
textList = []
fromid = 1111
typeid = 7
def getReadingOrCloze(fromid, typeid, viewHref, textid):
    print(fromid, typeid, viewHref, textid)
    time.sleep(1)
    html = getHTMLText(viewHref, headers)
    if html == -1:
        return 1  # connection error
    try:
        soup = BeautifulSoup(html, 'lxml')
        answertag = soup.find('div', attrs={'class': 'answer_detail'})
        textList = []  # result row describing the reading passage
        # extract the reading passage
        readingTag = answertag.dl.dt.p
        quesTitlepat = re.compile(r'(?<=>)【小题\d+】.*?(?=<)')
        readingText = quesTitlepat.sub('', str(readingTag))
        # print(readingText)
        # collect all question stems
        allQuesTag = answertag.dl.dt
        allQuesTitles = quesTitlepat.findall(str(allQuesTag))
        # print(allQuesTitles)
        # number of questions
        quesNum = len(allQuesTitles)
        if quesNum == 0:
            raise Exception("question num error!")
        # list of questions; each element is a dict
        quesList = [{'fromid': fromid, 'typeid': typeid, 'textid': textid} for _ in range(quesNum)]
        # add the question-stem field
        for i in range(0, len(allQuesTitles)):
            # allQuesTitles[i] = "<p>{}</p>".format(allQuesTitles[i])  # wrap in <p> tags
            quesList[i].update(title=allQuesTitles[i])
        # print(allQuesTitles)
        # extract the options
        choiceA_pat = re.compile('(?<=>A.).*?(?=<)')
        choiceB_pat = re.compile('(?<=>B.).*?(?=<)')
        choiceC_pat = re.compile('(?<=>C.).*?(?=<)')
        choiceD_pat = re.compile('(?<=>D.).*?(?=<)')
        optionsTags = answertag.findAll('table', attrs={'name': 'optionsTable'})
        # check that the number of questions matches the number of option tables
        # print(len(optionsTags), quesNum)
        if quesNum != len(optionsTags):
            raise Exception("option num error!")
        # add the option fields
        for i, tag in enumerate(optionsTags):
            options = tag.findAll('td')
            # assume four options per question
            choiceA, choiceB, choiceC, choiceD = options
            choiceA = choiceA_pat.findall(str(options))[0]
            choiceB = choiceB_pat.findall(str(options))[0]
            choiceC = choiceC_pat.findall(str(options))[0]
            choiceD = choiceD_pat.findall(str(options))[0]
            # print('A.{} B.{} C.{} D.{}'.format(choiceA, choiceB, choiceC, choiceD))
            quesList[i].update(choiceA=choiceA, choiceB=choiceB, choiceC=choiceC, choiceD=choiceD)
        # answers
        answer_and_parsing = answertag.dl.dd.findAll('p')
        answers = answer_and_parsing[0].i.getText()
        answerpat = re.compile('(?<=】)[ABCD]')
        answersList = answerpat.findall(str(answers))
        # check that the number of questions matches the number of answers
        if quesNum != len(answersList):
            raise Exception("answer num error!")
        # add the answer field
        for i in range(0, len(answersList)):
            quesList[i].update(answer=answersList[i])
        # explanations
        try:
            analyses = answer_and_parsing[1].i
            analysispat = re.compile(r'(?<=>)【小题\d+】.*?(?=<)')
            analysesList = analysispat.findall(str(analyses))
            # print(analysesList)
            # no explanation found
            if analysesList == []:
                raise Exception("no analysis")
            # add the analysis field
            for i in range(0, len(analysesList)):
                quesList[i].update(analysis=analysesList[i])
        except:
            # no explanation available, fill the field with an empty string
            for i in range(0, len(quesList)):
                quesList[i].update(analysis='')
    except:
        traceback.print_exc()
        print(viewHref)
        return 2
    textList = [textid, readingText, quesNum, typeid]
    result = [textList, quesList]
    return result
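
# A minimal usage sketch, not part of the original source: the id values and textid are
# placeholders; the URL is one of the test cases listed above. getReadingOrCloze returns 1 on
# a connection error, 2 on an extraction error, and otherwise [textList, quesList], where
# textList describes the passage and quesList holds one dict per sub-question.
res = getReadingOrCloze(1111, 7, "https://tiku.21cnjy.com/quest/0MDMz0AM.html", 1)
if res in (1, 2):
    print("skipped reading, error code:", res)
else:
    textRow, quesRows = res
    print("passage id:", textRow[0], "question count:", textRow[2])
    for q in quesRows:
        print(q['title'], q['answer'])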
def get_page(viewHref):
    time.sleep(1.5)
    print(viewHref)
    html = getHTMLText(viewHref, headers)
    if html == -1:
        return 1  # connection error
    try:
        soup = BeautifulSoup(html, 'lxml')
        # scrape the basic metadata of the disclosed document
        table = soup.find('table')
        tds = table.findAll('td')
        # indexNo = tds[1].getText().strip()
        themeclass = tds[3].getText()  # theme category
        if type(themeclass) == str:
            themeclass = themeclass.strip()
        department = tds[5].getText()  # publishing department
        if type(department) == str:
            department = department.strip()
        serveclass = tds[7].getText()  # service-target category
        if type(serveclass) == str:
            serveclass = serveclass.strip()
        # docNo = tds[9].getText().strip()
        dispatchtime = tds[11].getText()  # dispatch date
        if type(dispatchtime) == str:
            dispatchtime = dispatchtime.strip()
        title = tds[13].getText()  # article title
        if type(title) == str:
            title = title.strip()
        pubtime = tds[15].getText()  # publication date
        if type(pubtime) == str:
            pubtime = pubtime.strip()

        # extract the body text from the tags inside infobody
        def getInfoByBody(soup, infobody, infotext):
            if infobody != None:
                # textpat = re.compile(r'p|span|ul')  # text in <span> inside <p> would be extracted twice
                textpat = re.compile(r'p|ul')
                texts = infobody.findAll(textpat)
                # most pages wrap the body in <p> tags
                for item in texts:
                    if type(item.getText()) == str:
                        # strip non-breaking and full-width spaces
                        infotext = infotext + item.getText().replace(
                            u'\u3000', u' ').replace(u'\xa0', u' ').strip() + '\n'
                # print(infotext)
                # handle pages without <p> tags where the text sits directly in infobody
                if infotext == "":
                    if type(infobody.getText()) == str:
                        infotext = infotext + infobody.getText().replace(
                            u'\u3000', u' ').replace(u'\xa0', u' ').strip()
            return infotext

        '''
        # alternative that prevents <span> text from being extracted more than once
        def getInfoByBody(soup, infobody, infotext):
            if infobody != None:
                if type(infobody.getText()) == str:
                    infotext = infotext + infobody.getText().replace(u'\u3000', u' ').replace(u'\xa0', u' ').strip()
            return infotext
        '''

        # scrape the body text
        infotext = ""
        infobody = soup.find('div', attrs={'style': 'align-content: center;'})  # the most common container
        infotext = getInfoByBody(soup, infobody, infotext)
        # if the body is still empty, try another container tag
        if infotext == "":
            infobody = soup.find('div', attrs={'ergodic': 'article'})
            infotext = getInfoByBody(soup, infobody, infotext)

        # scrape the attachment info
        filesurl = []
        filesname = []
        filebody = soup.find('div', attrs={'class': 'chancolor'})
        if filebody != None:
            fileAtags = filebody.findAll('a')
            for item in fileAtags:
                fileurl = item.get('href')
                # note: some links may point to html pages instead of downloads
                if fileurl.find("down.do") != -1:
                    filename = item.getText()
                    filesurl.append(fileurl)
                    filesname.append(filename)
                    # print(filename)

        nowtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        result = [
            title, department, themeclass, serveclass, dispatchtime, pubtime,
            infotext, str(filesurl), str(filesname), nowtime
        ]
        # print(result)
        print(len(infotext))
        return result
    except:
        traceback.print_exc()
        return 2
#print("The num is:",findPageNum("2018-09-13",8598)) if dbLatestDate == None: #localLatestDate="1970-01-01" localLatestDate = "2017-01-01" elif dbLatestDate == 1: print("日期读取错误") raise Exception("日期读取错误") else: localLatestDate = str(dbLatestDate[0]) print(localLatestDate) firstHref = spiderUrl + str(1) #获取网站最新的信息日期和信息总页数 html = getHTMLText(firstHref, headers) if html == -1: raise Exception("获取网站最新的信息日期和信息总页数错误") time.sleep(2) soup = BeautifulSoup(html, 'lxml') webLatestDate = soup.find(attrs={'class': 'list-st'}).getText() #lastdate = soup.findAll(attrs={'class': 'list-st'})[-1].getText() # <span class="simple_pgTotalPage">8598</span> totalPageNum = int( soup.find('span', attrs={ 'class': 'simple_pgTotalPage' }).getText()) print(webLatestDate, totalPageNum)