# Module-level dependencies assumed by these methods; crack, config, and
# page_detail are project-local modules.
import logging
import time

from bs4 import BeautifulSoup


def parse_page(self, download_page_left, page_source):
    '''Save the page information and parse the download link on each result page.'''
    soup = BeautifulSoup(page_source, 'lxml')
    # Locate the content table
    tr_table = soup.find(name='table', attrs={'class': 'GridTableContent'})
    # Handle the captcha
    try:
        # Remove the first tr tag (the table header)
        tr_table.tr.extract()
    except AttributeError:
        # The result table is missing, so a captcha page came back instead
        logging.error('Captcha encountered')
        return self.parse_page(
            download_page_left,
            crack.get_image(self.get_result_url, self.session, page_source))
    # Walk every row of the table
    for tr_info in tr_table.find_all(name='tr'):
        tr_text = ''
        download_url = ''
        detail_url = ''
        # Walk every column of the row
        for td_info in tr_info.find_all(name='td'):
            # The text inside one cell is fragmented, so stitch it back together
            td_text = ''.join(td_info.stripped_strings)
            tr_text += td_text + ' '
            with open('data/ReferenceList.txt', 'a', encoding='utf-8') as file:
                file.write(td_text + ' ')
            # Look for the download link
            dl_url = td_info.find('a', attrs={'class': 'briefDl_D'})
            # Look for the detail link
            dt_url = td_info.find('a', attrs={'class': 'fz14'})
            # Columns without these links are skipped
            if dt_url:
                detail_url = dt_url.attrs['href']
            if dl_url:
                download_url = dl_url.attrs['href']
        # Group the fields of a single reference
        single_refence_list = tr_text.split(' ')
        self.download_refence(download_url, single_refence_list)
        # Optionally crawl the detail page as well
        if config.crawl_isdetail == '1':
            time.sleep(config.crawl_stepWaitTime)
            page_detail.get_detail_page(self.session, self.get_result_url,
                                        detail_url, single_refence_list,
                                        self.download_url)
        # Start a new line in the output file after each row
        with open('data/ReferenceList.txt', 'a', encoding='utf-8') as file:
            file.write('\n')
    # download_page_left is the number of pages still waiting to be crawled
    if download_page_left > 1:
        self.cur_page_num += 1
        self.get_another_page(download_page_left)
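# A standalone sketch (not part of the spider) of the field-grouping idea used
# above: each <td> cell's fragmented strings are joined with no separator, the
# cells are then joined by single spaces, and split(' ') recovers one field per
# column. The HTML snippet and field values here are made up for illustration;
# the trick relies on the cell text (typically Chinese) containing no spaces of
# its own.
from bs4 import BeautifulSoup

_html = '''
<table class="GridTableContent">
  <tr><td>1</td>
      <td><a class="fz14" href="/detail?filename=ABC123">Deep<b>Learning</b></a></td>
      <td>ZhangSan</td></tr>
</table>
'''
_row = BeautifulSoup(_html, 'lxml').find('tr')
_tr_text = ''
for _td in _row.find_all('td'):
    _tr_text += ''.join(_td.stripped_strings) + ' '
print(_tr_text.split(' '))  # ['1', 'DeepLearning', 'ZhangSan', ''] -- note the trailing ''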
def parse_page(self, page_source):
    '''Save the page information and parse the download link on each result page.'''
    soup = BeautifulSoup(page_source, 'lxml')
    # Locate the content table
    tr_table = soup.find(name='table', attrs={'class': 'GridTableContent'})
    # Handle the captcha
    try:
        # Remove the first tr tag (the table header)
        tr_table.tr.extract()
    except AttributeError:
        # The result table is missing, so a captcha page came back instead
        logging.error('Captcha encountered')
        return self.parse_page(
            crack.get_image(self.get_result_url, self.session, page_source))
    for index, tr_info in enumerate(tr_table.find_all(name='tr')):
        # Global 1-based index of this result, at 20 records per result page
        searching_idx = str((index + 1) + (self.cur_page_num - 1) * 20)
        # In repair mode, only rows whose global index appears in missingPage
        # get their detail page crawled again
        self.getDetail = self.repair and searching_idx in self.missingPage
        tr_text = ''
        detail_url = ''
        filename = ''
        # Walk every column of the row
        for td_info in tr_info.find_all(name='td'):
            # The text inside one cell is fragmented, so stitch it back together
            td_text = ''.join(td_info.stripped_strings)
            tr_text += td_text + ' '
            # Look for the detail link
            dt_url = td_info.find('a', attrs={'class': 'fz14'})
            if dt_url:
                detail_url = dt_url.attrs['href']
                # Extract the value of the filename= query parameter
                filename = detail_url[detail_url.find('filename=') + 9:]
        # Group the fields of a single reference
        single_refence_list = tr_text.split(' ')
        # Crawl the detail page only for the rows selected above
        if self.getDetail:
            time.sleep(config.crawl_stepWaitTime)
            page_detail.get_detail_page(self.session, self.get_result_url,
                                        detail_url, single_refence_list,
                                        self.userInput, self.repair)
            self.getDetail = False
    self.get_another_page()
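# A standalone sketch (not part of the spider) of the repair-mode row selection
# above, assuming 20 results per page as in parse_page. All names here are
# hypothetical illustrations.
def _global_index(row_index, cur_page_num, page_size=20):
    # Map a 0-based row index on the current page to a 1-based global index
    return str((row_index + 1) + (cur_page_num - 1) * page_size)

_missing = {'21', '38'}  # global indices whose detail pages failed earlier
_repair = True

# On page 2, only rows 0 and 17 (global indices 21 and 38) are re-crawled:
for _idx in range(20):
    if _repair and _global_index(_idx, cur_page_num=2) in _missing:
        print('re-crawl row', _idx, '-> global index', _global_index(_idx, 2))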