Example #1
 def parse_page(self, download_page_left, page_source):
     '''
     Save the page information and
     parse the download links on each page.
     '''
     soup = BeautifulSoup(page_source, 'lxml')
     # Locate the content table region
     tr_table = soup.find(name='table', attrs={'class': 'GridTableContent'})
     # Handle the CAPTCHA case
     try:
         # Remove the first tr tag (the table header)
         tr_table.tr.extract()
     except Exception:
         logging.error('CAPTCHA encountered')
         return self.parse_page(
             download_page_left,
             crack.get_image(self.get_result_url, self.session,
                             page_source))
     # Iterate over every row of the table
     for tr_info in tr_table.find_all(name='tr'):
         tr_text = ''
         download_url = ''
         detail_url = ''
         # Iterate over every column of the row
         for td_info in tr_info.find_all(name='td'):
             # The text inside a cell is fragmented, so stitch it back together here
             td_text = ''
             for string in td_info.stripped_strings:
                 td_text += string
             tr_text += td_text + ' '
             with open(
                     'data/ReferenceList.txt', 'a',
                     encoding='utf-8') as file:
                 file.write(td_text + ' ')
             # Look for the download link
             dl_url = td_info.find('a', attrs={'class': 'briefDl_D'})
             # Look for the detail-page link
             dt_url = td_info.find('a', attrs={'class': 'fz14'})
             # Skip the columns that are not needed
             if dt_url:
                 detail_url = dt_url.attrs['href']
             if dl_url:
                 download_url = dl_url.attrs['href']
         # Split the row text into the fields of a single reference
         single_refence_list = tr_text.split(' ')
         self.download_refence(download_url, single_refence_list)
         # Crawl the detail page if it is enabled in the config
         if config.crawl_isdetail == '1':
             time.sleep(config.crawl_stepWaitTime)
             page_detail.get_detail_page(self.session, self.get_result_url,
                                         detail_url, single_refence_list,
                                         self.download_url)
         # Write a newline at the end of each row
         with open('data/ReferenceList.txt', 'a', encoding='utf-8') as file:
             file.write('\n')
     # download_page_left is the number of result pages still to crawl
     if download_page_left > 1:
         self.cur_page_num += 1
         self.get_another_page(download_page_left)
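
The method above boils down to one BeautifulSoup pattern: find the 'GridTableContent' table, drop its header row, re-join each cell's stripped strings, and pick out the 'briefDl_D' / 'fz14' links. The following is a minimal standalone sketch of just that step, run against a made-up HTML fragment; only bs4 and lxml are assumed.

from bs4 import BeautifulSoup

html = '''
<table class="GridTableContent">
  <tr><td>No.</td><td>Title</td></tr>
  <tr><td>1</td>
      <td><a class="fz14" href="/detail?filename=ABC123">Some paper</a></td>
      <td><a class="briefDl_D" href="/download?filename=ABC123">download</a></td></tr>
</table>
'''

soup = BeautifulSoup(html, 'lxml')
tr_table = soup.find(name='table', attrs={'class': 'GridTableContent'})
tr_table.tr.extract()  # drop the header row, as parse_page does

for tr_info in tr_table.find_all(name='tr'):
    # re-join the fragmented strings of every cell
    cells = [''.join(td.stripped_strings) for td in tr_info.find_all(name='td')]
    dt_url = tr_info.find('a', attrs={'class': 'fz14'})
    dl_url = tr_info.find('a', attrs={'class': 'briefDl_D'})
    print(cells, dt_url['href'], dl_url['href'])
    # -> ['1', 'Some paper', 'download'] /detail?filename=ABC123 /download?filename=ABC123
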
Example #2
    def parse_page(self, page_source):
        '''
        Save the page information and
        parse the download links on each page.
        '''
        soup = BeautifulSoup(page_source, 'lxml')
        # Locate the content table region
        tr_table = soup.find(name='table', attrs={'class': 'GridTableContent'})
        # Handle the CAPTCHA case
        try:
            # Remove the first tr tag (the table header)
            tr_table.tr.extract()
        except Exception:
            logging.error('CAPTCHA encountered')
            # re-parse the page source returned once the CAPTCHA is handled
            return self.parse_page(
                crack.get_image(self.get_result_url, self.session,
                                page_source))
        
        # Iterate over every row of the table
        for index, tr_info in enumerate(tr_table.find_all(name='tr')):

            # Absolute, 1-based index of this row across all pages (20 results per page)
            searching_idx = str((index + 1) + (self.cur_page_num - 1) * 20)
            # In repair mode, only re-crawl entries recorded as missing earlier
            self.getDetail = self.repair and searching_idx in self.missingPage


            tr_text = ''
            detail_url = ''
            filename = ''
            
            # Iterate over every column of the row
            for td_info in tr_info.find_all(name='td'):
                # The text inside a cell is fragmented, so stitch it back together here
                td_text = ''
                for string in td_info.stripped_strings:
                    td_text += string
                tr_text += td_text + ' '
                
                # Look for the detail-page link
                dt_url = td_info.find('a', attrs={'class': 'fz14'})
                if dt_url:
                    detail_url = dt_url.attrs['href']
                    filename = detail_url[detail_url.find('filename=')+9:]

            # Split the row text into the fields of a single reference
            single_refence_list = tr_text.split(' ')
            
            # Crawl the detail page only when this row was flagged above
            if self.getDetail:
                time.sleep(config.crawl_stepWaitTime)
                page_detail.get_detail_page(self.session, self.get_result_url,
                                            detail_url, single_refence_list, self.userInput, self.repair)
            self.getDetail = False

        
        # Continue with the next result page
        self.get_another_page()
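
The repair-mode check in this example reduces to a page-offset calculation plus a membership test against the list of missing entries. A tiny self-contained sketch of that logic follows; the page size of 20 and all sample values are assumptions for illustration.

cur_page_num = 3                       # current result page (assumed)
missing_page = {'41', '44', '57'}      # entries recorded as missing earlier (assumed)
repair = True

for index in range(20):                # 20 rows per result page
    # absolute, 1-based index of the row across all pages
    searching_idx = str((index + 1) + (cur_page_num - 1) * 20)
    get_detail = repair and searching_idx in missing_page
    if get_detail:
        print('re-crawl detail page for entry', searching_idx)
# prints entries 41, 44 and 57
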