Ejemplo n.º 1
0
 def chufaparse(self, response):
     """Parse one listing page, then request the next page if there is one.

     Each item produced by ``configParse(chufaConfigs, ...)`` carries a
     ``result['contents']`` string in which the target link is expected to
     appear between parentheses.  That link is resolved against the page
     URL, its text is loaded through ``S._txtparse`` (presumably a
     download/extract helper -- confirm against ``S``) and stored back in
     ``result['contents']``; the absolute URL goes into ``result['url']``.

     Loading the text is best-effort: on failure ``contents`` is ``None``.
     When no parenthesised link is found, both ``contents`` and ``url``
     are ``None`` and the item is still yielded.

     Yields:
         The enriched items, followed by at most one ``scrapy.Request``
         for the next page (handled by this same callback).
     """
     for item in self.configParse(chufaConfigs, response, response):
         result = item['result']
         raw_contents = result['contents']
         # Raw string so the escaped parentheses are not treated as
         # (invalid) string escapes.
         match = (re.search(r"\((.*?)\)", raw_contents)
                  if raw_contents else None)
         if match is None:
             # No link to follow: keep the item instead of crashing the
             # whole page (and the pagination request) on ``.group``.
             url = None
             content = None
         else:
             url = response.urljoin(match.group(1).strip())
             try:
                 content = S._txtparse(url)
                 content = S.replace_invalid_char(content)
             except Exception:
                 # Deliberately best-effort, but no longer swallows
                 # KeyboardInterrupt / SystemExit as a bare except did.
                 content = None
         result['contents'] = content
         result['url'] = url
         yield item
     page, totalpage, counts = getTotalPage(response)
     if page < totalpage:
         page += 1
         url = 'http://www.szse.cn/szseWeb/FrontController.szse?ACTIONID=7&AJAX=AJAX-TRUE&CATALOGID=1759_cxda&TABKEY=tab1&tab1PAGENO={page}&tab1PAGECOUNT={totalpage}&tab1RECORDCOUNT={counts}&REPORT_ACTION=navigate'.format(
             page=page, totalpage=totalpage, counts=counts)
         yield scrapy.Request(url,
                              headers=hdr(),
                              meta={
                                  'page': page,
                                  'totalpage': totalpage,
                                  'counts': counts
                              },
                              callback=self.chufaparse,
                              priority=1)
Ejemplo n.º 2
0
 def zrdsinfoparse(self, response):
     """Complete the record carried in ``response.meta`` and emit it.

     The text nodes of the ``ViewResume1_lblContent`` span are joined into
     a single string, passed through ``S.replace_invalid_char`` and stored
     under ``'ins'`` in the ``result`` dict taken from the request meta.
     The ``db`` and ``keys`` meta values are copied onto the item as-is.

     Yields:
         A single ``SzseItem`` with ``result``, ``db`` and ``keys`` set.
     """
     record = response.meta['result']
     text_nodes = response.xpath(
         '//span[@id="ViewResume1_lblContent"]/text()').extract()
     record['ins'] = "".join(text_nodes)
     record['ins'] = S.replace_invalid_char(record['ins'])

     entry = SzseItem()
     entry['result'] = record
     entry['db'] = response.meta['db']
     entry['keys'] = response.meta['keys']
     yield entry
Ejemplo n.º 3
0
 def configParse(self, configs, _response, response=None):
     """Extract records described by *configs* and yield them as items.

     Args:
         configs: One config dict or a list of them.  Each config is read
             for ``'list'`` (with ``'v'``, ``'keys'``, ``'db'``, ``'check'``
             and optional ``'conn'``), ``'data'`` (field configs, each with
             an ``'En'`` name), and optional ``'flag'`` and ``'clear'``.
         _response: The object records are selected from; may already be
             a list of record nodes.
         response: Forwarded unchanged to ``S.select_content`` for every
             field extraction.

     Yields:
         A fresh ``SzseItem`` per record whose ``check`` field is truthy.
         A new item is built for every record so that consumers which keep
         references (e.g. ``list(...)`` or a deferred pipeline) do not see
         earlier items overwritten by later records.

     Note:
         A missing/``None`` ``'flag'`` is defaulted to ``True`` in the
         config dict itself, i.e. the caller's config is mutated.
     """
     if isinstance(configs, dict):
         configs = [configs]
     # Iterate over the (possibly multiple) configs.
     for _configs in configs:
         if _configs.get('flag') is None:
             _configs['flag'] = True
         list_config = _configs['list']
         if list_config['v'] and _configs['flag']:
             res = S.select_content(_response, list_config)
         elif isinstance(_response, list):
             res = _response
         else:
             # Wrap the single response so it can be iterated.
             res = [_response]
         if not res:
             continue
         for _res in res:
             # Start a new result dict for this record.
             result = {}
             # Extract and sanitise every configured field.
             for config in _configs['data']:
                 k = config['En']
                 value = S.select_content(_res, config, response)
                 value = S.replace_invalid_char(value)
                 result[k] = S.replace_invalid_html_char(value)
             # Optional second pass that re-selects from the values
             # extracted above.
             if _configs.get('clear'):
                 for config in _configs['clear']:
                     k = config['En']
                     result[k] = S.select_content(
                         result[k], config, response)
             # One item per record: never reuse a previously yielded one.
             item = SzseItem()
             item['result'] = result
             # Routing fields taken from the list config and carried on the
             # item (presumably consumed by the item pipeline -- confirm).
             item['keys'] = list_config['keys']
             item['db'] = list_config['db']
             item['conn'] = list_config.get('conn')
             if result[list_config['check']]:
                 yield item