Code example #1
0
 def extract_page(self, response):
     """Parse one listing page of investment records.

     Saves the raw page HTML and the extracted rows (as JSON) into
     ``self.work_directory``; both files are named after the query
     arguments of the requested URL.

     :param response: scrapy Response for a listing page; its request
         URL carries ``province``/``round``/``industry``/``p`` arguments.
     """
     arguments = self.parse_url(response.request.url)
     self.log('Process page {}'.format(response.request.url),
              level=logging.INFO)
     # File name pattern: <province>_<round>_<industry>_<page>; the page
     # number defaults to '1' when the 'p' query argument is absent.
     basename = '{}_{}_{}_{}'.format(arguments['province'][0],
                                     arguments['round'][0],
                                     arguments['industry'][0],
                                     arguments.get('p', ['1'])[0])
     with open(os.path.join(self.work_directory, basename + '.html'),
               'wb') as fo:
         fo.write(response.body)
     row = response.xpath("//table[@class='ntable']/tr")
     if len(row) < 1:
         self.log('page {} has no data'.format(response.request.url),
                  level=logging.WARNING)
         # An empty result usually means the proxy got blocked; drop it.
         POOL.remove(response.request.meta['proxy'])
         return
     result = []
     # row[0] is the table header; data starts at the second row.
     for r in row[1:]:
         image = r.xpath('td[1]/img/@src').extract_first()
         project_name = r.xpath('td[2]/a/text()').extract_first()
         # BUG FIX: anchors carry the link in @href, not @src — the
         # original '@src' query always yielded None here.
         project_url = r.xpath('td[2]/a/@href').extract_first()
         investor = r.xpath('td[3]/text()').extract_first()
         stage = r.xpath('td[4]/text()').extract_first()
         invest_time = r.xpath('td[5]/text()').extract_first()
         result.append({
             'image': image,
             'project': {
                 'url': project_url,
                 'name': project_name
             },
             'investor': investor,
             'stage': stage,
             'time': invest_time
         })
         self.log('find investment from {} to {}'.format(
             investor, project_name),
                  level=logging.INFO)
     # Persist the extracted rows next to the raw HTML. Explicit UTF-8 so
     # ensure_ascii=False cannot fail on a non-UTF-8 default locale.
     with open(os.path.join(self.work_directory, basename + '.json'),
               'w', encoding='utf-8') as fo:
         json.dump(result, fo, ensure_ascii=False)
Code example #2
0
 def parse(self, response):
     """Extract the document (XML) link from a project detail page.

     Saves the raw page HTML under a UUID-derived name in
     ``self.work_directory`` and yields one item mapping the page URL to
     the document URL.

     :param response: scrapy Response for a project detail page.
     :return: generator yielding ``{'link': page_url, 'xml': doc_url}``.
     """
     # find the document url
     link = response.xpath(
         "//div[@class='ProjetInfo_title']/a/@href").extract_first()
     if link is None:
         # BUG FIX: the format string has a single placeholder; the
         # stray second argument (link, always None here) was dropped.
         self.log('{} fail to download'.format(response.url),
                  level=logging.WARNING)
         # remove the invalid proxy
         POOL.remove(response.request.meta['proxy'])
         return
     # BUG FIX: uuid5 requires a str name on Python < 3.12; passing
     # bytes raised TypeError. The derived hex digest is unchanged.
     page = uuid.uuid5(uuid.NAMESPACE_URL, response.url).hex
     filename = '%s.html' % page
     with open(os.path.join(self.work_directory, filename), 'wb') as f:
         f.write(response.body)
     self.log('{} => {}'.format(response.url, link), level=logging.INFO)
     yield {
         'link': response.url,
         'xml': link,
     }