def extract_page(self, response):
    """Save a listing page and the investment rows parsed from it.

    The raw body is written to ``<work_directory>/<stem>.html`` and the
    parsed rows to ``<work_directory>/<stem>.json``, where ``stem`` is
    ``province_round_industry_page`` built from the request URL's arguments
    (the page defaults to ``'1'`` when ``p`` is absent).

    Args:
        response: the downloaded page. It must expose ``body``, ``xpath``
            and ``request`` (with ``url`` and ``meta``).

    Returns:
        None. When the page has no ``ntable`` rows a warning is logged, the
        proxy recorded in ``request.meta`` (if any) is removed from ``POOL``
        and no JSON file is written.
    """
    url = response.request.url
    # parse_url is defined elsewhere; its values are indexed with [0] below,
    # so it presumably returns a parse_qs-style dict of lists.
    arguments = self.parse_url(url)
    self.log('Process page {}'.format(url), level=logging.INFO)

    # Both output files share the same name stem; build it once.
    stem = '{}_{}_{}_{}'.format(
        arguments['province'][0],
        arguments['round'][0],
        arguments['industry'][0],
        arguments.get('p', ['1'])[0])

    # Keep the raw page even if it turns out to hold no data.
    with open(os.path.join(self.work_directory, stem + '.html'), 'wb') as fo:
        fo.write(response.body)

    rows = response.xpath("//table[@class='ntable']/tr")
    if not rows:
        self.log('page {} has no data'.format(url), level=logging.WARNING)
        # A request made without a proxy has nothing to evict; avoid the
        # KeyError the unconditional lookup would raise.
        proxy = response.request.meta.get('proxy')
        if proxy is not None:
            POOL.remove(proxy)
        return

    result = []
    # The first row is skipped (treated as the table header).
    for r in rows[1:]:
        image = r.xpath('td[1]/img/@src').extract_first()
        project_name = r.xpath('td[2]/a/text()').extract_first()
        # Anchors carry their target in ``href``; the ``src`` lookup is kept
        # only as a fallback for backward compatibility.
        project_url = r.xpath('td[2]/a/@href').extract_first()
        if project_url is None:
            project_url = r.xpath('td[2]/a/@src').extract_first()
        investor = r.xpath('td[3]/text()').extract_first()
        stage = r.xpath('td[4]/text()').extract_first()
        invest_time = r.xpath('td[5]/text()').extract_first()
        result.append({
            'image': image,
            'project': {
                'url': project_url,
                'name': project_name,
            },
            'investor': investor,
            'stage': stage,
            'time': invest_time,
        })
        self.log('find investment from {} to {}'.format(
            investor, project_name), level=logging.INFO)

    # ensure_ascii=False keeps non-ASCII text readable, so encode explicitly
    # as UTF-8 instead of relying on the platform's default text encoding.
    payload = json.dumps(result, ensure_ascii=False).encode('utf-8')
    with open(os.path.join(self.work_directory, stem + '.json'), 'wb') as fo:
        fo.write(payload)
def parse(self, response):
    """Save a project page and yield the document link found on it.

    The link is taken from the first ``<a>`` inside the
    ``ProjetInfo_title`` div. On success the raw body is stored under
    ``work_directory`` as ``<uuid5-of-url>.html`` and one item is yielded.

    Args:
        response: the downloaded page. It must expose ``url``, ``body``,
            ``xpath`` and ``request.meta``.

    Yields:
        dict: ``{'link': <page url>, 'xml': <document link>}``. Nothing is
        yielded when the link is missing; in that case a warning is logged
        and the proxy recorded in ``request.meta`` (if any) is removed from
        ``POOL``.
    """
    # find the document url
    link = response.xpath(
        "//div[@class='ProjetInfo_title']/a/@href").extract_first()
    if link is None:
        self.log('{} fail to download'.format(response.url),
                 level=logging.WARNING)
        # remove the invalid proxy; a request made without one has nothing
        # to evict, so avoid the KeyError of an unconditional lookup
        proxy = response.request.meta.get('proxy')
        if proxy is not None:
            POOL.remove(proxy)
        return

    # uuid5 encodes a str name as UTF-8 itself. Passing pre-encoded bytes
    # raises TypeError on Python 3 before 3.12, so hand it the URL directly;
    # the resulting UUID is the same.
    page = uuid.uuid5(uuid.NAMESPACE_URL, response.url).hex
    filename = '%s.html' % page
    with open(os.path.join(self.work_directory, filename), 'wb') as f:
        f.write(response.body)
    self.log('{} => {}'.format(response.url, link), level=logging.INFO)
    yield {
        'link': response.url,
        'xml': link,
    }