def parse_info(self, response):
    """Parse a company detail page into a ``CorpInfo`` item.

    Besides the item, this generator yields follow-up requests: one
    ``HoldDepthRequest`` per link returned by
    ``self.investment_link_extractor`` (handled by ``parse_investment`` and
    carrying the company uid in ``meta``), and one ``Request`` back into
    ``parse_info`` for every linked shareholder and subsidiary page.

    :param response: the downloaded company page.
    :return: generator yielding the follow-up requests and, last, the item.
    :raises IndexError: if the page has no ``.company-info`` element.
    :raises KeyError: if a label is not present in ``self.label_to_field``.
    """

    def fill_row(shareholder, tr):
        # Columns 2-5 are read the same way for both kinds of shareholder row.
        shareholder['holder_type'] = safe_extract(tr.xpath('td[2]/text()'))
        shareholder['reg_amount'] = safe_extract(tr.xpath('td[3]/p/text()'), sep='/')
        shareholder['paid_amount'] = safe_extract(tr.xpath('td[4]/p/text()'), sep='/')
        shareholder['method'] = safe_extract(tr.xpath('td[5]/text()'))

    uid = self._extract_uid(response.url)
    for link in self.investment_link_extractor.extract_links(response):
        yield HoldDepthRequest(url=link.url, callback=self.parse_investment, meta={'uid': uid})

    item = CorpInfo('corp_info')
    item['uid'] = uid
    item['corp_name'] = safe_extract(response.xpath('//*[@id="companyheader"]/h3/span/text()'))

    corp_info = response.css('.company-info')[0]
    for li in corp_info.xpath('li'):
        label = safe_strip(li.xpath('label/text()').extract_first(), chars=self.label_strip_chars)
        field = self.label_to_field[label]
        # Compare by value: ``is`` checks object identity and only matched
        # here when the interpreter happened to intern both strings.
        if field == 'legal_rep':
            # The legal representative's text sits inside an <a> element.
            item[field] = safe_extract(li.xpath('a/text()'), sep='')
        else:
            item[field] = safe_extract(li.xpath('text()'), sep='')

    # parse shareholders and related pages
    shareholders = []
    for a in response.css('tr.white>td>div>a[href*=firm]'):
        # Shareholders linking to a "firm" page get crawled as well.
        url = response.urljoin(a.xpath('@href').extract_first())
        yield Request(url=url, callback=self.parse_info)

        shareholder = Shareholder()
        shareholder['name'] = safe_extract(a.xpath('span/text()'))
        shareholder['corp_uid'] = self._extract_uid(url)
        fill_row(shareholder, a.xpath('../../..'))
        shareholders.append(shareholder)

    for a in response.css('tr.white>td>div>a[href*=search]'):
        # "search" links are not followed and carry no corp uid.
        shareholder = Shareholder()
        shareholder['name'] = safe_extract(a.xpath('text()'))
        fill_row(shareholder, a.xpath('../../../..'))
        shareholders.append(shareholder)
    item['shareholders'] = shareholders

    # parse subsidiaries
    subsidiaries = []
    for a in response.css('tr.white>td>a'):
        url = response.urljoin(a.xpath('@href').extract_first())
        yield Request(url=url, callback=self.parse_info)

        subsidiary = Subsidiary()
        subsidiary['corp_uid'] = self._extract_uid(url)
        subsidiary['corp_name'] = safe_strip(a.xpath('text()').extract_first())
        subsidiaries.append(subsidiary)
    item['subsidiaries'] = subsidiaries

    yield item
def _parse_subsidiaries(self, response):
    """Collect one ``Subsidiary`` per ``page-item`` row of ``branchTable``.

    :param response: page containing the ``branchTable`` table.
    :return: list of ``Subsidiary`` items, in row order.
    """
    # Item field -> cell expression, relative to each row.
    cell_paths = (
        ('reg_no', 'td[2]/text()'),
        ('name', 'td[3]/text()'),
        ('reg_office', 'td[4]/text()'),
    )
    subsidiaries = []
    for row in response.xpath('//table[@id="branchTable"]/tr[@class="page-item"]'):
        subsidiary = Subsidiary()
        for field, path in cell_paths:
            subsidiary[field] = safe_strip(row.xpath(path).extract_first())
        subsidiaries.append(subsidiary)
    return subsidiaries
def _parse_shareholders(self, response):
    """Collect one ``Shareholder`` per ``page-item`` row of ``investorTable``.

    :param response: page containing the ``investorTable`` table.
    :return: list of ``Shareholder`` items, in row order.
    """
    # Item field -> cell expression, relative to each row.
    cell_paths = (
        ('holder_type', 'td[1]/text()'),
        ('name', 'td[2]/text()'),
        ('license_type', 'td[3]/text()'),
    )
    holders = []
    for row in response.xpath('//table[@id="investorTable"]/tr[@class="page-item"]'):
        holder = Shareholder()
        for field, path in cell_paths:
            holder[field] = safe_strip(row.xpath(path).extract_first())
        holders.append(holder)
    return holders
def parse_staff(self, response):
    """Yield a ``CorpInfo`` item carrying the staff listed on the page.

    Every table row after the first inside ``div.zx_left`` becomes a
    ``Staff`` entry built from columns 1 (name), 2 (title) and 4 (gender).

    :param response: staff page; ``response.meta['stock_id']`` must be set.
    :return: generator yielding a single ``CorpInfo`` item.
    """
    item = CorpInfo('corp_info')
    item['stock_id'] = response.meta['stock_id']

    members = []
    rows = response.xpath('//div[@class="zx_left"]//table/tr[position() > 1]')
    for row in rows:
        member = Staff()
        member['name'] = safe_strip(row.xpath('td[1]/text()').extract_first())
        member['title'] = safe_strip(row.xpath('td[2]/text()').extract_first())
        member['gender'] = safe_strip(row.xpath('td[4]/text()').extract_first())
        members.append(member)

    item['staff'] = members
    yield item
def _parse_staff(self, response):
    """Collect ``Staff`` items from the ``page-item`` rows of ``memberTable``.

    Each row always yields one entry from columns 2-3 (name, title), plus a
    second entry from columns 5-6 when the stripped column-5 text is not
    the empty string.

    :param response: page containing the ``memberTable`` table.
    :return: list of ``Staff`` items, in row order.
    """
    members = []
    for row in response.xpath('//table[@id="memberTable"]/tr[@class="page-item"]'):
        first = Staff()
        first['name'] = safe_strip(row.xpath('td[2]/text()').extract_first())
        first['title'] = safe_strip(row.xpath('td[3]/text()').extract_first())
        members.append(first)

        second_name = safe_strip(row.xpath('td[5]/text()').extract_first())
        if second_name == '':
            # No second member in this row.
            continue
        second = Staff()
        second['name'] = second_name
        second['title'] = safe_strip(row.xpath('td[6]/text()').extract_first())
        members.append(second)
    return members
def parse_shareholders(self, response):
    """Yield a ``CorpInfo`` item with the shareholders listed on the page.

    The number of shareholder rows is taken from the ``rowspan`` attribute
    found in the table's second row; that many rows, starting at row 2, are
    read from the first table under ``div.zx_left``.

    :param response: shareholder page; ``response.meta['stock_id']`` must
        be set.
    :return: generator yielding a single ``CorpInfo`` item.
    :raises IndexError: if no table is found under ``div.zx_left``.
    """
    item = CorpInfo('corp_info')
    item['stock_id'] = response.meta['stock_id']

    holders = []
    table = response.css('div.zx_left table')[0]
    rowspan = int(table.xpath('tr[2]/td/@rowspan').extract_first())
    # Passed to safe_strip for names; presumably removes a numeric ordinal
    # prefix such as "1." -- TODO confirm against safe_strip.
    chars = '1234567890.'
    for row in xrange(2, 2 + rowspan):
        if row == 2:
            # Row 2 keeps name/ratio one column further right than the
            # following rows (presumably because of the rowspan cell).
            name_path, ratio_path = 'tr[%d]/td[2]/text()', 'tr[%d]/td[4]/text()'
        else:
            name_path, ratio_path = 'tr[%d]/td[1]/text()', 'tr[%d]/td[3]/text()'

        holder = Shareholder()
        holder['name'] = safe_strip(table.xpath(name_path % row).extract_first(), chars)
        holder['ratio'] = float(safe_strip(table.xpath(ratio_path % row).extract_first()))
        holders.append(holder)

    item['shareholders'] = holders
    yield item
def parse_investment(self, response):
    """Yield crawl requests for invested companies, then a ``CorpInfo`` item.

    For every ``firm`` link under ``.site-list-title`` a ``Request`` handled
    by ``parse_info`` is yielded and an ``Investment`` entry is recorded.
    The final item carries the uid passed in ``response.meta['uid']`` and
    the list of recorded investments.

    :param response: investment listing page; ``response.meta['uid']`` must
        be set.
    :return: generator yielding the requests and, last, the item.
    """
    item = CorpInfo('corp_info')
    item['uid'] = response.meta['uid']

    invested = []
    for anchor in response.css('.site-list-title>a[href*=firm]'):
        target = response.urljoin(anchor.xpath('@href').extract_first())
        yield Request(url=target, callback=self.parse_info)

        entry = Investment()
        entry['corp_uid'] = self._extract_uid(target)
        entry['corp_name'] = safe_strip(anchor.xpath('text()').extract_first())
        invested.append(entry)

    item['investments'] = invested
    yield item
def parse_brief(self, response):
    """Yield a ``CorpInfo`` item with the company's brief profile.

    Values are read from the second cell of fixed rows of the first table
    under ``div.zx_left``; row 6 is not read.

    :param response: profile page; ``response.meta['stock_id']`` must be set.
    :return: generator yielding a single ``CorpInfo`` item.
    :raises IndexError: if no table is found under ``div.zx_left``.
    """
    # Item field -> cell expression, relative to the profile table.
    cell_paths = (
        ('name', 'tr[1]/td[2]/text()'),
        ('eng_name', 'tr[2]/td[2]/text()'),
        ('address', 'tr[3]/td[2]/text()'),
        ('brief_name', 'tr[4]/td[2]/text()'),
        ('legal_rep', 'tr[5]/td[2]/text()'),
        ('reg_capital', 'tr[7]/td[2]/text()'),
        ('busi_type', 'tr[8]/td[2]/text()'),
        ('postcode', 'tr[9]/td[2]/text()'),
        ('tel', 'tr[10]/td[2]/text()'),
        ('fax', 'tr[11]/td[2]/text()'),
        ('homepage', 'tr[12]/td[2]/text()'),
    )

    item = CorpInfo('corp_info')
    item['stock_id'] = response.meta['stock_id']

    table = response.css('div.zx_left table')[0]
    for field, path in cell_paths:
        item[field] = safe_strip(table.xpath(path).extract_first())
    yield item
def parse(self, response):
    """Parse the basic-information table of a credit page into an item.

    Three layouts are recognised:

    * URLs containing ``szcredit``: values live in ``<span>`` children and
      a ``LimitedCorpCredit`` item is produced;
    * a table with 9 rows: an ``IndividualCorpCredit`` item;
    * a table with 10 rows: a ``LimitedCorpCredit`` item.

    Any other layout is logged and skipped. Previously such a page reached
    ``yield item`` with ``item`` unbound and raised ``UnboundLocalError``.

    :param response: page containing a table under ``#jibenxinxi``.
    :return: generator yielding at most one item.
    :raises IndexError: if no table is found under ``#jibenxinxi``.
    """
    table = response.css('#jibenxinxi table')[0]

    if 'szcredit' in response.url:
        item = LimitedCorpCredit(self.col_name)
        layout = (
            ('reg_no', 'tr[3]/td[1]/span/text()'),
            ('corp_name', 'tr[3]/td[2]/span/text()'),
            ('corp_type', 'tr[4]/td[1]/span/text()'),
            ('legal_rep', 'tr[4]/td[2]/span/text()'),
            ('reg_capital', 'tr[5]/td[1]/span/text()'),
            ('fund_date', 'tr[5]/td[2]/span/text()'),
            ('address', 'tr[6]/td/span/text()'),
            ('op_period_from', 'tr[7]/td[1]/span/text()'),
            ('op_period_to', 'tr[7]/td[2]/span/text()'),
            # NOTE(review): the next two read the same cells as reg_no and
            # corp_name (row 3). This looks like a copy-paste slip -- row 8
            # seems likely -- but is kept as-is until checked on a live page.
            ('reg_office', 'tr[3]/td[1]/span/text()'),
            ('appr_date', 'tr[3]/td[2]/span/text()'),
            ('reg_status', 'tr[9]/td[1]/span/text()'),
        )
    else:
        row_count = len(table.xpath('tr'))
        if row_count == 9:
            item = IndividualCorpCredit(self.col_name)
            layout = (
                ('reg_no', 'tr[3]/td[1]/text()'),
                ('corp_name', 'tr[3]/td[2]/text()'),
                ('corp_type', 'tr[4]/td[1]/text()'),
                ('investor', 'tr[4]/td[2]/text()'),
                ('address', 'tr[5]/td/text()'),
                ('fund_date', 'tr[6]/td[2]/text()'),
                ('busi_scope', 'tr[7]/td/text()'),
                ('reg_office', 'tr[8]/td[1]/text()'),
                ('appr_date', 'tr[8]/td[2]/text()'),
                ('reg_status', 'tr[9]/td[1]/text()'),
            )
        elif row_count == 10:
            item = LimitedCorpCredit(self.col_name)
            layout = (
                ('reg_no', 'tr[3]/td[1]/text()'),
                ('corp_name', 'tr[3]/td[2]/text()'),
                ('corp_type', 'tr[4]/td[1]/text()'),
                ('legal_rep', 'tr[4]/td[2]/text()'),
                ('reg_capital', 'tr[5]/td[1]/text()'),
                ('fund_date', 'tr[5]/td[2]/text()'),
                ('address', 'tr[6]/td/text()'),
                ('op_period_from', 'tr[7]/td[1]/text()'),
                ('op_period_to', 'tr[7]/td[2]/text()'),
                ('busi_scope', 'tr[8]/td/text()'),
                ('reg_office', 'tr[9]/td[1]/text()'),
                ('appr_date', 'tr[9]/td[2]/text()'),
                ('reg_status', 'tr[10]/td[1]/text()'),
            )
        else:
            # Unknown layout: report it instead of failing on an unbound name.
            self.logger.warning('Unrecognised company table layout (%d rows) at %s',
                                row_count, response.url)
            return

    for field, path in layout:
        item[field] = safe_strip(table.xpath(path).extract_first())
    yield item
def parse(self, response):
    """Parse a company credit page into a ``LimitedCorpCredit`` item.

    Scalar fields come from fixed cells of the first table found under the
    ``div`` whose ``rel`` attribute is ``layout-01_01``. Shareholders, main
    staff and subsidiaries are filled in by the ``_parse_*`` helpers.

    :param response: the downloaded credit page.
    :return: generator yielding a single ``LimitedCorpCredit`` item.
    :raises IndexError: if the expected table is not present.
    """
    # Item field -> cell expression, relative to the information table.
    cell_paths = (
        ('reg_no', 'tr[2]/td[1]/text()'),
        ('corp_name', 'tr[2]/td[2]/text()'),
        ('corp_type', 'tr[3]/td[1]/text()'),
        ('legal_rep', 'tr[3]/td[2]/text()'),
        ('reg_capital', 'tr[4]/td[1]/text()'),
        ('fund_date', 'tr[4]/td[2]/text()'),
        ('address', 'tr[5]/td/text()'),
        ('op_period_from', 'tr[6]/td[1]/text()'),
        ('op_period_to', 'tr[6]/td[2]/text()'),
        ('busi_scope', 'tr[7]/td/text()'),
        ('reg_office', 'tr[8]/td[1]/text()'),
        ('appr_date', 'tr[8]/td[2]/text()'),
        ('reg_status', 'tr[9]/td[1]/text()'),
    )

    info_table = response.xpath('//div[@rel="layout-01_01"][1]/table')[0]
    item = LimitedCorpCredit(self.col_name)
    for field, path in cell_paths:
        item[field] = safe_strip(info_table.xpath(path).extract_first())

    # sub items
    item['shareholders'] = self._parse_shareholders(response)
    item['main_staff'] = self._parse_staff(response)
    item['subsidiaries'] = self._parse_subsidiaries(response)
    yield item