def parse(self, response):
    """Parse a Wangjia news page (show-box layout) into a XinwenItem.

    Returns the populated item, or None when the URL carries no valid
    thread/category or the page is an access-limited notice.
    """
    tc = self.get_thread_category_from_url(response.url)
    if not tc[0] or not tc[1]:
        self.logger.warning('Invalid Wangjia News Item From <%s>.' % response.url)
        return None
    symbol = (tc[0], self.tab[tc[1]], response.url)
    # An alert box instead of the article body marks an access-limited page.
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia News %s Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia News %s Item From <%s>.' % symbol)
    item = XinwenItem()
    item['thread'] = int(symbol[0])
    item['category_id'] = tc[1]
    item['source'] = symbol[2]
    article = response.xpath('//div[@class="show-box"]')
    item['title'] = get_content(article.xpath('h1/text()').extract())
    subtitle = article.xpath('div[@class="s-bq"]/span')
    item['created'] = subtitle[0].xpath('text()').extract()[0]
    if len(subtitle) >= 3:
        # Author span looks like "label:value"; keep the part after the colon.
        item['author'] = get_content(subtitle[2].xpath('text()').extract()).split(u':')[1]
    item['summary'] = get_content(article.xpath('div[@class="s-zy"]/span/text()').extract())
    body = article.xpath('div[@class="c-cen"]')
    item['content'] = ''.join([get_trunk(c) for c in body.xpath('.//text()').extract()])
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join([self.modify_image_url(get_trunk(c))
                                  for c in body.xpath('.//img/@src').extract()]) or None
    self.logger.info(item)
    # BUG FIX: the original logged the item and then returned None, so the
    # fully-built item never reached the pipeline. Return it, as the
    # sibling Wangjia news parser in this file does.
    return item
def parse_news_detail(self, response):
    """Build a NewsItem from a news detail page and yield it."""
    first_meta = lambda attr: response.xpath(
        '//meta[@name="%s"]/@content' % attr).extract_first()
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = get_content(response.xpath('//title/text()').extract())
    news['created'] = get_content(
        response.xpath('//small/span[last()]/text()').extract())
    news['author'] = first_meta('author')
    news['summary'] = first_meta('description')
    news['keywords'] = first_meta('keywords')
    news['category'] = get_content(
        response.xpath('//small/span[1]/a/text()').extract())
    article = response.xpath('//div[@class="article-txt"]')
    news['raw_content'] = article.extract_first()
    # Flatten every text node of the article into one plain-text string.
    pieces = [get_trunk(t) for t in article.xpath('.//text()').extract()]
    news['content'] = ''.join(pieces)
    images = [get_trunk(u) for u in article.xpath('.//img/@src').extract()]
    news['image_url'] = '#'.join(images) or None
    yield news
def parse_news_detail(self, response):
    """Assemble a NewsItem from the detail page plus fields forwarded in response.meta."""
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = get_content(response.xpath('//h1/text()').extract())
    # These fields were scraped on the listing page and passed along via meta.
    for field in ('created', 'author', 'category', 'summary'):
        news[field] = response.meta[field]
    article = response.xpath('//td[@id="article_content"]')
    news['raw_content'] = article.extract_first()
    paragraph_texts = article.xpath(
        './/p[contains(@class, "ke-editor-inner-p")]/text()').extract()
    news['content'] = ''.join(get_trunk(t) for t in paragraph_texts)
    image_urls = [self.modify_image_url(get_trunk(u))
                  for u in article.xpath('.//img/@src').extract()]
    news['image_url'] = '#'.join(image_urls) or None
    yield news
def parse_exposure_detail(self, response):
    """Extract one ExposureItem from a forum exposure thread page."""
    exposure = ExposureItem()
    exposure['thread'] = self.get_thread_from_url(response.url)
    exposure['source'] = response.url
    exposure['title'] = get_content(
        response.xpath('//span[@id="thread_subject"]/text()').extract())
    # Keep everything after the first space (drops the leading label of the post-time text).
    poston = response.xpath('(//div[@class="authi"])[2]/em/text()').extract_first()
    exposure['created'] = poston[poston.index(' ') + 1:]
    option_row = '//div[@class="typeoption"]//tr[%d]/td'
    exposure['name'] = get_content(response.xpath(option_row % 1 + '/text()').extract())
    exposure['link'] = get_content(response.xpath(option_row % 2 + '/a/text()').extract())
    exposure['reason'] = get_content(response.xpath(option_row % 3 + '/text()').extract())
    body = response.xpath('//td[contains(@id, "postmessage")]')
    exposure['content'] = ''.join(get_trunk(t) for t in body.xpath('.//text()').extract())
    exposure['raw_content'] = body.extract_first()
    zoomfiles = response.xpath(
        '//ignore_js_op//img[re:test(@zoomfile, "^data")]/@zoomfile').extract()
    exposure['image_url'] = '#'.join(
        self.modify_image_url(get_trunk(u)) for u in zoomfiles) or None
    yield exposure
def parse(self, response):
    """Parse a Wangjia news article page (con_news layout) into a XinwenItem."""
    tc = self.get_thread_category_from_url(response.url)
    if not (tc[0] and tc[1]):
        self.logger.warning("Invalid Wangjia News Item From <%s>." % response.url)
        return None
    symbol = (tc[0], self.tab[tc[1]], response.url)
    # An alert box instead of the article means the page is access-limited.
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning("No.%s Wangjia News %s Item From <%s> Maybe Limited." % symbol)
        return None
    self.logger.info("Parsing No.%s Wangjia News %s Item From <%s>." % symbol)
    item = XinwenItem()
    item["thread"] = int(symbol[0])
    item["category_id"] = tc[1]
    item["source"] = symbol[2]
    article = response.xpath('//div[@class="con_news"]')
    item["title"] = get_content(article.xpath("h1/text()").extract())
    # The n_time line packs time and author into one colon-separated string
    # (Python 2: encode to a utf-8 byte string before splitting on ':').
    time_line = article.xpath('ul/li[@class="n_time"]/text()').extract()[0]
    fields = time_line.encode("utf8").split(":")
    item["created"] = get_content(fields[1].split())
    item["author"] = get_content(fields[-1].split())
    item["summary"] = get_content(
        article.xpath('ul/li[@class="a_abstract"]/span/text()').extract())
    body = article.xpath('ul/li[@class="news_con_p"]')
    item["content"] = "".join(get_trunk(t) for t in body.xpath(".//text()").extract())
    item["raw_content"] = get_content(body.extract())
    urls = [self.modify_image_url(get_trunk(u)) for u in body.xpath(".//img/@src").extract()]
    item["image_url"] = "#".join(urls) or None
    return item
def parse_govern_info(self, response):
    """Parse the governance tab of a company page into a GovernInfoItem.

    Under #govern-info, tables 2-5 hold (by position) the relation pairs,
    the controller, the shareholder rows and the manager rows.
    """
    govern_info = response.xpath('//div[@id="govern-info"]')

    def rows_as_lists(table_index):
        # One list per <tr>, containing its non-blank trimmed cell texts.
        # (Extracted as a helper: shareholder and manager tables used two
        # copy-pasted identical loops.)
        rows = []
        for tr in govern_info.xpath('table[%d]/tbody/tr' % table_index):
            cells = [get_trunk(cell) for cell in tr.xpath('td//text()').extract()]
            rows.append([cell for cell in cells if cell != ''])
        return rows

    company = GovernInfoItem()
    company['link'] = response.url
    company['name'] = get_content(
        response.xpath('//div[@class="comp-intro"]').xpath(
            './/div[@class="intro-txt"]/span')[0].xpath('string(.)').extract())
    company['code'] = response.meta['code']
    company['structure'] = get_content(
        response.xpath('//div[@class="mask"]/img[@class="mask-img"]/@src').extract())
    relation = dict()
    for tr in govern_info.xpath('table[2]/tbody/tr'):
        key = get_content(tr.xpath('string(td[1])').extract())
        relation[key] = get_content(tr.xpath('string(td[2])').extract())
    # NOTE: json.dumps(encoding=...) is Python 2 only, consistent with this
    # file's other py2 constructs (xrange).
    company['relation'] = json.dumps(relation, encoding="UTF-8", ensure_ascii=False)
    controller = govern_info.xpath('table[3]/tbody/tr[1]/td/text()').extract()
    company['controller'] = json.dumps(controller, encoding="UTF-8", ensure_ascii=False)
    company['shareholder_list'] = json.dumps(
        rows_as_lists(4), encoding="UTF-8", ensure_ascii=False)
    company['manager_list'] = json.dumps(
        rows_as_lists(5), encoding="UTF-8", ensure_ascii=False)
    return company
def parse(self, response):
    """Parse a Wangjia platform feature page into a TedianItem."""
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = TedianItem()
    item['name'] = self.object.name
    rtag = response.xpath('//div[@class="rTags"]')
    if rtag:
        item['status'] = get_content(rtag.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(rtag.xpath('./span[@class="tag tag2"]/text()').extract())
        tags = rtag.xpath('./span[@class = "tag"]').xpath('text()').extract()
        item['illustration'] = '/'.join(get_trunk(t) for t in tags)
    comment_info = response.xpath('//div[contains(@class,"box commentBox")]')
    if comment_info:
        comment = comment_info.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(comment.xpath('./dt/span/text()').extract())
        # Four numeric scores and four companion labels, in a fixed order.
        nums = comment.xpath('./dd/span[@class="num"]')
        for idx, field in enumerate(('withdraw_num', 'guard_num',
                                     'service_num', 'experience_num')):
            item[field] = get_content(nums[idx].xpath('text()').extract())
        labels = comment.xpath('.//span[not(@class="num")]')
        for idx, field in enumerate(('withdraw_day', 'guard_day',
                                     'service_status', 'experience_status')):
            item[field] = get_content(labels[idx].xpath('text()').extract())
        impressions = comment_info.xpath(
            './dl[@class="impression"]/dd//span').xpath('text()').extract()
        item['impression'] = '\001'.join(get_trunk(s) for s in impressions)
    return item
def parse(self, response):
    """Parse a Wangjia exposure page (news layout) into a BaoguangItem."""
    symbol = (self.get_thread_from_url(response.url), response.url)
    if not symbol[0]:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'] = int(symbol[0])
    item['source'] = symbol[1]
    item['title'] = get_content(
        response.xpath('//h1[@class="context-title"]/text()').extract())
    subtitle = response.xpath('//div[@class="post-time"]/span')
    poston = get_content(subtitle.xpath('text()').extract(), skipBlank=False)
    # Drop the label before the first space, keeping the remainder as the timestamp.
    item['created'] = poston[poston.index(' ') + 1:]
    header = response.xpath('//div[@class="post-pub-txt mb12"]/table/tbody/tr')
    if header:
        for row, field in ((1, 'name'), (2, 'link'), (3, 'reason')):
            item[field] = get_content(header[row].xpath('./td[2]/text()').extract())
    body = response.xpath('//div[@class="news_con_p"]')
    item['content'] = ''.join(get_trunk(t) for t in body.xpath('.//text()').extract())
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join(
        self.modify_image_url(get_trunk(u))
        for u in body.xpath('.//img/@src').extract()) or None
    return item
def parse(self, response):
    """Parse a 39Health drug-manual page into a YaopingItem."""
    symbol = (self.mapping.get(response.url), response.url)
    self.logger.info(
        "Parsing ID.%d 39Health Drug Informations From <%s>." % symbol)
    item = YaopingItem()
    item['manual_id'] = symbol[0]
    crumbs = response.xpath('//div[@class="subs"]//a').xpath('text()').extract()
    item['category_list'] = '>>'.join(get_trunk(c) for c in crumbs)
    parts = item['category_list'].split(">>")
    # Only breadcrumbs of length 3 or 4 are mapped; index 0 appears to be the
    # breadcrumb root (levels 1/2 are stored) — confirm against the live page.
    if len(parts) == 3:
        item['category_first'] = parts[1]
    elif len(parts) == 4:
        item['category_first'] = parts[1]
        item['category_second'] = parts[2]
    item['name'] = get_content(
        response.xpath('//div[@class="t1"]/h1/a/text()').extract())
    cite_texts = response.xpath('//div[@class="t1"]//cite').xpath('span/text()').extract()
    item['cites'] = '&&'.join(get_trunk(c) for c in cite_texts)
    item['english_name'] = get_content(
        response.xpath('//cite[@class="t2"]/text()').extract(), skipBlank=False)
    item['company'] = get_content(
        response.xpath('//li[@class="company"]/text()').extract())
    item['address'] = get_content(
        response.xpath('//li[@class="address"]/text()').extract())
    item['telephone'] = get_content(
        response.xpath('//li[@class="telephone"]/text()').extract(), skipBlank=False)
    for info in response.xpath('//div[@class="tab_box"]//dl'):
        key = get_content(info.xpath('dt/text()').extract())
        attr = self.detail_map.get(key)
        if attr:
            # string(.) flattens the <dd> subtree, stripping inner HTML tags.
            item[attr] = get_content(info.xpath('dd').xpath('string(.)').extract())
    return item
def parse(self, response):
    """Parse a Wangjia exposure forum thread into a BaoguangItem."""
    thread_id = self.get_thread_from_url(response.url)
    symbol = (thread_id, response.url)
    if not thread_id:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning(
            'No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'] = int(thread_id)
    item['source'] = response.url
    item['title'] = get_content(
        response.xpath('//span[@id="thread_subject"]').xpath('text()').extract())
    first_post_time = response.xpath('//em[starts-with(@id, "authorposton")]')[0]
    poston = get_content(first_post_time.xpath('text()').extract(), skipBlank=False)
    # Keep only what follows the first space of the post-time text.
    item['created'] = poston[poston.index(' ') + 1:]
    header = response.xpath('//div[@class="typeoption"]/table/tbody/tr/td')
    if header:
        item['name'] = get_content(header[0].xpath('.//text()').extract())
        item['link'] = get_content(header[1].xpath('.//text()').extract())
        item['reason'] = get_content(header[2].xpath('.//text()').extract())
    body = response.xpath('//td[starts-with(@id, "postmessage")]')[0]
    item['content'] = ''.join(get_trunk(t) for t in body.xpath('.//text()').extract())
    item['raw_content'] = get_content(body.extract())
    # Image URLs live in @file attributes on this layout.
    files = body.xpath('.//@file').extract()
    item['image_url'] = '#'.join(
        self.modify_image_url(get_trunk(f)) for f in files) or None
    return item
def parse_detail(self, response):
    """Parse a member-detail page into a MemberItem and yield it.

    The contact block appears in one of three layouts (paragraphs inside a
    div, a single multi-line paragraph, or bare sibling paragraphs); each
    line is matched against the known field labels.
    """
    member = MemberItem()
    info = response.xpath('//div[@id="tytext"]')
    member['name'] = get_content(info.xpath('h1/text()').extract())
    member['date'] = get_content(info.xpath('p[@class="tytdate"]/text()').extract())
    member['link'] = response.url

    def assign_fields(text):
        # Map a "label:value" line onto the matching item field.
        if text is None:  # was `== None` — use identity test for None
            return
        value = text.split(':')[-1]
        if text.find(u'网址') >= 0:
            member['website'] = value
        elif text.find(u'电话') >= 0:
            member['phone'] = value
        elif text.find(u'地址') >= 0:
            member['address'] = value
        elif text.find(u'邮编') >= 0:
            member['zip'] = value

    if len(info.xpath('./div/p')) > 0:
        for p in info.xpath('./div/p'):
            assign_fields(get_content(p.xpath('string(.)').extract()))
    elif len(info.xpath('./p')) < 4:
        # Single multi-line paragraph; fields use a full-width colon here.
        for line in info.xpath('string(./p[2])').extract_first().split('\n'):
            value = get_trunk(line.split(u'：')[-1])
            if line.find(u'网址') >= 0:
                member['website'] = value
            elif line.find(u'电话') >= 0:
                member['phone'] = value
            elif line.find(u'地址') >= 0:
                member['address'] = value
            elif line.find(u'邮编') >= 0:
                member['zip'] = value
    else:
        for p in info.xpath('./p'):
            assign_fields(get_content(p.xpath('string(.)').extract()))
    # BUG FIX: 'website' may never be set (or may be empty), which made the
    # original `member['website'][0]` crash with KeyError/IndexError.
    # Debug print() calls were also removed.
    if 'website' in member and member['website'].startswith('/'):
        member['website'] = 'http:' + member['website']
    yield member
def parse(self, response):
    """Extract a YaopingItem from a 39Health drug page."""
    manual_id = self.mapping.get(response.url)
    self.logger.info("Parsing ID.%d 39Health Drug Informations From <%s>."
                     % (manual_id, response.url))
    item = YaopingItem()
    item['manual_id'] = manual_id
    sub = response.xpath('//div[@class="subs"]//a')
    item['category_list'] = '>>'.join([get_trunk(s) for s in sub.xpath('text()').extract()])
    levels = item['category_list'].split(">>")
    # Only 3- or 4-part breadcrumbs map onto category fields.
    if len(levels) in (3, 4):
        item['category_first'] = levels[1]
        if len(levels) == 4:
            item['category_second'] = levels[2]
    item['name'] = get_content(response.xpath('//div[@class="t1"]/h1/a/text()').extract())
    cites = response.xpath('//div[@class="t1"]//cite')
    item['cites'] = '&&'.join([get_trunk(c) for c in cites.xpath('span/text()').extract()])
    item['english_name'] = get_content(
        response.xpath('//cite[@class="t2"]/text()').extract(), skipBlank=False)
    for field in ('company', 'address'):
        item[field] = get_content(
            response.xpath('//li[@class="%s"]/text()' % field).extract())
    item['telephone'] = get_content(
        response.xpath('//li[@class="telephone"]/text()').extract(), skipBlank=False)
    for block in response.xpath('//div[@class="tab_box"]//dl'):
        label = get_content(block.xpath('dt/text()').extract())
        if self.detail_map.get(label):
            # string(.) collapses the dd subtree to plain text (drops HTML tags).
            item[self.detail_map[label]] = get_content(
                block.xpath('dd').xpath('string(.)').extract())
    return item
def parse(self, response):
    """Parse the JSON area listing for one product into a ChanpinItem."""
    cpid = response.meta['cpid']
    self.logger.info('Parsing ID.%d Chinawealth Area Info From Pid:%s'
                     % (self.mapping[cpid], cpid))
    payload = json.loads(response.body)
    item = ChanpinItem()
    item['pid'] = cpid
    # Join the 'cpxsqy' field of every entry in 'List' (empty string when missing).
    item['area'] = '#'.join(
        get_trunk(entry.get('cpxsqy', '')) for entry in payload.get('List', []))
    return item
def parse_platform_honor(self, response, thread, name, link):
    """Collect the platform's honor list into a HonorItem."""
    platform_honor = HonorItem()
    for field, value in (('thread', thread), ('name', name), ('link', link)):
        platform_honor[field] = value
    honors = response.xpath(
        '//div[contains(@class, "honor")]/ul/li/text()').extract()
    platform_honor['honor_list'] = [get_trunk(h) for h in honors]
    return platform_honor
def parse_detail(self, response):
    """Yield a ReportItem scraped from a report detail page."""
    report = ReportItem()
    report['thread'] = self.get_id_from_url(response.url)
    report['category'] = response.meta['category']
    report['link'] = response.url
    report['title'] = get_content(
        response.xpath('//div[@class="report"]/h1/text()').extract())
    # Keep only the last 10 characters of the input time — presumably the
    # date portion; confirm against the live page.
    created_text = get_content(
        response.xpath('//span[@class="inputtime"]/text()').extract())
    report['created'] = created_text[-10:]
    article = response.xpath('//div[@class="dianping"]')
    report['raw_content'] = article.extract_first()
    report['content'] = ''.join(
        get_trunk(t) for t in article.xpath('.//text()').extract())
    images = [get_trunk(u) for u in article.xpath('.//img/@src').extract()]
    report['image_url'] = '#'.join(images) or None
    yield report
def parse(self, response):
    """Turn a Wangjia exposure forum thread into a BaoguangItem (or None)."""
    symbol = (self.get_thread_from_url(response.url), response.url)
    # Guard clauses: invalid URL, warning-page redirect, access-limited page.
    if not symbol[0]:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'], item['source'] = int(symbol[0]), symbol[1]
    item['title'] = get_content(
        response.xpath('//span[@id="thread_subject"]').xpath('text()').extract())
    poston = get_content(
        response.xpath('//em[starts-with(@id, "authorposton")]')[0].xpath('text()').extract(),
        skipBlank=False)
    item['created'] = poston[poston.index(' ') + 1:]
    cells = response.xpath('//div[@class="typeoption"]/table/tbody/tr/td')
    if cells:
        for idx, field in enumerate(('name', 'link', 'reason')):
            item[field] = get_content(cells[idx].xpath('.//text()').extract())
    body = response.xpath('//td[starts-with(@id, "postmessage")]')[0]
    item['content'] = ''.join([get_trunk(t) for t in body.xpath('.//text()').extract()])
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join([
        self.modify_image_url(get_trunk(f)) for f in body.xpath('.//@file').extract()
    ]) or None
    return item
def parse_news_detail(self, response):
    """Yield a NewsItem for one news detail page."""
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = response.meta['title']
    news['created'] = get_content(
        response.xpath('//div[@class="reInfo"]/div[1]/span[2]/text()').extract())
    news['keywords'] = get_content(
        response.xpath('//meta[@name="keywords"]/@content').extract())
    news['summary'] = get_content(
        response.xpath('//meta[@name="description"]/@content').extract())
    news['category'] = response.meta['category']
    # Prefer the regular article containers; fall back to the legacy one.
    article = response.xpath('//div[@class="article-content" or @id="ctrlfscont"]')
    if not article:
        article = response.xpath('//div[@class="Custom_UnionStyle"]')
    news['raw_content'] = article.extract_first()
    news['content'] = ''.join(
        get_trunk(t) for t in article.xpath('.//text()').extract())
    news['image_url'] = '#'.join(
        get_trunk(u) for u in article.xpath('.//img/@src').extract()) or None
    yield news
def parse_detail(self, response):
    """Yield one RegionItem per region row found in the report tables.

    Region rows are recognised by having exactly six <td> cells
    (region, amount, caichanxian, shouxian, yiwaixian, jiankangxian);
    rows whose first cell contains u'地区' are treated as headers and skipped.
    Cleanup vs. original: removed the unused `capital_structure` local and
    the redundant None pre-initialisations; flattened with guard clauses.
    """
    title = response.meta['title']
    year, month = self.parse_title(title)
    created = response.meta['created']
    link = response.url
    content = ' '.join(
        [get_trunk(c) for c in response.xpath('//p//text()').extract()])
    for tbody in response.xpath('//tbody'):
        # Only tables with more than five rows look like region tables.
        if len(tbody.xpath('tr')) <= 5:
            continue
        for tr in tbody.xpath('tr'):
            if len(tr.xpath('td')) != 6:
                continue
            # string(td[i]) flattens each cell to plain text.
            values = [get_content(tr.xpath('string(td[%d])' % i).extract())
                      for i in range(1, 7)]
            region_name = values[0]
            if not region_name or region_name.find(u'地区') >= 0:
                continue
            region = RegionItem()
            region['title'] = title
            region['year'] = year
            region['month'] = month
            region['link'] = link
            region['region'] = region_name
            region['amount'] = values[1]
            region['caichanxian'] = values[2]
            region['shouxian'] = values[3]
            region['yiwaixian'] = values[4]
            region['jiankangxian'] = values[5]
            region['content'] = content
            region['created'] = created
            yield region
def parse_trade_log(self, response):
    """Yield one TradeLogItem per reporting date in the trade-log tables.

    Table 1 holds the date labels; table 2 has a header row of attribute
    names (row 1) followed by one value row per date (rows 2..N).
    """
    attr_list = response.xpath(
        '//*[@id="trade-log"]/table[2]/tr[1]/td/text()').extract()
    date_list = response.xpath(
        '//*[@id="trade-log"]/table[1]/tr/td[@class="table-label"]/text()'
    ).extract()
    date_list = [
        get_trunk(date) for date in date_list
        if get_trunk(date) != '' and get_trunk(date).find(u'信息截止日期') < 0
    ]
    # PERF: the company name is loop-invariant; the original re-ran this
    # xpath chain for every date row.
    name = get_content(
        response.xpath('//div[@class="comp-intro"]').xpath(
            './/div[@class="intro-txt"]/span')[0].xpath('string(.)').extract())
    for i, date in enumerate(date_list):
        company = TradeLogItem()
        company['link'] = response.url
        company['code'] = response.meta['code']
        company['name'] = name
        company['date'] = date
        # Row i+2 of table 2 carries the values matching attr_list.
        raw_values = response.xpath(
            '//*[@id="trade-log"]/table[2]/tr[{}]/td/text()'.format(
                str(i + 2))).extract()
        values = [get_trunk(value) for value in raw_values]
        # zip truncates to the shorter list; the original indexed attr_list
        # by position and would raise IndexError on a longer value row.
        log = dict(zip(attr_list, values))
        # NOTE: json.dumps(encoding=...) is Python 2 only, consistent with
        # the rest of this file (which also uses xrange).
        company['log'] = json.dumps(log, encoding="UTF-8", ensure_ascii=False)
        yield company
def parse(self, response):
    """Parse the Wangjia feature page into a TedianItem."""
    pin = self.get_pin_from_url(response.url)
    symbol = (self.mapping.get(pin), response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = TedianItem()
    item['name'] = self.object.name
    tags_box = response.xpath('//div[@class="rTags"]')
    if tags_box:
        item['status'] = get_content(
            tags_box.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(
            tags_box.xpath('./span[@class="tag tag2"]/text()').extract())
        plain_tags = tags_box.xpath('./span[@class = "tag"]').xpath('text()').extract()
        item['illustration'] = '/'.join(get_trunk(t) for t in plain_tags)
    box = response.xpath('//div[contains(@class,"box commentBox")]')
    if box:
        comment = box.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(comment.xpath('./dt/span/text()').extract())
        # Numeric scores, then their companion labels, in fixed positions.
        nums = comment.xpath('./dd/span[@class="num"]')
        item['withdraw_num'] = get_content(nums[0].xpath('text()').extract())
        item['guard_num'] = get_content(nums[1].xpath('text()').extract())
        item['service_num'] = get_content(nums[2].xpath('text()').extract())
        item['experience_num'] = get_content(nums[3].xpath('text()').extract())
        others = comment.xpath('.//span[not(@class="num")]')
        item['withdraw_day'] = get_content(others[0].xpath('text()').extract())
        item['guard_day'] = get_content(others[1].xpath('text()').extract())
        item['service_status'] = get_content(others[2].xpath('text()').extract())
        item['experience_status'] = get_content(others[3].xpath('text()').extract())
        impressions = box.xpath('./dl[@class="impression"]/dd//span').xpath('text()').extract()
        item['impression'] = '\001'.join(get_trunk(s) for s in impressions)
    return item
def parse_investor(self, response, thread, name, link):
    """Build an InvestorItem describing the platform's investor profile."""
    investor = InvestorItem()
    investor['thread'] = thread
    investor['name'] = name
    investor['link'] = link
    investor['date'] = datetime.now().strftime('%Y-%m-%d')
    age_props = response.xpath(
        '//div[@id="ageList"]/dl/dd/span[1]/em/text()').extract()
    investor['age_distribution'] = [p + '%' for p in age_props]
    # The sex split is embedded in an inline <script>; pull the assigned literal.
    investor['sex_distribution'] = response.xpath(
        '//div[contains(@class, "index-investors-sex")]/script/text()'
    ).re_first(r'= (.*);')
    tags = response.xpath('//div[@id="index_tag"]/a/text()').extract()
    investor['tag_list'] = '#'.join(get_trunk(t) for t in tags)
    return investor
def parse(self, response):
    #NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    """Parse a Wangjia platform archive page into a DanganItem.

    Scrapes the logo, basic detail box, introduction, company registration
    info, domain (ICP) record, key people, fee schedule, contact block and
    the record list, then logs any empty fields before returning the item.
    """
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    # Kept on self so pipeline code can reach the DaohangItem later (see NOTE above).
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = DanganItem()
    item['name'] = self.object.name
    item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())
    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    if detail:
        # Fields are read by fixed paragraph position within the detail box.
        item['link'] = get_content(detail[1].xpath('a/@href').extract())
        item['location'] = get_content(detail[3].xpath('text()').extract())
        item['launch_time'] = get_content(detail[4].xpath('text()').extract())
    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])
    info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
    if info:
        # Company registration details: one field per paragraph, fixed order.
        item['company_name'] = get_content(info[0].xpath('text()').extract())
        item['artificial_person'] = get_content(info[1].xpath('text()').extract())
        item['company_type'] = get_content(info[2].xpath('text()').extract())
        item['shareholder_stucture'] = get_content(info[3].xpath('text()').extract())
        item['registered_capital'] = get_content(info[4].xpath('text()').extract())
        item['contributed_capital'] = get_content(info[5].xpath('text()').extract())
        item['registered_address'] = get_content(info[6].xpath('text()').extract())
        item['opening_date'] = get_content(info[7].xpath('text()').extract())
        item['approved_date'] = get_content(info[8].xpath('text()').extract())
        item['registration_authority'] = get_content(info[9].xpath('text()').extract())
        item['business_licence'] = get_content(info[10].xpath('text()').extract())
        item['institutional_framework'] = get_content(info[11].xpath('text()').extract())
        item['tax_registration_num'] = get_content(info[12].xpath('text()').extract())
    # Second table row ([1]) holds the domain-record values.
    record = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')[1].xpath('td')
    if record:
        item['domain_name'] = get_content(record[0].xpath('text()').extract())
        item['domain_date'] = get_content(record[1].xpath('text()').extract())
        item['domain_company_type'] = get_content(record[2].xpath('text()').extract())
        item['domain_company_name'] = get_content(record[3].xpath('text()').extract())
        item['icp'] = get_content(record[4].xpath('text()').extract())
    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for i in xrange(len(people)):  # xrange: this file targets Python 2
            avatar_url.extend(people[i].xpath('div[@class="avatar"]/img/@src').extract())
            content.extend([get_trunk(c) for c in people[i].xpath('p//text()').extract()])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)
    # First costBd block: fee schedule, one field per paragraph.
    cost = response.xpath('//div[contains(@class, "costBd")]')[0].xpath('p')
    if cost:
        item['management_fee'] = get_content(cost[0].xpath('text()').extract())
        item['prepaid_fee'] = get_content(cost[1].xpath('text()').extract())
        item['cash_withdrawal_fee'] = get_content(cost[2].xpath('text()').extract())
        item['vip_fee'] = get_content(cost[3].xpath('text()').extract())
        item['transfer_fee'] = get_content(cost[4].xpath('text()').extract())
        item['mode_of_payment'] = get_content(cost[5].xpath('text()').extract())
    # Second costBd block: contact details.
    contact = response.xpath('//div[contains(@class, "costBd")]')[1].xpath('p')
    if contact:
        item['contact_address'] = get_content(contact[0].xpath('text()').extract())
        item['phone_400'] = get_content(contact[1].xpath('text()').extract())
        item['phone'] = get_content(contact[2].xpath('text()').extract())
        item['fax'] = get_content(contact[3].xpath('text()').extract())
        item['email'] = get_content(contact[4].xpath('text()').extract())
    record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        # NOTE(review): entries 0-2 are skipped and reading starts at [3] —
        # presumably headers/labels on the live page; confirm before changing.
        item['is_automatic_bid'] = get_content(record[3].xpath('.//text()').extract(), skipFirst=True)
        item['is_equitable_assignment'] = get_content(record[4].xpath('.//text()').extract(), skipFirst=True)
        item['trust_fund'] = get_content(record[5].xpath('.//text()').extract(), skipFirst=True)
        item['tender_security'] = get_content(record[6].xpath('.//text()').extract(), skipFirst=True)
        item['security_mode'] = get_content(record[7].xpath('.//text()').extract(), skipFirst=True)
        item['guarantee_institution'] = get_content(record[8].xpath('.//text()').extract(), skipFirst=True)
        # Short-circuit: False when the 10th entry is absent on older pages.
        item['business_type'] = len(record) >= 10 and get_content(record[9].xpath('.//text()').extract(), skipFirst=True)
    log_empty_fields(item, self.logger)
    return item
def parse(self, response):
    """Parse a Wangjia platform archive page into a DanganItem.

    Fields are read positionally from fixed page sections (detailBox,
    aboutBd, inforBd, webRecordBd, peopleBd, costBd, recordListBox);
    each section is skipped entirely when absent.
    """
    # NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE
    # PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)),
              response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = DanganItem()
    item['name'] = self.object.name
    item['logo_url'] = get_content(
        response.xpath('//div[@class="rLogo"]/a/img/@src').extract())
    # Basic platform details: link, location, launch time (fixed <p> offsets).
    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    if detail:
        item['link'] = get_content(detail[1].xpath('a/@href').extract())
        item['location'] = get_content(detail[3].xpath('text()').extract())
        item['launch_time'] = get_content(
            detail[4].xpath('text()').extract())
    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join(
            [get_trunk(c) for c in about.xpath('.//text()').extract()])
    # Company registration block: one <p> per field, in fixed page order
    # (separator rows carrying class "line" are excluded by the XPath).
    info = response.xpath(
        '//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]'
    )
    if info:
        item['company_name'] = get_content(
            info[0].xpath('text()').extract())
        item['artificial_person'] = get_content(
            info[1].xpath('text()').extract())
        item['company_type'] = get_content(
            info[2].xpath('text()').extract())
        item['shareholder_stucture'] = get_content(
            info[3].xpath('text()').extract())
        item['registered_capital'] = get_content(
            info[4].xpath('text()').extract())
        item['contributed_capital'] = get_content(
            info[5].xpath('text()').extract())
        item['registered_address'] = get_content(
            info[6].xpath('text()').extract())
        item['opening_date'] = get_content(
            info[7].xpath('text()').extract())
        item['approved_date'] = get_content(
            info[8].xpath('text()').extract())
        item['registration_authority'] = get_content(
            info[9].xpath('text()').extract())
        item['business_licence'] = get_content(
            info[10].xpath('text()').extract())
        item['institutional_framework'] = get_content(
            info[11].xpath('text()').extract())
        item['tax_registration_num'] = get_content(
            info[12].xpath('text()').extract())
    # Domain/ICP record: second table row holds the data cells.
    # NOTE(review): [1] raises IndexError if the table has fewer than two
    # rows — presumably the page always renders a header row; confirm.
    record = response.xpath(
        '//div[contains(@class, "webRecordBd")]/table/tbody/tr')[1].xpath(
            'td')
    if record:
        item['domain_name'] = get_content(
            record[0].xpath('text()').extract())
        item['domain_date'] = get_content(
            record[1].xpath('text()').extract())
        item['domain_company_type'] = get_content(
            record[2].xpath('text()').extract())
        item['domain_company_name'] = get_content(
            record[3].xpath('text()').extract())
        item['icp'] = get_content(record[4].xpath('text()').extract())
    # Team members: collect avatar URLs ('#'-joined) and bio text.
    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for i in xrange(len(people)):
            avatar_url.extend(
                people[i].xpath('div[@class="avatar"]/img/@src').extract())
            content.extend([
                get_trunk(c) for c in people[i].xpath('p//text()').extract()
            ])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)
    # Fee schedule: first costBd div, one <p> per fee kind in fixed order.
    cost = response.xpath('//div[contains(@class, "costBd")]')[0].xpath(
        'p')
    if cost:
        item['management_fee'] = get_content(
            cost[0].xpath('text()').extract())
        item['prepaid_fee'] = get_content(
            cost[1].xpath('text()').extract())
        item['cash_withdrawal_fee'] = get_content(
            cost[2].xpath('text()').extract())
        item['vip_fee'] = get_content(cost[3].xpath('text()').extract())
        item['transfer_fee'] = get_content(
            cost[4].xpath('text()').extract())
        item['mode_of_payment'] = get_content(
            cost[5].xpath('text()').extract())
    # Contact details live in the *second* costBd div on the page.
    contact = response.xpath('//div[contains(@class, "costBd")]')[1].xpath(
        'p')
    if contact:
        item['contact_address'] = get_content(
            contact[0].xpath('text()').extract())
        item['phone_400'] = get_content(
            contact[1].xpath('text()').extract())
        item['phone'] = get_content(contact[2].xpath('text()').extract())
        item['fax'] = get_content(contact[3].xpath('text()').extract())
        item['email'] = get_content(contact[4].xpath('text()').extract())
    # Bid/guarantee attributes: fixed <li> offsets 3..9; skipFirst drops the
    # leading label text of each entry.
    record = response.xpath(
        '//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        item['is_automatic_bid'] = get_content(
            record[3].xpath('.//text()').extract(), skipFirst=True)
        item['is_equitable_assignment'] = get_content(
            record[4].xpath('.//text()').extract(), skipFirst=True)
        item['trust_fund'] = get_content(
            record[5].xpath('.//text()').extract(), skipFirst=True)
        item['tender_security'] = get_content(
            record[6].xpath('.//text()').extract(), skipFirst=True)
        item['security_mode'] = get_content(
            record[7].xpath('.//text()').extract(), skipFirst=True)
        item['guarantee_institution'] = get_content(
            record[8].xpath('.//text()').extract(), skipFirst=True)
        # When fewer than 10 entries exist this stores False, not None.
        item['business_type'] = len(record) >= 10 and get_content(
            record[9].xpath('.//text()').extract(), skipFirst=True)
    log_empty_fields(item, self.logger)
    return item
def parse_detail(self, response):
    """Parse a monthly property-insurance (财产险) report page.

    Walks every sufficiently large table on the page, extracts one
    (company, income[, share]) row at a time, and yields a
    CaichanxianItem per company.  Table layouts vary (2/3/4 columns);
    once a '份额' (share) column is detected, 3-column rows are read
    with the shifted layout and the share value is recorded.
    """
    title = response.meta['title']
    year, month = self.parse_title(title)
    created = response.meta['created']
    link = response.url
    capital_structure = None
    # Full flattened page text, attached to every yielded item.
    content = ' '.join(
        [get_trunk(c) for c in response.xpath('//p//text()').extract()])
    share = None
    flag = False  # becomes True once a '份额' column has been seen
    for tbody in response.xpath('//tbody'):
        # Only tables with many rows are candidate data tables.
        if len(tbody.xpath('tr')) > 10:
            for tr in tbody.xpath('tr'):
                if len(tr.xpath('td')) == 3:
                    if flag:
                        # name / income / share layout
                        name = get_content(
                            tr.xpath('string(td[1])').extract())
                        income = get_content(
                            tr.xpath('string(td[2])').extract())
                        share = get_content(
                            tr.xpath('string(td[3])').extract())
                    else:
                        # td[1] may hold the capital-structure label
                        name = get_content(
                            tr.xpath('string(td[2])').extract())
                        income = get_content(
                            tr.xpath('string(td[3])').extract())
                        try:
                            if get_content(
                                    tr.xpath('string(td[1])').extract()
                            ).find(u'资') >= 0:
                                capital_structure = get_content(
                                    tr.xpath('string(td[1])').extract())
                        except Exception:
                            # td[1] may be empty/non-text; best-effort only.
                            pass
                elif len(tr.xpath('td')) == 2:
                    name = get_content(tr.xpath('string(td[1])').extract())
                    income = get_content(
                        tr.xpath('string(td[2])').extract())
                elif len(tr.xpath('td')) == 4:
                    try:
                        if get_content(
                                tr.xpath('string(td[1])').extract()).find(
                                    u'资') >= 0:
                            capital_structure = get_content(
                                tr.xpath('string(td[1])').extract())
                    except Exception:
                        # td[1] may be empty/non-text; best-effort only.
                        pass
                    name = get_content(tr.xpath('string(td[3])').extract())
                    income = get_content(
                        tr.xpath('string(td[4])').extract())
                else:
                    continue
                if income and income.find(u'份额') >= 0:
                    flag = True
                # Skip header/subtotal/total rows and unit-label rows.
                if name and income and name.find(
                        u'公司名称') < 0 and name.find(
                            u'小计') < 0 and name.find(u'合计') < 0:
                    if income.find(u'万元') >= 0 or income.find(
                            u'保费') >= 0 or income.find(u'份额') >= 0:
                        continue
                    company = CaichanxianItem()
                    company['title'] = title
                    company['year'] = year
                    company['month'] = month
                    company['link'] = link
                    company['company_name'] = name
                    company['income'] = income
                    company['capital_structure'] = capital_structure
                    company['content'] = content
                    company['created'] = created
                    if flag:
                        company['share'] = share
                    yield company
def _fill_business_info(self, item, business_info):
    """Fill company-registration (工商信息) fields from the given div.

    The div holds four stacked tables; fields are read at fixed row
    offsets (td[2] of each row).
    """
    part1 = business_info.xpath('table[1]//tr')
    item['company_name'] = get_content(
        part1[0].xpath('td[2]/text()').extract())
    item['artificial_person'] = get_content(
        part1[1].xpath('td[2]/text()').extract())
    item['company_type'] = get_content(
        part1[2].xpath('td[2]/text()').extract())
    # '--' is the page's placeholder for "not disclosed".
    item['ownership_structure'] = get_content(
        part1[3].xpath('td[2]/text()').extract()).replace("--", '')
    part2 = business_info.xpath('table[2]//tr')
    item['registered_capital'] = get_content(
        part2[0].xpath('td[2]/text()').extract())
    item['contributed_capital'] = get_content(
        part2[1].xpath('td[2]/text()').extract())
    item['registered_address'] = get_content(
        part2[2].xpath('td[2]/text()').extract())
    part3 = business_info.xpath('table[3]//tr')
    item['opening_date'] = get_content(
        part3[0].xpath('td[2]/text()').extract())
    item['approved_date'] = get_content(
        part3[1].xpath('td[2]/text()').extract())
    item['registration_authority'] = get_content(
        part3[2].xpath('td[2]/text()').extract())
    item['business_licence'] = get_content(
        part3[3].xpath('td[2]/text()').extract())
    item['institutional_framework'] = get_content(
        part3[4].xpath('td[2]/text()').extract())
    item['tax_registration_num'] = get_content(
        part3[5].xpath('td[2]/text()').extract())
    item['business_scope'] = get_content(
        business_info.xpath('table[4]/tr/td[2]/text()').extract())

def _fill_icp_info(self, item, icp_info):
    """Fill domain/ICP-record (备案信息) fields from fixed table rows."""
    item['domain_name'] = get_content(
        icp_info[0].xpath('td[2]/text()').extract())
    item['domain_date'] = get_content(
        icp_info[1].xpath('td[2]/text()').extract())
    item['domain_company_type'] = get_content(
        icp_info[2].xpath('td[2]/text()').extract())
    item['domain_company_name'] = get_content(
        icp_info[3].xpath('td[2]/text()').extract())
    item['ICP_number'] = get_content(
        icp_info[4].xpath('td[2]/text()').extract())
    item['ICP_approval_number'] = get_content(
        icp_info[5].xpath('td[2]/text()').extract())

def parse(self, response):
    """Parse a Wangjia platform archive page into a DanganItem.

    The page may carry two "da-ggxx" divs (business info + ICP info),
    or a single one that is either of the two — disambiguated by its
    row count (exactly 6 rows means ICP info).  Fee, contact, and
    qualification sections are mapped through self.map_ch2en.
    """
    self.logger.info('Parsing Wangjia Archive From <%s>.' % response.url)
    item = DanganItem()
    item['pin'] = response.meta.get('pin')
    item['logo_url'] = get_content(
        response.xpath('//div[@class="pt-logo"]/img/@src').extract())
    web_url = get_content(
        response.xpath('//div[@class="on4"]/a[1]/@href').extract())
    # 'javascript:...' pseudo-links are not real homepage URLs.
    if web_url and 'javascript' not in web_url:
        item['web_url'] = web_url
    # The last tag badge, if recognized, marks a problem/product state.
    if response.xpath('//div[@class="bq-box"]')[0].xpath('.//span'):
        tag = get_content(
            response.xpath('//div[@class="bq-box"]')[0].xpath('.//span')
            [-1].xpath('text()').extract())
        if tag in self.problem_label:
            item['product_state'] = tag
    intro = response.xpath('//div[@class="cen-zk"]')
    item['introduction'] = ''.join(
        [get_trunk(c) for c in intro.xpath('.//text()').extract()])
    title_div = response.xpath('//div[@class="title"]')
    item['launch_time'] = get_content(
        title_div.xpath('span[2]/em/text()').extract()).replace(u"上线", '')
    item['product_name'] = get_content(
        title_div.xpath('h1/text()').extract())
    # Location renders as "province·city" or just a province.
    location = get_content(title_div.xpath('span[1]/em/text()').extract())
    if len(location.split(u'·')) > 1:
        item['province'] = location.split(u'·')[0].strip()
        item['city'] = location.split(u'·')[1].strip()
    else:
        item['province'] = location.strip()
    business_icp = response.xpath('//div[@class="da-ggxx"]')
    if business_icp and len(business_icp) > 1:
        # Both sections present: first div is business info, second is ICP.
        business_info = response.xpath('//div[@class="da-ggxx"]')[0]
        if business_info:
            self._fill_business_info(item, business_info)
        icp_info = response.xpath('//div[@class="da-ggxx"]')[1].xpath(
            'table//tr')
        if icp_info:
            self._fill_icp_info(item, icp_info)
    elif business_icp and len(business_icp) == 1:
        # Single section: exactly 6 rows means it is the ICP table,
        # otherwise treat it as business info.
        icp_info = response.xpath('//div[@class="da-ggxx"]')[0].xpath(
            'table//tr')
        if len(icp_info) == 6:
            self._fill_icp_info(item, icp_info)
        else:
            self._fill_business_info(
                item, response.xpath('//div[@class="da-ggxx"]')[0])
    # Platform fees (平台费用): one <dl> per fee kind, in fixed order.
    plat_fee = response.xpath('//div[@class="da-ptfy"]//dl')
    if plat_fee:
        item['account_fee'] = get_content(
            plat_fee[0].xpath('dt/em/text()').extract())
        item['cash_fee'] = get_content(
            plat_fee[1].xpath('dt/em/text()').extract())
        item['fueling_fee'] = get_content(
            plat_fee[2].xpath('dt/em/text()').extract())
        item['transfer_fee'] = get_content(
            plat_fee[3].xpath('dt/em/text()').extract())
        item['vip_fee'] = get_content(
            plat_fee[4].xpath('dt/em/text()').extract())
    # Contact details (联系方式): label/value pairs mapped via map_ch2en.
    contact = response.xpath('//div[@class="da-lxfs zzfwbox"]//dd')
    for ele in contact:
        key = ele.xpath(
            ".//div[@class='l']/em/text()").extract()[0].strip()
        value = get_content(
            ele.xpath(".//div[@class='r']").xpath("string(.)").extract())
        if key in self.map_ch2en:
            item[self.map_ch2en[key]] = value
    # Qualifications / platform services (实力资质 平台服务).
    basic_info = response.xpath("//div[@class='bgbox-bt zzfwbox']//dd")
    for ele in basic_info:
        key = ele.xpath(
            ".//div[@class='l']/em/text()").extract()[0].strip()
        if key in self.map_ch2en:
            # The guarantor entry uses a differently-classed value div.
            if key == u'担保机构':
                value = get_content(
                    ele.xpath(".//div[@class='r dbjg']").xpath(
                        "string(.)").extract())
            else:
                value = get_content(
                    ele.xpath(".//div[@class='r']").xpath(
                        "string(.)").extract())
            item[self.map_ch2en[key]] = value
    return item
def _collect_pairs(self, rows, data, content_p=None):
    """Harvest (label, value) pairs from table rows into ``data``.

    Cells are read with XPath string() so nested markup is flattened.
    When ``content_p`` is given, labels that have no value are collected
    there (they are prose lines embedded in the table).
    """
    for row in rows:
        key = get_content(row.xpath('string(td[1])').extract())
        value = get_content(row.xpath('string(td[2])').extract())
        if key and value:
            data.append((key, value))
        if content_p is not None and key and not value:
            content_p.append(key)

def parse_detail(self, response):
    """Parse a monthly industry-operation (经营情况) report page.

    The key/value table appears under one of several layouts inside
    #zoom; the first matching layout wins.  Harvested labels are then
    mapped onto JingyingItem fields by substring matching.
    """
    report = JingyingItem()
    report['title'] = response.meta['title']
    year, month = self.parse_title(report['title'])
    report['year'] = year
    report['month'] = month
    report['id'] = response.meta['id']
    report['link'] = response.url
    report['created'] = response.meta['created']
    data = list()
    content_p = list()
    content = None
    primary = response.xpath(
        '//*[@id="zoom"]/div/table/tbody/tr/td/div[2]/table/tbody/tr')
    if len(primary) > 0:
        # Primary layout: data table in div[2], article body in div[3].
        self._collect_pairs(primary, data)
        content = response.xpath(
            '//*[@id="zoom"]/div/table/tbody/tr/td/div[3]/table/tbody/tr/td[2]/p'
        )
    else:
        # Fallback layouts, tried in order; the article body is the whole
        # #zoom span for all of them.
        fallbacks = (
            '//*[@id="zoom"]/table/tbody/tr[1]/td/table/tbody/tr',
            '//*[@id="zoom"]/table/tbody/tr',
            '//*[@id="zoom"]/table/tr',
            '//*[@id="zoom"]/div/table/tbody/tr',
        )
        for xp in fallbacks:
            rows = response.xpath(xp)
            if len(rows) > 0:
                break
        else:
            rows = response.xpath('//*[@id="zoom"]/strong/table/tbody/tr')
        self._collect_pairs(rows, data, content_p)
        content = response.xpath('//span[@id="zoom"]')
    # Map harvested labels onto report fields by substring.  Order
    # matters: e.g. u'保户投资' must be tested before the generic u'投资'.
    # ``flag`` records that the expense row was passed; the same label
    # appearing again afterwards goes into the *2 (payout) fields.
    flag = False
    for key, value in data:
        if key.find(u'收入') >= 0:
            report['income'] = value
        elif key.find(u'保户投资') >= 0:
            report['baohu_xz'] = value
        elif key.find(u'独立账户') >= 0:
            report['duli_xz'] = value
        elif key.find(u'给付') >= 0 or key.find(u'赔付支出') >= 0:
            report['expense'] = value
            flag = True
        elif key.find(u'年金缴费') >= 0:
            report['yanglao_cost'] = value
        elif key.find(u'受托') >= 0:
            report['yanglao_shoutuo'] = value
        elif key.find(u'年金投资管理') >= 0:
            report['yanglao_touzi'] = value
        elif key.find(u'业务') >= 0 or key.find(u'营业') >= 0:
            report['manage_fee'] = value
        elif key.find(u'银行存款') >= 0:
            report['bank_deposits'] = value
        elif key.find(u'投资') >= 0:
            report['invest'] = value
        elif key.find(u'资产总额') >= 0:
            report['amount'] = value
        elif key.find(u'财产险') >= 0:
            if flag:
                report['caichanxian2'] = value
            else:
                report['caichanxian1'] = value
        elif key.find(u'人身险') >= 0:
            if flag:
                report['renshenxian2'] = value
            else:
                report['renshenxian1'] = value
        elif key.find(u'寿险') >= 0:
            if flag:
                report['shouxian2'] = value
            else:
                report['shouxian1'] = value
        elif key.find(u'健康险') >= 0:
            if flag:
                report['jiankangxian2'] = value
            else:
                report['jiankangxian1'] = value
        elif key.find(u'意外') >= 0:
            if flag:
                report['yiwaixian2'] = value
            else:
                report['yiwaixian1'] = value
    # Python-2-only ``encoding`` kwarg dropped: 'utf-8' was already the
    # Py2 default, and json.dumps raises TypeError on it under Python 3.
    report['data'] = json.dumps(data, ensure_ascii=False)
    report['raw_content'] = content.extract_first()
    if len(content_p) > 1:
        report['content'] = ' '.join(content_p)
    else:
        # NOTE(review): 'or' inside XPath evaluates as a boolean, not a
        # node-set union ('|' may have been intended).  Kept verbatim to
        # preserve current behavior — confirm against real pages.
        report['content'] = ''.join([
            get_trunk(c) for c in content.xpath(
                './/p/text() or string(span)').extract()
        ])
    report['image_url'] = '#'.join(
        [get_trunk(c)
         for c in content.xpath('.//img/@src').extract()]) or None
    yield report
def parse_detail(self, response):
    """Parse a monthly pension-insurance (养老险) report page.

    Scans every sufficiently large table, reads one company per row
    (4-column layout: contributions only; 7-column layout:
    contributions + assets), and yields a YanglaoxianItem per company.
    """
    title = response.meta['title']
    year, month = self.parse_title(title)
    created = response.meta['created']
    link = response.url
    capital_structure = None
    # Full flattened page text, attached to every yielded item.
    content = ' '.join(
        [get_trunk(c) for c in response.xpath('//p//text()').extract()])
    # jf = 缴费 (contributions), zc = 资产 (assets);
    # shoutuo/touzi/weituo = entrusted / invested / commissioned.
    shoutuo_jf = None
    touzi_jf = None
    weituo_jf = None
    shoutuo_zc = None
    touzi_zc = None
    weituo_zc = None
    for tbody in response.xpath('//tbody'):
        # Only tables with several rows are candidate data tables.
        if len(tbody.xpath('tr')) > 5:
            for tr in tbody.xpath('tr'):
                if len(tr.xpath('td')) == 4:
                    # 4 columns: name, weituo, shoutuo, touzi contributions.
                    name = get_content(tr.xpath('string(td[1])').extract())
                    weituo_jf = get_content(
                        tr.xpath('string(td[2])').extract())
                    shoutuo_jf = get_content(
                        tr.xpath('string(td[3])').extract())
                    touzi_jf = get_content(
                        tr.xpath('string(td[4])').extract())
                elif len(tr.xpath('td')) == 7:
                    # 7 columns: name, then jf and zc triples (note the
                    # column order differs from the 4-column layout).
                    name = get_content(tr.xpath('string(td[1])').extract())
                    shoutuo_jf = get_content(
                        tr.xpath('string(td[2])').extract())
                    touzi_jf = get_content(
                        tr.xpath('string(td[3])').extract())
                    weituo_jf = get_content(
                        tr.xpath('string(td[4])').extract())
                    shoutuo_zc = get_content(
                        tr.xpath('string(td[5])').extract())
                    touzi_zc = get_content(
                        tr.xpath('string(td[6])').extract())
                    weituo_zc = get_content(
                        tr.xpath('string(td[7])').extract())
                else:
                    continue
                # Skip category rows (u'企业') and header rows (u'简称').
                if shoutuo_jf and shoutuo_jf.find(u'企业') >= 0:
                    continue
                if name and name.find(u'简称') < 0:
                    company = YanglaoxianItem()
                    company['title'] = title
                    company['year'] = year
                    company['month'] = month
                    company['link'] = link
                    company['company_name'] = name
                    company['shoutuo_jf'] = shoutuo_jf
                    company['touzi_jf'] = touzi_jf
                    company['weituo_jf'] = weituo_jf
                    company['shoutuo_zc'] = shoutuo_zc
                    company['touzi_zc'] = touzi_zc
                    company['weituo_zc'] = weituo_zc
                    company['content'] = content
                    company['created'] = created
                    yield company