def parse_sub_company_list(self, response):
    company = response.meta['company']
    sub_company_list = list()
    for entry in response.xpath('//div[@class="ge"]/ul/li'):
        name = get_content(entry.xpath('string(p[1])').extract())
        address = get_content(entry.xpath('string(p[2])').extract())
        phone = get_content(entry.xpath('string(p[3])').extract())
        sub_company_list.append({
            'name': name,
            'address': address,
            'phone': phone
        })
    company['sub_company_list'] = json.dumps(sub_company_list, encoding="UTF-8", ensure_ascii=False)
    # Current insurance product info.
    yield scrapy.FormRequest(
        url='http://icid.iachina.cn/ICID/front/viewAllPros.do',
        method='POST',
        formdata={
            'columnid': company['column_id'],
            'internetInformationNo': company['info_no'],
            'informationno': company['info_no'],
            'zj': company['zj']
        },
        meta={'company': company, 'type': 'cur'},
        callback=self.parse_product_list,
        dont_filter=True)
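# The parsers in this file lean on two text-cleaning helpers that are defined
# elsewhere in the project. A minimal sketch of plausible implementations,
# assuming `get_trunk` strips one extracted string and `get_content` joins a
# selector.extract() list; the real helpers (and their exact exclude/skipBlank/
# skipFirst semantics) may differ:
def get_trunk(text):
    # Strip surrounding whitespace from a single extracted string.
    return text.strip() if text else ''

def get_content(extracted, exclude=(), skipBlank=True, skipFirst=False):
    # Join an extract() list into one cleaned string, dropping excluded values.
    parts = extracted[1:] if skipFirst else extracted
    trunks = [get_trunk(p) for p in parts]
    if skipBlank:
        trunks = [t for t in trunks if t]
    trunks = [t for t in trunks if t not in exclude]
    return ' '.join(trunks) if trunks else None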
def parse_company_detail(self, response):
    company = response.meta['company']
    data_dict = dict()
    for entry in response.xpath('//div[@class="jie_nei"]/ul/li'):
        key = get_content(entry.xpath('p[1]/text()').extract())
        value = get_content(entry.xpath('string(p[2])').extract())
        data_dict[key] = value
    company['detail_info'] = json.dumps(data_dict, encoding="UTF-8", ensure_ascii=False)
    # Second-level branch company info.
    yield scrapy.FormRequest(
        url='http://icid.iachina.cn/ICID/front/viewAllBranch.do',
        method='POST',
        formdata={
            'columnid': company['column_id'],
            'internetInformationNo': company['info_no'],
            'informationno': company['info_no'],
            'zj': company['zj']
        },
        meta={'company': company},
        callback=self.parse_sub_company_list,
        dont_filter=True)
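# Portability note for the json.dumps(..., encoding="UTF-8") calls above and
# in the other iachina parsers: the encoding keyword exists only on Python 2
# (this codebase also uses xrange/has_key elsewhere); on Python 3 the
# equivalent call would simply be json.dumps(obj, ensure_ascii=False).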
def parse(self, response):
    attr_type = response.url.split('/')[-1]
    symbol = (self.mapping.get(response.url), self.url_attr_map[attr_type], response.url)
    self.logger.info('Parsing ID.%d 39health Symptom %s Info From <%s>.' % symbol)
    item = SymptomDetailInfoItem()
    item['s_id'] = symbol[0]
    if attr_type == '':
        try:
            item['name'] = get_content(response.xpath('//h1/text()').extract())
            item['description'] = get_content(response.xpath('//dd[@id="intro"]/p/text()').extract())
        except:
            pass
    else:
        try:
            item[self.url_attr_db_map[attr_type]] = \
                get_content(response.xpath('//div[@class="item catalogItem"]').extract())
        except:
            pass
    return item
def parse(self, response):
    tc = self.get_thread_category_from_url(response.url)
    if not tc[0] or not tc[1]:
        self.logger.warning("Invalid Wangjia News Item From <%s>." % response.url)
        return None
    symbol = (tc[0], self.tab[tc[1]], response.url)
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning("No.%s Wangjia News %s Item From <%s> Maybe Limited." % symbol)
        return None
    self.logger.info("Parsing No.%s Wangjia News %s Item From <%s>." % symbol)
    item = XinwenItem()
    item["thread"] = int(symbol[0])
    item["category_id"] = tc[1]
    item["source"] = symbol[2]
    article = response.xpath('//div[@class="con_news"]')
    item["title"] = get_content(article.xpath("h1/text()").extract())
    subtitle = article.xpath('ul/li[@class="n_time"]/text()').extract()[0].encode("utf8").split(":")
    item["created"] = get_content(subtitle[1].split())
    item["author"] = get_content(subtitle[-1].split())
    item["summary"] = get_content(article.xpath('ul/li[@class="a_abstract"]/span/text()').extract())
    body = article.xpath('ul/li[@class="news_con_p"]')
    item["content"] = "".join([get_trunk(c) for c in body.xpath(".//text()").extract()])
    item["raw_content"] = get_content(body.extract())
    item["image_url"] = ("#".join([self.modify_image_url(get_trunk(c))
                                   for c in body.xpath(".//img/@src").extract()]) or None)
    return item
def parse(self, response):
    symbol = (self.timestamp, response.url)
    self.logger.info('Parsing %s Wangjia Rating From <%s>.' % symbol)
    rating_list = []
    ratings = response.xpath('//div[@class="main_con1"]/table/tbody/tr')
    for rt in ratings:
        content = rt.xpath('td')
        # Decimal fields can be transformed by django itself.
        item = PingjiItem()
        item['timestamp'] = symbol[0]
        item['name'] = get_content(content[0].xpath('a/text()').extract())
        item['exponent'] = get_content(content[1].xpath('.//text()').extract())
        item['launch_time'] = get_content(content[2].xpath('.//text()').extract())
        item['location'] = get_content(content[3].xpath('span/text()').extract())
        item['deal'] = get_content(content[4].xpath('.//text()').extract())
        item['popularity'] = get_content(content[5].xpath('.//text()').extract())
        item['profit'] = get_content(content[6].xpath('.//text()').extract())
        item['dispersity'] = get_content(content[7].xpath('.//text()').extract())
        item['mobility'] = get_content(content[8].xpath('.//text()').extract())
        item['transparency'] = get_content(content[9].xpath('.//text()').extract())
        #log_empty_fields(item, self.logger)
        if item.get_uk():
            rating_list.append(item)
    return rating_list
def parse(self, response):
    attr_type = response.url.split('/')[-1]
    symbol = (self.mapping.get(response.url), self.url_attr_map[attr_type], response.url)
    self.logger.info('Parsing ID.%d 39health Disease %s Info From <%s>.' % symbol)
    item = DiseaseDetailInfoItem()
    item['d_id'] = symbol[0]
    if attr_type == 'jbzs':
        try:
            item['name'] = get_content(response.xpath('//h1/text()').extract())
            item['description'] = get_content(response.xpath('//div[@class="chi-know"]').extract())
        except:
            pass
    else:
        try:
            item[self.url_attr_db_map[attr_type]] = get_content(
                response.xpath('//div[@class="art-box"]').extract())
        except:
            pass
    return item
def parse(self, response):
    self.logger.info('Parsing Yinrendai Tender List Info From <%s>.' % response.url)
    item_list = []
    tender_list = response.xpath('//li[@class="clearfix"]')
    for tender in tender_list:
        item = ToubiaoItem()
        item['loan_type'] = get_content(tender.xpath('div/@class').extract()).split('_')[-1]
        left = tender.xpath('div/div[@class="leftpart"]')
        if left:
            item['loan_url'] = get_content(left.xpath('./h3/a/@href').extract())
            item['pin'] = self.get_pin_from_url(get_content(left.xpath('h3/a/@href').extract()))
            item['loan_description'] = get_content(left.xpath('h3/a/text()').extract())
            item['warrant_icon'] = get_content(left.xpath('h3/a/span/@class').extract())
            item['progress'] = get_content(left.xpath('div[@class="l bidDetail"]/p/text()').extract())
            item['volume'] = get_content(left.xpath('div[@class="l bid_total"]/h4/span/text()').extract())
            item['interest_rate'] = get_content(left.xpath('div[@class="l bid_rate"]/h4/span/text()').extract())
            item['term'] = get_content(left.xpath('div[@class="l bidInfor"]/h4/span/text()').extract())
        right = tender.xpath('div/div[@class="rightpart"]')
        if right:
            item['status'] = self.status_list.get(get_content(right.xpath('div/@class').extract()))
        item_list.append(item)
    return item_list
def parse_news_detail(self, response):
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = get_content(response.xpath('//title/text()').extract())
    news['created'] = get_content(response.xpath('//small/span[last()]/text()').extract())
    news['author'] = response.xpath('//meta[@name="author"]/@content').extract_first()
    news['summary'] = response.xpath('//meta[@name="description"]/@content').extract_first()
    news['keywords'] = response.xpath('//meta[@name="keywords"]/@content').extract_first()
    news['category'] = get_content(response.xpath('//small/span[1]/a/text()').extract())
    article = response.xpath('//div[@class="article-txt"]')
    news['raw_content'] = article.extract_first()
    news['content'] = ''.join([get_trunk(c) for c in article.xpath('.//text()').extract()])
    news['image_url'] = '#'.join([get_trunk(c) for c in article.xpath('.//img/@src').extract()]) or None
    # print(news)
    yield news
def parse(self, response):
    symbol = (self.timestamp, response.url)
    self.logger.info('Parsing %s Wangjia Rating From <%s>.' % symbol)
    rating_list = []
    ratings = response.xpath('//div[@class="mod-tablelists"]/table/tbody/tr')
    for rt in ratings:
        content = rt.xpath('td')
        # Decimal fields can be transformed by django itself.
        item = PingjiItem()
        item['timestamp'] = symbol[0]
        item['name'] = get_content(content[1].xpath('div/a/text()').extract())
        item['exponent'] = get_content(content[2].xpath('div/em/text()').extract())
        item['launch_time'] = get_content(content[3].xpath('.//text()').extract())
        item['location'] = get_content(content[4].xpath('.//text()').extract())
        item['deal'] = get_content(content[5].xpath('.//text()').extract())
        item['popularity'] = get_content(content[6].xpath('.//text()').extract())
        item['lever'] = get_content(content[7].xpath('.//text()').extract())
        item['dispersity'] = get_content(content[8].xpath('.//text()').extract())
        item['mobility'] = get_content(content[9].xpath('.//text()').extract())
        item['transparency'] = get_content(content[10].xpath('.//text()').extract())
        #log_empty_fields(item, self.logger)
        if item.get_uk():
            rating_list.append(item)
    return rating_list
def parse_site_info(self, response):
    company = SiteInfoItem()
    company['link'] = response.url
    company['code'] = response.meta['code']
    site_info = response.xpath('//div[@id="site-plate"]')
    for tr in site_info.xpath('table[@class="table"]//tr'):
        key = get_content(tr.xpath('string(td[1])').extract())
        value = get_content(tr.xpath('string(td[2])').extract())
        if not key or not value:
            continue
        if key.find(u'平台地址') >= 0:
            company['website'] = value
        elif key.find(u'平台简称') >= 0:
            company['short_name'] = value
        elif key.find(u'上线运营时间') >= 0:
            company['online_time'] = value
        elif key.find(u'许可') >= 0:
            company['license'] = value
        elif key.find(u'应用') >= 0:
            company['app'] = value
        elif key.find(u'微信') >= 0:
            company['wechat'] = value
    certification = dict()
    for tr in site_info.xpath('table[@class="small-table"]/tbody/tr'):
        key = get_content(tr.xpath('string(td[1])').extract())
        value = get_content(tr.xpath('string(td[2])').extract())
        certification[key] = value
    company['certification'] = json.dumps(certification, encoding="UTF-8", ensure_ascii=False)
    return company
def parse_flow_index(self, response, thread, name, link):
    platform_flow = FlowItem()
    platform_flow['thread'] = thread
    platform_flow['name'] = name
    platform_flow['link'] = link
    flow_date = datetime.strptime(self.flow_date, '%Y-%m-%d')
    flow_monitoring = response.xpath('//div[@class="flow-monitoring"]')
    for index_info in flow_monitoring.xpath('./div[@class="bd"]/dl'):
        index_list = list(json.loads(index_info.xpath('./script/text()').re_first(r'=(.*?);').strip()))
        platform_flow_item = copy.deepcopy(platform_flow)
        platform_flow_item['institution'] = get_content(index_info.xpath('./dt/text()').extract())
        platform_flow_item['date'] = flow_date.strftime('%Y-%m-%d')
        platform_flow_item['flow'] = index_list[-1]
        yield platform_flow_item
        # history
        # for i, index_value in enumerate(index_list[::-1]):
        #     platform_flow_item = copy.deepcopy(platform_flow)
        #     platform_flow_item['institution'] = get_content(index_info.xpath('./dt/text()').extract())
        #     platform_flow_item['date'] = (flow_date - timedelta(days=i)).strftime('%Y-%m-%d')
        #     platform_flow_item['flow'] = index_value
        #     yield platform_flow_item
    platform_flow_item = copy.deepcopy(platform_flow)
    platform_flow_item['institution'] = '综合指数'  # "composite index"
    platform_flow_item['date'] = flow_date.strftime('%Y-%m-%d')
    platform_flow_item['flow'] = get_content(flow_monitoring.xpath('./div[@class="hd"]/strong/text()').extract())
    yield platform_flow_item
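# Why the re_first(r'=(.*?);') call above works: the page is assumed to embed
# each index series as an inline JavaScript assignment. A standalone
# reproduction of the extraction step, with a hypothetical script body:
import json
import re

script_text = 'var flowIndex = [102, 98, 105];'  # hypothetical embedded <script> content
index_list = list(json.loads(re.search(r'=(.*?);', script_text).group(1).strip()))
assert index_list[-1] == 105  # parse_flow_index keeps only the latest value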
def parse(self, response):
    item_list = []
    content = response.xpath('//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')
    for sel_ct in content:
        province_name = get_content(sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()').extract())
        province_id = ProvinceItem.get_id_by_name(province_name)
        plat_list = sel_ct.xpath('ul[@class="til_cn"]/li')
        for sel_pt in plat_list:
            item = WentiItem()
            item['name'] = get_content(sel_pt.xpath('a/text()').extract())
            purl = get_content(sel_pt.xpath('a/@purl').extract()).split('/')
            while not purl[-1]:
                purl.pop()
            item['pin'] = purl.pop()
            item['province_id'] = province_id
            item['event_category'] = self.get_event_category_by_classname(
                get_content(sel_pt.xpath('i/@class').extract()))
            item_list.append(item)
    return item_list
def parse_exposure_detail(self, response):
    exposure = ExposureItem()
    exposure['thread'] = self.get_thread_from_url(response.url)
    exposure['source'] = response.url
    exposure['title'] = get_content(response.xpath('//span[@id="thread_subject"]/text()').extract())
    poston = response.xpath('(//div[@class="authi"])[2]/em/text()').extract_first()
    exposure['created'] = poston[poston.index(' ') + 1:]
    exposure['name'] = get_content(response.xpath('//div[@class="typeoption"]//tr[1]/td/text()').extract())
    exposure['link'] = get_content(response.xpath('//div[@class="typeoption"]//tr[2]/td/a/text()').extract())
    exposure['reason'] = get_content(response.xpath('//div[@class="typeoption"]//tr[3]/td/text()').extract())
    body = response.xpath('//td[contains(@id, "postmessage")]')
    exposure['content'] = ''.join([get_trunk(c) for c in body.xpath('.//text()').extract()])
    exposure['raw_content'] = body.extract_first()
    exposure['image_url'] = '#'.join([self.modify_image_url(get_trunk(c)) for c in response.xpath(
        '//ignore_js_op//img[re:test(@zoomfile, "^data")]/@zoomfile').extract()]) or None
    # exposure['image_url'] = response.xpath('//ignore_js_op//img[re:test(@src, "^data")]/@src').extract()
    # print(exposure)
    yield exposure
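# Note on the '//ignore_js_op//img[re:test(@zoomfile, "^data")]' selector
# above: re:test() is the EXSLT regular-expressions extension that Scrapy
# selectors register by default, so the predicate keeps only images whose
# zoomfile attribute starts with "data".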
def parse(self, response):
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Yinrendai Bidder List Info From <%s>' % symbol)
    self.object = ToubiaoItem.get_object_by_pk(symbol[0])
    item_list = []
    record = response.xpath('//table[@class="bidRecord"]//tr')
    for row in record:
        item = BiaorenItem()
        detail = row.xpath('.//td')
        if not detail:
            continue
        item['pin'] = self.object.pin
        item['bid_nikename'] = get_content(detail[0].xpath('text()').extract())
        item['bid_amount'] = get_content(detail[1].xpath('text()').extract())
        item['bid_time'] = get_content(detail[2].xpath('text()').extract())
        item_list.append(item)
    return item_list
def parse(self, response):
    self.logger.info('Parsing 39 Disease URLs From <%s>.' % response.url)
    item_list = []
    elements = response.xpath('//div[@class="res_list"]')
    for ele in elements:
        item = DiseaseItem()
        item['name'] = get_content(ele.xpath('dl/dt/h3/a/text()').extract())
        item['link'] = get_content(ele.xpath('dl/dt/h3/a/@href').extract())
        try:
            item['alias'] = get_content(ele.xpath('dl/dt/cite/text()').extract())
            symptoms_list = ele.xpath('div/p/a')
            relevant_symptoms = []
            for s in symptoms_list:
                rs = get_content(s.xpath('text()').extract())
                if rs:
                    relevant_symptoms.append(rs)
            item['relevant_symptoms'] = ' '.join(relevant_symptoms)
        except:
            pass
        item_list.append(item)
    return item_list
def parse(self, response):
    tc = self.get_thread_category_from_url(response.url)
    if not tc[0] or not tc[1]:
        self.logger.warning('Invalid Wangjia News Item From <%s>.' % response.url)
        return None
    symbol = (tc[0], self.tab[tc[1]], response.url)
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia News %s Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia News %s Item From <%s>.' % symbol)
    item = XinwenItem()
    item['thread'] = int(symbol[0])
    item['category_id'] = tc[1]
    item['source'] = symbol[2]
    article = response.xpath('//div[@class="show-box"]')
    item['title'] = get_content(article.xpath('h1/text()').extract())
    subtitle = article.xpath('div[@class="s-bq"]/span')
    item['created'] = subtitle[0].xpath('text()').extract()[0]
    if len(subtitle) >= 3:
        item['author'] = get_content(subtitle[2].xpath('text()').extract()).split(u'：')[1]
    item['summary'] = get_content(article.xpath('div[@class="s-zy"]/span/text()').extract())
    body = article.xpath('div[@class="c-cen"]')
    item['content'] = ''.join([get_trunk(c) for c in body.xpath('.//text()').extract()])
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join([self.modify_image_url(get_trunk(c))
                                  for c in body.xpath('.//img/@src').extract()]) or None
    self.logger.info(item)
    return item
def parse_detail(self, response):
    member = MemberItem()
    info = response.xpath('//div[@id="tytext"]')
    member['name'] = get_content(info.xpath('h1/text()').extract())
    member['date'] = get_content(info.xpath('p[@class="tytdate"]/text()').extract())
    member['link'] = response.url
    if len(info.xpath('./div/p')) > 0:
        for p in info.xpath('./div/p'):
            content = get_content(p.xpath('string(.)').extract())
            print(member['name'])
            print(content)
            print('--------1')
            if content is None:
                continue
            if content.find(u'网址') >= 0:
                member['website'] = content.split(':')[-1]
            elif content.find(u'电话') >= 0:
                member['phone'] = content.split(':')[-1]
            elif content.find(u'地址') >= 0:
                member['address'] = content.split(':')[-1]
            elif content.find(u'邮编') >= 0:
                member['zip'] = content.split(':')[-1]
    elif len(info.xpath('./p')) < 4:
        content = info.xpath('string(./p[2])').extract_first().split('\n')
        for s in content:
            print(s)
            print('--------2')
            value = get_trunk(s.split(u'：')[-1])
            if s.find(u'网址') >= 0:
                member['website'] = value
            elif s.find(u'电话') >= 0:
                member['phone'] = value
            elif s.find(u'地址') >= 0:
                member['address'] = value
            elif s.find(u'邮编') >= 0:
                member['zip'] = value
    else:
        for p in info.xpath('./p'):
            content = get_content(p.xpath('string(.)').extract())
            print(member['name'])
            print(content)
            print('--------3')
            if content is None:
                continue
            if content.find(u'网址') >= 0:
                member['website'] = content.split(':')[-1]
            elif content.find(u'电话') >= 0:
                member['phone'] = content.split(':')[-1]
            elif content.find(u'地址') >= 0:
                member['address'] = content.split(':')[-1]
            elif content.find(u'邮编') >= 0:
                member['zip'] = content.split(':')[-1]
    # 'website' is only set when a row matched above, so guard the lookup.
    if member.get('website') and member['website'][0] == '/':
        member['website'] = 'http:' + member['website']
    yield member
def parse_list(self, response):
    for member in response.xpath('//div[@class="memberab_lsit" or @class="memtab_list"]/ul/li/a'):
        name = get_content(member.xpath('text()').extract())
        link = 'http://old.iachina.cn/' + get_content(member.xpath('@href').extract())
        yield scrapy.Request(url=link, callback=self.parse_detail, dont_filter=True)
def parse_report_list(self, response):
    for report in response.xpath('//ul[@class="reportList"]/li/a'):
        title = get_content(report.xpath('./text()').extract())
        link = 'http://www.dailuopan.com' + get_content(report.xpath('./@href').extract())
        print(link)
        yield scrapy.Request(url=link,
                             meta={'category': response.meta['category']},
                             callback=self.parse_detail,
                             dont_filter=True)
def parse_govern_info(self, response):
    name = get_content(response.xpath('//div[@class="comp-intro"]')
                       .xpath('.//div[@class="intro-txt"]/span')[0]
                       .xpath('string(.)').extract())
    company = GovernInfoItem()
    company['link'] = response.url
    company['name'] = name
    company['code'] = response.meta['code']
    govern_info = response.xpath('//div[@id="govern-info"]')
    company['structure'] = get_content(
        response.xpath('//div[@class="mask"]/img[@class="mask-img"]/@src').extract())
    relation = dict()
    for tr in govern_info.xpath('table[2]/tbody/tr'):
        key = get_content(tr.xpath('string(td[1])').extract())
        value = get_content(tr.xpath('string(td[2])').extract())
        relation[key] = value
    company['relation'] = json.dumps(relation, encoding="UTF-8", ensure_ascii=False)
    controller = govern_info.xpath('table[3]/tbody/tr[1]/td/text()').extract()
    company['controller'] = json.dumps(controller, encoding="UTF-8", ensure_ascii=False)
    shareholder_list = list()
    for tr in govern_info.xpath('table[4]/tbody/tr'):
        shareholder_list.append([
            get_trunk(item) for item in tr.xpath('td//text()').extract()
            if get_trunk(item) != ''
        ])
    company['shareholder_list'] = json.dumps(shareholder_list, encoding="UTF-8", ensure_ascii=False)
    manager_list = list()
    for tr in govern_info.xpath('table[5]/tbody/tr'):
        manager_list.append([
            get_trunk(item) for item in tr.xpath('td//text()').extract()
            if get_trunk(item) != ''
        ])
    company['manager_list'] = json.dumps(manager_list, encoding="UTF-8", ensure_ascii=False)
    return company
def parse(self, response):
    self.logger.info('Parsing Wangjia Problem Platform From <%s>.' % response.url)
    platform_list = []
    platforms = response.xpath('//div[@class="wtpt"]/div/table/tbody/tr')
    for rt in platforms:
    #for idx, rt in enumerate(platforms[1:]):
        content = rt.xpath('td')
        item = WentiItem()
        item['name'] = get_content(content[1].xpath('.//text()').extract())
        item['problem_time'] = get_content(content[2].xpath('text()').extract(), exclude=('-'))
        item['launch_time'] = get_content(content[3].xpath('text()').extract(), exclude=('-'))
        item['registered_capital'] = get_content(content[4].xpath('text()').extract(), exclude=('-'))
        #if idx == 179: item['province_id'] = 22
        #else:
        province_name = get_content(content[5].xpath('text()').extract())
        item['province_id'] = ProvinceItem.get_id_by_name(province_name)
        if item['province_id'] is None:
            item.pop('province_id')
        #print item.get_uk(), province_name, item['province_id']
        item['accounted_revenue'] = get_content(content[6].xpath('text()').extract(), exclude=('-'))
        item['involved_passenger'] = get_content(content[7].xpath('text()').extract(), exclude=('-'))
        item['event_category'] = get_content(content[8].xpath('text()').extract(), exclude=('-'))
        #log_empty_fields(item, self.logger)
        if item.get_uk():
            platform_list.append(item)
    return platform_list
def parse_company_list(self, response):
    for member_info in response.xpath('//*[@id="hysjbox"]/div[2]/table/tbody/tr'):
        name = get_content(member_info.xpath('string(./td[2])').extract())
        if MemberItem.get_member(name=name):
            member = MemberItem()
            member['name'] = name
            member['position'] = get_content(member_info.xpath('string(./td[3])').extract())
            member['represent'] = get_content(member_info.xpath('string(./td[4])').extract())
            member['type'] = get_content(member_info.xpath('string(./td[5])').extract())
            yield member
def parse(self, response):
    symbol = (self.mapping.get(response.url), response.url)
    self.logger.info("Parsing ID.%d 39Health News Disease Detail From <%s>" % symbol)
    self.object = NewsListItem.get_object_by_pk(symbol[0])
    item = XinwenItem()
    left = response.xpath('//div[@class="art_left"]')
    if left:
        item['title'] = get_content(left.xpath('div/h1/text()').extract())
        info = left.xpath('div/div[@class="art_info"]')
        detail = info.xpath('div[@class="date"]//em')
        item['time'] = get_content(detail[0].xpath('text()').extract())
        source = detail[1].xpath('a')
        if source:
            item['source_website_link'] = get_content(source.xpath('@href').extract())
            item['source_website'] = get_content(source.xpath('text()').extract())
        else:
            item['source_website'] = get_content(detail[1].xpath('text()').extract())
        item['source_author'] = get_content(detail[2].xpath('text()').extract(), skipBlank=False)
        item['summary'] = get_content(left.xpath('div/p[@class="summary"]/text()').extract())
        item['content'] = get_content(left.xpath('div/div[@class="art_con"]').extract())
    return item
def parse_news_detail(self, response):
    news = NewsItem()
    news['thread'] = self.get_thread_from_url(response.url)
    news['source'] = response.url
    news['title'] = get_content(response.xpath('//h1/text()').extract())
    news['created'] = response.meta['created']
    news['author'] = response.meta['author']
    news['category'] = response.meta['category']
    news['summary'] = response.meta['summary']
    article = response.xpath('//td[@id="article_content"]')
    news['raw_content'] = article.extract_first()
    news['content'] = ''.join([get_trunk(c) for c in article.xpath(
        './/p[contains(@class, "ke-editor-inner-p")]/text()').extract()])
    news['image_url'] = '#'.join([self.modify_image_url(get_trunk(c))
                                  for c in article.xpath('.//img/@src').extract()]) or None
    yield news
def parse(self, response):
    item_list = []
    plats = response.xpath('//div[@class="c_module2 clear"]/div[@class="main"]/div[@class="warp"]/div[@class="c_modreg"]/ul/li')
    for plat in plats:
        item = FeatureItem()
        url = get_content(plat.xpath('a/@href').extract())
        purl = url.split('/')
        while purl and not purl[-1]:
            purl.pop()
        if purl:
            item['pin'] = purl.pop().split('.')[0]
            if item['pin'] in ['www', 'statistics', '']:
                continue
        item['name'] = get_content(plat.xpath('a/text()').extract())
        item['link'] = url
        item_list.append(item)
    return item_list
def parse_platform_list(self, response):
    for info in response.xpath('//tbody/tr'):
        href = 'http://www.dailuopan.com' + info.xpath('./td[last()]/a/@href').extract_first()
        name = get_content(info.xpath('./td[2]/a[1]/text()').extract())
        yield scrapy.Request(url=href,
                             callback=self.parse_detail,
                             meta={'name': name, 'link': href},
                             dont_filter=True)
def parse(self, response):
    self.logger.info('Parsing Wangjia Rating Item URLs From <%s>.' % response.url)
    item = ExporterItem()
    elements = response.xpath('//table[@id="rateTable_body"]/tbody/tr')
    for ele in elements:
        item.set_record(self.url_prefix + get_content(ele.xpath('td/a[@class="pname"]/@href').extract()))
    return item
def parse(self, response):
    self.logger.info('Parsing 39Health News Disease URLs From <%s>.' % response.url)
    item_list = []
    elements = response.xpath('//div[@class="listbox"]//ul')
    for ele in elements:
        detail_list = ele.xpath('li')
        for detail in detail_list:
            item = NewsListItem()
            item['category_id'] = self.category_id
            item['link'] = get_content(detail.xpath('span/a/@href').extract())
            item['title'] = get_content(detail.xpath('span')[0].xpath('a/text()').extract())
            item['time'] = get_content(detail.xpath('span')[1].xpath('text()').extract(), skipBlank=False)
            if not self.judgeYear(item['time']):
                continue
            item_list.append(item)
    return item_list
def parse_cooperation_product_list(self, response):
    cooperation = response.meta['cooperation']
    product_list = list()
    for entry in response.xpath('//div[@class="xz_nei_lxf"]/ul/li'):
        actual_name = get_content(entry.xpath('string(p[1])').extract())
        record_name = get_content(entry.xpath('string(p[2])').extract())
        product_list.append({
            'actual_name': actual_name,
            'record_name': record_name
        })
    if response.meta['type'] == 'cur':
        cooperation['cur_product_list'] = json.dumps(product_list, encoding="UTF-8", ensure_ascii=False)
        # History.
        company = cooperation['company']
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllProHis.do',
            method='POST',
            formdata={
                'columnid': company.column_id,
                'internetInformationNo': company.info_no,
                'zj': company.zj,
                'terraceNo': cooperation['terrace_no'],
                'oldTerraceNo': cooperation['old_terrace_no'],
                'type': cooperation['flag'],
                'comType': cooperation['type']
            },
            meta={'cooperation': cooperation, 'type': 'his'},
            callback=self.parse_cooperation_product_list,
            dont_filter=True)
    else:
        cooperation['his_product_list'] = json.dumps(product_list, encoding="UTF-8", ensure_ascii=False)
        yield cooperation
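# Note on the cur/his round trip above: the method is registered as its own
# callback, and meta['type'] flips from 'cur' to 'his', so the first pass
# stores the current product list and requests the history endpoint, while the
# second pass stores the history and finally yields the cooperation item.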
def parse_exposure_list(self, response):
    for exposure_abs in response.xpath('//div[contains(@class, "item")]'):
        href = exposure_abs.xpath('.//div[@class="forum-main left"]/a/@href').extract_first()
        title = get_content(exposure_abs.xpath('.//div[@class="forum-main left"]/a/div/text()').extract_first())
        yield scrapy.Request(url="http://www.p2peye.com" + href,
                             callback=self.parse_exposure_detail,
                             meta={'title': title},
                             dont_filter=True)
def parse_cooperation_detail(self, response):
    cooperation = response.meta['cooperation']
    company = cooperation['company']
    for entry in response.xpath('//div[@class="ppp"]/p'):
        key = get_content(entry.xpath('span/text()').extract())
        value = get_content(entry.xpath('text()').extract()).replace(u'：', '')
        if u'全称' in key:
            cooperation['full_name'] = value
        elif u'简称' in key:
            cooperation['short_name'] = value
        elif u'地址' in key:
            cooperation['website'] = value
        elif u'备案' in key:
            cooperation['records'] = value
        elif u'范围' in key:
            cooperation['scope'] = value
        elif u'起始' in key:
            cooperation['start_date'] = value
        elif u'终止' in key:
            cooperation['end_date'] = value
    yield scrapy.FormRequest(
        url='http://icid.iachina.cn/ICID/front/viewAllPro.do',
        method='POST',
        formdata={
            'columnid': company.column_id,
            'internetInformationNo': company.info_no,
            'zj': company.zj,
            'terraceNo': cooperation['terrace_no'],
            'oldTerraceNo': cooperation['old_terrace_no'],
            'type': cooperation['flag'],
            'comType': cooperation['type']
        },
        meta={'cooperation': cooperation, 'type': 'cur'},
        callback=self.parse_cooperation_product_list,
        dont_filter=True)
def parse_list(self, response):
    for report_item in response.xpath('//table[contains(@id, "ListC_Info_LstC_Info")]/tr'):
        title = get_content(report_item.xpath('.//td[@class="hui14"]//a/text()').extract())
        id = get_content(report_item.xpath('.//td[@class="hui14"]//a/@id').re(r'\d+'))
        link = 'http://www.circ.gov.cn' + get_content(
            report_item.xpath('.//td[@class="hui14"]//a/@href').extract())
        created = get_content(
            report_item.xpath('.//td[@class="hui14"]/../td[last()]/text()').extract())[1:-1]
        yield scrapy.Request(url=link,
                             callback=self.parse_detail,
                             meta={'title': title, 'id': id, 'created': created},
                             dont_filter=True)
def parse_detail(self, response):
    product = ProductItem()
    code = response.meta['code']
    product['code'] = code
    product['pdf'] = 'http://www.iachina.cn/IC/tkk/03/' + code + '_TERMS.PDF'
    product['link'] = response.url
    update_fields_list = [
        'link', 'company_name', 'product_name', 'product_type', 'design_type',
        'feature', 'insured', 'period_type', 'pay_type', 'clause', 'state',
        'end_date', 'summary', 'pdf'
    ]
    for entry in response.xpath('//table[@class="biaoge"]/tr'):
        key = get_content(entry.xpath('string(td[1])').extract())
        value = get_content(entry.xpath('string(td[2])').extract())
        if key.find(u'公司名称') >= 0:
            product['company_name'] = value
        elif key.find(u'产品名称') >= 0:
            product['product_name'] = value
        elif key.find(u'产品类别') >= 0:
            product['product_type'] = value
        elif key.find(u'设计类型') >= 0:
            product['design_type'] = value
        elif key.find(u'产品特殊属性') >= 0:
            product['feature'] = value
        elif key.find(u'承保方式') >= 0:
            product['insured'] = value
        elif key.find(u'保险期间类型') >= 0:
            product['period_type'] = value
        elif key.find(u'产品交费方式') >= 0:
            product['pay_type'] = value
        elif key.find(u'条款文字编码') >= 0:
            product['clause'] = value
        elif key.find(u'销售状态') >= 0:
            product['state'] = value
        elif key.find(u'停止销售日期') >= 0:
            product['end_date'] = value
    yield product
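# The key-matching if/elif chain in parse_detail above (and the similar one in
# parse_site_info earlier) could be table-driven. A sketch under the
# assumption that matching is "label substring occurs in the key" with the
# first match winning; PRODUCT_FIELD_MAP and assign_field are illustrative
# names, not part of the original code:
PRODUCT_FIELD_MAP = [
    (u'公司名称', 'company_name'), (u'产品名称', 'product_name'),
    (u'产品类别', 'product_type'), (u'设计类型', 'design_type'),
    (u'产品特殊属性', 'feature'), (u'承保方式', 'insured'),
    (u'保险期间类型', 'period_type'), (u'产品交费方式', 'pay_type'),
    (u'条款文字编码', 'clause'), (u'销售状态', 'state'),
    (u'停止销售日期', 'end_date'),
]

def assign_field(item, key, value, field_map=PRODUCT_FIELD_MAP):
    # Assign value to the first field whose label appears in the key.
    for label, field in field_map:
        if label in key:
            item[field] = value
            return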
def parse_product_list(self, response):
    company = response.meta['company']
    product_list = list()
    for entry in response.xpath('//div[@class="ge"]/ul/li'):
        actual_name = get_content(entry.xpath('string(p[1])').extract())
        record_name = get_content(entry.xpath('string(p[2])').extract())
        record_no = get_content(entry.xpath('string(p[3])').extract())
        product_list.append({
            'actual_name': actual_name,
            'record_name': record_name,
            'record_no': record_no
        })
    if response.meta['type'] == 'cur':
        company['cur_product_list'] = json.dumps(product_list, encoding="UTF-8", ensure_ascii=False)
        # Historical insurance product info.
        yield scrapy.FormRequest(
            url='http://icid.iachina.cn/ICID/front/viewAllProsHis.do',
            method='POST',
            formdata={
                'columnid': company['column_id'],
                'internetInformationNo': company['info_no'],
                'informationno': company['info_no'],
                'zj': company['zj']
            },
            meta={'company': company, 'type': 'his'},
            callback=self.parse_product_list,
            dont_filter=True)
    else:
        company['his_product_list'] = json.dumps(product_list, encoding="UTF-8", ensure_ascii=False)
        # Column id suffix 1/2/3 maps to life, property, and intermediary insurance.
        company['type'] = ['人身险', '财产险', '中介类'][int(company['column_id'][-1]) - 1]
        yield company
def parse(self, response):
    self.logger.info('Parsing 39 Symptom URLs From <%s>.' % response.url)
    item_list = []
    elements = response.xpath('//div[@class="res_list"]')
    for ele in elements:
        item = SymptomItem()
        item['name'] = get_content(ele.xpath('dl/dt/h3/a/@title').extract())
        item['link'] = get_content(ele.xpath('dl/dt/h3/a/@href').extract())
        try:
            item['alias'] = get_content(ele.xpath('dl/dt/cite/@title').extract())
            disease_list = ele.xpath('div/p/a')
            relevant_diseases = []
            for d in disease_list:
                rd = get_content(d.xpath('text()').extract())
                if rd:
                    relevant_diseases.append(rd)
            item['relevant_diseases'] = ' '.join(relevant_diseases)
        except:
            pass
        item_list.append(item)
    return item_list
def parse(self, response):
    self.logger.info('Parsing Wangjia News %s URLs From <%s>.' % (self.category, response.url))
    item = ExporterItem()
    elements = response.xpath('//div[contains(@class, "specialBox")]//div[@class="news_title"]')
    for ele in elements:
        url = get_content(ele.xpath('a/@href').extract())
        if url.find(self.category) == -1:
            continue
        thread = get_thread_from_news_url(url)
        if int(self.max_thread) < int(thread):
            item.set_record(url)
    return item
def parse(self, response):
    self.logger.info('Parsing P2peye Archive Feature From <%s>.' % response.url)
    item = FeatureItem()
    item['pin'] = self.get_pin_from_url(response.url)
    feature_list = response.xpath('//div[@class="bd ui-yun-parent"]/a')
    features = []
    if feature_list:
        for fl in feature_list:
            fc = get_content(fl.xpath('text()').extract())
            if fc:
                features.append(fc)
    item['feature'] = ' '.join(features)
    return item
def parse(self, response):
    self.logger.info('Parsing Wangjia Exposure URLs From <%s>.' % response.url)
    item = ExporterItem()
    elements = response.xpath('//table[starts-with(@summary, "forum")]/tbody')
    #elements = response.xpath('//div[@class="comeing_channel_tab_area"]/table/tbody')
    for ele in elements:
        content = ele.xpath('tr/th[@class="new"]')
        #content = ele.xpath('tr/td[@class="comeing_channel_threadlist_sub"]')
        if not content:
            continue
        url = get_content(content.xpath('a[contains(@class, "xst")]/@href').extract())
        thread = get_thread_from_exposure_url(url)
        if int(self.max_thread) < int(thread):
            item.set_record(url)
    return item
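# Both exporter parsers above compare thread ids pulled out of URLs via
# helpers that are not defined in this file. A hypothetical sketch, assuming
# the id is the trailing number of a news URL and the "thread-<id>" segment of
# a forum URL; the real implementations may differ:
import re

def get_thread_from_news_url(url):
    match = re.search(r'(\d+)(?:\.html?)?/?$', url)
    return match.group(1) if match else '0'

def get_thread_from_exposure_url(url):
    match = re.search(r'thread-(\d+)', url)
    return match.group(1) if match else '0'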
def parse(self, response):
    symbol = (self.mapping.get(response.url), response.url)
    self.logger.info('Parsing ID.%d 39health Disease Elementary Info From <%s>.' % symbol)
    disease_ele_item = DiseaseElementaryInfoItem()
    try:
        disease_ele_item['d_id'] = symbol[0]
        disease_ele_item['name'] = get_content(response.xpath('//dl[@class="intro"]/dt/text()').extract())
        try:
            relative_drug_path = response.xpath('//div[@class="drug"]/ul/li')
            has_drug = get_content(relative_drug_path[0].xpath('i/text()').extract())
            if self.d_map.has_key(has_drug):
                drug_list = relative_drug_path[0].xpath('a')
                dn = []
                for d in drug_list:
                    dl = get_content(d.xpath('@title').extract())
                    if dl:
                        dn.append(dl)
                disease_ele_item[self.d_map[has_drug]] = ' '.join(dn)
        except:
            pass
        ele = response.xpath('//div[@class="info"]/ul/li')
        for li in ele:
            attr = get_content(li.xpath('i/text()').extract())
            if self.d_map.has_key(attr):
                if self.type_map[attr]:
                    label_list = li.xpath('a')
                    ll = []
                    for l in label_list:
                        if l.xpath('@title'):
                            lc = get_content(l.xpath('@title').extract())
                        else:
                            lc = get_content(l.xpath('text()').extract())
                        if lc:
                            ll.append(lc)
                    disease_ele_item[self.d_map[attr]] = ' '.join(ll)
                else:
                    disease_ele_item[self.d_map[attr]] = get_content(li.xpath('text()').extract())
        return disease_ele_item
    except:
        return None
def parse(self, response):
    # self.object = self.mapping.get(response.url)
    # symbol = (self.object.manual_id, response.url)
    # self.logger.info("Parsing ID.%d 39Health Drug Information From <%s>." % symbol)
    symbol = (self.mapping.get(response.url), response.url)
    self.logger.info("Parsing ID.%d 39Health Drug Information From <%s>." % symbol)
    item = YaopingItem()
    item['manual_id'] = symbol[0]
    sub = response.xpath('//div[@class="subs"]//a')
    item['category_list'] = '>>'.join([get_trunk(s) for s in sub.xpath('text()').extract()])
    category_list = item['category_list'].split(">>")
    if len(category_list) == 3:
        item['category_first'] = category_list[1]
    elif len(category_list) == 4:
        item['category_first'] = category_list[1]
        item['category_second'] = category_list[2]
    item['name'] = get_content(response.xpath('//div[@class="t1"]/h1/a/text()').extract())
    cites = response.xpath('//div[@class="t1"]//cite')
    item['cites'] = '&&'.join([get_trunk(cite) for cite in cites.xpath('span/text()').extract()])
    item['english_name'] = get_content(response.xpath('//cite[@class="t2"]/text()').extract(), skipBlank=False)
    item['company'] = get_content(response.xpath('//li[@class="company"]/text()').extract())
    item['address'] = get_content(response.xpath('//li[@class="address"]/text()').extract())
    item['telephone'] = get_content(response.xpath('//li[@class="telephone"]/text()').extract(), skipBlank=False)
    information = response.xpath('//div[@class="tab_box"]//dl')
    for info in information:
        key = get_content(info.xpath('dt/text()').extract())
        if self.detail_map.get(key):
            attr = self.detail_map[key]
            detail = info.xpath('dd')
            # Using string(.) to strip the inner HTML tags.
            item[attr] = get_content(detail.xpath('string(.)').extract())
    return item
def parse(self, response):
    symbol = (self.get_thread_from_url(response.url), response.url)
    if not symbol[0]:
        self.logger.warning('Invalid Wangjia Exposure Item From <%s>.' % symbol[1])
        return None
    if response.xpath('//div[@class="wrap"]'):
        self.logger.warning('May Redirect To Warning Page Of Wangjia.')
        return None
    if response.xpath('//div[@id="messagetext" and @class="alert_info"]'):
        self.logger.warning('No.%s Wangjia Exposure Item From <%s> Maybe Limited.' % symbol)
        return None
    self.logger.info('Parsing No.%s Wangjia Exposure Item From <%s>.' % symbol)
    item = BaoguangItem()
    item['thread'] = int(symbol[0])
    item['source'] = symbol[1]
    title = response.xpath('//span[@id="thread_subject"]')
    item['title'] = get_content(title.xpath('text()').extract())
    subtitle = response.xpath('//em[starts-with(@id, "authorposton")]')[0]
    poston = get_content(subtitle.xpath('text()').extract(), skipBlank=False)
    item['created'] = poston[poston.index(' ') + 1:]
    header = response.xpath('//div[@class="typeoption"]/table/tbody/tr/td')
    if header:
        item['name'] = get_content(header[0].xpath('.//text()').extract())
        item['link'] = get_content(header[1].xpath('.//text()').extract())
        item['reason'] = get_content(header[2].xpath('.//text()').extract())
    body = response.xpath('//td[starts-with(@id, "postmessage")]')[0]
    #item['content'] = ''.join([get_trunk(c) for c in body.xpath('text()|*[not(@class="pstatus")]/text()|*[not(@class="pstatus")]/*/text()').extract()])
    item['content'] = ''.join([get_trunk(c) for c in body.xpath('.//text()').extract()])
    item['raw_content'] = get_content(body.extract())
    item['image_url'] = '#'.join([self.modify_image_url(get_trunk(c))
                                  for c in body.xpath('.//@file').extract()]) or None
    return item
def parse(self, response):
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Yinrendai Bid List Info From <%s>' % symbol)
    self.object = ToubiaoItem.get_object_by_pk(symbol[0])
    item = BiaodiItem()
    finance = response.xpath('//div[@class="finance_box clearfix"]')
    if finance:
        left = finance.xpath('div[@class="elite_left l"]')
        left_info = left.xpath('table/tr[@class="num"]/td')
        item['interest_rate'] = get_content(left_info[0].xpath('strong/text()').extract())
        item['term'] = get_content(left_info[1].xpath('strong/text()').extract())
        item['volume'] = get_content(left_info[2].xpath('strong/text()').extract())
        # We use this string format to get the bid detail information easily.
        bid_detail_info = left.xpath('div/p[@class="progressTxt l"]')
        item['bid_detail'] = self.bid_detail_form.format(
            num=get_content(bid_detail_info.xpath('span/text()').extract()),
            percentage=get_content(bid_detail_info.xpath('//span[@id="percent"]/text()').extract()))
        item['remain_amount'] = get_content(finance.xpath('div[@class="elite_right l"]/p/span/text()').extract())
    detail = response.xpath('//li[@class="oneInfo"]')
    if detail:
        personal_info = detail[0].xpath('table//td[not(@class="dd")]')
        if personal_info:
            item['nikename'] = get_content(personal_info[0].xpath('text()').extract())
            item['gender'] = get_content(personal_info[1].xpath('text()').extract())
            item['phone_number'] = get_content(personal_info[2].xpath('text()').extract())
            item['education'] = get_content(personal_info[3].xpath('text()').extract())
            item['marital_status'] = get_content(personal_info[4].xpath('text()').extract())
            item['house'] = get_content(personal_info[5].xpath('text()').extract())
            item['address'] = get_content(personal_info[6].xpath('text()').extract())
        job_status = detail[1].xpath('table//td[not(@class="dd")]')
        if job_status:
            item['job_type'] = get_content(job_status[0].xpath('text()').extract())
            item['job_city'] = get_content(job_status[1].xpath('text()').extract())
            item['job_year'] = get_content(job_status[2].xpath('text()').extract())
            item['annual_income'] = get_content(job_status[3].xpath('text()').extract())
            item['credit_limit'] = get_content(job_status[4].xpath('text()').extract())
        bid_info = detail[2].xpath('table//td[not(@class="dd")]')
        if bid_info:
            item['loan_volume'] = get_content(bid_info[0].xpath('text()').extract())
            item['loan_term'] = get_content(bid_info[1].xpath('text()').extract())
            item['loan_interest_rate'] = get_content(bid_info[2].xpath('text()').extract())
            item['loan_purpose'] = get_content(bid_info[3].xpath('text()').extract())
            item['payment_method'] = get_content(bid_info[4].xpath('text()').extract())
            item['tender_deadline'] = get_content(bid_info[5].xpath('text()').extract())
    return item
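# self.bid_detail_form above is assumed to be a str.format template defined on
# the spider (it is not shown in this file), e.g. something like
# u'{num} bids, {percentage} funded', so the bid progress is serialized into
# one uniform string for the pipeline.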
def parse(self, response):
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Feature From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = TedianItem()
    item['name'] = self.object.name
    rtag = response.xpath('//div[@class="rTags"]')
    if rtag:
        item['status'] = get_content(rtag.xpath('./span[@class="tag3"]/text()').extract())
        item['company_tag'] = get_content(rtag.xpath('./span[@class="tag tag2"]/text()').extract())
        tag_info = rtag.xpath('./span[@class = "tag"]')
        item['illustration'] = '/'.join([get_trunk(info) for info in tag_info.xpath('text()').extract()])
    comment_info = response.xpath('//div[contains(@class,"box commentBox")]')
    if comment_info:
        commentScores = comment_info.xpath('./dl[@class="comment"]')
        item['recommendation'] = get_content(commentScores.xpath('./dt/span/text()').extract())
        score = commentScores.xpath('./dd/span[@class="num"]')
        item['withdraw_num'] = get_content(score[0].xpath('text()').extract())
        item['guard_num'] = get_content(score[1].xpath('text()').extract())
        item['service_num'] = get_content(score[2].xpath('text()').extract())
        item['experience_num'] = get_content(score[3].xpath('text()').extract())
        scoreInfo = commentScores.xpath('.//span[not(@class="num")]')
        item['withdraw_day'] = get_content(scoreInfo[0].xpath('text()').extract())
        item['guard_day'] = get_content(scoreInfo[1].xpath('text()').extract())
        item['service_status'] = get_content(scoreInfo[2].xpath('text()').extract())
        item['experience_status'] = get_content(scoreInfo[3].xpath('text()').extract())
        impress_info = comment_info.xpath('./dl[@class="impression"]/dd//span')
        item['impression'] = '\001'.join([get_trunk(impress) for impress in impress_info.xpath('text()').extract()])
    return item
def parse(self, response):
    symbol = (self.timestamp, response.url)
    self.logger.info('Parsing %s Wangjia Rating From Archive <%s>.' % symbol)
    item = PingjiItem()
    item['timestamp'] = symbol[0]
    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    if detail:
        item['name'] = get_content(detail[0].xpath('text()').extract())
        item['launch_time'] = get_content(detail[4].xpath('text()').extract())
        item['location'] = get_content(detail[3].xpath('text()').extract())
    record = response.xpath('//div[@class="recordHead"]/div[@class="con"]/p')
    if record:
        item['exponent'] = get_content(record.xpath('span[@class="num"]/text()').extract())
    exp = response.xpath('//div[contains(@class, "expBox")]/div[@class="bd"]/div[@class="detail"]/p')
    if not exp:
        return None
    item['deal'] = get_content(exp[0].xpath('span[@class="num"]/text()').extract())
    item['popularity'] = get_content(exp[1].xpath('span[@class="num"]/text()').extract())
    item['profit'] = get_content(exp[2].xpath('span[@class="num"]/text()').extract())
    item['revenue'] = get_content(exp[3].xpath('span[@class="num"]/text()').extract())
    item['lever'] = get_content(exp[4].xpath('span[@class="num"]/text()').extract())
    item['brand'] = get_content(exp[5].xpath('span[@class="num"]/text()').extract())
    item['dispersity'] = get_content(exp[7].xpath('span[@class="num"]/text()').extract())
    item['mobility'] = get_content(exp[8].xpath('span[@class="num"]/text()').extract())
    item['transparency'] = get_content(exp[6].xpath('span[@class="num"]/text()').extract())
    log_empty_fields(item, self.logger)
    return item
def parse(self, response):
    item_list = []
    if response.url.endswith('html'):
        # For Regular Platform.
        content = response.xpath('//div[@id="platList"]/div[starts-with(@class, "rnav")]')
        for sel_ct in content:
            province_name = get_content(sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()').extract())
            province_id = ProvinceItem.get_id_by_name(province_name)
            plat_list = sel_ct.xpath('ul[@class="til_cn"]/li')
            for sel_pt in plat_list:
                daohang = DaohangItem()
                purl = get_content(sel_pt.xpath('a/@purl').extract()).split('/')
                while not purl[-1]:
                    purl.pop()
                daohang['pin'] = purl.pop()
                daohang['name'] = get_content(sel_pt.xpath('a/text()').extract())
                daohang['link'] = get_content(sel_pt.xpath('a/@href').extract())
                daohang['province_id'] = province_id
                item_list.append(daohang)
        # For Problematic Platform.
        # Disabled Here Temporarily.
        #content = response.xpath('//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')
        #for sel_ct in content:
        #    province_name = get_content(sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()').extract())
        #    province_id = ProvinceItem.get_id_by_name(province_name)
        #    plat_list = sel_ct.xpath('ul[@class="til_cn"]/li')
        #    for sel_pt in plat_list:
        #        daohang = DaohangItem()
        #        purl = get_content(sel_pt.xpath('a/@purl').extract()).split('/')
        #        while not purl[-1]: purl.pop()
        #        daohang['pin'] = purl.pop()
        #        daohang['name'] = get_content(sel_pt.xpath('a/text()').extract())
        #        # Invalid Link For Problematic Platform.
        #        #daohang['link'] = get_content(sel_pt.xpath('a/@href').extract())
        #        daohang['province_id'] = province_id
        #        item_list.append(daohang)
    else:
        content = json.loads(response.body_as_unicode())
        if response.url.endswith('json'):
            for ct in content:
                daohang = DaohangItem()
                daohang['pin'] = ct.get('platPin', None)
                daohang['allPin'] = ct.get('allPlatPin', None)
                daohang['name'] = ct.get('platName', None)
                daohang['link'] = ct.get('platUrl', None)
                item_list.append(daohang)
        else:
            for ct in content:
                if not ct.get('city'):
                    continue
                province_id = ProvinceItem.get_id_by_name(ct.get('city'))
                plat_list = ct.get('platList')
                for pt in plat_list:
                    daohang = DaohangItem()
                    daohang['pin'] = pt.get('platLetter', None)
                    daohang['name'] = pt.get('platName', None)
                    daohang['link'] = pt.get('platUrl', None)
                    daohang['province_id'] = province_id
                    daohang['launch_time'] = pt.get('onlineDateStr', None)
                    daohang['icon_url'] = pt.get('platIconUrl', None)
                    item_list.append(daohang)
    return item_list
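# The JSON branch above handles two assumed payload shapes:
#   *.json  -> [{"platPin": ..., "allPlatPin": ..., "platName": ..., "platUrl": ...}, ...]
#   others  -> [{"city": ..., "platList": [{"platLetter": ..., "platName": ...,
#                "platUrl": ..., "onlineDateStr": ..., "platIconUrl": ...}, ...]}, ...]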
def parse(self, response):
    self.logger.info('Parsing cfda drug info From <%s>.' % response.url)
    item = CFDADrug()
    elements = response.xpath('//div[@class="listmain"]/div/table[1]/tr')
    if len(elements) > 13:
        item['url_id'] = int(response.url.split('=')[-1])
        item['approval_num'] = get_content(elements[1].xpath('td[2]/text()').extract())
        item['name'] = get_content(elements[2].xpath('td[2]/text()').extract())
        item['en_name'] = get_content(elements[3].xpath('td[2]/text()').extract())
        item['trade_name'] = get_content(elements[4].xpath('td[2]/text()').extract())
        item['dosage_forms'] = get_content(elements[5].xpath('td[2]/text()').extract())
        item['norm'] = get_content(elements[6].xpath('td[2]/text()').extract())
        item['producer'] = get_content(elements[7].xpath('td[2]/a/text()').extract())
        item['product_address'] = get_content(elements[8].xpath('td[2]/text()').extract())
        item['type'] = get_content(elements[9].xpath('td[2]/text()').extract())
        item['origin_approval_num'] = get_content(elements[10].xpath('td[2]/text()').extract())
        item['approval_date'] = get_content(elements[11].xpath('td[2]/text()').extract())
        item['drug_based_code'] = get_content(elements[12].xpath('td[2]/text()').extract())
        item['remark'] = get_content(elements[13].xpath('td[2]/text()').extract())
        return item
    else:
        return None
def parse(self, response):
    # NOTE: (zacky, 2015.APR.27th) PIPELINE FUNCTIONS RELATED WILL BE PROCESSED, SO WE KEEP THE OBJECT STATE HERE.
    symbol = (self.mapping.get(self.get_pin_from_url(response.url)), response.url)
    self.logger.info('Parsing ID.%d Wangjia Archive From <%s>.' % symbol)
    self.object = DaohangItem.get_object_by_pk(symbol[0])
    item = DanganItem()
    item['name'] = self.object.name
    item['logo_url'] = get_content(response.xpath('//div[@class="rLogo"]/a/img/@src').extract())
    detail = response.xpath('//div[contains(@class, "detailBox")]/p')
    if detail:
        item['link'] = get_content(detail[1].xpath('a/@href').extract())
        item['location'] = get_content(detail[3].xpath('text()').extract())
        item['launch_time'] = get_content(detail[4].xpath('text()').extract())
    about = response.xpath('//div[contains(@class, "aboutBd")]/p')
    if about:
        item['introduction'] = ' '.join([get_trunk(c) for c in about.xpath('.//text()').extract()])
    info = response.xpath('//div[contains(@class, "inforBd")]/p[not(contains(@class, "line"))]')
    if info:
        item['company_name'] = get_content(info[0].xpath('text()').extract())
        item['artificial_person'] = get_content(info[1].xpath('text()').extract())
        item['company_type'] = get_content(info[2].xpath('text()').extract())
        item['shareholder_stucture'] = get_content(info[3].xpath('text()').extract())
        item['registered_capital'] = get_content(info[4].xpath('text()').extract())
        item['contributed_capital'] = get_content(info[5].xpath('text()').extract())
        item['registered_address'] = get_content(info[6].xpath('text()').extract())
        item['opening_date'] = get_content(info[7].xpath('text()').extract())
        item['approved_date'] = get_content(info[8].xpath('text()').extract())
        item['registration_authority'] = get_content(info[9].xpath('text()').extract())
        item['business_licence'] = get_content(info[10].xpath('text()').extract())
        item['institutional_framework'] = get_content(info[11].xpath('text()').extract())
        item['tax_registration_num'] = get_content(info[12].xpath('text()').extract())
    record = response.xpath('//div[contains(@class, "webRecordBd")]/table/tbody/tr')[1].xpath('td')
    if record:
        item['domain_name'] = get_content(record[0].xpath('text()').extract())
        item['domain_date'] = get_content(record[1].xpath('text()').extract())
        item['domain_company_type'] = get_content(record[2].xpath('text()').extract())
        item['domain_company_name'] = get_content(record[3].xpath('text()').extract())
        item['icp'] = get_content(record[4].xpath('text()').extract())
    people = response.xpath('//div[contains(@class, "peopleBd")]/ul/li')
    if people:
        avatar_url = []
        content = []
        for i in xrange(len(people)):
            avatar_url.extend(people[i].xpath('div[@class="avatar"]/img/@src').extract())
            content.extend([get_trunk(c) for c in people[i].xpath('p//text()').extract()])
        item['company_person_avatar_url'] = '#'.join(avatar_url)
        item['company_person'] = ' '.join(content)
    cost = response.xpath('//div[contains(@class, "costBd")]')[0].xpath('p')
    if cost:
        item['management_fee'] = get_content(cost[0].xpath('text()').extract())
        item['prepaid_fee'] = get_content(cost[1].xpath('text()').extract())
        item['cash_withdrawal_fee'] = get_content(cost[2].xpath('text()').extract())
        item['vip_fee'] = get_content(cost[3].xpath('text()').extract())
        item['transfer_fee'] = get_content(cost[4].xpath('text()').extract())
        item['mode_of_payment'] = get_content(cost[5].xpath('text()').extract())
    contact = response.xpath('//div[contains(@class, "costBd")]')[1].xpath('p')
    if contact:
        item['contact_address'] = get_content(contact[0].xpath('text()').extract())
        item['phone_400'] = get_content(contact[1].xpath('text()').extract())
        item['phone'] = get_content(contact[2].xpath('text()').extract())
        item['fax'] = get_content(contact[3].xpath('text()').extract())
        item['email'] = get_content(contact[4].xpath('text()').extract())
    record = response.xpath('//div[contains(@class, "recordListBox")]/ul/li')
    if record:
        item['is_automatic_bid'] = get_content(record[3].xpath('.//text()').extract(), skipFirst=True)
        item['is_equitable_assignment'] = get_content(record[4].xpath('.//text()').extract(), skipFirst=True)
        item['trust_fund'] = get_content(record[5].xpath('.//text()').extract(), skipFirst=True)
        item['tender_security'] = get_content(record[6].xpath('.//text()').extract(), skipFirst=True)
        item['security_mode'] = get_content(record[7].xpath('.//text()').extract(), skipFirst=True)
        item['guarantee_institution'] = get_content(record[8].xpath('.//text()').extract(), skipFirst=True)
        item['business_type'] = len(record) >= 10 and get_content(record[9].xpath('.//text()').extract(), skipFirst=True)
    log_empty_fields(item, self.logger)
    return item