def parse(self, response):
    """Read the Wangjia problem-platform table into ``WentiItem`` objects.

    Every ``<tr>`` under ``//div[@class="wtpt"]/div/table/tbody`` is read
    by cell position: cell 1 is the name, cells 2-4 and 6-8 are plain text
    fields, and cell 5 is a province name that is resolved through
    ``ProvinceItem.get_id_by_name``.

    Rows that expose fewer than nine ``<td>`` cells (for example a header
    row built from ``<th>`` cells) are skipped; indexing into them used to
    raise ``IndexError`` and abort the whole page.

    Args:
        response: Response object; only ``url`` and ``xpath`` are used.

    Returns:
        list: The items whose ``get_uk()`` is truthy, in page order.
    """
    self.logger.info('Parsing Wangjia Problem Platform From <%s>.',
                     response.url)

    # (item field, <td> position) pairs that are read with ``text()``,
    # split around the province column so assignment order is unchanged.
    before_province = (('problem_time', 2), ('launch_time', 3),
                       ('registered_capital', 4))
    after_province = (('accounted_revenue', 6), ('involved_passenger', 7),
                      ('event_category', 8))

    platform_list = []
    for row in response.xpath('//div[@class="wtpt"]/div/table/tbody/tr'):
        cells = row.xpath('td')
        if len(cells) < 9:
            # Not a data row: the highest index read below is 8.
            continue

        item = WentiItem()
        item['name'] = get_content(cells[1].xpath('.//text()').extract())

        # NOTE(review): ('-') is a plain string, not a one-element tuple.
        # It is passed through unchanged; confirm what get_content expects.
        for field, position in before_province:
            item[field] = get_content(
                cells[position].xpath('text()').extract(), exclude=('-'))

        province_name = get_content(cells[5].xpath('text()').extract())
        province_id = ProvinceItem.get_id_by_name(province_name)
        if province_id is not None:
            # Leave the field unset when the province cannot be resolved.
            item['province_id'] = province_id

        for field, position in after_province:
            item[field] = get_content(
                cells[position].xpath('text()').extract(), exclude=('-'))

        if item.get_uk():
            platform_list.append(item)

    return platform_list
def parse(self, response):
    """Collect ``WentiItem`` objects from the "issuePlatList" section.

    The section is a series of ``div`` blocks whose class starts with
    ``rnav``.  Each block carries a province title and a ``ul.til_cn``
    list of platforms.  For every ``<li>`` the link text becomes the name,
    the last non-empty ``/``-separated piece of the link's ``purl``
    attribute becomes the pin, and the ``<i>`` element's class is mapped
    to an event category via ``self.get_event_category_by_classname``.

    Args:
        response: Response object; only ``xpath`` is used.

    Returns:
        list: One item per platform entry, in page order.
    """
    collected = []
    province_blocks = response.xpath(
        '//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')

    for block in province_blocks:
        title_nodes = block.xpath(
            'div[@class="til"]/div/p[not(@class="til_num")]/text()')
        province_id = ProvinceItem.get_id_by_name(
            get_content(title_nodes.extract()))

        for entry in block.xpath('ul[@class="til_cn"]/li'):
            item = WentiItem()
            item['name'] = get_content(entry.xpath('a/text()').extract())

            # Discard empty trailing pieces (left by trailing slashes) so
            # the final piece is the platform pin.
            segments = get_content(
                entry.xpath('a/@purl').extract()).split('/')
            while not segments[-1]:
                del segments[-1]
            item['pin'] = segments[-1]

            item['province_id'] = province_id

            icon_class = get_content(entry.xpath('i/@class').extract())
            item['event_category'] = \
                self.get_event_category_by_classname(icon_class)

            collected.append(item)

    return collected
def parse(self, response):
    """Read the Wangjia problem-platform table into ``WentiItem`` objects.

    Every ``<tr>`` under ``//div[@class="wtpt"]/div/table/tbody`` is read
    by cell position: cell 1 is the name, cells 2-4 and 6-8 are plain text
    fields, and cell 5 is a province name that is resolved through
    ``ProvinceItem.get_id_by_name``.

    Rows that expose fewer than nine ``<td>`` cells (for example a header
    row built from ``<th>`` cells) are skipped; indexing into them used to
    raise ``IndexError`` and abort the whole page.

    Args:
        response: Response object; only ``url`` and ``xpath`` are used.

    Returns:
        list: The items whose ``get_uk()`` is truthy, in page order.
    """
    self.logger.info('Parsing Wangjia Problem Platform From <%s>.',
                     response.url)

    # (item field, <td> position) pairs that are read with ``text()``,
    # split around the province column so assignment order is unchanged.
    before_province = (('problem_time', 2), ('launch_time', 3),
                       ('registered_capital', 4))
    after_province = (('accounted_revenue', 6), ('involved_passenger', 7),
                      ('event_category', 8))

    platform_list = []
    for row in response.xpath('//div[@class="wtpt"]/div/table/tbody/tr'):
        cells = row.xpath('td')
        if len(cells) < 9:
            # Not a data row: the highest index read below is 8.
            continue

        item = WentiItem()
        item['name'] = get_content(cells[1].xpath('.//text()').extract())

        # NOTE(review): ('-') is a plain string, not a one-element tuple.
        # It is passed through unchanged; confirm what get_content expects.
        for field, position in before_province:
            item[field] = get_content(
                cells[position].xpath('text()').extract(), exclude=('-'))

        province_name = get_content(cells[5].xpath('text()').extract())
        province_id = ProvinceItem.get_id_by_name(province_name)
        if province_id is not None:
            # Leave the field unset when the province cannot be resolved.
            item['province_id'] = province_id

        for field, position in after_province:
            item[field] = get_content(
                cells[position].xpath('text()').extract(), exclude=('-'))

        if item.get_uk():
            platform_list.append(item)

    return platform_list
def parse(self, response):
    """Turn the province-grouped "issuePlatList" markup into items.

    For each ``rnav`` block the province title is resolved to an id with
    ``ProvinceItem.get_id_by_name``, then every ``<li>`` in the block's
    ``ul.til_cn`` list yields one ``WentiItem`` carrying the link text
    (name), the last non-empty ``/``-separated piece of the ``purl``
    attribute (pin), the province id, and the event category derived from
    the ``<i>`` element's class.

    Args:
        response: Response object; only ``xpath`` is used.

    Returns:
        list: The items in document order.
    """
    found = []

    for group in response.xpath(
            '//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]'):
        heading = group.xpath(
            'div[@class="til"]/div/p[not(@class="til_num")]/text()'
        ).extract()
        province_name = get_content(heading)
        province_id = ProvinceItem.get_id_by_name(province_name)

        for node in group.xpath('ul[@class="til_cn"]/li'):
            wenti = WentiItem()
            wenti['name'] = get_content(node.xpath('a/text()').extract())

            purl_text = get_content(node.xpath('a/@purl').extract())
            parts = purl_text.split('/')
            # Trailing slashes leave empty pieces at the end; drop them so
            # the pin is the last real path piece.
            while not parts[-1]:
                parts.pop()
            wenti['pin'] = parts[-1]

            wenti['province_id'] = province_id

            class_name = get_content(node.xpath('i/@class').extract())
            wenti['event_category'] = \
                self.get_event_category_by_classname(class_name)

            found.append(wenti)

    return found
def parse(self, response):
    """Build ``DaohangItem`` objects from one of three response formats.

    The format is selected from the response URL:

    * URL ends with ``html``: the province-grouped list under
      ``//div[@id="platList"]``.  Each ``<li>`` gives the pin (last
      non-empty ``/``-separated piece of the link's ``purl`` attribute),
      name, link and province id.
    * URL ends with ``json``: a JSON array whose entries provide
      ``platPin``, ``allPlatPin``, ``platName`` and ``platUrl``.
    * Anything else: a JSON array of entries with ``city`` and
      ``platList``.  Entries without a city are skipped, and a missing or
      null ``platList`` is treated as empty instead of raising
      ``TypeError``.

    Args:
        response: Response object; ``url`` plus either ``xpath`` or
            ``body_as_unicode`` are used.

    Returns:
        list: The collected items, in source order.
    """
    item_list = []

    if response.url.endswith('html'):
        # Regular platforms only.  The page's "issuePlatList" section
        # (problematic platforms) is intentionally not parsed here: it was
        # disabled temporarily, and its links are invalid.
        content = response.xpath(
            '//div[@id="platList"]/div[starts-with(@class, "rnav")]')
        for sel_ct in content:
            province_name = get_content(
                sel_ct.xpath(
                    'div[@class="til"]/div/p[not(@class="til_num")]/text()'
                ).extract())
            province_id = ProvinceItem.get_id_by_name(province_name)

            for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                daohang = DaohangItem()

                # Drop empty trailing pieces left by trailing slashes so
                # the last piece is the pin.
                purl = get_content(
                    sel_pt.xpath('a/@purl').extract()).split('/')
                while not purl[-1]:
                    purl.pop()
                daohang['pin'] = purl.pop()

                daohang['name'] = get_content(
                    sel_pt.xpath('a/text()').extract())
                daohang['link'] = get_content(
                    sel_pt.xpath('a/@href').extract())
                daohang['province_id'] = province_id
                item_list.append(daohang)
        return item_list

    content = json.loads(response.body_as_unicode())

    if response.url.endswith('json'):
        # Flat list of platforms.
        for ct in content:
            daohang = DaohangItem()
            daohang['pin'] = ct.get('platPin')
            daohang['allPin'] = ct.get('allPlatPin')
            daohang['name'] = ct.get('platName')
            daohang['link'] = ct.get('platUrl')
            item_list.append(daohang)
        return item_list

    # Platforms grouped by city.
    for ct in content:
        if not ct.get('city'):
            continue
        province_id = ProvinceItem.get_id_by_name(ct.get('city'))

        # A missing or null "platList" means no platforms for this city.
        for pt in ct.get('platList') or []:
            daohang = DaohangItem()
            daohang['pin'] = pt.get('platLetter')
            daohang['name'] = pt.get('platName')
            daohang['link'] = pt.get('platUrl')
            daohang['province_id'] = province_id
            daohang['launch_time'] = pt.get('onlineDateStr')
            daohang['icon_url'] = pt.get('platIconUrl')
            item_list.append(daohang)

    return item_list
def parse(self, response):
    """Build ``DaohangItem`` objects from one of three response formats.

    The format is selected from the response URL:

    * URL ends with ``html``: the province-grouped list under
      ``//div[@id="platList"]``.  Each ``<li>`` gives the pin (last
      non-empty ``/``-separated piece of the link's ``purl`` attribute),
      name, link and province id.
    * URL ends with ``json``: a JSON array whose entries provide
      ``platPin``, ``allPlatPin``, ``platName`` and ``platUrl``.
    * Anything else: a JSON array of entries with ``city`` and
      ``platList``.  Entries without a city are skipped, and a missing or
      null ``platList`` is treated as empty instead of raising
      ``TypeError``.

    Args:
        response: Response object; ``url`` plus either ``xpath`` or
            ``body_as_unicode`` are used.

    Returns:
        list: The collected items, in source order.
    """
    item_list = []

    if response.url.endswith('html'):
        # Regular platforms only.  The page's "issuePlatList" section
        # (problematic platforms) is intentionally not parsed here: it was
        # disabled temporarily, and its links are invalid.
        content = response.xpath(
            '//div[@id="platList"]/div[starts-with(@class, "rnav")]')
        for sel_ct in content:
            province_name = get_content(
                sel_ct.xpath(
                    'div[@class="til"]/div/p[not(@class="til_num")]/text()'
                ).extract())
            province_id = ProvinceItem.get_id_by_name(province_name)

            for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                daohang = DaohangItem()

                # Drop empty trailing pieces left by trailing slashes so
                # the last piece is the pin.
                purl = get_content(
                    sel_pt.xpath('a/@purl').extract()).split('/')
                while not purl[-1]:
                    purl.pop()
                daohang['pin'] = purl.pop()

                daohang['name'] = get_content(
                    sel_pt.xpath('a/text()').extract())
                daohang['link'] = get_content(
                    sel_pt.xpath('a/@href').extract())
                daohang['province_id'] = province_id
                item_list.append(daohang)
        return item_list

    content = json.loads(response.body_as_unicode())

    if response.url.endswith('json'):
        # Flat list of platforms.
        for ct in content:
            daohang = DaohangItem()
            daohang['pin'] = ct.get('platPin')
            daohang['allPin'] = ct.get('allPlatPin')
            daohang['name'] = ct.get('platName')
            daohang['link'] = ct.get('platUrl')
            item_list.append(daohang)
        return item_list

    # Platforms grouped by city.
    for ct in content:
        if not ct.get('city'):
            continue
        province_id = ProvinceItem.get_id_by_name(ct.get('city'))

        # A missing or null "platList" means no platforms for this city.
        for pt in ct.get('platList') or []:
            daohang = DaohangItem()
            daohang['pin'] = pt.get('platLetter')
            daohang['name'] = pt.get('platName')
            daohang['link'] = pt.get('platUrl')
            daohang['province_id'] = province_id
            daohang['launch_time'] = pt.get('onlineDateStr')
            daohang['icon_url'] = pt.get('platIconUrl')
            item_list.append(daohang)

    return item_list