Example #1
0
    def parse(self, response):
        """Build ``WentiItem`` objects from the rows of the ``wtpt`` table.

        Every ``<tr>`` matched by ``//div[@class="wtpt"]/div/table/tbody/tr``
        is read by cell position: cell 1 -> name, 2 -> problem_time,
        3 -> launch_time, 4 -> registered_capital, 5 -> province name,
        6 -> accounted_revenue, 7 -> involved_passenger, 8 -> event_category.
        Cell 0 is never read.

        Rows with fewer than nine ``<td>`` cells are skipped, so a header or
        spacer row cannot raise ``IndexError`` and abort the whole page.

        Returns:
            list: the items whose ``get_uk()`` is truthy, in document order.
        """
        # Pass the URL as a logging argument so interpolation is deferred.
        self.logger.info('Parsing Wangjia Problem Platform From <%s>.', response.url)

        def cell_text(cells, idx):
            # NOTE(review): ('-') is the plain string '-', not a one-element
            # tuple. Left unchanged because how get_content treats `exclude`
            # is not visible here -- confirm before "fixing" it.
            return get_content(cells[idx].xpath('text()').extract(), exclude=('-'))

        required_cells = 9  # highest index read below is 8

        platform_list = []
        for row in response.xpath('//div[@class="wtpt"]/div/table/tbody/tr'):
            cells = row.xpath('td')
            if len(cells) < required_cells:
                continue

            item = WentiItem()
            item['name'] = get_content(cells[1].xpath('.//text()').extract())
            item['problem_time'] = cell_text(cells, 2)
            item['launch_time'] = cell_text(cells, 3)
            item['registered_capital'] = cell_text(cells, 4)

            # Only store the province when the lookup succeeds; an unknown
            # name leaves the field absent from the item.
            province_name = get_content(cells[5].xpath('text()').extract())
            province_id = ProvinceItem.get_id_by_name(province_name)
            if province_id is not None:
                item['province_id'] = province_id

            item['accounted_revenue'] = cell_text(cells, 6)
            item['involved_passenger'] = cell_text(cells, 7)
            item['event_category'] = cell_text(cells, 8)

            # Items without a usable unique key are dropped.
            if item.get_uk():
                platform_list.append(item)

        return platform_list
Example #2
0
    def parse(self, response):
        """Collect ``WentiItem`` objects from the ``issuePlatList`` section.

        The section is a series of ``rnav*`` blocks. Each block carries a
        province name (the ``til`` paragraph that is not ``til_num``) and a
        ``til_cn`` list whose ``<li>`` entries describe one platform each.

        For every platform the item gets ``name`` (link text), ``pin`` (the
        last non-empty ``/``-separated segment of the link's ``purl``
        attribute), ``province_id`` (shared by the whole block, as returned
        by ``ProvinceItem.get_id_by_name``) and ``event_category`` (derived
        from the ``<i>`` element's class via
        ``self.get_event_category_by_classname``).

        A missing or slash-only ``purl`` leaves ``pin`` unset instead of
        raising ``IndexError`` and aborting the page.

        Returns:
            list: one ``WentiItem`` per ``<li>`` entry, in document order.
        """
        item_list = []
        province_blocks = response.xpath(
            '//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]')
        for block in province_blocks:
            province_name = get_content(
                block.xpath(
                    'div[@class="til"]/div/p[not(@class="til_num")]/text()').
                extract())
            province_id = ProvinceItem.get_id_by_name(province_name)

            for entry in block.xpath('ul[@class="til_cn"]/li'):
                item = WentiItem()
                item['name'] = get_content(entry.xpath('a/text()').extract())

                # Drop trailing slashes, then keep what follows the last
                # remaining slash; an empty result means "no pin available".
                purl = get_content(entry.xpath('a/@purl').extract()) or ''
                pin = purl.rstrip('/').rsplit('/', 1)[-1]
                if pin:
                    item['pin'] = pin

                item['province_id'] = province_id
                item['event_category'] = self.get_event_category_by_classname(
                    get_content(entry.xpath('i/@class').extract()))

                item_list.append(item)

        return item_list
Example #3
0
    def parse(self, response):
        """Turn each row of the ``wtpt`` table into a ``WentiItem``.

        Rows come from ``//div[@class="wtpt"]/div/table/tbody/tr`` and are
        read positionally from their ``<td>`` cells:

        ====  ====================
        cell  item field
        ====  ====================
        1     name
        2     problem_time
        3     launch_time
        4     registered_capital
        5     province (name -> id)
        6     accounted_revenue
        7     involved_passenger
        8     event_category
        ====  ====================

        A row that does not have all nine cells is ignored rather than
        allowed to raise ``IndexError`` and lose every other row.

        Returns:
            list: items for which ``get_uk()`` is truthy.
        """
        # Logger-side formatting: the URL is passed as an argument.
        self.logger.info('Parsing Wangjia Problem Platform From <%s>.',
                         response.url)

        last_cell_index = 8

        platform_list = []
        platforms = response.xpath('//div[@class="wtpt"]/div/table/tbody/tr')
        for rt in platforms:
            tds = rt.xpath('td')
            if len(tds) <= last_cell_index:
                # Header/spacer row: nothing to extract.
                continue

            # NOTE(review): exclude=('-') passes the string '-', not a tuple;
            # kept as-is since get_content's contract is not visible here.
            item = WentiItem()
            item['name'] = get_content(tds[1].xpath('.//text()').extract())
            item['problem_time'] = get_content(
                tds[2].xpath('text()').extract(), exclude=('-'))
            item['launch_time'] = get_content(
                tds[3].xpath('text()').extract(), exclude=('-'))
            item['registered_capital'] = get_content(
                tds[4].xpath('text()').extract(), exclude=('-'))

            # Leave 'province_id' absent when the name cannot be resolved.
            province_id = ProvinceItem.get_id_by_name(
                get_content(tds[5].xpath('text()').extract()))
            if province_id is not None:
                item['province_id'] = province_id

            item['accounted_revenue'] = get_content(
                tds[6].xpath('text()').extract(), exclude=('-'))
            item['involved_passenger'] = get_content(
                tds[7].xpath('text()').extract(), exclude=('-'))
            item['event_category'] = get_content(
                tds[8].xpath('text()').extract(), exclude=('-'))

            # Keep only items that yield a unique key.
            if item.get_uk():
                platform_list.append(item)

        return platform_list
Example #4
0
    def parse(self, response):
        """Scrape the ``issuePlatList`` section into ``WentiItem`` objects.

        Each ``rnav*`` div groups the platforms of one province: the province
        name is read from the ``til`` header and resolved with
        ``ProvinceItem.get_id_by_name``; every ``<li>`` of the ``til_cn`` list
        then yields one item with ``name``, ``pin``, ``province_id`` and
        ``event_category``.

        ``pin`` is the last non-empty path segment of the link's ``purl``
        attribute. If that attribute is missing, empty or made only of
        slashes, ``pin`` is left unset; previously this case raised
        ``IndexError`` and discarded the whole page.

        Returns:
            list: the collected items, in document order.
        """
        item_list = []
        for sel_ct in response.xpath('//div[@id="issuePlatList"]/div[starts-with(@class, "rnav")]'):
            name_nodes = sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()')
            province_id = ProvinceItem.get_id_by_name(get_content(name_nodes.extract()))

            for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                item = WentiItem()
                item['name'] = get_content(sel_pt.xpath('a/text()').extract())

                # Trailing slashes are ignored; the pin is whatever follows
                # the last remaining slash.
                raw_purl = get_content(sel_pt.xpath('a/@purl').extract()) or ''
                pin = raw_purl.rstrip('/').rsplit('/', 1)[-1]
                if pin:
                    item['pin'] = pin

                item['province_id'] = province_id
                class_name = get_content(sel_pt.xpath('i/@class').extract())
                item['event_category'] = self.get_event_category_by_classname(class_name)

                item_list.append(item)

        return item_list
Example #5
0
    def parse(self, response):
        """Build ``DaohangItem`` objects from one of three response shapes.

        * URL ending in ``html``: the ``platList`` section of the page is
          walked province by province; each ``<li>`` yields ``pin`` (last
          non-empty segment of the link's ``purl`` attribute), ``name``,
          ``link`` and ``province_id``.
        * URL ending in ``json``: the body is a JSON sequence of objects
          providing ``platPin``, ``allPlatPin``, ``platName`` and ``platUrl``.
        * Any other URL: the body is a JSON sequence of per-city objects,
          each with a ``city`` name and a ``platList`` of platforms; entries
          without a ``city`` are skipped.

        Compared with a naive walk, two inputs no longer abort the page: a
        missing/slash-only ``purl`` (``pin`` is then left unset) and a city
        entry whose ``platList`` is missing or null (treated as empty).

        Returns:
            list: the collected ``DaohangItem`` objects.
        """
        item_list = []
        if response.url.endswith('html'):
            # Regular platforms only. The problematic-platform section
            # (//div[@id="issuePlatList"]) has the same layout but is
            # temporarily not scraped here; its <a href> links are invalid.
            content = response.xpath(
                '//div[@id="platList"]/div[starts-with(@class, "rnav")]')
            for sel_ct in content:
                province_name = get_content(
                    sel_ct.xpath(
                        'div[@class="til"]/div/p[not(@class="til_num")]/text()'
                    ).extract())
                province_id = ProvinceItem.get_id_by_name(province_name)

                for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                    daohang = DaohangItem()

                    # Ignore trailing slashes and keep the final segment.
                    purl = get_content(
                        sel_pt.xpath('a/@purl').extract()) or ''
                    pin = purl.rstrip('/').rsplit('/', 1)[-1]
                    if pin:
                        daohang['pin'] = pin

                    daohang['name'] = get_content(
                        sel_pt.xpath('a/text()').extract())
                    daohang['link'] = get_content(
                        sel_pt.xpath('a/@href').extract())
                    daohang['province_id'] = province_id

                    item_list.append(daohang)
        else:
            # NOTE(review): body_as_unicode() is deprecated in newer Scrapy
            # releases in favour of response.text; kept because the Scrapy
            # version this spider targets is not visible here.
            content = json.loads(response.body_as_unicode())
            if response.url.endswith('json'):
                for ct in content:
                    daohang = DaohangItem()
                    daohang['pin'] = ct.get('platPin')
                    daohang['allPin'] = ct.get('allPlatPin')
                    daohang['name'] = ct.get('platName')
                    daohang['link'] = ct.get('platUrl')

                    item_list.append(daohang)
            else:
                for ct in content:
                    city = ct.get('city')
                    if not city:
                        continue

                    province_id = ProvinceItem.get_id_by_name(city)
                    # A missing or null 'platList' simply contributes nothing.
                    for pt in ct.get('platList') or []:
                        daohang = DaohangItem()
                        daohang['pin'] = pt.get('platLetter')
                        daohang['name'] = pt.get('platName')
                        daohang['link'] = pt.get('platUrl')
                        daohang['province_id'] = province_id
                        daohang['launch_time'] = pt.get('onlineDateStr')
                        daohang['icon_url'] = pt.get('platIconUrl')

                        item_list.append(daohang)

        return item_list
Example #6
0
    def parse(self, response):
        """Return ``DaohangItem`` objects parsed from an HTML or JSON response.

        The branch is chosen from the response URL:

        * ``...html`` -- scrape ``//div[@id="platList"]``: one ``rnav*`` div
          per province, one ``<li>`` per platform (``pin``, ``name``,
          ``link``, ``province_id``).
        * ``...json`` -- decode the body as JSON and map ``platPin``,
          ``allPlatPin``, ``platName`` and ``platUrl`` of every entry.
        * anything else -- decode the body as JSON grouped by ``city``; each
          group's ``platList`` yields ``pin``, ``name``, ``link``,
          ``province_id``, ``launch_time`` and ``icon_url``. Groups without
          a ``city`` are ignored.

        Robustness: an empty or slash-only ``purl`` attribute leaves ``pin``
        unset, and a missing/null ``platList`` is treated as an empty list;
        both previously raised and discarded the whole response.

        Returns:
            list: the collected ``DaohangItem`` objects.
        """
        item_list = []

        if response.url.endswith('html'):
            # Only regular platforms are scraped. The problematic-platform
            # section (//div[@id="issuePlatList"]) shares this layout but is
            # temporarily disabled; its <a href> links are invalid.
            for sel_ct in response.xpath('//div[@id="platList"]/div[starts-with(@class, "rnav")]'):
                name_nodes = sel_ct.xpath('div[@class="til"]/div/p[not(@class="til_num")]/text()')
                province_id = ProvinceItem.get_id_by_name(get_content(name_nodes.extract()))

                for sel_pt in sel_ct.xpath('ul[@class="til_cn"]/li'):
                    daohang = DaohangItem()

                    # The pin is the last non-empty '/'-separated segment.
                    raw_purl = get_content(sel_pt.xpath('a/@purl').extract()) or ''
                    pin = raw_purl.rstrip('/').rsplit('/', 1)[-1]
                    if pin:
                        daohang['pin'] = pin

                    daohang['name'] = get_content(sel_pt.xpath('a/text()').extract())
                    daohang['link'] = get_content(sel_pt.xpath('a/@href').extract())
                    daohang['province_id'] = province_id

                    item_list.append(daohang)

            return item_list

        # NOTE(review): body_as_unicode() is deprecated in newer Scrapy
        # releases (response.text replaces it); kept because the targeted
        # Scrapy version cannot be determined from this method.
        content = json.loads(response.body_as_unicode())

        if response.url.endswith('json'):
            for ct in content:
                daohang = DaohangItem()
                daohang['pin'] = ct.get('platPin')
                daohang['allPin'] = ct.get('allPlatPin')
                daohang['name'] = ct.get('platName')
                daohang['link'] = ct.get('platUrl')

                item_list.append(daohang)

            return item_list

        for ct in content:
            city = ct.get('city')
            if not city:
                continue

            province_id = ProvinceItem.get_id_by_name(city)
            # Tolerate a city entry that carries no platform list.
            for pt in ct.get('platList') or []:
                daohang = DaohangItem()
                daohang['pin'] = pt.get('platLetter')
                daohang['name'] = pt.get('platName')
                daohang['link'] = pt.get('platUrl')
                daohang['province_id'] = province_id
                daohang['launch_time'] = pt.get('onlineDateStr')
                daohang['icon_url'] = pt.get('platIconUrl')

                item_list.append(daohang)

        return item_list