def parse(self, response):
    """Parse the Cathay Lujiazui (陆家嘴国泰) product listing page.

    The extracted cells before the first "人寿保险" marker are the on-sale
    section; cells from that marker up to one before the first "zip" marker
    are the off-sale section. Yields one ProjectInsuranceScrapItem per
    product cell containing "国泰".
    """
    cells = response.css('#a2 font , .aproName').extract()
    # First "人寿保险" cell separates on-sale from off-sale; the "zip"
    # marker (minus one) ends the off-sale section.
    split_at = shan.which(shan.str_detect("人寿保险", cells))[0]
    end_at = shan.which(shan.str_detect("zip", cells))[0] - 1
    on_sale = shan.str_keep('国泰', cells[:split_at])
    off_sale = shan.str_keep('国泰', cells[split_at:end_at])
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for cell in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '陆家嘴国泰'
            item['product_name'] = shan.str_extract('>(.*?)</a>', cell)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "http://www.cathaylife.cn"
                + shan.str_extract('href="(.*)"', cell))
            yield item
def parse(self, response):
    """Parse the Heng An Standard (恒安标准) product listing page.

    Anchors between the "在售" header and the "停售" header are on-sale;
    anchors after the "停售" header are off-sale. Yields one
    ProjectInsuranceScrapItem per product link.
    """
    rows = response.css('.list_ul a , .list_title').extract()
    on_start = shan.which(shan.str_detect("在售", rows))[0] + 1
    off_header = shan.which(shan.str_detect("停售", rows))[0]
    sections = (
        (rows[on_start:off_header], "在售"),
        (rows[off_header + 1:], "停售"),
    )
    for section, status in sections:
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '恒安标准'
            item['product_name'] = shan.str_extract('>(.*?)</a>', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)" target=', row)
            yield item
def parse(self, response):
    """Parse the Taiping Pension (太平养老) product table.

    Table rows before the *last* row matching "健康保险" are on-sale; that
    row and all later rows are off-sale. Yields one
    ProjectInsuranceScrapItem per row containing "太平".
    """
    rows = response.css('tr').extract()[1:]  # drop the header row
    hits = shan.which(shan.str_detect("健康保险", rows))
    boundary = hits[len(hits) - 1]  # last hit splits the two sections
    on_sale = shan.str_keep('太平', rows[:boundary])
    off_sale = shan.str_keep('太平', rows[boundary:])
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '太平养老'
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = shan.str_extract(
                'href="(.*)"', row)
            yield item
def parse(self, response):
    """Parse the ERGO China (德华安顾) product table.

    The third '.product_right_content' node is split on '<tr>'. Rows from
    the "在售" marker through the "停售" marker are on-sale; later rows are
    off-sale. Contract PDFs and official-report archives (.rar or .zip)
    are rebuilt from "dhag…" path fragments.
    """
    block = response.css('.product_right_content').extract()[2]
    rows = re.split('<tr>', block)
    on_at = shan.which(shan.str_detect("在售", rows))[0]
    off_at = shan.which(shan.str_detect("停售", rows))[0]
    on_sale = shan.str_keep('德华', rows[on_at:off_at + 1])
    off_sale = shan.str_keep('德华', rows[off_at + 1:])
    base = "https://www.ergo-life.cn/dhag"
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '德华安顾'
            item['product_name'] = shan.str_extract('<td>(.*?)</td>', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                base + shan.str_extract("dhag(.*)pdf", row) + "pdf")
            # Reports are shipped as either .rar or .zip archives.
            ext = "rar" if "rar" in row else "zip"
            item['product_official_report_list'] = (
                base + shan.str_extract("dhag(.*)" + ext, row) + ext)
            yield item
def parse(self, response):
    """Parse the ICBC-AXA (工银安盛) product listing page.

    Cells between the first "在售" marker and the first "停售" marker are
    on-sale; cells from the "停售" marker to the *second* "在售" marker are
    off-sale. Yields one ProjectInsuranceScrapItem per product anchor.

    Bug fix: the on-sale slice start was written as
    ``shan.which(shan.str_detect("在售", result)[0])`` — the ``[0]`` indexed
    the detect vector *before* ``which`` ran, unlike every sibling spider
    in this file. Corrected to ``shan.which(shan.str_detect(...))[0]``.
    """
    cells = response.css("p , #content a , .h2_title1").extract()[2:]
    on_hits = shan.which(shan.str_detect("在售", cells))
    off_hits = shan.which(shan.str_detect("停售", cells))
    # Product anchors carry this inline style; headers/prose do not.
    on_sale = shan.str_keep('style="color:#626263;"',
                            cells[on_hits[0]:off_hits[0]])
    off_sale = shan.str_keep('style="color:#626263;"',
                             cells[off_hits[0]:on_hits[1]])
    for section, status in ((on_sale, '在售'), (off_sale, '停售')):
        for cell in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '工银安盛'
            item['product_type'] = ''
            item['product_name'] = shan.str_extract(">(.*?)</a>", cell)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "www.icbc-axa.com"
                + shan.str_extract('href="(.*?)pdf', cell) + "pdf")
            item['product_price_link'] = ''
            item['product_start_date'] = ''
            item['product_end_date'] = ''
            yield item
def parse(self, response):
    """Parse the BoComm Life (交银康联) product table.

    Keeps only rows mentioning "险", splits them at the "在售"/"停售"
    markers, and yields one ProjectInsuranceScrapItem per "交银" row.
    Rider products are prefixed "附加交银", plain products "交银".

    Bug fix: the off-sale loop extracted rider names with the pattern
    ``'附加交银(.*?)<'`` while the on-sale loop used ``'附加交银(.*?)</'``;
    the lone ``<`` stops at the first inline tag and can truncate the
    name. Both branches now use the ``</`` pattern.
    """
    rows = shan.str_keep("险", response.css("tr").extract())
    on_at = shan.which(shan.str_detect("在售", rows))[0]
    off_at = shan.which(shan.str_detect("停售", rows))[0]
    on_sale = shan.str_keep('交银', rows[on_at:off_at])
    off_sale = shan.str_keep('交银', rows[off_at:])
    for section, status in ((on_sale, '在售'), (off_sale, '停售')):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '交银康联'
            # Riders carry the longer "附加交银" prefix; match it first.
            if "附加交银" in row:
                item['product_name'] = "附加交银" + shan.str_extract(
                    '附加交银(.*?)</', row)
            else:
                item['product_name'] = "交银" + shan.str_extract(
                    '交银(.*?)</', row)
            item['product_sale_status'] = status
            item['product_contract_link'] = (
                "www.bocommlife.com"
                + shan.str_extract('href="(.*?)">', row))
            yield item
def parse(self, response):
    """Parse the Hexie Health (和谐健康) on-sale product catalogue.

    The raw page text is split on "tr"; the chunks between the first and
    second "在售产品目录及条款" markers form the catalogue. Every listed
    product is on-sale.
    """
    chunks = re.split("tr", response.text)
    marks = shan.which(shan.str_detect("在售产品目录及条款", chunks))
    catalogue = shan.str_keep('和谐', chunks[marks[0]:marks[1]])
    for chunk in catalogue[1:]:  # skip the section heading itself
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '和谐健康'
        item['product_name'] = "和谐" + shan.str_extract(
            '和谐(.*?)</span>', chunk)
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = (
            "http://www.hexiehealth.com/docs"
            + shan.str_extract('/docs(.*?)pdf"', chunk) + "pdf")
        yield item
def parse(self, response):
    """Parse the Happy Life (幸福人寿) product listing.

    Paragraphs between the "表一" and "表二" markers are on-sale; those
    from "表三" onward are off-sale. Some on-sale names embed an extra
    "(<font …>)" fragment which is stripped before the "幸福" prefix is
    re-attached.
    """
    paragraphs = response.css('P').extract()
    on_start = shan.which(shan.str_detect("表一", paragraphs))[0]
    on_stop = shan.which(shan.str_detect("表二", paragraphs))[0]
    off_start = shan.which(shan.str_detect("表三", paragraphs))[0]
    # Keep only rows mentioning "险" and drop each section's heading row.
    on_sale = shan.str_keep('险', paragraphs[on_start:on_stop])[1:]
    off_sale = shan.str_keep('险', paragraphs[off_start:])[1:]
    for para in on_sale:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '幸福人寿'
        item['product_sale_status'] = '在售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', para)
        raw_name = shan.str_extract('幸福(.*?)</a>', para)
        # Strip the embedded "(<font …>)" annotation when present.
        if "(<" in raw_name:
            item['product_name'] = "幸福" + shan.str_extract(
                ')(.*?)</font>', raw_name)
        else:
            item['product_name'] = "幸福" + raw_name
        yield item
    for para in off_sale:
        item = ProjectInsuranceScrapItem()
        item['company_name'] = '幸福人寿'
        item['product_name'] = "幸福" + shan.str_extract(
            '幸福(.*?)</a>', para)
        item['product_sale_status'] = '停售'
        item['product_contract_link'] = shan.str_extract(
            'href="(.*?)">', para)
        yield item
def first_parse(self, response):
    """Split rows into on-/off-sale sections and queue the next page.

    Bug fix: ``scrapy.Request`` requires ``url`` to be a single string;
    the original passed the whole ``urls`` list, which raises TypeError
    when the request is constructed. Now passes ``urls[0]``.
    """
    result = response.css("tr").extract()
    # NOTE(review): zs_result/ts_result are computed but never used in this
    # method — presumably meant to be forwarded to second_parse via
    # Request.meta; confirm against second_parse before removing.
    zs_result = result[shan.which(shan.str_detect("在售", result))[0]:
                       shan.which(shan.str_detect("停售", result))[0]]
    zs_result = shan.str_keep("(寿|保)险", zs_result)
    ts_result = result[shan.which(shan.str_detect("停售", result))[0]:
                       len(result)]
    ts_result = shan.str_keep("(寿|保)险", ts_result)
    urls = [
        'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-1.html'
    ]
    # Browser-like headers copied from a real session; presumably the site
    # rejects bare requests — TODO confirm which headers are required.
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'UM_distinctid=16bf88a1698404-01cafbdd88510b-37647e05-13c680-16bf88a169975c; CNZZDATA1274208563=1695324027-1563242927-%7C1563352631',
        'Host': 'www.sunlife-everbright.com',
        'If-Modified-Since': 'Wed, 03 Jul 2019 07:18:28 GMT',
        'If-None-Match': "4a89-58cc1aaf03900-gzip",
        'Referer': 'http://www.sunlife-everbright.com/sleb/info/jbxx/cpjbxx/cpxxp/468a89fa-10.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36',
    }
    # url must be a single string, not the list itself.
    yield scrapy.Request(url=urls[0], headers=header,
                         callback=self.second_parse)
def zts_parse(self, response):
    """Mark the item carried in ``response.meta`` as on-sale or off-sale.

    Bug fix: the original condition was
    ``shan.str_extract('href="(.*?)pdf', part) in zs_result`` — ``in`` on a
    list tests element *equality*, and an extracted href fragment can never
    equal a full ``<tr>…</tr>`` row string, so the branch was always False
    and every item was marked "停售". Each ``part`` is itself an element of
    the extracted row list, so the correct membership test is
    ``part in zs_result``.
    """
    rows = response.css('tr').extract()[1:]  # drop the header row
    product_rows = shan.str_keep('险', rows)
    # Rows between the "在售" and "停售" markers form the on-sale section.
    on_sale = rows[shan.which(shan.str_detect("在售", rows))[0]:
                   shan.which(shan.str_detect("停售", rows))[0]]
    item = response.meta['item']
    # NOTE(review): the same item instance is mutated and yielded once per
    # row, so downstream consumers see the last-written status repeated —
    # confirm whether a fresh/copied item per row was intended.
    for row in product_rows:
        if row in on_sale:
            item['product_sale_status'] = '在售'
        else:
            item['product_sale_status'] = '停售'
        yield item
def parse(self, response):
    """Parse the Zhongrong Life (中融人寿) product table.

    Rows between the "在售" and "停售" markers are on-sale; later rows are
    off-sale. Each row is split on '</td>' and, depending on whether it has
    7 or 6 cells, four document links are pulled from fixed cell positions.
    Rows with any other cell count get no link fields.
    """
    rows = response.css('tr').extract()
    on_at = shan.which(shan.str_detect("在售", rows))[0]
    off_at = shan.which(shan.str_detect("停售", rows))[0]
    on_sale = shan.str_keep('中融', rows[on_at:off_at])
    off_sale = shan.str_keep('中融', rows[off_at:])
    # NOTE: 'prodcct_law_response_link' is the field name declared for this
    # item (typo and all) — do not "fix" it here.
    link_keys = ('product_official_report_list', 'product_contract_link',
                 'product_chief_actuary_claim_link',
                 'prodcct_law_response_link')
    for section, status in ((on_sale, "在售"), (off_sale, "停售")):
        for row in section:
            item = ProjectInsuranceScrapItem()
            item['company_name'] = '中融人寿'
            item['product_name'] = '中融' + shan.str_extract(
                '中融(.*?)</td>', row)
            item['product_sale_status'] = status
            cells = re.split('</td>', row)
            # 7-cell rows carry an extra column between the contract and
            # actuary-claim links; 6-cell rows do not.
            if len(cells) == 7:
                positions = (1, 2, 4, 5)
            elif len(cells) == 6:
                positions = (1, 2, 3, 4)
            else:
                positions = ()
            for key, pos in zip(link_keys, positions):
                item[key] = shan.str_extract(
                    'href="(.*?)" target', cells[pos])
            yield item