Python stripの例、product_spider.utils.functions.strip Pythonの例

コード例 #1

0

ファイルを表示

ファイル: chemimpex.py プロジェクト: Pandaaaa906/product_spider

    def parse(self, response):
        """Crawl a listing page: sub-category links, product links, next page.

        Every yielded ``Request`` carries a ``parent`` entry in its ``meta``:
        the link text for sub-category requests, otherwise the value inherited
        from the current response.
        """
        def inherited_meta():
            # Looked up lazily so ``response.meta`` is only read when a request is built.
            return {'parent': response.meta.get('parent')}

        # Anchors under "count-box" whose text, with spaces removed, is not "0".
        category_links = response.xpath(
            '//div[@class="count-box"]/a[translate(normalize-space(text())," ","")!="0"]'
        )
        for link in category_links:
            href = strip(link.xpath('./@href').get())
            if href:
                label = strip(link.xpath('./text()').get())
                yield Request(urljoin(self.base_url, href), callback=self.parse, meta={'parent': label})

        for product_href in response.xpath('//h3[@class="prodname"]/a/@href').getall():
            yield Request(
                urljoin(self.base_url, product_href),
                callback=self.parse_detail,
                meta=inherited_meta(),
            )

        following_page = strip(response.xpath(
            '//span[@class="selectedpage"]/../following-sibling::li/a[not(parent::li/span)]/text()'
        ).get())
        if not following_page:
            return
        # Keep everything before the first "?" and rebuild the query with the new page value.
        page_base = response.url.split('?')[0]
        query = urlencode({
            'custguid': '',
            'custclsid': '',
            'pn': following_page,
        })
        yield Request(f'{page_base}?{query}', callback=self.parse, meta=inherited_meta())

コード例 #2

0

ファイルを表示

    def parse_detail(self, response):
        """Emit the product record of a detail page, then one record per package row.

        Yields:
            A ``RawData`` item built from the labelled ``<th>``/``<td>`` pairs,
            followed by a ``ProductPackage`` for every row matched under the
            ``table-1`` container.
        """
        labelled_cell = '//th[contains(text(), {!r})]/following-sibling::td/text()'

        def cell(label):
            # First text node of the <td> that follows the <th> containing ``label``.
            return response.xpath(labelled_cell.format(label)).get()

        image_src = response.xpath('//div[@class="pic"]/img/@src').get()
        cat_no = response.xpath('//div/span[@style]/text()').get()
        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//div/span/@data-nameen').get(),
            cas=cell("CAS:"),
            mdl=cell("MDL:"),
            mf=formula_trans(strip(cell("分子式:"))),
            mw=cell("分子量:"),
            smiles=cell("SMILES code:"),
            purity=cell("化学纯度:"),
            # Stays falsy when the page has no picture.
            img_url=image_src and urljoin(response.url, image_src),
            prd_url=response.url,
        )

        for tr in response.xpath('//div[@class="table-1"]//tbody/tr'):
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[1]/text()').get(),
                price=strip(tr.xpath('./td[2]/text()').get()),
                stock_num=tr.xpath('./td[5]/text()').get(),
                currency='RMB',
            )

コード例 #3

0

ファイルを表示

ファイル: o2si_spider.py プロジェクト: Pandaaaa906/product_spider

    def parse_list(self, response):
        """Queue a detail request for every listed product, then follow pagination.

        Each detail request carries ``parent``, ``cat_no``, ``en_name`` and
        ``package`` in its ``meta``. Rows without a product link are skipped.
        """
        parent = response.meta.get('parent')
        for row in response.xpath('//li/form'):
            rel_url = row.xpath('.//span[@class="title"]/a/@href').get()
            if not rel_url:
                # urljoin(base, None) returns the base URL unchanged, which would
                # hand the site root to parse_detail; skip the row instead.
                continue

            yield Request(
                urljoin(self.base_url, rel_url),
                callback=self.parse_detail,
                meta={
                    'parent': parent,
                    'cat_no': strip(row.xpath('./span[@class="number"]/text()').get()),
                    'en_name': strip(row.xpath('./span[@class="title"]/a/text()').get()),
                    'package': strip(row.xpath('./span[@class="size"]/text()').get()),
                })

        # Link that follows the anchor marked "current" in the pager.
        next_page = response.xpath(
            '//div[contains(text(),"Page")]/a[contains(@class,"current")]/following-sibling::a/@href'
        ).get()
        if next_page:
            yield Request(urljoin(self.base_url, next_page),
                          callback=self.parse_list,
                          meta={'parent': parent})

コード例 #4

0

ファイルを表示

ファイル: chemimpex.py プロジェクト: Pandaaaa906/product_spider

 def parse_table(self, response):
     """Merge SKU-table fields into the product info carried in ``meta`` and emit it.

     ``meta['prd_info']`` (an empty dict when absent) supplies the base fields;
     ``info3``, ``info4`` and ``stock_info`` are read from this response.
     """
     def text_of(xpath):
         # First match of ``xpath``, passed through ``strip``.
         return strip(response.xpath(xpath).get())

     sku_fields = dict(
         info3=text_of('//td[@class="skusize"]/text()'),
         info4=text_of('//span[@class="price"]/text()'),
         stock_info=text_of('//span[contains(@class, "stockstatus")]/text()'),
     )
     base_fields = response.meta.get('prd_info', {})
     yield RawData(**base_fields, **sku_fields)

コード例 #5

0

ファイルを表示

ファイル: alta_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build the product record from labelled table cells, then the package rows.

     Any field whose value is exactly ``'NA'`` is replaced by ``None`` before
     the ``RawData`` item is emitted. One ``ProductPackage`` follows for every
     row of the ``c_p_size`` table that has cells and a cell text other than
     ``"NA"``.
     """
     cell_xpath = 'normalize-space(//td[contains(div/text(), {!r})]/following-sibling::td/text())'

     def field(label):
         # Whitespace-normalised text of the cell next to the labelled one.
         return strip(response.xpath(cell_xpath.format(label)).get())

     image_src = response.xpath('//div[@class="c_c_p"]//div/img/@src').get()
     cat_no = field("产品号/Catalog#")
     record = {
         'brand': self.brand,
         'parent': response.meta.get('parent'),
         'cat_no': cat_no,
         'en_name': field("Product Name："),
         'chs_name': field("产品名称："),
         'cas': field("CAS#："),
         'mf': field("分子式/Formula："),
         'mw': field("分子量/MW："),
         'purity': field("纯度/Purity (%)："),
         'info1': field("Synonyms："),
         'info2': field("储藏条件/Storage："),
         'appearance': field("颜色/Color："),
         'img_url': image_src and urljoin(response.url, image_src),
         'prd_url': response.url,
     }
     cleaned = {key: (value if value != 'NA' else None) for key, value in record.items()}
     yield RawData(**cleaned)

     size_rows = response.xpath(
         '//table[@class="c_p_size"]//tr[td and td/text()!="NA"]')
     for size_row in size_rows:
         # NOTE(review): 'price' reads the same cell as 'package' (td[1]);
         # looks like a copy-paste slip - confirm the intended column.
         yield ProductPackage(
             brand=self.brand,
             cat_no=cat_no,
             package=size_row.xpath('./td[1]/text()').get(),
             price=size_row.xpath('./td[1]/text()').get(),
         )

コード例 #6

0

ファイルを表示

    def detail_parse(self, response):
        """Emit the product record, then one package per SKU in the embedded matrix.

        The package data comes from the JSON object assigned to
        ``matrixChildrenProducts`` in the page source; when that assignment is
        not found only the ``RawData`` item is produced.
        """
        cell_texts = '//th[contains(text(),{0!r})]/following-sibling::td/descendant-or-self::text()'

        def first_text(label):
            # First text node in the cell next to the labelled header.
            return response.xpath(cell_texts.format(label)).get()

        def all_text(label):
            # Every text node of that cell, concatenated.
            return "".join(response.xpath(cell_texts.format(label)).extract())

        structure_src = response.xpath(
            '//th[contains(text(),"Structure")]/following-sibling::td/img/@src'
        ).get()
        cat_no = strip(first_text("Product No."))
        title_parts = response.xpath(
            '//div[@class="product-name"]/span/descendant-or-self::text()'
        ).extract()
        unit_price = response.xpath(
            '//table[@id="product-matrix"]//td[@class="unit-price"]/text()'
        ).get()
        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            parent=first_text("Category"),
            info1=all_text("Synonym(s)"),
            mw=first_text("Molecular Weight"),
            mf=all_text("Formula"),
            cas=first_text("CAS Number"),
            en_name=strip("".join(title_parts)),
            img_url=structure_src and urljoin(self.base_url, structure_src),
            stock_info=unit_price,
            prd_url=response.url,
        )

        candidates = re.findall(r'var matrixChildrenProducts = ({.+});', response.text)
        matrix = first(candidates, None)
        if not matrix:
            return
        sku_prefix = f'{cat_no}-'
        for item in json.loads(matrix).values():
            sku = item.get('sku')
            if not sku:
                continue
            # The package label is the SKU with the "<cat_no>-" part removed.
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                cat_no_unit=sku,
                package=strip(sku.replace(sku_prefix, '')),
                price=item.get('price'),
                currency='USD',
                delivery_time='In-stock' if item.get('is_in_stock') else None,
            )

コード例 #7

0

ファイルを表示

ファイル: synthose_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse(self, response):
     """Follow every leaf link of the ``lnav`` navigation to ``parse_list``.

     The link text is forwarded as ``meta['parent']``. Anchors without an
     ``href`` are skipped.
     """
     # Only <li> elements that have no nested <ul>.
     a_nodes = response.xpath('//div[@id="lnav"]//li[not(child::ul)]/a')
     for a in a_nodes:
         rel_url = strip(a.xpath('./@href').get())
         if not rel_url:
             # urljoin(base, None/'') returns the base URL, which would queue
             # the site root as if it were a listing page.
             continue
         parent = strip(a.xpath('./text()').get())
         yield Request(urljoin(self.base_url, rel_url),
                       callback=self.parse_list,
                       meta={'parent': parent})

コード例 #8

0

ファイルを表示

ファイル: paitai_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Read the labelled table cells of a detail page into one ``RawData`` item.

     The English name doubles as ``cat_no``; it is taken from the first
     paragraph of the "Product Name" cell, falling back to the cell's first
     span text.
     """
     span_texts = '//td[contains(./span/text(), {!r})]/following-sibling::td//span//text()'
     nth_paragraph = '//td[contains(./span/text(), {!r})]/following-sibling::td/p[{}]/span/text()'

     def first_span(label):
         return strip(response.xpath(span_texts.format(label)).get())

     def joined_spans(label):
         return strip(''.join(response.xpath(span_texts.format(label)).getall()))

     def paragraph(label, index):
         return strip(response.xpath(nth_paragraph.format(label, index)).get())

     en_name = paragraph("Product Name", 1) or first_span("Product Name")
     image_src = response.xpath(
         '//div[contains(@class, "slick-slide")][1]/a/img/@src').get()
     yield RawData(
         brand=self.brand,
         cat_no=en_name,
         en_name=en_name,
         chs_name=paragraph("Product Name", 2),
         cas=first_span("Cas No."),
         info1=first_span("Sequence"),
         mf=joined_spans("Molecular Formula"),
         mw=first_span("Molar Mass"),
         purity=joined_spans("Purity"),
         info2=first_span("Storage Temperature"),
         img_url=image_src,
         prd_url=response.url,
     )

コード例 #9

0

ファイルを表示

ファイル: chemimpex.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Collect product fields, then hand them to the SKU-list request if possible.

     When the page source contains a ``push({...});`` call, its JSON argument
     supplies ``param1``..``param6`` for the SKU-list URL and the collected
     fields travel in ``meta['prd_info']``. Otherwise the fields are emitted
     directly as ``RawData``.
     """
     span_value = '//span[contains(text(), {!r})]/following-sibling::span//text()'

     def first_value(label):
         return strip(response.xpath(span_value.format(label)).get())

     def joined_values(label, separator):
         return strip(separator.join(response.xpath(span_value.format(label)).getall()))

     name_parts = response.xpath('//h1[@itemprop="name"]//text()[not(parent::span)]').getall()
     product = {
         'brand': 'chemimpex',
         'parent': response.meta.get('parent'),
         # The catalog number is taken as-is, without strip().
         'cat_no': response.xpath(span_value.format("Catalog Number:")).get(),
         'en_name': strip(''.join(name_parts)),
         'purity': strip(response.xpath('//h1[@itemprop="name"]/span[@style]/text()').get()),
         'mf': joined_values('Molecular Formula:', ''),
         'mw': first_value('Molecular Weight:'),
         'cas': first_value('CAS No:'),
         'appearance': first_value('Appearance:'),
         'info1': joined_values('Synonyms:', ';'),
         'info2': first_value('Storage Temp:'),
         'img_url': strip(response.xpath('//div[@id="catalog_content"]/img/@src').get()),
         'prd_url': response.url,
     }
     match = re.search(r'push\(({.+\})\);', response.text)
     if match:
         payload = json.loads(match.group(1))
         params = [payload.get(f'param{i}', '') for i in range(1, 7)]
         url = 'https://www.chemimpex.com/Widgets-product/gethtml_skulist/{}/{}/{}/{}/{}/{}'.format(*params)
         yield Request(url, callback=self.parse_table, meta={'prd_info': product})
     else:
         yield RawData(**product)

コード例 #10

0

ファイルを表示

 def detail_parse(self, response):
     """Collect the product fields of a detail page into a single ``RawData`` item."""
     labelled = 'normalize-space(//div[@class="product1_l"]//span[contains(text(), "{}")]/../text())'

     def field(label):
         # Normalised text of the parent of the span containing ``label``.
         return strip(response.xpath(labelled.format(label)).get())

     image_src = response.xpath('//div[@class="product1"]/img/@src').get()
     if image_src:
         img_url = urljoin(self.base_url, image_src)
     else:
         img_url = None
     stock_cell = response.xpath(
         'normalize-space(//div[@class="product2"]//tr[position()>1]/td[4]/text())'
     ).get()
     yield RawData(
         brand="synpharmatech",
         cat_no=field("Cat. No"),
         en_name=strip(response.xpath('//div[@class="product1_l"]//h1/text()').get()),
         info1=field("Synonyms"),
         cas=field("CAS No"),
         mf=field("Formula"),
         mw=field("F.W"),
         purity=field("Purity"),
         # A falsy stripped value is stored as None.
         stock_info=strip(stock_cell) or None,
         prd_url=response.url,
         img_url=img_url,
     )

コード例 #11

0

ファイルを表示

ファイル: cprd_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Read the ``<strong>``-labelled values of a detail page into ``RawData``."""
     after_label = '//strong[contains(text(),{!r})]/following-sibling::text()'

     def field(label):
         # Text node that follows the <strong> containing ``label``.
         return strip(response.xpath(after_label.format(label)).get())

     image_src = response.xpath('//article//a/img/@src').get()
     yield RawData(
         brand="cprd",
         # "parent" is currently not collected:
         # response.xpath('//p[@class="catalogue_number"]/a/text()').get()
         cat_no=field("Catalogue Number:"),
         cas=field("CAS Number:"),
         en_name=field("Chemical Name:"),
         img_url=image_src and urljoin(response.url, image_src),
         mf=field("Molecular Formula:"),
         mw=field("Molecular Weight:"),
         prd_url=response.url,
     )

コード例 #12

0

ファイルを表示

ファイル: o2si_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Emit one ``RawData`` item from the detail page plus the listing's ``meta``.

     ``cat_no``, ``en_name``, ``parent`` and ``package`` come from
     ``response.meta``. ``cas`` is only filled when the page lists exactly one
     non-empty CAS cell; ``info1`` always holds all distinct CAS values joined
     with ``;`` in page order.
     """
     raw_cells = response.xpath('//tr[@class="style17"]/td[3]/text()').getall()
     cas = tuple(value for value in (strip(cell) for cell in raw_cells) if value)
     d = {
         'brand': self.brand,
         'parent': response.meta.get('parent'),
         'cat_no': response.meta.get('cat_no'),
         'en_name': response.meta.get('en_name'),
         'cas': cas[0] if len(cas) == 1 else None,
         # dict.fromkeys de-duplicates while keeping first-seen order; joining
         # a set made the order of the values vary between runs.
         'info1': ';'.join(dict.fromkeys(cas)),
         'info3': response.meta.get('package'),
         'info4': response.xpath(
             '//p[contains(text(), "Price:")]/strong/text()').get(),
         'prd_url': response.url,
     }
     yield RawData(**d)

コード例 #13

0

ファイルを表示

ファイル: payne_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_list(self, response):
     """Request every product linked from the page, tagging it with the page heading.

     The first text found under a ``<strong>`` element is forwarded as
     ``meta['parent']``; the hrefs are requested exactly as they appear.
     """
     heading = strip(response.xpath('//strong//text()').get())
     for href in response.xpath('//div[@class="iproimg"]/a/@href').getall():
         yield Request(href, callback=self.parse_detail, meta={'parent': heading})

コード例 #14

0

ファイルを表示

ファイル: usp_spider.py プロジェクト: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Emit the product record and a single package record for a detail page.

        The package price is the first number (optionally with a decimal part)
        found in the "Retail Price:" cell; the cell text itself is kept in
        ``info``.
        """
        next_cell = '//td[contains(text(), {!r})]/following-sibling::td/text()'

        def cell(label):
            return response.xpath(next_cell.format(label)).get()

        cat_no = cell('Catalog #')
        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//td[@class="pageTitle"]/text()').get(),
            cas=cell('CAS#'),
            stock_info=cell('In Stock'),
            prd_url=response.url,
        )

        raw_price = strip(response.xpath(
            'normalize-space(//td[contains(text(), "Retail Price:")]/following-sibling::td/text())'
        ).get())
        if raw_price:
            # Collapse whitespace runs, then take the first numeric token.
            raw_price = re.sub(r'\s+', ' ', raw_price)
            numbers = (found.group(0) for found in re.finditer(r'(\d+(\.\d+)?)', raw_price))
            price = first(numbers, None)
        else:
            price = None
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            price=price,
            currency='USD',
            info=raw_price,
            delivery_time=cell('In Stock'),
        )

コード例 #15

0

ファイルを表示

    def detail_parse(self, response):
        """Emit the product record and its single package record.

        The catalogue number is the leading ``AAA-123`` style part of the SKU
        (the whole SKU when that pattern does not match). Name and package are
        split at the last ``-`` of the title-cased heading, or of the
        title-cased description when the heading has no ``-``; with neither,
        the package defaults to ``'kit'``.
        """
        sku = response.xpath('//span[@itemprop="sku"]/text()').get("")
        prefix_match = re.match(r'[A-Z]{3}-\d+', sku)
        cat_no = prefix_match.group(0) if prefix_match else sku

        image_src = response.xpath('//img[@class="zoomImg"]/@src').get()
        heading = response.xpath('//h1[@itemprop="name"][1]/text()').get("").title()
        description = response.xpath(
            '//div[@itemprop="description"]/text()').get("").title()

        # First of heading/description that contains a dash, if any.
        dashed = next((text for text in (heading, description) if '-' in text), None)
        if dashed is None:
            en_name, package = heading, 'kit'
        else:
            en_name, package = dashed.rsplit('-', 1)

        yield RawData(
            brand=self.brand,
            parent=self.extract_value(response, "Chemical Family: "),
            cat_no=cat_no,
            en_name=strip(en_name),
            cas=self.extract_value(response, "CAS: "),
            mf=self.extract_value(response, "Chemical Formula: "),
            mw=self.extract_value(response, "Formula Weight: "),
            info2=self.extract_value(response, "Long Term Storage: "),
            appearance=self.extract_value(response, "Appearance: "),
            purity=self.extract_value(response, "Purity: "),
            img_url=image_src and urljoin(self.base_url, image_src),
            prd_url=response.url,
        )

        items_left = response.xpath(
            '//div[@class="items_left"]//em/text()').get()
        package = strip(package)
        yield ProductPackage(
            cat_no_unit=sku,
            brand=self.brand,
            cat_no=cat_no,
            package=package and package.lower(),
            price=response.xpath('//span[@itemprop="price"]/@content').get(),
            currency='USD',
            # First run of digits in the "items left" text, when that text exists.
            stock_num=items_left and first(re.findall(r'\d+', items_left), None),
        )

コード例 #16

0

ファイルを表示

 def parse(self, response):
     """Follow each ``sort-alpha`` link to ``parse_list``, passing its text as ``parent``."""
     for anchor in response.xpath('//a[@class="sort-alpha"]'):
         label = strip(anchor.xpath('./text()').get())
         href = anchor.xpath('./@href').get()
         if href:
             target = urljoin(response.url, href)
             yield Request(target, callback=self.parse_list, meta={'parent': label})

コード例 #17

0

ファイルを表示

 def process_item(self, item, spider):
     """Clean the item's ``cas`` field, mapping placeholder values to ``None``.

     Items whose stripped ``cas`` is not a string are returned untouched;
     otherwise the field is overwritten with the stripped value, or ``None``
     when it is one of ``n/a``, ``na``, ``null`` or empty (case-insensitive).
     """
     placeholders = {'n/a', 'na', 'null', ''}
     adapter = ItemAdapter(item)
     cas = strip(adapter.get('cas'))
     if not isinstance(cas, str):
         # Covers None as well as any other non-string value.
         return item
     if cas.lower() in placeholders:
         adapter['cas'] = None
     else:
         adapter['cas'] = cas
     return item

コード例 #18

0

ファイルを表示

ファイル: medicalisotopes_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Read the labelled cells of a detail page into one ``RawData`` item.

     ``info3`` is the first nested-table cell with any trailing non-breaking
     spaces and ``=`` characters removed; ``info4`` is the second cell.
     """
     next_cell = '//td[contains(text(), {!r})]/following-sibling::td//text()'

     def first_text(label):
         return strip(response.xpath(next_cell.format(label)).get())

     package = strip(
         response.xpath('normalize-space(//td/table//td[1]/text())').get())
     product_name = response.xpath(
         '//th[contains(text(), "Product:")]/following-sibling::th/text()'
     ).get()
     formula_parts = response.xpath(next_cell.format("Formula:")).getall()
     yield RawData(
         brand='medicalisotopes',
         parent=response.meta.get('parent'),
         cat_no=first_text("Catalog Number:"),
         en_name=strip(product_name),
         cas=first_text("CAS Number:"),
         mf=strip(''.join(formula_parts)),
         mw=first_text("Molecular Weight:"),
         info3=package and package.rstrip('\xa0='),
         info4=strip(response.xpath('//td/table//td[2]/text()').get()),
         prd_url=response.url,
     )

コード例 #19

0

ファイルを表示

ファイル: cpachem.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Read the ``<b>``-labelled values of a detail page into one ``RawData`` item.

     ``info4`` holds the price heading text with its leading ``Price:`` label
     removed. A page without a price heading yields ``info4`` unchanged from
     what ``strip`` returns for a missing value instead of raising.
     """
     tmp = '//b[text()={!r}]/following-sibling::text()'

     def field(label):
         # Text node that follows the <b> whose text equals ``label``.
         return strip(response.xpath(tmp.format(label)).get())

     catagory = strip(
         response.xpath(
             '//b[text()="Category:"]/following-sibling::a/text()').get())

     price = strip(
         response.xpath('//h3[contains(text(), "Price:")]/text()').get())
     if price:
         # str.lstrip('Price: ') strips any of those *characters*, so it could
         # eat the start of the value; remove the literal prefix instead.
         prefix = 'Price:'
         if price.startswith(prefix):
             price = price[len(prefix):]
         price = price.lstrip()

     d = {
         'brand': 'cpachem',
         'parent': catagory,
         'cat_no': field("Ref Num:"),
         'en_name': field("Full Name:"),
         'info2': field("Shelf Life on Ship Date:"),
         'info3': field("Vol.:"),
         'info4': price,
         'stock_info': strip(
             response.xpath(
                 '//p[@style="padding:15px 0px 5px 0px;"]/text()').get()),
         'prd_url': response.url,
     }
     yield RawData(**d)

コード例 #20

0

ファイルを表示

    def parse_list(self, response):
        """Queue a package request for every product box, then follow "下一页".

        The fields scraped from each box travel in ``meta['prd_data']`` of the
        request built from ``self.prd_size_url``.
        """
        xp_boxes = response.xpath("//table[@id]//div[@class='PRODUCT_box']")
        for xp_box in xp_boxes:
            div = xp_box.xpath(".//div[2][@class='left_right mulu_text']")
            brand = strip(
                div.xpath(
                    './/li[@id="ctl00_cph_Content_li_lt_Brand"]/text()').get(),
                '')
            rel_url = div.xpath('.//a[@class="name"]/@href').get()
            img_url = div.xpath('.//img/@src').get()
            d = {
                'brand': brand.replace('-', '') or None,
                # Value after the last full-width colon of the first <li>.
                "purity": div.xpath(".//li[1]/text()").get('').split(u"：")[-1].strip(),
                "cas": strip(div.xpath(".//li[2]//a/text()").get()),
                # Default to '' like "purity": a box without a fourth <li> used
                # to raise AttributeError and abort the rest of the page.
                "cat_no": div.xpath(".//li[4]/text()").get('').split(u"：")[-1].strip(),
                "en_name": strip(xp_box.xpath(".//a[@class='name']/text()").get()),
                "cn_name": strip(
                    xp_box.xpath(".//a[@class='name']//span[1]/text()").get()),
                'prd_url': rel_url and urljoin(response.url, rel_url),
                'img_url': img_url and urljoin(response.url, img_url),
            }
            data_jkid = xp_box.xpath(".//div[@data-jkid]/@data-jkid").get()
            data_cid = xp_box.xpath(".//div[@data-cid]/@data-cid").get()

            yield Request(self.prd_size_url.format(value=data_jkid,
                                                   cid=data_cid,
                                                   ts=int(time())),
                          body=u"",
                          meta={"prd_data": d},
                          callback=self.parse_package)

        next_page = response.xpath('//a[contains(text(), "下一页")]/@href').get()
        if next_page:
            yield Request(urljoin(response.url, next_page),
                          callback=self.parse_list)

コード例 #21

0

ファイルを表示

 def parse_detail(self, response):
     """Collect the product fields and request its goods data via POST.

     Nothing is yielded when the page has no ``nowproductid`` input value.
     The collected fields are forwarded in ``meta['product']``.
     """
     def text_of(xpath):
         return strip(response.xpath(xpath).get())

     product = dict(
         brand='海岸鸿蒙',
         parent=response.meta.get('parent'),
         cat_no=text_of('//span[contains(@class, "kj_customno")]/text()'),
         cas=text_of('//p/text()[contains(self::text(), "CAS")]/following-sibling::span/text()'),
         cn_name=text_of('//h4[@class="c red1"]/text()'),
         prd_url=response.url,
     )
     pd_id = response.xpath('//input[@id="nowproductid"]/@value').get()
     if pd_id:
         payload = json.dumps({'pd_id': pd_id})
         yield Request(
             'http://www.bjhongmeng.com/ajaxpro/Web960.Web.index,Web960.Web.ashx',
             method='POST',
             body=payload,
             headers={'X-AjaxPro-Method': 'LoadGoods'},
             callback=self.parse_price,
             meta={'product': product},
         )

コード例 #22

0

ファイルを表示

 def parse_detail(self, response):
     """Read the labelled cells of a detail page into one ``RawData`` item.

     ``en_name`` falls back to the "Chemical name" cell when the heading is
     empty, and ``info1`` falls back to it when there are no synonyms.
     """
     next_cell = '//td[contains(text(), {!r})]/following-sibling::td//text()'

     def cell(label):
         # First text node of the following cell, not stripped.
         return response.xpath(next_cell.format(label)).get()

     parent = response.meta.get('parent')
     heading = strip(response.xpath('//h1[@class]/text()').get())
     chemical_name = cell("Chemical name")
     cas_cell = response.xpath(
         '//h2[contains(text(), "CAS Number")]/../following-sibling::td//text()'
     ).get()
     yield RawData(
         brand='pharmaffiliates',
         parent=parent and parent.title(),
         cat_no=cell("Catalogue number"),
         en_name=heading or chemical_name,
         cas=strip(cas_cell),
         mf=''.join(response.xpath(next_cell.format("Molecular form")).getall()),
         mw=cell("Mol. Weight"),
         appearance=cell("Appearance"),
         info1=cell("Synonyms") or chemical_name,
         info2=strip(cell("Storage")),
         img_url=response.xpath('//img[@id="mainimg"]/@src').get(),
         prd_url=response.url,
     )

コード例 #23

0

ファイルを表示

ファイル: sddstore_spider.py プロジェクト: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Read a detail page into one ``RawData`` item.

        Labelled ``<div>`` pairs supply the stripped fields; the remaining
        values are the first match of positional ``<td>`` lookups and are
        stored as found.
        """
        next_div = '//div[contains(text(), {!r})]/following-sibling::div/text()'

        def labelled(label):
            return strip(response.xpath(next_div.format(label)).get())

        def first_of(xpath):
            return response.xpath(xpath).get()

        image_src = first_of('//img[@class="pic"]/@src')

        yield RawData(
            brand='sdd',
            cat_no=first_of('//tr/td[1]/text()'),
            en_name=first_of('//div[@class="row"]//dl/dd/text()'),
            chs_name=first_of('//div[@class="row"]//dl/dt/text()'),
            cas=labelled("CAS NO."),
            mf=labelled("分子式"),
            mw=labelled("分子量"),
            info1=labelled("英文异名"),
            info2=first_of(
                '//td[contains(text(), "存储条件")]/following-sibling::td[1]/text()'),
            info3=first_of('//tr/td[6]/text()'),
            info4=first_of('//tr/td[5]/text()'),
            stock_info=first_of('//tr/td[7]/text()'),
            appearance=first_of(
                '//td[contains(text(), "性状")]/following-sibling::td[1]/text()'),
            img_url=image_src and urljoin(self.base_url, image_src),
            prd_url=response.url,
        )

コード例 #24

0

ファイルを表示

ファイル: payne_spider.py プロジェクト: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Read the labelled table cells of a detail page into one ``RawData`` item."""
     next_cell = '//td[contains(text(), {!r})]/following-sibling::td//text()'

     def first_text(label):
         return strip(response.xpath(next_cell.format(label)).get())

     formula_parts = response.xpath(next_cell.format("分子式：")).getall()
     yield RawData(
         brand=self.brand,
         parent=response.meta.get('parent'),
         cat_no=first_text("产品编号："),
         en_name=strip(response.xpath('//div[@class="proinftit_t"]/text()').get()),
         cas=first_text("CAS号："),
         # The formula may span several text nodes, so all of them are joined.
         mf=strip(''.join(formula_parts)),
         mw=first_text("分子量："),
         info1=first_text("化学名："),
         img_url=response.xpath('//div[@class="proinfotableimg"]/img/@src').get(),
         prd_url=response.url,
     )

コード例 #25

0

ファイルを表示

 def detail_parse(self, response):
     """Read the labelled table cells of a detail page into one ``RawData`` item.

     ``img_url`` is only set when the page has an image ``src`` and the
     resolved URL does not end in ``Uploads/``.
     """
     tmp = '//td[contains(descendant-or-self::text(), "{}")]//following-sibling::td/text()'

     def field(label):
         return strip(response.xpath(tmp.format(label)).get())

     d = {
         "brand": "qcc",
         "parent": response.meta.get('parent'),
         # The catalogue number is stored as found, without strip().
         "cat_no": response.xpath(tmp.format("QCC Cat No.:")).get(),
         "cas": field("CAS No.:"),
         "en_name": field("Chemical Name:"),
         "info1": field("Synonyms:"),
         "mf": field("Molecular Formula:"),
         "mw": field("Molecular Weight:"),
         "prd_url": response.url,
     }
     rel_img = response.xpath(
         '//table//td/div[@style and not(div)]//img/@src').get()
     if rel_img:
         # Resolve only a real src: urljoin(base, None) returns the base URL,
         # which used to be stored as the image URL when no <img> was found.
         img_url = urljoin(self.base_url, rel_img)
         # NOTE(review): a URL ending in 'Uploads/' presumably has no file
         # name, i.e. no real image - confirm against the site.
         if not img_url.endswith('Uploads/'):
             d['img_url'] = img_url
     yield RawData(**d)

コード例 #26

0

ファイルを表示

    def parse_detail(self, response):
        """Emit the product record, then one package per non-bulk table row.

        Nothing is yielded when the first ``<tr id=...>`` row has no catalogue
        number in its second cell.
        """
        tmp = '//li[contains(text(), {!r})]/text()'

        def labelled_value(label):
            # Text of the <li> containing ``label`` with that label removed
            # from the front. str.lstrip(label) would strip any of the label's
            # *characters*, not the prefix, so slice the prefix off instead.
            text = response.xpath(tmp.format(label)).get('')
            if text.startswith(label):
                text = text[len(label):]
            return text or None

        img_rel = response.xpath('//td/img/@src').get()

        cat_no = response.xpath('//tr[@id][1]/td[2]/text()').get()
        if not cat_no:
            return
        d = {
            'brand': self.brand,
            'cat_no': cat_no,
            'parent': response.meta.get('parent'),
            'en_name': strip(response.xpath('//h2/text()[1]').get()),
            'chs_name': strip(response.xpath('//h2/text()[2]').get()),
            'cas': labelled_value('CAS号：'),
            'mf': labelled_value('分子式：'),
            'mw': labelled_value('分子量：'),
            'purity': labelled_value('韶远库存批次纯度：'),
            'info3': response.xpath('//tr[@id][1]/td[4]/text()').get(),
            'info4': response.xpath('//tr[@id][1]/td[5]/text()').get(),
            'stock_info': response.xpath('//tr[@id][1]/td[8]/text()').get(),
            'img_url': img_rel and urljoin(self.base_url, img_rel),
            'prd_url': response.url,
        }
        yield RawData(**d)

        for tr in response.xpath('//tr[@id]'):
            package = tr.xpath('./td[4]/text()').get()
            if package == 'bulk':
                # Rows whose package cell is exactly 'bulk' are not emitted.
                continue
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=package,
                price=tr.xpath('./td[5]/text()').get(),
                currency='RMB',
                delivery_time=tr.xpath('./td[8]/text()').get(),
            )

コード例 #27

0

ファイルを表示

ファイル: tci_spider.py プロジェクト: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Emit the product record, then one package per pricing-table row.

        A package gets ``delivery_time`` ``'现货'`` unless its stripped
        third-cell text is exactly ``'0'``.
        """
        by_class = '//span[@class={!r}]/text()'
        next_cell = '//td[contains(text(), {!r})]/following-sibling::td/text()'

        def span(css_class):
            return response.xpath(by_class.format(css_class)).get()

        def cell(label):
            return response.xpath(next_cell.format(label)).get()

        def joined(xpath, separator=''):
            return separator.join(response.xpath(xpath).getall())

        cat_no = span("code productVal")
        mw = strip(cell("分子式/分子量"))
        image_attr = response.xpath('//div[@data-attr]/@data-attr').get()
        yield RawData(
            brand=self.brand,
            parent=joined(
                '//div[@class="subCategory clearfix"][1]//span[@class="startPoint"]//a/text()', '_'),
            cat_no=cat_no,
            en_name=joined('//h1[@class="name"]//text()'),
            cas=span("cas productVal"),
            mf=joined('//span[@id="molecularFormula"]//text()').replace('_', ''),
            mw=mw and mw.replace('=', ''),
            purity=cell("纯度/分析方法"),
            appearance=cell("外观与形状"),
            info2=cell("储存温度"),
            mdl=cell("MDL编号"),
            img_url=image_attr and urljoin(self.base_url, image_attr),
            prd_url=response.url,
        )

        for tr in response.xpath('//table[@id="PricingTable"]/tbody/tr'):
            stock_num = strip(tr.xpath('./td[3]/text()').get())
            if stock_num != '0':
                delivery_time = '现货'
            else:
                delivery_time = None
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[1]/text()').get(),
                delivery_time=delivery_time,
                price=strip(tr.xpath('./td[2]/div/text()').get()),
                stock_num=stock_num,
                currency='RMB',
            )

コード例 #28

0

ファイルを表示

 def parse(self, response):
     """Dispatch every plain list link to the detail or the list callback.

     Links whose ``href`` starts with ``..`` go to ``parse_detail``; all other
     links go to ``parse_list``. The link text is forwarded as
     ``meta['parent']``. Anchors without an ``href`` are skipped.
     """
     a_nodes = response.xpath('//ul[not(@id) and not(@class)]/li/a')
     for a in a_nodes:
         rel_url = a.xpath('./@href').get()
         if not rel_url:
             # An anchor without href used to raise AttributeError on
             # ``None.startswith`` and abort the whole callback.
             continue
         parent = strip(a.xpath('./text()').get())
         callback = self.parse_detail if rel_url.startswith('..') else self.parse_list
         yield Request(urljoin(response.url, rel_url),
                       callback=callback,
                       meta={'parent': parent})

コード例 #29

0

ファイルを表示

 def parse_list(self, response):
     """Build a product dict for every ``<table>`` on the page.

     NOTE(review): within the visible snippet ``d`` is built but never
     yielded and ``tmp`` is never read - the snippet appears to be cut off
     after the dict; confirm against the full source file.
     """
     tables = response.xpath('//table')
     for table in tables:
         # Both lookups default to '' so the string methods below are safe.
         en_name = table.xpath('.//td[@class="info"]/h5[not(@class)]/strong//text()').get('')
         short_desc = table.xpath('normalize-space(.//td[@class="info"]/h5[@class="short_desc"]/strong//text())').get('')
         # Drop surrounding spaces and colons from the name.
         en_name = en_name.strip(' :')
         # Non-empty, whitespace-trimmed ';'-separated parts of the description.
         tmp = short_desc.split(';')
         tmp = map(str.strip, tmp)
         tmp = tuple(filter(bool, tmp))
         # First "digits-2 digits-digit" token in the description.
         m_cas = re.search(r'\d+-\d{2}-\d', short_desc)
         # Text between "Mol. Wt.: " and the next ';'.
         m_mw = re.search(r'Mol\. Wt\.: ([^;]+);', short_desc)
         # The ';'-delimited segment right after the "CAS : ..." segment.
         m_mf = re.search(r'CAS : [^;]+; ([^;]+)', short_desc)
         d = {
             'brand': 'srinidhiindsynth',
             'parent': response.meta.get('parent'),
             # The name is used as the catalogue number as well.
             'cat_no': en_name,
             'en_name': en_name,
             'cas': m_cas and m_cas.group(),
             'mf': m_mf and strip(m_mf.group(1)),
             'mw': m_mw and strip(m_mw.group(1)),
             'img_url': table.xpath('.//img/@src').get(),
             'prd_url': response.url,
         }

コード例 #30

0

ファイルを表示

 def parse_detail(self, response):
     """Read a detail page into one ``RawData`` item.

     Package, price and stock fields come from the first row of the
     ``table table-condensed`` table, and only when that row is not also the
     last one. ``t`` is a translation table defined outside this method.
     """
     def after_label(label_text):
         # Text node that follows the <label> with exactly this text.
         xpath = '//label[text()="{}"]/following-sibling::text()'.format(label_text)
         return strip(response.xpath(xpath).get())

     formula_parts = response.xpath(
         '//label[text()="Mol. Formula : "]/..//text()[not(parent::label)]').getall()
     mf = strip(''.join(formula_parts))
     first_row = response.xpath(
         '//div[not(@style)]/table[@class="table table-condensed"]/tbody/tr[position()=1 and position()!=last()]'
     )
     price = first_row.xpath('./td[2]/text()').get()
     cas = strip(response.xpath('//b[contains(text(), "CAS")]/../following-sibling::div/text()').get())
     yield RawData(
         brand='syninnova',
         parent=response.meta.get('category'),
         cat_no=response.xpath('//div[contains(@class, "productinfo")]/h2[1]/text()').get(),
         en_name=response.xpath('//div[contains(@class, "productinfo")]/h2[2]/text()').get(),
         cas=cas and cas.translate(t),
         mf=mf,
         mw=after_label("Mol. Weight : "),
         appearance=after_label("Appearance : "),
         info3=first_row.xpath('./td[1]/text()').get(),
         info4=price and f'USD {price}',
         stock_info=first_row.xpath('./td[4]/text()').get(),
         img_url=response.xpath('//div[@class="prodImage"]/img/@src').get(),
         prd_url=response.url,
     )