Example #1
0
    def parse_detail(self, response):
        """Scrape one product detail page.

        Yields a ``RawData`` item built from the attribute table, followed by
        one ``ProductPackage`` item per row of the ``table-1`` price table.
        """
        label_xpath = '//th[contains(text(), {!r})]/following-sibling::td/text()'

        def cell(label):
            # First text node of the <td> following the <th> whose text
            # contains ``label``; None when nothing matches.
            return response.xpath(label_xpath.format(label)).get()

        cat_no = response.xpath('//div/span[@style]/text()').get()
        image_src = response.xpath('//div[@class="pic"]/img/@src').get()

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//div/span/@data-nameen').get(),
            cas=cell("CAS:"),
            mdl=cell("MDL:"),
            mf=formula_trans(strip(cell("分子式:"))),
            mw=cell("分子量:"),
            smiles=cell("SMILES code:"),
            purity=cell("化学纯度:"),
            # A missing image stays falsy instead of being joined onto the page URL.
            img_url=urljoin(response.url, image_src) if image_src else image_src,
            prd_url=response.url,
        )

        # One package item per price-table row; all rows share the page's cat_no.
        for price_row in response.xpath('//div[@class="table-1"]//tbody/tr'):
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=price_row.xpath('./td[1]/text()').get(),
                price=strip(price_row.xpath('./td[2]/text()').get()),
                stock_num=price_row.xpath('./td[5]/text()').get(),
                currency='RMB',
            )
Example #2
0
 def detail_parse(self, response):
     """Yield a single ``RawData`` item for a "hic" product detail page."""
     label_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

     def cell(label):
         # First text node of the <td> that follows the <td> whose text
         # contains ``label``; None when nothing matches.
         return response.xpath(label_xpath.format(label)).get()

     title_xpath = '//h1[@class="header-post-title-class"]/text()'
     stock_xpath = '//div[@class="InventoryStatus"]/strong/text()'

     yield RawData(
         brand="hic",
         parent=response.meta.get("parent"),
         cat_no=response.xpath(title_xpath).get(),
         cas=cell("CAS:"),
         en_name=cell("Chemical name:"),
         info1=cell("Synonyms:"),
         img_url=response.xpath('//div[@class="images"]//img/@src').get(),
         mf=formula_trans(cell("Molecular form:")),
         mw=cell("Mol. Weight:"),
         prd_url=response.url,
         stock_info=response.xpath(stock_xpath).get(),
     )
Example #3
0
 def parse_detail(self, response):
     """Yield a single ``RawData`` item for a "bachem" product detail page.

     ``en_name`` falls back to the sequence text when the heading yields
     nothing, and ``info1`` joins whichever of synonyms / sequence are
     non-empty with ``;``.
     """
     label_xpath = '//span[text()={!r}]/following-sibling::span/text()'
     # Heading text nodes, excluding those directly inside a <span> or inside
     # an <a> that is itself a child of a <span>.
     heading_xpath = '//h2/div[@class="std"]//text()[not(parent::span) and not(parent::a/parent::span)]'

     def labelled(label):
         # Text of the <span> following the <span> whose text equals ``label``.
         return response.xpath(label_xpath.format(label)).get()

     def first(xpath):
         return response.xpath(xpath).get()

     sequence_parts = response.xpath(
         '//div[@class="std"]/span[@class]//text()').getall()
     sequence = strip(''.join(sequence_parts))
     heading = strip(''.join(response.xpath(heading_xpath).getall())) or None
     synonyms = strip(labelled("Synonyms"))

     yield RawData(
         brand='bachem',
         parent=strip(first(
             '//li[@class="product"]/preceding-sibling::li[1]/a/text()')),
         cat_no=strip(first('//div[@id="productname"]/text()')),
         en_name=heading or sequence,
         cas=labelled("CAS Registry Number"),
         mf=formula_trans(labelled("Molecular Formula")),
         mw=labelled("Relative Molecular Mass"),
         # Only the truthy values take part in the join.
         info1=';'.join(part for part in (synonyms, sequence) if part),
         info2=labelled("Storage Conditions"),
         info3=first('//td[@class="masWeight"]/text()'),
         info4=strip(first(
             '//div[contains(@class, "formatted-price")]/text()')),
         stock_info=first(
             '//p[contains(@class, "availability")]/span/text()'),
         img_url=first('//img[@class="zoom-image"]/@src'),
         prd_url=response.url,
     )
Example #4
0
 def detail_parse(self, response):
     """Yield a single ``RawData`` item for a "cil" product detail page.

     Values are read from the two-column detail table: the label sits in a
     ``td.dleft`` cell and the value in the following ``td``.

     ``img_url`` is the absolute image URL, or the falsy value returned by
     the selector (``None``) when the page has no image.
     """
     tmp = '//td[@class="dleft" and contains(./p/text(), "{}")]/following-sibling::td/p/text()'
     cas = response.xpath(tmp.format("Labeled CAS#")).get()
     unlabeled_cas = response.xpath(tmp.format("Unlabeled CAS#")).get()
     r_img_url = response.xpath('//div[@class="image-section"]/p//img/@src').get()
     d = {
         "brand": "cil",
         "parent": response.meta.get("parent"),
         "cat_no": response.xpath(tmp.format("Item Number")).get(),
         # NOTE(review): a missing CAS value is rendered as the literal text
         # "None" here; left unchanged because the expected format for
         # partially missing values is not visible from this method.
         "cas": f"{cas}; Unlabeled Cas:{unlabeled_cas}",
         "en_name": response.xpath('//h1[@class="ldescription"]/text()').get(),
         # urljoin(base, None) returns the base, so an unguarded join would
         # report the product page itself as the image URL.
         "img_url": r_img_url and urljoin(response.url, r_img_url),
         "mf": formula_trans(response.xpath(tmp.format("Chemical Formula")).get()),
         "mw": response.xpath(tmp.format("Molecular Weight")).get(),
         "prd_url": response.url,
     }
     yield RawData(**d)
Example #5
0
 def parse_detail(self, response):
     """Yield a single ``RawData`` item for a "vivan" product detail page."""
     label_xpath = '//h4[contains(text(), {!r})]/following-sibling::p/text()'

     def labelled(label):
         # Text of the <p> following the <h4> whose text contains ``label``.
         return response.xpath(label_xpath.format(label)).get()

     image_src = response.xpath(
         '//div[contains(@class, "product-detail-image")]/figure/img/@src').get()
     raw_synonyms = response.xpath(
         '//ul[contains(@class, "synmlist")]/li/text()').getall()
     # Trim every entry and drop the ones that end up empty.
     synonyms = [text for text in (raw.strip() for raw in raw_synonyms) if text]

     yield RawData(
         brand='vivan',
         cat_no=labelled('Catalogue No.:'),
         en_name=response.xpath('//div[@class="product-detail"]//h2/text()').get(),
         cas=labelled('CAS No. :'),
         mf=formula_trans(labelled('Mol. Formula :')),
         mw=labelled('Mol. Weight :'),
         # A missing image stays falsy instead of being joined onto base_url.
         img_url=urljoin(self.base_url, image_src) if image_src else image_src,
         info1=';'.join(synonyms),
         prd_url=response.url,
     )
Example #6
0
 def detail_parse(self, response):
     """Yield a single ``RawData`` item for a "simson" product detail page.

     Labelled fields are looked up through ``self.extract_value(response,
     label)``. ``img_url`` is resolved against ``self.base_url``; it keeps
     the selector's falsy result (``None``) when the page has no image.

     A warning is logged when either ``en_name`` or ``cat_no`` is ``None``;
     the item is yielded regardless.
     """
     rel_img = response.xpath('//div[@class="product-img"]//img/@src').get()
     d = {
         "brand": "simson",
         "en_name": response.xpath('//h1[contains(@class, "pro-title")]/text()').get(),
         "prd_url": response.url,
         "info1": self.extract_value(response, "Chemical Name"),
         "cat_no": self.extract_value(response, "Cat. No."),
         "cas": self.extract_value(response, "CAS. No."),
         "mf": formula_trans(self.extract_value(response, "Molecular Formula")),
         "mw": self.extract_value(response, "Formula Weight"),
         # `and`, not `or`: with `or` a found image was returned without
         # being joined, and a missing one turned into base_url because
         # urljoin(base, None) returns the base.
         "img_url": rel_img and urljoin(self.base_url, rel_img),
         "info4": self.extract_value(response, "Category"),
         "stock_info": self.extract_value(response, "Product Stock Status"),
     }
     # TODO should have a middleware to warn this
     if d.get('en_name') is None or d.get('cat_no') is None:
         # Logger.warn is a deprecated alias of Logger.warning.
         self.logger.warning(f'Get data loss from {response.url!r}')
     yield RawData(**d)
Example #7
0
 def detail_parse(self, response):
     """Yield a single ``RawData`` item for an "anant" product detail page.

     Labelled values are read relative to the ``heading`` <div> elements: the
     label is an <h6>, the value an <h5> inside the following sibling of the
     <h6>'s parent. Missing labelled values come out as empty strings.
     """
     heading = response.xpath('//div[contains(@class,"heading")]')
     # Collects every text node under the <h5>, nested elements included.
     deep_text_xpath = './h6[contains(text(),"{0}")]/parent::*/following-sibling::*/h5/descendant-or-self::*/text()'
     # Collects only the <h5>'s own text nodes.
     own_text_xpath = './h6[contains(text(),"{0}")]/parent::*/following-sibling::*/h5/text()'

     def labelled(label):
         return heading.xpath(own_text_xpath.format(label)).get("").strip()

     formula_parts = heading.xpath(
         deep_text_xpath.format("Molecular Formula")).getall()
     formula_text = ''.join(formula_parts).strip()

     yield RawData(
         brand="anant",
         en_name=response.xpath(
             'normalize-space(//div[contains(@class,"prod-details")]//h1/text())'
         ).get(),
         prd_url=response.request.url,  # product detail link
         cat_no=response.xpath('//h5[@class="prod-cat"]/text()').get("").strip(),
         cas=labelled("CAS"),
         stock_info=labelled("Stock Status"),
         mf=formula_trans(formula_text),
         mw=labelled("Molecular Weight"),
         info1=response.xpath(
             '//b[contains(text(),"Synonyms : ")]/following-sibling::text()'
         ).get("").strip(),
         parent=response.meta.get('parent'),
         img_url=response.xpath(
             '//div[contains(@class,"entry-thumb")]/a/img/@src').get(),
     )
Example #8
0
    def detail_parse(self, response):
        """Yield a single ``RawData`` item for a "clearsynth" product detail page.

        ``parent`` falls back to the "Category" cell when the "Parent API"
        cell yields a falsy value.
        """
        cell_xpath = 'normalize-space(//td[contains(text(),{!r})]/following-sibling::td//text())'
        strong_xpath = '//strong[contains(text(), {!r})]/../following-sibling::td/text()'

        def cell(label):
            # Whitespace-normalised text next to the <td> containing ``label``.
            return response.xpath(cell_xpath.format(label)).get()

        def strong_cell(label):
            # Stripped text of the <td> following the parent of the <strong>
            # element containing ``label``.
            return strip(response.xpath(strong_xpath.format(label)).get())

        page_url = response.request.url
        image_src = response.xpath('//div[@class="product-media"]//img/@src').get()
        name_parts = response.xpath('//div[@class="product-name"]//text()').getall()
        # The formula may be split over several nodes, so every text node is joined.
        formula_parts = response.xpath(
            "//td[contains(text(),'Mol. Formula')]/following-sibling::td//text()").extract()

        yield RawData(
            brand="clearsynth",
            parent=cell("Parent API") or cell("Category"),
            cat_no=cell("CAT No."),
            en_name=''.join(name_parts),
            cas=cell("CAS"),
            mf=formula_trans(strip("".join(formula_parts))),
            mw=cell("Mol. Weight"),
            # A missing image stays falsy instead of being joined onto the page URL.
            img_url=urljoin(page_url, image_src) if image_src else image_src,
            info1=strong_cell('Synonyms'),
            info2=strong_cell("Storage Conditions"),
            smiles=strong_cell("Smiles"),
            prd_url=page_url,
            stock_info=strong_cell("Stock Status"),
        )
Example #9
0
 def parse_detail(self, response):
     """Scrape one product detail page.

     Yields a ``RawData`` item first. When a catalogue number was found, it
     then yields one ``ProductPackage`` item for every matching price row;
     without a catalogue number no package items are produced.
     """
     spec_xpath = '//th[contains(text(), {!r})]/following-sibling::td//p//text()'
     # Price rows whose pro_price_3 cell contains a <span> with no class attribute.
     row_xpath = '//tr[td and td[@class="pro_price_3"]/span[not(@class)]]'
     size_cell_xpath = row_xpath + '/td[@class="pro_price_1"]'

     def spec(label):
         # Text nodes of the <p> elements in the <td> following the <th>
         # whose text contains ``label``.
         return response.xpath(spec_xpath.format(label))

     def nbsp_to_space(text):
         # Falsy input (None / '') is passed through untouched.
         return text.replace('\xa0', ' ') if text else text

     image_src = response.xpath(
         '//div[@class="struct-img-wrapper"]/img/@src').get()

     # Drop the English and Chinese "catalogue number" prefixes.
     cat_no = response.xpath('//dt/span/text()').get('')
     for prefix in ('Cat. No.: ', '目录号: '):
         cat_no = cat_no.replace(prefix, '')

     first_size = strip(
         response.xpath(f'normalize-space({size_cell_xpath}/text())').get())

     yield RawData(
         brand=self.brand,
         parent=response.meta.get('parent'),
         cat_no=cat_no,
         en_name=response.xpath('//h1/strong/text()').get(),
         cas=strip(spec("CAS No.").get()),
         mf=formula_trans(strip(spec("Formula").get())),
         mw=strip(spec("Molecular Weight").get()),
         smiles=strip(''.join(spec("SMILES").getall())),
         info3=nbsp_to_space(first_size),
         info4=strip(
             response.xpath(
                 f'{size_cell_xpath}/following-sibling::td[1]/text()').get()),
         # A missing image stays falsy instead of being joined onto the page URL.
         img_url=urljoin(response.url, image_src) if image_src else image_src,
         prd_url=response.url,
     )

     if not cat_no:
         return

     for price_row in response.xpath(row_xpath):
         price = strip(
             price_row.xpath('./td[@class="pro_price_2"]/text()').get())
         size = strip(
             price_row.xpath(
                 'normalize-space(./td[@class="pro_price_1"]/text())').get())
         lead_time_parts = price_row.xpath(
             './td[@class="pro_price_3"]/span//text()').getall()
         yield ProductPackage(
             brand=self.brand,
             cat_no=cat_no,
             package=nbsp_to_space(size),
             # Trim the currency sign from both ends of the price text.
             price=price.strip('¥') if price else price,
             delivery_time=strip(''.join(lead_time_parts)) or None,
             currency='RMB',
         )