Esempi in Python per RawData, esempi in Python per product_spider.items.RawData

Esempio n. 1

0

Mostra file

    def parse_detail(self, response):
        """Scrape a product detail page.

        Yields one ``RawData`` built from the labelled spec table, then one
        ``ProductPackage`` for every row of the ``table-1`` price table.
        """
        spec_xpath = '//th[contains(text(), {!r})]/following-sibling::td/text()'

        def spec(label):
            # First text node of the <td> that follows the <th> containing `label`.
            return response.xpath(spec_xpath.format(label)).get()

        img_src = response.xpath('//div[@class="pic"]/img/@src').get()
        cat_no = response.xpath('//div/span[@style]/text()').get()

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//div/span/@data-nameen').get(),
            cas=spec("CAS:"),
            mdl=spec("MDL:"),
            mf=formula_trans(strip(spec("分子式:"))),
            mw=spec("分子量:"),
            smiles=spec("SMILES code:"),
            purity=spec("化学纯度:"),
            # A missing image stays falsy instead of being resolved to a URL.
            img_url=img_src and urljoin(response.url, img_src),
            prd_url=response.url,
        )

        for tr in response.xpath('//div[@class="table-1"]//tbody/tr'):
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[1]/text()').get(),
                price=strip(tr.xpath('./td[2]/text()').get()),
                stock_num=tr.xpath('./td[5]/text()').get(),
                currency='RMB',
            )

Esempio n. 2

0

Mostra file

File: paitai_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build and yield the ``RawData`` record for a product detail page.

     Values are looked up by the label held in a ``<td><span>`` cell; the
     product-name cell may hold the name in separate ``<p>`` paragraphs.
     """
     span_xpath = '//td[contains(./span/text(), {!r})]/following-sibling::td//span//text()'
     para_xpath = '//td[contains(./span/text(), {!r})]/following-sibling::td/p[{}]/span/text()'

     def first_text(label):
         # First text node found next to `label`, stripped.
         return strip(response.xpath(span_xpath.format(label)).get())

     def joined_text(label):
         # All text nodes found next to `label`, concatenated then stripped.
         return strip(''.join(response.xpath(span_xpath.format(label)).getall()))

     def paragraph(label, index):
         # Text of the `index`-th <p> in the cell next to `label`, stripped.
         return strip(response.xpath(para_xpath.format(label, index)).get())

     # Prefer the first paragraph of the name cell; fall back to the plain
     # span text when that yields nothing.
     en_name = paragraph("Product Name", 1)
     if not en_name:
         en_name = first_text("Product Name")

     yield RawData(
         brand=self.brand,
         cat_no=en_name,  # the product name is also used as the catalogue number
         en_name=en_name,
         chs_name=paragraph("Product Name", 2),
         cas=first_text("Cas No."),
         info1=first_text("Sequence"),
         mf=joined_text("Molecular Formula"),
         mw=first_text("Molar Mass"),
         purity=joined_text("Purity"),
         info2=first_text("Storage Temperature"),
         img_url=response.xpath('//div[contains(@class, "slick-slide")][1]/a/img/@src').get(),
         prd_url=response.url,
     )

Esempio n. 3

0

Mostra file

File: tlc_spider.py Progetto: Pandaaaa906/product_spider

 def detail_parse(self, response):
     """Build and yield the ``RawData`` record for a tlc product page.

     Labelled values are delegated to ``self.extract_value``; the image,
     parent title and stock status are read from the page directly.
     """
     def labelled(label):
         return self.extract_value(response, label)

     picture = response.xpath('//section[@class="page"][1]//img/@src').extract_first()

     yield RawData(
         en_name=labelled("Compound Name:"),
         cat_no=labelled("Catalogue Number:"),
         # Only resolve the image link when one was found.
         img_url=picture and urljoin(self.base_url, picture),
         info1=labelled("Synonyms:"),
         cas=labelled("CAS#:"),
         mw=labelled("Molecular Weight:"),
         mf=labelled("Molecular Formula:"),
         # Title-cased heading; an empty heading is stored as None.
         parent=(
             response.xpath('//section[@class="page"][1]//h3[@class="title--product"]/text()')
             .extract_first(default="")
             .title()
             or None
         ),
         brand='tlc',
         prd_url=response.request.url,
         # Trimmed, title-cased status; an empty status is stored as None.
         stock_info=(
             response.xpath('//span[@class="status"]/text()')
             .extract_first("")
             .strip()
             .title()
             or None
         ),
     )

Esempio n. 4

0

Mostra file

File: eco_spider.py Progetto: Pandaaaa906/product_spider

 def detail_parse(self, response):
     """Build and yield the ``RawData`` record for an eco product page.

     Labelled values are read from the ``<font>`` element that follows the
     ``<span>`` containing the label; the category is taken from
     ``response.meta['parent']``.
     """
     tmp = '//span[contains(text(),"{}")]/following-sibling::font/text()'
     # urljoin(base, None) returns the base URL unchanged, so a product page
     # without a picture used to be stored with the site root as its image.
     # Resolve the link only when an image source was actually found.
     rel_img = response.xpath('//div[@class="p_viewimg pcshow"]//img/@src').get()
     d = {
         "brand": "eco",
         "parent": response.meta.get("parent"),
         "cat_no": response.xpath(tmp.format("Catalogue number")).get(),
         "cas": response.xpath(tmp.format("CAS Number")).get(),
         "en_name": response.xpath('//div[@class="p_vtitle"]/text()').get(),
         "img_url": rel_img and urljoin(self.base_url, rel_img),
         "mf": response.xpath(tmp.format("Molecular Formula")).get(),
         "mw": response.xpath(tmp.format("Molecular Weight")).get(),
         "prd_url": response.url,
     }
     yield RawData(**d)

Esempio n. 5

0

Mostra file

File: medicalisotopes_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build and yield the ``RawData`` record for a medicalisotopes page.

     Labelled values come from the ``<td>`` following the label cell; the
     package and price texts come from the nested table's first two cells.
     """
     cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td//text()'

     def cell(label):
         # First text node next to `label`, stripped.
         return strip(response.xpath(cell_xpath.format(label)).get())

     pack = strip(response.xpath('normalize-space(//td/table//td[1]/text())').get())
     if pack:
         # Drop trailing non-breaking spaces and '=' characters.
         pack = pack.rstrip('\xa0=')

     yield RawData(
         brand='medicalisotopes',
         parent=response.meta.get('parent'),
         cat_no=cell("Catalog Number:"),
         en_name=strip(
             response.xpath('//th[contains(text(), "Product:")]/following-sibling::th/text()').get()
         ),
         cas=cell("CAS Number:"),
         mf=strip(''.join(response.xpath(cell_xpath.format("Formula:")).getall())),
         mw=cell("Molecular Weight:"),
         info3=pack,
         info4=strip(response.xpath('//td/table//td[2]/text()').get()),
         prd_url=response.url,
     )

Esempio n. 6

0

Mostra file

File: chemimpex.py Progetto: Pandaaaa906/product_spider

 def parse_table(self, response):
     """Yield a ``RawData`` combining the carried product info with SKU data.

     The product fields arrive in ``response.meta['prd_info']`` (empty dict
     when absent); size, price and stock status are read from this response.
     """
     def text_of(xpath):
         return strip(response.xpath(xpath).get())

     sku_fields = {
         'info3': text_of('//td[@class="skusize"]/text()'),
         'info4': text_of('//span[@class="price"]/text()'),
         'stock_info': text_of('//span[contains(@class, "stockstatus")]/text()'),
     }
     carried = response.meta.get('prd_info', {})
     yield RawData(**carried, **sku_fields)

Esempio n. 7

0

Mostra file

File: alta_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Scrape an alta product page.

     Yields one ``RawData`` (fields equal to the literal ``'NA'`` are stored
     as ``None``), then one ``ProductPackage`` per qualifying row of the
     ``c_p_size`` table.
     """
     cell_xpath = 'normalize-space(//td[contains(div/text(), {!r})]/following-sibling::td/text())'

     def cell(label):
         # Whitespace-normalised text of the cell next to `label`, stripped.
         return strip(response.xpath(cell_xpath.format(label)).get())

     img_src = response.xpath('//div[@class="c_c_p"]//div/img/@src').get()
     cat_no = cell("产品号/Catalog#")
     fields = {
         'brand': self.brand,
         'parent': response.meta.get('parent'),
         'cat_no': cat_no,
         'en_name': cell("Product Name："),
         'chs_name': cell("产品名称："),
         'cas': cell("CAS#："),
         'mf': cell("分子式/Formula："),
         'mw': cell("分子量/MW："),
         'purity': cell("纯度/Purity (%)："),
         'info1': cell("Synonyms："),
         'info2': cell("储藏条件/Storage："),
         'appearance': cell("颜色/Color："),
         'img_url': img_src and urljoin(response.url, img_src),
         'prd_url': response.url,
     }
     # The site prints 'NA' for missing values; store those as None.
     cleaned = {key: (None if value == 'NA' else value) for key, value in fields.items()}
     yield RawData(**cleaned)

     size_rows = response.xpath('//table[@class="c_p_size"]//tr[td and td/text()!="NA"]')
     for tr in size_rows:
         yield ProductPackage(
             brand=self.brand,
             cat_no=cat_no,
             package=tr.xpath('./td[1]/text()').get(),
             # NOTE(review): price reads the same cell as package (td[1]);
             # this looks like a copy/paste slip - confirm the intended column.
             price=tr.xpath('./td[1]/text()').get(),
         )

Esempio n. 8

0

Mostra file

File: prd_spider.py Progetto: Pandaaaa906/product_spider

 def parse(self, response):
     """Yield a ``RawData`` per listing row, then request the next page.

     Each ``<tr>`` of the ``list_tab003`` table becomes one record. The
     pager ``<script>`` is expected to contain "current page, page size,
     total items"; while more items remain, the ``formList`` form is
     re-submitted for the following page with this method as callback.
     """
     for row in response.xpath("//table[@class='list_tab003']//tr"):
         d = {
             'cat_no': row.xpath(".//td[1]/input/@value").get(default=""),
             'chs_name': row.xpath(".//td[2]/input/@value").get(default=""),
             'info2': row.xpath(".//td[3]/input/@value").get(default=""),
             'info1': row.xpath(".//td[4]/input/@value").get(default=""),  # specification
             'info3': row.xpath(".//td[5]/input/@value").get(default=""),  # batch number
             'info4': row.xpath(".//td[6]/input/@value").get(default=""),  # storage conditions
             'stock_info': row.xpath(".//td[1]/font/text()").get(default=""),
         }
         yield RawData(**d)

     # .re() returns the captured groups of every match in one flat list;
     # keep only the first triple so an extra match cannot break the
     # three-way unpack below.
     pager_script = response.xpath("//div[@class='page']/script/text()").re(r"(\d+),(\d+),(\d+)")[:3]
     if len(pager_script) != 3:
         return
     cur_page, page_size, total_items = map(int, pager_script)
     if page_size * cur_page >= total_items:
         # Last page reached.
         return
     data = [
         ('sgoodsno', ''),
         ('sgoodsname', ''),
         ('curPage', str(cur_page + 1)),
         ('pageSize', pager_script[1]),
         ('toPage', pager_script[0]),
     ]
     # dont_filter: every page is a POST to the same URL.
     yield FormRequest.from_response(response, callback=self.parse, method="POST", formname="formList",
                                     formdata=data, dont_filter=True, errback=self.err_parse)

Esempio n. 9

0

Mostra file

File: chiron_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build and yield the ``RawData`` record for a chiron product page.

     Pack size and specification are combined into ``info3``; currency and
     price are combined into ``info4`` when a price is present.
     """
     cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

     def cell(label):
         # First text node of the cell following the one containing `label`.
         return response.xpath(cell_xpath.format(label)).get()

     currency = cell("Currency")
     price = cell("excl. VAT")
     specification = cell("Specification")
     package = cell("Pack size")

     yield RawData(
         brand='chiron',
         parent=response.meta.get('parent'),
         cat_no=cell("Product no."),
         en_name=response.xpath('//h2/text()').get(),
         cas=cell("CAS Nr."),
         info3=f'{package}; {specification or "N/A"}',
         # Without a price the falsy price value itself is stored.
         info4=price and f'{currency} {price}',
         stock_info=response.xpath(
             '//span[contains(text(), "Stock status")]/parent::div/parent::td/following-sibling::td/span/text()'
         ).get(),
         prd_url=response.url,
     )

Esempio n. 10

0

Mostra file

File: cpachem.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build and yield the ``RawData`` record for a cpachem product page.

     Labelled values are the text nodes that follow a ``<b>`` label; the
     category comes from the link after the "Category:" label.
     """
     tmp = '//b[text()={!r}]/following-sibling::text()'
     catagory = strip(
         response.xpath(
             '//b[text()="Category:"]/following-sibling::a/text()').get())

     price = strip(
         response.xpath('//h3[contains(text(), "Price:")]/text()').get())
     # The previous `.lstrip('Price: ')` had two problems: it was called even
     # when no price heading exists (other callbacks guard the result of
     # strip() before calling string methods on it), and lstrip() removes a
     # *set of characters*, so a value beginning with any of "Price: " lost
     # leading letters (e.g. "Price: call" became "all"). Remove the literal
     # prefix instead.
     if price and price.startswith('Price:'):
         price = price[len('Price:'):].lstrip()

     d = {
         'brand': 'cpachem',
         'parent': catagory,
         'cat_no': strip(response.xpath(tmp.format("Ref Num:")).get()),
         'en_name': strip(response.xpath(tmp.format("Full Name:")).get()),
         'info2': strip(
             response.xpath(tmp.format("Shelf Life on Ship Date:")).get()),
         'info3': strip(response.xpath(tmp.format("Vol.:")).get()),
         'info4': price,
         'stock_info': strip(
             response.xpath(
                 '//p[@style="padding:15px 0px 5px 0px;"]/text()').get()),
         'prd_url': response.url,
     }
     yield RawData(**d)

Esempio n. 11

0

Mostra file

File: lgc_spider.py Progetto: Pandaaaa906/product_spider

 def detail_parse(self, response):
     """Build and yield the ``RawData`` record for an lgc product page.

     Pages whose product code does not start with "MM" (case-insensitive)
     produce nothing. The parent is the "API Family" text, falling back to
     the second-to-last breadcrumb entry.
     """
     item_xpath = '//div[contains(@class,"product__item")]/h2[text()={!r}]/following-sibling::*/descendant-or-self::text()'

     def item(label):
         # Selector for the text nodes that follow the <h2> titled `label`.
         return response.xpath(item_xpath.format(label))

     cat_no = item("Product Code").get('')
     if not cat_no.upper().startswith('MM'):
         return

     api_family = "".join(
         response.xpath(
             '//div[contains(@class,"product page-section")]//div[contains(@class,"product__item")]/h2[contains(text(),"API Family")]/following-sibling::*/descendant-or-self::text()'
         ).extract()
     )
     breadcrumb = response.xpath(
         '//ul[contains(@class,"breadcrumb")]/li[position()=last()-1]/a/text()'
     ).get(default="").strip()

     yield RawData(
         brand='lgc',
         parent=api_family or breadcrumb,
         cat_no=cat_no,
         en_name=response.xpath('//h1[@class="product__title"]/text()').get(default="").strip(),
         cas=item("CAS Number").get(default="").strip() or None,
         mf=item("Molecular Formula").get("").replace(" ", "") or None,
         mw=item("Molecular Weight").get(),
         stock_info=response.xpath(
             '//h4[contains(@class,"orderbar__stock-title")]/descendant-or-self::text()'
         ).get("").strip() or None,
         img_url=response.xpath('//div[contains(@class, "product__brand-img")]/img/@src').get(),
         info1=item("IUPAC").get(default="").strip(),
         info3=response.xpath('//span[text()="Pack Size:"]/following-sibling::p/text()').get(),
         prd_url=response.request.url,
     )

Esempio n. 12

0

Mostra file

File: acc_spider.py Progetto: Pandaaaa906/product_spider

 def detail_parse(self, response):
     """Build and yield the ``RawData`` record for an accustandard page.

     Attribute values are read from ``<div class="value">`` children of the
     div whose class contains the attribute name; all CAS cells are joined
     with ';'.
     """
     value_xpath = '//div[contains(@class, {!r})]/div[contains(@class, "value")]//text()'

     def attribute(css_name):
         # Selector for the value text nodes of the attribute block `css_name`.
         return response.xpath(value_xpath.format(css_name))

     cas_numbers = response.xpath('//td[contains(@class, "cas_number")]/text()').extract()

     yield RawData(
         brand="accustandard",
         parent=response.xpath('//li[position()=last()-1]/a/span[@itemprop]/text()').get(),
         cat_no=response.xpath('//div[@itemprop="sku"]/text()').get(),
         en_name=response.xpath('//h1[@class="page-title"]//text()').get(),
         cas=";".join(cas_numbers),
         mf="".join(attribute('molecular_formula').extract()) or None,
         mw=attribute('molecular_weight').get(),
         # stock_info (disabled): response.xpath('//meta[@itemprop="availability"]/@content').get()
         img_url=response.xpath('//img[@itemprop="image"]/@data-src').get(),
         info2=attribute('sales_unit_size').get(),
         info3=attribute('storage_condition').get(),
         # info4 (disabled): response.xpath('//span[@class="price"]/text()').get()
         prd_url=response.url,
     )

Esempio n. 13

0

Mostra file

File: leyan_spider.py Progetto: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Scrape a leyan product page.

        Yields one ``RawData`` for the product, then one ``ProductPackage``
        for every row after the first of the ``table-responsive`` table.
        """
        prop_xpath = '//div[contains(*/text(), {!r})]/following-sibling::div/*/text()'

        def prop(label):
            # First text node of the div that follows the one labelled `label`.
            return response.xpath(prop_xpath.format(label)).get()

        cat_no = response.xpath('//span[@id="catalogNo"]/text()').get()
        img_src = response.xpath('//input[@id="image"]/@value').get()
        crumbs = response.xpath('//li[@class="active"]/following-sibling::li/a/text()').getall()

        yield RawData(
            brand=self.brand,
            parent='_'.join(crumbs),
            cat_no=cat_no,
            en_name=response.xpath('//h2/span/text()').get(),
            purity=response.xpath('//span[@class="d-purity"]/text()').get(),
            cas=prop("CAS 号"),
            mf=prop("分子式"),
            mw=prop("分子量"),
            smiles=prop("Smiles Code"),
            info2=prop("存储条件"),
            mdl=prop("MDL 号"),
            img_url=img_src and urljoin(response.url, img_src),
            prd_url=response.url,
        )

        # position()!=1 skips the first row of the table.
        for tr in response.xpath('//div[@class="table-responsive"]//tr[position()!=1]'):
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=tr.xpath('./td[@id="packing"]/text()').get(),
                price=tr.xpath('./td[@id="money"]/text()').get(),
                currency='RMB',
                stock_num=tr.xpath('./td[@id="stock"]/text()').get(),
            )

Esempio n. 14

0

Mostra file

File: sigma_spider.py Progetto: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Build and yield the ``RawData`` record for a sigma product page.

        Header properties come from ``<p>label<span>value</span></p>``
        elements; the remaining properties come from linked text in the
        cell following the label cell (the related-category link excluded).
        """
        header_xpath = '//p[contains(text(), {!r})]/span//text()'
        table_xpath = '//td[contains(text(),{!r})]/following-sibling::td//text()[parent::a[not(@id="relatedCategoryLink")]]'

        def joined(xpath):
            # Concatenate every matching text node, then strip the result.
            return strip(''.join(response.xpath(xpath).getall()))

        def header(label):
            return joined(header_xpath.format(label))

        def table(label):
            # Empty results are stored as None.
            return joined(table_xpath.format(label)) or None

        brand = strip(response.xpath('//input[@id="brand"]/@value').get())
        cat_no = strip(response.xpath('//strong[@itemprop="productID"]/text()').get())
        img_src = response.xpath('//div[@class="productMedia"]//img/@src').get()

        yield RawData(
            # Lower-cased page brand, or the spider name when the page has none.
            brand=brand.lower() if brand else self.name,
            cat_no=cat_no,
            en_name=joined('//h1[@itemprop="name"]//text()'),
            cas=header("CAS"),
            mf=header("Formula"),
            mw=header("Molecular Weight"),
            mdl=header("MDL number"),
            parent=table("Related Categories"),
            grade=table("grade"),
            info5=table("product line"),
            info2=table("storage temp."),
            purity=table("assay"),
            smiles=table("SMILES string"),
            img_url=img_src and urljoin(response.url, img_src),
            prd_url=response.url,
        )

Esempio n. 15

0

Mostra file

File: bidepharm_spider.py Progetto: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Build and yield the ``RawData`` record for a bidepharm page.

        Nothing is yielded when the page carries no catalogue number.
        """
        cat_no = response.xpath('//input[@id="catNum"]/@value').get()
        if not cat_no:
            return

        cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td//text()'

        def cell(label):
            # Selector for the text nodes of the cell next to `label`.
            return response.xpath(cell_xpath.format(label))

        yield RawData(
            brand='毕得',
            parent=response.meta.get('parent'),
            cat_no=cat_no,
            en_name=response.xpath('//span[@class="sp_pro_name_en"]/text()').get(),
            chs_name=response.xpath('//span[@class="sp_pro_name_cn"]/text()').get(),
            cas=cell("CAS号：").get(),
            # All text nodes of the formula cell are concatenated.
            mf=''.join(cell("分子式：").getall()),
            mw=cell("分子量：").get(),
            purity=response.xpath('//span[@id="first_purity"]/text()').get(),
            info2=response.xpath('//span[contains(text(), "存储:")]/text()').get(),
            img_url=response.xpath(
                '//div[@class="products-big-img img-box position-R"]/img/@src'
            ).get(),
            prd_url=response.url,
        )

Esempio n. 16

0

Mostra file

File: prd_spider.py Progetto: Pandaaaa906/product_spider

    def detail_parse(self, response):
        """Build and yield the ``RawData`` record for a dre product page.

        The parent is the "API Family" text, falling back to the
        second-to-last breadcrumb entry; colour and appearance are combined
        into ``info4`` separated by a single space.
        """
        item_xpath = '//div[contains(@class,"product__item")]/h2[text()={!r}]/following-sibling::*/descendant-or-self::text()'

        def item(label):
            # Selector for the text nodes that follow the <h2> titled `label`.
            return response.xpath(item_xpath.format(label))

        api_family = "".join(
            response.xpath(
                '//div[contains(@class,"product page-section")]//div[contains(@class,"product__item")]/h2[contains(text(),"API Family")]/following-sibling::*/descendant-or-self::text()'
            ).extract()
        )
        breadcrumb = response.xpath(
            '//ul[contains(@class,"breadcrumb")]/li[position()=last()-1]/a/text()'
        ).get(default="").strip()

        color = response.xpath('//h2[text()="Color"]/following-sibling::p/text()').get("")
        appearance = response.xpath('//h2[text()="Appearance/Form"]/following-sibling::p/text()').get("")

        yield RawData(
            brand="dre",
            parent=api_family or breadcrumb,
            cat_no=item("Product Code").get(),
            en_name=response.xpath('//h1[@class="product__title"]/text()').get(default="").strip(),
            cas=item("CAS Number").get(default="").strip() or None,
            mf=item("Molecular Formula").get("").replace(" ", "") or None,
            mw=item("Molecular Weight").get(),
            stock_info=response.xpath(
                '//h4[contains(@class,"orderbar__stock-title")]/descendant-or-self::text()'
            ).get("").strip() or None,
            img_url=response.xpath('//div[contains(@class, "product__brand-img")]/img/@src').get(),
            info1=item("IUPAC").get(default="").strip(),
            info2=response.xpath('//h2[text()="Storage Temperature"]/following-sibling::p/text()').get(),
            info3=response.xpath('//h2[text()="Shipping Temperature"]/following-sibling::p/text()').get(),
            info4=f'{color} {appearance}',
            prd_url=response.request.url,
        )

Esempio n. 17

0

Mostra file

    def parse(self, response):
        """Parse the XML catalogue in the response body.

        For every ``<Reference>`` element, yields one ``RawData`` followed
        by one ``ProductPackage`` priced in EUR.
        """
        def text_of(node, xpath, default=None):
            # First result of `xpath` under `node`, or `default` when empty.
            return first(node.xpath(xpath), default)

        document = XML(response.body)
        for ref in document.xpath('//Reference'):
            cat_no = text_of(ref, './Order_Code/text()')
            yield RawData(
                brand=self.brand,
                cat_no=cat_no,
                cas=text_of(ref, './CAS_Registry_Number/text()'),
                en_name=text_of(ref, './Reference_Standard/text()'),
                info2=text_of(ref, './Storage/text()'),
                info3=text_of(ref, './Quantity_per_vial/text()'),
                info4=text_of(ref, './Price/text()'),
                # A missing order code leaves the View= parameter empty.
                prd_url=f"https://crs.edqm.eu/db/4DCGI/View={text_of(ref, './Order_Code/text()', '')}",
            )

            raw_price = text_of(ref, './Price/text()')
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                package=text_of(ref, './Quantity_per_vial/text()'),
                # Drop the euro sign; the currency is carried separately.
                price=raw_price and raw_price.replace('€', ''),
                currency='EUR',
            )

Esempio n. 18

0

Mostra file

File: tenovapharama_spider.py Progetto: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Scrape a tenovapharma product page.

        Yields one ``RawData`` for the product, then one ``ProductPackage``
        (priced in USD) per ``<option>`` of the variant selector whose text
        has the form "<package>-<price>".
        """
        parent = response.meta.get('parent')
        raw_sku = response.xpath("//span[@class='variant-sku']//text()").get()
        # re.findall raises TypeError on None; treat a missing SKU node like
        # an SKU text the pattern does not match (cat_no becomes None).
        cat_no = first(re.findall(r'SKU:(.+)-', raw_sku or ''), None)
        d = {
            "brand": self.name,
            "parent": parent,
            "en_name": response.xpath("//h1[@class='product-header']/text()").get(),
            "cat_no": cat_no,
            "prd_url": response.url,
            "mf": response.xpath('//td[contains(text(), "Molecular Formula:")]/following-sibling::td/text()').get(),
            "mw": response.xpath('//td[contains(text(), "Molecular Weight:")]/following-sibling::td/text()').get(),
            "cas": response.xpath('//td[contains(text(), "CAS Number:")]/following-sibling::td/text()').get(),
            "smiles": response.xpath('//td[contains(text(), "SMILES:")]/following-sibling::td/text()').get(),
            "purity": response.xpath('//td[contains(text(), "Purity (HPLC):")]/following-sibling::td/text()').get(),
            "info1": response.xpath('//td[contains(text(), "Synonyms:")]/following-sibling::td/text()').get(),
            "info2": response.xpath('//td[contains(text(), "Storage Conditions:")]/following-sibling::td/text()').get(),
            "img_url": (m := response.xpath('//noscript/img/@src').get()) and urljoin(response.url, m),
        }
        yield RawData(**d)

        rows = response.xpath('//select[@id="product-select-product-template"]/option/text()').getall()
        for row in rows:
            # Split on the last hyphen only: `package, price = row.split("-")`
            # raised ValueError (ending the generator, losing later options)
            # when the text had no hyphen or more than one.
            package, sep, price = row.rpartition("-")
            if not sep:
                # No "<package>-<price>" structure; nothing to record.
                continue
            dd = {
                "brand": self.name,
                "cat_no": cat_no,
                "package": package,
                "currency": "USD",
                "price": price.replace("$", ''),
            }
            yield ProductPackage(**dd)

Esempio n. 19

0

Mostra file

File: o2si_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build and yield the ``RawData`` record for an o2si product page.

     Catalogue number, name, parent and package are carried in
     ``response.meta``. ``cas`` is filled only when the component table
     lists exactly one non-empty CAS entry; ``info1`` holds every distinct
     entry joined with ';'.
     """
     raw_cas = response.xpath('//tr[@class="style17"]/td[3]/text()').getall()
     # Strip each cell and drop the ones that end up empty.
     cas = tuple(c for c in map(strip, raw_cas) if c)
     d = {
         'brand': self.brand,
         'parent': response.meta.get('parent'),
         'cat_no': response.meta.get('cat_no'),
         'en_name': response.meta.get('en_name'),
         'cas': first(cas, None) if len(cas) == 1 else None,
         # De-duplicate while keeping page order. Joining a set() produced
         # a different ordering from run to run (string hashing is
         # randomised per process), so the same page gave unstable info1.
         'info1': ';'.join(dict.fromkeys(cas)),
         'info3': response.meta.get('package'),
         'info4': response.xpath(
             '//p[contains(text(), "Price:")]/strong/text()').get(),
         'prd_url': response.url,
     }
     yield RawData(**d)

Esempio n. 20

0

Mostra file

File: chemimpex.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Scrape a chemimpex product page.

     When the page embeds a ``push({...});`` JSON payload, a follow-up
     ``Request`` for the SKU list is yielded with the scraped fields in
     ``meta['prd_info']`` (callback ``self.parse_table``); otherwise the
     ``RawData`` is yielded directly.
     """
     label_xpath = '//span[contains(text(), {!r})]/following-sibling::span//text()'

     def labelled(label):
         # Selector for the text nodes of the span following `label`.
         return response.xpath(label_xpath.format(label))

     product = {
         'brand': 'chemimpex',
         'parent': response.meta.get('parent'),
         'cat_no': labelled("Catalog Number:").get(),
         # Name text outside any nested <span>.
         'en_name': strip(''.join(response.xpath('//h1[@itemprop="name"]//text()[not(parent::span)]').getall())),
         'purity': strip(response.xpath('//h1[@itemprop="name"]/span[@style]/text()').get()),
         'mf': strip(''.join(labelled('Molecular Formula:').getall())),
         'mw': strip(labelled('Molecular Weight:').get()),
         'cas': strip(labelled('CAS No:').get()),
         'appearance': strip(labelled('Appearance:').get()),
         'info1': strip(';'.join(labelled('Synonyms:').getall())),
         'info2': strip(labelled('Storage Temp:').get()),
         'img_url': strip(response.xpath('//div[@id="catalog_content"]/img/@src').get()),
         'prd_url': response.url,
     }

     payload_match = re.search(r'push\(({.+\})\);', response.text)
     if payload_match is None:
         yield RawData(**product)
     else:
         payload = json.loads(payload_match.group(1))
         # param1..param6 fill the six path segments; missing ones are empty.
         segments = [payload.get(f'param{i}', '') for i in range(1, 7)]
         sku_url = 'https://www.chemimpex.com/Widgets-product/gethtml_skulist/{}/{}/{}/{}/{}/{}'.format(*segments)
         yield Request(sku_url, callback=self.parse_table, meta={'prd_info': product})

Esempio n. 21

0

Mostra file

File: std_spider.py Progetto: Pandaaaa906/product_spider

 def list_parse(self, response):
     """Yield one ``RawData`` per product ``<li>`` of the ``pro`` list.

     Labelled values are read from the element whose text contains the
     label, with the label itself removed from the value.
     """
     nodes = response.xpath('//ul[@class="pro"]/li')
     tmp = './/*[contains(text(),{!r})]/text()'
     for node in nodes:
         # urljoin(base, None) returns the base URL unchanged, so an entry
         # without a picture used to get the site root as its image URL.
         # Resolve the link only when an image source was actually found.
         rel_img = node.xpath('./span//img/@src').get()
         d = {
             "brand": "std",
             "parent": response.meta.get('parent'),
             "cat_no": node.xpath(tmp.format("STD No.")).get("").replace(
                 "STD No.", "").strip(),
             "cas": node.xpath(tmp.format("CAS No.")).get("").replace(
                 "CAS No.", "").strip(),
             "en_name": node.xpath('./h3//p/text()').get(),
             "img_url": rel_img and urljoin(self.base_url, rel_img),
             "mf": node.xpath(tmp.format("Chemical Formula")).get("").replace(
                 "Chemical Formula :", "").strip(),
             "prd_url": urljoin(self.base_url,
                                node.xpath('./a/@href').get('')),
         }
         yield RawData(**d)

Esempio n. 22

0

Mostra file

File: payne_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Build and yield the ``RawData`` record for a payne product page.

     Labelled values come from the text of the cell that follows the cell
     containing the label.
     """
     cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td//text()'

     def cell(label):
         # First text node next to `label`, stripped.
         return strip(response.xpath(cell_xpath.format(label)).get())

     yield RawData(
         brand=self.brand,
         parent=response.meta.get('parent'),
         cat_no=cell("产品编号："),
         en_name=strip(response.xpath('//div[@class="proinftit_t"]/text()').get()),
         cas=cell("CAS号："),
         # All text nodes of the formula cell are concatenated.
         mf=strip(''.join(response.xpath(cell_xpath.format("分子式：")).getall())),
         mw=cell("分子量："),
         info1=cell("化学名："),
         img_url=response.xpath('//div[@class="proinfotableimg"]/img/@src').get(),
         prd_url=response.url,
     )

Esempio n. 23

0

Mostra file

 def detail_parse(self, response):
     """Build and yield the ``RawData`` record for a synzeal product page.

     The product title has its line breaks removed and is trimmed; labelled
     values come from the cell following the cell containing the label.
     """
     cell_xpath = '//td[contains(text(),{!r})]/following-sibling::td/text()'

     def cell(label):
         # Selector for the text of the cell next to `label`.
         return response.xpath(cell_xpath.format(label))

     title = response.xpath('//h1[@class="product-detail-title"]/text()').get(default="")
     title = re.sub(r'\r?\n', "", title)

     yield RawData(
         brand="synzeal",
         en_name=title.strip(),
         prd_url=response.request.url,  # product detail link
         cat_no=cell('SZ CAT No').get(),
         cas=cell('CAS No').get(default=""),
         stock_info=cell('Inv. Status').get(),
         mf=cell('Mol.F.').get(),
         mw=cell('Mol.Wt.').get(),
         info1=response.xpath('//b[text()="Synonym: "]/../text()').get(default="").strip(),
         parent=response.meta.get('parent'),
         img_url=response.xpath('//div[@class="product-details-tab"]//img/@src').get(),
     )

Esempio n. 24

0

Mostra file

File: usp_spider.py Progetto: Pandaaaa906/product_spider

    def parse_detail(self, response):
        """Scrape a usp product page.

        Yields one ``RawData`` for the product, then one ``ProductPackage``
        whose price is the first number found in the "Retail Price:" cell
        (``None`` when the cell is empty or holds no number).
        """
        cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

        def cell(label):
            # First text node of the cell following the one containing `label`.
            return response.xpath(cell_xpath.format(label)).get()

        cat_no = cell('Catalog #')
        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            en_name=response.xpath('//td[@class="pageTitle"]/text()').get(),
            cas=cell('CAS#'),
            stock_info=cell('In Stock'),
            prd_url=response.url,
        )

        price_text = strip(
            response.xpath(
                'normalize-space(//td[contains(text(), "Retail Price:")]/following-sibling::td/text())'
            ).get()
        )
        price = None
        if price_text:
            # Collapse whitespace runs, then take the first integer or
            # decimal number that appears in the text.
            price_text = re.sub(r'\s+', ' ', price_text)
            numbers = (found.group(0) for found in re.finditer(r'(\d+(\.\d+)?)', price_text))
            price = first(numbers, None)

        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            price=price,
            currency='USD',
            info=price_text,
            delivery_time=cell('In Stock'),
        )

Esempio n. 25

0

Mostra file

File: prd_spider.py Progetto: Pandaaaa906/product_spider

    def cat_parse(self, response):
        """Yield one ``RawData`` item for each product entry on a catalog page.

        Every entry is expected to carry a block of text lines, read by
        position as purity, catalog number, CAS, stock and formula; only the
        part after the first colon of each line is kept. Entries without
        that block are skipped. The catalog passed through
        ``response.meta['catalog']`` is recorded as the item's parent.
        """
        parent = response.meta.get('catalog')
        for entry in response.xpath('//form/div[@class="row"]/div'):
            en_name = entry.xpath('./a/text()').get()
            prd_url = urljoin(self.base_url, entry.xpath('./a/@href').get())
            mol_text = entry.xpath('./div/div/object/param/@value').get()
            lines = entry.xpath(
                './div/div[contains(text(),"Purity")]/text()').getall()
            if not lines:
                # No "Purity" text block on this entry (original note:
                # Controlled Drugs) - nothing to extract.
                continue

            # The first five lines are positional; a shorter block raises
            # IndexError here.
            purity, cat_no, cas, stock, mf = (
                lines[idx].split(':', 1)[-1].strip() for idx in range(5)
            )

            if mol_text:
                # Turn literal backslash escapes in the attribute value
                # into the characters they denote.
                mol_text = mol_text.encode('u8').decode('unicode-escape')

            yield RawData(
                brand='dalton',
                en_name=en_name,
                prd_url=prd_url,  # product detail link
                mol_text=mol_text,
                purity=purity,
                cat_no=cat_no,
                cas=cas,
                stock_info=stock,
                mf=mf,
                parent=parent,
            )

Esempio n. 26

0

Mostra file

File: steraloids_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Parse a product detail page and yield a single ``RawData`` item.

     Labelled values are taken from the text of the parent element of the
     ``<strong>`` tag whose text contains the label. A CAS value of
     ``'No'`` is stored as ``None``; the formula has its spaces removed and
     becomes ``None`` when empty. The page title is used for both
     ``en_name`` and ``info1``.
     """
     label_xpath = '//strong[contains(text(), {!r})]/../text()'

     def labelled(label, *default):
         # First text node next to the matching <strong> label.
         return response.xpath(label_xpath.format(label)).get(*default)

     cas = labelled("CAS", '')
     if cas == 'No':
         cas = None
     formula = labelled("Formula", '').replace(' ', '') or None
     title = response.xpath('//h1[@class="product-title"]/text()').get()
     availability = response.xpath(
         '//meta[@property="product:availability"]/@content').get()

     item = dict(
         brand='steraloids',
         parent=None,
         cat_no=labelled("Catalogue ID"),
         en_name=title,
         cas=cas,
         mf=formula,
         mw=labelled("Molecular Weight"),
         stock_info=availability,
         img_url=response.meta.get('img_url'),
         info1=title,
         prd_url=response.url,
     )
     yield RawData(**item)

Esempio n. 27

0

Mostra file

File: anaxlab_spider.py Progetto: Pandaaaa906/product_spider

 def parse_detail(self, response):
     """Parse a product detail page and yield a single ``RawData`` item.

     Values sit in the ``<li>`` that follows the ``<li>`` holding a bold
     label; most are read from an ``itemprop`` span inside it and passed
     through ``cus_strip``. The image URL is resolved against
     ``self.base_url`` when an image source is present.
     """
     code_xpath = '//b[contains(text(), "Product Code")]/../following-sibling::li[1]/text()'
     prop_xpath = '//b[contains(text(), {!r})]/../following-sibling::li[1]//span[@itemprop]/text()'

     def prop(label):
         # Cleaned text of the itemprop span that follows the bold label.
         return cus_strip(response.xpath(prop_xpath.format(label)).get())

     image_src = response.xpath(
         '//img[@class="productDetailsImage"]/@src').get()
     # A missing/empty source is passed through unchanged.
     img_url = urljoin(self.base_url, image_src) if image_src else image_src
     smiles = response.xpath(
         '//li[contains(text(), "Smile Code")]/following-sibling::li[1]/text()'
     ).get()

     item = dict(
         brand="anaxlab",
         en_name=response.xpath('//h1[@class="title"]/text()').get(),
         prd_url=response.request.url,  # product detail link
         cat_no=cus_strip(response.xpath(code_xpath).get()),
         cas=prop('CAS Number'),
         mf=prop('Molecular Formula'),
         mw=prop('Molecular Weight'),
         smiles=smiles,
         info1=prop('Synonyms'),
         parent=response.meta.get('parent'),
         img_url=img_url,
     )
     yield RawData(**item)

Esempio n. 28

0

Mostra file

 def detail_parse(self, response):
     """Parse a product detail page and yield a single ``RawData`` item.

     Labelled values are read from the ``product1_l`` block: the text
     around the ``<span>`` containing the label is whitespace-normalised
     by XPath and then passed through ``strip``. An empty stock cell is
     stored as ``None``, as is a missing image.
     """
     label_xpath = 'normalize-space(//div[@class="product1_l"]//span[contains(text(), "{}")]/../text())'

     def labelled(label):
         # Normalised, stripped text next to the labelled <span>.
         return strip(response.xpath(label_xpath.format(label)).get())

     image_src = response.xpath('//div[@class="product1"]/img/@src').get()
     if image_src:
         img_url = urljoin(self.base_url, image_src)
     else:
         img_url = None

     title = response.xpath('//div[@class="product1_l"]//h1/text()').get()
     stock = response.xpath(
         'normalize-space(//div[@class="product2"]//tr[position()>1]/td[4]/text())'
     ).get()

     item = dict(
         brand="synpharmatech",
         cat_no=labelled("Cat. No"),
         en_name=strip(title),
         info1=labelled("Synonyms"),
         cas=labelled("CAS No"),
         mf=labelled("Formula"),
         mw=labelled("F.W"),
         purity=labelled("Purity"),
         stock_info=strip(stock) or None,
         prd_url=response.url,
         img_url=img_url,
     )
     yield RawData(**item)

Esempio n. 29

0

Mostra file

File: veeprho_spider.py Progetto: Pandaaaa906/product_spider

 def detail_parse(self, response):
     """Parse a product detail page and yield a single ``RawData`` item.

     Two lookups are used: table rows (text of the cell after a labelled
     ``<td>``) and inline labels (the text node right after a ``<strong>``
     whose text equals the label). Inline values are trimmed and stored as
     ``None`` when blank. The image comes from the ``data-src`` attribute
     and is resolved against ``self.base_url``.
     """
     row_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'
     inline_xpath = '//strong[text()={!r}]/following-sibling::text()'

     def row(label):
         # First text node of the cell(s) after the labelled <td>.
         return response.xpath(row_xpath.format(label)).get()

     def inline(label):
         # Trimmed text following the <strong> label, or None if blank.
         return response.xpath(inline_xpath.format(label)).get('').strip() or None

     image_src = response.xpath('//div[@class="image"]/img/@data-src').get()
     if image_src:
         image = urljoin(self.base_url, image_src)
     else:
         image = None

     item = dict(
         brand="veeprho",
         parent=response.meta.get('parent'),
         cat_no=inline("Catalogue No.:"),
         en_name=response.xpath('//div[@class="container"]/h1/text()').get(),
         img_url=image,
         cas=inline("CAS No.:"),
         prd_url=response.url,
         mf=row('Molecular Formula'),
         mw=row('Molecular Weight'),
         stock_info=row('Status'),
         info1=row('IUPAC Name'),
     )
     yield RawData(**item)

Esempio n. 30

0

Mostra file

    def detail_parse(self, response):
        """Parse a product page into one ``RawData`` plus its ``ProductPackage`` items.

        Product attributes are read from ``<th>`` label rows. Package
        variants are taken from the JSON object assigned to the JavaScript
        variable ``matrixChildrenProducts`` in the page source; when that
        assignment is absent only the ``RawData`` item is produced. Variants
        without a ``sku`` are ignored.
        """
        th_xpath = '//th[contains(text(),{0!r})]/following-sibling::td/descendant-or-self::text()'

        def nodes(label):
            # All text nodes inside the cell(s) following the labelled <th>.
            return response.xpath(th_xpath.format(label))

        image_src = response.xpath(
            '//th[contains(text(),"Structure")]/following-sibling::td/img/@src'
        ).get()
        cat_no = strip(nodes("Product No.").get())
        name_parts = response.xpath(
            '//div[@class="product-name"]/span/descendant-or-self::text()'
        ).extract()
        unit_price = response.xpath(
            '//table[@id="product-matrix"]//td[@class="unit-price"]/text()'
        ).get()

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            parent=nodes("Category").get(),
            info1="".join(nodes("Synonym(s)").extract()),
            mw=nodes("Molecular Weight").get(),
            mf="".join(nodes("Formula").extract()),
            cas=nodes("CAS Number").get(),
            en_name=strip("".join(name_parts)),
            # A missing/empty source is passed through unchanged.
            img_url=urljoin(self.base_url, image_src) if image_src else image_src,
            stock_info=unit_price,
            prd_url=response.url,
        )

        # The variant matrix is embedded in the page as a JS object literal.
        found = re.search(r'var matrixChildrenProducts = ({.+});', response.text)
        if not found:
            return
        variants = json.loads(found.group(1))
        for variant in variants.values():
            sku = variant.get('sku')
            if not sku:
                continue
            # The package label is the SKU with the "<cat_no>-" part removed.
            package = sku.replace(f'{cat_no}-', '')
            in_stock = variant.get('is_in_stock')
            yield ProductPackage(
                brand=self.brand,
                cat_no=cat_no,
                cat_no_unit=sku,
                package=strip(package),
                price=variant.get('price'),
                currency='USD',
                delivery_time='In-stock' if in_stock else None,
            )