def parse_detail(self, response):
    """Yield one RawData item, then one ProductPackage per price-table row.

    Property values are read from the ``<td>`` that follows a ``<th>`` whose
    text contains the given label.
    """
    cell_xpath = '//th[contains(text(), {!r})]/following-sibling::td/text()'

    def cell(label):
        # First text node of the <td> following the <th> that contains ``label``.
        return response.xpath(cell_xpath.format(label)).get()

    cat_no = response.xpath('//div/span[@style]/text()').get()
    img_src = response.xpath('//div[@class="pic"]/img/@src').get()

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//div/span/@data-nameen').get(),
        cas=cell("CAS:"),
        mdl=cell("MDL:"),
        mf=formula_trans(strip(cell("分子式:"))),
        mw=cell("分子量:"),
        smiles=cell("SMILES code:"),
        purity=cell("化学纯度:"),
        # Keep the falsy value as-is when the page has no image.
        img_url=urljoin(response.url, img_src) if img_src else img_src,
        prd_url=response.url,
    )

    for tr in response.xpath('//div[@class="table-1"]//tbody/tr'):
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[1]/text()').get(),
            price=strip(tr.xpath('./td[2]/text()').get()),
            stock_num=tr.xpath('./td[5]/text()').get(),
            currency='RMB',
        )
def parse_detail(self, response):
    """Yield a single RawData item built from a label/value table.

    The English name doubles as the catalogue number. It is taken from the
    first ``<p>`` of the "Product Name" cell, falling back to the first
    ``<span>`` text of that cell.
    """
    span_xpath = '//td[contains(./span/text(), {!r})]/following-sibling::td//span//text()'
    para_xpath = '//td[contains(./span/text(), {!r})]/following-sibling::td/p[{}]/span/text()'

    def first_span(label):
        # First span text of the value cell, stripped.
        return strip(response.xpath(span_xpath.format(label)).get())

    def joined_spans(label):
        # All span texts of the value cell concatenated, then stripped.
        return strip(''.join(response.xpath(span_xpath.format(label)).getall()))

    def paragraph(label, index):
        # Span text of the ``index``-th <p> inside the value cell, stripped.
        return strip(response.xpath(para_xpath.format(label, index)).get())

    en_name = paragraph("Product Name", 1)
    if not en_name:
        en_name = first_span("Product Name")

    yield RawData(
        brand=self.brand,
        cat_no=en_name,
        en_name=en_name,
        chs_name=paragraph("Product Name", 2),
        cas=first_span("Cas No."),
        info1=first_span("Sequence"),
        mf=joined_spans("Molecular Formula"),
        mw=first_span("Molar Mass"),
        purity=joined_spans("Purity"),
        info2=first_span("Storage Temperature"),
        img_url=response.xpath(
            '//div[contains(@class, "slick-slide")][1]/a/img/@src').get(),
        prd_url=response.url,
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'tlc' brand.

    Labelled values are obtained through ``self.extract_value``; the parent
    heading and stock status are title-cased and become ``None`` when empty.
    """
    page = '//section[@class="page"][1]'
    rel_img = response.xpath(
        '//section[@class="page"][1]//img/@src').extract_first()

    def labelled(label):
        return self.extract_value(response, label)

    heading = response.xpath(
        page + '//h3[@class="title--product"]/text()'
    ).extract_first(default="")
    status = response.xpath('//span[@class="status"]/text()').extract_first("")

    yield RawData(
        en_name=labelled("Compound Name:"),
        cat_no=labelled("Catalogue Number:"),
        # Keep the falsy value as-is when the page has no image.
        img_url=urljoin(self.base_url, rel_img) if rel_img else rel_img,
        info1=labelled("Synonyms:"),
        cas=labelled("CAS#:"),
        mw=labelled("Molecular Weight:"),
        mf=labelled("Molecular Formula:"),
        parent=heading.title() or None,
        brand='tlc',
        prd_url=response.request.url,
        stock_info=status.strip().title() or None,
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'eco' brand.

    Labelled values are read from the ``<font>`` element that follows a
    ``<span>`` whose text contains the label. ``parent`` is taken from
    ``response.meta``.
    """
    tmp = '//span[contains(text(),"{}")]/following-sibling::font/text()'
    rel_img = response.xpath(
        '//div[@class="p_viewimg pcshow"]//img/@src').get()
    d = {
        "brand": "eco",
        "parent": response.meta.get("parent"),
        "cat_no": response.xpath(tmp.format("Catalogue number")).get(),
        "cas": response.xpath(tmp.format("CAS Number")).get(),
        "en_name": response.xpath('//div[@class="p_vtitle"]/text()').get(),
        # Only resolve the image when one was found: joining a missing value
        # would otherwise produce the bare base URL as a bogus image link.
        "img_url": rel_img and urljoin(self.base_url, rel_img),
        "mf": response.xpath(tmp.format("Molecular Formula")).get(),
        "mw": response.xpath(tmp.format("Molecular Weight")).get(),
        "prd_url": response.url,
    }
    yield RawData(**d)
def parse_detail(self, response):
    """Yield a single RawData item for the 'medicalisotopes' brand.

    ``info3`` is the first cell of the nested table with trailing
    non-breaking spaces and ``=`` characters removed; ``info4`` is the
    second cell.
    """
    value_xpath = '//td[contains(text(), {!r})]/following-sibling::td//text()'

    def value(label):
        # First text node of the value cell, stripped.
        return strip(response.xpath(value_xpath.format(label)).get())

    def joined(label):
        # All text nodes of the value cell concatenated, then stripped.
        return strip(''.join(response.xpath(value_xpath.format(label)).getall()))

    pack_text = strip(
        response.xpath('normalize-space(//td/table//td[1]/text())').get())
    product_name = response.xpath(
        '//th[contains(text(), "Product:")]/following-sibling::th/text()'
    ).get()

    yield RawData(
        brand='medicalisotopes',
        parent=response.meta.get('parent'),
        cat_no=value("Catalog Number:"),
        en_name=strip(product_name),
        cas=value("CAS Number:"),
        mf=joined("Formula:"),
        mw=value("Molecular Weight:"),
        info3=pack_text.rstrip('\xa0=') if pack_text else pack_text,
        info4=strip(response.xpath('//td/table//td[2]/text()').get()),
        prd_url=response.url,
    )
def parse_table(self, response):
    """Yield a RawData item merging ``meta['prd_info']`` with SKU details.

    The size, price and stock status read from this response are added to
    the product fields carried in ``response.meta['prd_info']``.
    """

    def text_of(xpath):
        return strip(response.xpath(xpath).get())

    sku_fields = {
        'info3': text_of('//td[@class="skusize"]/text()'),
        'info4': text_of('//span[@class="price"]/text()'),
        'stock_info': text_of('//span[contains(@class, "stockstatus")]/text()'),
    }
    carried = response.meta.get('prd_info', {})
    yield RawData(**carried, **sku_fields)
def parse_detail(self, response):
    """Yield one RawData item, then one ProductPackage per size-table row.

    Any RawData field whose value is exactly ``'NA'`` is replaced by ``None``.
    """
    cell_xpath = 'normalize-space(//td[contains(div/text(), {!r})]/following-sibling::td/text())'

    def cell(label):
        # Whitespace-normalised text of the cell following the labelled one.
        return strip(response.xpath(cell_xpath.format(label)).get())

    img_src = response.xpath('//div[@class="c_c_p"]//div/img/@src').get()
    cat_no = cell("产品号/Catalog#")

    fields = {
        'brand': self.brand,
        'parent': response.meta.get('parent'),
        'cat_no': cat_no,
        'en_name': cell("Product Name:"),
        'chs_name': cell("产品名称:"),
        'cas': cell("CAS#:"),
        'mf': cell("分子式/Formula:"),
        'mw': cell("分子量/MW:"),
        'purity': cell("纯度/Purity (%):"),
        'info1': cell("Synonyms:"),
        'info2': cell("储藏条件/Storage:"),
        'appearance': cell("颜色/Color:"),
        'img_url': urljoin(response.url, img_src) if img_src else img_src,
        'prd_url': response.url,
    }
    # The literal 'NA' is treated as a missing value.
    fields = {key: (None if val == 'NA' else val) for key, val in fields.items()}
    yield RawData(**fields)

    size_rows = response.xpath(
        '//table[@class="c_p_size"]//tr[td and td/text()!="NA"]')
    for tr in size_rows:
        # NOTE(review): 'price' reads the same td[1] cell as 'package'; this
        # looks like a copy-paste slip -- confirm the intended price column.
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[1]/text()').get(),
            price=tr.xpath('./td[1]/text()').get(),
        )
def parse(self, response):
    """Yield a RawData item per table row, then request the next page.

    Row values are read from the ``value`` attribute of ``<input>`` elements.
    Three integers (current page, page size, total items) are read from the
    pager ``<script>``; while more items remain, the ``formList`` form is
    re-submitted for the next page with this method as callback.
    """
    prd_rows = response.xpath("//table[@class='list_tab003']//tr")
    for row in prd_rows:
        d = {
            'cat_no': row.xpath(".//td[1]/input/@value").get(default=""),
            'chs_name': row.xpath(".//td[2]/input/@value").get(default=""),
            'info2': row.xpath(".//td[3]/input/@value").get(default=""),
            'info1': row.xpath(".//td[4]/input/@value").get(default=""),  # specification
            'info3': row.xpath(".//td[5]/input/@value").get(default=""),  # batch number
            'info4': row.xpath(".//td[6]/input/@value").get(default=""),  # storage conditions
            'stock_info': row.xpath(".//td[1]/font/text()").get(default=""),
        }
        yield RawData(**d)

    pager_script = response.xpath(
        "//div[@class='page']/script/text()").re(r"(\d+),(\d+),(\d+)")
    if len(pager_script) < 3:
        return
    # ``.re`` flattens the groups of every match; only the first triple is
    # used so that a second match in the script cannot break the unpacking.
    cur_page, page_size, total_items = map(int, pager_script[:3])
    if page_size * cur_page >= total_items:
        return

    data = [
        ('sgoodsno', ''),
        ('sgoodsname', ''),
        ('curPage', str(cur_page + 1)),
        ('pageSize', pager_script[1]),
        ('toPage', pager_script[0]),
    ]
    yield FormRequest.from_response(
        response,
        callback=self.parse,
        method="POST",
        formname="formList",
        formdata=data,
        dont_filter=True,
        errback=self.err_parse,
    )
def parse_detail(self, response):
    """Yield a single RawData item for the 'chiron' brand.

    ``info3`` combines pack size and specification ("N/A" when the
    specification is missing); ``info4`` is "<currency> <price>" when a
    price was found.
    """
    cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

    def cell(label):
        return response.xpath(cell_xpath.format(label)).get()

    currency = cell("Currency")
    price = cell("excl. VAT")
    specification = cell("Specification")
    package = cell("Pack size")

    stock_xpath = (
        '//span[contains(text(), "Stock status")]'
        '/parent::div/parent::td/following-sibling::td/span/text()'
    )

    yield RawData(
        brand='chiron',
        parent=response.meta.get('parent'),
        cat_no=cell("Product no."),
        en_name=response.xpath('//h2/text()').get(),
        cas=cell("CAS Nr."),
        info3=f'{package}; {specification or "N/A"}',
        info4=f'{currency} {price}' if price else price,
        stock_info=response.xpath(stock_xpath).get(),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Yield a single RawData item for the 'cpachem' brand.

    Labelled values are the text nodes that follow a ``<b>`` element whose
    text equals the label. ``info4`` is the price heading with its leading
    ``Price:`` label removed.
    """
    tmp = '//b[text()={!r}]/following-sibling::text()'
    catagory = strip(
        response.xpath(
            '//b[text()="Category:"]/following-sibling::a/text()').get())

    price_text = strip(
        response.xpath('//h3[contains(text(), "Price:")]/text()').get())
    if price_text:
        # Remove the label as a prefix. ``str.lstrip('Price: ')`` strips a
        # *set of characters* and could also eat the start of the value.
        prefix = 'Price:'
        if price_text.startswith(prefix):
            price_text = price_text[len(prefix):].lstrip()

    d = {
        'brand': 'cpachem',
        'parent': catagory,
        'cat_no': strip(response.xpath(tmp.format("Ref Num:")).get()),
        'en_name': strip(response.xpath(tmp.format("Full Name:")).get()),
        'info2': strip(
            response.xpath(tmp.format("Shelf Life on Ship Date:")).get()),
        'info3': strip(response.xpath(tmp.format("Vol.:")).get()),
        # Left as returned by ``strip`` when the page has no price heading.
        'info4': price_text,
        'stock_info': strip(
            response.xpath(
                '//p[@style="padding:15px 0px 5px 0px;"]/text()').get()),
        'prd_url': response.url,
    }
    yield RawData(**d)
def detail_parse(self, response):
    """Yield a RawData item for the 'lgc' brand when the code starts with 'MM'.

    Pages whose product code does not start with ``MM`` (case-insensitive)
    produce nothing. ``parent`` is the "API Family" text, falling back to
    the second-to-last breadcrumb entry.
    """
    item_xpath = '//div[contains(@class,"product__item")]/h2[text()={!r}]/following-sibling::*/descendant-or-self::text()'

    def item(label, *default):
        # First text node under the element following the <h2> with ``label``.
        return response.xpath(item_xpath.format(label)).get(*default)

    cat_no = item("Product Code", '')
    if not cat_no.upper().startswith('MM'):
        return

    api_family = "".join(response.xpath(
        '//div[contains(@class,"product page-section")]//div[contains(@class,"product__item")]/h2[contains(text(),"API Family")]/following-sibling::*/descendant-or-self::text()'
    ).extract())
    breadcrumb = response.xpath(
        '//ul[contains(@class,"breadcrumb")]/li[position()=last()-1]/a/text()'
    ).get(default="").strip()
    stock_title = response.xpath(
        '//h4[contains(@class,"orderbar__stock-title")]/descendant-or-self::text()'
    ).get("")

    yield RawData(
        brand='lgc',
        parent=api_family or breadcrumb,
        cat_no=cat_no,
        en_name=response.xpath('//h1[@class="product__title"]/text()').get(default="").strip(),
        cas=item("CAS Number", "").strip() or None,
        mf=item("Molecular Formula", "").replace(" ", "") or None,
        mw=item("Molecular Weight"),
        stock_info=stock_title.strip() or None,
        img_url=response.xpath('//div[contains(@class, "product__brand-img")]/img/@src').get(),
        info1=item("IUPAC", "").strip(),
        info3=response.xpath('//span[text()="Pack Size:"]/following-sibling::p/text()').get(),
        prd_url=response.request.url,
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'accustandard' brand.

    Attribute values are read from ``div.value`` children of a ``<div>``
    whose class contains the attribute name. All CAS cells are joined
    with ``;``.
    """
    attr_xpath = '//div[contains(@class, {!r})]/div[contains(@class, "value")]//text()'

    def attr(name):
        return response.xpath(attr_xpath.format(name))

    cas_cells = response.xpath(
        '//td[contains(@class, "cas_number")]/text()').extract()

    yield RawData(
        brand="accustandard",
        parent=response.xpath(
            '//li[position()=last()-1]/a/span[@itemprop]/text()').get(),
        cat_no=response.xpath('//div[@itemprop="sku"]/text()').get(),
        en_name=response.xpath('//h1[@class="page-title"]//text()').get(),
        cas=";".join(cas_cells),
        mf="".join(attr('molecular_formula').extract()) or None,
        mw=attr('molecular_weight').get(),
        # "stock_info": response.xpath('//meta[@itemprop="availability"]/@content').get(),
        img_url=response.xpath('//img[@itemprop="image"]/@data-src').get(),
        info2=attr('sales_unit_size').get(),
        info3=attr('storage_condition').get(),
        # "info4": response.xpath('//span[@class="price"]/text()').get(),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Yield one RawData item, then one ProductPackage per table row.

    The first row of the ``table-responsive`` table is skipped. ``parent``
    joins the link texts that follow the active ``<li>`` with ``_``.
    """
    prop_xpath = '//div[contains(*/text(), {!r})]/following-sibling::div/*/text()'

    def prop(label):
        return response.xpath(prop_xpath.format(label)).get()

    cat_no = response.xpath('//span[@id="catalogNo"]/text()').get()
    img_value = response.xpath('//input[@id="image"]/@value').get()
    crumbs = response.xpath(
        '//li[@class="active"]/following-sibling::li/a/text()').getall()

    yield RawData(
        brand=self.brand,
        parent='_'.join(crumbs),
        cat_no=cat_no,
        en_name=response.xpath('//h2/span/text()').get(),
        purity=response.xpath('//span[@class="d-purity"]/text()').get(),
        cas=prop("CAS 号"),
        mf=prop("分子式"),
        mw=prop("分子量"),
        smiles=prop("Smiles Code"),
        info2=prop("存储条件"),
        mdl=prop("MDL 号"),
        img_url=urljoin(response.url, img_value) if img_value else img_value,
        prd_url=response.url,
    )

    for tr in response.xpath('//div[@class="table-responsive"]//tr[position()!=1]'):
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=tr.xpath('./td[@id="packing"]/text()').get(),
            price=tr.xpath('./td[@id="money"]/text()').get(),
            currency='RMB',
            stock_num=tr.xpath('./td[@id="stock"]/text()').get(),
        )
def parse_detail(self, response):
    """Yield a single RawData item; the brand comes from the page or ``self.name``.

    Header values are read from ``<p>`` elements containing the label; table
    values are read from link texts in the cell following the label, and
    become ``None`` when empty.
    """
    header_xpath = '//p[contains(text(), {!r})]/span//text()'
    table_xpath = '//td[contains(text(),{!r})]/following-sibling::td//text()[parent::a[not(@id="relatedCategoryLink")]]'

    def joined(xpath):
        # Concatenate every matched text node, then strip.
        return strip(''.join(response.xpath(xpath).getall()))

    def header(label):
        return joined(header_xpath.format(label))

    def table(label):
        return joined(table_xpath.format(label)) or None

    page_brand = strip(response.xpath('//input[@id="brand"]/@value').get())
    cat_no = strip(response.xpath('//strong[@itemprop="productID"]/text()').get())
    img_src = response.xpath('//div[@class="productMedia"]//img/@src').get()

    yield RawData(
        brand=page_brand.lower() if page_brand else self.name,
        cat_no=cat_no,
        en_name=joined('//h1[@itemprop="name"]//text()'),
        cas=header("CAS"),
        mf=header("Formula"),
        mw=header("Molecular Weight"),
        mdl=header("MDL number"),
        parent=table("Related Categories"),
        grade=table("grade"),
        info5=table("product line"),
        info2=table("storage temp."),
        purity=table("assay"),
        smiles=table("SMILES string"),
        img_url=urljoin(response.url, img_src) if img_src else img_src,
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Yield a single RawData item for the '毕得' brand.

    Nothing is yielded when the page carries no catalogue number.
    """
    cat_no = response.xpath('//input[@id="catNum"]/@value').get()
    if not cat_no:
        return

    cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td//text()'

    def cell(label):
        return response.xpath(cell_xpath.format(label))

    def text(xpath):
        return response.xpath(xpath).get()

    yield RawData(
        brand='毕得',
        parent=response.meta.get('parent'),
        cat_no=cat_no,
        en_name=text('//span[@class="sp_pro_name_en"]/text()'),
        chs_name=text('//span[@class="sp_pro_name_cn"]/text()'),
        cas=cell("CAS号:").get(),
        mf=''.join(cell("分子式:").getall()),
        mw=cell("分子量:").get(),
        purity=text('//span[@id="first_purity"]/text()'),
        info2=text('//span[contains(text(), "存储:")]/text()'),
        img_url=text(
            '//div[@class="products-big-img img-box position-R"]/img/@src'),
        prd_url=response.url,
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'dre' brand.

    ``parent`` is the "API Family" text, falling back to the second-to-last
    breadcrumb entry. ``info4`` is the colour and appearance joined by a
    single space.
    """
    item_xpath = '//div[contains(@class,"product__item")]/h2[text()={!r}]/following-sibling::*/descendant-or-self::text()'

    def item(label, *default):
        # First text node under the element following the <h2> with ``label``.
        return response.xpath(item_xpath.format(label)).get(*default)

    def heading_value(xpath, *default):
        return response.xpath(xpath).get(*default)

    api_family = "".join(response.xpath(
        '//div[contains(@class,"product page-section")]//div[contains(@class,"product__item")]/h2[contains(text(),"API Family")]/following-sibling::*/descendant-or-self::text()'
    ).extract())
    breadcrumb = response.xpath(
        '//ul[contains(@class,"breadcrumb")]/li[position()=last()-1]/a/text()'
    ).get(default="").strip()
    color = heading_value('//h2[text()="Color"]/following-sibling::p/text()', "")
    appearance = heading_value('//h2[text()="Appearance/Form"]/following-sibling::p/text()', "")
    stock_title = response.xpath(
        '//h4[contains(@class,"orderbar__stock-title")]/descendant-or-self::text()'
    ).get("")

    yield RawData(
        brand="dre",
        parent=api_family or breadcrumb,
        cat_no=item("Product Code"),
        en_name=response.xpath('//h1[@class="product__title"]/text()').get(default="").strip(),
        cas=item("CAS Number", "").strip() or None,
        mf=item("Molecular Formula", "").replace(" ", "") or None,
        mw=item("Molecular Weight"),
        stock_info=stock_title.strip() or None,
        img_url=response.xpath('//div[contains(@class, "product__brand-img")]/img/@src').get(),
        info1=item("IUPAC", "").strip(),
        info2=heading_value('//h2[text()="Storage Temperature"]/following-sibling::p/text()'),
        info3=heading_value('//h2[text()="Shipping Temperature"]/following-sibling::p/text()'),
        info4=' '.join((color, appearance)),
        prd_url=response.request.url,
    )
def parse(self, response):
    """Yield a RawData and a ProductPackage item per ``<Reference>`` element.

    The response body is parsed as XML. The package price is the ``Price``
    text with the euro sign removed; the currency is fixed to ``EUR``.
    """

    def pick(node, xpath, default=None):
        # First result of ``xpath`` on ``node``, or ``default`` when absent.
        return first(node.xpath(xpath), default)

    document = XML(response.body)
    for ref in document.xpath('//Reference'):
        cat_no = pick(ref, './Order_Code/text()')
        quantity = pick(ref, './Quantity_per_vial/text()')
        price = pick(ref, './Price/text()')
        url_code = pick(ref, './Order_Code/text()', '')

        yield RawData(
            brand=self.brand,
            cat_no=cat_no,
            cas=pick(ref, './CAS_Registry_Number/text()'),
            en_name=pick(ref, './Reference_Standard/text()'),
            info2=pick(ref, './Storage/text()'),
            info3=quantity,
            info4=price,
            prd_url=f"https://crs.edqm.eu/db/4DCGI/View={url_code}",
        )

        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=quantity,
            price=price.replace('€', '') if price else price,
            currency='EUR',
        )
def parse_detail(self, response):
    """Yield one RawData item, then one ProductPackage per variant option.

    The catalogue number is the text between ``SKU:`` and the last ``-`` of
    the variant SKU. Each ``<option>`` text is split at its last ``-`` into
    package and price; the ``$`` sign is removed from the price and the
    currency is fixed to ``USD``.
    """
    parent = response.meta.get('parent')
    sku_text = response.xpath("//span[@class='variant-sku']//text()").get()
    # A page without a SKU element yields ``None``; searching an empty string
    # instead avoids a TypeError and leaves ``cat_no`` as ``None``.
    cat_no = first(re.findall(r'SKU:(.+)-', sku_text or ''), None)

    cell = '//td[contains(text(), "{}")]/following-sibling::td/text()'
    rel_img = response.xpath('//noscript/img/@src').get()
    d = {
        "brand": self.name,
        "parent": parent,
        "en_name": response.xpath("//h1[@class='product-header']/text()").get(),
        "cat_no": cat_no,
        "prd_url": response.url,
        "mf": response.xpath(cell.format("Molecular Formula:")).get(),
        "mw": response.xpath(cell.format("Molecular Weight:")).get(),
        "cas": response.xpath(cell.format("CAS Number:")).get(),
        "smiles": response.xpath(cell.format("SMILES:")).get(),
        "purity": response.xpath(cell.format("Purity (HPLC):")).get(),
        "info1": response.xpath(cell.format("Synonyms:")).get(),
        "info2": response.xpath(cell.format("Storage Conditions:")).get(),
        "img_url": rel_img and urljoin(response.url, rel_img),
    }
    yield RawData(**d)

    options = response.xpath(
        '//select[@id="product-select-product-template"]/option/text()'
    ).getall()
    for option in options:
        # Split at the LAST hyphen so a hyphen inside the package text does
        # not break the unpacking; an option without any hyphen is kept as a
        # package with no price instead of raising.
        package, sep, price = option.rpartition("-")
        if sep:
            price = price.replace("$", '')
        else:
            package, price = option, None
        dd = {
            "brand": self.name,
            "cat_no": cat_no,
            "package": package,
            "currency": "USD",
            "price": price,
        }
        yield ProductPackage(**dd)
def parse_detail(self, response):
    """Yield a single RawData item using fields carried in ``response.meta``.

    ``cas`` is set only when exactly one non-empty CAS cell is found;
    ``info1`` lists the distinct CAS values joined with ``;`` in the order
    they appear on the page.
    """
    raw_cells = response.xpath('//tr[@class="style17"]/td[3]/text()').getall()
    cas = tuple(value for value in map(strip, raw_cells) if value)
    # ``dict.fromkeys`` de-duplicates while keeping first-seen order; joining
    # a ``set`` would give an order that changes between interpreter runs.
    distinct_cas = dict.fromkeys(cas)
    d = {
        'brand': self.brand,
        'parent': response.meta.get('parent'),
        'cat_no': response.meta.get('cat_no'),
        'en_name': response.meta.get('en_name'),
        'cas': cas[0] if len(cas) == 1 else None,
        'info1': ';'.join(distinct_cas),
        'info3': response.meta.get('package'),
        'info4': response.xpath(
            '//p[contains(text(), "Price:")]/strong/text()').get(),
        'prd_url': response.url,
    }
    yield RawData(**d)
def parse_detail(self, response):
    """Build the 'chemimpex' product fields and continue to the SKU list.

    When the page text contains a ``push({...});`` JSON payload, its
    ``param1`` .. ``param6`` values form the SKU-list URL, which is requested
    with the product fields in ``meta['prd_info']`` and ``self.parse_table``
    as callback. Otherwise a RawData item is yielded directly.
    """
    span_xpath = '//span[contains(text(), {!r})]/following-sibling::span//text()'

    def span(label):
        return response.xpath(span_xpath.format(label))

    def first_text(label):
        return strip(span(label).get())

    title_parts = response.xpath(
        '//h1[@itemprop="name"]//text()[not(parent::span)]').getall()

    product = {
        'brand': 'chemimpex',
        'parent': response.meta.get('parent'),
        'cat_no': span("Catalog Number:").get(),
        'en_name': strip(''.join(title_parts)),
        'purity': strip(response.xpath('//h1[@itemprop="name"]/span[@style]/text()').get()),
        'mf': strip(''.join(span('Molecular Formula:').getall())),
        'mw': first_text('Molecular Weight:'),
        'cas': first_text('CAS No:'),
        'appearance': first_text('Appearance:'),
        'info1': strip(';'.join(span('Synonyms:').getall())),
        'info2': first_text('Storage Temp:'),
        'img_url': strip(response.xpath('//div[@id="catalog_content"]/img/@src').get()),
        'prd_url': response.url,
    }

    payload = re.search(r'push\(({.+\})\);', response.text)
    if payload:
        j_obj = json.loads(payload.group(1))
        # Missing parameters are substituted by empty path segments.
        params = [j_obj.get(f'param{i}', '') for i in range(1, 7)]
        url = 'https://www.chemimpex.com/Widgets-product/gethtml_skulist/{}/{}/{}/{}/{}/{}'.format(*params)
        yield Request(url, callback=self.parse_table, meta={'prd_info': product})
    else:
        yield RawData(**product)
def list_parse(self, response):
    """Yield a RawData item for every product ``<li>`` in the listing.

    Labelled values are read from the element whose text contains the label,
    with the label itself removed from the value.
    """
    nodes = response.xpath('//ul[@class="pro"]/li')
    tmp = './/*[contains(text(),{!r})]/text()'
    for node in nodes:
        rel_img = node.xpath('./span//img/@src').get()
        d = {
            "brand": "std",
            "parent": response.meta.get('parent'),
            "cat_no": node.xpath(tmp.format("STD No.")).get("").replace(
                "STD No.", "").strip(),
            "cas": node.xpath(tmp.format("CAS No.")).get("").replace(
                "CAS No.", "").strip(),
            "en_name": node.xpath('./h3//p/text()').get(),
            # Only resolve the image when one was found: joining a missing
            # value would otherwise produce the bare base URL as image link.
            "img_url": rel_img and urljoin(self.base_url, rel_img),
            "mf": node.xpath(tmp.format("Chemical Formula")).get("").replace(
                "Chemical Formula :", "").strip(),
            "prd_url": urljoin(self.base_url, node.xpath('./a/@href').get('')),
        }
        yield RawData(**d)
def parse_detail(self, response):
    """Yield a single RawData item read from a label/value table.

    The molecular formula concatenates every text node of its cell; the
    other labelled values use the first text node only.
    """
    cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td//text()'

    def cell(label):
        return strip(response.xpath(cell_xpath.format(label)).get())

    formula_parts = response.xpath(cell_xpath.format("分子式:")).getall()

    yield RawData(
        brand=self.brand,
        parent=response.meta.get('parent'),
        cat_no=cell("产品编号:"),
        en_name=strip(response.xpath('//div[@class="proinftit_t"]/text()').get()),
        cas=cell("CAS号:"),
        mf=strip(''.join(formula_parts)),
        mw=cell("分子量:"),
        info1=cell("化学名:"),
        img_url=response.xpath('//div[@class="proinfotableimg"]/img/@src').get(),
        prd_url=response.url,
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'synzeal' brand.

    Line breaks are removed from the product title before it is stripped.
    """
    cell_xpath = '//td[contains(text(),{!r})]/following-sibling::td/text()'

    def cell(label, **kwargs):
        return response.xpath(cell_xpath.format(label)).get(**kwargs)

    title = response.xpath(
        '//h1[@class="product-detail-title"]/text()').get(default="")
    title = re.sub(r'\r?\n', "", title)
    synonym = response.xpath('//b[text()="Synonym: "]/../text()').get(default="")

    yield RawData(
        brand="synzeal",
        en_name=title.strip(),
        prd_url=response.request.url,  # product detail link
        cat_no=cell('SZ CAT No'),
        cas=cell('CAS No', default=""),
        stock_info=cell('Inv. Status'),
        mf=cell('Mol.F.'),
        mw=cell('Mol.Wt.'),
        info1=synonym.strip(),
        parent=response.meta.get('parent'),
        img_url=response.xpath(
            '//div[@class="product-details-tab"]//img/@src').get(),
    )
def parse_detail(self, response):
    """Yield one RawData item followed by one ProductPackage item.

    The package price is the first decimal number found in the
    "Retail Price:" cell; the whitespace-collapsed cell text is kept in
    ``info``. The currency is fixed to ``USD``.
    """
    cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

    def cell(label):
        return response.xpath(cell_xpath.format(label)).get()

    cat_no = cell('Catalog #')
    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//td[@class="pageTitle"]/text()').get(),
        cas=cell('CAS#'),
        stock_info=cell('In Stock'),
        prd_url=response.url,
    )

    price_text = strip(response.xpath(
        'normalize-space(//td[contains(text(), "Retail Price:")]/following-sibling::td/text())'
    ).get())
    amount = None
    if price_text:
        price_text = re.sub(r'\s+', ' ', price_text)
        numbers = (found.group(0)
                   for found in re.finditer(r'(\d+(\.\d+)?)', price_text))
        amount = first(numbers, None)

    yield ProductPackage(
        brand=self.brand,
        cat_no=cat_no,
        price=amount,
        currency='USD',
        info=price_text,
        delivery_time=cell('In Stock'),
    )
def cat_parse(self, response):
    """Yield a RawData item for every product entry in a category listing.

    Entries without a "Purity" text block are skipped. The block's first
    five text nodes are read positionally as purity, catalogue number, CAS,
    stock and formula, each taken from after the first ``:``.
    """
    catalog = response.meta.get('catalog')

    def after_colon(line):
        # Value part of a "Label: value" line (whole line when no colon).
        return line.split(':', 1)[-1].strip()

    for entry in response.xpath('//form/div[@class="row"]/div'):
        name = entry.xpath('./a/text()').get()
        url_prd = urljoin(self.base_url, entry.xpath('./a/@href').get())
        mol_text = entry.xpath('./div/div/object/param/@value').get()
        lines = entry.xpath('./div/div[contains(text(),"Purity")]/text()').getall()
        if not lines:  # Controlled Drugs
            continue

        purity, cat_no, cas, stock, mf = (
            after_colon(lines[index]) for index in range(5)
        )
        if mol_text:
            # Turn literal escape sequences in the attribute into characters.
            mol_text = mol_text.encode('u8').decode('unicode-escape')

        yield RawData(
            brand='dalton',
            en_name=name,
            prd_url=url_prd,  # product detail link
            mol_text=mol_text,
            purity=purity,
            cat_no=cat_no,
            cas=cas,
            stock_info=stock,
            mf=mf,
            parent=catalog,
        )
def parse_detail(self, response):
    """Yield a single RawData item for the 'steraloids' brand.

    A CAS value of exactly ``'No'`` is replaced by ``None``. The product
    title is used for both ``en_name`` and ``info1``.
    """
    label_xpath = '//strong[contains(text(), {!r})]/../text()'

    def labelled(label, *default):
        return response.xpath(label_xpath.format(label)).get(*default)

    def title():
        return response.xpath('//h1[@class="product-title"]/text()').get()

    cas = labelled("CAS", '')
    if cas == 'No':
        cas = None

    yield RawData(
        brand='steraloids',
        parent=None,
        cat_no=labelled("Catalogue ID"),
        en_name=title(),
        cas=cas,
        mf=labelled("Formula", '').replace(' ', '') or None,
        mw=labelled("Molecular Weight"),
        stock_info=response.xpath(
            '//meta[@property="product:availability"]/@content').get(),
        img_url=response.meta.get('img_url'),
        info1=title(),
        prd_url=response.url,
    )
def parse_detail(self, response):
    """Yield a single RawData item for the 'anaxlab' brand.

    Labelled values are read from the ``<li>`` that follows the one holding
    the ``<b>`` label and are passed through ``cus_strip``.
    """
    code_xpath = '//b[contains(text(), "Product Code")]/../following-sibling::li[1]/text()'
    prop_xpath = '//b[contains(text(), {!r})]/../following-sibling::li[1]//span[@itemprop]/text()'

    def prop(label):
        return cus_strip(response.xpath(prop_xpath.format(label)).get())

    img_src = response.xpath(
        '//img[@class="productDetailsImage"]/@src').get()

    yield RawData(
        brand="anaxlab",
        en_name=response.xpath('//h1[@class="title"]/text()').get(),
        prd_url=response.request.url,  # product detail link
        cat_no=cus_strip(response.xpath(code_xpath).get()),
        cas=prop('CAS Number'),
        mf=prop('Molecular Formula'),
        mw=prop('Molecular Weight'),
        smiles=response.xpath(
            '//li[contains(text(), "Smile Code")]/following-sibling::li[1]/text()'
        ).get(),
        info1=prop('Synonyms'),
        parent=response.meta.get('parent'),
        img_url=urljoin(self.base_url, img_src) if img_src else img_src,
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'synpharmatech' brand.

    Labelled values are the whitespace-normalised text of the element that
    contains the label ``<span>``. ``stock_info`` comes from the fourth
    column of the ``product2`` table and is ``None`` when empty.
    """
    label_xpath = 'normalize-space(//div[@class="product1_l"]//span[contains(text(), "{}")]/../text())'

    def labelled(label):
        return strip(response.xpath(label_xpath.format(label)).get())

    img_src = response.xpath('//div[@class="product1"]/img/@src').get()
    stock_text = strip(
        response.xpath(
            'normalize-space(//div[@class="product2"]//tr[position()>1]/td[4]/text())'
        ).get())

    yield RawData(
        brand="synpharmatech",
        cat_no=labelled("Cat. No"),
        en_name=strip(
            response.xpath('//div[@class="product1_l"]//h1/text()').get()),
        info1=labelled("Synonyms"),
        cas=labelled("CAS No"),
        mf=labelled("Formula"),
        mw=labelled("F.W"),
        purity=labelled("Purity"),
        stock_info=stock_text or None,
        prd_url=response.url,
        img_url=None if not img_src else urljoin(self.base_url, img_src),
    )
def detail_parse(self, response):
    """Yield a single RawData item for the 'veeprho' brand.

    Catalogue number and CAS are the text following a ``<strong>`` label and
    become ``None`` when empty; the remaining properties are read from a
    label/value table.
    """
    table_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'
    strong_xpath = '//strong[text()={!r}]/following-sibling::text()'

    def table(label):
        return response.xpath(table_xpath.format(label)).get()

    def after_strong(label):
        return response.xpath(strong_xpath.format(label)).get('').strip() or None

    img_src = response.xpath('//div[@class="image"]/img/@data-src').get()
    if img_src:
        img_url = urljoin(self.base_url, img_src)
    else:
        img_url = None

    yield RawData(
        brand="veeprho",
        parent=response.meta.get('parent'),
        cat_no=after_strong("Catalogue No.:"),
        en_name=response.xpath('//div[@class="container"]/h1/text()').get(),
        img_url=img_url,
        cas=after_strong("CAS No.:"),
        prd_url=response.url,
        mf=table('Molecular Formula'),
        mw=table('Molecular Weight'),
        stock_info=table('Status'),
        info1=table('IUPAC Name'),
    )
def detail_parse(self, response):
    """Yield one RawData item, then one ProductPackage per matrix entry.

    Package data comes from the ``matrixChildrenProducts`` JSON object
    embedded in the page text; entries without a ``sku`` are skipped. The
    package label is the SKU with the ``<cat_no>-`` part removed.
    """
    row_xpath = '//th[contains(text(),{0!r})]/following-sibling::td/descendant-or-self::text()'

    def row(label):
        return response.xpath(row_xpath.format(label))

    structure_img = response.xpath(
        '//th[contains(text(),"Structure")]/following-sibling::td/img/@src'
    ).get()
    cat_no = strip(row("Product No.").get())
    name_parts = response.xpath(
        '//div[@class="product-name"]/span/descendant-or-self::text()'
    ).extract()

    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        parent=row("Category").get(),
        info1="".join(row("Synonym(s)").extract()),
        mw=row("Molecular Weight").get(),
        mf="".join(row("Formula").extract()),
        cas=row("CAS Number").get(),
        en_name=strip("".join(name_parts)),
        img_url=urljoin(self.base_url, structure_img) if structure_img else structure_img,
        stock_info=response.xpath(
            '//table[@id="product-matrix"]//td[@class="unit-price"]/text()'
        ).get(),
        prd_url=response.url,
    )

    matrix_json = first(
        re.findall(r'var matrixChildrenProducts = ({.+});', response.text),
        None)
    if not matrix_json:
        return

    sku_prefix = f'{cat_no}-'
    for entry in json.loads(matrix_json).values():
        sku = entry.get('sku')
        if not sku:
            continue
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            cat_no_unit=sku,
            package=strip(sku.replace(sku_prefix, '')),
            price=entry.get('price'),
            currency='USD',
            delivery_time='In-stock' if entry.get('is_in_stock') else None,
        )