def parse_detail(self, response):
    """Parse a product detail page into one RawData item plus its package rows."""
    spec_xpath = '//th[contains(text(), {!r})]/following-sibling::td/text()'

    def spec(label):
        # Value cell of the spec-table row whose <th> text contains ``label``.
        return response.xpath(spec_xpath.format(label)).get()

    image_rel = response.xpath('//div[@class="pic"]/img/@src').get()
    cat_no = response.xpath('//div/span[@style]/text()').get()
    yield RawData(
        brand=self.brand,
        cat_no=cat_no,
        en_name=response.xpath('//div/span/@data-nameen').get(),
        cas=spec("CAS:"),
        mdl=spec("MDL:"),
        mf=formula_trans(strip(spec("分子式:"))),
        mw=spec("分子量:"),
        smiles=spec("SMILES code:"),
        purity=spec("化学纯度:"),
        # Only resolve against the page URL when an image is actually present.
        img_url=urljoin(response.url, image_rel) if image_rel else image_rel,
        prd_url=response.url,
    )
    for pkg_row in response.xpath('//div[@class="table-1"]//tbody/tr'):
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=pkg_row.xpath('./td[1]/text()').get(),
            price=strip(pkg_row.xpath('./td[2]/text()').get()),
            stock_num=pkg_row.xpath('./td[5]/text()').get(),
            currency='RMB',
        )
def detail_parse(self, response):
    """Extract a single RawData record from an hic product detail page."""
    cell_xpath = '//td[contains(text(), {!r})]/following-sibling::td/text()'

    def cell(label):
        # Text of the table cell following the one whose text contains ``label``.
        return response.xpath(cell_xpath.format(label)).get()

    yield RawData(
        brand="hic",
        parent=response.meta.get("parent"),
        cat_no=response.xpath('//h1[@class="header-post-title-class"]/text()').get(),
        cas=cell("CAS:"),
        en_name=cell("Chemical name:"),
        info1=cell("Synonyms:"),
        img_url=response.xpath('//div[@class="images"]//img/@src').get(),
        mf=formula_trans(cell("Molecular form:")),
        mw=cell("Mol. Weight:"),
        prd_url=response.url,
        stock_info=response.xpath('//div[@class="InventoryStatus"]/strong/text()').get(),
    )
def parse_detail(self, response):
    """Parse a bachem product detail page into a RawData item."""
    label_xpath = '//span[text()={!r}]/following-sibling::span/text()'
    name_xpath = '//h2/div[@class="std"]//text()[not(parent::span) and not(parent::a/parent::span)]'

    def labelled(key):
        # Sibling-span value for the label span whose text equals ``key``.
        return response.xpath(label_xpath.format(key)).get()

    sequence = strip(''.join(
        response.xpath('//div[@class="std"]/span[@class]//text()').getall()))
    # Heading text outside the label spans; empty string collapses to None.
    en_name = strip(''.join(response.xpath(name_xpath).getall())) or None
    synonyms = strip(labelled("Synonyms"))
    yield RawData(
        brand='bachem',
        parent=strip(response.xpath(
            '//li[@class="product"]/preceding-sibling::li[1]/a/text()').get()),
        cat_no=strip(response.xpath('//div[@id="productname"]/text()').get()),
        # Fall back to the peptide sequence when no plain English name exists.
        en_name=en_name if en_name else sequence,
        cas=labelled("CAS Registry Number"),
        mf=formula_trans(labelled("Molecular Formula")),
        mw=labelled("Relative Molecular Mass"),
        info1=';'.join(v for v in (synonyms, sequence) if v),
        info2=labelled("Storage Conditions"),
        info3=response.xpath('//td[@class="masWeight"]/text()').get(),
        info4=strip(response.xpath(
            '//div[contains(@class, "formatted-price")]/text()').get()),
        stock_info=response.xpath(
            '//p[contains(@class, "availability")]/span/text()').get(),
        img_url=response.xpath('//img[@class="zoom-image"]/@src').get(),
        prd_url=response.url,
    )
def detail_parse(self, response):
    """Parse a cil product detail page into a RawData item.

    Fixes over the previous version:
    * ``cas`` no longer renders a literal ``"None"`` when the labeled
      and/or unlabeled CAS number is missing from the page;
    * ``img_url`` stays ``None`` when there is no structure image —
      ``urljoin(base, None)`` returns ``base``, which previously put the
      page URL itself into ``img_url``.
    """
    tmp = '//td[@class="dleft" and contains(./p/text(), "{}")]/following-sibling::td/p/text()'
    cas = response.xpath(tmp.format("Labeled CAS#")).get()
    unlabeled_cas = response.xpath(tmp.format("Unlabeled CAS#")).get()
    r_img_url = response.xpath('//div[@class="image-section"]/p//img/@src').get()
    # Combine the two CAS numbers, omitting whichever is absent.
    if cas and unlabeled_cas:
        cas_value = f"{cas}; Unlabeled Cas:{unlabeled_cas}"
    elif unlabeled_cas:
        cas_value = f"Unlabeled Cas:{unlabeled_cas}"
    else:
        cas_value = cas
    d = {
        "brand": "cil",
        "parent": response.meta.get("parent"),
        "cat_no": response.xpath(tmp.format("Item Number")).get(),
        "cas": cas_value,
        "en_name": response.xpath('//h1[@class="ldescription"]/text()').get(),
        # Guard matches the sibling spiders' ``rel_img and urljoin(...)`` idiom.
        "img_url": r_img_url and urljoin(response.url, r_img_url),
        "mf": formula_trans(response.xpath(tmp.format("Chemical Formula")).get()),
        "mw": response.xpath(tmp.format("Molecular Weight")).get(),
        "prd_url": response.url,
    }
    yield RawData(**d)
def parse_detail(self, response):
    """Parse a vivan product detail page into a RawData item."""
    heading_xpath = '//h4[contains(text(), {!r})]/following-sibling::p/text()'

    def after_heading(label):
        # Paragraph text directly after the <h4> heading containing ``label``.
        return response.xpath(heading_xpath.format(label)).get()

    image_rel = response.xpath(
        '//div[contains(@class, "product-detail-image")]/figure/img/@src').get()
    synonyms = [s.strip() for s in response.xpath(
        '//ul[contains(@class, "synmlist")]/li/text()').getall() if s.strip()]
    yield RawData(
        brand='vivan',
        cat_no=after_heading('Catalogue No.:'),
        en_name=response.xpath('//div[@class="product-detail"]//h2/text()').get(),
        cas=after_heading('CAS No. :'),
        mf=formula_trans(after_heading('Mol. Formula :')),
        mw=after_heading('Mol. Weight :'),
        # Only resolve against the site base when an image is actually present.
        img_url=urljoin(self.base_url, image_rel) if image_rel else image_rel,
        info1=';'.join(synonyms),
        prd_url=response.url,
    )
def detail_parse(self, response):
    """Parse a simson product detail page into a RawData item.

    Fixes over the previous version:
    * the ``img_url`` guard was inverted (``or`` instead of ``and``): a
      present relative URL was emitted without being joined to the base,
      and a missing one fell through to ``urljoin(base, None)``, which
      returns the bare base URL;
    * ``logger.warn`` is a deprecated alias — use ``logger.warning``.
    """
    img_url = response.xpath('//div[@class="product-img"]//img/@src').get()
    d = {
        "brand": "simson",
        "en_name": response.xpath('//h1[contains(@class, "pro-title")]/text()').get(),
        "prd_url": response.url,
        "info1": self.extract_value(response, "Chemical Name"),
        "cat_no": self.extract_value(response, "Cat. No."),
        "cas": self.extract_value(response, "CAS. No."),
        "mf": formula_trans(self.extract_value(response, "Molecular Formula")),
        "mw": self.extract_value(response, "Formula Weight"),
        # Join only when the page actually provides a (relative) image URL.
        "img_url": img_url and urljoin(self.base_url, img_url),
        "info4": self.extract_value(response, "Category"),
        "stock_info": self.extract_value(response, "Product Stock Status"),
    }
    # TODO should have a middleware to warn this
    if d.get('en_name') is None or d.get('cat_no') is None:
        self.logger.warning(f'Get data loss from {response.url!r}')
    yield RawData(**d)
def detail_parse(self, response):
    """Parse an anant product detail page into a RawData item."""
    headings = response.xpath('//div[contains(@class,"heading")]')
    deep_value = './h6[contains(text(),"{0}")]/parent::*/following-sibling::*/h5/descendant-or-self::*/text()'
    flat_value = './h6[contains(text(),"{0}")]/parent::*/following-sibling::*/h5/text()'

    def heading_value(label):
        # Stripped <h5> text paired with the <h6> heading containing ``label``.
        return headings.xpath(flat_value.format(label)).get("").strip()

    # Formula text may be split across nested nodes; join all descendants.
    raw_mf = ''.join(
        headings.xpath(deep_value.format("Molecular Formula")).getall()).strip()
    yield RawData(
        brand="anant",
        en_name=response.xpath(
            'normalize-space(//div[contains(@class,"prod-details")]//h1/text())'
        ).get(),
        prd_url=response.request.url,  # product detail link
        cat_no=response.xpath('//h5[@class="prod-cat"]/text()').get("").strip(),
        cas=heading_value("CAS"),
        stock_info=heading_value("Stock Status"),
        mf=formula_trans(raw_mf),
        mw=heading_value("Molecular Weight"),
        info1=response.xpath(
            '//b[contains(text(),"Synonyms : ")]/following-sibling::text()'
        ).get("").strip(),
        parent=response.meta.get('parent'),
        img_url=response.xpath(
            '//div[contains(@class,"entry-thumb")]/a/img/@src').get(),
    )
def detail_parse(self, response):
    """Parse a clearsynth product detail page into a RawData item."""
    cell_xpath = 'normalize-space(//td[contains(text(),{!r})]/following-sibling::td//text())'
    strong_xpath = '//strong[contains(text(), {!r})]/../following-sibling::td/text()'

    def by_cell(label):
        # Normalized text of the cell after the <td> whose text contains ``label``.
        return response.xpath(cell_xpath.format(label)).get()

    def by_strong(label):
        # Stripped sibling-cell text for the <strong> label ``label``.
        return strip(response.xpath(strong_xpath.format(label)).get())

    parent = by_cell("Parent API")
    category = by_cell("Category")
    img_rel_url = response.xpath('//div[@class="product-media"]//img/@src').get()
    # Formula text can be split across several nodes; join before stripping.
    raw_mf = strip("".join(response.xpath(
        "//td[contains(text(),'Mol. Formula')]/following-sibling::td//text()").getall()))
    yield RawData(
        brand="clearsynth",
        # Fall back to the category when no parent API is listed.
        parent=parent if parent else category,
        cat_no=by_cell("CAT No."),
        en_name=''.join(response.xpath('//div[@class="product-name"]//text()').getall()),
        cas=by_cell("CAS"),
        mf=formula_trans(raw_mf),
        mw=by_cell("Mol. Weight"),
        img_url=urljoin(response.request.url, img_rel_url) if img_rel_url else img_rel_url,
        info1=by_strong('Synonyms'),
        info2=by_strong("Storage Conditions"),
        smiles=by_strong("Smiles"),
        prd_url=response.request.url,
        stock_info=by_strong("Stock Status"),
    )
def parse_detail(self, response):
    """Parse a product page into a RawData item and its ProductPackage rows."""
    spec_xpath = '//th[contains(text(), {!r})]/following-sibling::td//p//text()'
    first_cell = '//tr[td and td[@class="pro_price_3"]/span[not(@class)]]/td[@class="pro_price_1"]'

    def spec(label):
        # Stripped <td> paragraph text after the header containing ``label``.
        return strip(response.xpath(spec_xpath.format(label)).get())

    image_rel = response.xpath(
        '//div[@class="struct-img-wrapper"]/img/@src').get()
    # The catalogue number carries an English or Chinese prefix; drop both.
    cat_no = response.xpath('//dt/span/text()').get('').replace(
        'Cat. No.: ', '').replace('目录号: ', '')
    first_package = strip(
        response.xpath(f'normalize-space({first_cell}/text())').get())
    yield RawData(
        brand=self.brand,
        parent=response.meta.get('parent'),
        cat_no=cat_no,
        en_name=response.xpath('//h1/strong/text()').get(),
        cas=spec("CAS No."),
        mf=formula_trans(spec("Formula")),
        mw=spec("Molecular Weight"),
        smiles=strip(''.join(response.xpath(spec_xpath.format("SMILES")).getall())),
        # Non-breaking spaces in package text are normalized to plain spaces.
        info3=first_package.replace('\xa0', ' ') if first_package else first_package,
        info4=strip(response.xpath(
            f'{first_cell}/following-sibling::td[1]/text()').get()),
        img_url=urljoin(response.url, image_rel) if image_rel else image_rel,
        prd_url=response.url,
    )
    if not cat_no:
        # Package rows are keyed on cat_no; without one they are useless.
        return
    price_rows = response.xpath(
        '//tr[td and td[@class="pro_price_3"]/span[not(@class)]]')
    for price_row in price_rows:
        raw_price = strip(price_row.xpath('./td[@class="pro_price_2"]/text()').get())
        pkg = strip(price_row.xpath(
            'normalize-space(./td[@class="pro_price_1"]/text())').get())
        yield ProductPackage(
            brand=self.brand,
            cat_no=cat_no,
            package=pkg.replace('\xa0', ' ') if pkg else pkg,
            price=raw_price.strip('¥') if raw_price else raw_price,
            delivery_time=strip(''.join(price_row.xpath(
                './td[@class="pro_price_3"]/span//text()').getall())) or None,
            currency='RMB',
        )