def parse_more(item=None, response=None):
    """Enrich *item* with family_sn, breadcrumb catlog and datasheet URL.

    @param item      partially-filled goods dict (must already carry goods_name)
    @param response  HTTP response for the product page
    @return the updated item, or -404 when either argument is missing
    """
    if not (item and response):
        return -404

    root = lxml.html.fromstring(response.text.encode('utf-8'))
    data = {}

    # family_sn: prefer the token captured from the URL, fall back to the name.
    match = family_sn_pattern.search(response.url)
    data['family_sn'] = match.group(1) if match else item['goods_name']

    # catlog: walk the breadcrumb anchors; one bad entry discards the trail.
    crumbs = []
    trail_complete = True
    for anchor in root.xpath('//p[@class="breadcrumb"]/a'):
        name = util.cleartext(anchor.text_content())
        href = util.urljoin(response.url, anchor.xpath('./@href')[0])
        if not (name and href):
            crumbs = []
            trail_complete = False
            break
        crumbs.append([name, href])
    if trail_complete:
        # Loop ran to completion (or breadcrumb was empty): append the
        # product itself as the final crumb.
        crumbs.append([data['family_sn'], response.url])
    data['catlog'] = crumbs

    # doc: build the datasheet URL from the pdf link's title attribute.
    titles = root.xpath('//li[@class="pdf"]/a[@class="doclink"]/@title')
    if titles:
        data['doc'] = "http://cds.linear.com/docs/en/datasheet/{title}".format(
            title=titles[0])
    else:
        data['doc'] = ''

    item.update(data)
    return item
def parse_model_detail(self, response):
    """Parse a TI product detail page and yield one GoodsItem per offer.

    Extracts the embedded ld+json product blob, the datasheet link, the
    parametric attribute table and the breadcrumb categories, then yields
    one populated GoodsItem for each orderable part listed in 'offers'.

    @param response Scrapy response for the product detail page
    @raises DropItem when the ld+json script block cannot be located
    """
    # NOTE(review): response.body is bytes under Python 3 / modern Scrapy,
    # which would TypeError against this str pattern — confirm runtime
    # version, or switch to response.text.
    json_html = re.findall(
        r'<script type="application/ld\+json">(.*?)</script>',
        response.body, re.S)
    if not json_html:
        raise DropItem('匹配源码内容异常 请检查:{0}'.format(response.url))
    json_data = json.loads(json_html[0])
    product_list = json_data['offers']
    pre_url = 'https://www.ti.com.cn/product/cn/{}'.format(
        json_data['mpn'])
    description = json_data['description']
    # Datasheet link is not in the json blob; read it from the DOM.
    doc_url = urljoin(
        self.base_url,
        response.xpath(
            '//div/a[@data-navtitle="data sheet"]/@href').extract_first())
    attrs_items = response.xpath(
        '//ti-multicolumn-list/ti-multicolumn-list-row')
    attr_list = []
    # Collect (key, value) attribute pairs; rows missing either cell are skipped.
    for attrs_item in attrs_items:
        attr = attrs_item.xpath(
            './ti-multicolumn-list-cell/span/text()').extract()
        if not attr:
            continue
        key = util.cleartext(attr[0])
        val = util.cleartext(attr[1])
        if key and val:
            attr_list.append((key, val))
    # Collect breadcrumb categories, skipping the leading "home" crumb.
    cat_list = []
    cat_items = response.xpath(
        '//ti-breadcrumb/ti-breadcrumb-section/a')[1:]
    for cat_item in cat_items:
        ckey = util.cleartext(cat_item.xpath('./text()').extract_first())
        cval = urljoin(self.base_url,
                       cat_item.xpath('./@href').extract_first())
        cat_list.append((ckey, cval))
    for data in product_list:
        item = GoodsItem()
        data = data['itemOffered']
        item['url'] = pre_url
        item['goods_sn'] = data['sku']
        item['goods_other_name'] = item['goods_name'] = data['mpn']
        item['provider_name'] = data['brand']
        item['provider_url'] = ''
        item['goods_desc'] = description
        item['goods_img'] = item['goods_thumb'] = ''
        item['doc'] = doc_url
        item['rohs'] = 0
        shop_price = data['offers'].get('price')
        item['tiered'] = []
        if not shop_price:
            # No price published: unknown stock, unit increment.
            item['stock'] = [0, 1]
            item['increment'] = 1
        else:
            # Stock: missing inventoryLevel is treated as 0 on hand.
            if not data['offers'].get('inventoryLevel'):
                item['stock'] = [0, 1]
            else:
                item['stock'] = [
                    util.intval(data['offers']['inventoryLevel']), 1
                ]
            for price_item in data['offers']['priceSpecification']:
                pnum = price_item['eligibleQuantity']['minValue']
                pval = price_item['price']
                item['tiered'].append(
                    (util.intval(pnum), util.floatval(pval)))
            # BUGFIX: guard before indexing — the original read
            # item['tiered'][0][0] before checking for emptiness, so an
            # empty priceSpecification raised IndexError.
            if item['tiered']:
                item['increment'] = item['tiered'][0][0]
            else:
                item['tiered'] = [[0, 0.00]]
                item['increment'] = 1
        # Attributes and categories are page-level, shared by every offer.
        item['attr'] = attr_list
        item['catlog'] = cat_list
        yield item
def _parse_detail_data(resp, headers=None, **kwargs):
    """Parse a LINEAR product detail page into an item dict.

    @param resp    HTTP response for the detail page
    @param headers optional request headers (unused here)
    @param kwargs  extra options passed through
    @return the item dict (routed through handle_of_redirects, which fills
            the fields only available on linear.com.cn), or -404 when the
            page yields neither goods_name nor goods_sn
    """
    item = {}
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.cleartext(
        goods_name[0].text) if goods_name else ''
    # goods_sn comes from the URL, not the page body.
    match = goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''
    if not item['goods_name'] or not item['goods_sn']:
        logger.debug("无法解析goods_name和goods_sn URL:{url}".format(url=resp.url))
        return -404
    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['desc'] = util.cleartext(goods_desc[0], '\n', '\t') if goods_desc else ''
    # tiered: table cells alternate qty / price; any bad pair voids the table.
    tiered = []
    price_list = root.xpath('//td[@class="texttable"]')
    for x in range(0, len(price_list), 2):
        qty = util.intval(price_list[x].text_content())
        price = util.floatval(price_list[x + 1].text_content())
        if qty and price:
            tiered.append([qty, price])
        else:
            tiered = [[0, 0.00]]
            break
    if not tiered:
        # Fall back to the single displayed price.
        price = root.xpath('//td[@class="txt18b-red"]/text()')
        price = util.floatval(price[0]) if price else 0
        if price:
            # BUGFIX: keep the [[qty, price], ...] shape — the original
            # assigned the flat pair [1, price], breaking every consumer
            # that iterates tiered as a list of (qty, price) rows.
            tiered = [[1, price]]
        else:
            tiered = []
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock: on-hand quantity and minimum order quantity from hidden inputs.
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]
    # url
    item['url'] = resp.url
    # provider
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # doc / catlog are resolved later on linear.com.cn.
    item['doc'] = ''
    item['catlog'] = ''
    # attr
    item['attr'] = []
    # rohs unknown at this point
    item['rohs'] = -1
    item['goods_other_name'] = ''
    # increment
    item['increment'] = 1
    # img
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    # Some information must be fetched from linear.com.cn.
    return handle_of_redirects(item)
def _parse_detail_data(resp, headers=None, **kwargs):
    """Parse a TI series/model detail page into {'list': [item, ...]}.

    @param resp    HTTP response for the detail page
    @param headers optional request headers (unused here)
    @param kwargs  extra options, passed through to get_detail()
    @return {'list': [...]} with one item per sellable part, or -403 when
            the GPN cannot be derived from the URL or no parts are found
    """
    items = {'list': []}
    item = {}
    # gpn: generic part number taken from the URL path segment.
    pattern_gpn = re.compile(r'/product/([^/\?\.%&]+)')
    gpn = pattern_gpn.search(resp.url)
    if not gpn:
        logger.debug('status: -403; 解析商品详情失败, url: %s', str(resp.url))
        return -403
    gpn = gpn.group(1)
    soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    # family_sn
    item['family_sn'] = gpn.upper()
    item['product_id'] = item['family_sn']
    # category breadcrumb, minus the "TI Home" crumb.
    breadcrumb_div = soup.find('div', class_='breadcrumb')
    cat_log = []
    if breadcrumb_div:
        for a in breadcrumb_div.find_all('a'):
            if 'TI Home' in a.get_text(strip=True):
                continue
            cat_log.append([a.get_text(strip=True), a['href']])
    item['catlog'] = cat_log if cat_log else []
    # goods_img, goods_thumb
    img_div = soup.find('div', class_='image')
    img = img_div.img['src'] if img_div else ''
    item['goods_img'] = img
    item['goods_thumb'] = img
    # attr: names and values live in two parallel tables; last cell of
    # each is a trailing filler and is dropped.
    attr = []
    params_table = soup.find('table', id='paramsName')
    data_table = soup.find('table', id='parametricdata')
    if params_table and data_table:
        attr_params = params_table.find_all('td')[0:-1]
        attr_data = data_table.find_all('td', class_='on')[0:-1]
        # IMPROVED: compile once outside the loop, and use a raw string so
        # '\s' is a regex class, not an invalid string escape (the original
        # recompiled '\s+' on every iteration).
        pattern_blank = re.compile(r'\s+')
        for k, v in zip(attr_params, attr_data):
            k = pattern_blank.sub(' ', k.get_text(strip=True))
            v = pattern_blank.sub(' ', v.get_text(strip=True))
            attr.append([k, v])
    item['attr'] = attr
    # doc
    doc_url = soup.find('a', class_='local')
    item['doc'] = util.cleartext(doc_url.get('href')) if doc_url else ''
    # description
    desc = soup.find('h1', class_='productTitle')
    item['desc'] = desc.get_text(strip=True) if desc else ''
    for p in get_detail(gpn, **kwargs):
        item['goods_sn'] = p.get('goods_sn', '')
        if not item['goods_sn']:
            continue
        # NOTE(review): goods_name is deliberately set from the sn here —
        # confirm this matches downstream expectations.
        item['goods_name'] = p.get('goods_sn', '')
        item['goods_other_name'] = ''
        item['url'] = p.get('url', '')
        # item['doc'] = get_data_sheet(gpn, **kwargs)
        item['stock'] = p.get('stock', [0, 1])
        item['tiered'] = p.get('tiered', [[0, 0.0]])
        # provider/brand info
        item['provider_name'] = p.get('provider_name', '')
        item['provider_url'] = ''
        item['increment'] = 1
        item['rohs'] = -1
        # Shallow copy: per-part fields were overwritten above, page-level
        # fields (attr, catlog, ...) are shared by reference.
        items['list'].append(copy.copy(item))
    if not items['list']:
        logger.debug('status: -403; 解析商品详情失败, url: %s', str(resp.url))
        return -403
    return items