def fetch_data(url, proxy=None, headers=None, **kwargs):
    """Fetch page data.

    @param proxy   proxy IPs as [proxy_count, proxy_list]
    @param headers extra request headers, e.g. user_agent
    @param kwargs  extra options, e.g. fetch_update marks an update fetch
    @return a negative status code on failure, a dict of parsed data on success
    """
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # The caller retries on this status, so the exception can be ignored here.
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        return -400
    # Force UTF-8 decoding
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    return _parse_detail_data(resp, headers=_headers, **kwargs)

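# A minimal usage sketch, assuming illustrative values only: PROXY_LIST and the simple
# retry policy below are hypothetical and not part of the crawler itself.
def _example_fetch_with_proxies():
    PROXY_LIST = ['10.0.0.1:8080', '10.0.0.2:8080']      # hypothetical proxies
    proxy = [len(PROXY_LIST), PROXY_LIST]                # [proxy_count, proxy_list]
    result = fetch_data('http://www.example.com/product/1',
                        proxy=proxy,
                        headers={'user_agent': 'Mozilla/5.0'})
    if isinstance(result, int):
        # Negative codes (and 404) are crawler statuses, not parsed data: retry or skip.
        return None
    return result                                        # dict of parsed fields
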
def fetch_update_data(self, data_list=[], proxy=None, **kwargs):
    '''Fetch update data.

    Results are appended to data_list with one of the following status codes:
        0     empty (ignore)
       -401   program error (retry; syntax error or a module removed unexpectedly, check the code)
       -402   data error (retry; verify how the data was fetched)
       -400   proxy error (retry; can be ignored)
       -200   non-200 HTTP status, proxy or data problem (retry; watch that this does not loop forever)
        200   success (crawler status, not the HTTP status code)
        404   product does not exist / has been removed
    '''
    # Pick the supplier crawler module based on the update URL.
    update_url = kwargs.get('update_url', '')
    if not update_url:
        return
    if '360' in update_url:
        return
    supplier_name = update_url.split('.')[1]
    if not supplier_name:
        return None
    headers = {
        'user-agent': random.choice(config.USER_AGENT_LIST),
    }
    try:
        if not hasattr(supplier, supplier_name):
            module_name = 'supplier.{0}'.format(supplier_name)
            if module_name not in sys.modules:
                __import__(module_name)
            obj = sys.modules[module_name]
        else:
            obj = getattr(supplier, supplier_name)
        if 'fetch_update_data' in dir(obj):
            _fetch_update_data = getattr(obj, 'fetch_update_data')
        else:
            kwargs['status'] = -401
            data_list.append(kwargs)
            return None
    except Exception as e:
        config.LOG.exception('STATUS: -401, ID: {0} import error, will retry: {1}'.format(kwargs['id'], e))
        kwargs['status'] = -401
        data_list.append(kwargs)
        return None
    try:
        kwargs['headers'] = headers
        kwargs['proxy'] = proxy
        data_list.append(_fetch_update_data(**kwargs))
    except Exception as e:
        kwargs['status'] = -402
        if 'headers' in kwargs:
            del kwargs['headers']
        if 'proxy' in kwargs:
            del kwargs['proxy']
        data_list.append(kwargs)
        config.LOG.exception('STATUS: -402, ID: %(id)s ERROR: %(e)s',
                             {'id': util.u2b(kwargs['id']), 'e': util.traceback_info(e)})

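# A minimal sketch of the dynamic-import pattern used above. The module name derived
# below ('supplier.examplesite') and the URL are hypothetical; in the real crawler the
# module lives under the supplier package and exposes fetch_update_data(**kwargs).
def _example_resolve_supplier(update_url):
    import sys
    # 'www.examplesite.com/item/1' -> 'examplesite' -> supplier.examplesite
    supplier_name = update_url.split('.')[1]
    module_name = 'supplier.{0}'.format(supplier_name)
    if module_name not in sys.modules:
        __import__(module_name)              # an ImportError maps to status -401 above
    module = sys.modules[module_name]
    return getattr(module, 'fetch_update_data', None)
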
def fetch_data(url, proxy=None, headers=None, **kwargs):
    """Fetch page data.

    @param url     page URL
    @param proxy   proxy IPs as [proxy_count, proxy_list]
    @param headers extra request headers, e.g. user_agent
    @param kwargs  extra options, e.g. fetch_update marks an update fetch
    @return a negative status code on failure, a dict of parsed data on success
    """
    if 'goods_sn' in kwargs:
        del kwargs['goods_sn']
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    if url[0:2] == '//':
        url = 'http:' + url
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        ti_domain = urlparse.urlsplit(url)[1]
        if 'www.ti.com.cn' == ti_domain:
            # Map Chinese-site product paths onto the international product URL.
            product_path_pattern = re.compile(r'/cn/(.*)', re.IGNORECASE)
            product_path = product_path_pattern.search(url)
            if product_path:
                url = "http://www.ti.com/product/{path}".format(path=product_path.group(1))
        elif 'store.ti.com' in ti_domain:
            kwargs['proxies'] = proxies
            return _parse_store_ti_com(url, **kwargs)
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        # The caller retries on this status, so the exception can be ignored here.
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        return -400
    # TODO: decide whether a 500 check is needed here.
    # Force UTF-8 decoding
    resp.encoding = 'utf-8'
    if '404.html' in resp.url:
        return 404
    if '/tool/' in resp.url:
        return _parse_tool_detail(resp, **kwargs)
    kwargs['proxies'] = proxies
    return _parse_detail_data(resp, headers=_headers, **kwargs)

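# Standalone sketch of the URL rewrite above: Chinese-site product pages
# (www.ti.com.cn/cn/<path>) are mapped to the international product URL. It reuses
# the regex from fetch_data; the sample URL in the comment is made up.
def _example_rewrite_ti_url(url):
    import re
    match = re.compile(r'/cn/(.*)', re.IGNORECASE).search(url)
    return 'http://www.ti.com/product/{path}'.format(path=match.group(1)) if match else url

# _example_rewrite_ti_url('http://www.ti.com.cn/cn/lm358') -> 'http://www.ti.com/product/lm358'
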
def load_js(self):
    '''Load the JavaScript file.

    :return: the file content as a string, or '' on failure
    '''
    file_path = util.get_static_file(self.js_file)
    try:
        with open(file_path, 'r', encoding='utf-8') as fp:
            js_str = fp.read()
    except Exception as e:
        _logger.info('INFO: failed to load js file {0}'.format(util.traceback_info(e)))
        js_str = ''
    return js_str

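# A minimal usage sketch: load_js() returns the file content or '' on failure, so the
# caller can fall back gracefully. SomeFetcher and the js_file path are hypothetical.
def _example_load_js():
    fetcher = SomeFetcher(js_file='static/parse_price.js')   # hypothetical class/path
    js_str = fetcher.load_js()
    return js_str or '/* no js available */'
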
def run(args):
    if not isinstance(args, argparse.Namespace):
        print('invalid arguments')
        return
    interval = args.interval
    while 1:
        try:
            PutQueue(**args.__dict__)
            if args.interval <= 0:
                break
            print('------------- sleep %s sec -------------' % interval)
            time.sleep(interval)
        except Exception as e:
            if 'params_error' in str(e):
                break
            print(util.traceback_info(e, return_all=True))

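# Sketch of a command-line entry point that produces the argparse.Namespace run()
# expects. Only --interval is required by run() itself; anything else would be whatever
# PutQueue(**args.__dict__) accepts, so no further options are invented here.
def _example_main():
    import argparse
    parser = argparse.ArgumentParser(description='push crawl tasks onto the queue')
    parser.add_argument('--interval', type=int, default=0,
                        help='seconds to sleep between rounds; <=0 runs a single round')
    run(parser.parse_args())

# if __name__ == '__main__':
#     _example_main()
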
def fetch_data(url, proxy=None, headers=None, **kwargs):
    '''Fetch page data.

    @param proxy   proxy IPs as [proxy_count, proxy_list]
    @param headers extra request headers, e.g. user_agent
    @param kwargs  extra options, e.g. fetch_update marks an update fetch
    @return a negative status code on failure, a dict of parsed data on success
    '''
    # Assigning to default_headers inside the function would shadow the module-level
    # name (UnboundLocalError when headers is not a dict), so pick the headers locally.
    _headers = headers if isinstance(headers, dict) else default_headers
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        sess = requests.Session()
        rs = sess.get(url, headers=_headers, cookies=_cookies, timeout=30, proxies=proxies)
    except Exception as e:
        # The caller retries on this status, so the exception can be ignored here.
        _logger.info('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        return -400
    if rs.status_code != 200:
        if rs.status_code == 500:
            _logger.debug('STATUS:-500 ; INFO:request blocked ; PROXY:%s ; URL:%s ; User-Agent:%s' % (
                proxies['http'] if proxy else '', url, _headers.get('user_agent', '')))
            return -500
        # Discontinued product (URL no longer exists)
        elif rs.status_code == 404:
            _logger.debug('STATUS:404 ; INFO:request error ; URL:%s' % url)
            return 404
        _logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' % (
            rs.status_code, proxies['http'] if proxy else '', url))
        return -405
    # Force UTF-8 decoding
    rs.encoding = 'utf-8'
    return _parse_detail_data(rs.text, url=url, **kwargs)

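# A minimal retry sketch around the status codes returned above (-400 / -500 / -405
# retry, 404 stop, dict success). The retry count and sleep are illustrative and not
# taken from the crawler's real scheduler.
def _example_fetch_with_retry(url, proxy=None, retries=3):
    import time
    for _ in range(retries):
        result = fetch_data(url, proxy=proxy)
        if isinstance(result, dict):
            return result                     # parsed detail data
        if result == 404:
            return None                       # product gone, no point retrying
        time.sleep(1)                         # -400 / -500 / -405: retry after a pause
    return None
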
def fetch_data(self):
    '''Fetch page data.'''
    headers = self.headers if self.headers else DEFAULT_HEADER
    try:
        sess = requests.Session()
        print('fetching url: {0}'.format(self.url))
        if self.method == 'GET':
            rs = sess.get(self.url, headers=headers, cookies=None, timeout=30, proxies=None)
        elif self.method == 'POST':
            rs = sess.post(self.url, data=self.form_data, headers=headers,
                           cookies=None, timeout=30, proxies=None)
        else:
            _logger.info('INFO:request method not defined ; URL: {0}'.format(self.url))
            return -400
    except Exception as e:
        # The caller retries on this status, so the exception can be ignored here.
        _logger.info('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), self.url))
        return -400
    if rs.status_code != 200:
        if rs.status_code == 404:
            _logger.debug('STATUS:404 ; INFO:request error ; URL:%s' % self.url)
            return 404
    # Let requests guess the encoding instead of forcing UTF-8.
    rs.encoding = rs.apparent_encoding
    return self._parse_detail_data(rs.content)

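# Usage sketch for the class-based fetcher above. PageFetcher and its constructor are
# hypothetical; the method only relies on self.url, self.method, self.form_data and
# self.headers being set on the instance.
def _example_run_fetcher():
    fetcher = PageFetcher(url='http://www.example.com/list', method='POST',
                          form_data={'page': 1}, headers=None)   # hypothetical class
    return fetcher.fetch_data()          # parsed result, or -400 / 404 on failure
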
def fetch_search_data(self, data_list=[], err_list=[], proxy=None, supp=None, **kwargs):
    """Fetch product data for a search keyword (either detail records or URLs to crawl)."""
    if not supp or 'keyword' not in kwargs:
        return None
    headers = {
        'user-agent': random.choice(config.USER_AGENT_LIST),
    }
    keyword = util.u2b(kwargs['keyword'])
    supplier_name = config.DB_KEY[supp]
    try:
        if not hasattr(supplier, supplier_name):
            module_name = 'supplier.{0}'.format(supplier_name)
            if module_name not in sys.modules:
                __import__(module_name)
            obj = sys.modules[module_name]
        else:
            obj = getattr(supplier, supplier_name)
        if hasattr(obj, 'api_search_data'):
            _fetch_function = getattr(obj, 'api_search_data')
        else:
            _fetch_function = getattr(obj, 'fetch_search_data')
    except Exception as e:
        config.LOG.exception('STATUS: -401, Keyword: %(keyword)s', {'keyword': keyword})
        if kwargs.get('count', 1) < self.exception_threshold:
            kwargs['status'] = -401
            kwargs['count'] = kwargs.get('count', 1) + 1
            err_list.append(kwargs)
        return None
    data_dict = {
        'detail': [],
        'list': [],
        'url': []
    }
    if self.optype == 'hot' and self.use:
        kwargs['hot_search'] = True
    # keyword is passed positionally below, so drop it from kwargs to avoid a duplicate argument.
    del kwargs['keyword']
    try:
        _fetch_function(keyword, supp, data_dict, headers, **kwargs)
    except Exception as e:
        config.LOG.exception('STATUS: -402, Keyword: %(keyword)s', {'keyword': keyword})
        if kwargs.get('count', 1) < self.exception_threshold:
            kwargs['status'] = -402
            kwargs['count'] = kwargs.get('count', 1) + 1
            kwargs['keyword'] = keyword
            err_list.append(kwargs)
        return None
    if data_dict['list']:
        try:
            _fetch_function = getattr(obj, 'fetch_search_list')
        except Exception as e:
            _fetch_function = None
            print(util.traceback_info(e, return_all=1))
        if _fetch_function:
            res = self._crawl(_fetch_function, data_dict['list'], headers, proxy)
            if 'url' in res:
                for url in res['url']:
                    data_dict['url'].append(url)
            if 'detail' in res:
                for data in res['detail']:
                    data_dict['detail'].append(data)
    if data_dict['url']:
        try:
            _fetch_function = getattr(obj, 'fetch_data')
        except Exception as e:
            _fetch_function = None
            print(util.traceback_info(e, return_all=1))
        if _fetch_function:
            res = self._crawl(_fetch_function, data_dict['url'], headers, proxy)
            if 'detail' in res:
                for data in res['detail']:
                    data_dict['detail'].append(data)
    for data in data_dict['detail']:
        # Per-record cleaning / normalization could happen here.
        data_list.append(data)
    return data_list

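# Sketch of the data_dict contract shared with the supplier modules: 'url' collects
# detail-page links still to crawl, 'list' collects further search/list pages, and
# 'detail' collects finished records. The keyword and supplier id below are made up.
def _example_search_contract():
    data_dict = {'detail': [], 'list': [], 'url': []}
    headers = {'user-agent': 'Mozilla/5.0'}
    # A supplier module fills data_dict in place and returns a status code, e.g.:
    #   status = supplier_module.fetch_search_data('LM358', 17, data_dict, headers)
    return data_dict
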
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search results."""
    if keyword:
        print('fetching data for keyword: %s from richardsonrfpd' % keyword)
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' % (
            resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse the search page: collect the product rows first.
    if 'Search-Results.aspx' in resp.url:
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match and detail_url:
            logger.debug("could not extract goods_sn from URL {url}".format(url=detail_url))
            return -404
        goods_sn = match.group(1) if match else ''
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    # Work out how many result pages exist and queue the remaining list pages.
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999, smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    page_list = root.xpath('//td[@class="medtext"]/a/@href')
    for x in range(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200

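# Worked example of the paging arithmetic above (10 results per page): with count=37
# results, page_num = ceil(37 / 10.0) = 4, and the netsuite range parameters for page
# x are start = x*10 + 1 and end = (x+1)*10. The values are illustrative only.
def _example_richardson_page_ranges(count=37, max_list_num=5):
    import math
    page_num = int(math.ceil(count / 10.0))
    ranges = []
    for x in range(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        ranges.append((x * 10 + 1, (x + 1) * 10))   # -> [(11, 20), (21, 30), (31, 40), (41, 50)]
    return ranges
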
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search results."""
    if keyword:
        if not kwargs.get('other_usage', False):
            print('fetching data for keyword: %s from ti.com' % keyword)
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&filter=p&sortBy=pstatus&searchTerm=%s' % keyword
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' % (
            resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    resp_json = {}
    try:
        resp_json = json.loads(resp.content)
        product = resp_json.get('response', {}).get('searchResults', {}).get('PartNoArray', [])
    except Exception:
        product = []
        logger.debug('STATUS:-404 ; INFO:invalid data ; URL:%s' % url)
    if len(product) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    links = product
    for vo in links:
        pn = vo.get('PartNumber', '')
        tn = vo.get('PartType', '')
        if pn:
            link = 'http://www.ti.com/product/%s' % pn
            if 'tool' in tn:
                link = 'http://www.ti.com/tool/%s' % pn
            data_dict['url'].append({'id': id, 'url': link, 'goods_sn': pn})
    if 'startNum=' in resp.url:
        return 200
    # Work out how many result pages exist (25 parts per page) and queue the rest.
    page_num = 0
    count = 0
    try:
        count = resp_json.get('response', {}).get('searchResults', {}).get('filter', {}).get('MaxRecordCount', '')
        count = util.intval(count)
    except Exception:
        count = 0
    page_num = int(math.ceil(count / 25.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in range(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&startNum=%d&filter=p&sortBy=pstatus&searchTerm=%s' % (
            25 * x, keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200

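# Sketch of the JSON shape the parser above relies on. The field names are the ones the
# code reads; the sample values are made up.
_EXAMPLE_TI_SEARCH_JSON = {
    'response': {
        'searchResults': {
            'PartNoArray': [
                {'PartNumber': 'LM358', 'PartType': 'product'},
                {'PartNumber': 'BOOSTXL-XYZ', 'PartType': 'tool'},   # routed to /tool/<pn>
            ],
            'filter': {'MaxRecordCount': '53'},   # 53 results -> ceil(53/25.0) = 3 pages
        }
    }
}
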
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search results."""
    if keyword:
        print('fetching data for keyword: %s from avnet' % keyword)
        url = "https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber=1&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag%3ANPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&wt=json".format(
            keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request failed, %s ; URL:%s' % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s' % (
            resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse the JSON search response and work out how many results were returned.
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except Exception:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:invalid data ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in range(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber={next_page}&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag:NPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&showMore=true&wt=json'.format(
            next_page=x, keyword=keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200

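# Sketch of the Avnet search JSON fields consumed above. The keys are the ones the code
# reads; the values are made up. recordSetTotal drives paging at 20 results per page.
_EXAMPLE_AVNET_SEARCH_JSON = {
    'recordSetTotal': 45,                          # ceil(45/20.0) = 3 pages
    'catalogEntryView': [
        {'seo_token_ntk': 'LM358ADR', 'avn_pdp_seo_path': 'p/lm358adr-12345'},
    ],
}
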
def _parse_detail_data(resp, headers=None, **kwargs):
    """Parse product detail-page data (kept as a standalone helper).

    @param resp    page response (resp.url is used when logging failures)
    @param headers request headers
    @param kwargs  extra options
    """
    item = {}
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
        if soup is None:
            logger.debug('failed to initialise detail page URL: %s', resp.url)
            return -404
    except Exception as e:
        logger.debug('failed to initialise detail page URL: %s ERROR: %s', resp.url, util.traceback_info(e))
        return -404
    # goods_sn
    url_path_list = resp.url.split('/')
    goods_sn_pattern = re.compile(r'.*-\d{19}')
    for path in url_path_list[::-1]:
        if goods_sn_pattern.findall(path):
            item['goods_sn'] = path
            break
    if not item.get('goods_sn', False):
        logger.debug("could not parse goods_sn from URL: {url} ".format(url=resp.url))
        return -400
    # goods_name
    goods_info_div = soup.find('div', class_='section-left')
    item['goods_name'] = goods_info_div.find('h1').get_text(strip=True) if goods_info_div else item['goods_sn']
    # url
    item['url'] = resp.url
    # goods_img
    img_div = soup.find('div', id="outer-div1")
    img = img_div.find('img') if img_div else None
    item['goods_img'] = util.urljoin(resp.url, img.get('src')) if img else ''
    # goods_thumb
    item['goods_thumb'] = item['goods_img']
    # desc
    desc_p = soup.find('p', class_='RB-pdp_short_Desc')
    item['desc'] = desc_p.get_text(strip=True) if desc_p else ''
    # provider_name
    item['provider_name'] = "AVNET"
    # provider_url
    item['provider_url'] = ''
    # attr: [[None, None]]
    attr_body = soup.find('div', id="techAttr")
    attr_div = attr_body.find_all('div', class_='pdpDescriptionsBodyContent') if attr_body else None
    attr = []
    if attr_div is not None:
        for content in attr_div:
            att_name = content.find('div', class_='pdpDescriptionColumn')
            attr_value = content.find('div', class_='pdpValueColumn')
            if att_name and attr_value:
                attr.append([
                    att_name.get_text(strip=True),
                    attr_value.get_text(strip=True)
                ])
            else:
                continue
        item['attr'] = attr
    else:
        item['attr'] = attr
    # tiered: [[qty, price]]
    tiered_span = soup.find_all('span', class_='usdpart1')
    tiered = []
    if tiered_span:
        for span in tiered_span:
            qty_span = span.find('span', class_='pdpTierMinQty')
            qty = qty_span.get_text(strip=True) if qty_span else 0
            price_p = span.find('p')
            price = price_p.get_text(strip=True) if price_p else 0.00
            if qty and price:
                tiered.append([util.intval(qty), util.floatval(price)])
            else:
                tiered = [[0, 0.00]]
                break
        item['tiered'] = tiered
    else:
        item['tiered'] = [[0, 0.00]]
    # stock: [0, 1] >> [stock, qty]
    stock_input = soup.find('input', id='inStock')
    stock = stock_input.get('value') if stock_input else 0
    stock = util.intval(stock)
    # minimum order quantity
    min_qty_input = soup.find('input', attrs={'name': 'min'})
    min_qty = min_qty_input.get('value') if min_qty_input else 1
    min_qty = util.intval(min_qty)
    item['stock'] = [stock, min_qty] if stock else ['0', '1']
    # increment: 1
    multi_input = soup.find('input', attrs={'name': 'mult'})
    item['increment'] = util.intval(multi_input.get('value')) if multi_input else 1
    # doc
    doc_div = soup.find('div', class_='pdfcontent')
    if doc_div is not None:
        doc_url = doc_div.find('a', class_='datasheet_align')
        item['doc'] = doc_url.get('href') if doc_url else ''
    else:
        item['doc'] = ''
    # rohs: -1
    rohs_div = soup.find('div', class_='leafcontent')
    item['rohs'] = 1 if rohs_div else -1
    # catlog: [[name, url]]
    nav = soup.find('nav', class_='breadcrumb')
    nav_ul = nav.find('ul', class_='nav') if nav else None
    catlog = []
    if nav is not None:
        lis = nav.find_all('a')
        for a in lis:
            cat_name = a.get_text(strip=True)
            cat_url = util.urljoin(resp.url, a.get('href'))
            if cat_name and cat_url:
                catlog.append([cat_name, cat_url])
            else:
                continue
        item['catlog'] = catlog
    else:
        item['catlog'] = catlog
    # goods_other_name
    item['goods_other_name'] = ''
    # product_id
    # family_sn
    return item

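# Shape of the dict returned by _parse_detail_data, with the keys assigned above and
# illustrative values only. Integer returns (-404 / -400) signal parse failures instead.
_EXAMPLE_DETAIL_ITEM = {
    'goods_sn': 'lm358adr-3074457345626298530',   # made-up 19-digit suffix
    'goods_name': 'LM358ADR',
    'url': 'https://www.avnet.com/shop/apac/products/ti/lm358adr-3074457345626298530',
    'goods_img': '', 'goods_thumb': '', 'desc': '',
    'provider_name': 'AVNET', 'provider_url': '',
    'attr': [['Package', 'SOIC-8']],
    'tiered': [[1, 0.25], [100, 0.18]],           # [quantity, unit price]
    'stock': [1200, 1],                           # [stock, minimum order quantity]
    'increment': 1, 'doc': '', 'rohs': 1,
    'catlog': [['Amplifiers', 'https://www.avnet.com/shop/apac/c/amplifiers']],
    'goods_other_name': '',
}
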