def parse_price(self, resp):
    """Parse stock/price data."""
    items = resp.meta.get('items')
    if not items:
        logger.error('request meta data error, url: %s', resp.url)
        return
    prices = {}
    try:
        data = json.loads(resp.body)
        for entprice in data['EntitledPrice']:
            tiered = []
            for vo in entprice.get('RangePrice', []):
                qty = util.intval(vo['minimumQuantity']['value']) if 'minimumQuantity' in vo else 1
                price = util.floatval(vo['priceInRange']['value']) if 'priceInRange' in vo else 0
                # Skip invalid quantities and breaks that would make the ladder decrease
                if not qty or (tiered and qty < tiered[-1][0]):
                    continue
                tiered.append([qty, price])
            if not tiered:
                tiered.append([0, 0.0])
            prices[entprice['productId']] = tiered
    except Exception:
        logger.exception('parse stock price error, url: %s---price_Json_error---%s',
                         resp.url, resp.body)
    for item in items:
        if item['goods_sn'] in prices:
            item['tiered'] = prices[item['goods_sn']]
        yield item
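# A minimal standalone sketch of the tier-normalization rule used above, with
# plain int/float in place of util.intval/util.floatval (the helper name and
# sample payload below are illustrative, not part of the crawler):
def normalize_tiers(range_prices):
    """Build a non-decreasing [qty, price] ladder, dropping out-of-order breaks."""
    tiered = []
    for vo in range_prices:
        qty = int(vo.get('minimumQuantity', {}).get('value', 1))
        price = float(vo.get('priceInRange', {}).get('value', 0))
        if not qty or (tiered and qty < tiered[-1][0]):
            continue  # keep quantities non-decreasing
        tiered.append([qty, price])
    return tiered or [[0, 0.0]]

# normalize_tiers([{'minimumQuantity': {'value': '10'},
#                   'priceInRange': {'value': '1.5'}}])  ->  [[10, 1.5]]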
def parse_resp(self, resp):
    unquoted_url = urllib.unquote(resp.url)
    search_match = self.product_url_pattern_0.search(unquoted_url)
    detail_match = (self.product_url_pattern_1.search(unquoted_url)
                    or self.product_url_pattern_2.search(unquoted_url))
    if detail_match:
        yield self.parse_detail(resp)
    elif search_match:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
        # Read the total number of search results
        try:
            total = soup.find('h3', class_='results')
            total = util.intval(total.get_text(strip=True)) if total else 0
        except Exception:
            total = 0
        # float() avoids Python 2 integer division truncating the page count
        pages = int(math.ceil(total / float(self.limit_num)))
        if pages <= 1:
            return
        search_id = search_match.group(1)
        for x in range(1, pages + 1):
            page_url = 'http://cn.futureelectronics.com/zh/search.aspx?dsNav=Ro:%d,Aro:%d' % (
                x * 10, x * 10)
            page_url += ',N:{search_id}'.format(search_id=search_id)
            yield Request(url=page_url, headers=self.headers, cookies=self.cookies)
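# A quick check of the page-count math above; under Python 2, 45 / 10 is
# integer division and truncates to 4, silently dropping the last page, so
# the divisor is coerced to float first (helper name is illustrative):
import math

def page_count(total, page_size):
    """Number of result pages for `total` hits at `page_size` per page."""
    return int(math.ceil(total / float(page_size)))

assert page_count(45, 10) == 5
assert page_count(40, 10) == 4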
def parse_detail(self, data, category=None):
    """Parse a series/part-number record."""
    if category is None:
        category = {}
    item = GoodsItem()
    item['url'] = urlparse.urljoin(self.base_url, data['avn_pdp_seo_path'])
    item['goods_sn'] = data['uniqueID']
    item['goods_name'] = data['mfPartNumber_ntk'].upper()
    if not item['goods_name']:
        return None
    # NOTE: the original tested `'packageTypeCode' in item`, which can never be
    # true because the key is never written to item; the source record is the
    # likely intended target, so check `data` and default to ''.
    item['goods_other_name'] = ''
    if 'packageTypeCode' in data:
        item['goods_other_name'] = '{0}/{1}'.format(
            item['goods_name'], data['packageTypeCode']).upper()
    item['provider_name'] = data['manufacturer']
    item['provider_url'] = ''
    item['goods_desc'] = data.get('shortDescription', '')
    if data.get('avn_thumbnail'):
        item['goods_thumb'] = util.urljoin(self.base_url, data['avn_thumbnail'])
    else:
        item['goods_thumb'] = ''
    item['goods_img'] = item['goods_thumb'].replace('icon_thumb', 'icon_web')
    item['doc'] = data['auxDescription2'] if data.get('auxDescription2') else ''
    min_qty = int(data['xcatField1']) if 'xcatField1' in data else 1
    increment = int(data['multQuantity']) if 'multQuantity' in data else 1
    stock_qty = util.intval(data['inv_strlocqty']) if 'inv_strlocqty' in data else 0
    item['rohs'] = 1 if data.get('ROHSComplianceCode') == 'Y' else 0
    item['tiered'] = [[0, 0.0]]
    item['stock'] = [stock_qty, min_qty]  # stock
    item['increment'] = increment
    # Attributes
    item['attr'] = []
    for vo in data.get('attributes', []):
        try:
            item['attr'].append([vo['name'], vo['values'][0]['value']])
        except (KeyError, IndexError):
            pass
    # Categories
    item['catlog'] = []
    catelogs = data['parentCatgroup_id_path'].split('_')[-1].split(':')
    for vo in catelogs:
        if vo not in category:
            continue
        item['catlog'].append((category[vo], vo))
    item['region'] = 'AMERICAS'
    item['id'] = 16
    return item
def parse(self, resp):
    systems_catalog = 0
    try:
        product_dict = json.loads(resp.text.encode('utf-8'))
        systems_catalog = resp.meta.get('systemsCatalog')
        total_match_count = util.intval(product_dict.get('totalMatchCountString'))
        pages = int(math.ceil(total_match_count / float(self.limit_num)))
        for page_num in xrange(1, pages + 1):
            self.form_data['pageNum'] = str(page_num)
            yield Request(url=self.processData_url,
                          method='POST',
                          headers=self.headers,
                          body=json.dumps(self.form_data),
                          meta={'systemsCatalog': systems_catalog},
                          callback=self.parse_detail)
    except Exception:
        logger.exception('Parse error, systemsCatalog: %s', systems_catalog)
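# Reusing the single self.form_data dict across the loop above only works
# because json.dumps() serializes the body at Request construction time, so
# each request carries a snapshot rather than a reference. A small
# illustration (names below are illustrative, not part of the crawler):
import json

form_data = {'pageNum': '1', 'term': 'lm358'}
bodies = []
for page in range(1, 4):
    form_data['pageNum'] = str(page)
    bodies.append(json.dumps(form_data))  # snapshot taken now, not at send time

print [json.loads(b)['pageNum'] for b in bodies]  # ['1', '2', '3']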
def parse_model_detail(self, response):
    """Parse product detail page."""
    json_html = re.findall(
        r'<script type="application/ld\+json">(.*?)</script>',
        response.body, re.S)
    if not json_html:
        raise DropItem('Unexpected page source, please check: {0}'.format(response.url))
    json_data = json.loads(json_html[0])
    product_list = json_data['offers']
    pre_url = 'https://www.ti.com.cn/product/cn/{}'.format(json_data['mpn'])
    description = json_data['description']
    doc_url = urljoin(
        self.base_url,
        response.xpath(
            '//div/a[@data-navtitle="data sheet"]/@href').extract_first())
    # Attribute list
    attr_list = []
    attrs_items = response.xpath('//ti-multicolumn-list/ti-multicolumn-list-row')
    for attrs_item in attrs_items:
        attr = attrs_item.xpath('./ti-multicolumn-list-cell/span/text()').extract()
        if not attr:
            continue
        key = util.cleartext(attr[0])
        val = util.cleartext(attr[1])
        if key and val:
            attr_list.append((key, val))
    # Category breadcrumb (skip the site root)
    cat_list = []
    cat_items = response.xpath('//ti-breadcrumb/ti-breadcrumb-section/a')[1:]
    for cat_item in cat_items:
        ckey = util.cleartext(cat_item.xpath('./text()').extract_first())
        cval = urljoin(self.base_url, cat_item.xpath('./@href').extract_first())
        cat_list.append((ckey, cval))
    for data in product_list:
        item = GoodsItem()
        data = data['itemOffered']
        item['url'] = pre_url
        item['goods_sn'] = data['sku']
        item['goods_other_name'] = item['goods_name'] = data['mpn']
        item['provider_name'] = data['brand']
        item['provider_url'] = ''
        item['goods_desc'] = description
        item['goods_img'] = item['goods_thumb'] = ''
        item['doc'] = doc_url
        item['rohs'] = 0
        item['tiered'] = []
        shop_price = data['offers'].get('price')
        if not shop_price:
            item['stock'] = [0, 1]  # stock
            item['increment'] = 1
        else:
            # Stock level
            if not data['offers'].get('inventoryLevel'):
                item['stock'] = [0, 1]
            else:
                item['stock'] = [util.intval(data['offers']['inventoryLevel']), 1]
            for price_item in data['offers']['priceSpecification']:
                pnum = price_item['eligibleQuantity']['minValue']
                pval = price_item['price']
                item['tiered'].append((util.intval(pnum), util.floatval(pval)))
            # Guard against an empty price ladder before indexing into it; the
            # original indexed item['tiered'][0] unconditionally and would
            # raise IndexError when priceSpecification was empty.
            item['increment'] = item['tiered'][0][0] if item['tiered'] else 1
        if not item['tiered']:
            item['tiered'] = [[0, 0.00]]
        # Attributes
        item['attr'] = attr_list
        # Categories
        item['catlog'] = cat_list
        yield item
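# The JSON-LD extraction above can be exercised standalone; a minimal sketch
# (function name and sample HTML are illustrative, not part of the crawler):
import json
import re

LD_JSON_RE = re.compile(r'<script type="application/ld\+json">(.*?)</script>', re.S)

def extract_ld_json(html):
    """Pull the first JSON-LD block out of raw page source."""
    blocks = LD_JSON_RE.findall(html)
    return json.loads(blocks[0]) if blocks else None

html = '<script type="application/ld+json">{"mpn": "LM358", "offers": []}</script>'
print extract_ld_json(html)['mpn']  # LM358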
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search results."""
    if keyword:
        print 'Fetching richardsonrfpd data for keyword: %s' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request error, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s'
                     % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse the response: collect the product rows
    if 'Search-Results.aspx' in resp.url:
        # NOTE: the original assigned this result to product_list, but it was
        # immediately overwritten by the xpath below; the call is kept for
        # its side effects only.
        analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match:
            # The original only returned here when detail_url was non-empty
            # and would raise AttributeError on match.group(1) otherwise;
            # bail out in both cases.
            if detail_url:
                logger.debug(u"Cannot match goods_sn in URL: {url}".format(url=detail_url))
            return -404
        goods_sn = match.group(1)
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999, smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = ('http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f'
                    '?search={search}&range={start}%2C{end}%2C{total}').format(
                        search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    Parse detail-page data (kept as a standalone helper).
    @param resp     response object (resp.url is used for error logging)
    @param kwargs   extra options
    """
    item = {}
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.cleartext(goods_name[0].text) if goods_name else ''
    # goods_sn
    match = goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''
    if not item['goods_name'] or not item['goods_sn']:
        logger.debug("Cannot parse goods_name / goods_sn URL:{url}".format(url=resp.url))
        return -404
    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['desc'] = util.cleartext(goods_desc[0], '\n', '\t') if goods_desc else ''
    # tiered prices: cells alternate quantity, price
    tiered = []
    price_list = root.xpath('//td[@class="texttable"]')
    for x in range(0, len(price_list), 2):
        qty = util.intval(price_list[x].text_content())
        price = util.floatval(price_list[x + 1].text_content())
        if qty and price:
            tiered.append([qty, price])
        else:
            tiered = [[0, 0.00]]
            break
    if not tiered:
        price = root.xpath('//td[@class="txt18b-red"]/text()')
        price = util.floatval(price[0]) if price else 0
        # The original assigned the flat list [1, price] here; a single tier
        # must itself be wrapped in a list.
        tiered = [[1, price]] if price else []
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]
    # url
    item['url'] = resp.url
    # provider
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # doc / catlog / attr / rohs and remaining defaults
    item['doc'] = ''
    item['catlog'] = ''
    item['attr'] = []
    item['rohs'] = -1
    item['goods_other_name'] = ''
    item['increment'] = 1
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    # Some fields have to be fetched from linear.com.cn
    return handle_of_redirects(item)
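# A minimal sketch of the alternating qty/price cell pairing used above, with
# plain int/float instead of util helpers (helper name and sample data are
# illustrative, not part of the crawler):
def pair_cells(cells):
    """Pair a flat [qty, price, qty, price, ...] cell list into tiers."""
    tiers = []
    for qty_cell, price_cell in zip(cells[0::2], cells[1::2]):
        qty, price = int(qty_cell), float(price_cell)
        if not (qty and price):
            return [[0, 0.00]]  # any bad pair invalidates the ladder
        tiers.append([qty, price])
    return tiers

print pair_cells(['1', '9.95', '10', '8.50'])  # [[1, 9.95], [10, 8.5]]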
def parse_detail(self, resp):
    """Parse a series/part-number detail page."""
    item = GoodsItem()
    try:
        soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
    except Exception:
        logger.debug(u"Failed to build BS4 object, retrying once URL:{url}".format(url=resp.url))
        # Retry once
        return Request(url=resp.url, headers=self.headers, cookies=self.cookies)
    # goods_sn
    product_id = (self.product_id_pattern_1.search(resp.url)
                  or self.product_id_pattern_2.search(resp.url))
    goods_sn = product_id.group(1) if product_id else ''
    item['goods_sn'] = goods_sn
    if not goods_sn:
        logger.debug(u"Failed to get goods_sn URL:{url}".format(url=resp.url))
        return None
    try:
        # goods_name
        product_ref = soup.find('p', class_='ref')
        goods_name = ''
        if product_ref:
            # Matches the "制造商零件编号:" (manufacturer part number) label on the page
            goods_name_pattern = re.compile(ur'<b>制造商零件编号:</b>\s*([^\"\'<>/]+)')
            for x in unicode(product_ref).split('<br/>'):
                match = goods_name_pattern.search(x)
                if match:
                    goods_name = match.group(1)
                    break
        item['goods_name'] = goods_name
        # goods_other_name
        item['goods_other_name'] = ''
    except Exception:
        logger.debug(u"Failed to get goods_name URL:{url}".format(url=resp.url))
        item['goods_name'] = ''
        item['goods_other_name'] = ''
    # goods_desc
    goods_desc = soup.find('p', class_='desc')
    if not goods_desc:
        logger.debug(u"Failed to get goods_desc URL:{url}".format(url=resp.url))
    item['goods_desc'] = goods_desc.get_text(strip=True) if goods_desc else ''
    # provider_name and provider_url; the original called .get() on the find()
    # result before checking it for None
    provider_name = soup.find('img', id='ctl00_PlaceHolderMain_mfrLogo')
    item['provider_name'] = provider_name.get('title', '') if provider_name else ''
    # Fall back to product-desc when the logo image carries no provider_name
    if not item['provider_name']:
        desc_div = soup.find('div', id='product-desc')
        provider_name = desc_div.find('h2') if desc_div else None
        item['provider_name'] = provider_name.get_text(strip=True) if provider_name else ''
    item['provider_url'] = ''
    # url
    item['url'] = resp.url
    # doc
    doc = soup.find('a', id='ctl00_PlaceHolderMain_csDownloadCenter_linkDatasheetUrlJustText')
    item['doc'] = doc.get('href', '') if doc else ''
    # goods_img and goods_thumb
    goods_img = soup.find('img', id='previewedMEDImage')
    item['goods_img'] = goods_img.get('src', '') if goods_img else ''
    goods_thumb = soup.find('img', id='thumbnail-1')
    item['goods_thumb'] = goods_thumb.get('src', '') if goods_thumb else ''
    # catlog
    item['catlog'] = []
    catlog = soup.find('ul', id='breadcrumb-navigation')
    for a in (catlog.find_all('a') if catlog else []):
        breadcrumb_name = a.get_text(strip=True)
        breadcrumb_url = util.urljoin(resp.url, a.get('href', ''))
        item['catlog'].append([breadcrumb_name, breadcrumb_url])
    # attr
    item['attr'] = []
    product_attr_div = soup.find('div', id='product-details-overview-highlights')
    product_attr_list = product_attr_div.find_all('li') if product_attr_div else []
    for li in product_attr_list:
        attr_text = li.get_text(strip=True)
        if ':' not in attr_text:
            continue
        attr_name, attr_value = attr_text.split(':', 1)
        item['attr'].append([attr_name, attr_value])
    # tiered
    try:
        item['tiered'] = []
        price_table = soup.find('table', class_='product-prices')
        price_tr_list = price_table.find_all('tr', class_='price-break')
        for tr in price_tr_list:
            qty_th = tr.find('th')
            qty = util.intval(qty_th.get_text(strip=True)) if qty_th else 0
            price_span = tr.find('span')
            price = util.floatval(price_span.get_text(strip=True)) if price_span else 0.00
            if qty and price:
                item['tiered'].append([qty, price])
            else:
                # Any bad row invalidates the ladder (the original assigned the
                # flat list [0, 0.00] and kept looping)
                item['tiered'] = [[0, 0.00]]
                break
    except Exception:
        logger.debug(u"Failed to get tiered URL:{url}".format(url=resp.url))
        item['tiered'] = [[0, 0.00]]
    # stock, increment, min_qty
    try:
        stock_div = soup.find('div', id='product-qty-content')
        stock_tr = stock_div.find('tr', class_='qtyInStock')
        increment_tr = stock_div.find('tr', class_='multipleOf')
        min_qty_tr = stock_div.find('tr', class_='minOrderQty')
        stock = stock_tr.find('td', class_='qty').get_text(strip=True) if stock_tr else 0
        stock = util.intval(stock)
        increment = increment_tr.find('td', class_='qty').get_text(strip=True) if increment_tr else 1
        increment = util.intval(increment)
        min_qty = min_qty_tr.find('td', class_='qty').get_text(strip=True) if min_qty_tr else 1
        min_qty = util.intval(min_qty)
        item['stock'] = [stock, min_qty]
        item['increment'] = increment
    except Exception:
        logger.debug(u"Failed to get stock URL:{url}".format(url=resp.url))
        item['stock'] = [0, 1]
        item['increment'] = 1
    # rohs
    rohs_div = soup.find('div', id='ctl00_PlaceHolderMain_imgRoHS')
    item['rohs'] = 1 if rohs_div else -1
    return item
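# One caveat with the retry-once pattern at the top of parse_detail: Scrapy's
# default dupefilter drops a re-issued Request for an already-seen URL, so a
# retry usually needs dont_filter=True plus a counter to avoid looping. A
# hedged sketch (helper name and the 'retry_times' meta key are illustrative):
from scrapy import Request

def retry_once(resp, headers=None, cookies=None):
    """Re-issue the same URL once; dont_filter=True bypasses the dupefilter."""
    meta = dict(resp.meta)
    retries = meta.get('retry_times', 0)
    if retries >= 1:
        return None  # give up after one retry
    meta['retry_times'] = retries + 1
    return Request(url=resp.url, headers=headers, cookies=cookies,
                   meta=meta, dont_filter=True)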
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search results."""
    if keyword:
        if not kwargs.get('other_usage', False):
            print 'Fetching ti.com data for keyword: %s' % keyword
        url = ('http://www.ti.com/sitesearch/docs/partnumsearch.tsp'
               '?sort=asc&linkId=2&filter=p&sortBy=pstatus&searchTerm=%s') % keyword
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request error, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s'
                     % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    resp_json = {}
    try:
        resp_json = json.loads(resp.content)
        product = resp_json.get('response', {}).get('searchResults', {}).get('PartNoArray', [])
    except Exception:
        product = []
        logger.debug('STATUS:-404 ; INFO:bad response data ; URL:%s' % url)
    if len(product) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for vo in product:
        pn = vo.get('PartNumber', '')
        tn = vo.get('PartType', '')
        if pn:
            link = 'http://www.ti.com/product/%s' % pn
            if 'tool' in tn:
                link = 'http://www.ti.com/tool/%s' % pn
            data_dict['url'].append({'id': id, 'url': link, 'goods_sn': pn})
    if 'startNum=' in resp.url:
        return 200
    try:
        count = resp_json.get('response', {}).get('searchResults', {}).get(
            'filter', {}).get('MaxRecordCount', '')
        count = util.intval(count)
    except Exception:
        count = 0
    page_num = int(math.ceil(count / 25.0))
    if page_num <= 1:
        return 200
    # Paging URLs
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = ('http://www.ti.com/sitesearch/docs/partnumsearch.tsp'
                    '?sort=asc&linkId=2&startNum=%d&filter=p&sortBy=pstatus&searchTerm=%s'
                    ) % (25 * x, keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
def get_detail(gpn=None, **kwargs):
    data = dict()
    if not gpn:
        yield data
        return
    url = "http://www.ti.com/product/%s/samplebuy" % gpn
    try:
        proxies = kwargs.get('proxies')
        html = requests.get(url=url, headers=default_headers, timeout=30, proxies=proxies)
        if 'Page not found' in html.content:
            return
    except Exception:
        return
    if html.status_code != 200:
        return
    soup = BeautifulSoup(html.content, "lxml")
    # Category
    breadcrumb_div = soup.find('div', class_='breadcrumb')
    breadcrumb_links = breadcrumb_div.find_all('a') if breadcrumb_div else []
    cat_log = []
    for a in breadcrumb_links:
        if 'TI Home' in a.get_text(strip=True):
            continue
        cat_log.append([a.get_text(strip=True), a['href']])
    data['catlog'] = cat_log if cat_log else []
    # goods_img, goods_thumb
    img_div = soup.find('div', class_='image')
    img = img_div.img['src'] if img_div else ''
    data['goods_img'] = img
    data['goods_thumb'] = img
    # Buy table
    table = soup.find('table', id='tblBuy')
    # Some GPN product groups have no part list; yield defaults
    if not table:
        data['goods_sn'] = gpn
        data['tiered'] = [[0, 0.00]]
        data['stock'] = [0, 1]
        yield data
    body_div = table.tbody if table else None
    # Bail out if the part list is missing
    if not body_div:
        return
    ths = table.find_all('th') if table else []
    th_td = dict()
    for index, th in enumerate(ths):
        text = th.get_text(strip=True)
        if 'Part' in text:
            th_td['PartNum'] = index
        if 'Price' in text:
            th_td['Price'] = index
        if 'Inventory' in text:
            th_td['Inventory'] = index
    tds = body_div.find_all('td') if body_div else []
    step = len(ths)
    tr = [tds[x:x + step] for x in range(0, len(tds), step)]
    total_parts = len(tr)
    # Prices look like "0.85 | 1ku": unit price at thousand-unit volume
    pattern_price = re.compile(r'\s*(\d+\.\d+)\s*\|\s*(\d+)ku\s*')
    for pos, td in enumerate(tr):
        logger.info("GPN:%s has %d parts to fetch, fetching #%d."
                    % (gpn.encode('utf-8'), total_parts, pos + 1))
        # tiered
        price = th_td.get('Price')
        if td[price].script:
            td[price].script.extract()
        tiered = pattern_price.search(td[price].get_text())
        if tiered:
            price = tiered.group(1)
            qty = int(tiered.group(2)) * 1000
            data['tiered'] = [[util.intval(qty), util.floatval(price)]]
        else:
            data['tiered'] = [[0, 0.00]]
        # goods_sn
        part_num = th_td.get('PartNum')
        data['goods_sn'] = ''
        for x in td[part_num].contents:
            if x.name == 'script':
                continue
            elif x.name == 'a':
                data['goods_sn'] = str(x.string).strip()
                stock, tiered = get_stock(data['goods_sn'], x['href'], **kwargs)
                data['tiered'] = tiered
                data['url'] = x['href']
                data['provider_name'] = 'TI'
                data['stock'] = [util.intval(stock), 1]
            elif x.string and str(x.string).strip():
                data['goods_sn'] = str(x.string).strip()
                data['stock'] = [0, 1]
                data['provider_name'] = ''
                data['url'] = "http://www.ti.com/product/%s" % gpn
        yield data
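# A quick standalone check of the "price | volume" pattern above: TI's buy
# table renders entries such as " 0.85 | 1ku ", where 'ku' means thousands of
# units (sample string is illustrative):
import re

pattern_price = re.compile(r'\s*(\d+\.\d+)\s*\|\s*(\d+)ku\s*')

m = pattern_price.search(' 0.85 | 1ku ')
price, qty = float(m.group(1)), int(m.group(2)) * 1000
print [[qty, price]]  # [[1000, 0.85]]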
def add_to_cart(url, only_session, **kwargs):
    form_data = {
        "ctl00$ctl00$ScriptManager1": "ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$BuyProductPanel|ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$btnBuyPaid",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": "",
        "__VIEWSTATEGENERATOR": "",
        "__VIEWSTATEENCRYPTED": "",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$SearchPhrase": "",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$hiLastHeaderAction": "none",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$hiSearchFilterValue": "none",
        "__ASYNCPOST": "true",
        "ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$btnBuyPaid": "Buy",
    }
    proxies = kwargs.get('proxies')
    try:
        stock_page = only_session.get(url=url, proxies=proxies)
    except Exception:
        return 0
    if stock_page.status_code != 200:
        return 0
    soup = BeautifulSoup(stock_page.content, 'lxml')
    # ASP.NET hidden state fields must be echoed back in the POST.
    # NOTE: the original read view_state.value, which on a bs4 Tag looks up a
    # child element named "value" rather than the HTML attribute; use
    # .get('value') instead.
    view_state = soup.find('input', id="__VIEWSTATE")
    form_data['__VIEWSTATE'] = view_state.get('value', '') if view_state else ''
    view_state_generator = soup.find('input', id="__VIEWSTATEGENERATOR")
    form_data['__VIEWSTATEGENERATOR'] = (
        view_state_generator.get('value', '') if view_state_generator else '')
    # tiered
    tiered = []
    table = soup.find(
        'table',
        id='ctl00_ctl00_NestedMaster_PageContent_ctl00_BuyProductDialog1_PricingTierList')
    if table:
        for tr in table.find_all('tr')[1:]:
            tds = tr.find_all('td')
            qty = tds[0].get_text(strip=True)
            price = tds[1].get_text(strip=True)
            tiered.append([util.intval(qty), util.floatval(price)])
    else:
        tiered = [[0, 0.00]]
    # POST the buy form
    try:
        only_session.post(url=url, data=form_data, proxies=proxies)
    except Exception:
        return 0
    return tiered
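# ASP.NET WebForms pages reject posts that do not echo the hidden state
# inputs scraped above. A minimal hedged helper that collects them from any
# WebForms page (field names are the standard ASP.NET ones; the helper name
# is illustrative):
from bs4 import BeautifulSoup

def collect_webforms_state(html):
    """Read the hidden ASP.NET state inputs so they can be echoed in a POST."""
    soup = BeautifulSoup(html, 'lxml')
    state = {}
    for name in ('__VIEWSTATE', '__VIEWSTATEGENERATOR', '__EVENTVALIDATION'):
        tag = soup.find('input', id=name)
        state[name] = tag.get('value', '') if tag else ''
    return state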
    # Fragment: html, soup, url and goods_sn are defined by the enclosing function.
    if html.status_code == 200:
        item = {}
        _desc = soup.find(
            'tr',
            id='ctl00_ctl00_NestedMaster_PageContent_ctl00_BuyProductDialog1_trSku')
        item['goods_name'] = _desc.find('h1').get_text(strip=True) if _desc else ''
        item['goods_sn'] = item['goods_name']
        item['desc'] = _desc.get_text(strip=True) if _desc else ''
        _img = soup.find('img', id='ProductImage')
        item['goods_img'] = util.urljoin(url, _img.get('src')) if _img else ''
        stock_info = get_stock(goods_sn=goods_sn, url=url)
        if stock_info:
            item['stock'] = [util.intval(stock_info[0]), 1]
            item['tiered'] = stock_info[1]
        else:
            item['stock'] = [0, 1]
            item['tiered'] = [[0, 0.00]]
        item['provider_name'] = 'TI'
        item["increment"] = 1
        item['url'] = url
        return item

# def _parse_store_ti_com(url, **kwargs):
#     """
#     Only the following are needed when updating data:
#         id      here this is the GoodsId
#         tiered  price ladder
def fetch_search_data(keyword=None, id=None, data_dict=None, headers=None, proxy=None, **kwargs):
    """Fetch search results."""
    if keyword:
        print 'Fetching avnet data for keyword: %s' % keyword
        # NOTE: the source contained the mojibake "langId=-1¤cy=USD", the
        # HTML-entity rendering of "&currency"; restored below.
        url = ('https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select'
               '?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_Ajax&searchSource=Q'
               '&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD'
               '&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber=1'
               '&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0'
               '+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080'
               '+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag%3ANPI'
               '&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&wt=json'
               ).format(keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:request error, %s ; URL:%s'
                     % (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:invalid product; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:request error, HTTP status %s ; PROXY:%s ; URL:%s'
                     % (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse the response: read the result rows and total count
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except Exception:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:bad response data ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = ('https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select'
                    '?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax&searchSource=Q'
                    '&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD'
                    '&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber={next_page}'
                    '&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0'
                    '+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080'
                    '+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag:NPI'
                    '&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&showMore=true&wt=json'
                    ).format(next_page=x, keyword=keyword)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
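# The boost-query URLs above contain literal Solr range braces, which collide
# with str.format placeholders; doubling them ({{ and }}) emits single braces.
# A quick check (template string is illustrative):
template = 'price_USD:{{0.00001+TO+*}}&q={keyword}'
print template.format(keyword='lm358')
# price_USD:{0.00001+TO+*}&q=lm358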
def _parse_detail_data(resp, headers=None, **kwargs):
    """
    Parse detail-page data (kept as a standalone helper).
    @param resp     response object (resp.url is used for error logging)
    @param kwargs   extra options
    """
    item = {}
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
        if soup is None:
            logger.debug('Failed to initialize detail page URL: %s', resp.url)
            return -404
    except Exception as e:
        # NOTE: the original passed a single tuple to two %s placeholders,
        # which itself raises a formatting error inside the logger.
        logger.debug('Failed to initialize detail page URL: %s ERROR: %s',
                     resp.url, util.traceback_info(e))
        return -404
    # goods_sn: the path segment ending with a 19-digit id
    goods_sn_pattern = re.compile(r'.*-\d{19}')
    for path in resp.url.split('/')[::-1]:
        if goods_sn_pattern.findall(path):
            item['goods_sn'] = path
            break
    if not item.get('goods_sn', False):
        logger.debug("Cannot parse goods_sn from URL: {url}".format(url=resp.url))
        return -400
    # goods_name
    goods_info_div = soup.find('div', class_='section-left')
    item['goods_name'] = goods_info_div.find('h1').get_text(
        strip=True) if goods_info_div else item['goods_sn']
    # url
    item['url'] = resp.url
    # goods_img / goods_thumb
    img_div = soup.find('div', id="outer-div1")
    img = img_div.find('img') if img_div else None
    item['goods_img'] = util.urljoin(resp.url, img.get('src')) if img else ''
    item['goods_thumb'] = item['goods_img']
    # desc
    desc_p = soup.find('p', class_='RB-pdp_short_Desc')
    item['desc'] = desc_p.get_text(strip=True) if desc_p else ''
    # provider
    item['provider_name'] = "AVNET"
    item['provider_url'] = ''
    # attr: [[name, value]]
    attr = []
    attr_body = soup.find('div', id="techAttr")
    attr_div = attr_body.find_all('div', class_='pdpDescriptionsBodyContent') if attr_body else []
    for content in attr_div:
        att_name = content.find('div', class_='pdpDescriptionColumn')
        attr_value = content.find('div', class_='pdpValueColumn')
        if att_name and attr_value:
            attr.append([
                att_name.get_text(strip=True),
                attr_value.get_text(strip=True)
            ])
    item['attr'] = attr
    # tiered: [[qty, price]]
    tiered = []
    for span in soup.find_all('span', class_='usdpart1'):
        qty_span = span.find('span', class_='pdpTierMinQty')
        qty = qty_span.get_text(strip=True) if qty_span else 0
        price_p = span.find('p')
        price = price_p.get_text(strip=True) if price_p else 0.00
        if qty and price:
            tiered.append([util.intval(qty), util.floatval(price)])
        else:
            tiered = [[0, 0.00]]
            break
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock: [stock, min_qty]
    stock_input = soup.find('input', id='inStock')
    stock = util.intval(stock_input.get('value')) if stock_input else 0
    min_qty_input = soup.find('input', attrs={'name': 'min'})
    min_qty = util.intval(min_qty_input.get('value')) if min_qty_input else 1
    item['stock'] = [stock, min_qty] if stock else [0, 1]
    # increment
    multi_input = soup.find('input', attrs={'name': 'mult'})
    item['increment'] = util.intval(multi_input.get('value')) if multi_input else 1
    # doc
    doc_div = soup.find('div', class_='pdfcontent')
    doc_url = doc_div.find('a', class_='datasheet_align') if doc_div else None
    item['doc'] = doc_url.get('href') if doc_url else ''
    # rohs
    rohs_div = soup.find('div', class_='leafcontent')
    item['rohs'] = 1 if rohs_div else -1
    # catlog: [[name, url]]; the original dereferenced nav before the
    # None-check, which would raise AttributeError on pages without the
    # breadcrumb
    catlog = []
    nav = soup.find('nav', class_='breadcrumb')
    for a in (nav.find_all('a') if nav else []):
        cat_name = a.get_text(strip=True)
        cat_url = util.urljoin(resp.url, a.get('href'))
        if cat_name and cat_url:
            catlog.append([cat_name, cat_url])
    item['catlog'] = catlog
    # goods_other_name
    item['goods_other_name'] = ''
    return item
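# For reference, a successful parse returns a plain dict shaped like the
# other parsers' items; the sketch below shows the expected keys with made-up
# illustrative values (none of these values come from a real page):
example_item = {
    'goods_sn': 'ad8232acpz-r7-3074457345628680524',  # hypothetical id
    'goods_name': 'AD8232ACPZ-R7',
    'url': 'https://www.avnet.com/shop/apac/',
    'goods_img': '', 'goods_thumb': '', 'desc': '',
    'provider_name': 'AVNET', 'provider_url': '',
    'attr': [['Package', 'LFCSP-20']],
    'tiered': [[1, 2.5], [100, 1.9]],
    'stock': [1200, 1],  # [stock, min_qty]
    'increment': 1,
    'doc': '', 'rohs': 1,
    'catlog': [['Amplifiers', 'https://www.avnet.com/shop/apac/']],
    'goods_other_name': '',
}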
def import_goods(self, data, put_xs_list=None):
    """Import product data."""
    put_xs_list = put_xs_list if put_xs_list else []
    if not data:
        return 0
    data['category'] = [x.encode('utf-8') for x in data['category']]
    cids = self.get_ic_category(data['category'])
    try:
        cat_id1 = cids[0]
    except IndexError:
        cat_id1 = 0
    try:
        cat_id2 = cids[1]
    except IndexError:
        cat_id2 = 0
    goods_sn = data['goods_sn']
    min_buynum = data['min_buynum']
    data['mpq'] = 1
    increment = data['increment']
    url = data['url']
    goods_desc = util.binary_type(data['goods_desc']) if data['goods_desc'] else ''
    goods_img = data.get('goods_img', '')
    _unix_time = int(time.time())
    goods_data = {
        'cat_id1': cat_id1,
        'cat_id2': cat_id2,
        'cat_id3': 0,
        'PN2': PN2,
        'goods_name': util.binary_type(data['goods_name']),
        'goods_other_name': util.binary_type(data['goods_other_name']),
        'provider_name': util.binary_type(data['brand']),
        'batch_number': '',
        'encap': '',
        'goods_desc': goods_desc,
        'SPQ': data['mpq'],
        'goods_number_hk': data.get('hk_stock', 0),
        'goods_number': data.get('cn_stock', 0),
        'DT_HK': HDT,
        'DT': CDT,
        'CDT': CDT,
        'HDT': HDT,
        'increment': increment,
        'min_buynum': min_buynum,
        'goods_sn': goods_sn,
        'brand_goods_id': 0,
        'doc_url': '',
        'digikey_url': url,
        'series': '',
        'source_type': 0,
        'user_id': 0,
        'log_id': 0,
        'to_china': 1,
        'to_hongkong': 0,
        'goods_weight': 0.0,
        'goods_img': goods_img,
        'goods_thumb': goods_img,
        'last_update': _unix_time - 8 * 3600,
    }
    if goods_data['provider_name']:
        goods_data['brand_id'] = self.get_ic_brand(goods_data['provider_name'])
    info = self.supplier.select('goods',
                                condition={'goods_sn': goods_sn, 'PN2': PN2},
                                fields=('goods_id',),
                                limit=1)
    if info:
        self.supplier.update('goods',
                             condition={'goods_id': info['goods_id']},
                             data=goods_data)
        goods_id = info['goods_id']
        print('Updated MySQL, GoodsId: %s' % (goods_id,))
    else:
        goods_data['add_time'] = _unix_time - 8 * 3600
        goods_id = self.supplier.insert('goods', data=goods_data, return_insert_id=1)
        put_xs_list.append({
            'goods_id': goods_id,
            'goods_name': util.binary_type(goods_data['goods_name']),
            'goods_other_name': util.binary_type(goods_data['goods_other_name'])
        })
        print('Saved to MySQL, GoodsId: %s' % (goods_id,))
    if not goods_id:
        return 0
    # Price rows are sharded across goods_price_0 .. goods_price_9 by the
    # last digit of goods_id
    table_id = str(goods_id)[-1]
    if info:
        self.supplier.delete('goods_price_%s' % (table_id,),
                             condition={'goods_id': goods_id})
    # Price ladder; tiers here are (qty, price, price_cn) triples
    price_tiered = data['tiered']
    if not price_tiered:
        price_tiered.append((goods_data['min_buynum'], 0.0, 0.0))
    goods_price = []
    for p in price_tiered:
        qty = util.intval(p[0])
        if qty <= 0:
            continue
        goods_price.append({
            "purchases": p[0],
            "price": 0,
            "price_cn": p[2] * PRICE_PROPORTION,
        })
    self.supplier.insert('goods_price_%s' % (table_id,),
                         data={
                             'goods_id': goods_id,
                             'price': json.dumps(goods_price),
                         })
    tiered = []
    for p in goods_price:
        tiered.append([
            p["purchases"],
            p['price'],
            p['price_cn'],
        ])
    mongo_data = {
        'ModelName': goods_data['goods_name'],
        'OtherModelName': goods_data['goods_other_name'],
        'BrandName': goods_data['provider_name'],
        'DT': (goods_data['HDT'], goods_data['CDT']),
        'Desc': goods_data['goods_desc'],
        'GoodsId': goods_id,
        'GoodsSn': goods_data['goods_sn'],
        'Stock': (goods_data['goods_number'], goods_data['min_buynum'], 0),
        'Tiered': tiered,
        'error': 0,
        'time': int(time.time()),
        'url': goods_data['digikey_url'],
        'DocUrl': '',
        'increment': goods_data['increment']
    }
    # Save to MongoDB
    collect = getattr(self.mongo, 'supplier')
    info = collect.find_one({'GoodsId': goods_id})
    if info:
        collect.update({'GoodsId': goods_id}, {"$set": mongo_data})
        print('Updated MongoDB, GoodsId: %s' % (goods_id,))
    else:
        collect.insert(mongo_data)
        print('Saved to MongoDB, GoodsId: %s' % (goods_id,))
    return 1
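# The price rows above are split across ten MySQL tables keyed by the last
# digit of goods_id. A minimal sketch of the shard routing, assuming the
# goods_price_0 .. goods_price_9 naming used above (helper name is
# illustrative):
def price_table_for(goods_id):
    """Route a goods_id to its price shard table."""
    return 'goods_price_%s' % str(goods_id)[-1]

print price_table_for(1024)  # goods_price_4
print price_table_for(87)    # goods_price_7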