def run(self):
    """Scrape one product detail page and save it as a pending-goods record.

    Sleeps a random 2-5 seconds, fetches ``self.url`` and reads the product
    name, style number, colour description, price and per-size availability
    from the page. The first ``noscript > picture > source`` image is
    downloaded and, when the download succeeds, uploaded to Qiniu. The
    result is handed to ``mongo.insert_pending_goods``.

    A page without a product title is only logged. Any exception is logged
    and the URL is put back on ``self.q`` until it has failed three times.
    """
    time.sleep(random.randint(2, 5))
    try:
        pq = helper.get(self.url, myHeaders=self.headers, cookies=self.cookies)
        # Model name
        title = pq('h1#pdp_product_title')
        if not title:
            helper.log('%s has no name' % self.url, platform)
            return
        name = title[0].text
        # Colourway (style) number
        number = pq('li.description-preview__style-color').text().split(':')[1].strip()
        # Colour description
        color_value = pq('li.description-preview__color-description').text().split(':')[1].strip()

        price = 0
        for div in pq('div.text-color-black'):
            if div.get('data-test') == 'product-price':
                price = float(div.text.replace('$', ''))
                break

        size_price_arr = []
        for size_input in pq('div.availableSizeContainer input'):
            # Labels look like "M 3.5 / W 5"; keep the number before the slash.
            size = size_input.get('aria-label').replace('W', '').replace('M', '').replace('C', '').strip()
            if '/' in size:
                size = size.split('/')[0].strip()
            size_price_arr.append({
                'size': float(size),
                'price': price,
                # Any ``disabled`` attribute marks the size as unavailable.
                'isInStock': size_input.get('disabled') is None,
            })

        img_url = None
        for source in pq('noscript > picture > source'):
            img_url = source.get('srcset')
            break

        # Report the image as downloaded only when the download really
        # succeeded (downloadImg result 1), not merely because a URL exists.
        img_downloaded = False
        if img_url:
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
            if result == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                             './imgs/%s/%s.jpg' % (platform, number))
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, color_value, platform, '5be444e3c7e854cab4b252a0',
                                   self.crawl_counter, '', img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
        helper.log(e, platform)
        # Re-queue the URL until it has failed three times.
        if error_counter < 3:
            self.q.put(self.url)
def fetch_detail(url):
    """Scrape one Footaction product page and store it as pending goods.

    Args:
        url: Site-relative product path; it is appended to
            ``https://www.footaction.com`` before fetching.

    The product image name is looked up through the images.footaction.com
    image-set JSON endpoint. The image is downloaded only when a name was
    found, and uploaded to Qiniu only when the download reported success
    (result 1). The record is inserted into mongo in every case.
    """
    def load_image_set(image_set_url):
        # The endpoint answers with JSONP; strip the wrapper before parsing.
        raw = helper.get(image_set_url, returnText=True)
        return json.loads(raw.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', ''))

    url = 'https://www.footaction.com' + url
    pq = helper.get(url, cookies)
    name = pq('span.c-product-name').text()
    print('name = %s' % name)
    number = pq('div.c-tab-panel').text().split(' ')[2]
    print('number = %s' % number)

    # Two price layouts exist; fall back to the second when the first
    # cannot be parsed as a number.
    try:
        price = float(pq('span.sr-only').text().replace('$', ''))
    except Exception:
        price = float(pq('span.final').text().replace('$', ''))

    size_price_arr = [
        {'size': float(size), 'price': price, 'isInStock': True}
        for size in pq('div.c-size p > label').text().split(' ')
    ]
    print('size_price_arr = ', size_price_arr)

    img_name = None
    try:
        items = load_image_set(
            'https://images.footaction.com/is/image/EBFL2/%sMM?req=set,json' % number
        ).get('set').get('item')
        for img_item in items:
            if img_item.get('type') == 'img_set':
                img_name = img_item.get('set').get('item')[0].get('s').get('n')
                break
    except Exception:
        items = load_image_set(
            'https://images.footaction.com/is/image/EBFL2/%s?req=set,json' % number
        ).get('set').get('item')
        # ``item`` is a list for multi-image sets and a single dict otherwise.
        first_item = items[0] if isinstance(items, list) else items
        img_name = first_item.get('s').get('n')

    downloaded = False
    if img_name:
        img_url = 'https://images.footaction.com/is/image/%s?wid=600&hei=600&fmt=jpg' % img_name
        print(img_url)
        result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
        downloaded = result == 1

    mongo.insert_pending_goods(name, number, url, size_price_arr, ['%s.jpg' % number], platform)
    if downloaded:
        # Upload to Qiniu
        qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                     './imgs/%s/%s.jpg' % (platform, number))
def run(self):
    """Scrape one product page and save it as a pending-goods record.

    Waits two seconds, fetches ``self.url`` and reads name, article number
    and colour from the page. Sizes and prices are parsed out of the
    ``onclick`` argument list of each size link. If the image has not been
    downloaded yet it is downloaded and, on success, uploaded to Qiniu.

    Any exception is logged and the URL is put back on ``self.q`` until it
    has failed three times.
    """
    time.sleep(2)
    try:
        page = helper.get(self.url, cookie)
        name = page('h1#prodNameId').text()
        number = page('span#supplierArtNumSpan').text()
        color_value = page('span#variantColorId').text()

        size_price_arr = []
        for link in page('div#2SizeContainer > div > a'):
            onclick = link.get('onclick')
            onclick = onclick.replace('ProductDetails.changeSizeAffectedLinks(', '')
            onclick = onclick.replace(');', '')
            # One call argument per line.
            args = [piece.strip() for piece in onclick.split('\n')]
            # "'8+'," -> "8+"
            args[6] = args[6].replace('\'', '').replace(',', '').replace('Y', '')
            if '+' in args[6]:
                # A trailing "+" denotes a half size.
                size = float(args[6].replace('+', '')) + 0.5
            else:
                size = float(args[6])
            # "'115,76 USD'." -> 115.76
            price = float(args[2].replace(',', '.').replace(' USD\'.', '').replace('\'', ''))
            size_price_arr.append({'size': size, 'price': price, 'isInStock': True})

        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = page('img.productDetailPic').attr('src')
            local_path = os.path.join('.', 'imgs', platform, '%s.jpg' % number)
            if helper.downloadImg(img_url, local_path) == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                             './imgs/%s/%s.jpg' % (platform, number))
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, color_value, platform, '5bc87d6dc7e854cab4875368',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        failures = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = failures + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (failures, self.url), platform)
        helper.log(e, platform)
        if failures < 3:
            self.q.put(self.url)
def run(self):
    """Scrape one Stadium Goods product page and save it as pending goods.

    Sleeps a random 1.0-3.6 seconds, fetches ``self.url`` and collects the
    numeric sizes with their prices. Sizes without a ``$`` price are
    recorded as out of stock with price 0.0. Returns without storing
    anything when no usable size is found. Gender, colourway and
    manufacturer SKU come from the product attribute table.

    Any exception is logged (including the exception itself) and the URL
    is put back on ``self.q`` until it has failed three times.
    """
    # Size labels that are not numeric shoe sizes and must be skipped.
    non_numeric_sizes = {'S', 'M', 'L', 'XL', 'XXL', 'XXXL', '', 'OS'}

    time.sleep(random.uniform(1.0, 3.6))
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        # Model name
        name = pq('div.product-brand').text().strip() + ' ' + pq('h1.product-name').text().strip()

        # Collect every size together with its price.
        size_price_list = []
        for size_span in pq('div.product-sizes__options span.product-sizes__detail'):
            size_node = PyQuery(size_span)
            size = size_node.find('span.product-sizes__size').text().strip()
            if any(token in size for token in ('K', 'k', '-', 'XS')):
                continue
            size = re.sub(r'[WwYyCc\*]', '', size)
            if size in non_numeric_sizes:
                continue
            price = size_node.find('span.product-sizes__price').text().strip()
            if price.startswith('$'):
                size_price_list.append({
                    'size': size,
                    'price': float(price.replace('$', '').replace(',', '')),
                    'isInStock': True,
                })
            else:
                size_price_list.append({'size': size, 'price': 0.0, 'isInStock': False})
        if not size_price_list:
            return

        number = ''       # colourway / manufacturer SKU
        gender = 0        # 1 for 'Mens', 2 for 'Womens', otherwise 0
        color_value = ''
        for tr in pq('table#product-attribute-specs-table tr'):
            row = PyQuery(tr)
            key = row.find('th').text().strip()
            if key == 'Gender':
                gender_txt = row.find('td').text().strip()
                if gender_txt == 'Mens':
                    gender = 1
                elif gender_txt == 'Womens':
                    gender = 2
            elif key == 'Colorway':
                color_value = row.find('td').text().strip()
            elif key == 'Manufacturer Sku':
                number = row.find('td').text().strip()

        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = pq('div.product-gallery-image > img')[0].get('src')
            # Download the image
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'stadiumgoods', '%s.jpg' % number))
            if result == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu('stadiumgoods', '%s.jpg' % number,
                                             './imgs/stadiumgoods/%s.jpg' % number)
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number],
                                   gender, color_value, 'stadiumgoods', '5b8f484b299207efc1fb0904',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'stadiumgoods')
        # Record the cause as well, so failures can be diagnosed.
        helper.log(e, 'stadiumgoods')
        if error_counter < 3:
            self.q.put(self.url)
def run(self):
    """Scrape one StockX product and save it as a pending-goods record.

    Waits two seconds and fetches ``self.url``. When the HTML page exposes
    a style number, sizes, prices and the image are read from the page.
    Otherwise the data is read from the StockX product JSON API. Nothing is
    stored when no style number could be determined.

    Any exception is logged (including the exception itself) and the URL
    is put back on ``self.q`` until it has failed three times.
    """
    time.sleep(2)
    try:
        pq = helper.get(self.url, platform=platform)
        # Model name
        name = pq('h1.name').text()
        number = ''
        color_value = ''
        for div in pq('div.detail'):
            div = PyQuery(div)
            key = div.find('span.title').text()
            if key == 'Style':
                # Colourway (style) number
                number = div.find('span')[-1].text.strip()
            elif key == 'Colorway':
                color_value = div.find('span')[-1].text.strip()

        if number != '':
            # Collect all sizes from the size selector.
            size_price_arr = []
            select_options = pq('div.select-options')
            if select_options:
                div_list = PyQuery(select_options[0]).find('div.inset div')
                # The divs alternate: size label, then price (or 'Bid').
                for i in range(0, len(div_list), 2):
                    size_text = div_list[i].text
                    price_text = div_list[i + 1].text
                    if size_text == 'All':
                        continue
                    if price_text == 'Bid':
                        size_price_arr.append({'size': size_text, 'price': 0.0, 'isInStock': False})
                    else:
                        size_price_arr.append({
                            'size': size_text,
                            'price': float(price_text.replace('US$', '').replace(',', '').strip()),
                            'isInStock': True,
                        })

            # Download the image
            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
            if not img_downloaded:
                img_list = pq('div.image-container img')
                if img_list:
                    img_url = img_list[-1].get('src')
                else:
                    img_url = pq('div.product-media img').attr('src')
                # Force the requested image size to 600x600 via the query string.
                img_url_list = img_url.split('?')
                query_params = []
                for param in img_url_list[1].split('&'):
                    param_name = param.split('=')[0]
                    if param_name == 'w':
                        param = 'w=600'
                    elif param_name == 'h':
                        param = 'h=600'
                    query_params.append(param)
                img_url = img_url_list[0] + '?' + '&'.join(query_params)
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
                if result == 1:
                    # Upload to Qiniu
                    qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                                 './imgs/%s/%s.jpg' % (platform, number))
                    img_downloaded = True
        else:
            size_price_arr = []
            # e.g. https://stockx.com/api/products/adidas-human-race-nmd-pharrell-cream?includes=market,360&currency=USD
            # The query previously read "360¤cy=USD": "&curren" had been
            # decoded as the HTML entity for the currency sign.
            size_price_url = ('https://stockx.com/api/products%s?includes=market,360&currency=USD'
                              % self.url.split('stockx.com')[1])
            json_txt = helper.get(size_price_url, returnText=True)
            product = json.loads(json_txt).get('Product')
            for product_data in product.get('children').values():
                last_sale = product_data.get('market').get("lastSale")
                size_price_arr.append({
                    'size': product_data.get("shoeSize"),
                    'price': last_sale,
                    # A missing lastSale counts as "not in stock" instead of
                    # raising on the comparison.
                    'isInStock': bool(last_sale) and last_sale > 0,
                })
            number = product.get('styleId')
            color_value = product.get('colorway')
            name = product.get('title')
            if number != '':
                # Download the image
                img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
                if not img_downloaded:
                    img_url_list = product.get('media').get('360')
                    if len(img_url_list) > 0:
                        img_url = img_url_list[0]
                    else:
                        img_url = product.get('media').get('imageUrl')
                    img_path = os.path.join('.', 'imgs', platform, '%s.jpg' % number)
                    # Upload and mark as downloaded only when the download
                    # succeeded, as the HTML branch does.
                    if helper.downloadImg(img_url, img_path) == 1:
                        # Upload to Qiniu
                        qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, img_path)
                        img_downloaded = True

        if number != '':
            mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                       0, color_value, platform, '5bace180c7e854cab4dbcc83',
                                       self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
        # Record the cause as well, so failures can be diagnosed.
        helper.log(e, platform)
        if error_counter < 3:
            self.q.put(self.url)
def run(self):
    """Scrape one Finish Line product page and save it as pending goods.

    Waits two seconds, fetches ``self.url`` and reads the product name,
    style/colour id (with all whitespace removed), price and the size
    buttons. Each size keeps its raw ``aria-label`` text and is in stock
    unless the label contains ``unavailable``. If the image has not been
    downloaded yet it is downloaded and, on success, uploaded to Qiniu.

    Any exception is logged (including the exception itself) and the URL
    is put back on ``self.q`` until it has failed three times.
    """
    time.sleep(2)
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        # Model name
        name = pq('input.bVProductName').attr('value')
        # Colourway (style) number, with every whitespace character removed
        number = pq('div#styleColors span.styleColorIds').text().strip().replace('- ', '')
        number = ''.join(number.split())

        price_text = pq('div#productPrices span').text().replace('$', '').split(' ')[0]
        try:
            price = float(price_text)
        except ValueError:
            price = 0.0

        size_price_arr = [{
            'size': button.get('aria-label'),
            'price': price,
            'isInStock': 'unavailable' not in button.get('aria-label'),
        } for button in pq('div#productSizes button')]

        # Download the image. The lookup must be *called* on the mongo
        # module; the bare name used before was never invoked.
        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_list = pq('div.pdp-image')
            # Use the third image when there is one, otherwise the last.
            img_div = img_list[2] if len(img_list) > 2 else img_list[-1]
            img_url = 'https:' + img_div.get('data-large')
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'finishline', '%s.jpg' % number))
            if result == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu('finishline', '%s.jpg' % number,
                                             './imgs/finishline/%s.jpg' % number)
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, '', 'finishline', '5ac8594e48555b1ba31896ba',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'finishline')
        # Record the cause as well, so failures can be diagnosed.
        helper.log(e, 'finishline')
        if error_counter < 3:
            self.q.put(self.url)
def run(self):
    """Scrape one product page and save it as a pending-goods record.

    Waits two seconds, fetches ``self.url`` and reads name, style number,
    colour and a single page-wide price (0.0 when it cannot be parsed).
    Every size link becomes one entry that is in stock unless its CSS class
    contains ``piunavailable``. If the image has not been downloaded yet it
    is downloaded (site-relative URLs get the jimmyjazz.com host prepended)
    and, on success, uploaded to Qiniu.

    Any exception is logged and the URL is put back on ``self.q`` until it
    has failed three times.
    """
    time.sleep(2)
    global platform
    try:
        page = helper.get(self.url)

        name = page('h1.product_title').text()
        print('name = %s' % name)
        number = page('span.pistylevalue').text().strip()
        print('number = %s' % number)
        color_value = page('span.pithecolor').text().strip()
        print('color_value = %s' % color_value)

        try:
            price_text = page('div.product_price_content > span.product_price').text()
            price = float(price_text.replace('$', ''))
        except:
            price = 0.0

        size_price_arr = [
            {
                'isInStock': 'piunavailable' not in link.get('class'),
                'size': link.text,
                'price': price,
            }
            for link in page('div.box_wrapper > a')
        ]
        print('size_price_arr = ', size_price_arr)

        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = page('img.product_image').attr('src')
            if not img_url.startswith('http'):
                img_url = 'http://www.jimmyjazz.com' + img_url
            print('img_url = ', img_url)
            local_path = os.path.join('.', 'imgs', platform, '%s.jpg' % number)
            if helper.downloadImg(img_url, local_path) == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                             './imgs/%s/%s.jpg' % (platform, number))
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, color_value, platform, '5b4b59b6bb8bdb5a84ddee09',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        failures = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = failures + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (failures, self.url), platform)
        helper.log(e, platform)
        if failures < 3:
            self.q.put(self.url)
def fetch_detail(url, page=0):
    """Scrape one sneakersnstuff.com product page and store it as pending goods.

    Args:
        url: Absolute product URL. A few hard-coded URLs are skipped.
        page: Page number passed by the caller; it is only printed.

    Two page layouts are tried for name, article number, price/sizes and
    image. Prices are multiplied by 0.1468 and rounded to two decimals
    before being stored (the page shows a yen/yuan sign, so this is
    presumably a CNY-to-USD conversion -- confirm). When neither price
    layout can be parsed the product is stored without sizes.
    """
    skipped_urls = (
        'https://www.sneakersnstuff.com/en/product/21411/adidas-ultra-boost',
        'https://www.sneakersnstuff.com/en/product/21129/brooks-regent-american-dream',
        'https://www.sneakersnstuff.com/en/product/20527/vans-sk8-hi',
    )
    if url in skipped_urls:
        return

    def to_stored_price(text):
        # Apply the fixed conversion factor and keep two decimals.
        return float("%.2f" % (float(text) * 0.1468))

    print('page = %d' % page)
    pq = helper.get(url)

    # Name: try each known layout until one yields a non-empty value.
    name = pq('h1#product-name').text()
    if not name:
        name = ((pq('p.product-name > span.brand').text() or '')
                + (pq('p.product-name > span.name').text() or ''))
    if not name:
        name = (pq('div.product-info h5').text() or '').replace('<br/>', '').replace('\n', '')
    print('name = %s' % name)

    number = None
    try:
        number = pq('span#product-artno').text().split(':')[1].strip()
    except Exception:
        # Older layout: the article number is one line of the first tab.
        tab_text = pq('div#tab1 strong').parents('p').text() or ''
        for item in tab_text.split('\n'):
            if 'number:' in item:
                number = item.replace('Article number:', '').replace('Artikelnummer:', '').strip()
                break
    print('number = %s' % number)

    # Work on text, not encoded bytes: on Python 3 ``bytes.replace`` with
    # str arguments raises TypeError, which used to be swallowed and left
    # every product without sizes. u'\xa5' is the yen/yuan sign.
    size_price_arr = []
    try:
        price_text = pq('div.product-price > span')[0].text
        price = to_stored_price(price_text.replace(u'$', u'').replace(u'\xa5', u''))
        size_price_arr = [
            {
                'isInStock': True,
                'size': '.'.join(re.compile(r'\d{1,2}').findall(span.text)),
                'price': price,
            }
            for span in pq('span.size-type')
        ]
    except Exception:
        try:
            price_text = pq('p.product-price > span.sale').text()
            price = to_stored_price(price_text.replace(u'\xa5', u''))
            size_price_arr = [
                {
                    'isInStock': True,
                    'size': span.text.replace('US ', '').replace('\r', '').replace('\n', '').replace('\t', ''),
                    'price': price,
                }
                for span in pq('span.size-type')
            ]
        except Exception:
            # Neither known layout matched; store the product without sizes.
            size_price_arr = []
    print('size_price_arr = ', size_price_arr)

    mongo.insert_pending_goods(name, number, url, size_price_arr, ['%s.jpg' % number], 'sneakersnstuff')

    img_url = pq('img#primary-image').attr('src') or pq('div.media > img').attr('src')
    if not img_url:
        # Do not attempt to download a URL built from a missing src.
        print('img_url = ', img_url)
        return
    if not img_url.startswith('http'):
        img_url = 'https://www.sneakersnstuff.com%s' % img_url
    print('img_url = ', img_url)
    if helper.downloadImg(img_url, os.path.join('.', 'imgs', 'sneakersnstuff', '%s.jpg' % number)) == 1:
        # Upload to Qiniu
        qiniuUploader.upload_2_qiniu('sneakersnstuff', '%s.jpg' % number,
                                     './imgs/sneakersnstuff/%s.jpg' % number)
def run(self):
    """Scrape one product page and save it as a pending-goods record.

    Waits two seconds and fetches ``self.url``; returns immediately when
    the fetch yields nothing. The first two ``li.attribute-list-item``
    entries provide the style number and the colour. Each child of
    ``div.hidden`` provides one size/price pair. If the image has not been
    downloaded yet it is downloaded and, on success, uploaded to Qiniu.

    Any exception is logged and the URL is put back on ``self.q`` until it
    has failed three times.
    """
    time.sleep(2)
    try:
        page = helper.get(self.url)
        if not page:
            return

        name = page('div.nosto_product > span.name').text()
        number = ''
        color_value = ''
        for position, li in enumerate(page('li.attribute-list-item')):
            if position == 0:
                number = li.text.strip()
            elif position == 1:
                color_value = li.text.strip()

        size_price_arr = []
        for variant in page('div.hidden > div'):
            price = float(variant.find('span').text)
            # The meta content ends in "_<size>".
            content = variant.find('div').find('meta').get('content')
            size = content.split('_')[-1]
            size_price_arr.append({'size': size, 'price': price, 'isInStock': True})

        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            try:
                img_url = page('div.mobile-product-image > img.product-img').attr('data-src')
            except:
                img_url = None
            if not img_url:
                img_url = page('link.hidden').attr('src')
            local_path = os.path.join('.', 'imgs', platform, '%s.jpg' % number)
            if helper.downloadImg(img_url, local_path) == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                             './imgs/flightclub/%s.jpg' % number)
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, color_value, platform, '5ac8592c48555b1ba318964a',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        failures = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = failures + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (failures, self.url), platform)
        helper.log(e, platform)
        if failures < 3:
            self.q.put(self.url)
def run(self):
    """Scrape one Champs Sports product page and save it as pending goods.

    Waits two seconds and fetches ``self.url``. The full size list comes
    from the inline ``var model`` JSON (``AVAILABLE_SIZES``); sizes that
    also appear in the inline ``var sizeObj`` JSON are in stock at their
    ``pr_sale`` price, and all others are stored as out of stock with
    price 0.0. If the image has not been downloaded yet it is looked up
    via the image-set endpoint, downloaded and, on success, uploaded to
    Qiniu.

    Any exception is logged (including the exception itself) and the URL
    is put back on ``self.q`` until it has failed three times.
    """
    time.sleep(2)
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        # Model name; give up on this URL when the page cannot be queried.
        try:
            name = pq('span.product_title').text()
        except Exception:
            return
        # Colourway (style) number
        number = pq('span#productSKU').text()

        html = pq.html()
        model_json = re.compile(r'var model =.*"\};').findall(html)[0]
        model = json.loads(model_json.replace('var model = ', '').replace('"};', '"}'))
        size_all_list = [float(size.strip()) for size in model.get('AVAILABLE_SIZES')]

        size_obj_json = re.compile(r'var sizeObj =.*}];').findall(html)[0]
        size_obj = json.loads(size_obj_json.replace('var sizeObj = ', '').replace('}];', '}]'))
        # Key prices by *float* size so they compare equal to size_all_list.
        # Comparing the raw strings to floats never matched, which marked
        # every size as out of stock. The first entry per size wins.
        price_by_size = {}
        for item in size_obj:
            price_by_size.setdefault(float(item.get('size').strip()),
                                     float(item.get('pr_sale').strip()))

        size_price_list = []
        for size in size_all_list:
            if size in price_by_size:
                size_price_list.append({'size': size, 'price': price_by_size[size], 'isInStock': True})
            else:
                size_price_list.append({'size': size, 'price': 0.0, 'isInStock': False})

        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_response = helper.get(
                'https://images.champssports.com/is/image/EBFL2/%s?req=imageset,json' % number,
                returnText=True)
            img_response = re.compile(r'"IMAGE_SET":"\w+/[_\w]+;').findall(img_response)
            img_url = ('https://images.champssports.com/is/image/%s?hei=600&wid=600'
                       % img_response[0].replace('"IMAGE_SET":"', '').replace(';', ''))
            # Download the image
            result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'champssports', '%s.jpg' % number))
            if result == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu('champssports', '%s.jpg' % number,
                                             './imgs/champssports/%s.jpg' % number)
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number],
                                   self.gender, '', 'champssports', '5af1310e48555b1ba3387bcc',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'champssports')
        # Record the cause as well, so failures can be diagnosed.
        helper.log(e, 'champssports')
        if error_counter < 3:
            self.q.put(self.url)
def run(self):
    """Scrape one GOAT sneaker page and save it as a pending-goods record.

    Waits 3.6 seconds and fetches ``self.url`` as text. Product data is
    taken from the ``window.__context__`` JSON embedded in the page and
    looked up by the URL slug. After a successful insert the URL is
    appended to ``fetched_url_list`` and that list is written to the
    day's log file.

    An empty response or any exception counts as a failure: it is logged
    and the URL is put back on ``self.q`` until it has failed three times.
    A "done" line is logged in every case.
    """
    time.sleep(3.6)
    global platform
    global error_detail_url

    def note_failure():
        # Bump the per-URL failure counter, log it, return the old count.
        failures = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = failures + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (failures, self.url), platform)
        return failures

    try:
        slug = self.url.replace('https://www.goat.com/sneakers/', '')
        html = helper.get(self.url, returnText=True, platform=platform)
        if not html:
            if note_failure() < 3:
                self.q.put(self.url)
            return

        context_txt = re.findall(r'window.__context__.*', html)[0]
        context_txt = context_txt.replace('window.__context__ = ', '').replace('</script>', '')
        templates = json.loads(context_txt).get('default_store').get('product-templates')
        product_json = templates.get('slug_map').get(slug)

        name = product_json.get('name')
        number = product_json.get('sku')
        color_value = product_json.get('details')
        # The colour name is the part of the product name in single quotes.
        if '\'' in name:
            color_name = name.split('\'')[1]
        else:
            color_name = ''

        size_price_list = []
        for data in product_json.get('formatted_available_sizes_new_v2'):
            size_price_list.append({
                'size': float(data.get('size')),
                'price': float(data.get('price_cents') / 100),
                'isInStock': True
            })

        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_url = product_json.get('original_picture_url')
            local_path = os.path.join('.', 'imgs', platform, '%s.jpg' % number)
            if helper.downloadImg(img_url, local_path) == 1:
                # Upload to Qiniu
                qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number,
                                             './imgs/%s/%s.jpg' % (platform, number))
                img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number],
                                   self.gender, color_value, platform, '5bbf4561c7e854cab45218ba',
                                   self.crawl_counter, color_name, img_downloaded)
        fetched_url_list.append(self.url)
        helper.writeFile(json.dumps(fetched_url_list), './logs/goat-%s.json' % helper.today())
    except Exception as e:
        failures = note_failure()
        helper.log(e, platform)
        if failures < 3:
            self.q.put(self.url)
    finally:
        helper.log('[INFO] %s is done' % self.url, platform)
def run(self):
    """Scrape one Eastbay product page and save it as a pending-goods record.

    Waits two seconds and fetches ``self.url``. Returns without storing
    anything when the page has no name, no SKU or no numeric size list.
    Every size from the inline ``var model`` JSON starts as out of stock
    with price 0.0; sizes listed in the inline ``var sizeObj`` JSON are
    switched to in stock at their ``pr_sale`` price. If the image has not
    been downloaded yet, its name is looked up via the image-set endpoint,
    and it is downloaded and, on success, uploaded to Qiniu.

    Any exception is logged (including the exception itself) and the URL
    is put back on ``self.q`` until it has failed three times.
    """
    def load_image_set(image_set_url):
        # The endpoint answers with JSONP; strip the wrapper before parsing.
        raw = helper.get(image_set_url, returnText=True)
        return json.loads(raw.replace('/*jsonp*/s7jsonResponse(', '').replace(',"");', ''))

    time.sleep(2)
    try:
        pq = helper.get(self.url, myHeaders=self.headers)
        name = pq('h1.product_title').text()
        if not name:
            return
        number = pq('span#productSKU').text()
        if not number:
            return
        color_value = pq('span.attType_color').text()

        html = pq.html()
        json_str = re.compile(r'var\smodel\s=\s.*"\};').findall(html)[0]
        size_arr = json.loads(json_str.replace('var model = ', '').replace('"};', '"}')).get('AVAILABLE_SIZES')
        try:
            size_arr = [float(size) for size in size_arr]
        except (TypeError, ValueError):
            # Non-numeric sizes: not a product this crawler handles.
            return
        if not size_arr:
            return

        json_str = re.compile(r'var\ssizeObj\s=\s.*"\}\];').findall(html)[0]
        available_size_arr = json.loads(json_str.replace('var sizeObj = ', '').replace('"}];', '"}]'))

        size_price_arr = [{'size': size, 'isInStock': False, 'price': 0.00} for size in size_arr]
        # Index the entries by size; the first entry per size is the one updated.
        entry_by_size = {}
        for entry in size_price_arr:
            entry_by_size.setdefault(entry['size'], entry)
        for available_size in available_size_arr:
            entry = entry_by_size.get(float(available_size.get('size')))
            if entry is not None:
                entry['isInStock'] = True
                entry['price'] = float(available_size.get('pr_sale'))

        # Look up and download the image only when it is not already
        # stored, as the other crawlers do.
        img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
        if not img_downloaded:
            img_name = None
            try:
                img_item_arr = load_image_set(
                    'https://images.eastbay.com/is/image/EBFL2/%sMM?req=set,json' % number
                ).get('set').get('item')
                for img_item in img_item_arr:
                    if img_item.get('type') == 'img_set':
                        img_name = img_item.get('set').get('item')[0].get('s').get('n')
                        break
            except Exception:
                try:
                    img_item_arr = load_image_set(
                        'https://images.eastbay.com/is/image/EBFL2/%s?req=set,json' % number
                    ).get('set').get('item')
                    # ``item`` is a list for multi-image sets, else one dict.
                    if isinstance(img_item_arr, list):
                        img_name = img_item_arr[0].get('s').get('n')
                    elif isinstance(img_item_arr, dict):
                        img_name = img_item_arr.get('s').get('n')
                except Exception:
                    img_name = None

            if img_name:
                img_url = 'https://images.eastbay.com/is/image/%s?wid=600&hei=600&fmt=jpg' % img_name
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'eastbay', '%s.jpg' % number))
                if result == 1:
                    # Upload to Qiniu
                    qiniuUploader.upload_2_qiniu('eastbay', '%s.jpg' % number,
                                                 './imgs/eastbay/%s.jpg' % number)
                    img_downloaded = True

        mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number],
                                   self.gender, color_value, 'eastbay', '5b04ff19b0394165bc8de23d',
                                   self.crawl_counter, img_downloaded=img_downloaded)
    except Exception as e:
        global error_detail_url
        error_counter = error_detail_url.get(self.url, 1)
        error_detail_url[self.url] = error_counter + 1
        helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'eastbay')
        # Record the cause as well, so failures can be diagnosed.
        helper.log(e, 'eastbay')
        if error_counter < 3:
            self.q.put(self.url)