Exemple #1
0
 def run(self):
     '''
     Scrape the product page at self.url and store it as a pending good.

     Sleeps two seconds, fetches the page, reads the product name, article
     number and colour, builds one size/price entry per size link, downloads
     and uploads the product image if it is not stored yet, then inserts the
     record through mongo.insert_pending_goods.

     Any exception is logged and counted per URL in error_detail_url; the
     URL is put back on self.q after its first two failures only.
     '''
     global error_detail_url
     time.sleep(2)
     try:
         page = helper.get(self.url, cookie)
         name = page('h1#prodNameId').text()
         number = page('span#supplierArtNumSpan').text()
         color_value = page('span#variantColorId').text()

         size_price_arr = []
         for link in page('div#2SizeContainer > div > a'):
             # The onclick handler carries the call arguments one per line.
             onclick = link.get('onclick')
             onclick = onclick.replace('ProductDetails.changeSizeAffectedLinks(', '')
             onclick = onclick.replace(');', '')
             fields = [piece.strip() for piece in onclick.split('\n')]

             # "'8+'," -> "8+"; a trailing '+' is stored as an extra 0.5.
             size_text = fields[6].replace('\'', '').replace(',', '').replace('Y', '')
             if '+' in size_text:
                 size = float(size_text.replace('+', '')) + 0.5
             else:
                 size = float(size_text)

             # "'115,76 USD'." -> "115.76"
             price_text = fields[2].replace(',', '.')
             price_text = price_text.replace(' USD\'.', '').replace('\'', '')

             size_price_arr.append({
                 'size': size,
                 'price': float(price_text),
                 'isInStock': True
             })

         img_name = '%s.jpg' % number
         img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
         if not img_downloaded:
             img_url = page('img.productDetailPic').attr('src')
             local_path = os.path.join('.', 'imgs', platform, img_name)
             if helper.downloadImg(img_url, local_path) == 1:
                 # Upload to Qiniu only after a successful download.
                 qiniuUploader.upload_2_qiniu(platform, img_name, './imgs/%s/%s.jpg' % (platform, number))
                 img_downloaded = True

         mongo.insert_pending_goods(
             name, number, self.url, size_price_arr, [img_name],
             self.gender, color_value, platform,
             '5bc87d6dc7e854cab4875368', self.crawl_counter,
             img_downloaded=img_downloaded)
     except Exception as e:
         attempts = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = attempts + 1
         helper.log('[ERROR] error timer = %s, url = %s' % (attempts, self.url), platform)
         helper.log(e, platform)
         if attempts < 3:
             self.q.put(self.url)
Exemple #2
0
    def run(self):
        '''
        Scrape one stadiumgoods product page and store it as a pending good.

        Sleeps a random 1.0-3.6 seconds, fetches self.url with self.headers,
        collects the numeric sizes with their prices, reads SKU / gender /
        colorway from the attribute table, downloads and uploads the product
        image if needed and inserts the record via mongo.insert_pending_goods.
        Returns early, storing nothing, when no usable size is found.

        Any exception is logged (message and exception) and counted per URL
        in error_detail_url; the URL is re-queued on self.q after its first
        two failures only.
        '''
        time.sleep(random.uniform(1.0, 3.6))
        try:
            pq = helper.get(self.url, myHeaders=self.headers)
            # Model name: brand followed by the product name.
            name = pq('div.product-brand').text().strip() + ' ' + pq('h1.product-name').text().strip()

            # Labels left after cleaning that are not numeric shoe sizes.
            non_numeric_sizes = ('S', 'M', 'L', 'XL', 'XXL', 'XXXL', '', 'OS')
            size_price_list = []
            for size_span in pq('div.product-sizes__options span.product-sizes__detail'):
                size_span = PyQuery(size_span)
                size = size_span.find('span.product-sizes__size').text().strip()
                if 'K' in size or 'k' in size or '-' in size or 'XS' in size:
                    continue
                # Drop the W/Y/C/* markers attached to the size label.
                size = re.sub(r'[WwYyCc\*]', '', size)
                if size in non_numeric_sizes:
                    continue
                price = size_span.find('span.product-sizes__price').text().strip()
                if price.startswith('$'):
                    size_price_list.append({
                        'size': size,
                        'price': float(price.replace('$', '').replace(',', '')),
                        'isInStock': True
                    })
                else:
                    # No dollar price shown: stored as out of stock.
                    size_price_list.append({
                        'size': size,
                        'price': 0.0,
                        'isInStock': False
                    })
            if not size_price_list:
                return

            # Colorway number (manufacturer SKU), gender code and colour.
            number = ''
            gender = 0
            color_value = ''
            gender_codes = {'Mens': 1, 'Womens': 2}
            for tr in pq('table#product-attribute-specs-table tr'):
                row = PyQuery(tr)
                key = row.find('th').text().strip()
                if key == 'Gender':
                    # Unknown gender labels leave the current value untouched.
                    gender = gender_codes.get(row.find('td').text().strip(), gender)
                elif key == 'Colorway':
                    color_value = row.find('td').text().strip()
                elif key == 'Manufacturer Sku':
                    number = row.find('td').text().strip()

            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
            if not img_downloaded:
                img_url = pq('div.product-gallery-image > img')[0].get('src')
                # Download the image, then upload it to Qiniu on success.
                result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'stadiumgoods', '%s.jpg' % number))
                if result == 1:
                    qiniuUploader.upload_2_qiniu('stadiumgoods', '%s.jpg' % number, './imgs/stadiumgoods/%s.jpg' % number)
                # NOTE(review): flagged as downloaded even when the download
                # did not return 1 - confirm this is intended.
                img_downloaded = True
            mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number], gender, color_value, 'stadiumgoods', '5b8f484b299207efc1fb0904', self.crawl_counter, img_downloaded=img_downloaded)
        except Exception as e:
            global error_detail_url
            error_counter = error_detail_url.get(self.url, 1)
            error_detail_url[self.url] = error_counter + 1
            helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'stadiumgoods')
            # Record the cause as well, not just the failing URL.
            helper.log(e, 'stadiumgoods')
            if error_counter < 3:
                self.q.put(self.url)
Exemple #3
0
 def run(self):
     '''
     Scrape one stockx product and store it as a pending good.

     Sleeps two seconds and fetches self.url. When the page exposes a
     'Style' number, sizes, prices and the image are read from the page
     itself; otherwise the same data is read from the stockx products JSON
     API. The record is inserted via mongo.insert_pending_goods only when a
     style number was found.

     Any exception is logged (message and exception) and counted per URL
     in error_detail_url; the URL is re-queued on self.q after its first
     two failures only.
     '''
     time.sleep(2)
     try:
         pq = helper.get(self.url, platform=platform)
         # Model name.
         name = pq('h1.name').text()
         number = ''
         color_value = ''
         for div in pq('div.detail'):
             div = PyQuery(div)
             key = div.find('span.title').text()
             if key == 'Style':
                 # Colorway number.
                 number = div.find('span')[-1].text.strip()
             elif key == 'Colorway':
                 color_value = div.find('span')[-1].text.strip()

         if number != '':
             select_options = pq('div.select-options')
             if not select_options:
                 # Without the size selector no record can be built. Fail
                 # explicitly so the handler below logs the cause and
                 # retries, instead of tripping over an unbound variable.
                 raise ValueError('size options not found')
             # The selector holds alternating size / price cells.
             size_price_arr = []
             div_list = PyQuery(select_options[0]).find('div.inset div')
             for i in range(0, len(div_list), 2):
                 size_text = div_list[i].text
                 if size_text == 'All':
                     continue
                 price_text = div_list[i + 1].text
                 if price_text == 'Bid':
                     # 'Bid' in the price cell is stored as out of stock.
                     size_price_arr.append({
                         'size': size_text,
                         'price': 0.0,
                         'isInStock': False
                     })
                 else:
                     size_price_arr.append({
                         'size': size_text,
                         'price': float(price_text.replace('US$', '').replace(',', '').strip()),
                         'isInStock': True
                     })
             # Download the image.
             img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
             if not img_downloaded:
                 img_list = pq('div.image-container img')
                 if img_list:
                     img_url = img_list[-1].get('src')
                 else:
                     img_url = pq('div.product-media img').attr('src')
                 # Force w=600 / h=600 in the query string. A URL without
                 # a query string is used unchanged.
                 url_parts = img_url.split('?')
                 if len(url_parts) > 1:
                     query = url_parts[1].split('&')
                     for i, param in enumerate(query):
                         param_name = param.split('=')[0]
                         if param_name == 'w':
                             query[i] = 'w=600'
                         elif param_name == 'h':
                             query[i] = 'h=600'
                     img_url = url_parts[0] + '?' + '&'.join(query)
                 result = helper.downloadImg(img_url, os.path.join('.', 'imgs', platform, '%s.jpg' % number))
                 if result == 1:
                     # Upload to Qiniu.
                     qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, './imgs/%s/%s.jpg' % (platform, number))
                     img_downloaded = True
         else:
             # No style number on the page: fall back to the JSON API, e.g.
             # https://stockx.com/api/products/adidas-human-race-nmd-pharrell-cream?includes=market,360&currency=USD
             size_price_arr = []
             size_price_url = 'https://stockx.com/api/products%s?includes=market,360&currency=USD' % self.url.split('stockx.com')[1]
             json_txt = helper.get(size_price_url, returnText=True)
             json_data = json.loads(json_txt)
             product = json_data.get('Product')
             for product_data in product.get('children').values():
                 market_data = product_data.get('market')
                 size_price_arr.append({
                     'size': product_data.get("shoeSize"),
                     'price': market_data.get("lastSale"),
                     'isInStock': market_data.get("lastSale") > 0
                 })
             number = product.get('styleId')
             color_value = product.get('colorway')
             name = product.get('title')
             if number != '':
                 # Download the image.
                 img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
                 if not img_downloaded:
                     img_url_list = product.get('media').get('360')
                     if len(img_url_list) > 0:
                         img_url = img_url_list[0]
                     else:
                         img_url = product.get('media').get('imageUrl')
                     img_path = os.path.join('.', 'imgs', platform, '%s.jpg' % number)
                     helper.downloadImg(img_url, img_path)
                     # Upload to Qiniu.
                     qiniuUploader.upload_2_qiniu(platform, '%s.jpg' % number, img_path)
                     img_downloaded = True
         if number != '':
             mongo.insert_pending_goods(name, number, self.url, size_price_arr, ['%s.jpg' % number], 0, color_value, platform, '5bace180c7e854cab4dbcc83', self.crawl_counter, img_downloaded=img_downloaded)
     except Exception as e:
         global error_detail_url
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), platform)
         # Record the cause as well, not just the failing URL.
         helper.log(e, platform)
         if error_counter < 3:
             self.q.put(self.url)
Exemple #4
0
    def run(self):
        '''
        Scrape one product page and store it as a pending good.

        Sleeps two seconds, fetches self.url, reads name, style number and
        colour, and builds one entry per size link using the single page
        price. A size is in stock unless its link carries the
        'piunavailable' class. The image is downloaded and uploaded if
        needed, then the record is inserted via mongo.insert_pending_goods.

        Any exception is logged and counted per URL in error_detail_url; the
        URL is re-queued on self.q after its first two failures only.
        '''
        time.sleep(2)
        global platform
        try:
            pq = helper.get(self.url)
            name = pq('h1.product_title').text()
            print('name = %s' % name)
            number = pq('span.pistylevalue').text().strip()
            print('number = %s' % number)
            color_value = pq('span.pithecolor').text().strip()
            print('color_value = %s' % color_value)

            # A missing or unparsable price is stored as 0.0. Thousands
            # separators are removed so "$1,299.00" parses as 1299.0.
            try:
                price = float(
                    pq('div.product_price_content > span.product_price').text()
                    .replace('$', '').replace(',', ''))
            except (AttributeError, TypeError, ValueError):
                price = 0.0

            size_price_arr = []
            for a in pq('div.box_wrapper > a'):
                # A link without a class attribute is treated as available.
                css_class = a.get('class') or ''
                size_price_arr.append({
                    'isInStock': 'piunavailable' not in css_class,
                    'size': a.text,
                    'price': price
                })
            print('size_price_arr = ', size_price_arr)

            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
            if not img_downloaded:
                img_url = pq('img.product_image').attr('src')
                # Relative image paths are resolved against the site root.
                if not img_url.startswith('http'):
                    img_url = 'http://www.jimmyjazz.com' + img_url
                print('img_url = ', img_url)
                if helper.downloadImg(
                        img_url,
                        os.path.join('.', 'imgs', platform,
                                     '%s.jpg' % number)) == 1:
                    # Upload to Qiniu.
                    qiniuUploader.upload_2_qiniu(
                        platform, '%s.jpg' % number,
                        './imgs/%s/%s.jpg' % (platform, number))
                # NOTE(review): flagged as downloaded even when the download
                # did not return 1 - confirm this is intended.
                img_downloaded = True
            mongo.insert_pending_goods(name,
                                       number,
                                       self.url,
                                       size_price_arr, ['%s.jpg' % number],
                                       self.gender,
                                       color_value,
                                       platform,
                                       '5b4b59b6bb8bdb5a84ddee09',
                                       self.crawl_counter,
                                       img_downloaded=img_downloaded)
        except Exception as e:
            global error_detail_url
            error_counter = error_detail_url.get(self.url, 1)
            error_detail_url[self.url] = error_counter + 1
            helper.log(
                '[ERROR] error timer = %s, url = %s' %
                (error_counter, self.url), platform)
            helper.log(e, platform)
            if error_counter < 3:
                self.q.put(self.url)
 def run(self):
     '''
     Scrape one product page and store it as a pending good.

     Sleeps two seconds and fetches self.url; returns immediately when the
     fetch yields nothing. The first two attribute list items are read as
     style number and colour, and each hidden offer block gives one
     size/price entry. The image is downloaded and uploaded if needed, then
     the record is inserted via mongo.insert_pending_goods.

     Any exception is logged and counted per URL in error_detail_url; the
     URL is re-queued on self.q after its first two failures only.
     '''
     time.sleep(2)
     try:
         pq = helper.get(self.url)
         if not pq:
             return
         name = pq('div.nosto_product > span.name').text()
         number = ''
         color_value = ''
         # First list item is the style number, second is the colour.
         for index, li in enumerate(pq('li.attribute-list-item')):
             if index == 0:
                 number = li.text.strip()
             elif index == 1:
                 color_value = li.text.strip()
         size_price_arr = []
         for div in pq('div.hidden > div'):
             price = float(div.find('span').text)
             # The size is the last '_'-separated part of the meta content.
             size = div.find('div').find('meta').get('content').split(
                 '_')[-1]
             size_price_arr.append({
                 'size': size,
                 'price': price,
                 'isInStock': True
             })
         img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
         if not img_downloaded:
             try:
                 img_url = pq('div.mobile-product-image > img.product-img'
                              ).attr('data-src')
             except Exception:
                 img_url = None
             if not img_url:
                 img_url = pq('link.hidden').attr('src')
             result = helper.downloadImg(
                 img_url,
                 os.path.join('.', 'imgs', platform, '%s.jpg' % number))
             if result == 1:
                 # Upload to Qiniu from the directory the download wrote
                 # to (keyed by platform, not a hard-coded site name).
                 qiniuUploader.upload_2_qiniu(
                     platform, '%s.jpg' % number,
                     './imgs/%s/%s.jpg' % (platform, number))
             # NOTE(review): flagged as downloaded even when the download
             # did not return 1 - confirm this is intended.
             img_downloaded = True
         mongo.insert_pending_goods(name,
                                    number,
                                    self.url,
                                    size_price_arr, ['%s.jpg' % number],
                                    self.gender,
                                    color_value,
                                    platform,
                                    '5ac8592c48555b1ba318964a',
                                    self.crawl_counter,
                                    img_downloaded=img_downloaded)
     except Exception as e:
         global error_detail_url
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log(
             '[ERROR] error timer = %s, url = %s' %
             (error_counter, self.url), platform)
         helper.log(e, platform)
         if error_counter < 3:
             self.q.put(self.url)
 def run(self):
     '''
     Scrape one champssports product page and store it as a pending good.

     Sleeps two seconds, fetches self.url with self.headers and reads the
     title and SKU. All sizes come from the page's `var model` JSON
     (AVAILABLE_SIZES); sizes that also appear in `var sizeObj` are stored
     in stock with their pr_sale price, the rest out of stock with price
     0.0. The image is downloaded and uploaded if needed, then the record
     is inserted via mongo.insert_pending_goods.

     Returns silently when the title cannot be read. Any other exception is
     logged (message and exception) and counted per URL in
     error_detail_url; the URL is re-queued on self.q after its first two
     failures only.
     '''
     time.sleep(2)
     try:
         pq = helper.get(self.url, myHeaders=self.headers)
         # Model name.
         try:
             name = pq('span.product_title').text()
         except Exception:
             return
         # Colorway number.
         number = pq('span#productSKU').text()

         html = pq.html()
         # Every size the product exists in.
         model_src = re.compile(r'var model =.*"\};').findall(html)[0]
         model = json.loads(model_src.replace('var model = ', '').replace('"};', '"}'))
         size_all_list = [float(size.strip()) for size in model.get('AVAILABLE_SIZES')]

         # Sizes that are purchasable, with their sale price. Sizes are
         # converted to float here too: comparing the raw size string with
         # a float never matches and would mark every size out of stock.
         size_obj_src = re.compile(r'var sizeObj =.*}];').findall(html)[0]
         size_obj = json.loads(size_obj_src.replace('var sizeObj = ', '').replace('}];', '}]'))
         price_by_size = {}
         for item in size_obj:
             # setdefault keeps the first entry when a size is repeated.
             price_by_size.setdefault(
                 float(item.get('size').strip()),
                 float(item.get('pr_sale').strip()))

         size_price_list = []
         for size in size_all_list:
             size_price_list.append({
                 'size': size,
                 'price': price_by_size.get(size, 0.0),
                 'isInStock': size in price_by_size,
             })

         img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
         if not img_downloaded:
             img_response = helper.get('https://images.champssports.com/is/image/EBFL2/%s?req=imageset,json' % number, returnText=True)
             img_response = re.compile(r'"IMAGE_SET":"\w+/[_\w]+;').findall(img_response)
             img_url = 'https://images.champssports.com/is/image/%s?hei=600&wid=600' % img_response[0].replace('"IMAGE_SET":"', '').replace(';', '')
             # Download the image.
             result = helper.downloadImg(img_url, os.path.join('.', 'imgs', 'champssports', '%s.jpg' % number))
             if result == 1:
                 # Upload to Qiniu.
                 qiniuUploader.upload_2_qiniu('champssports', '%s.jpg' % number, './imgs/champssports/%s.jpg' % number)
                 img_downloaded = True
         mongo.insert_pending_goods(name, number, self.url, size_price_list, ['%s.jpg' % number], self.gender, '', 'champssports', '5af1310e48555b1ba3387bcc', self.crawl_counter, img_downloaded=img_downloaded)
     except Exception as e:
         global error_detail_url
         error_counter = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = error_counter + 1
         helper.log('[ERROR] error timer = %s, url = %s' % (error_counter, self.url), 'champssports')
         # Record the cause as well, not just the failing URL.
         helper.log(e, 'champssports')
         if error_counter < 3:
             self.q.put(self.url)
Exemple #7
0
 def run(self):
     '''
     Scrape one goat.com sneaker page and store it as a pending good.

     Sleeps 3.6 seconds and fetches self.url as text. The product data is
     taken from the `window.__context__` JSON embedded in the page, looked
     up by the slug derived from the URL. After storing the record the URL
     is appended to fetched_url_list, which is written to the daily log
     file.

     An empty response or any exception is logged and counted per URL in
     error_detail_url; the URL is re-queued on self.q after its first two
     failures only. A completion line is logged in every case.
     '''
     global platform
     global error_detail_url
     time.sleep(3.6)
     try:
         slug = self.url.replace('https://www.goat.com/sneakers/', '')
         html = helper.get(self.url, returnText=True, platform=platform)
         if not html:
             # Nothing came back: count the failure and maybe retry.
             attempts = error_detail_url.get(self.url, 1)
             error_detail_url[self.url] = attempts + 1
             helper.log(
                 '[ERROR] error timer = %s, url = %s' %
                 (attempts, self.url), platform)
             if attempts < 3:
                 self.q.put(self.url)
             return

         # Cut the embedded JSON out of the script tag and parse it.
         context_re = re.compile(r'window.__context__.*')
         context_src = context_re.findall(html)[0]
         context_src = context_src.replace('window.__context__ = ', '')
         context_src = context_src.replace('</script>', '')
         store = json.loads(context_src).get('default_store')
         templates = store.get('product-templates')
         product_json = templates.get('slug_map').get(slug)

         name = product_json.get('name')
         number = product_json.get('sku')
         color_value = product_json.get('details')
         # The colour name is the quoted part of the product name, if any.
         if '\'' in name:
             color_name = name.split('\'')[1]
         else:
             color_name = ''

         size_price_list = []
         for data in product_json.get('formatted_available_sizes_new_v2'):
             size = float(data.get('size'))
             # price_cents is converted to whole currency units.
             price = float(data.get('price_cents') / 100)
             size_price_list.append({
                 'size': size,
                 'price': price,
                 'isInStock': True
             })

         img_name = '%s.jpg' % number
         img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
         if not img_downloaded:
             img_url = product_json.get('original_picture_url')
             local_path = os.path.join('.', 'imgs', platform, img_name)
             if helper.downloadImg(img_url, local_path) == 1:
                 # Upload to Qiniu only after a successful download.
                 qiniuUploader.upload_2_qiniu(
                     platform, img_name,
                     './imgs/%s/%s.jpg' % (platform, number))
                 img_downloaded = True

         mongo.insert_pending_goods(
             name, number, self.url, size_price_list, [img_name],
             self.gender, color_value, platform,
             '5bbf4561c7e854cab45218ba', self.crawl_counter, color_name,
             img_downloaded)
         fetched_url_list.append(self.url)
         log_path = './logs/goat-%s.json' % helper.today()
         helper.writeFile(json.dumps(fetched_url_list), log_path)
     except Exception as e:
         attempts = error_detail_url.get(self.url, 1)
         error_detail_url[self.url] = attempts + 1
         helper.log(
             '[ERROR] error timer = %s, url = %s' %
             (attempts, self.url), platform)
         helper.log(e, platform)
         if attempts < 3:
             self.q.put(self.url)
     finally:
         helper.log('[INFO] %s is done' % self.url, platform)
Exemple #8
0
    def run(self):
        '''
        Scrape one eastbay product page and store it as a pending good.

        Sleeps two seconds, fetches self.url with self.headers and reads
        title, SKU and colour. All sizes come from the page's `var model`
        JSON (AVAILABLE_SIZES); sizes that also appear in `var sizeObj` are
        marked in stock with their pr_sale price. The image name is looked
        up through the image-set JSON endpoint, first with the "MM" suffix
        and then without it, and only when the image is not stored yet.

        Returns without storing anything when the title or SKU is empty or
        when the size list is missing, empty or non-numeric. Any exception
        is logged (message and exception) and counted per URL in
        error_detail_url; the URL is re-queued on self.q after its first two
        failures only.
        '''
        time.sleep(2)
        try:
            pq = helper.get(self.url, myHeaders=self.headers)
            name = pq('h1.product_title').text()
            if not name:
                return
            number = pq('span#productSKU').text()
            if not number:
                return
            color_value = pq('span.attType_color').text()

            html = pq.html()
            json_str = re.compile(r'var\smodel\s=\s.*"\};').findall(html)[0]
            size_arr = json.loads(
                json_str.replace('var model = ',
                                 '').replace('"};',
                                             '"}')).get('AVAILABLE_SIZES')
            try:
                size_arr = [float(size) for size in size_arr]
            except (TypeError, ValueError):
                # Missing or non-numeric sizes: nothing to store.
                return
            if not size_arr:
                return

            json_str = re.compile(r'var\ssizeObj\s=\s.*"\}\];').findall(html)[0]
            available_size_arr = json.loads(
                json_str.replace('var sizeObj = ', '').replace('"}];', '"}]'))
            # Start with every size out of stock, then fill in the priced ones.
            size_price_arr = [{
                'size': size,
                'isInStock': False,
                'price': 0.00
            } for size in size_arr]
            for available_size in available_size_arr:
                tmp_size = float(available_size.get('size'))
                for size_price in size_price_arr:
                    if tmp_size == size_price.get('size'):
                        size_price['isInStock'] = True
                        size_price['price'] = float(
                            available_size.get('pr_sale'))
                        break

            img_downloaded = mongo.is_pending_goods_img_downloaded(self.url)
            if not img_downloaded:
                # Look the image up only when it still has to be fetched.
                img_url = None
                img_json_str = helper.get(
                    'https://images.eastbay.com/is/image/EBFL2/%sMM?req=set,json' %
                    number,
                    returnText=True)
                try:
                    img_json = json.loads(
                        img_json_str.replace('/*jsonp*/s7jsonResponse(',
                                             '').replace(',"");', ''))
                    for img_item in img_json.get('set').get('item'):
                        if img_item.get('type') == 'img_set':
                            img_url = img_item.get('set').get('item')[0].get(
                                's').get('n')
                            break
                except Exception:
                    # The "MM" set could not be parsed: retry with the
                    # plain SKU, whose 'item' is a list or a single dict.
                    img_json_str = helper.get(
                        'https://images.eastbay.com/is/image/EBFL2/%s?req=set,json'
                        % number,
                        returnText=True)
                    try:
                        img_json = json.loads(
                            img_json_str.replace('/*jsonp*/s7jsonResponse(',
                                                 '').replace(',"");', ''))
                        img_item_arr = img_json.get('set').get('item')
                        if isinstance(img_item_arr, list):
                            img_url = img_item_arr[0].get('s').get('n')
                        elif isinstance(img_item_arr, dict):
                            img_url = img_item_arr.get('s').get('n')
                    except Exception:
                        img_url = None
                if img_url:
                    img_url = 'https://images.eastbay.com/is/image/%s?wid=600&hei=600&fmt=jpg' % img_url
                    result = helper.downloadImg(
                        img_url,
                        os.path.join('.', 'imgs', 'eastbay', '%s.jpg' % number))
                    if result == 1:
                        # Upload to Qiniu.
                        qiniuUploader.upload_2_qiniu(
                            'eastbay', '%s.jpg' % number,
                            './imgs/eastbay/%s.jpg' % number)
                    # NOTE(review): flagged as downloaded even when the
                    # download did not return 1 - confirm this is intended.
                    img_downloaded = True
            mongo.insert_pending_goods(name,
                                       number,
                                       self.url,
                                       size_price_arr, ['%s.jpg' % number],
                                       self.gender,
                                       color_value,
                                       'eastbay',
                                       '5b04ff19b0394165bc8de23d',
                                       self.crawl_counter,
                                       img_downloaded=img_downloaded)
        except Exception as e:
            global error_detail_url
            error_counter = error_detail_url.get(self.url, 1)
            error_detail_url[self.url] = error_counter + 1
            helper.log(
                '[ERROR] error timer = %s, url = %s' %
                (error_counter, self.url), 'eastbay')
            # Record the cause as well, not just the failing URL.
            helper.log(e, 'eastbay')
            if error_counter < 3:
                self.q.put(self.url)