def start(action):
    """Crawl champssports.com men's and women's shoe listing pages.

    Runs only when *action* == 'common'. Listing URLs that fail are pushed
    onto an error queue and retried until the queue drains.
    """
    if action == 'common':
        crawl_counter = mongo.get_crawl_counter('champssports')
        # Queue used to hand data fetched by worker processes back to us
        q = Queue()
        # Queue of listing-page URLs that failed and need retrying
        error_page_url_queue = Queue()
        total_page = 16
        # Reference API endpoint (not used by this function):
        # https://www.champssports.com/api/products/search?products=&query=%3Arelevance%3Agender%3A200000%3AproductType%3A200005&currentPage=1&pageSize=60&timestamp=0
        base_url = 'https://www.champssports.com/Mens/Shoes/_-_/N-24Zrj?cm_PAGE=%d&Rpp=180&crumbs=991&Nao=%d'
        # 180 items per page; both URL params take the item offset
        fetch_page([base_url % ((page - 1) * 180, (page - 1) * 180) for page in range(1, total_page + 1)],
                   1, q, error_page_url_queue, crawl_counter)
        total_page = 7
        # FIX: the women's URL originally ended in 'Nap=%d'; the paging offset
        # parameter on this site family is 'Nao' (see the men's URL above and
        # the sibling eastbay spider), so 'Nap' was a typo that dropped paging.
        base_url = 'https://www.champssports.com/Womens/Shoes/_-_/N-25Zrj?cm_PAGE=%d&Rpp=180&crumbs=991&Nao=%d'
        fetch_page([base_url % ((page - 1) * 180, (page - 1) * 180) for page in range(1, total_page + 1)],
                   2, q, error_page_url_queue, crawl_counter)
        # Retry failed listing pages until no more failures are reported
        while not error_page_url_queue.empty():
            error_page_url_list = []
            while not error_page_url_queue.empty():
                error_page_url_list.append(error_page_url_queue.get())
            error_page_men_url_list = [url_data.get('url') for url_data in error_page_url_list
                                       if url_data.get('gender') == 1]
            fetch_page(error_page_men_url_list, 1, q, error_page_url_queue, crawl_counter)
            error_page_women_url_list = [url_data.get('url') for url_data in error_page_url_list
                                         if url_data.get('gender') == 2]
            fetch_page(error_page_women_url_list, 2, q, error_page_url_queue, crawl_counter)
        helper.log('done', 'champssports')
def start():
    """Crawl kickz.com listing pages for men's and women's/kids' shoes."""
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue that collects data produced by the fetch workers
    result_q = Queue()
    # Queue that collects listing-page URLs which failed to load
    failed_q = Queue()
    # Fetch a session cookie first; subsequent listing requests need JSESSIONID
    _, tmp_cookie = helper.get(
        'https://www.kickz.com/us/men/shoes/c',
        myHeaders={'User-Agent': 'Mozilla/5.0'},
        withCookie=True)
    global cookie
    cookie['JSESSIONID'] = tmp_cookie.get('JSESSIONID', '')
    men_pages = 20
    fetch_page(
        ['https://www.kickz.com/us/men/shoes/c?selectedPage=%d' % page
         for page in range(1, men_pages + 1)],
        1, result_q, failed_q, crawl_counter)
    women_pages = 17
    fetch_page(
        ['https://www.kickz.com/us/kids,women/shoes/shoe-sizes/38+,36-2:3,40+,37+,41+,39-1:3,35+,36,36+,39+,39,37,38,41-1:3,42,41,40,39:40,38-2:3,40-2:3,35:36,37:38,37-1:3,41:42/c?selectedPage=%d' % page
         for page in range(1, women_pages + 1)],
        2, result_q, failed_q, crawl_counter)
    helper.log('done', platform)
def start(action):
    """Crawl jimmyjazz.com footwear listings for men and women."""
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue for data gathered by the worker processes
    data_q = Queue()
    # Queue for listing-page URLs that failed to load
    failed_q = Queue()
    if action == 'common':
        fetch_page('http://www.jimmyjazz.com/mens/footwear', data_q, crawl_counter, 1, failed_q)
        fetch_page('http://www.jimmyjazz.com/womens/footwear', data_q, crawl_counter, 2, failed_q)
    helper.log('done', platform)
def start_spider():
    """Crawl Nike's gridwall API for men's and women's shoes.

    The API serves 60 products per page, so the last page index is
    ceil(total_num / 60).
    """
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue used to hand data fetched by worker processes back to us
    q = Queue()
    # Queue of page URLs that failed and need retrying
    error_page_url_queue = Queue()
    total_num = 781  # product count reported by the site for men's shoes
    # FIX: the original used range(1, int(math.ceil(total_num / 60 + 0.5)))
    # which stops one page early (781 items -> pages 1..13 only, so the 14th,
    # partial page was never fetched). Use ceil(n / 60) pages inclusively.
    # 60.0 keeps the division float-valued under Python 2 as well.
    last_page = int(math.ceil(total_num / 60.0))
    url_list = ['https://store.nike.com/html-services/gridwallData?country=US&lang_locale=en_US&gridwallPath=mens-shoes/7puZoi3&pn=%d' % page
                for page in range(1, last_page + 1)]
    fetch_page(url_list, 1, q, error_page_url_queue, crawl_counter)
    total_num = 616  # product count reported for women's shoes
    last_page = int(math.ceil(total_num / 60.0))
    url_list = ['https://store.nike.com/html-services/gridwallData?country=US&lang_locale=en_US&gridwallPath=womens-shoes/7ptZoi3&pn=%d' % page
                for page in range(1, last_page + 1)]
    fetch_page(url_list, 2, q, error_page_url_queue, crawl_counter)
def start():
    """Crawl footlocker.com men's shoe search pages."""
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue used to hand data fetched by worker processes back to us
    q = Queue()
    # Queue of page URLs that failed and need retrying
    error_page_url_queue = Queue()
    # total_page = 86
    total_page = 1
    # FIX: the query string originally ended in 'brand-asc×tamp=2' — an
    # HTML-entity mangling of '&timestamp' ('&times' rendered as '×').
    # Restored to '&sort=brand-asc&timestamp=2'.
    url_list = [
        'https://www.footlocker.com/api/products/search?currentPage=' + str(page - 1) +
        '&pageSize=60&query=Men%27s%20Shoes%20%20%20%20&sort=brand-asc&timestamp=2'
        for page in range(1, total_page + 1)
    ]
    fetch_page(url_list, 1, q, error_page_url_queue, crawl_counter)
def start(action):
    """Crawl eastbay.com men's and women's listing pages, retrying failures."""
    if action != 'common':
        return
    crawl_counter = mongo.get_crawl_counter('eastbay')
    # Data produced by the fetch workers lands here
    data_q = Queue()
    # Listing-page URLs that failed to load land here for retrying
    failed_q = Queue()

    def offsets(pages):
        # Each listing page shows 180 items; both URL params take the offset.
        return [(p - 1) * 180 for p in range(1, pages + 1)]

    men_urls = ['https://www.eastbay.com/Mens/_-_/N-1p?cm_PAGE=%d&Rpp=180&crumbs=61&Nao=%d' % (off, off)
                for off in offsets(143)]
    fetch_page(men_urls, 1, data_q, failed_q, crawl_counter)
    women_urls = ['https://www.eastbay.com/Womens/_-_/N-1q?cm_PAGE=%d&Rpp=180&crumbs=61&Nao=%d' % (off, off)
                  for off in offsets(56)]
    fetch_page(women_urls, 2, data_q, failed_q, crawl_counter)
    # Keep retrying failed pages until the failure queue stays empty
    while not failed_q.empty():
        pending = []
        while not failed_q.empty():
            pending.append(failed_q.get())
        for gender in (1, 2):
            retry_urls = [item.get('url') for item in pending if item.get('gender') == gender]
            fetch_page(retry_urls, gender, data_q, failed_q, crawl_counter)
    helper.log('done', 'eastbay')
def start(action):
    """Crawl flightclub.com men's and women's listing pages, retrying failures."""
    if action != 'common':
        return
    crawl_counter = mongo.get_crawl_counter(platform)
    # Results produced by the fetch workers
    data_q = Queue()
    # Listing-page URLs that failed, queued for retry
    failed_q = Queue()
    fetch_page(['https://www.flightclub.com/men?id=446&limit=90&p=' + str(p)
                for p in range(1, 71)],
               1, data_q, failed_q, crawl_counter)
    fetch_page(['https://www.flightclub.com/women?id=350&limit=90&p=' + str(p)
                for p in range(1, 5)],
               2, data_q, failed_q, crawl_counter)
    # Drain and retry failed pages until no more failures are reported
    while not failed_q.empty():
        pending = []
        while not failed_q.empty():
            pending.append(failed_q.get())
        for gender in (1, 2):
            retry_urls = [d.get('url') for d in pending if d.get('gender') == gender]
            fetch_page(retry_urls, gender, data_q, failed_q, crawl_counter)
    helper.log('done', platform)
def start(action):
    """Crawl stadiumgoods.com brand/category listing pages, retrying failures."""
    if action != 'common':
        return
    crawl_counter = mongo.get_crawl_counter('stadiumgoods')
    # Results produced by the fetch workers
    data_q = Queue()
    # Listing-page URLs that failed, queued for retry
    failed_q = Queue()
    # (category path, page count) — each listing page shows 96 items
    sections = (('nike', 37), ('air-jordan', 17), ('adidas', 14), ('footwear', 10))
    for path, pages in sections:
        urls = ['https://www.stadiumgoods.com/%s/page/%d/show/96' % (path, page)
                for page in range(1, pages + 1)]
        fetch_page(urls, data_q, failed_q, crawl_counter)
    # Retry failed pages until the failure queue stays empty
    while not failed_q.empty():
        retry_urls = []
        while not failed_q.empty():
            retry_urls.append(failed_q.get())
        fetch_page(retry_urls, data_q, failed_q, crawl_counter)
    helper.log('done', 'stadiumgoods')
def start_hot():
    """Crawl stockx.com tag-filtered browse endpoints for 'hot' sneakers and
    fetch every product not yet in the database.

    Walks each browse URL page by page; products with a lowest-ask price whose
    style number is unknown to Mongo are queued, then the queue is drained by
    batches of 5 GoodsSpider threads.
    """
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue of product URLs still to be crawled
    q = Queue()
    # FIX: several URLs below were corrupted by HTML-entity mangling:
    # '&curren' had been rendered as '¤' (so '&currency=AUD'/'&currency=CAD'
    # read '¤cy=AUD'/'¤cy=CAD'), and one URL began with 'Https://'.
    # All are restored here.
    url_list = [
        'https://stockx.com/api/browse?_tags=one%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=two%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=three%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=four%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=five%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=six%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=seven%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=eight%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=nine%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=ten%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=eleven%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twelve%2Cair%20jordan&currency=AUD&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=thirteen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=fourteen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=fifteen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=sixteen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=seventeen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=eighteen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=nineteen%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-one%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-two%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-three%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-four%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-five%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-six%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-seven%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-eight%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=twenty-nine%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=thirty%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=thirty-one%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=packs%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=other%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=spizike%2Cair%20jordan&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=foamposite%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=kd%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=kobe%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=lebron%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=air%20force%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=air%20max%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=nike%20basketball%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=nike%20sb%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=nike%20other%2Cnike&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=yeezy%2Cadidas&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=ultra%20boost%2Cadidas&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=nmd%2Cadidas&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=iniki%2Cadidas&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=other%2Cadidas&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=asics&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=diadora&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=new%20balance&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=puma&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=under%20armour&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=vans&currency=CAD&productCategory=sneakers&page=',
        'https://stockx.com/api/browse?_tags=converse&productCategory=sneakers&page=',
    ]
    for url in url_list:
        page = 1
        total_page = 1
        while page <= total_page:
            html = helper.get(url + str(page), returnText=True)
            json_data = json.loads(html)
            pagination = json_data.get('Pagination')
            total_page = pagination.get('lastPage')
            product_list = json_data.get('Products')
            for product in product_list:
                # Only products that currently have a lowest ask are of interest
                price = product.get('market').get('lowestAsk', None)
                if price:
                    number = product.get('styleId')
                    # add_hot_platform_with_number returns falsy when the style
                    # number is not yet stored -> queue it for crawling
                    if not mongo.add_hot_platform_with_number('5bace180c7e854cab4dbcc83', number):
                        q.put('https://stockx.com/' + product.get('urlKey'))
                        helper.log('not in db... \nurl => ' + product.get('urlKey') + ' number => ' + number, 'stockx')
            page += 1
    # Crawl the queued products, at most 5 GoodsSpider threads at a time
    while True:
        queue_size = q.qsize()
        if queue_size > 0:
            goods_thread_list = []
            for i in range(5 if queue_size > 5 else queue_size):
                goods_spider = GoodsSpider(q.get(), q, crawl_counter)
                goods_spider.start()
                goods_thread_list.append(goods_spider)
            for t in goods_thread_list:
                t.join()
            goods_thread_list = []
        else:
            break
def start_spider():
    """Query stockx's Algolia search index for every keyword in ./keyword.json
    and crawl the products found.

    For each keyword the 'products' index is queried page by page; every hit's
    product URL is queued, and the queue is then drained by batches of 5
    GoodsSpider threads.
    """
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue of product URLs still to be crawled
    q = Queue()
    # FIX: read the keyword file with a context manager so the handle is
    # closed even if read() raises (original used open/read/close).
    with open('./keyword.json') as f:
        txt = f.read()
    key_list = json.loads(txt)
    # De-duplicate keywords
    key_list = helper.delRepeat(key_list)
    for key in key_list:
        helper.log('key = ' + key, platform)
        page = 0
        total_page = 1
        while page < total_page:
            time.sleep(2)  # throttle: be gentle with the Algolia API
            data = {
                "params": "query=" + key.replace(' ', '%20').replace('/', '%2F') +
                          "&facets=*&filters=product_category%3A%22sneakers%22&page=" + str(page)
            }
            headers = {
                'accept': 'application/json',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'en-US,en;q=0.9',
                'Connection': 'keep-alive',
                'content-type': 'application/x-www-form-urlencoded',
                'Host': 'xw7sbct9v6-2.algolianet.com',
                'Origin': 'https://stockx.com',
                'Referer': 'https://stockx.com/',
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36 OPR/56.0.3051.104',
            }
            html = helper.post(
                'https://xw7sbct9v6-2.algolianet.com/1/indexes/products/query?x-algolia-agent=Algolia%20for%20vanilla%20JavaScript%203.30.0&x-algolia-application-id=XW7SBCT9V6&x-algolia-api-key=6bfb5abee4dcd8cea8f0ca1ca085c2b3',
                None, headers, returnText=True, platform=platform, json=data, timeout=60)
            json_data = json.loads(html)
            total_page = json_data.get('nbPages', 1)
            nb_hits = json_data.get('nbHits', 0)
            if nb_hits < 1:
                # No results at all for this keyword -> next keyword
                helper.log('no hit key = ' + key, platform)
                break
            for hit in json_data.get('hits'):
                q.put('https://stockx.com/' + hit.get('url'))
            page += 1
    # Crawl the queued products, at most 5 GoodsSpider threads at a time
    while True:
        queue_size = q.qsize()
        if queue_size > 0:
            goods_thread_list = []
            for i in range(5 if queue_size > 5 else queue_size):
                goods_spider = GoodsSpider(q.get(), q, crawl_counter)
                goods_spider.start()
                goods_thread_list.append(goods_spider)
            for t in goods_thread_list:
                t.join()
            goods_thread_list = []
        else:
            break
def start(action):
    """Crawl finishline.com men's and women's shoe listings, retrying failures."""
    if action != 'common':
        return
    crawl_counter = mongo.get_crawl_counter('finishline')
    # Results produced by the fetch workers
    data_q = Queue()
    # Failed listing pages, queued for retry
    failed_q = Queue()
    # Extra request parameters per gender, passed straight to fetch_page
    men_params = {
        'mnid': 'men_shoes',
        'Ns': 'sku.bestSeller | 1',
        'isAjax': 'true'
    }
    women_params = {
        'mnid': 'women_shoes',
        'isAjax': 'true',
    }
    # Each listing page shows 40 items; 'No' carries the item offset
    men_base = 'https://www.finishline.com/store/men/shoes/_/N-1737dkj?mnid=men_shoes&Ns=sku.daysAvailable%7C0&isAjax=true&No='
    fetch_page([{'url': men_base + str((p - 1) * 40), 'count': (p - 1) * 40}
                for p in range(1, 36)],
               1, data_q, failed_q, men_params, crawl_counter)
    women_base = 'https://www.finishline.com/store/women/shoes/_/N-1hednxh?mnid=women_shoes&isAjax=true&No='
    fetch_page([{'url': women_base + str((p - 1) * 40), 'count': (p - 1) * 40}
                for p in range(1, 24)],
               2, data_q, failed_q, women_params, crawl_counter)
    # Retry failed pages until the failure queue stays empty
    while not failed_q.empty():
        pending = []
        while not failed_q.empty():
            pending.append(failed_q.get())
        for gender, params in ((1, men_params), (2, women_params)):
            retry = [{'url': d.get('url'), 'count': d.get('count')}
                     for d in pending if d.get('gender') == gender]
            fetch_page(retry, gender, data_q, failed_q, params, crawl_counter)
    helper.log('done', 'finishline')
def start(action):
    """Crawl goat.com search results for every keyword in ./keyword.json.

    Each keyword is queried for men's (1), women's (2) and youth (5) shoes,
    with both price sort orders, so listing caps cannot hide products.
    """
    # Load today's already-fetched URL list so work is not repeated
    global fetched_url_list
    json_txt = helper.readFile('./logs/goat-%s.json' % helper.today())
    # FIX: narrowed the original bare 'except:' — a bare except also swallows
    # SystemExit/KeyboardInterrupt. json.loads raises ValueError on bad
    # content (JSONDecodeError is a ValueError subclass).
    try:
        if json_txt:
            fetched_url_list = json.loads(json_txt)
    except ValueError:
        fetched_url_list = []
    # FIX: read the keyword file with a context manager so the handle is
    # closed even if read() raises (original used open/read/close).
    with open('./keyword.json') as f:
        txt = f.read()
    key_list = json.loads(txt)
    # De-duplicate keywords
    # key_list = list(set(key_list))
    key_list = helper.delRepeat(key_list)
    crawl_counter = mongo.get_crawl_counter(platform)
    # Queue used to hand data fetched by worker threads back to us
    q = Queue()
    # Queue of failed page URLs
    error_page_url_queue = Queue()
    # TODO: key_list = ['DUNK']
    for key in key_list:
        key = key.replace('\n', '')
        helper.log('[INFO] now key = ' + key, platform)
        # (gender code, sort order) combinations, crawled in the original
        # order: men, women, youth — each cheapest-first then dearest-first
        for gender, order in ((1, 'PRICE_LOW_HIGH'), (1, 'PRICE_HIGH_LOW'),
                              (2, 'PRICE_LOW_HIGH'), (2, 'PRICE_HIGH_LOW'),
                              (5, 'PRICE_LOW_HIGH'), (5, 'PRICE_HIGH_LOW')):
            if fetch_page(gender, order, key, q, error_page_url_queue, crawl_counter):
                helper.log('[INFO] => fetch_page is done, %d, %s' % (gender, order), platform)
    helper.log('done', platform)