import re

from pyquery import PyQuery


def crawlProducts(start_url, limit=999):
    current_url = start_url
    products = []
    page_count = 1
    total_count = 0  # running count of all products seen
    while True:
        content = fetchPageWithUrl(current_url)
        print('fetch page %s' % page_count)
        if not content:
            break
        doc = PyQuery(content)
        # Product cells live in the offer-list widget; the commented-out
        # selector is an older variant of the same container.
        nodeList = PyQuery(
            doc('div[data-tracelog-exp="wp_widget_offerhand_main_disp"]').eq(0)
        )('ul.offer-list-row > li')
        # PyQuery(doc('div.common-column-150').eq(0))('ul.offer-list-row > li')  # common-column-220
        if len(nodeList) < 4:  # debug: unexpectedly short page
            print(len(nodeList))
        for num, node in enumerate(nodeList):
            nodeQ = PyQuery(node)
            name = nodeQ('div.title > a').attr('title')
            product_url = process_url(nodeQ('div.title > a').attr('href'))
            MOQ = 0
            try:
                # Read the minimum order quantity (MOQ) from the first rung
                # of the price ladder on the product detail page.
                p_content = fetchPageWithUrl(product_url)
                p_doc = PyQuery(p_content)
                MOQ = extractNum(
                    p_doc('tr.amount > td.ladder-1-1 > span.value').text())
                if not MOQ:
                    # Fallback: first quantity cell of the amount row.
                    MOQ = extractNum(
                        PyQuery(
                            p_doc('tr.amount').remove('td.amount-title')
                            .children('td').eq(0))('span.value').text())
            except Exception:
                pass  # leave MOQ at 0 if the detail page can't be parsed
            img_url = 'http:' + nodeQ('div.image > a > img').attr(
                'data-lazy-load-src')
            price = nodeQ('div.price').text()
            # Collect the class names of the attribute badges as tags.
            tags = ''
            for tagNode in nodeQ('div.attributes > span'):
                tags = tags + ' ' + PyQuery(tagNode).attr('class')
            sold = extractNum(nodeQ('div.booked-count').text())
            total_count += 1
            products.append([
                name, product_url, img_url, price, tags, sold, page_count,
                num + 1, total_count, MOQ
            ])
        next_url = parse_next_url(doc)
        if not next_url:
            break
        current_url = process_url(next_url)
        page_count += 1
        if page_count > limit:
            break
    return products
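# The helpers fetchPageWithUrl, process_url, extractNum, and parse_next_url
# are used throughout but not defined in this section. Below is a minimal
# sketch of plausible implementations, assuming fetchPageWithUrl is a plain
# HTTP GET, process_url fixes protocol-relative links, extractNum pulls the
# first integer out of a text snippet, and parse_next_url reads the pager's
# "next" link (the 'a.next' selector is a guess); the real versions may
# differ.
import requests


def fetchPageWithUrl(url):
    # Assumed: plain GET with a desktop User-Agent; None on any failure.
    try:
        resp = requests.get(
            url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
        return resp.text if resp.status_code == 200 else None
    except requests.RequestException:
        return None


def process_url(url):
    # Assumed: normalize protocol-relative links ("//detail.1688.com/...").
    if url and url.startswith('//'):
        return 'http:' + url
    return url


def extractNum(text):
    # Assumed: first run of digits in the text, e.g. "1234 deals" -> 1234.
    match = re.search(r'\d+', text or '')
    return int(match.group()) if match else 0


def parse_next_url(doc):
    # Assumed: href of the pager's "next page" link, if present.
    return doc('a.next').attr('href')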
def get_img_urls(content):
    if not content:
        return []
    url_list = []
    doc = PyQuery(content)
    # Gallery thumbnails: swap the 60x60 size for the 400x400 variant.
    nodeList = doc('li.tab-trigger > div.vertical-img > a.box-img > img')
    for node in nodeList:
        url = PyQuery(node).attr('src')
        if not url:
            continue
        if url.find('60x60') > 0:
            url = url.replace('60x60', '400x400')
        url_list.append(url)
    needDescImg = True
    if needDescImg:
        # The long product description is lazy-loaded from a separate URL.
        link_url = doc('div#desc-lazyload-container').attr('data-tfs-url')
        if not link_url:
            return url_list
        desc_content = fetchPageWithUrl(link_url)
        # Extract each <img ...> tag from the description markup.
        imgNodes = re.findall('<img[^<>]*>', desc_content)
        # desc_content = re.sub('var[\s]*offer_details[\s]*=[\s]*', '', desc_content)
        for node in imgNodes:
            nodeQ = PyQuery(node)
            desc_url = nodeQ('img').attr('src')
            if desc_url:
                desc_url = desc_url.replace('\\"', '')
            if not desc_url:
                continue
            if 'gif' in desc_url:  # skip GIF images
                continue
            # if '//gd' in desc_url or '/2015/' in desc_url:
            url_list.append(desc_url)
    return url_list
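# Usage sketch (not part of the original module): fetch a product detail
# page, extract its image URLs with get_img_urls, and download each file.
# The output directory and filename scheme are placeholders.
import os


def download_product_images(product_url, out_dir='images'):
    content = fetchPageWithUrl(product_url)
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    saved = []
    for i, img_url in enumerate(get_img_urls(content)):
        data = requests.get(img_url, timeout=10).content
        path = os.path.join(out_dir, '%03d.jpg' % i)
        with open(path, 'wb') as f:
            f.write(data)
        saved.append(path)
    return saved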
def get_categories(url):
    content = fetchPageWithUrl(url)
    cate_list = []
    if content:
        doc = PyQuery(content)
        categoryNodeList = doc('div.wp-category-nav-unit > ul > li > a')
        for node in categoryNodeList:
            nodeQ = PyQuery(node)
            cate_url = process_url(nodeQ.attr('href'))
            num = nodeQ('span').text()  # product count shown in the nav
            name = nodeQ.remove('span').text()  # link text minus the count
            # Switch the listing from catalog view to window (grid) view.
            cate_url = cate_url.replace('showType=catalog', 'showType=window')
            cate_list.append([cate_url, name, num])
    return cate_list
def get_categories_url(home_url):
    content = fetchPageWithUrl(home_url)
    doc = PyQuery(content)
    url = doc('a.show-category').attr('href')
    return process_url(url)
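# End-to-end usage sketch (assumed, not from the original module): resolve a
# shop's category listing from its home page, then crawl every category.
# The home URL below is a placeholder.
if __name__ == '__main__':
    home_url = 'http://example.1688.com'  # placeholder shop home page
    categories_url = get_categories_url(home_url)
    for cate_url, name, num in get_categories(categories_url):
        print('category %s (%s items)' % (name, num))
        products = crawlProducts(cate_url, limit=5)
        print('crawled %d products' % len(products))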