import re

from pyquery import PyQuery


def crawlProducts(start_url, limit=999):
    current_url = start_url
    products = []
    page_count = 1
    total_count = 0  # total number of products collected
    while True:
        content = fetchPageWithUrl(current_url)
        print('fetch page %s' % page_count)
        if not content:
            break
        doc = PyQuery(content)
        # Product cells live in the main offer widget; alternative layouts
        # use div.common-column-150 / div.common-column-220 containers.
        nodeList = PyQuery(
            doc('div[data-tracelog-exp="wp_widget_offerhand_main_disp"]').eq(0)
        )('ul.offer-list-row > li')
        if len(nodeList) < 4:
            # A suspiciously short page usually means the selector missed.
            print(len(nodeList))
        for num, node in enumerate(nodeList):
            nodeQ = PyQuery(node)
            name = nodeQ('div.title > a').attr('title')
            product_url = process_url(nodeQ('div.title > a').attr('href'))
            # Minimum order quantity (MOQ): read from the first price-ladder
            # cell, falling back to the first quantity column if that fails.
            MOQ = 0
            try:
                p_content = fetchPageWithUrl(product_url)
                p_doc = PyQuery(p_content)
                MOQ = extractNum(
                    p_doc('tr.amount > td.ladder-1-1 > span.value').text())
                if not MOQ:
                    MOQ = extractNum(
                        PyQuery(
                            p_doc('tr.amount').remove('td.amount-title').
                            children('td').eq(0))('span.value').text())
            except Exception:
                pass  # leave MOQ at 0 if the product page cannot be parsed
            img_url = "http:" + nodeQ('div.image > a > img').attr(
                'data-lazy-load-src')
            price = nodeQ('div.price').text()
            # Promotion tags are encoded in the class names of the badges.
            tags = ''
            for tagNode in nodeQ('div.attributes > span'):
                tags += ' ' + (PyQuery(tagNode).attr('class') or '')
            sold = extractNum(nodeQ('div.booked-count').text())
            total_count += 1
            products.append([
                name, product_url, img_url, price, tags, sold, page_count,
                num + 1, total_count, MOQ
            ])
        next_url = parse_next_url(doc)
        if not next_url:
            break
        current_url = process_url(next_url)
        page_count += 1
        if page_count > limit:
            break
    return products
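# The functions in this listing assume several helpers defined elsewhere in
# the scraper. Below is a minimal sketch of plausible implementations,
# reconstructed only from how they are called above; treat the details
# (timeouts, the 'a.next' selector, URL schemes) as assumptions, not the
# original code.
import requests


def fetchPageWithUrl(url):
    # Fetch a page and return its body text, or None on any network error.
    try:
        return requests.get(url, timeout=10).text
    except requests.RequestException:
        return None


def process_url(url):
    # Normalize scheme-relative links ('//host/path') to absolute URLs.
    if url and url.startswith('//'):
        return 'http:' + url
    return url


def extractNum(text):
    # Pull the first integer out of a text fragment such as '100 pcs'.
    match = re.search(r'\d+', text or '')
    return int(match.group()) if match else 0


def parse_next_url(doc):
    # Read the 'next page' link from the pagination widget, if present.
    return doc('a.next').attr('href')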
def get_img_urls(content):
    if not content:
        return []
    url_list = []
    doc = PyQuery(content)
    nodeList = doc('li.tab-trigger > div.vertical-img > a.box-img > img')
    for node in nodeList:
        url = PyQuery(node).attr('src')
        if not url:
            continue
        if '60x60' in url:
            # Swap the thumbnail size for the larger 400x400 variant.
            url = url.replace('60x60', '400x400')
            url_list.append(url)
    needDescImg = True  # also collect images from the lazy-loaded description
    if needDescImg:
        link_url = doc('div#desc-lazyload-container').attr('data-tfs-url')
        if not link_url:
            return url_list
        desc_content = fetchPageWithUrl(link_url)
        # Extract <img> tags from the raw description payload.
        imgNodes = re.findall('<img[^<>]*>', desc_content)
        #desc_content = re.sub('var[\s]*offer_details[\s]*=[\s]*', '', desc_content)
        for node in imgNodes:
            nodeQ = PyQuery(node)
            desc_url = nodeQ('img').attr('src')
            if desc_url:
                # The payload is JavaScript, so src values carry escaped quotes.
                desc_url = desc_url.replace('\\"', '')
            if not desc_url:
                continue
            if 'gif' in desc_url:  # skip GIF images (spacers and animations)
                continue
            #if '//gd' in desc_url or '/2015/' in desc_url:
            url_list.append(desc_url)
    return url_list
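# A quick self-contained sketch of the description-image scan used above; the
# HTML fragment and CDN host are made up for illustration.
def demo_desc_images():
    sample = ('<p>intro</p>'
              '<img src="//cdn.example.com/photo.jpg">'
              '<img src="//cdn.example.com/spacer.gif">')
    urls = []
    for tag in re.findall('<img[^<>]*>', sample):
        src = PyQuery(tag).attr('src')
        if src and 'gif' not in src:  # same GIF filter as get_img_urls
            urls.append(src)
    return urls  # -> ['//cdn.example.com/photo.jpg']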
def get_categories(url):
    content = fetchPageWithUrl(url)
    cate_list = []
    if content:
        doc = PyQuery(content)
        categoryNodeList = doc("div.wp-category-nav-unit > ul > li > a")
        for node in categoryNodeList:
            nodeQ = PyQuery(node)
            cate_url = process_url(nodeQ.attr('href'))
            num = nodeQ('span').text()  # product count shown in the badge
            name = nodeQ.remove('span').text()  # link text minus the badge
            cate_url = cate_url.replace("showType=catalog", "showType=window")
            cate_list.append([cate_url, name, num])
    return cate_list
def get_categories_url(home_url):
    # Resolve the shop home page to its 'all categories' page URL.
    content = fetchPageWithUrl(home_url)
    doc = PyQuery(content)
    url = doc('a.show-category').attr('href')
    return process_url(url)
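# A hedged end-to-end sketch of how these pieces fit together: resolve the
# shop's category listing, crawl each category's products, then collect the
# product images. The limit of 5 pages per category is a placeholder.
def crawl_shop(home_url):
    results = []
    for cate_url, name, num in get_categories(get_categories_url(home_url)):
        for product in crawlProducts(cate_url, limit=5):
            product_url = product[1]  # second field of each product row
            product.append(get_img_urls(fetchPageWithUrl(product_url)))
            results.append([name] + product)
    return results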