Example #1
def main():
    urls = [
        # 'http://djzone.taobao.com',
        # 'http://glorylife.taobao.com/',
        # 'http://sffs.tmall.com/',
        # 'http://yazhou.tmall.com/shop/view_shop.htm?frm=yiyao',
        # 'http://nansin.tmall.com/'
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=1597546113',
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=397259828',
        'http://sanke.taobao.com'
    ]
    for url in urls:
        t = TopShop(url)
        d = t.getFullInfo()
        # d = t.getBasicInfo()
        try:
            # Pretty-print the shop info as UTF-8 JSON.
            print json.dumps(d, ensure_ascii=False, indent=4)  # .decode('utf-8')
            print ''
        except UnicodeError:
            # Fall back to per-field output re-encoded for GBK consoles.
            for (k, v) in d.items():
                print k, (v or '').decode('utf-8', 'ignore').encode('GBK', 'ignore')

        # saveTopShopData(d)

        del t
Example #2
def main():
    urls = [
        # 'http://djzone.taobao.com',
        # 'http://glorylife.taobao.com',
        # 'http://sffs.tmall.com/',
        # 'http://yazhou.tmall.com/shop/view_shop.htm?frm=yiyao',
        # 'http://nansin.tmall.com/'
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=1597546113',
        # 'http://store.taobao.com/shop/view_shop.htm?user_number_id=1643225788',
        # 'http://peers.tmall.com',
        # 'http://ynshadi.tmall.com',
        # 'http://ali.tmall.com',
        # 'http://shop34757726.taobao.com',
        "http://store.taobao.com/shop/view_shop.htm?user_number_id=13987814",
        # 'http://shop59346695.taobao.com',
        # 'http://chowtaifook.tmall.com'
    ]
    for url in urls:
        t = TopShop(url)
        d = t.getFullInfo()
        # d = t.getBasicInfo()
        try:
            # Pretty-print the shop info as UTF-8 JSON.
            print json.dumps(d, ensure_ascii=False, indent=4)  # .decode('utf-8')
            print ""
        except UnicodeError:
            # Fall back to per-field output re-encoded for GBK consoles.
            for (k, v) in d.items():
                print k, (v or "").decode("utf-8", "ignore").encode("GBK", "ignore")

        # saveTopShopData(d)

        del t
Example #3
def getShopItemsOverview(url, page=10, count=200, reqinterval=0.2):
    '''
    Crawl a shop's hot-sell item grid and return one row per item.

    page        -- maximum number of result pages to crawl
    count       -- maximum number of items to collect
    reqinterval -- delay between page requests, in seconds

    Each row lists the fields below, in this order:
    [field] siteId
    [field] shopId
    [field] userId
    [field] iid            item ID
    [field] itemName
    [field] itemLink
    [field] itemPic
    [field] itemPriceType
    [field] itemPrice
    [field] itemSales
    [field] itemRateNum
    '''

    metadata = []
    shopLink = TopShop(url).getBasicInfo()['shopLink']
    # Start from the shop's hot-sell grid view.
    url = shopLink + '?search=y&viewType=grid&orderType=_hotsell'
    siteId = shopId = userId = None
    page_cursor = count_cursor = 0
    iid_list = []
    while page_cursor < page:
        # r = request(url, requiredPropertyRegx=r'siteId', retries=10)
        r = requests.get(url)
        s = r.content
        if not siteId:
            siteId = _match(s, REGX['siteId'])
            shopId = _match(s, REGX['shopId'])
            userId = _match(s, REGX['userId'])
        # Iterate over the per-item HTML blocks extracted from the page.
        for itemcontent in _matchallitems(s)[1]:
            if count_cursor >= count:
                break
            # Extract each field from the item block with a targeted regex;
            # re.findall returns a (possibly empty) list of matches per field.
            info = {
                'iid':
                re.findall(r'id\s*\=\s*(\d+)', itemcontent),
                'itemName':
                re.findall(
                    r'\<div\s+class\s*\=\s*\"desc\"\s*\>\s*\<a\s+.+?\>\s*(.+?)\s*\<\/a\>',
                    itemcontent, re.S),
                'itemLink':
                re.findall(r'\<a\s+href\=\"(.+?)\"', itemcontent, re.S),
                'itemPic':
                re.findall(r'\<img\s+data-ks-lazyload\=\"(.+?)\"', itemcontent,
                           re.S),
                'itemPriceType':
                re.findall(
                    r'\<div\s+class\s*\=\s*\"price\"\s*\>\s*\<span\>\s*(.+?)\<\/span\s*\>',
                    itemcontent, re.S),
                'itemPrice':
                re.findall(
                    r'\<div\s+class\s*\=\s*\"price\"\s*\>.+?\<strong\s*\>(\d+\.*\d*)',
                    itemcontent, re.S),
                'itemSales':
                re.findall(
                    r'\<div\s+class\s*\=\s*\"sales\-amount\"\s*\>.+?\<em\>\s*(\d+)\s*\<',
                    itemcontent, re.S),
                'itemRateNum':
                re.findall(
                    r'\<div\s+class\s*\=\s*\"rating\"\s*\>.+?\<a\s+.+?\>\D+(\d+).+?',
                    itemcontent, re.S),
            }
            # Keep only the first match per field, normalised to UTF-8;
            # fields with no match become None.
            for (k, v) in info.items():
                if len(v) > 0:
                    info[k] = v[0].decode(r.encoding,
                                          'ignore').encode('utf-8', 'ignore')
                else:
                    info[k] = None
            # Skip items already collected on an earlier page.
            if info['iid'] in iid_list:
                continue
            metadata.append([siteId, shopId, userId] + [
                info['iid'],
                info['itemName'],
                info['itemLink'],
                info['itemPic'],
                info['itemPriceType'],
                info['itemPrice'],
                info['itemSales'],
                info['itemRateNum'],
            ])
            iid_list.append(info['iid'])
            count_cursor += 1
        page_cursor += 1
        # Find the link to the next results page; fall back to BeautifulSoup
        # if the regex matches nothing (fromEncoding is the BeautifulSoup 3
        # keyword; bs4 spells it from_encoding).
        hasNext = \
            re.findall(r'\<a\s+class\=\"J\_SearchAsync\s+next\"\s+href\=\"(\S+?)\".?\>', s)
        if not hasNext:
            soup = BeautifulSoup(s, fromEncoding=r.encoding)
            hasNext = soup.findAll('a', {'class': 'page-next'}) or \
                soup.findAll('a', {'class': re.compile(r'J\_SearchAsync\snext')})
            hasNext = [hasNext[0]['href']] if hasNext else []
        if len(hasNext) > 0:
            if url == hasNext[0]:
                break
            url = hasNext[0]
            url = re.sub(r'&amp;', r'&', url)
        else:
            break
        time.sleep(reqinterval)
    return metadata
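
A minimal usage sketch for getShopItemsOverview, writing the collected rows to a CSV file. It assumes the helpers referenced above (TopShop, REGX, _match, _matchallitems) live in the same Python 2 module; the dump_shop_items helper, the shop URL, the output filename, and the page/count/reqinterval values are hypothetical, and the column order simply mirrors the rows built in the function.

import csv

COLUMNS = ['siteId', 'shopId', 'userId', 'iid', 'itemName', 'itemLink',
           'itemPic', 'itemPriceType', 'itemPrice', 'itemSales', 'itemRateNum']

def dump_shop_items(shop_url, out_path='items.csv'):
    # Crawl up to 5 pages / 100 items, pausing 0.5 s between page requests.
    rows = getShopItemsOverview(shop_url, page=5, count=100, reqinterval=0.5)
    with open(out_path, 'wb') as f:  # 'wb' mode for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerow(COLUMNS)
        for row in rows:
            # Field values are UTF-8 byte strings or None at this point.
            writer.writerow([c if c is not None else '' for c in row])

# dump_shop_items('http://sanke.taobao.com')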