Example #1
0
    def jdProductShelvesTime(self):
        """Append each product's shelf time to its row, then write all rows to CSV.

        For every row in ``self.data`` the page at ``row[2]`` is fetched with
        ``myUrlOpen.requestByProxy`` and its ``#parameter2>li`` entries are
        scanned.  The first entry whose label (the text before the first
        ``:``) equals u'上架时间' has its value printed and appended to that
        row.  Afterwards ``self.data`` is written to a file named
        ``<timestamp>-jdAppleData.csv`` in the current working directory.

        Rows are mutated in place; nothing is returned.
        """
        for itemOut in self.data:
            href = itemOut[2]
            src = myUrlOpen.requestByProxy(href)
            page = pq(src)
            try:
                frames = page.find('#parameter2>li')
            except Exception:
                # Original policy kept: a page that cannot be queried stops
                # the crawl, and whatever was collected so far is still saved.
                break
            for item in frames:
                # partition() splits on the FIRST colon only, so values that
                # themselves contain colons (e.g. "2015-08-12 10:23:45") are
                # kept whole, and entries without a colon are skipped instead
                # of raising IndexError.
                label, sep, value = pq(item).text().partition(':')
                if not sep:
                    continue
                if label == u'上架时间':
                    print(value)
                    # itemOut is the very row object held by self.data, so it
                    # can be extended directly without an index() lookup.
                    itemOut.append(value)
                    break

        fileName = str(time.strftime('%Y-%m-%d %H_%M_%S')) + '-jdAppleData.csv'
        # NOTE(review): binary mode is what the Python 2 csv module expects;
        # on Python 3 this would need mode 'w' with newline='' - confirm the
        # target interpreter before changing it.
        with open(fileName, 'wb') as f:
            writer = csv.writer(f)
            writer.writerows(self.data)
Example #2
0
 def getINfo(self):
     """Scrape the rating page of every key waiting in queue_for_CommentDetailCount.

     For each key a ``https://rate.taobao.com/user-rate-<key>.htm`` URL is
     built, fetched with ``myUrlOpen.requestByProxy`` and decoded as GBK.
     Three groups of text are pulled from the page and printed:

     * ``.total>span``   - first text item (number of raters), or '-' if absent
     * ``.count``        - text items from index 3 onwards (per-item rating ratios)
     * ``.title+ul>li``  - the item following each of the three known labels
       (company name, main business, shop age)

     NOTE(review): the combined ``result`` list (with the key appended) is
     built but not stored or returned in the code visible here.
     """
     urlBase = 'https://rate.taobao.com/user-rate-'
     labels = ['公 司 名:', '当前主营:', '开店时长:']
     while not queue_for_CommentDetailCount.empty():
         urlKeyWord = queue_for_CommentDetailCount.get()
         url = urlBase + urlKeyWord + '.htm?spm=a1z10.3-b.d4918101.' + spmKeywordRandom()
         page = myUrlOpen.requestByProxy(url)
         page = page.decode('GBK', 'ignore')
         d = pq(page)
         textDetail = d.find('.count').my_text()[3:]  # ratios for every detailed rating item
         if not textDetail:
             textDetail = []
         framesCount = d.find('.total>span')
         try:
             textCount = [framesCount.my_text()[0]]  # number of people who rated
         except Exception:
             # Best effort: a page without the counter yields a placeholder.
             textCount = ['-']
         textOther = d.find('.title+ul>li').my_text()  # company name, location, etc.
         # Only positions that HAVE a successor are scanned, so a label that
         # happens to be the last item no longer raises IndexError.
         textOther = [textOther[i + 1]
                      for item in labels
                      for i, value in enumerate(textOther[:-1])
                      if value == item]
         print(textCount, textDetail, textOther)
         result = textCount + textDetail + textOther
         result.append(urlKeyWord)
Example #3
0
 def downloader(self):
     """Drain queue_for_url_target, fetching each URL and queueing the page source.

     Every queue entry is a ``(topic, url)`` pair.  The GBK-decoded topic and
     the URL are printed, the URL is fetched with
     ``myUrlOpen.requestByProxy`` and, when the fetch returns something
     truthy, ``(url, source)`` is put on ``queue_for_src``.
     """
     while True:
         if queue_for_url_target.empty():
             return
         entry = queue_for_url_target.get()
         topic, url = entry
         print(topic.decode('gbk', 'ignore'))
         print(url)
         body = myUrlOpen.requestByProxy(url)
         if not body:
             # Nothing usable came back; move on to the next entry.
             continue
         queue_for_src.put((url, body))
Example #4
0
 def produPrice(self):
     """Look up the price record of every SKU waiting in queue_for_ProductPrice.

     For each SKU the ``p.3.cn`` price endpoint is fetched with
     ``myUrlOpen.requestByProxy``; the response is parsed as JSON and
     ``[sku, record['p'], record['m']]`` is printed and put on
     ``queue_for_ProductPrice_result``.
     """
     endpoint = 'http://p.3.cn/prices/get?skuid=J_'
     while True:
         if not queue_for_ProductPrice.qsize() > 0:
             break
         sku = queue_for_ProductPrice.get()
         response = myUrlOpen.requestByProxy(endpoint + sku)
         # Drop the first character and the last two before parsing
         # (presumably the enclosing '[' and a trailing ']' plus newline -
         # TODO confirm against a live response).
         record = json.loads(response[1:-2])
         res = [sku] + [record[key] for key in ('p', 'm')]
         print(res)
         queue_for_ProductPrice_result.put(res)
Example #5
0
 def getShopItem(self):
     """Print how many product tiles each queued shop's item listing contains.

     For every user id taken from ``queue_for_ShopDataUid`` the Tmall
     ``search_shopitem.htm`` listing is requested with
     ``myUrlOpen.requestByProxy`` and the number of ``.product-iWrap``
     elements found in the response is printed.
     """
     urlHeader = 'http://list.tmall.com/search_shopitem.htm?'
     while True:
         if queue_for_ShopDataUid.empty():
             break
         shopDataUid = queue_for_ShopDataUid.get()
         # Query parameters for the listing request; 's' is sent as 0.
         urlData = {'from': 1, 'sort': 's', 'style': 'sg', 'user_id': shopDataUid, 's': 0}
         listingUrl = urlHeader + urllib.urlencode(urlData)
         document = pq(myUrlOpen.requestByProxy(listingUrl))
         tiles = document.find('.product-iWrap')
         print(len(tiles))
Example #6
0
def test(fileName):
    """Fetch one fixed seller rating page and print the fields extracted from it.

    ``fileName`` is accepted but not used by the current body.  The printed
    items are, in order: the first ``.total>span`` text (number of raters),
    the ``.count`` texts from index 3 onwards (per-item rating ratios), and
    the text that follows each of the three known labels in
    ``.title+ul>li`` (company name, main business, shop age).
    """
    raw = myUrlOpen.requestByProxy('https://rate.taobao.com/user-rate-UvGkuvGQYvGNy.htm')
    document = pq(raw.decode('GBK', 'ignore'))

    detail_texts = document.find('.count').my_text()[3:]  # ratios for every detailed rating item
    count_texts = [document.find('.total>span').my_text()[0]]  # number of people who rated
    other_texts = document.find('.title+ul>li').my_text()  # company name, location, etc.

    # For each label, in label order, collect the entry right after it.
    matched = []
    for label in ('公 司 名:', '当前主营:', '开店时长:'):
        for position, entry in enumerate(other_texts):
            if entry == label:
                matched.append(other_texts[position + 1])

    for line in count_texts + detail_texts + matched:
        print(line)
Example #7
0
def getCategoryAndStartUrl():
    """Download the JD category tree, store it in the database and queue its leaves.

    The JSONP response of ``dc.3.cn/category/get`` is unwrapped, decoded as
    GBK and parsed.  The tree is walked four levels deep through the ``'s'``
    (children) lists; each node title ``'n'`` is split on ``'|'``.  For every
    fourth-level node one row is produced from the second-, third- and
    fourth-level titles, each contributing ``[part 1, part 0]``.

    Side effects:
      * (re)creates the global ``queue_for_url_targetBase`` and puts a
        ``(part 1, part 0)`` tuple of every fourth-level title on it;
      * creates the ``jdkeyword`` table in the ``jddata`` database through
        ``DBService`` and writes all rows to it.

    Returns:
        The list of six-element rows.
    """
    import json

    global queue_for_url_targetBase
    queue_for_url_targetBase = Queue(0)

    response = myUrlOpen.requestByProxy('http://dc.3.cn/category/get?callback=getCategoryCallback')
    # Strip the JSONP wrapper: keep what follows the first '(' and drop the
    # final character.
    payload = response.split('(', 1)[1][:-1].decode('gbk', 'ignore')
    tree = json.loads(payload)['data']

    category = []
    for first in tree:
        for second in first['s']:
            secondTitle = second['n']
            for third in second['s']:
                thirdTitle = third['n']
                for fourth in third['s']:
                    fourthTitle = fourth['n']
                    row = []
                    for title in (secondTitle, thirdTitle, fourthTitle):
                        row.append(title.split('|')[1])
                        row.append(title.split('|')[0])
                    category.append(row)
                    # The last two cells are the fourth-level title parts.
                    queue_for_url_targetBase.put((row[4], row[5]))

    columns = ['category_fi_name', 'category_fi', 'category_se_name', 'category_se', 'category_ti_name',
               'category_ti']
    db = DBService(dbName='jddata', tableName='jdkeyword')
    db.createTable(tableTitle=columns)
    db.data2DB(data=category)
    return category
Example #8
0
    def getShopInfo(self):
        """Search Taobao shops for every queued keyword and queue one row per shop.

        Phase 1 drains ``queue_GetShopList_keyWord``: the first result page
        of each keyword is fetched, the pagination ``bx-config`` attribute is
        parsed with ``jsonParse`` and one search URL per page (offsets 0, 20,
        40, ...) is put on ``queue_GetShopList_url``.  Keywords whose
        pagination data cannot be read are skipped.

        Phase 2 drains ``queue_GetShopList_url``: every ``.list-item`` on a
        page becomes a row put on ``queue_GetShopList_result`` (and printed),
        laid out as::

            [shopName, shopHref, addr, brand, monthSale, productSum]
            + 9 values from the 'data-dsr' JSON (see tempForScoreGet)
            + 12 promotion cells: (data-nid, href, price) for up to 4 products,
              unused cells left as '-'
            + [dataUid, ifCompanySeller, shopRank]
        """
        searchBase = 'https://s.taobao.com/search?'

        while not queue_GetShopList_keyWord.empty():
            keyWord = queue_GetShopList_keyWord.get()
            getData = {'initiative_id': 'staobaoz_20120515', 'q': keyWord, 'app': 'shopsearch', 'fs': 1, 'isb': 0,
                       'goodrate': '', 's': 0}
            urlStart = searchBase + urllib.urlencode(getData)
            src = myUrlOpen.requestByProxy(urlStart)
            src = src.decode('gbk', 'ignore')
            d = pq(src)
            try:
                pageCount = d.find('.pagination').attr('bx-config')
                # The returned JSON is non-standard (keys are unquoted, so
                # json.loads fails); it is parsed with the project's helper.
                pageCount = jsonParse(pageCount)
            except Exception:
                continue
            # Floor division keeps pageCount an int on every interpreter so it
            # can be fed to range() below.
            # NOTE(review): flooring ignores a trailing partial page, and a
            # count below 20 queues nothing - confirm this is intended.
            pageCount = int(pageCount['count'][1:-1]) // 20
            print(keyWord, pageCount)
            for offset in range(0, pageCount * 20, 20):
                # Only the offset changes between pages.
                getData['s'] = offset
                queue_GetShopList_url.put(searchBase + urllib.urlencode(getData))

        # Keys read from each shop's 'data-dsr' JSON.
        # mas: description score; mg: description score avg;
        # sas: service attitude; sg: service attitude avg;
        # cas: logistics service; cg: logistics service avg
        tempForScoreGet = ['mas', 'mg', 'sas', 'sg', 'cas', 'cg', 'sgr', 'srn', 'encryptedUserId']

        while queue_GetShopList_url.qsize() > 0:
            url = queue_GetShopList_url.get()
            src = myUrlOpen.requestByProxy(url)
            src = src.decode('gbk', 'ignore')
            d = pq(src)
            frames = d.find('.list-item')
            for item in frames:
                d = pq(item)
                jsonFile = json.loads(d.find('.descr').attr('data-dsr'))
                score = [jsonFile[sc] for sc in tempForScoreGet]

                shopLink = d.find('h4>a:nth-child(1)')
                dataUid = shopLink.attr('data-uid')
                shopHref = 'http:' + shopLink.attr('href')
                shopName = shopLink.text()

                # The second link in the heading carries either the
                # "company seller" title or the shop-rank class.
                isCompany = d.find('h4>a:nth-child(2)').attr('title') == u'企业卖家'
                ifCompanySeller = 'YES' if isCompany else 'NO'
                rankSelector = '.shop-leval a' if isCompany else 'h4>a:nth-child(2)'
                shopRank = d.find(rankSelector).attr('class') or '-'

                addr = d.find('.shop-address').text()
                brand = d.find('.main-cat>a').text()
                monthSale = d.find('.info-sale').text()
                monthSale = monthSale.split(' ')[1]
                productSum = d.find('.info-sum').text()
                productSum = productSum.split(' ')[1]

                # Fixed-width block: three cells per promoted product.
                tempForProductPromot = ['-'] * 12
                for slot, ppf in enumerate(d.find('.one-product')):
                    start = slot * 3
                    if start >= len(tempForProductPromot):
                        # More products than the row has room for: ignore the
                        # surplus instead of failing with IndexError.
                        break
                    di = pq(ppf)
                    dataNid = di.find('a').attr('data-nid')
                    productHref = 'http:' + di.find('a').attr('href')
                    productPrice = di.find('.price-num').text()
                    tempForProductPromot[start:start + 3] = [dataNid, productHref, productPrice]

                Result = [shopName, shopHref, addr, brand, monthSale, productSum] + score + tempForProductPromot + [
                    dataUid] + [ifCompanySeller, shopRank]
                queue_GetShopList_result.put(Result)
                print(Result)