def jdProductShelvesTime(self):
    """Append each product's shelf time to its row, then dump the rows to CSV.

    For every row in ``self.data`` the page at ``row[2]`` is fetched with
    ``myUrlOpen.requestByProxy`` and its ``#parameter2>li`` entries are
    scanned.  Each entry's text is split on ':'; when the part before the
    first ':' equals the shelf-time label, the part after it is printed and
    appended to the row.  Rows without such an entry are left unchanged.

    Afterwards all of ``self.data`` is written to a file named
    ``<timestamp>-jdAppleData.csv`` in the current working directory.
    """
    for itemOut in self.data:
        href = itemOut[2]
        src = myUrlOpen.requestByProxy(href)
        if not src:
            # Nothing was fetched for this row; leave it unchanged.
            continue
        d = pq(src)
        try:
            frames = d.find('#parameter2>li')
        except Exception:
            # Best-effort, as before: stop scanning but still write the CSV.
            break
        for item in frames:
            parts = pq(item).text().split(':')
            if len(parts) < 2:
                # Entry has no "label:value" shape; nothing to extract.
                continue
            if parts[0] == u'上架时间':
                shelvesTime = parts[1]
                print(shelvesTime)
                # itemOut is the row object itself, so no index lookup is needed.
                itemOut.append(shelvesTime)
                break
    fileName = str(time.strftime('%Y-%m-%d %H_%M_%S')) + '-jdAppleData.csv'
    # Binary mode is what the Python 2 csv module expects.
    with open(fileName, 'wb') as f:
        writer = csv.writer(f)
        writer.writerows(self.data)
def getINfo(self):
    """Scrape rating details for every key in ``queue_for_CommentDetailCount``.

    Each key taken from the queue is turned into a
    ``https://rate.taobao.com/user-rate-<key>.htm`` URL, fetched through
    ``myUrlOpen.requestByProxy`` and decoded as GBK.  Three groups of text
    are extracted from the page and printed:

    * the ``.total>span`` text (rating count), or ``'-'`` when unavailable;
    * the ``.count`` texts from the fourth entry on (detail score ratios);
    * the values that follow the company-name / main-business / shop-age
      labels in the ``.title+ul>li`` texts.

    NOTE(review): ``result`` is assembled but neither stored nor returned
    inside this function -- confirm whether a queue/DB write is missing.
    """
    while not queue_for_CommentDetailCount.empty():
        urlKeyWord = queue_for_CommentDetailCount.get()
        urlBase = 'https://rate.taobao.com/user-rate-'
        url = urlBase + urlKeyWord + '.htm?spm=a1z10.3-b.d4918101.' + spmKeywordRandom()
        temp = myUrlOpen.requestByProxy(url)
        if not temp:
            # Nothing was fetched for this key; move on to the next one.
            continue
        temp = temp.decode('GBK', 'ignore')
        d = pq(temp)

        # Ratios of all detail scores.
        textDetail = d.find('.count').my_text()[3:]
        if not textDetail:
            textDetail = []

        # Number of people who rated.
        try:
            textCount = [d.find('.total>span').my_text()[0]]
        except Exception:
            textCount = ['-']

        # Company name, main business, shop age: each value follows its label.
        textOther = d.find('.title+ul>li').my_text()
        tempForTextOther = ['公 司 名:', '当前主营:', '开店时长:']
        valuesAfterLabels = []
        for label in tempForTextOther:
            # Pairing each entry with its successor means a label in the last
            # position simply has no value instead of indexing past the end.
            for current, following in zip(textOther, textOther[1:]):
                if current == label:
                    valuesAfterLabels.append(following)
        textOther = valuesAfterLabels

        print(textCount, textDetail, textOther)
        result = textCount + textDetail + textOther
        result.append(urlKeyWord)
def downloader(self):
    """Fetch every ``(topic, url)`` pair waiting in ``queue_for_url_target``.

    The GBK-decoded topic and the URL are printed, the URL is requested via
    ``myUrlOpen.requestByProxy``, and each non-empty response is queued on
    ``queue_for_src`` as ``(url, src)``.  Empty responses are dropped.
    """
    while True:
        if queue_for_url_target.empty():
            break
        topic, url = queue_for_url_target.get()
        print(topic.decode('gbk', 'ignore'))
        print(url)
        src = myUrlOpen.requestByProxy(url)
        if not src:
            continue
        queue_for_src.put((url, src))
def produPrice(self):
    """Look up the price record of every SKU in ``queue_for_ProductPrice``.

    For each SKU the ``http://p.3.cn/prices/get`` endpoint is queried through
    ``myUrlOpen.requestByProxy``.  The ``'p'`` and ``'m'`` fields of the
    returned record are printed and pushed to
    ``queue_for_ProductPrice_result`` as ``[sku, p, m]``.

    Raises:
        ValueError: the response body is not parseable as JSON.
        KeyError: the record lacks a ``'p'`` or ``'m'`` field.
    """
    while queue_for_ProductPrice.qsize() > 0:
        sku = queue_for_ProductPrice.get()
        url = 'http://p.3.cn/prices/get?skuid=J_' + sku
        src = myUrlOpen.requestByProxy(url)
        if not src:
            # Nothing was fetched for this SKU; move on to the next one.
            continue
        try:
            # Parse the body as a whole so the result does not depend on how
            # many trailing characters follow the closing bracket.
            d = json.loads(src)
        except ValueError:
            # Previous behaviour: drop one leading and two trailing characters
            # (covers bodies wrapped in something json.loads rejects).
            d = json.loads(src[1:-2])
        if isinstance(d, list):
            # The body is a one-element array around the price record.
            d = d[0]
        res = [sku, d['p'], d['m']]
        print(res)
        queue_for_ProductPrice_result.put(res)
def getShopItem(self):
    """Print how many product cards each shop's Tmall listing page contains.

    Every user id taken from ``queue_for_ShopDataUid`` is used to build a
    ``search_shopitem.htm`` query, the page is fetched through
    ``myUrlOpen.requestByProxy``, and the number of ``.product-iWrap``
    elements found on it is printed.
    """
    listingEndpoint = 'http://list.tmall.com/search_shopitem.htm?'
    while True:
        if queue_for_ShopDataUid.empty():
            break
        shopDataUid = queue_for_ShopDataUid.get()
        query = urllib.urlencode(
            {'from': 1, 'sort': 's', 'style': 'sg', 'user_id': shopDataUid, 's': 0})
        page = pq(myUrlOpen.requestByProxy(listingEndpoint + query))
        productCards = page.find('.product-iWrap')
        print(len(productCards))
def test(fileName):
    """Fetch one fixed Taobao rating page and print the fields parsed from it.

    Args:
        fileName: currently unused; kept so existing callers keep working
            (it used to name a saved copy of the page to read instead).

    The printed items are, in order: the ``.total>span`` text (rating count,
    or ``'-'`` when unavailable), the ``.count`` texts from the fourth entry
    on (detail score ratios), and the values following the company-name /
    main-business / shop-age labels in the ``.title+ul>li`` texts.
    """
    temp = myUrlOpen.requestByProxy('https://rate.taobao.com/user-rate-UvGkuvGQYvGNy.htm')
    temp = temp.decode('GBK', 'ignore')
    d = pq(temp)

    # Ratios of all detail scores.
    textDetail = d.find('.count').my_text()[3:]

    # Number of people who rated; fall back to a placeholder when missing.
    try:
        textCount = [d.find('.total>span').my_text()[0]]
    except Exception:
        textCount = ['-']

    # Company name, main business, shop age: each value follows its label.
    textOther = d.find('.title+ul>li').my_text()
    tempForTextOther = ['公 司 名:', '当前主营:', '开店时长:']
    valuesAfterLabels = []
    for label in tempForTextOther:
        # Pairing each entry with its successor means a label in the last
        # position simply has no value instead of indexing past the end.
        for current, following in zip(textOther, textOther[1:]):
            if current == label:
                valuesAfterLabels.append(following)
    textOther = valuesAfterLabels

    result = textCount + textDetail + textOther
    for item in result:
        print(item)
def getCategoryAndStartUrl():
    """Download the JD category tree, queue its leaves and store it in the DB.

    The JSONP response of ``http://dc.3.cn/category/get`` is unwrapped,
    decoded as GBK and parsed.  The tree under ``'data'`` is walked four
    levels deep through the ``'s'`` children lists.  Every ``'n'`` title is
    a ``'|'``-separated string whose second and first fields are used
    (presumably display name and key/URL -- TODO confirm).

    Side effects:
        * rebinds the global ``queue_for_url_targetBase`` to a fresh queue and
          puts one ``(field 2, field 1)`` tuple per fourth-level node on it;
        * creates the ``jdkeyword`` table in the ``jddata`` database through
          ``DBService`` and writes all rows to it.

    Returns:
        list: one six-item row per fourth-level node, holding the
        (field 2, field 1) pairs of its second-, third- and fourth-level
        titles, in that order.
    """
    import json
    global queue_for_url_targetBase

    def nameAndKey(title):
        # Split once and return (second field, first field).
        parts = title.split('|')
        return parts[1], parts[0]

    queue_for_url_targetBase = Queue(0)
    src = myUrlOpen.requestByProxy('http://dc.3.cn/category/get?callback=getCategoryCallback')

    # Unwrap "callback(<json>)".  Cutting at the LAST ')' also copes with a
    # trailing ';' or newline after the call; with no ')' at all, fall back to
    # dropping the final character as before.
    srcTemp = src.split('(', 1)[1]
    closing = srcTemp.rfind(')')
    srcTemp = srcTemp[:closing] if closing != -1 else srcTemp[:-1]
    srcTemp = srcTemp.decode('gbk', 'ignore')
    srcJson = json.loads(srcTemp)['data']

    category = []
    for Fi in srcJson:
        for Se in Fi['s']:
            targetSeTitle = Se['n']
            for Ti in Se['s']:
                targetTiTitle = Ti['n']
                for Fo in Ti['s']:
                    seName, seKey = nameAndKey(targetSeTitle)
                    tiName, tiKey = nameAndKey(targetTiTitle)
                    foName, foKey = nameAndKey(Fo['n'])
                    category.append([seName, seKey, tiName, tiKey, foName, foKey])
                    queue_for_url_targetBase.put((foName, foKey))

    db = DBService(dbName='jddata', tableName='jdkeyword')
    db.createTable(tableTitle=['category_fi_name', 'category_fi', 'category_se_name', 'category_se',
                               'category_ti_name', 'category_ti'])
    db.data2DB(data=category)
    return category
def getShopInfo(self):
    """Scrape Taobao shop-search results for every queued keyword.

    For each keyword in ``queue_GetShopList_keyWord`` the first result page
    is fetched to read the pagination config; one URL per 20-result page is
    put on ``queue_GetShopList_url``.  The URL queue is then drained: every
    ``.list-item`` on a page is parsed into one flat row which is printed
    and put on ``queue_GetShopList_result``.

    Row layout::

        [shopName, shopHref, addr, brand, monthSale, productSum]
        + 9 score fields (mas, mg, sas, sg, cas, cg, sgr, srn, encryptedUserId)
        + 12 promoted-product fields (nid, href, price for up to 4 products,
          '-' where absent)
        + [dataUid, ifCompanySeller, shopRank]

    Keywords whose pagination config cannot be read or parsed are skipped.
    """
    searchBase = 'https://s.taobao.com/search?'
    maxPromoted = 4  # the row reserves 3 slots for each of 4 promoted products

    def searchUrl(offset):
        # Build the shop-search URL for the result page starting at `offset`.
        getData = {'initiative_id': 'staobaoz_20120515', 'q': keyWord, 'app': 'shopsearch', 'fs': 1,
                   'isb': 0, 'goodrate': '', 's': offset}
        return searchBase + urllib.urlencode(getData)

    while not queue_GetShopList_keyWord.empty():
        keyWord = queue_GetShopList_keyWord.get()
        src = myUrlOpen.requestByProxy(searchUrl(0))
        src = src.decode('gbk', 'ignore')
        d = pq(src)
        try:
            pageCount = d.find('.pagination').attr('bx-config')
            # The attribute is not standard JSON (keys are unquoted, so
            # json.loads fails); the project's own parser handles it.
            pageCount = jsonParse(pageCount)
        except Exception:
            continue
        # Explicit floor division: same result as before on Python 2 ints.
        # NOTE(review): flooring drops a partial last page, and yields no page
        # at all for counts below 20 -- confirm this is intended.
        pageCount = int(pageCount['count'][1:-1]) // 20
        print(keyWord, pageCount)
        if pageCount:
            for offset in range(0, pageCount * 20, 20):
                queue_GetShopList_url.put(searchUrl(offset))

        # NOTE(review): this drain loop is placed at the per-keyword level;
        # confirm that matches the intended nesting.
        while queue_GetShopList_url.qsize() > 0:
            url = queue_GetShopList_url.get()
            src = myUrlOpen.requestByProxy(url)
            src = src.decode('gbk', 'ignore')
            frames = pq(src).find('.list-item')
            for item in frames:
                shop = pq(item)

                # mas: description score; mg: description score avg;
                # sas: service attitude; sg: service attitude avg;
                # cas: logistics service; cg: logistics service avg.
                tempForScoreGet = ['mas', 'mg', 'sas', 'sg', 'cas', 'cg', 'sgr', 'srn', 'encryptedUserId']
                jsonFile = json.loads(shop.find('.descr').attr('data-dsr'))
                score = [jsonFile[sc] for sc in tempForScoreGet]

                titleLink = shop.find('h4>a:nth-child(1)')
                dataUid = titleLink.attr('data-uid')
                shopHref = 'http:' + titleLink.attr('href')
                shopName = titleLink.text()

                # The second link in the heading carries the seller-type title;
                # the rank class is read from a different node per seller type.
                badge = shop.find('h4>a:nth-child(2)')
                if badge.attr('title') == u'企业卖家':
                    ifCompanySeller = 'YES'
                    shopRank = shop.find('.shop-leval a').attr('class')
                else:
                    ifCompanySeller = 'NO'
                    shopRank = badge.attr('class')
                shopRank = shopRank if shopRank else '-'

                addr = shop.find('.shop-address').text()
                brand = shop.find('.main-cat>a').text()
                # Both texts are "<label> <value>"; keep the value part.
                monthSale = shop.find('.info-sale').text().split(' ')[1]
                productSum = shop.find('.info-sum').text().split(' ')[1]

                tempForProductPromot = ['-'] * (3 * maxPromoted)
                for slot, ppf in enumerate(shop.find('.one-product')):
                    if slot >= maxPromoted:
                        # Extra products would overflow the fixed-width row.
                        break
                    link = pq(ppf).find('a')
                    base = slot * 3
                    tempForProductPromot[base] = link.attr('data-nid')
                    tempForProductPromot[base + 1] = 'http:' + link.attr('href')
                    tempForProductPromot[base + 2] = pq(ppf).find('.price-num').text()

                Result = [shopName, shopHref, addr, brand, monthSale, productSum] + score + \
                    tempForProductPromot + [dataUid] + [ifCompanySeller, shopRank]
                queue_GetShopList_result.put(Result)
                print(Result)