def main_CommentDetail(fileName=None, columnNo=2, threadCount=50):
    """Crawl JD comment details for every sku listed in an input CSV.

    fileName    -- absolute path of the input CSV; defaults to the historical
                   InnerPageProductDetail export.  Computed lazily so that
                   dirCheck.dirGen() is no longer executed at import time
                   (the original default argument ran it on module load).
    columnNo    -- zero-based index of the sku column in the input file.
    threadCount -- number of CommentDetail worker threads to start.
    """
    global queue_sku_commentDetail, queue_skuPageUrl_commentDetail, queue_commentDetail_result
    queue_sku_commentDetail = Queue(0)
    queue_skuPageUrl_commentDetail = Queue(0)
    queue_commentDetail_result = Queue(0)
    if fileName is None:
        fileName = dirCheck.dirGen('D:\spider\jd\InnerPageProductDetail_2015-08-26 10_53_41.csv')
    # Feed every sku to the worker queue, skipping the header row.
    with open(fileName, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # replaces the original manual row counter
        for row in reader:
            queue_sku_commentDetail.put(row[columnNo])
    # Single shared header list.  The original kept two copies that had
    # drifted apart: the final flush was missing the 'urlFrom' column even
    # though all rows come from the same result queue.
    title = ['productSku', 'userId', 'userGuid', 'content', 'createTime',
             'referenceId', 'referenceTime', 'replyCount', 'score',
             'userLevelId', 'userProvince', 'productColor', 'userLevelName',
             'userClientShow', 'isMobile', 'urlFrom']
    # Start the crawler threads, slightly staggered.
    CommentDetail_thread = [CommentDetail() for _ in range(threadCount)]
    for worker in CommentDetail_thread:
        worker.start()
        time.sleep(0.01)
    # Flush 200000-row batches to disk until the page-url queue drains.
    while True:
        if queue_commentDetail_result.qsize() > 200000:
            resultForCommentDetail = [queue_commentDetail_result.get()
                                      for _ in range(200000)]
            writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/commentDetail'),
                                     name='commentDetail', title=title,
                                     result=resultForCommentDetail)
            writer.add_title_data()
        if queue_skuPageUrl_commentDetail.qsize() == 0:
            break
        time.sleep(1)  # the original loop busy-waited at 100% CPU
    # Persist whatever is left in the result queue.
    resultForCommentDetail = [queue_commentDetail_result.get()
                              for _ in range(queue_commentDetail_result.qsize())]
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/commentDetail'),
                             name='commentDetail', title=title,
                             result=resultForCommentDetail)
    writer.add_title_data()
def main_GetShopList(threadCount=50):
    """Crawl taobao shop info for every keyword url produced by
    main_GetKeyWord(), flushing results to CSV in 20000-row batches.

    threadCount -- number of GetShopList worker threads to start.
    """
    main_GetKeyWord()
    global queue_GetShopList_url, queue_GetShopList_result
    queue_GetShopList_url = Queue(0)
    queue_GetShopList_result = Queue(0)
    GetShopList_thread = [GetShopList() for _ in range(threadCount)]
    for worker in GetShopList_thread:
        worker.start()
    # Give the workers a head start before polling the queues.
    time.sleep(60)
    # Single shared header -- the original kept two identical 30-element
    # copies that could drift apart.
    # NOTE(review): 'brnad' is a typo but is kept as-is because downstream
    # consumers of the CSV may rely on the misspelled column name.
    title = ['name', 'href', 'addr', 'brnad', 'monthsale', 'productsum',
             'dsr_desc_mark', 'dsr_desc_avg', 'dsr_service_mark',
             'dsr_service_avg', 'dsr_sending_mark', 'dsr_sending_avg',
             'sgr', 'srn', 'encryptedUserId',
             'productDataNid_1', 'product_link_1', 'price_1',
             'productDataNid_2', 'product_link_2', 'price_2',
             'productDataNid_3', 'product_link_3', 'price_3',
             'productDataNid_4', 'product_link_4', 'price_4',
             'shopDataUid', 'ifCompanySeller', 'shopRank']
    count = 1
    while queue_GetShopList_url.qsize() > 0:
        if queue_GetShopList_result.qsize() > 20000:
            result = [queue_GetShopList_result.get() for _ in range(20000)]
            writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/taobao/baseInfo'),
                                     name='shopInfo', title=title, result=result)
            writer.add_title_data()
            print(u'第 %s 个文件已输出,请检查数据!' % count)
            count += 1
        time.sleep(1)  # the original loop busy-waited at 100% CPU
    # Flush the remaining rows into the final file.
    result = [queue_GetShopList_result.get(timeout=20)
              for _ in range(queue_GetShopList_result.qsize())]
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/taobao/baseInfo'),
                             name='shopInfo', title=title, result=result)
    writer.add_title_data()
    print(u'最后一个文件已输出,请检查数据!')
def productInfoFromLocalSrc(path):
    """Parse locally saved tmall list-page sources under *path* and write
    the extracted product info to one CSV file."""
    result = []
    for fname in os.listdir(path):
        with open(path + '/' + fname, 'r') as f:
            raw = f.read()
        doc = pq(raw.decode('utf-8', 'ignore'))
        frames = doc.find('.product-iWrap')
        print(len(frames))
        for frame in frames:
            node = pq(frame)
            result.append([
                node.find('.productTitle').text(),
                'https:' + node.find('.productImg').attr('href'),
                node.find('.productPrice>em').attr('title'),
                node.find('.productPrice>a>img').attr('title'),
                node.find('.productShop>a').text(),
                node.find('.productStatus>span:nth-child(1)>em').text()[:-1],
                node.find('.productStatus>span:nth-child(2)>a').text(),
            ])
    header = ['title', 'href', 'price', 'tmallIdentification', 'shop',
              'saleCount', 'commentCount']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/tmall'),
                             name=u'智能设备Info', title=header, result=result)
    writer.add_title_data()
    print(u'文件已输出,请检查数据!')
def main_ProductInfoFromListUrl(threadCount=1):
    """Crawl 100 tmall list pages (60 products per page) and write the
    collected product info to one CSV file."""
    global queue_for_ProductInfoFromListUrl_url, queue_for_ProductInfoFromListUrl_result
    queue_for_ProductInfoFromListUrl_url = Queue(0)
    queue_for_ProductInfoFromListUrl_result = Queue(0)
    url_base = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.2Km9BG&cat=56148012&s='
    url_after = '&sort=s&style=g&search_condition=7&from=sn_1_rightnav&active=1&industryCatId=55852013&theme=551&tmhkmain=0&type=pc'
    # Each queue entry is (page url, referer): the first page refers to the
    # site root, every later page refers to the page before it.
    for offset in range(0, 100 * 60, 60):
        page_url = url_base + str(offset) + url_after
        if offset == 0:
            entry = (page_url, 'www.tmall.com')
            queue_for_ProductInfoFromListUrl_url.put(entry)
            print(entry)
        else:
            referer = url_base + str(offset - 60) + url_after
            queue_for_ProductInfoFromListUrl_url.put((page_url, referer))
    workers = [ProductInfoFromListUrl() for _ in range(threadCount)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    result = [queue_for_ProductInfoFromListUrl_result.get()
              for _ in range(queue_for_ProductInfoFromListUrl_result.qsize())]
    title = ['title', 'href', 'price', 'tmallIdentification', 'shop',
             'saleCount', 'commentCount', 'urlFrom']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/tmall'),
                             name=u'智能设备Info', title=title, result=result)
    writer.add_title_data()
    print(u'文件已输出,请检查数据!')
def main_ProductPrice(threadCound=50):
    """Fetch JD prices for every distinct sku referenced in the input CSV.

    Standalone: does not call the other crawler mains.

    threadCound -- number of ProductPrice worker threads (the misspelled
                   parameter name is kept for backward compatibility with
                   existing callers).
    """
    global queue_for_ProductPrice, queue_for_ProductPrice_result
    queue_for_ProductPrice = Queue(0)
    queue_for_ProductPrice_result = Queue(0)
    # 若单独执行ProductPrice类,则需为queue_for_ProductPrice赋值
    fileName = 'D:/spider/jd/jd_intelligent_productInfoDetail_2015-08-25 09_48_22.csv'
    # Collect distinct product urls from column 2 -- the original abused a
    # dict (shadowing the builtin) as a set and counted rows by hand.
    seen = set()
    with open(fileName, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            seen.add(row[2])
    # Derive the sku from the last path component: '.../12345.html' -> '12345'.
    for url in seen:
        queue_for_ProductPrice.put(url.split('/')[-1].split('.')[0])
    workers = [ProductPrice() for _ in range(threadCound)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    result = [queue_for_ProductPrice_result.get()
              for _ in range(queue_for_ProductPrice_result.qsize())]
    title = ['productSku', 'price', 'M']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/productDetail'),
                             name='ProductPrice', title=title, result=result)
    writer.add_title_data()
def productKeyWordTest():
    """Drain queue_for_test into a 'TextTest' CSV file.

    NOTE(review): this function rebinds queue_for_test to a brand-new empty
    Queue and then immediately drains it, so the CSV it writes is always
    empty.  Presumably worker threads were meant to fill the queue between
    those two steps -- confirm against the callers.
    """
    global queue_for_test
    queue_for_test = Queue(0)
    result = [queue_for_test.get() for _ in range(queue_for_test.qsize())]
    title = ['url', 'textTest']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd'),
                             name='TextTest', title=title, result=result)
    writer.add_title_data()
def dataSaved():
    """Pull exactly 5000 rows from the module-level queueForResult and
    append them to the jdProductInfo CSV.

    NOTE(review): Queue.get() is called with no timeout, so this blocks
    until 5000 items are available; queueForResult must be defined by the
    surrounding module -- confirm before calling standalone.
    """
    from myTool import MyCsv, dirCheck
    rows = []
    for _ in range(5000):
        rows.append(queueForResult.get())
    title = ['', '', '', '', 'url']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/productDetail'),
                             name='jdProductInfo', title=title, result=rows)
    writer.add_title_data()
def main_innerProductDetail(threadCount=50):
    """Crawl the in-page product detail (parameter table, score summary)
    for every product url produced by main_productDetail(), then persist
    the results to CSV.

    threadCount -- number of InnerPageProductDetail worker threads.

    (The original carried a large block of commented-out fallback code for
    seeding the queue from a CSV file; it has been removed as dead code.)
    """
    main_productDetail()
    global queue_for_InnerPageProdctDetail, queue_for_InnerPageProdctDetail_result
    queue_for_InnerPageProdctDetail = Queue(0)
    queue_for_InnerPageProdctDetail_result = Queue(0)
    # The last field of each productDetail row is the product page url.
    for _ in range(queue_for_result.qsize()):
        queue_for_InnerPageProdctDetail.put(queue_for_result.get()[-1])
    # 抓取商品页面商品介绍(商品参数类)
    workers = [InnerPageProductDetail() for _ in range(threadCount)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    # 商品页面信息持久化
    resultForInnerPageProductDetail = [
        queue_for_InnerPageProdctDetail_result.get()
        for _ in range(queue_for_InnerPageProdctDetail_result.qsize())
    ]
    title = ['productUrl', 'commondityName', 'commondityCode', 'shelvesTime',
             'goodsWeight', 'shopName', 'function', 'type', 'originOfGoods',
             'usage', 'system', 'productNo', 'compatibility',
             'applicableCrowd', 'brand', 'theoreticalEndurance', 'rateOfWork',
             'scoreSum', 'scoreProduct', 'scoreProductAvg', 'scoreService',
             'scoreServiceAvg', 'scoreExpress', 'scoreExpressAvg',
             'companyName']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd'),
                             name='InnerPageProductDetail', title=title,
                             result=resultForInnerPageProductDetail)
    writer.add_title_data()
def getBrandList(self):
    """Parse every saved brand page source under self.filePath and write
    one (topic, brand) row per brand link to the brand-list CSV.

    The topic is the file name without its extension, decoded from gbk.
    """
    rows = []
    for fname in os.listdir(self.filePath):
        with open(self.filePath + '/' + fname, 'r') as f:
            doc = pq(f.read())
        brands = doc.find('.J_valueList.v-fixed>li>a').my_text()
        rows.extend([fname.split('.')[0].decode('gbk', 'ignore'), brand]
                    for brand in brands)
    header = ['topic', 'brnad']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/brandList'),
                             name='jd_intelligent_brand', title=header,
                             result=rows)
    writer.add_title_data()
def main_productDetail(threadCount=50):
    """Generate the url list, crawl product-list details (name, sku, href,
    price, comment count, ...) with worker threads, and write everything to
    one CSV file."""
    main_genUrlList()
    # 商品链接等详情抓取main
    global queue_for_result
    queue_for_result = Queue(0)
    workers = [ProductDetail() for _ in range(threadCount)]
    for w in workers:
        w.start()
    for w in workers:
        w.join()
    rows = [queue_for_result.get() for _ in range(queue_for_result.qsize())]
    header = ['productName', 'sku', 'productHref', 'price', 'commentCount',
              'topic', 'pageUrl']
    # 数据写入csv文件
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/productDetail'),
                             name='jd_intelligent_productInfoDetail',
                             title=header, result=rows)
    writer.add_title_data()
def main_brandList():
    """Entry point: parse the saved brand page sources into the brand CSV."""
    # 获取品牌列表
    brand_pages = BrandList(dirCheck.dirGen('D:\spider\jd\brand_pagesource'))
    brand_pages.getBrandList()