Example #1
def main_CommentDetail(fileName=dirCheck.dirGen('D:/spider/jd/InnerPageProductDetail_2015-08-26 10_53_41.csv'),
                       columnNo=2, threadCount=50):
    global queue_sku_commentDetail, queue_skuPageUrl_commentDetail, queue_commentDetail_result
    queue_sku_commentDetail = Queue(0)
    queue_skuPageUrl_commentDetail = Queue(0)
    queue_commentDetail_result = Queue(0)

    # Parameters: fileName is the absolute path (including file name) of the file to open; columnNo is the column index of the target field (the first column is 0); threadCount is the number of threads to start, 50 by default
    # put the sku values into the queue for later use
    with open(fileName, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            queue_sku_commentDetail.put(row[columnNo])

    # fetch the comment details
    CommentDetail_thread = []
    for i in range(threadCount):
        CommentDetail_thread.append(CommentDetail())
    for item in CommentDetail_thread:
        item.start()
        time.sleep(0.01)

    while True:
        if queue_commentDetail_result.qsize() > 200000:
            resultForCommentDetail = []
            for i in range(200000):
                resultForCommentDetail.append(queue_commentDetail_result.get())
            title = ['productSku', 'userId', 'userGuid', 'content', 'createTime', 'referenceId',
                     'referenceTime', 'replyCount', 'score', 'userLevelId', 'userProvince',
                     'productColor', 'userLevelName', 'userClientShow', 'isMobile', 'urlFrom'
                     ]
            # write the data to a csv file
            writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/commentDetail'), name='commentDetail',
                                     title=title, result=resultForCommentDetail)
            writer.add_title_data()
        if queue_skuPageUrl_commentDetail.qsize() == 0:
            break
        time.sleep(1)  # avoid a busy wait while the worker threads are still filling the queues

    # for item in CommentDetail_thread:
    #     item.join()

    # persist the comment details
    resultForCommentDetail = []
    for i in range(queue_commentDetail_result.qsize()):
        resultForCommentDetail.append(queue_commentDetail_result.get())
    title = ['productSku', 'userId', 'userGuid', 'content', 'createTime', 'referenceId',
             'referenceTime', 'replyCount', 'score', 'userLevelId', 'userProvince',
             'productColor', 'userLevelName', 'userClientShow', 'isMobile'
             ]
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/commentDetail'), name='commentDetail',
                             title=title, result=resultForCommentDetail)
    writer.add_title_data()
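
These examples all lean on two project helpers, dirCheck.dirGen and MyCsv.Write_Csv, whose implementations are not part of this listing. The following is a minimal stand-in sketch inferred only from the call sites (dirGen takes a path and its return value is used as a path; Write_Csv is built with path, name, title, and result keyword arguments and then add_title_data() is called); the timestamped file name is an assumption based on the default fileName in Example #1, and the real myTool module may behave differently.

# Minimal stand-in sketch for the helpers used throughout these examples.
# Inferred from call sites only -- the real myTool module may differ.
import csv
import os
import time


class dirCheck(object):
    @staticmethod
    def dirGen(path):
        # Assumed behaviour: make sure the directory part exists, then hand the path back.
        dirPart = path if os.path.splitext(path)[1] == '' else os.path.dirname(path)
        if dirPart and not os.path.isdir(dirPart):
            os.makedirs(dirPart)
        return path


class MyCsv(object):
    class Write_Csv(object):
        def __init__(self, path, name, title, result):
            self.path = path
            self.name = name
            self.title = title    # list of column headers
            self.result = result  # list of data rows

        def add_title_data(self):
            # Assumed behaviour: write a timestamped csv with a header row followed by the data rows.
            fileName = '%s/%s_%s.csv' % (self.path, self.name,
                                         time.strftime('%Y-%m-%d %H_%M_%S'))
            with open(fileName, 'wb') as f:  # 'wb' matches the Python 2 style of these examples
                writer = csv.writer(f)
                writer.writerow(self.title)
                writer.writerows(self.result)
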
Example #2
def main_GetShopList(threadCount=50):
    main_GetKeyWord()

    global queue_GetShopList_url, queue_GetShopList_result
    queue_GetShopList_url = Queue(0)
    queue_GetShopList_result = Queue(0)

    GetShopList_thread = []
    for i in range(threadCount):
        GetShopList_thread.append(GetShopList())
    for item in GetShopList_thread:
        item.start()

    # provide a delay before the polling loop that follows
    time.sleep(60)
    count = 1
    while queue_GetShopList_url.qsize() > 0:
        if queue_GetShopList_result.qsize() > 20000:
            result = []
            for i in range(20000):
                result.append(queue_GetShopList_result.get())
            title = ['name', 'href', 'addr', 'brnad', 'monthsale', 'productsum', 'dsr_desc_mark', 'dsr_desc_avg',
                     'dsr_service_mark',
                     'dsr_service_avg', 'dsr_sending_mark', 'dsr_sending_avg', 'sgr', 'srn', 'encryptedUserId',
                     'productDataNid_1', 'product_link_1', 'price_1', 'productDataNid_2', 'product_link_2', 'price_2',
                     'productDataNid_3', 'product_link_3', 'price_3', 'productDataNid_4', 'product_link_4', 'price_4',
                     'shopDataUid', 'ifCompanySeller', 'shopRank']
            # write the data to a csv file
            writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/taobao/baseInfo'), name='shopInfo',
                                     title=title, result=result)
            writer.add_title_data()
            print(u'File %s has been written, please check the data!' % count)
            count += 1
        time.sleep(1)  # avoid a busy wait between batches while the workers are still running

    # write out the last file
    result = []
    for i in range(queue_GetShopList_result.qsize()):
        result.append(queue_GetShopList_result.get(timeout=20))
    title = ['name', 'href', 'addr', 'brnad', 'monthsale', 'productsum', 'dsr_desc_mark', 'dsr_desc_avg',
             'dsr_service_mark',
             'dsr_service_avg', 'dsr_sending_mark', 'dsr_sending_avg', 'sgr', 'srn', 'encryptedUserId',
             'productDataNid_1', 'product_link_1', 'price_1', 'productDataNid_2', 'product_link_2', 'price_2',
             'productDataNid_3', 'product_link_3', 'price_3', 'productDataNid_4', 'product_link_4', 'price_4',
             'shopDataUid', 'ifCompanySeller', 'shopRank']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/taobao/baseInfo'), name='shopInfo',
                             title=title, result=result)
    writer.add_title_data()
    print(u'The last file has been written, please check the data!')
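
Each main_* function starts threadCount instances of a worker class (CommentDetail, GetShopList, ProductPrice, and so on) whose code is not included in this listing. Judging from how the queues are used, such a worker presumably pulls tasks from the url/sku queue, fetches and parses the page, and pushes parsed rows onto the result queue. The sketch below is an assumed minimal shape of that pattern, not the project's actual GetShopList class; the fetching and parsing are stubbed out.

# Assumed minimal shape of a queue-driven worker -- not the project's real class.
import threading
from Queue import Queue, Empty  # Python 2; on Python 3 use `from queue import Queue, Empty`

queue_GetShopList_url = Queue(0)
queue_GetShopList_result = Queue(0)


class GetShopListSketch(threading.Thread):
    def run(self):
        while True:
            try:
                # Stop once the task queue stays empty; the main loop polls qsize() for the same condition.
                url = queue_GetShopList_url.get(timeout=5)
            except Empty:
                break
            row = self.fetch_and_parse(url)  # hypothetical helper standing in for the real scraping code
            if row is not None:
                queue_GetShopList_result.put(row)

    def fetch_and_parse(self, url):
        # Placeholder: the real worker would download the page and extract the fields listed in `title`.
        return [url]
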
Example #3
def productInfoFromLocalSrc(path):
    fileList = os.listdir(path)
    result = []
    for item in fileList:
        with open(path + '/' + item, 'r') as f:
            temp = f.read()
        temp = temp.decode('utf-8', 'ignore')
        d = pq(temp)
        frames = d.find('.product-iWrap')
        print(len(frames))
        for itemInner in frames:
            d = pq(itemInner)
            href = 'https:' + d.find('.productImg').attr('href')
            price = d.find('.productPrice>em').attr('title')
            tmallIdentification = d.find('.productPrice>a>img').attr('title')
            title = d.find('.productTitle').text()
            shop = d.find('.productShop>a').text()
            saleCount = d.find('.productStatus>span:nth-child(1)>em').text()[:-1]
            commentCount = d.find('.productStatus>span:nth-child(2)>a').text()
            res = [title, href, price, tmallIdentification, shop, saleCount, commentCount]
            result.append(res)
    title = ['title', 'href', 'price', 'tmallIdentification', 'shop', 'saleCount', 'commentCount']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/tmall'), name=u'智能设备Info',
                             title=title, result=result)
    writer.add_title_data()
    print(u'File written, please check the data!')
Example #4
def main_ProductInfoFromListUrl(threadCount=1):
    global queue_for_ProductInfoFromListUrl_url, queue_for_ProductInfoFromListUrl_result
    queue_for_ProductInfoFromListUrl_url = Queue(0)
    queue_for_ProductInfoFromListUrl_result = Queue(0)

    url_base = 'https://list.tmall.com/search_product.htm?spm=a220m.1000858.0.0.2Km9BG&cat=56148012&s='
    url_after = '&sort=s&style=g&search_condition=7&from=sn_1_rightnav&active=1&industryCatId=55852013&theme=551&tmhkmain=0&type=pc'

    for i in range(0, 100 * 60, 60):
        if i == 0:
            queue_for_ProductInfoFromListUrl_url.put((url_base + str(i) + url_after, 'www.tmall.com'))
            print((url_base + str(i) + url_after, 'www.tmall.com'))
        else:
            queue_for_ProductInfoFromListUrl_url.put(
                    (url_base + str(i) + url_after, url_base + str(i - 60) + url_after))

    ProductInfoFromListUrl_thread = []
    for i in range(threadCount):
        ProductInfoFromListUrl_thread.append(ProductInfoFromListUrl())
    for item in ProductInfoFromListUrl_thread:
        item.start()
    for item in ProductInfoFromListUrl_thread:
        item.join()

    result = []
    for i in range(queue_for_ProductInfoFromListUrl_result.qsize()):
        result.append(queue_for_ProductInfoFromListUrl_result.get())
    title = ['title', 'href', 'price', 'tmallIdentification', 'shop', 'saleCount', 'commentCount', 'urlFrom']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/tmall'), name=u'智能设备Info',
                             title=title, result=result)
    writer.add_title_data()
    print(u'File written, please check the data!')
Example #5
def main_ProductPrice(threadCount=50):
    # note: this code block does not reference other classes
    global queue_for_ProductPrice, queue_for_ProductPrice_result
    queue_for_ProductPrice = Queue(0)
    queue_for_ProductPrice_result = Queue(0)
    # when running the ProductPrice class on its own, queue_for_ProductPrice must be populated first
    fileName = 'D:/spider/jd/jd_intelligent_productInfoDetail_2015-08-25 09_48_22.csv'
    # fileName = 'D:/spider/jd/jd_intelligent_productInfoDetail_2015-08-25 01_41_33.csv'
    urlDict = {}  # deduplicate the url column values
    with open(fileName, 'r') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row
        for row in reader:
            urlDict[row[2]] = 1
    for url in urlDict:
        queue_for_ProductPrice.put(url.split('/')[-1].split('.')[0])  # the sku is the file-name part of the url

    ProductPrice_thread = []
    for i in range(threadCount):
        ProductPrice_thread.append(ProductPrice())
    for item in ProductPrice_thread:
        item.start()
    for item in ProductPrice_thread:
        item.join()

    result = []
    for i in range(queue_for_ProductPrice_result.qsize()):
        result.append(queue_for_ProductPrice_result.get())
    title = ['productSku', 'price', 'M']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/productDetail'), name='ProductPrice',
                             title=title, result=result)
    writer.add_title_data()
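
Several of these examples drain a result queue with `for i in range(q.qsize()): q.get()`. Queue.qsize() is documented as approximate, so while worker threads are still running (as in Examples #1 and #2) that pattern can over-count and a plain get() may then block indefinitely. After all workers have been join()ed, as in this example, the pattern is safe; where threads are still alive, a drain loop based on get_nowait() and the Empty exception is a more robust alternative. A small sketch:

# Sketch: drain whatever is currently in a Queue without relying on qsize().
from Queue import Queue, Empty  # Python 2; the module is `queue` on Python 3


def drain(q):
    items = []
    while True:
        try:
            items.append(q.get_nowait())
        except Empty:
            return items


q = Queue(0)
for i in range(3):
    q.put(i)
print(drain(q))  # -> [0, 1, 2]
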
Example #6
def productKeyWordTest():
    global queue_for_test
    queue_for_test = Queue(0)

    result = []
    for i in range(queue_for_test.qsize()):
        result.append(queue_for_test.get())
    title = ['url', 'textTest']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd'), name='TextTest', title=title, result=result)
    writer.add_title_data()
Example #7
def dataSaved():
    from myTool import MyCsv, dirCheck

    result = []
    for i in range(5000):
        temp = queueForResult.get()
        result.append(temp)
    title = ['', '', '', '', 'url']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/productDetail'), name='jdProductInfo',
                             title=title, result=result)
    writer.add_title_data()
Example #8
def main_innerProductDetail(threadCount=50):
    # fetch the product introduction (product parameters) from the product page

    main_productDetail()

    global queue_for_InnerPageProdctDetail, queue_for_InnerPageProdctDetail_result
    queue_for_InnerPageProdctDetail = Queue(0)
    queue_for_InnerPageProdctDetail_result = Queue(0)

    for i in range(queue_for_result.qsize()):
        item = queue_for_result.get()[-1]
        queue_for_InnerPageProdctDetail.put(item)

    # when running the InnerPageProductDetail class on its own, queue_for_InnerPageProdctDetail must be populated first
    # fileName='D:/spider/jd/jd_intelligent_productInfoDetail_2015-08-25 09_48_22.csv'
    # dict = {}
    # with open(fileName,'r') as f:
    #     reader = csv.reader(f)
    #     i = 0
    #     for row in reader:
    #         if i>0:
    #             dict[row[2]]=1
    #         i+=1
    # for item in dict.items():
    #     queue_for_InnerPageProdctDetail.put(item[0])

    # fetch the product introduction (product parameters) from the product page
    InnerPageProductDetail_thread = []
    for i in range(threadCount):
        InnerPageProductDetail_thread.append(InnerPageProductDetail())
    for item in InnerPageProductDetail_thread:
        item.start()
    for item in InnerPageProductDetail_thread:
        item.join()

    # persist the product page data
    resultForInnerPageProductDetail = []
    for i in range(queue_for_InnerPageProdctDetail_result.qsize()):
        resultForInnerPageProductDetail.append(queue_for_InnerPageProdctDetail_result.get())
    title = ['productUrl', 'commondityName', 'commondityCode', 'shelvesTime', 'goodsWeight', 'shopName', 'function',
             'type', 'originOfGoods', 'usage', 'system', 'productNo', 'compatibility', 'applicableCrowd',
             'brand', 'theoreticalEndurance', 'rateOfWork', 'scoreSum', 'scoreProduct', 'scoreProductAvg',
             'scoreService', 'scoreServiceAvg', 'scoreExpress', 'scoreExpressAvg', 'companyName']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd'), name='InnerPageProductDetail',
                             title=title, result=resultForInnerPageProductDetail)
    writer.add_title_data()
Example #9
def getBrandList(self):
    fileList = os.listdir(self.filePath)
    result = []
    for item in fileList:
        with open(self.filePath + '/' + item, 'r') as f:
            res = f.read()
        d = pq(res)
        res = d.find('.J_valueList.v-fixed>li>a').my_text()
        res = [[item.split('.')[0].decode('gbk', 'ignore'), temp] for temp in res]
        result.append(res)
    result = [temp for item in result for temp in item]
    title = ['topic', 'brnad']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/brandList'), name='jd_intelligent_brand',
                             title=title,
                             result=result)
    writer.add_title_data()
Example #10
def main_productDetail(threadCount=50):
    main_genUrlList()
    # main entry for crawling product links and other details
    global queue_for_result
    queue_for_result = Queue(0)

    ProductDetail_thread = []
    for i in range(threadCount):
        ProductDetail_thread.append(ProductDetail())
    for item in ProductDetail_thread:
        item.start()
    for item in ProductDetail_thread:
        item.join()

    result = []
    for i in range(queue_for_result.qsize()):
        result.append(queue_for_result.get())
    title = ['productName', 'sku', 'productHref', 'price', 'commentCount', 'topic', 'pageUrl']
    # write the data to a csv file
    writer = MyCsv.Write_Csv(path=dirCheck.dirGen('d:/spider/jd/productDetail'),
                             name='jd_intelligent_productInfoDetail', title=title,
                             result=result)
    writer.add_title_data()
Example #11
def main_brandList():
    # fetch the brand list
    BrandList(dirCheck.dirGen('D:/spider/jd/brand_pagesource')).getBrandList()