コード例 #1
0
def mungingPchome(_BASE_PATH, searchword, objectiveFolder,
                  objectiveFolderClean, objective):
    """Merge the raw PChome result pages of one search word into one clean JSON file.

    Raw pages are read from the ``24h/``, ``vdr/`` and ``kdn/`` sub-folders of
    ``{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/``.
    Every entry of each page's ``'prods'`` list is reduced to a flat product
    record, the records are passed through ``EcommerceDataProcessToSet`` for
    de-duplication, and the result is written below
    ``{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/``.

    Relies on names defined outside this function: ``timeStamp``,
    ``initialFileZeroUnderscoreInt``, ``EcommerceDataProcessToSet`` and
    ``mkdirForCleanData``.

    Args:
        _BASE_PATH: Project root used to build the input and output paths.
        searchword: Search keyword whose raw pages are merged.
        objectiveFolder: Folder name of the raw data below ``dataMunging``.
        objectiveFolderClean: Folder name of the clean data below ``dataMunging``.
        objective: Data-source folder name used on both the raw and clean side.

    Returns:
        None. The merged data is written to disk.
    """
    subFolders = ("24h", "vdr", "kdn")
    dirRoute = f"{_BASE_PATH}/dataMunging/{objectiveFolder}/{objective}/{searchword}/"

    pchomeDict = {
        "keyword": f"{searchword}(24h_vdr_kdn)",
        "dateTime": timeStamp,
    }
    productArray = []

    for subFolder in subFolders:
        directory = f"{dirRoute}{subFolder}/"

        # List each folder only once; some folders contain no files at all.
        fileNames = initialFileZeroUnderscoreInt(directory)
        if not fileNames:
            continue

        for fileName in fileNames:
            with open(directory + fileName) as f:
                inn = json.load(f)

            # A page that was stored empty carries no products.
            if not inn:
                continue

            for prod in inn['prods']:
                productArray.append({
                    'Id': prod['Id'],
                    'name': prod['name'],
                    'originprice': prod['originPrice'],
                    'pics': 'https://d.ecimg.tw' + prod['picS'],
                    'picb': 'https://d.ecimg.tw' + prod['picB'],
                    'produrl': "https://24h.pchome.com.tw/prod/" + prod["Id"],
                })

    # Part of the output file name: the three source folders joined by "_".
    source = '_'.join(subFolders)

    print(f"===========進行 {searchword}(24h_vdr_kdn) 去重=============")

    # The products of all three folders are collected first and only then
    # de-duplicated as one list.
    pchomeDict['product'], setNums = EcommerceDataProcessToSet(productArray)

    mkdirForCleanData(objectiveFolderClean, objective)

    outputPath = (
        f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/"
        f"pchome_{source}_{timeStamp}_{setNums}_{searchword}.json"
    )
    # ensure_ascii=False emits raw non-ASCII text, so the encoding is fixed to
    # UTF-8 instead of depending on the locale default.
    with open(outputPath, 'w', encoding='utf-8') as f:
        json.dump(pchomeDict, f, indent=2, ensure_ascii=False)

    print(f"===========完成 {searchword}(24h_vdr_kdn) 清洗!=============")
コード例 #2
0
ファイル: momoMunging.py プロジェクト: UnCarter25le/iSelect3C
def dataMunging(input, dirRoute, objectiveFolderClean, objective, domainUrl):
    """Worker loop that turns raw momo result pages into one clean JSON file per keyword.

    For every search word taken from ``input`` the HTML pages stored in
    ``dirRoute + searchword`` are parsed, one record per ``li`` of the
    ``.listArea`` list is collected, the records are passed through
    ``EcommerceDataProcessToSet`` and the result is written below
    ``{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/``.

    Each record has the keys ``Id``, ``name``, ``originprice``, ``pics``,
    ``picb`` and ``produrl``. Some products have a link but an incomplete
    name or price: an empty name is replaced by a placeholder and its price
    set to ``"0"``; a price text of ``"NaN"`` or the sold-out marker is also
    stored as ``"0"``. ``picb`` is always the string ``"None"``.

    The loop ends (after calling ``input.task_done()``) as soon as a keyword
    folder is empty. Relies on the outer names ``_BASE_PATH`` and ``begin``.

    Args:
        input: Queue-like object providing ``get()`` and ``task_done()``
            (presumably a ``multiprocessing.JoinableQueue`` - confirm at the
            call site).
        dirRoute: Path prefix of the raw data; the search word is appended.
        objectiveFolderClean: Folder name of the clean data.
        objective: Data-source folder name.
        domainUrl: Prefix put in front of every product ``href``.
    """
    thisPID = os.getpid()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()

        mkdirForCleanData(objectiveFolderClean, objective)

        # Use a fresh variable rather than `dirRoute = dirRoute + searchword`:
        # reassigning dirRoute made the keywords pile up in the path from one
        # loop iteration to the next.
        fileRoute = dirRoute + searchword

        if not os.listdir(fileRoute):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        productArray = []

        for file in initialFileZeroUnderscoreInt(fileRoute):
            with open(fileRoute + "/" + file) as f:
                inn = f.read()

            # A page that was stored as an empty string has nothing to parse.
            if not inn:
                continue
            textSoup = BeautifulSoup(inn, 'html.parser')
            try:
                # A page holds at most 30 items.
                products = textSoup.select_one('.listArea').select_one('ul').select('li')
                for item in products:
                    innerDict = {}
                    innerDict['Id'] = item.attrs.get('gcode')

                    # Look the link element up once and reuse it below.
                    goodsUrl = item.select_one('.goodsUrl')
                    productName = goodsUrl.select_one('.prdName').text
                    originprice = goodsUrl.select_one('.money .price').text.replace('$', '').replace(',', '')

                    if productName:
                        innerDict['name'] = productName
                        if originprice in ("NaN", "熱銷一空"):
                            innerDict['originprice'] = "0"
                        else:
                            innerDict['originprice'] = originprice
                    else:
                        innerDict['name'] = "品名抓不到"
                        innerDict['originprice'] = "0"

                    innerDict['pics'] = item.select_one('.goodsUrl img').attrs.get('src')
                    innerDict['picb'] = "None"
                    innerDict['produrl'] = domainUrl + goodsUrl.attrs.get('href')
                    productArray.append(innerDict)
            except Exception as e:
                # Best effort: report the page and carry on with the next file.
                print(f"{file} 有 {e} 的問題。")

        timeStamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")

        momoDict = {
            'product': productArray,
            'keyword': searchword,
            'dateTime': timeStamp,
        }

        print("===========進行去重=============")

        momoDict['product'], setNums = EcommerceDataProcessToSet(momoDict['product'])

        # ensure_ascii=False emits raw non-ASCII text, so the encoding is fixed
        # to UTF-8 instead of depending on the locale default.
        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/momo_{timeStamp}_{setNums}_{searchword}.json", 'w', encoding='utf-8') as f:
            json.dump(momoDict, f, indent=2, ensure_ascii=False)

        print("===========清洗完成=============")
        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = time.time()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # tell the main process that this input is finished
        timeSleepOne()  # pause briefly to simulate real-world conditions
コード例 #3
0
def dataMunging(input, dirRoute, objectiveFolderClean, objective):
    """Worker loop that merges bureau-energy detail data into the overview JSON.

    For every search word taken from ``input``:

    1. ``bureauEnergyMunging().detailMungingEntry`` is run on
       ``{dirRoute}{searchword}/detail/`` and its result is written to
       ``{dirRoute}{searchword}/jsonIntegration/``.
    2. The file in that folder whose name contains ``"bureauEnergy_overview"``
       is loaded, and each of its products receives four extra fields taken
       from the values returned by ``zipJsonObject``.
    3. The enriched overview is written below
       ``{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/``.

    The loop ends (after calling ``input.task_done()``) as soon as a keyword
    folder is empty. Relies on the outer names ``_BASE_PATH``, ``begin``,
    ``statistic`` and ``dataMunging_proc``.

    Args:
        input: Queue-like object providing ``get()`` and ``task_done()``
            (presumably a ``multiprocessing.JoinableQueue`` - confirm at the
            call site).
        dirRoute: Path prefix of the raw data; the search word is appended.
        objectiveFolderClean: Folder name of the clean data.
        objective: Data-source folder name, also used in output file names.

    Raises:
        IndexError: If the ``jsonIntegration`` folder holds no overview file.
    """
    thisPID = os.getpid()
    bureauMunging = bureauEnergyMunging()
    while True:
        print(thisPID,"===========================================")
        searchword = input.get()

        dirNameCheck = dirRoute + f"{searchword}/"
        directory = dirRoute + f"{searchword}/detail/"
        dirNameWriteOut = dirRoute + f"{searchword}/jsonIntegration/"

        print('dataMunging is in new process %s, %s ' % (dataMunging_proc, thisPID))
        print()
        print('------接下來要處理資料夾路徑「 ' + dirNameWriteOut  + '」---------')
        print()

        mkdirForCleanData(objectiveFolderClean, objective)

        if not os.listdir(dirNameCheck):
            print(f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。=============")
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        # detailMungingEntry is the single entry point for every supported
        # search word; it replaced a per-category if/elif chain that called
        # detailDehumidification, detailAirConditioner, detailRefrigerator, ...
        bureauEnergyDetail, totalNums = bureauMunging.detailMungingEntry(searchword, directory)

        with open(dirNameWriteOut + f"{objective}_detail_{timeStampGenerator()}_{totalNums}_{searchword}.json", 'w', encoding='utf-8') as f:
            json.dump(bureauEnergyDetail, f, indent=2, ensure_ascii=False)

        # Locate the overview JSON so it can be merged with the detail data.
        overviewJsonFile = [overviewFile for overviewFile in os.listdir(dirNameWriteOut) if "bureauEnergy_overview" in overviewFile].pop()
        # Read as UTF-8, the encoding this function uses for the files it
        # writes into the same folder, rather than the locale default.
        with open(dirNameWriteOut + overviewJsonFile, encoding='utf-8') as f:
            bureauEnergyOverview = json.load(f)

        # product_model -> position in productDetail; for a repeated model the
        # last position wins.
        modelPoolDict = {
            detail['product_model']: position
            for position, detail in enumerate(bureauEnergyDetail['productDetail'])
        }

        # Add the detail-derived fields to every product of the overview.
        for jsonObject in bureauEnergyOverview['product']:
            _index, testReport, benchmark, annual, labelUri = zipJsonObject(modelPoolDict, jsonObject['product_model'], bureauEnergyDetail)

            jsonObject['test_report_of_energy_efficiency'] = testReport
            jsonObject['efficiency_benchmark'] = benchmark
            jsonObject['annual_power_consumption_degrees_dive_year'] = annual
            jsonObject['energy_efficiency_label_innerUri'] = labelUri

        # Stamp the enriched overview with the time of this update.
        timeStamp = timeStampGenerator()
        bureauEnergyOverview["dateTime"] = timeStamp

        with open(f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/{objective}_{timeStamp}_{totalNums}_{searchword}.json", 'w', encoding='utf-8') as f:
            json.dump(bureauEnergyOverview, f, indent=2, ensure_ascii=False)

        statistic.append(totalNums)

        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = timeCalculate()
        print('dataMunging 累計耗時:{0} 秒'.format(end-begin))
        input.task_done()  # tell the main process that this input is finished
        timeSleepOne()  # pause briefly to simulate real-world conditions
コード例 #4
0
            print("擷取待爬取url出錯!", e)

    timeStamp = timeStampGenerator()
    totalNum = len(newsObjectReadyForCrawling)
    newsObjectWhole["dateTime"] = timeStamp
    newsObjectWhole["keyword"] = keyword
    newsObjectWhole["newsTotalNum"] = totalNum
    newsObjectWhole["newsUrl"] = newsObjectReadyForCrawling

    #-----------------------------------檢測判斷如何---------------------
    print("篩選出", len(readyLink), "則新聞。")
    # print(len(readyLinkComparison))
    # for row in readyLinkComparison:
    #     print(readyLinkComparison[row])

    newsTitleList = [
        newsObjectReadyForCrawling[key][0] + "\n"
        for key in newsObjectReadyForCrawling
    ]
    with open(
            f"{_BASE_PATH}/{objectiveFolderDataMining}/{objectiveFolderDictionary}/threeWord.txt",
            'w') as f:
        f.writelines(newsTitleList)

    # ----------------------------------寫出成果------------------
    mkdirForCleanData(objectiveFolderClean, objective)
    with open(
            f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/googleNews_{timeStamp}_{totalNum}_{keyword}.json",
            "w",
            encoding="utf-8") as f:
        json.dump(newsObjectWhole, f, indent=2, ensure_ascii=False)
コード例 #5
0
from collections import Counter

_BASE_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(_BASE_PATH)

from libs.mining import newsMining
from libs.munging import rawDataMunging
from libs.timeWidget import (timeStampGenerator, timeCalculate)
from libs.sqlDMLAndsqlAlchemyORM import selectedNewsContentMunging
from libs.manipulateDir import mkdirForCleanData

if __name__ == '__main__':

    newsMining = newsMining()

    mkdirForCleanData(rawDataMunging._objectiveFolderCleanData,
                      rawDataMunging._objectiveFolderNewsWithContent)

    dirRoute = f"{rawDataMunging._dirRouteMungingClean}{rawDataMunging._objectiveFolderNews}/"

    begin = timeCalculate()

    fileName, fileNums = newsMining.judgeFolderFiles(dirRoute)

    referenceFile = selectedNewsContentMunging().loadReferenceFileIn()

    if fileNums == 0:
        print("""
            尚未有clean的newsUri可以爬!請先完成
            googleNewsMulti.py --> 
            newsMunging.py --> 
            jiebaForNewsTitle.py --> 
コード例 #6
0
def dataMunging(input, dirRoute, objectiveFolderClean, objective, domainUrl):
    """Worker loop that turns raw momo result pages into one clean JSON file per keyword.

    For every search word taken from ``input`` the HTML pages stored in
    ``dirRoute + searchword`` are parsed, one record per ``li`` of the
    ``.listArea`` list is collected, the records are passed through
    ``EcommerceDataProcessToSet`` and the result is written below
    ``{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/``.

    Each record has the keys ``Id``, ``name``, ``originprice``, ``pics``,
    ``picb`` and ``produrl``; ``picb`` is always the string ``"None"``.

    The loop ends (after calling ``input.task_done()``) as soon as a keyword
    folder is empty. Relies on the outer names ``_BASE_PATH`` and ``begin``.

    Args:
        input: Queue-like object providing ``get()`` and ``task_done()``
            (presumably a ``multiprocessing.JoinableQueue`` - confirm at the
            call site).
        dirRoute: Path prefix of the raw data; the search word is appended.
        objectiveFolderClean: Folder name of the clean data.
        objective: Data-source folder name.
        domainUrl: Prefix put in front of every product ``href``.
    """
    thisPID = os.getpid()
    while True:
        print(thisPID, "===========================================")
        searchword = input.get()

        mkdirForCleanData(objectiveFolderClean, objective)

        # Use a fresh variable rather than `dirRoute = dirRoute + searchword`:
        # reassigning dirRoute made the keywords pile up in the path from one
        # loop iteration to the next.
        fileRoute = dirRoute + searchword

        if not os.listdir(fileRoute):
            print(
                f"============={objective} {searchword} 資料夾沒有東西,此進程準備結束。============="
            )
            input.task_done()
            timeSleepOne()
            print("========================break========================")
            break

        productArray = []

        for file in initialFileZeroUnderscoreInt(fileRoute):
            with open(fileRoute + "/" + file) as f:
                inn = f.read()

            # A page that was stored as an empty string has nothing to parse.
            if not inn:
                continue
            textSoup = BeautifulSoup(inn, 'html.parser')
            try:
                # A page holds at most 30 items.
                products = textSoup.select_one('.listArea').select_one(
                    'ul').select('li')
                for item in products:
                    innerDict = {}
                    innerDict['Id'] = item.attrs.get('gcode')

                    # Look the link element up once and reuse it below.
                    goodsUrl = item.select_one('.goodsUrl')
                    innerDict['name'] = goodsUrl.select_one('.prdName').text
                    innerDict['originprice'] = goodsUrl.select_one(
                        '.money .price').text.replace('$', '').replace(',', '')
                    innerDict['pics'] = item.select_one(
                        '.goodsUrl img').attrs.get('src')
                    innerDict['picb'] = "None"
                    innerDict['produrl'] = domainUrl + goodsUrl.attrs.get('href')
                    productArray.append(innerDict)
            except Exception as e:
                # Best effort: report the page and carry on with the next file.
                print(f"{file} 有 {e} 的問題。")

        timeStamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")

        momoDict = {
            'product': productArray,
            'keyword': searchword,
            'dateTime': timeStamp,
        }

        print("===========進行去重=============")

        momoDict['product'], setNums = EcommerceDataProcessToSet(
            momoDict['product'])

        # ensure_ascii=False emits raw non-ASCII text, so the encoding is fixed
        # to UTF-8 instead of depending on the locale default.
        with open(
                f"{_BASE_PATH}/dataMunging/{objectiveFolderClean}/{objective}/momo_{timeStamp}_{setNums}_{searchword}.json",
                'w',
                encoding='utf-8') as f:
            json.dump(momoDict, f, indent=2, ensure_ascii=False)

        print("===========清洗完成=============")
        print(f"這裡是dataMunging_{thisPID},準備完成工作。 ")
        print()
        end = time.time()
        print('dataMunging 累計耗時:{0} 秒'.format(end - begin))
        input.task_done()  # tell the main process that this input is finished
        timeSleepOne()  # pause briefly to simulate real-world conditions
コード例 #7
0
            f"/weather_{dateTime}_{keyword}.json", 'w') as f:
        json.dump(weatherDictOutter, f, indent=2, ensure_ascii=False)


# Script entry point: clean the raw weather records in a single process.
if __name__ == '__main__':

    weatherRecord = weatherRecordMunging()
    # fileRouteList = [f for f in listSecondDirBelowFiles(weatherRecord._dirRouteMungingRaw + weatherRecord._weather)]

    # Raw weather files to process, taken from the raw-data weather folder.
    # NOTE(review): the name suggests a lazy generator - confirm in
    # listSecondDirBelowFiles.
    fileRouteGenerator = listSecondDirBelowFiles(
        weatherRecord._dirRouteMungingRaw + weatherRecord._weather)

    begin = timeCalculate()
    # Presumably removes the previous clean output before the folder is
    # (re)created - verify against eraseCleanData.
    eraseCleanData(weatherRecord._objectiveFolderCleanData,
                   weatherRecord._weather)
    mkdirForCleanData(weatherRecord._objectiveFolderCleanData,
                      weatherRecord._weather)

    dataMunging(fileRouteGenerator)

    # Disabled multi-process variant, kept for reference.
    # Declare the shared queue
    # file_queue = mp.JoinableQueue()

    # Start the worker processes
    # Process_1 = []
    # for p in range(3):
    #     dataMungingMultiWay_proc = mp.Process(target=dataMungingMultiWay, args=(file_queue,))
    #     dataMungingMultiWay_proc.daemon = True
    #     dataMungingMultiWay_proc.start()
    #     print(f'created dataMungingMultiWay_proc #{p}, {dataMungingMultiWay_proc}')
    #     Process_1.append(dataMungingMultiWay_proc)